author    Paolo Abeni <pabeni@redhat.com>   2026-03-05 13:32:49 +0300
committer Paolo Abeni <pabeni@redhat.com>   2026-03-05 13:32:50 +0300
commit    6d32a196beb41525f055f720e98f97fb441f0ee0
tree      530e7bd31012de8ae7483c50641b48501d070161
parent    d8103bfe41eeb8b6842672502a0025b7278931a8
parent    f20c73b0460d15301cf1bddf0f85d060a38a75df
Merge tag 'nf-next-26-03-04' of https://git.kernel.org/pub/scm/linux/kernel/git/netfilter/nf-next
Florian Westphal says:
====================
netfilter: updates for net-next
The following patchset contains Netfilter updates for *net-next*,
including changes to the IPv6 stack and updates to IPVS from Julian Anastasov.
1) ipv6: export fib6_lookup for nft_fib_ipv6 module
2) factor out the ipv6_anycast_destination logic so it's usable without a
dst_entry. These are dependencies for patch 3.
3) switch the nft_fib_ipv6 module to fib6_lookup() + RCU so it no longer
needs temporary dst_entry object allocations.
This gets us ~13% higher packet rate in my tests; a simplified sketch of
the dst-free lookup pattern follows the commit message below.
Patches 4 to 8, from Eric Dumazet, zap sk_callback_lock usage in
netfilter. Patch 9 removes another sk_callback_lock instance.
Remaining patches, from Julian Anastasov, improve IPVS; a simplified sketch
of the two-table lookup idea they build on follows the diffstat below.
Quoting Julian:
* Add infrastructure for resizable hash tables based on hlist_bl.
* Change the 256-bucket service hash table to be resizable.
* Change the global connection table to be per-net and resizable.
* Make connection hashing more secure for setups with multiple services.
netfilter pull request nf-next-26-03-04
* tag 'nf-next-26-03-04' of https://git.kernel.org/pub/scm/linux/kernel/git/netfilter/nf-next:
ipvs: use more keys for connection hashing
ipvs: switch to per-net connection table
ipvs: use resizable hash table for services
ipvs: add resizable hash tables
rculist_bl: add hlist_bl_for_each_entry_continue_rcu
netfilter: nfnetlink_queue: remove locking in nfqnl_get_sk_secctx
netfilter: nfnetlink_queue: no longer acquire sk_callback_lock
netfilter: nfnetlink_log: no longer acquire sk_callback_lock
netfilter: nft_meta: no longer acquire sk_callback_lock in nft_meta_get_eval_skugid()
netfilter: xt_owner: no longer acquire sk_callback_lock in mt_owner()
netfilter: nf_log_syslog: no longer acquire sk_callback_lock in nf_log_dump_sk_uid_gid()
netfilter: nft_fib_ipv6: switch to fib6_lookup
ipv6: make ipv6_anycast_destination logic usable without dst_entry
ipv6: export fib6_lookup for nft_fib_ipv6
====================
Link: https://patch.msgid.link/20260304114921.31042-1-fw@strlen.de
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
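As a rough illustration of item 3 in the message above (not the patch itself): with fib6_lookup() the route can be classified entirely under the RCU read lock from the struct fib6_result, with no temporary rt6_info/dst_entry allocation and no dst_release(). The function name example_route_type() below is invented for this sketch; the real conversion is in net/ipv6/netfilter/nft_fib_ipv6.c further down in this diff.

/* Hedged sketch, not the patch: classify an IPv6 route under the RCU read
 * lock using fib6_lookup() and struct fib6_result, so no temporary
 * dst_entry/rt6_info has to be allocated or released.
 */
#include <net/ip6_fib.h>
#include <net/ip6_route.h>

static int example_route_type(struct net *net, struct flowi6 *fl6, int flags)
{
	struct fib6_result res = {};
	int type = RTN_UNICAST;

	rcu_read_lock();
	if (fib6_lookup(net, fl6->flowi6_oif, fl6, &res, flags) == 0) {
		if (res.fib6_flags & RTF_REJECT)
			type = res.fib6_type;	/* e.g. RTN_UNREACHABLE */
		else if (res.fib6_flags & RTF_LOCAL)
			type = RTN_LOCAL;
	}
	rcu_read_unlock();
	return type;
}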
-rw-r--r--  include/linux/rculist_bl.h          49
-rw-r--r--  include/net/ip6_route.h             15
-rw-r--r--  include/net/ip_vs.h                377
-rw-r--r--  net/ipv6/fib6_rules.c                3
-rw-r--r--  net/ipv6/ip6_fib.c                   3
-rw-r--r--  net/ipv6/netfilter/nft_fib_ipv6.c   79
-rw-r--r--  net/netfilter/ipvs/ip_vs_conn.c    991
-rw-r--r--  net/netfilter/ipvs/ip_vs_core.c    179
-rw-r--r--  net/netfilter/ipvs/ip_vs_ctl.c     691
-rw-r--r--  net/netfilter/ipvs/ip_vs_pe_sip.c    4
-rw-r--r--  net/netfilter/ipvs/ip_vs_sync.c     23
-rw-r--r--  net/netfilter/nf_log_syslog.c       16
-rw-r--r--  net/netfilter/nfnetlink_log.c       19
-rw-r--r--  net/netfilter/nfnetlink_queue.c     24
-rw-r--r--  net/netfilter/nft_meta.c            23
-rw-r--r--  net/netfilter/xt_owner.c            28
16 files changed, 2011 insertions, 513 deletions
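Before the diff proper, a simplified sketch of the lookup pattern the new resizable IPVS tables are built around: during a resize two tables are briefly live, chained through a new_tbl pointer, so an RCU reader probes the current table and, if a resize is in flight, its successor. All names below (toy_rht, toy_entry, toy_hash, toy_lookup) are invented stand-ins; the real ip_vs_rht structures, the seqcount-based retry and the per-table siphash keys are in include/net/ip_vs.h and net/netfilter/ipvs/ip_vs_conn.c in this diff.

/* Hedged sketch only: two-table RCU lookup in the spirit of ip_vs_rht.
 * Caller holds rcu_read_lock(). The seqcount retry the real code uses to
 * cope with entries moving between buckets during a resize is omitted.
 */
#include <linux/jhash.h>
#include <linux/rculist_bl.h>
#include <linux/rcupdate.h>

struct toy_entry {				/* invented example type */
	struct hlist_bl_node node;
	u32 key;
};

struct toy_rht {				/* invented example type */
	struct hlist_bl_head *buckets;
	struct toy_rht __rcu *new_tbl;		/* points to itself when idle */
	u32 mask;
};

/* Stand-in for per-table hashing; the real code keys a siphash per table */
static u32 toy_hash(const struct toy_rht *t, u32 key)
{
	return jhash_1word(key, (u32)(unsigned long)t);
}

static struct toy_entry *toy_lookup(struct toy_rht __rcu *table, u32 key)
{
	struct toy_rht *t, *prev = NULL;
	struct hlist_bl_node *pos;
	struct toy_entry *e;

	/* Visit at most two tables: the current one and, while a resize is
	 * in progress, its successor reachable through new_tbl.
	 */
	for (t = rcu_dereference(table); t != prev;
	     prev = t, t = rcu_dereference(t->new_tbl)) {
		struct hlist_bl_head *head;

		head = &t->buckets[toy_hash(t, key) & t->mask];
		hlist_bl_for_each_entry_rcu(e, pos, head, node)
			if (e->key == key)
				return e;
	}
	return NULL;
}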
diff --git a/include/linux/rculist_bl.h b/include/linux/rculist_bl.h index 0b952d06eb0b..36363b876e53 100644 --- a/include/linux/rculist_bl.h +++ b/include/linux/rculist_bl.h @@ -8,21 +8,31 @@ #include <linux/list_bl.h> #include <linux/rcupdate.h> +/* return the first ptr or next element in an RCU protected list */ +#define hlist_bl_first_rcu(head) \ + (*((struct hlist_bl_node __rcu **)(&(head)->first))) +#define hlist_bl_next_rcu(node) \ + (*((struct hlist_bl_node __rcu **)(&(node)->next))) + static inline void hlist_bl_set_first_rcu(struct hlist_bl_head *h, struct hlist_bl_node *n) { LIST_BL_BUG_ON((unsigned long)n & LIST_BL_LOCKMASK); LIST_BL_BUG_ON(((unsigned long)h->first & LIST_BL_LOCKMASK) != LIST_BL_LOCKMASK); - rcu_assign_pointer(h->first, + rcu_assign_pointer(hlist_bl_first_rcu(h), (struct hlist_bl_node *)((unsigned long)n | LIST_BL_LOCKMASK)); } -static inline struct hlist_bl_node *hlist_bl_first_rcu(struct hlist_bl_head *h) -{ - return (struct hlist_bl_node *) - ((unsigned long)rcu_dereference_check(h->first, hlist_bl_is_locked(h)) & ~LIST_BL_LOCKMASK); -} +#define hlist_bl_first_rcu_dereference(head) \ +({ \ + struct hlist_bl_head *__head = (head); \ + \ + (struct hlist_bl_node *) \ + ((unsigned long)rcu_dereference_check(hlist_bl_first_rcu(__head), \ + hlist_bl_is_locked(__head)) & \ + ~LIST_BL_LOCKMASK); \ +}) /** * hlist_bl_del_rcu - deletes entry from hash list without re-initialization @@ -73,7 +83,7 @@ static inline void hlist_bl_add_head_rcu(struct hlist_bl_node *n, { struct hlist_bl_node *first; - /* don't need hlist_bl_first_rcu because we're under lock */ + /* don't need hlist_bl_first_rcu* because we're under lock */ first = hlist_bl_first(h); n->next = first; @@ -93,9 +103,30 @@ static inline void hlist_bl_add_head_rcu(struct hlist_bl_node *n, * */ #define hlist_bl_for_each_entry_rcu(tpos, pos, head, member) \ - for (pos = hlist_bl_first_rcu(head); \ + for (pos = hlist_bl_first_rcu_dereference(head); \ pos && \ ({ tpos = hlist_bl_entry(pos, typeof(*tpos), member); 1; }); \ - pos = rcu_dereference_raw(pos->next)) + pos = rcu_dereference_raw(hlist_bl_next_rcu(pos))) + +/** + * hlist_bl_for_each_entry_continue_rcu - continue iteration over list of given + * type + * @tpos: the type * to use as a loop cursor. + * @pos: the &struct hlist_bl_node to use as a loop cursor. + * @member: the name of the hlist_bl_node within the struct. + * + * Continue to iterate over list of given type, continuing after + * the current position which must have been in the list when the RCU read + * lock was taken. + * This would typically require either that you obtained the node from a + * previous walk of the list in the same RCU read-side critical section, or + * that you held some sort of non-RCU reference (such as a reference count) + * to keep the node alive *and* in the list. 
+ */ +#define hlist_bl_for_each_entry_continue_rcu(tpos, pos, member) \ + for (pos = rcu_dereference_raw(hlist_bl_next_rcu(&(tpos)->member)); \ + pos && \ + ({ tpos = hlist_bl_entry(pos, typeof(*tpos), member); 1; }); \ + pos = rcu_dereference_raw(hlist_bl_next_rcu(pos))) #endif diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h index a55f9bf95fe3..0c8eeb6abe7a 100644 --- a/include/net/ip6_route.h +++ b/include/net/ip6_route.h @@ -252,15 +252,22 @@ static inline bool ipv6_unicast_destination(const struct sk_buff *skb) return rt->rt6i_flags & RTF_LOCAL; } +static inline bool __ipv6_anycast_destination(const struct rt6key *rt6i_dst, + u32 rt6i_flags, + const struct in6_addr *daddr) +{ + return rt6i_flags & RTF_ANYCAST || + (rt6i_dst->plen < 127 && + !(rt6i_flags & (RTF_GATEWAY | RTF_NONEXTHOP)) && + ipv6_addr_equal(&rt6i_dst->addr, daddr)); +} + static inline bool ipv6_anycast_destination(const struct dst_entry *dst, const struct in6_addr *daddr) { const struct rt6_info *rt = dst_rt6_info(dst); - return rt->rt6i_flags & RTF_ANYCAST || - (rt->rt6i_dst.plen < 127 && - !(rt->rt6i_flags & (RTF_GATEWAY | RTF_NONEXTHOP)) && - ipv6_addr_equal(&rt->rt6i_dst.addr, daddr)); + return __ipv6_anycast_destination(&rt->rt6i_dst, rt->rt6i_flags, daddr); } int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h index ad8a16146ac5..72d325c81313 100644 --- a/include/net/ip_vs.h +++ b/include/net/ip_vs.h @@ -11,6 +11,7 @@ #include <asm/types.h> /* for __uXX types */ #include <linux/list.h> /* for struct list_head */ +#include <linux/rculist_bl.h> /* for struct hlist_bl_head */ #include <linux/spinlock.h> /* for struct rwlock_t */ #include <linux/atomic.h> /* for struct atomic_t */ #include <linux/refcount.h> /* for struct refcount_t */ @@ -30,15 +31,22 @@ #endif #include <net/net_namespace.h> /* Netw namespace */ #include <linux/sched/isolation.h> +#include <linux/siphash.h> #define IP_VS_HDR_INVERSE 1 #define IP_VS_HDR_ICMP 2 -/* - * Hash table: for virtual service lookups - */ -#define IP_VS_SVC_TAB_BITS 8 -#define IP_VS_SVC_TAB_SIZE BIT(IP_VS_SVC_TAB_BITS) -#define IP_VS_SVC_TAB_MASK (IP_VS_SVC_TAB_SIZE - 1) + +/* conn_tab limits (as per Kconfig) */ +#define IP_VS_CONN_TAB_MIN_BITS 8 +#if BITS_PER_LONG > 32 +#define IP_VS_CONN_TAB_MAX_BITS 27 +#else +#define IP_VS_CONN_TAB_MAX_BITS 20 +#endif + +/* svc_table limits */ +#define IP_VS_SVC_TAB_MIN_BITS 4 +#define IP_VS_SVC_TAB_MAX_BITS 20 /* Generic access of ipvs struct */ static inline struct netns_ipvs *net_ipvs(struct net* net) @@ -49,8 +57,6 @@ static inline struct netns_ipvs *net_ipvs(struct net* net) /* Connections' size value needed by ip_vs_ctl.c */ extern int ip_vs_conn_tab_size; -extern struct mutex __ip_vs_mutex; - struct ip_vs_iphdr { int hdr_flags; /* ipvs flags */ __u32 off; /* Where IP or IPv4 header starts */ @@ -271,6 +277,10 @@ static inline const char *ip_vs_dbg_addr(int af, char *buf, size_t buf_len, pr_err(msg, ##__VA_ARGS__); \ } while (0) +struct ip_vs_aligned_lock { + spinlock_t l; /* Protect buckets */ +} ____cacheline_aligned_in_smp; + /* For arrays per family */ enum { IP_VS_AF_INET, @@ -283,6 +293,13 @@ static inline int ip_vs_af_index(int af) return af == AF_INET6 ? 
IP_VS_AF_INET6 : IP_VS_AF_INET; } +/* work_flags */ +enum { + IP_VS_WORK_SVC_RESIZE, /* Schedule svc_resize_work */ + IP_VS_WORK_SVC_NORESIZE, /* Stopping svc_resize_work */ + IP_VS_WORK_CONN_RESIZE, /* Schedule conn_resize_work */ +}; + /* The port number of FTP service (in network order). */ #define FTPPORT cpu_to_be16(21) #define FTPDATA cpu_to_be16(20) @@ -484,6 +501,198 @@ struct ip_vs_est_kt_data { int est_row; /* estimated row */ }; +/* IPVS resizable hash tables */ +struct ip_vs_rht { + struct hlist_bl_head *buckets; + struct ip_vs_rht __rcu *new_tbl; /* New/Same table */ + seqcount_t *seqc; /* Protects moves */ + struct ip_vs_aligned_lock *lock; /* Protect seqc */ + int mask; /* Buckets mask */ + int size; /* Buckets */ + int seqc_mask; /* seqc mask */ + int lock_mask; /* lock mask */ + u32 table_id; + int u_thresh; /* upper threshold */ + int l_thresh; /* lower threshold */ + int lfactor; /* Load Factor (shift)*/ + int bits; /* size = 1 << bits */ + siphash_key_t hash_key; + struct rcu_head rcu_head; +}; + +/** + * ip_vs_rht_for_each_table() - Walk the hash tables + * @table: struct ip_vs_rht __rcu *table + * @t: current table, used as cursor, struct ip_vs_rht *var + * @p: previous table, temp struct ip_vs_rht *var + * + * Walk tables assuming others can not change the installed tables + */ +#define ip_vs_rht_for_each_table(table, t, p) \ + for (p = NULL, t = rcu_dereference_protected(table, 1); \ + t != p; \ + p = t, t = rcu_dereference_protected(t->new_tbl, 1)) + +/** + * ip_vs_rht_for_each_table_rcu() - Walk the hash tables under RCU reader lock + * @table: struct ip_vs_rht __rcu *table + * @t: current table, used as cursor, struct ip_vs_rht *var + * @p: previous table, temp struct ip_vs_rht *var + * + * We usually search in one table and also in second table on resizing + */ +#define ip_vs_rht_for_each_table_rcu(table, t, p) \ + for (p = NULL, t = rcu_dereference(table); \ + t != p; \ + p = t, t = rcu_dereference(t->new_tbl)) + +/** + * ip_vs_rht_for_each_bucket() - Walk all table buckets + * @t: current table, used as cursor, struct ip_vs_rht *var + * @bucket: bucket index, used as cursor, u32 var + * @head: bucket address, used as cursor, struct hlist_bl_head *var + */ +#define ip_vs_rht_for_each_bucket(t, bucket, head) \ + for (bucket = 0, head = (t)->buckets; \ + bucket < t->size; bucket++, head++) + +/** + * ip_vs_rht_for_bucket_retry() - Retry bucket if entries are moved + * @t: current table, used as cursor, struct ip_vs_rht *var + * @bucket: index of current bucket or hash key + * @sc: temp seqcount_t *var + * @seq: temp unsigned int var for sequence count + * @retry: temp int var + */ +#define ip_vs_rht_for_bucket_retry(t, bucket, sc, seq, retry) \ + for (retry = 1, sc = &(t)->seqc[(bucket) & (t)->seqc_mask]; \ + retry && ({ seq = read_seqcount_begin(sc); 1; }); \ + retry = read_seqcount_retry(sc, seq)) + +/** + * DECLARE_IP_VS_RHT_WALK_BUCKETS_RCU() - Declare variables + * + * Variables for ip_vs_rht_walk_buckets_rcu + */ +#define DECLARE_IP_VS_RHT_WALK_BUCKETS_RCU() \ + struct ip_vs_rht *_t, *_p; \ + unsigned int _seq; \ + seqcount_t *_sc; \ + u32 _bucket; \ + int _retry +/** + * ip_vs_rht_walk_buckets_rcu() - Walk all buckets under RCU read lock + * @table: struct ip_vs_rht __rcu *table + * @head: bucket address, used as cursor, struct hlist_bl_head *var + * + * Can be used while others add/delete/move entries + * Not suitable if duplicates are not desired + * Possible cases for reader that uses cond_resched_rcu() in the loop: + * - new table can not be installed, 
no need to repeat + * - new table can be installed => check and repeat if new table is + * installed, needed for !PREEMPT_RCU + */ +#define ip_vs_rht_walk_buckets_rcu(table, head) \ + ip_vs_rht_for_each_table_rcu(table, _t, _p) \ + ip_vs_rht_for_each_bucket(_t, _bucket, head) \ + ip_vs_rht_for_bucket_retry(_t, _bucket, _sc, \ + _seq, _retry) + +/** + * DECLARE_IP_VS_RHT_WALK_BUCKET_RCU() - Declare variables + * + * Variables for ip_vs_rht_walk_bucket_rcu + */ +#define DECLARE_IP_VS_RHT_WALK_BUCKET_RCU() \ + unsigned int _seq; \ + seqcount_t *_sc; \ + int _retry +/** + * ip_vs_rht_walk_bucket_rcu() - Walk bucket under RCU read lock + * @t: current table, struct ip_vs_rht *var + * @bucket: index of current bucket or hash key + * @head: bucket address, used as cursor, struct hlist_bl_head *var + * + * Can be used while others add/delete/move entries + * Not suitable if duplicates are not desired + * Possible cases for reader that uses cond_resched_rcu() in the loop: + * - new table can not be installed, no need to repeat + * - new table can be installed => check and repeat if new table is + * installed, needed for !PREEMPT_RCU + */ +#define ip_vs_rht_walk_bucket_rcu(t, bucket, head) \ + if (({ head = (t)->buckets + ((bucket) & (t)->mask); 0; })) \ + {} \ + else \ + ip_vs_rht_for_bucket_retry(t, (bucket), _sc, _seq, _retry) + +/** + * DECLARE_IP_VS_RHT_WALK_BUCKETS_SAFE_RCU() - Declare variables + * + * Variables for ip_vs_rht_walk_buckets_safe_rcu + */ +#define DECLARE_IP_VS_RHT_WALK_BUCKETS_SAFE_RCU() \ + struct ip_vs_rht *_t, *_p; \ + u32 _bucket +/** + * ip_vs_rht_walk_buckets_safe_rcu() - Walk all buckets under RCU read lock + * @table: struct ip_vs_rht __rcu *table + * @head: bucket address, used as cursor, struct hlist_bl_head *var + * + * Can be used while others add/delete entries but moving is disabled + * Using cond_resched_rcu() should be safe if tables do not change + */ +#define ip_vs_rht_walk_buckets_safe_rcu(table, head) \ + ip_vs_rht_for_each_table_rcu(table, _t, _p) \ + ip_vs_rht_for_each_bucket(_t, _bucket, head) + +/** + * DECLARE_IP_VS_RHT_WALK_BUCKETS() - Declare variables + * + * Variables for ip_vs_rht_walk_buckets + */ +#define DECLARE_IP_VS_RHT_WALK_BUCKETS() \ + struct ip_vs_rht *_t, *_p; \ + u32 _bucket + +/** + * ip_vs_rht_walk_buckets() - Walk all buckets + * @table: struct ip_vs_rht __rcu *table + * @head: bucket address, used as cursor, struct hlist_bl_head *var + * + * Use if others can not add/delete/move entries + */ +#define ip_vs_rht_walk_buckets(table, head) \ + ip_vs_rht_for_each_table(table, _t, _p) \ + ip_vs_rht_for_each_bucket(_t, _bucket, head) + +/* Entries can be in one of two tables, so we flip bit when new table is + * created and store it as highest bit in hash keys + */ +#define IP_VS_RHT_TABLE_ID_MASK BIT(31) + +/* Check if hash key is from this table */ +static inline bool ip_vs_rht_same_table(struct ip_vs_rht *t, u32 hash_key) +{ + return !((t->table_id ^ hash_key) & IP_VS_RHT_TABLE_ID_MASK); +} + +/* Build per-table hash key from hash value */ +static inline u32 ip_vs_rht_build_hash_key(struct ip_vs_rht *t, u32 hash) +{ + return t->table_id | (hash & ~IP_VS_RHT_TABLE_ID_MASK); +} + +void ip_vs_rht_free(struct ip_vs_rht *t); +void ip_vs_rht_rcu_free(struct rcu_head *head); +struct ip_vs_rht *ip_vs_rht_alloc(int buckets, int scounts, int locks); +int ip_vs_rht_desired_size(struct netns_ipvs *ipvs, struct ip_vs_rht *t, int n, + int lfactor, int min_bits, int max_bits); +void ip_vs_rht_set_thresholds(struct ip_vs_rht *t, int size, int lfactor, + 
int min_bits, int max_bits); +u32 ip_vs_rht_hash_linfo(struct ip_vs_rht *t, int af, + const union nf_inet_addr *addr, u32 v1, u32 v2); + struct dst_entry; struct iphdr; struct ip_vs_conn; @@ -577,50 +786,48 @@ struct ip_vs_conn_param { __u8 pe_data_len; }; +/* Hash node in conn_tab */ +struct ip_vs_conn_hnode { + struct hlist_bl_node node; /* node in conn_tab */ + u32 hash_key; /* Key for the hash table */ + u8 dir; /* 0=out->in, 1=in->out */ +} __packed; + /* IP_VS structure allocated for each dynamically scheduled connection */ struct ip_vs_conn { - struct hlist_node c_list; /* hashed list heads */ - /* Protocol, addresses and port numbers */ + /* Cacheline for hash table nodes - rarely modified */ + + struct ip_vs_conn_hnode hn0; /* Original direction */ + u8 af; /* address family */ __be16 cport; + struct ip_vs_conn_hnode hn1; /* Reply direction */ + u8 daf; /* Address family of the dest */ __be16 dport; - __be16 vport; - u16 af; /* address family */ - union nf_inet_addr caddr; /* client address */ - union nf_inet_addr vaddr; /* virtual address */ - union nf_inet_addr daddr; /* destination address */ + struct ip_vs_dest *dest; /* real server */ + atomic_t n_control; /* Number of controlled ones */ volatile __u32 flags; /* status flags */ - __u16 protocol; /* Which protocol (TCP/UDP) */ - __u16 daf; /* Address family of the dest */ - struct netns_ipvs *ipvs; - - /* counter and timer */ - refcount_t refcnt; /* reference count */ - struct timer_list timer; /* Expiration timer */ - volatile unsigned long timeout; /* timeout */ + /* 44/64 */ - /* Flags and state transition */ - spinlock_t lock; /* lock for state transition */ + struct ip_vs_conn *control; /* Master control connection */ + const struct ip_vs_pe *pe; + char *pe_data; + __u8 pe_data_len; volatile __u16 state; /* state info */ volatile __u16 old_state; /* old state, to be used for * state transition triggered * synchronization */ - __u32 fwmark; /* Fire wall mark from skb */ - unsigned long sync_endtime; /* jiffies + sent_retries */ + /* 2-byte hole */ + /* 64/96 */ - /* Control members */ - struct ip_vs_conn *control; /* Master control connection */ - atomic_t n_control; /* Number of controlled ones */ - struct ip_vs_dest *dest; /* real server */ - atomic_t in_pkts; /* incoming packet counter */ + union nf_inet_addr caddr; /* client address */ + union nf_inet_addr vaddr; /* virtual address */ + /* 96/128 */ - /* Packet transmitter for different forwarding methods. If it - * mangles the packet, it must return NF_DROP or better NF_STOLEN, - * otherwise this must be changed to a sk_buff **. - * NF_ACCEPT can be returned when destination is local. - */ - int (*packet_xmit)(struct sk_buff *skb, struct ip_vs_conn *cp, - struct ip_vs_protocol *pp, struct ip_vs_iphdr *iph); + union nf_inet_addr daddr; /* destination address */ + __u32 fwmark; /* Fire wall mark from skb */ + __be16 vport; + __u16 protocol; /* Which protocol (TCP/UDP) */ /* Note: we can group the following members into a structure, * in order to save more space, and the following members are @@ -628,14 +835,31 @@ struct ip_vs_conn { */ struct ip_vs_app *app; /* bound ip_vs_app object */ void *app_data; /* Application private data */ + /* 128/168 */ struct_group(sync_conn_opt, struct ip_vs_seq in_seq; /* incoming seq. struct */ struct ip_vs_seq out_seq; /* outgoing seq. 
struct */ ); + /* 152/192 */ - const struct ip_vs_pe *pe; - char *pe_data; - __u8 pe_data_len; + struct timer_list timer; /* Expiration timer */ + volatile unsigned long timeout; /* timeout */ + spinlock_t lock; /* lock for state transition */ + refcount_t refcnt; /* reference count */ + atomic_t in_pkts; /* incoming packet counter */ + /* 64-bit: 4-byte gap */ + + /* 188/256 */ + unsigned long sync_endtime; /* jiffies + sent_retries */ + struct netns_ipvs *ipvs; + + /* Packet transmitter for different forwarding methods. If it + * mangles the packet, it must return NF_DROP or better NF_STOLEN, + * otherwise this must be changed to a sk_buff **. + * NF_ACCEPT can be returned when destination is local. + */ + int (*packet_xmit)(struct sk_buff *skb, struct ip_vs_conn *cp, + struct ip_vs_protocol *pp, struct ip_vs_iphdr *iph); struct rcu_head rcu_head; }; @@ -691,14 +915,15 @@ struct ip_vs_dest_user_kern { * forwarding entries. */ struct ip_vs_service { - struct hlist_node s_list; /* node in service table */ - atomic_t refcnt; /* reference counter */ - + struct hlist_bl_node s_list; /* node in service table */ + u32 hash_key; /* Key for the hash table */ u16 af; /* address family */ __u16 protocol; /* which protocol (TCP/UDP) */ + union nf_inet_addr addr; /* IP address for virtual service */ - __be16 port; /* port number for the service */ __u32 fwmark; /* firewall mark of the service */ + atomic_t refcnt; /* reference counter */ + __be16 port; /* port number for the service */ unsigned int flags; /* service status flags */ unsigned int timeout; /* persistent timeout in ticks */ __be32 netmask; /* grouping granularity, mask/plen */ @@ -808,8 +1033,8 @@ struct ip_vs_pe { int (*fill_param)(struct ip_vs_conn_param *p, struct sk_buff *skb); bool (*ct_match)(const struct ip_vs_conn_param *p, struct ip_vs_conn *ct); - u32 (*hashkey_raw)(const struct ip_vs_conn_param *p, u32 initval, - bool inverse); + u32 (*hashkey_raw)(const struct ip_vs_conn_param *p, + struct ip_vs_rht *t, bool inverse); int (*show_pe_data)(const struct ip_vs_conn *cp, char *buf); /* create connections for real-server outgoing packets */ struct ip_vs_conn* (*conn_out)(struct ip_vs_service *svc, @@ -949,6 +1174,7 @@ struct netns_ipvs { /* ip_vs_conn */ atomic_t conn_count; /* connection counter */ atomic_t no_cport_conns[IP_VS_AF_MAX]; + struct delayed_work conn_resize_work;/* resize conn_tab */ /* ip_vs_ctl */ struct ip_vs_stats_rcu *tot_stats; /* Statistics & est. 
*/ @@ -957,6 +1183,10 @@ struct netns_ipvs { struct list_head dest_trash; spinlock_t dest_trash_lock; struct timer_list dest_trash_timer; /* expiration timer */ + struct mutex service_mutex; /* service reconfig */ + struct rw_semaphore svc_resize_sem; /* svc_table resizing */ + struct delayed_work svc_resize_work; /* resize svc_table */ + atomic_t svc_table_changes;/* ++ on new table */ /* Service counters */ atomic_t num_services[IP_VS_AF_MAX]; /* Services */ atomic_t fwm_services[IP_VS_AF_MAX]; /* Services */ @@ -1021,6 +1251,8 @@ struct netns_ipvs { int sysctl_est_nice; /* kthread nice */ int est_stopped; /* stop tasks */ #endif + int sysctl_conn_lfactor; + int sysctl_svc_lfactor; /* ip_vs_lblc */ int sysctl_lblc_expiration; @@ -1030,6 +1262,7 @@ struct netns_ipvs { int sysctl_lblcr_expiration; struct ctl_table_header *lblcr_ctl_header; struct ctl_table *lblcr_ctl_table; + unsigned long work_flags; /* IP_VS_WORK_* flags */ /* ip_vs_est */ struct delayed_work est_reload_work;/* Reload kthread tasks */ struct mutex est_mutex; /* protect kthread tasks */ @@ -1061,9 +1294,9 @@ struct netns_ipvs { unsigned int mixed_address_family_dests; unsigned int hooks_afmask; /* &1=AF_INET, &2=AF_INET6 */ - /* the service mutex that protect svc_table and svc_fwm_table */ - struct mutex service_mutex; - struct hlist_head svc_table[IP_VS_SVC_TAB_SIZE]; /* Services */ + struct ip_vs_rht __rcu *svc_table; /* Services */ + struct ip_vs_rht __rcu *conn_tab; /* Connections */ + atomic_t conn_tab_changes;/* ++ on new table */ }; #define DEFAULT_SYNC_THRESHOLD 3 @@ -1313,6 +1546,24 @@ static inline int sysctl_est_nice(struct netns_ipvs *ipvs) #endif +/* Get load factor to map conn_count/u_thresh to t->size */ +static inline int sysctl_conn_lfactor(struct netns_ipvs *ipvs) +{ + return READ_ONCE(ipvs->sysctl_conn_lfactor); +} + +/* Get load factor to map num_services/u_thresh to t->size + * Smaller value decreases u_thresh to reduce collisions but increases + * the table size + * Returns factor where: + * - <0: u_thresh = size >> -factor, eg. lfactor -2 = 25% load + * - >=0: u_thresh = size << factor, eg. lfactor 1 = 200% load + */ +static inline int sysctl_svc_lfactor(struct netns_ipvs *ipvs) +{ + return READ_ONCE(ipvs->sysctl_svc_lfactor); +} + /* IPVS core functions * (from ip_vs_core.c) */ @@ -1386,6 +1637,23 @@ static inline void __ip_vs_conn_put(struct ip_vs_conn *cp) } void ip_vs_conn_put(struct ip_vs_conn *cp); void ip_vs_conn_fill_cport(struct ip_vs_conn *cp, __be16 cport); +int ip_vs_conn_desired_size(struct netns_ipvs *ipvs, struct ip_vs_rht *t, + int lfactor); +struct ip_vs_rht *ip_vs_conn_tab_alloc(struct netns_ipvs *ipvs, int buckets, + int lfactor); + +static inline struct ip_vs_conn * +ip_vs_hn0_to_conn(struct ip_vs_conn_hnode *hn) +{ + return container_of(hn, struct ip_vs_conn, hn0); +} + +static inline struct ip_vs_conn * +ip_vs_hn_to_conn(struct ip_vs_conn_hnode *hn) +{ + return hn->dir ? 
container_of(hn, struct ip_vs_conn, hn1) : + container_of(hn, struct ip_vs_conn, hn0); +} struct ip_vs_conn *ip_vs_conn_new(const struct ip_vs_conn_param *p, int dest_af, const union nf_inet_addr *daddr, @@ -1739,6 +2007,13 @@ static inline char ip_vs_fwd_tag(struct ip_vs_conn *cp) return fwd; } +/* Check if connection uses double hashing */ +static inline bool ip_vs_conn_use_hash2(struct ip_vs_conn *cp) +{ + return IP_VS_FWD_METHOD(cp) == IP_VS_CONN_F_MASQ && + !(cp->flags & IP_VS_CONN_F_TEMPLATE); +} + void ip_vs_nat_icmp(struct sk_buff *skb, struct ip_vs_protocol *pp, struct ip_vs_conn *cp, int dir); diff --git a/net/ipv6/fib6_rules.c b/net/ipv6/fib6_rules.c index fd5f7112a51f..e1b2b4fa6e18 100644 --- a/net/ipv6/fib6_rules.c +++ b/net/ipv6/fib6_rules.c @@ -92,6 +92,9 @@ int fib6_lookup(struct net *net, int oif, struct flowi6 *fl6, return err; } +#if IS_MODULE(CONFIG_NFT_FIB_IPV6) +EXPORT_SYMBOL_GPL(fib6_lookup); +#endif struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6, const struct sk_buff *skb, diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 9058e71241dc..a6e58a435735 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -342,6 +342,9 @@ int fib6_lookup(struct net *net, int oif, struct flowi6 *fl6, return fib6_table_lookup(net, net->ipv6.fib6_main_tbl, oif, fl6, res, flags); } +#if IS_MODULE(CONFIG_NFT_FIB_IPV6) +EXPORT_SYMBOL_GPL(fib6_lookup); +#endif static void __net_init fib6_tables_init(struct net *net) { diff --git a/net/ipv6/netfilter/nft_fib_ipv6.c b/net/ipv6/netfilter/nft_fib_ipv6.c index dc375b725b28..8b2dba88ee96 100644 --- a/net/ipv6/netfilter/nft_fib_ipv6.c +++ b/net/ipv6/netfilter/nft_fib_ipv6.c @@ -52,7 +52,13 @@ static int nft_fib6_flowi_init(struct flowi6 *fl6, const struct nft_fib *priv, fl6->flowlabel = (*(__be32 *)iph) & IPV6_FLOWINFO_MASK; fl6->flowi6_l3mdev = nft_fib_l3mdev_master_ifindex_rcu(pkt, dev); - return lookup_flags; + return lookup_flags | RT6_LOOKUP_F_DST_NOREF; +} + +static int nft_fib6_lookup(struct net *net, struct flowi6 *fl6, + struct fib6_result *res, int flags) +{ + return fib6_lookup(net, fl6->flowi6_oif, fl6, res, flags); } static u32 __nft_fib6_eval_type(const struct nft_fib *priv, @@ -60,13 +66,14 @@ static u32 __nft_fib6_eval_type(const struct nft_fib *priv, struct ipv6hdr *iph) { const struct net_device *dev = NULL; + struct fib6_result res = {}; int route_err, addrtype; - struct rt6_info *rt; struct flowi6 fl6 = { .flowi6_iif = LOOPBACK_IFINDEX, .flowi6_proto = pkt->tprot, .flowi6_uid = sock_net_uid(nft_net(pkt), NULL), }; + int lookup_flags; u32 ret = 0; if (priv->flags & NFTA_FIB_F_IIF) @@ -74,29 +81,23 @@ static u32 __nft_fib6_eval_type(const struct nft_fib *priv, else if (priv->flags & NFTA_FIB_F_OIF) dev = nft_out(pkt); - nft_fib6_flowi_init(&fl6, priv, pkt, dev, iph); + lookup_flags = nft_fib6_flowi_init(&fl6, priv, pkt, dev, iph); if (dev && nf_ipv6_chk_addr(nft_net(pkt), &fl6.daddr, dev, true)) ret = RTN_LOCAL; - route_err = nf_ip6_route(nft_net(pkt), (struct dst_entry **)&rt, - flowi6_to_flowi(&fl6), false); + route_err = nft_fib6_lookup(nft_net(pkt), &fl6, &res, lookup_flags); if (route_err) goto err; - if (rt->rt6i_flags & RTF_REJECT) { - route_err = rt->dst.error; - dst_release(&rt->dst); - goto err; - } + if (res.fib6_flags & RTF_REJECT) + return res.fib6_type; - if (ipv6_anycast_destination((struct dst_entry *)rt, &fl6.daddr)) + if (__ipv6_anycast_destination(&res.f6i->fib6_dst, res.fib6_flags, &fl6.daddr)) ret = RTN_ANYCAST; - else if (!dev && rt->rt6i_flags & RTF_LOCAL) + else if (!dev 
&& res.fib6_flags & RTF_LOCAL) ret = RTN_LOCAL; - dst_release(&rt->dst); - if (ret) return ret; @@ -152,6 +153,33 @@ static bool nft_fib_v6_skip_icmpv6(const struct sk_buff *skb, u8 next, const str return ipv6_addr_type(&iph->daddr) & IPV6_ADDR_LINKLOCAL; } +static bool nft_fib6_info_nh_dev_match(const struct net_device *nh_dev, + const struct net_device *dev) +{ + return nh_dev == dev || + l3mdev_master_ifindex_rcu(nh_dev) == dev->ifindex; +} + +static bool nft_fib6_info_nh_uses_dev(struct fib6_info *rt, + const struct net_device *dev) +{ + const struct net_device *nh_dev; + struct fib6_info *iter; + + nh_dev = fib6_info_nh_dev(rt); + if (nft_fib6_info_nh_dev_match(nh_dev, dev)) + return true; + + list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) { + nh_dev = fib6_info_nh_dev(iter); + + if (nft_fib6_info_nh_dev_match(nh_dev, dev)) + return true; + } + + return false; +} + void nft_fib6_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) { @@ -160,14 +188,14 @@ void nft_fib6_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct net_device *found = NULL; const struct net_device *oif = NULL; u32 *dest = ®s->data[priv->dreg]; + struct fib6_result res = {}; struct ipv6hdr *iph, _iph; struct flowi6 fl6 = { .flowi6_iif = LOOPBACK_IFINDEX, .flowi6_proto = pkt->tprot, .flowi6_uid = sock_net_uid(nft_net(pkt), NULL), }; - struct rt6_info *rt; - int lookup_flags; + int lookup_flags, ret; if (nft_fib_can_skip(pkt)) { nft_fib_store_result(dest, priv, nft_in(pkt)); @@ -193,26 +221,17 @@ void nft_fib6_eval(const struct nft_expr *expr, struct nft_regs *regs, lookup_flags = nft_fib6_flowi_init(&fl6, priv, pkt, oif, iph); *dest = 0; - rt = (void *)ip6_route_lookup(nft_net(pkt), &fl6, pkt->skb, - lookup_flags); - if (rt->dst.error) - goto put_rt_err; - - /* Should not see RTF_LOCAL here */ - if (rt->rt6i_flags & (RTF_REJECT | RTF_ANYCAST | RTF_LOCAL)) - goto put_rt_err; + ret = nft_fib6_lookup(nft_net(pkt), &fl6, &res, lookup_flags); + if (ret || res.fib6_flags & (RTF_REJECT | RTF_ANYCAST | RTF_LOCAL)) + return; if (!oif) { - found = rt->rt6i_idev->dev; + found = fib6_info_nh_dev(res.f6i); } else { - if (oif == rt->rt6i_idev->dev || - l3mdev_master_ifindex_rcu(rt->rt6i_idev->dev) == oif->ifindex) + if (nft_fib6_info_nh_uses_dev(res.f6i, oif)) found = oif; } - nft_fib_store_result(dest, priv, found); - put_rt_err: - ip6_rt_put(rt); } EXPORT_SYMBOL_GPL(nft_fib6_eval); diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c index deaf16a90e2f..2082bfb2d93c 100644 --- a/net/netfilter/ipvs/ip_vs_conn.c +++ b/net/netfilter/ipvs/ip_vs_conn.c @@ -47,28 +47,12 @@ static int ip_vs_conn_tab_bits = CONFIG_IP_VS_TAB_BITS; module_param_named(conn_tab_bits, ip_vs_conn_tab_bits, int, 0444); MODULE_PARM_DESC(conn_tab_bits, "Set connections' hash size"); -/* size and mask values */ +/* Max table size */ int ip_vs_conn_tab_size __read_mostly; -static int ip_vs_conn_tab_mask __read_mostly; - -/* - * Connection hash table: for input and output packets lookups of IPVS - */ -static struct hlist_head *ip_vs_conn_tab __read_mostly; /* SLAB cache for IPVS connections */ static struct kmem_cache *ip_vs_conn_cachep __read_mostly; -/* random value for IPVS connection hash */ -static unsigned int ip_vs_conn_rnd __read_mostly; - -/* - * Fine locking granularity for big connection hash table - */ -#define CT_LOCKARRAY_BITS 5 -#define CT_LOCKARRAY_SIZE (1<<CT_LOCKARRAY_BITS) -#define CT_LOCKARRAY_MASK (CT_LOCKARRAY_SIZE-1) - /* We need an addrstrlen that 
works with or without v6 */ #ifdef CONFIG_IP_VS_IPV6 #define IP_VS_ADDRSTRLEN INET6_ADDRSTRLEN @@ -76,23 +60,102 @@ static unsigned int ip_vs_conn_rnd __read_mostly; #define IP_VS_ADDRSTRLEN (8+1) #endif -struct ip_vs_aligned_lock +/* Connection hashing: + * - hash (add conn) and unhash (del conn) are safe for RCU readers walking + * the bucket, they will not jump to another bucket or hash table and to miss + * conns + * - rehash (fill cport) hashes the conn to new bucket or even new table, + * so we use seqcount to retry lookups on buckets where we delete + * conns (unhash) because after hashing their next ptr can point to another + * bucket or hash table + * - hash table resize works like rehash but always rehashes into new table + * - bit lock on bucket serializes all operations that modify the chain + * - cp->lock protects conn fields like cp->flags, cp->dest + */ + +/* Lock conn_tab bucket for conn hash/unhash, not for rehash */ +static __always_inline void +conn_tab_lock(struct ip_vs_rht *t, struct ip_vs_conn *cp, u32 hash_key, + u32 hash_key2, bool use2, bool new_hash, + struct hlist_bl_head **head_ret, struct hlist_bl_head **head2_ret) { - spinlock_t l; -} __attribute__((__aligned__(SMP_CACHE_BYTES))); + struct hlist_bl_head *head, *head2; + u32 hash_key_new, hash_key_new2; + struct ip_vs_rht *t2 = t; + u32 idx, idx2; + + idx = hash_key & t->mask; + if (use2) + idx2 = hash_key2 & t->mask; + else + idx2 = idx; + if (!new_hash) { + /* We need to lock the bucket in the right table */ + +retry: + if (!ip_vs_rht_same_table(t, hash_key)) { + /* It is already moved to new table */ + t = rcu_dereference(t->new_tbl); + /* Rehashing works in two steps and we may detect + * both nodes in different tables, use idx/idx2 + * for proper lock ordering for heads. + */ + idx = hash_key & t->mask; + idx |= IP_VS_RHT_TABLE_ID_MASK; + } + if (use2) { + if (!ip_vs_rht_same_table(t2, hash_key2)) { + /* It is already moved to new table */ + t2 = rcu_dereference(t2->new_tbl); + idx2 = hash_key2 & t2->mask; + idx2 |= IP_VS_RHT_TABLE_ID_MASK; + } + } else { + idx2 = idx; + } + } -/* lock array for conn table */ -static struct ip_vs_aligned_lock -__ip_vs_conntbl_lock_array[CT_LOCKARRAY_SIZE] __cacheline_aligned; + head = t->buckets + (hash_key & t->mask); + head2 = use2 ? t2->buckets + (hash_key2 & t2->mask) : head; -static inline void ct_write_lock_bh(unsigned int key) -{ - spin_lock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l); + local_bh_disable(); + /* Do not touch seqcount, this is a safe operation */ + + if (idx <= idx2) { + hlist_bl_lock(head); + if (head != head2) + hlist_bl_lock(head2); + } else { + hlist_bl_lock(head2); + hlist_bl_lock(head); + } + if (!new_hash) { + /* Ensure hash_key is read under lock */ + hash_key_new = READ_ONCE(cp->hn0.hash_key); + hash_key_new2 = READ_ONCE(cp->hn1.hash_key); + /* Hash changed ? 
*/ + if (hash_key != hash_key_new || + (hash_key2 != hash_key_new2 && use2)) { + if (head != head2) + hlist_bl_unlock(head2); + hlist_bl_unlock(head); + local_bh_enable(); + hash_key = hash_key_new; + hash_key2 = hash_key_new2; + goto retry; + } + } + *head_ret = head; + *head2_ret = head2; } -static inline void ct_write_unlock_bh(unsigned int key) +static inline void conn_tab_unlock(struct hlist_bl_head *head, + struct hlist_bl_head *head2) { - spin_unlock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l); + if (head != head2) + hlist_bl_unlock(head2); + hlist_bl_unlock(head); + local_bh_enable(); } static void ip_vs_conn_expire(struct timer_list *t); @@ -100,48 +163,70 @@ static void ip_vs_conn_expire(struct timer_list *t); /* * Returns hash value for IPVS connection entry */ -static unsigned int ip_vs_conn_hashkey(struct netns_ipvs *ipvs, int af, unsigned int proto, - const union nf_inet_addr *addr, - __be16 port) +static u32 ip_vs_conn_hashkey(struct ip_vs_rht *t, int af, unsigned int proto, + const union nf_inet_addr *addr, __be16 port, + const union nf_inet_addr *laddr, __be16 lport) { + u64 a = (u32)proto << 16 | (__force u32)port; + u64 d; + #ifdef CONFIG_IP_VS_IPV6 - if (af == AF_INET6) - return (jhash_3words(jhash(addr, 16, ip_vs_conn_rnd), - (__force u32)port, proto, ip_vs_conn_rnd) ^ - ((size_t)ipvs>>8)) & ip_vs_conn_tab_mask; + if (af == AF_INET6) { + u64 b = (u64)addr->all[0] << 32 | addr->all[1]; + u64 c = (u64)addr->all[2] << 32 | addr->all[3]; + + a |= (u64)laddr->all[2] << 32 ^ (__force u32)lport; + c ^= laddr->all[1]; + d = (u64)laddr->all[0] << 32 | laddr->all[3]; + return (u32)siphash_4u64(a, b, c, d, &t->hash_key); + } #endif - return (jhash_3words((__force u32)addr->ip, (__force u32)port, proto, - ip_vs_conn_rnd) ^ - ((size_t)ipvs>>8)) & ip_vs_conn_tab_mask; + a |= (u64)addr->all[0] << 32; + d = (u64)laddr->all[0] << 32 | (__force u32)lport; + return (u32)siphash_2u64(a, d, &t->hash_key); } static unsigned int ip_vs_conn_hashkey_param(const struct ip_vs_conn_param *p, - bool inverse) + struct ip_vs_rht *t, bool inverse) { + const union nf_inet_addr *laddr; const union nf_inet_addr *addr; + __be16 lport; __be16 port; if (p->pe_data && p->pe->hashkey_raw) - return p->pe->hashkey_raw(p, ip_vs_conn_rnd, inverse) & - ip_vs_conn_tab_mask; + return p->pe->hashkey_raw(p, t, inverse); if (likely(!inverse)) { addr = p->caddr; port = p->cport; + laddr = p->vaddr; + lport = p->vport; } else { addr = p->vaddr; port = p->vport; + laddr = p->caddr; + lport = p->cport; } - return ip_vs_conn_hashkey(p->ipvs, p->af, p->protocol, addr, port); + return ip_vs_conn_hashkey(t, p->af, p->protocol, addr, port, laddr, + lport); } -static unsigned int ip_vs_conn_hashkey_conn(const struct ip_vs_conn *cp) +static unsigned int ip_vs_conn_hashkey_conn(struct ip_vs_rht *t, + const struct ip_vs_conn *cp, + bool out) { struct ip_vs_conn_param p; - ip_vs_conn_fill_param(cp->ipvs, cp->af, cp->protocol, - &cp->caddr, cp->cport, NULL, 0, &p); + if (!out) + ip_vs_conn_fill_param(cp->ipvs, cp->af, cp->protocol, + &cp->caddr, cp->cport, &cp->vaddr, + cp->vport, &p); + else + ip_vs_conn_fill_param(cp->ipvs, cp->af, cp->protocol, + &cp->daddr, cp->dport, &cp->caddr, + cp->cport, &p); if (cp->pe) { p.pe = cp->pe; @@ -149,31 +234,51 @@ static unsigned int ip_vs_conn_hashkey_conn(const struct ip_vs_conn *cp) p.pe_data_len = cp->pe_data_len; } - return ip_vs_conn_hashkey_param(&p, false); + return ip_vs_conn_hashkey_param(&p, t, out); } -/* - * Hashes ip_vs_conn in ip_vs_conn_tab by netns,proto,addr,port. 
+/* Hashes ip_vs_conn in conn_tab * returns bool success. */ static inline int ip_vs_conn_hash(struct ip_vs_conn *cp) { - unsigned int hash; + struct netns_ipvs *ipvs = cp->ipvs; + struct hlist_bl_head *head, *head2; + u32 hash_key, hash_key2; + struct ip_vs_rht *t; + u32 hash, hash2; + bool use2; int ret; if (cp->flags & IP_VS_CONN_F_ONE_PACKET) return 0; - /* Hash by protocol, client address and port */ - hash = ip_vs_conn_hashkey_conn(cp); + /* New entries go into recent table */ + t = rcu_dereference(ipvs->conn_tab); + t = rcu_dereference(t->new_tbl); - ct_write_lock_bh(hash); + hash = ip_vs_conn_hashkey_conn(t, cp, false); + hash_key = ip_vs_rht_build_hash_key(t, hash); + if (ip_vs_conn_use_hash2(cp)) { + hash2 = ip_vs_conn_hashkey_conn(t, cp, true); + hash_key2 = ip_vs_rht_build_hash_key(t, hash2); + use2 = true; + } else { + hash_key2 = hash_key; + use2 = false; + } + conn_tab_lock(t, cp, hash_key, hash_key2, use2, true /* new_hash */, + &head, &head2); spin_lock(&cp->lock); if (!(cp->flags & IP_VS_CONN_F_HASHED)) { cp->flags |= IP_VS_CONN_F_HASHED; + WRITE_ONCE(cp->hn0.hash_key, hash_key); + WRITE_ONCE(cp->hn1.hash_key, hash_key2); refcount_inc(&cp->refcnt); - hlist_add_head_rcu(&cp->c_list, &ip_vs_conn_tab[hash]); + hlist_bl_add_head_rcu(&cp->hn0.node, head); + if (use2) + hlist_bl_add_head_rcu(&cp->hn1.node, head2); ret = 1; } else { pr_err("%s(): request for already hashed, called from %pS\n", @@ -182,75 +287,64 @@ static inline int ip_vs_conn_hash(struct ip_vs_conn *cp) } spin_unlock(&cp->lock); - ct_write_unlock_bh(hash); - - return ret; -} - - -/* - * UNhashes ip_vs_conn from ip_vs_conn_tab. - * returns bool success. Caller should hold conn reference. - */ -static inline int ip_vs_conn_unhash(struct ip_vs_conn *cp) -{ - unsigned int hash; - int ret; + conn_tab_unlock(head, head2); - /* unhash it and decrease its reference counter */ - hash = ip_vs_conn_hashkey_conn(cp); - - ct_write_lock_bh(hash); - spin_lock(&cp->lock); - - if (cp->flags & IP_VS_CONN_F_HASHED) { - hlist_del_rcu(&cp->c_list); - cp->flags &= ~IP_VS_CONN_F_HASHED; - refcount_dec(&cp->refcnt); - ret = 1; - } else - ret = 0; - - spin_unlock(&cp->lock); - ct_write_unlock_bh(hash); + /* Schedule resizing if load increases */ + if (atomic_read(&ipvs->conn_count) > t->u_thresh && + !test_and_set_bit(IP_VS_WORK_CONN_RESIZE, &ipvs->work_flags)) + mod_delayed_work(system_unbound_wq, &ipvs->conn_resize_work, 0); return ret; } -/* Try to unlink ip_vs_conn from ip_vs_conn_tab. +/* Try to unlink ip_vs_conn from conn_tab. * returns bool success. 
*/ static inline bool ip_vs_conn_unlink(struct ip_vs_conn *cp) { - unsigned int hash; + struct netns_ipvs *ipvs = cp->ipvs; + struct hlist_bl_head *head, *head2; + u32 hash_key, hash_key2; + struct ip_vs_rht *t; bool ret = false; + bool use2; if (cp->flags & IP_VS_CONN_F_ONE_PACKET) return refcount_dec_if_one(&cp->refcnt); - hash = ip_vs_conn_hashkey_conn(cp); + rcu_read_lock(); + + t = rcu_dereference(ipvs->conn_tab); + hash_key = READ_ONCE(cp->hn0.hash_key); + hash_key2 = READ_ONCE(cp->hn1.hash_key); + use2 = ip_vs_conn_use_hash2(cp); - ct_write_lock_bh(hash); + conn_tab_lock(t, cp, hash_key, hash_key2, use2, false /* new_hash */, + &head, &head2); spin_lock(&cp->lock); if (cp->flags & IP_VS_CONN_F_HASHED) { /* Decrease refcnt and unlink conn only if we are last user */ if (refcount_dec_if_one(&cp->refcnt)) { - hlist_del_rcu(&cp->c_list); + hlist_bl_del_rcu(&cp->hn0.node); + if (use2) + hlist_bl_del_rcu(&cp->hn1.node); cp->flags &= ~IP_VS_CONN_F_HASHED; ret = true; } } spin_unlock(&cp->lock); - ct_write_unlock_bh(hash); + conn_tab_unlock(head, head2); + + rcu_read_unlock(); return ret; } /* - * Gets ip_vs_conn associated with supplied parameters in the ip_vs_conn_tab. + * Gets ip_vs_conn associated with supplied parameters in the conn_tab. * Called for pkts coming from OUTside-to-INside. * p->caddr, p->cport: pkt source address (foreign host) * p->vaddr, p->vport: pkt dest address (load balancer) @@ -258,26 +352,42 @@ static inline bool ip_vs_conn_unlink(struct ip_vs_conn *cp) static inline struct ip_vs_conn * __ip_vs_conn_in_get(const struct ip_vs_conn_param *p) { - unsigned int hash; + DECLARE_IP_VS_RHT_WALK_BUCKET_RCU(); + struct netns_ipvs *ipvs = p->ipvs; + struct ip_vs_conn_hnode *hn; + struct hlist_bl_head *head; + struct ip_vs_rht *t, *pt; + struct hlist_bl_node *e; struct ip_vs_conn *cp; - - hash = ip_vs_conn_hashkey_param(p, false); + u32 hash, hash_key; rcu_read_lock(); - hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[hash], c_list) { - if (p->cport == cp->cport && p->vport == cp->vport && - cp->af == p->af && - ip_vs_addr_equal(p->af, p->caddr, &cp->caddr) && - ip_vs_addr_equal(p->af, p->vaddr, &cp->vaddr) && - ((!p->cport) ^ (!(cp->flags & IP_VS_CONN_F_NO_CPORT))) && - p->protocol == cp->protocol && - cp->ipvs == p->ipvs) { - if (!__ip_vs_conn_get(cp)) - continue; - /* HIT */ - rcu_read_unlock(); - return cp; + ip_vs_rht_for_each_table_rcu(ipvs->conn_tab, t, pt) { + hash = ip_vs_conn_hashkey_param(p, t, false); + hash_key = ip_vs_rht_build_hash_key(t, hash); + ip_vs_rht_walk_bucket_rcu(t, hash_key, head) { + hlist_bl_for_each_entry_rcu(hn, e, head, node) { + if (READ_ONCE(hn->hash_key) != hash_key || + hn->dir != 0) + continue; + cp = ip_vs_hn0_to_conn(hn); + if (p->cport == cp->cport && + p->vport == cp->vport && cp->af == p->af && + ip_vs_addr_equal(p->af, p->caddr, + &cp->caddr) && + ip_vs_addr_equal(p->af, p->vaddr, + &cp->vaddr) && + (!p->cport ^ + (!(cp->flags & IP_VS_CONN_F_NO_CPORT))) && + p->protocol == cp->protocol) { + if (__ip_vs_conn_get(cp)) { + /* HIT */ + rcu_read_unlock(); + return cp; + } + } + } } } @@ -350,37 +460,53 @@ EXPORT_SYMBOL_GPL(ip_vs_conn_in_get_proto); /* Get reference to connection template */ struct ip_vs_conn *ip_vs_ct_in_get(const struct ip_vs_conn_param *p) { - unsigned int hash; + DECLARE_IP_VS_RHT_WALK_BUCKET_RCU(); + struct netns_ipvs *ipvs = p->ipvs; + struct ip_vs_conn_hnode *hn; + struct hlist_bl_head *head; + struct ip_vs_rht *t, *pt; + struct hlist_bl_node *e; struct ip_vs_conn *cp; - - hash = ip_vs_conn_hashkey_param(p, false); + 
u32 hash, hash_key; rcu_read_lock(); - hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[hash], c_list) { - if (unlikely(p->pe_data && p->pe->ct_match)) { - if (cp->ipvs != p->ipvs) - continue; - if (p->pe == cp->pe && p->pe->ct_match(p, cp)) { - if (__ip_vs_conn_get(cp)) - goto out; + ip_vs_rht_for_each_table_rcu(ipvs->conn_tab, t, pt) { + hash = ip_vs_conn_hashkey_param(p, t, false); + hash_key = ip_vs_rht_build_hash_key(t, hash); + ip_vs_rht_walk_bucket_rcu(t, hash_key, head) { + hlist_bl_for_each_entry_rcu(hn, e, head, node) { + if (READ_ONCE(hn->hash_key) != hash_key || + hn->dir != 0) + continue; + cp = ip_vs_hn0_to_conn(hn); + if (unlikely(p->pe_data && p->pe->ct_match)) { + if (p->pe == cp->pe && + p->pe->ct_match(p, cp) && + __ip_vs_conn_get(cp)) + goto out; + continue; + } + if (cp->af == p->af && + ip_vs_addr_equal(p->af, p->caddr, + &cp->caddr) && + /* protocol should only be IPPROTO_IP if + * p->vaddr is a fwmark + */ + ip_vs_addr_equal(p->protocol == IPPROTO_IP ? + AF_UNSPEC : p->af, + p->vaddr, &cp->vaddr) && + p->vport == cp->vport && + p->cport == cp->cport && + cp->flags & IP_VS_CONN_F_TEMPLATE && + p->protocol == cp->protocol && + cp->dport != htons(0xffff)) { + if (__ip_vs_conn_get(cp)) + goto out; + } } - continue; } - if (cp->af == p->af && - ip_vs_addr_equal(p->af, p->caddr, &cp->caddr) && - /* protocol should only be IPPROTO_IP if - * p->vaddr is a fwmark */ - ip_vs_addr_equal(p->protocol == IPPROTO_IP ? AF_UNSPEC : - p->af, p->vaddr, &cp->vaddr) && - p->vport == cp->vport && p->cport == cp->cport && - cp->flags & IP_VS_CONN_F_TEMPLATE && - p->protocol == cp->protocol && - cp->ipvs == p->ipvs) { - if (__ip_vs_conn_get(cp)) - goto out; - } } cp = NULL; @@ -396,58 +522,68 @@ struct ip_vs_conn *ip_vs_ct_in_get(const struct ip_vs_conn_param *p) return cp; } -/* Gets ip_vs_conn associated with supplied parameters in the ip_vs_conn_tab. +/* Gets ip_vs_conn associated with supplied parameters in the conn_tab. * Called for pkts coming from inside-to-OUTside. 
* p->caddr, p->cport: pkt source address (inside host) * p->vaddr, p->vport: pkt dest address (foreign host) */ struct ip_vs_conn *ip_vs_conn_out_get(const struct ip_vs_conn_param *p) { - unsigned int hash; - struct ip_vs_conn *cp, *ret=NULL; + DECLARE_IP_VS_RHT_WALK_BUCKET_RCU(); + struct netns_ipvs *ipvs = p->ipvs; const union nf_inet_addr *saddr; + struct ip_vs_conn_hnode *hn; + struct hlist_bl_head *head; + struct ip_vs_rht *t, *pt; + struct hlist_bl_node *e; + struct ip_vs_conn *cp; + u32 hash, hash_key; __be16 sport; - /* - * Check for "full" addressed entries - */ - hash = ip_vs_conn_hashkey_param(p, true); - rcu_read_lock(); - hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[hash], c_list) { - if (p->vport != cp->cport) - continue; + ip_vs_rht_for_each_table_rcu(ipvs->conn_tab, t, pt) { + hash = ip_vs_conn_hashkey_param(p, t, true); + hash_key = ip_vs_rht_build_hash_key(t, hash); + ip_vs_rht_walk_bucket_rcu(t, hash_key, head) { + hlist_bl_for_each_entry_rcu(hn, e, head, node) { + /* dir can be 0 for DR/TUN */ + if (READ_ONCE(hn->hash_key) != hash_key) + continue; + cp = ip_vs_hn_to_conn(hn); + if (p->vport != cp->cport) + continue; - if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) { - sport = cp->vport; - saddr = &cp->vaddr; - } else { - sport = cp->dport; - saddr = &cp->daddr; - } + if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) { + sport = cp->vport; + saddr = &cp->vaddr; + } else { + sport = cp->dport; + saddr = &cp->daddr; + } - if (p->cport == sport && cp->af == p->af && - ip_vs_addr_equal(p->af, p->vaddr, &cp->caddr) && - ip_vs_addr_equal(p->af, p->caddr, saddr) && - p->protocol == cp->protocol && - cp->ipvs == p->ipvs) { - if (!__ip_vs_conn_get(cp)) - continue; - /* HIT */ - ret = cp; - break; + if (p->cport == sport && cp->af == p->af && + ip_vs_addr_equal(p->af, p->vaddr, + &cp->caddr) && + ip_vs_addr_equal(p->af, p->caddr, saddr) && + p->protocol == cp->protocol) { + if (__ip_vs_conn_get(cp)) + goto out; + } + } } } + cp = NULL; +out: rcu_read_unlock(); IP_VS_DBG_BUF(9, "lookup/out %s %s:%d->%s:%d %s\n", ip_vs_proto_name(p->protocol), IP_VS_DBG_ADDR(p->af, p->caddr), ntohs(p->cport), IP_VS_DBG_ADDR(p->af, p->vaddr), ntohs(p->vport), - ret ? "hit" : "not hit"); + cp ? "hit" : "not hit"); - return ret; + return cp; } struct ip_vs_conn * @@ -492,23 +628,289 @@ void ip_vs_conn_put(struct ip_vs_conn *cp) */ void ip_vs_conn_fill_cport(struct ip_vs_conn *cp, __be16 cport) { - if (ip_vs_conn_unhash(cp)) { - struct netns_ipvs *ipvs = cp->ipvs; - int af_id = ip_vs_af_index(cp->af); + struct hlist_bl_head *head, *head2, *head_new; + bool use2 = ip_vs_conn_use_hash2(cp); + struct netns_ipvs *ipvs = cp->ipvs; + int af_id = ip_vs_af_index(cp->af); + u32 hash_r = 0, hash_key_r = 0; + struct ip_vs_rht *t, *tp, *t2; + struct ip_vs_conn_hnode *hn; + u32 hash_key, hash_key_new; + struct ip_vs_conn_param p; + int ntbl; + int dir; - spin_lock_bh(&cp->lock); - if (cp->flags & IP_VS_CONN_F_NO_CPORT) { + /* No packets from inside, so we can do it in 2 steps. */ + dir = use2 ? 1 : 0; + +next_dir: + if (dir) + ip_vs_conn_fill_param(ipvs, cp->af, cp->protocol, &cp->daddr, + cp->dport, &cp->caddr, cport, &p); + else + ip_vs_conn_fill_param(ipvs, cp->af, cp->protocol, &cp->caddr, + cport, &cp->vaddr, cp->vport, &p); + hn = dir ? &cp->hn1 : &cp->hn0; + ntbl = 0; + + /* Attempt to rehash cp safely, by informing seqcount readers */ + t = rcu_dereference(ipvs->conn_tab); + hash_key = READ_ONCE(hn->hash_key); + tp = NULL; + +retry: + /* Moved to new table ? 
*/ + if (!ip_vs_rht_same_table(t, hash_key)) { + t = rcu_dereference(t->new_tbl); + ntbl++; + /* We are lost? */ + if (ntbl >= 2) + return; + } + + /* Rehashing during resize? Use the recent table for adds */ + t2 = rcu_dereference(t->new_tbl); + /* Calc new hash once per table */ + if (tp != t2) { + hash_r = ip_vs_conn_hashkey_param(&p, t2, dir); + hash_key_r = ip_vs_rht_build_hash_key(t2, hash_r); + tp = t2; + } + head = t->buckets + (hash_key & t->mask); + head2 = t2->buckets + (hash_key_r & t2->mask); + head_new = head2; + + if (head > head2 && t == t2) + swap(head, head2); + + /* Lock seqcount only for the old bucket, even if we are on new table + * because it affects the del operation, not the adding. + */ + spin_lock_bh(&t->lock[hash_key & t->lock_mask].l); + preempt_disable_nested(); + write_seqcount_begin(&t->seqc[hash_key & t->seqc_mask]); + + /* Lock buckets in same (increasing) order */ + hlist_bl_lock(head); + if (head != head2) + hlist_bl_lock(head2); + + /* Ensure hash_key is read under lock */ + hash_key_new = READ_ONCE(hn->hash_key); + /* Racing with another rehashing ? */ + if (unlikely(hash_key != hash_key_new)) { + if (head != head2) + hlist_bl_unlock(head2); + hlist_bl_unlock(head); + write_seqcount_end(&t->seqc[hash_key & t->seqc_mask]); + preempt_enable_nested(); + spin_unlock_bh(&t->lock[hash_key & t->lock_mask].l); + hash_key = hash_key_new; + goto retry; + } + + spin_lock(&cp->lock); + if ((cp->flags & IP_VS_CONN_F_NO_CPORT) && + (cp->flags & IP_VS_CONN_F_HASHED)) { + /* We do not recalc hash_key_r under lock, we assume the + * parameters in cp do not change, i.e. cport is + * the only possible change. + */ + WRITE_ONCE(hn->hash_key, hash_key_r); + if (!use2) + WRITE_ONCE(cp->hn1.hash_key, hash_key_r); + /* For dir=1 we do not check in flags if hn is already + * rehashed but this check will do it. 
+ */ + if (head != head2) { + hlist_bl_del_rcu(&hn->node); + hlist_bl_add_head_rcu(&hn->node, head_new); + } + if (!dir) { atomic_dec(&ipvs->no_cport_conns[af_id]); cp->flags &= ~IP_VS_CONN_F_NO_CPORT; cp->cport = cport; } - spin_unlock_bh(&cp->lock); - - /* hash on new dport */ - ip_vs_conn_hash(cp); } + spin_unlock(&cp->lock); + + if (head != head2) + hlist_bl_unlock(head2); + hlist_bl_unlock(head); + write_seqcount_end(&t->seqc[hash_key & t->seqc_mask]); + preempt_enable_nested(); + spin_unlock_bh(&t->lock[hash_key & t->lock_mask].l); + if (dir--) + goto next_dir; } +/* Get default load factor to map conn_count/u_thresh to t->size */ +static int ip_vs_conn_default_load_factor(struct netns_ipvs *ipvs) +{ + int factor; + + if (net_eq(ipvs->net, &init_net)) + factor = -3; + else + factor = -1; + /* Double hashing adds twice more nodes for NAT */ + factor--; + return factor; +} + +/* Get the desired conn_tab size */ +int ip_vs_conn_desired_size(struct netns_ipvs *ipvs, struct ip_vs_rht *t, + int lfactor) +{ + return ip_vs_rht_desired_size(ipvs, t, atomic_read(&ipvs->conn_count), + lfactor, IP_VS_CONN_TAB_MIN_BITS, + ip_vs_conn_tab_bits); +} + +/* Allocate conn_tab */ +struct ip_vs_rht *ip_vs_conn_tab_alloc(struct netns_ipvs *ipvs, int buckets, + int lfactor) +{ + struct ip_vs_rht *t; + int scounts, locks; + + /* scounts: affects readers during resize */ + scounts = clamp(buckets >> 6, 1, 256); + /* locks: based on parallel IP_VS_CONN_F_NO_CPORT operations + resize */ + locks = clamp(8, 1, scounts); + + t = ip_vs_rht_alloc(buckets, scounts, locks); + if (!t) + return NULL; + t->lfactor = lfactor; + ip_vs_rht_set_thresholds(t, t->size, lfactor, IP_VS_CONN_TAB_MIN_BITS, + ip_vs_conn_tab_bits); + return t; +} + +/* conn_tab resizer work */ +static void conn_resize_work_handler(struct work_struct *work) +{ + struct hlist_bl_head *head, *head2; + unsigned int resched_score = 0; + struct hlist_bl_node *cn, *nn; + struct ip_vs_rht *t, *t_new; + struct ip_vs_conn_hnode *hn; + struct netns_ipvs *ipvs; + struct ip_vs_conn *cp; + bool more_work = false; + u32 hash, hash_key; + int limit = 0; + int new_size; + int lfactor; + u32 bucket; + + ipvs = container_of(work, struct netns_ipvs, conn_resize_work.work); + + /* Allow work to be queued again */ + clear_bit(IP_VS_WORK_CONN_RESIZE, &ipvs->work_flags); + t = rcu_dereference_protected(ipvs->conn_tab, 1); + /* Do nothing if table is removed */ + if (!t) + goto out; + /* New table needs to be registered? BUG! */ + if (t != rcu_dereference_protected(t->new_tbl, 1)) + goto out; + + lfactor = sysctl_conn_lfactor(ipvs); + /* Should we resize ? */ + new_size = ip_vs_conn_desired_size(ipvs, t, lfactor); + if (new_size == t->size && lfactor == t->lfactor) + goto out; + + t_new = ip_vs_conn_tab_alloc(ipvs, new_size, lfactor); + if (!t_new) { + more_work = true; + goto out; + } + /* Flip the table_id */ + t_new->table_id = t->table_id ^ IP_VS_RHT_TABLE_ID_MASK; + + rcu_assign_pointer(t->new_tbl, t_new); + + /* Wait RCU readers to see the new table, we do not want new + * conns to go into old table and to be left there. + */ + synchronize_rcu(); + + ip_vs_rht_for_each_bucket(t, bucket, head) { +same_bucket: + if (++limit >= 16) { + if (resched_score >= 100) { + resched_score = 0; + cond_resched(); + } + limit = 0; + } + if (hlist_bl_empty(head)) { + resched_score++; + continue; + } + /* Preemption calls ahead... 
*/ + resched_score = 0; + + /* seqcount_t usage considering PREEMPT_RT rules: + * - other writers (SoftIRQ) => serialize with spin_lock_bh + * - readers (SoftIRQ) => disable BHs + * - readers (processes) => preemption should be disabled + */ + spin_lock_bh(&t->lock[bucket & t->lock_mask].l); + preempt_disable_nested(); + write_seqcount_begin(&t->seqc[bucket & t->seqc_mask]); + hlist_bl_lock(head); + + hlist_bl_for_each_entry_safe(hn, cn, nn, head, node) { + cp = ip_vs_hn_to_conn(hn); + hash = ip_vs_conn_hashkey_conn(t_new, cp, hn->dir); + hash_key = ip_vs_rht_build_hash_key(t_new, hash); + + head2 = t_new->buckets + (hash & t_new->mask); + hlist_bl_lock(head2); + /* t_new->seqc are not used at this stage, we race + * only with add/del, so only lock the bucket. + */ + hlist_bl_del_rcu(&hn->node); + WRITE_ONCE(hn->hash_key, hash_key); + /* Keep both hash keys in sync if no double hashing */ + if (!ip_vs_conn_use_hash2(cp)) + WRITE_ONCE(cp->hn1.hash_key, hash_key); + hlist_bl_add_head_rcu(&hn->node, head2); + hlist_bl_unlock(head2); + /* Too long chain? Do it in steps */ + if (++limit >= 64) + break; + } + + hlist_bl_unlock(head); + write_seqcount_end(&t->seqc[bucket & t->seqc_mask]); + preempt_enable_nested(); + spin_unlock_bh(&t->lock[bucket & t->lock_mask].l); + if (limit >= 64) + goto same_bucket; + } + + rcu_assign_pointer(ipvs->conn_tab, t_new); + /* Inform readers that new table is installed */ + smp_mb__before_atomic(); + atomic_inc(&ipvs->conn_tab_changes); + + /* RCU readers should not see more than two tables in chain. + * To prevent new table to be attached wait here instead of + * freeing the old table in RCU callback. + */ + synchronize_rcu(); + ip_vs_rht_free(t); + +out: + /* Monitor if we need to shrink table */ + queue_delayed_work(system_unbound_wq, &ipvs->conn_resize_work, + more_work ? 1 : 2 * HZ); +} /* * Bind a connection entry with the corresponding packet_xmit. @@ -792,17 +1194,11 @@ int ip_vs_check_template(struct ip_vs_conn *ct, struct ip_vs_dest *cdest) IP_VS_DBG_ADDR(ct->daf, &ct->daddr), ntohs(ct->dport)); - /* - * Invalidate the connection template + /* Invalidate the connection template. Prefer to avoid + * rehashing, it will move it as first in chain, so use + * only dport as indication, it is not a hash key. 
*/ - if (ct->vport != htons(0xffff)) { - if (ip_vs_conn_unhash(ct)) { - ct->dport = htons(0xffff); - ct->vport = htons(0xffff); - ct->cport = 0; - ip_vs_conn_hash(ct); - } - } + ct->dport = htons(0xffff); /* * Simply decrease the refcnt of the template, @@ -943,7 +1339,7 @@ void ip_vs_conn_expire_now(struct ip_vs_conn *cp) /* - * Create a new connection entry and hash it into the ip_vs_conn_tab + * Create a new connection entry and hash it into the conn_tab */ struct ip_vs_conn * ip_vs_conn_new(const struct ip_vs_conn_param *p, int dest_af, @@ -961,10 +1357,13 @@ ip_vs_conn_new(const struct ip_vs_conn_param *p, int dest_af, return NULL; } - INIT_HLIST_NODE(&cp->c_list); + INIT_HLIST_BL_NODE(&cp->hn0.node); + INIT_HLIST_BL_NODE(&cp->hn1.node); timer_setup(&cp->timer, ip_vs_conn_expire, 0); cp->ipvs = ipvs; + cp->hn0.dir = 0; cp->af = p->af; + cp->hn1.dir = 1; cp->daf = dest_af; cp->protocol = p->protocol; ip_vs_addr_set(p->af, &cp->caddr, p->caddr); @@ -1045,7 +1444,7 @@ ip_vs_conn_new(const struct ip_vs_conn_param *p, int dest_af, if (ip_vs_conntrack_enabled(ipvs)) cp->flags |= IP_VS_CONN_F_NFCT; - /* Hash it in the ip_vs_conn_tab finally */ + /* Hash it in the conn_tab finally */ ip_vs_conn_hash(cp); return cp; @@ -1057,32 +1456,50 @@ ip_vs_conn_new(const struct ip_vs_conn_param *p, int dest_af, #ifdef CONFIG_PROC_FS struct ip_vs_iter_state { struct seq_net_private p; - unsigned int bucket; + struct ip_vs_rht *t; + int gen; + u32 bucket; unsigned int skip_elems; }; -static void *ip_vs_conn_array(struct ip_vs_iter_state *iter) +static void *ip_vs_conn_array(struct seq_file *seq) { + struct ip_vs_iter_state *iter = seq->private; + struct net *net = seq_file_net(seq); + struct netns_ipvs *ipvs = net_ipvs(net); + struct ip_vs_rht *t = iter->t; + struct ip_vs_conn_hnode *hn; + struct hlist_bl_node *e; int idx; - struct ip_vs_conn *cp; - for (idx = iter->bucket; idx < ip_vs_conn_tab_size; idx++) { + if (!t) + return NULL; + for (idx = iter->bucket; idx < t->size; idx++) { unsigned int skip = 0; - hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[idx], c_list) { + hlist_bl_for_each_entry_rcu(hn, e, &t->buckets[idx], node) { /* __ip_vs_conn_get() is not needed by * ip_vs_conn_seq_show and ip_vs_conn_sync_seq_show */ + if (!ip_vs_rht_same_table(t, READ_ONCE(hn->hash_key))) + break; + if (hn->dir != 0) + continue; if (skip >= iter->skip_elems) { iter->bucket = idx; - return cp; + return hn; } ++skip; } + if (!(idx & 31)) { + cond_resched_rcu(); + /* New table installed ? 
*/ + if (iter->gen != atomic_read(&ipvs->conn_tab_changes)) + break; + } iter->skip_elems = 0; - cond_resched_rcu(); } iter->bucket = idx; @@ -1093,38 +1510,52 @@ static void *ip_vs_conn_seq_start(struct seq_file *seq, loff_t *pos) __acquires(RCU) { struct ip_vs_iter_state *iter = seq->private; + struct net *net = seq_file_net(seq); + struct netns_ipvs *ipvs = net_ipvs(net); rcu_read_lock(); + iter->gen = atomic_read(&ipvs->conn_tab_changes); + smp_rmb(); /* ipvs->conn_tab and conn_tab_changes */ + iter->t = rcu_dereference(ipvs->conn_tab); if (*pos == 0) { iter->skip_elems = 0; iter->bucket = 0; return SEQ_START_TOKEN; } - return ip_vs_conn_array(iter); + return ip_vs_conn_array(seq); } static void *ip_vs_conn_seq_next(struct seq_file *seq, void *v, loff_t *pos) { - struct ip_vs_conn *cp = v; struct ip_vs_iter_state *iter = seq->private; - struct hlist_node *e; + struct ip_vs_conn_hnode *hn = v; + struct hlist_bl_node *e; + struct ip_vs_rht *t; ++*pos; if (v == SEQ_START_TOKEN) - return ip_vs_conn_array(iter); + return ip_vs_conn_array(seq); + + t = iter->t; + if (!t) + return NULL; /* more on same hash chain? */ - e = rcu_dereference(hlist_next_rcu(&cp->c_list)); - if (e) { + hlist_bl_for_each_entry_continue_rcu(hn, e, node) { + /* Our cursor was moved to new table ? */ + if (!ip_vs_rht_same_table(t, READ_ONCE(hn->hash_key))) + break; + if (hn->dir != 0) + continue; iter->skip_elems++; - return hlist_entry(e, struct ip_vs_conn, c_list); + return hn; } iter->skip_elems = 0; iter->bucket++; - return ip_vs_conn_array(iter); + return ip_vs_conn_array(seq); } static void ip_vs_conn_seq_stop(struct seq_file *seq, void *v) @@ -1140,14 +1571,12 @@ static int ip_vs_conn_seq_show(struct seq_file *seq, void *v) seq_puts(seq, "Pro FromIP FPrt ToIP TPrt DestIP DPrt State Expires PEName PEData\n"); else { - const struct ip_vs_conn *cp = v; - struct net *net = seq_file_net(seq); + struct ip_vs_conn_hnode *hn = v; + const struct ip_vs_conn *cp = ip_vs_hn0_to_conn(hn); char pe_data[IP_VS_PENAME_MAXLEN + IP_VS_PEDATA_MAXLEN + 3]; size_t len = 0; char dbuf[IP_VS_ADDRSTRLEN]; - if (!net_eq(cp->ipvs->net, net)) - return 0; if (cp->pe_data) { pe_data[0] = ' '; len = strlen(cp->pe->name); @@ -1219,10 +1648,6 @@ static int ip_vs_conn_sync_seq_show(struct seq_file *seq, void *v) "Pro FromIP FPrt ToIP TPrt DestIP DPrt State Origin Expires\n"); else { const struct ip_vs_conn *cp = v; - struct net *net = seq_file_net(seq); - - if (!net_eq(cp->ipvs->net, net)) - return 0; #ifdef CONFIG_IP_VS_IPV6 if (cp->daf == AF_INET6) @@ -1312,22 +1737,33 @@ static inline bool ip_vs_conn_ops_mode(struct ip_vs_conn *cp) return svc && (svc->flags & IP_VS_SVC_F_ONEPACKET); } -/* Called from keventd and must protect itself from softirqs */ void ip_vs_random_dropentry(struct netns_ipvs *ipvs) { - int idx; + struct ip_vs_conn_hnode *hn; + struct hlist_bl_node *e; struct ip_vs_conn *cp; + struct ip_vs_rht *t; + unsigned int r; + int idx; + r = get_random_u32(); rcu_read_lock(); + t = rcu_dereference(ipvs->conn_tab); + if (!t) + goto out; /* * Randomly scan 1/32 of the whole table every second */ - for (idx = 0; idx < (ip_vs_conn_tab_size>>5); idx++) { - unsigned int hash = get_random_u32() & ip_vs_conn_tab_mask; + for (idx = 0; idx < (t->size >> 5); idx++) { + unsigned int hash = (r + idx) & t->mask; - hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[hash], c_list) { - if (cp->ipvs != ipvs) + /* Don't care if due to moved entry we jump to another bucket + * and even to new table + */ + hlist_bl_for_each_entry_rcu(hn, e, &t->buckets[hash], 
node) { + if (hn->dir != 0) continue; + cp = ip_vs_hn0_to_conn(hn); if (atomic_read(&cp->n_control)) continue; if (cp->flags & IP_VS_CONN_F_TEMPLATE) { @@ -1374,27 +1810,43 @@ drop: IP_VS_DBG(4, "drop connection\n"); ip_vs_conn_del(cp); } - cond_resched_rcu(); + if (!(idx & 31)) { + cond_resched_rcu(); + t = rcu_dereference(ipvs->conn_tab); + if (!t) + goto out; + } } + +out: rcu_read_unlock(); } #endif -/* - * Flush all the connection entries in the ip_vs_conn_tab - */ +/* Flush all the connection entries in the conn_tab */ static void ip_vs_conn_flush(struct netns_ipvs *ipvs) { - int idx; + DECLARE_IP_VS_RHT_WALK_BUCKETS_SAFE_RCU(); struct ip_vs_conn *cp, *cp_c; + struct ip_vs_conn_hnode *hn; + struct hlist_bl_head *head; + struct ip_vs_rht *t, *p; + struct hlist_bl_node *e; + + if (!rcu_dereference_protected(ipvs->conn_tab, 1)) + return; + cancel_delayed_work_sync(&ipvs->conn_resize_work); + if (!atomic_read(&ipvs->conn_count)) + goto unreg; flush_again: + /* Rely on RCU grace period while accessing cp after ip_vs_conn_del */ rcu_read_lock(); - for (idx = 0; idx < ip_vs_conn_tab_size; idx++) { - - hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[idx], c_list) { - if (cp->ipvs != ipvs) + ip_vs_rht_walk_buckets_safe_rcu(ipvs->conn_tab, head) { + hlist_bl_for_each_entry_rcu(hn, e, head, node) { + if (hn->dir != 0) continue; + cp = ip_vs_hn0_to_conn(hn); if (atomic_read(&cp->n_control)) continue; cp_c = cp->control; @@ -1415,21 +1867,51 @@ flush_again: schedule(); goto flush_again; } + +unreg: + /* Unregister the hash table and release it after RCU grace period. + * This is needed because other works may not be stopped yet and + * they may walk the tables. + */ + t = rcu_dereference_protected(ipvs->conn_tab, 1); + rcu_assign_pointer(ipvs->conn_tab, NULL); + /* Inform readers that conn_tab is changed */ + smp_mb__before_atomic(); + atomic_inc(&ipvs->conn_tab_changes); + while (1) { + p = rcu_dereference_protected(t->new_tbl, 1); + call_rcu(&t->rcu_head, ip_vs_rht_rcu_free); + if (p == t) + break; + t = p; + } } #ifdef CONFIG_SYSCTL void ip_vs_expire_nodest_conn_flush(struct netns_ipvs *ipvs) { - int idx; + DECLARE_IP_VS_RHT_WALK_BUCKETS_RCU(); + unsigned int resched_score = 0; struct ip_vs_conn *cp, *cp_c; + struct ip_vs_conn_hnode *hn; + struct hlist_bl_head *head; struct ip_vs_dest *dest; + struct hlist_bl_node *e; + int old_gen, new_gen; + if (!atomic_read(&ipvs->conn_count)) + return; + old_gen = atomic_read(&ipvs->conn_tab_changes); rcu_read_lock(); - for (idx = 0; idx < ip_vs_conn_tab_size; idx++) { - hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[idx], c_list) { - if (cp->ipvs != ipvs) - continue; +repeat: + smp_rmb(); /* ipvs->conn_tab and conn_tab_changes */ + ip_vs_rht_walk_buckets_rcu(ipvs->conn_tab, head) { + hlist_bl_for_each_entry_rcu(hn, e, head, node) { + if (hn->dir != 0) + continue; + cp = ip_vs_hn0_to_conn(hn); + resched_score++; dest = cp->dest; if (!dest || (dest->flags & IP_VS_DEST_F_AVAILABLE)) continue; @@ -1444,13 +1926,25 @@ void ip_vs_expire_nodest_conn_flush(struct netns_ipvs *ipvs) IP_VS_DBG(4, "del controlling connection\n"); ip_vs_conn_del(cp_c); } + resched_score += 10; + } + resched_score++; + if (resched_score >= 100) { + resched_score = 0; + cond_resched_rcu(); + /* netns clean up started, abort delayed work */ + if (!READ_ONCE(ipvs->enable)) + goto out; + new_gen = atomic_read(&ipvs->conn_tab_changes); + /* New table installed ? 
*/ + if (old_gen != new_gen) { + old_gen = new_gen; + goto repeat; + } } - cond_resched_rcu(); - - /* netns clean up started, abort delayed work */ - if (!READ_ONCE(ipvs->enable)) - break; } + +out: rcu_read_unlock(); } #endif @@ -1465,6 +1959,10 @@ int __net_init ip_vs_conn_net_init(struct netns_ipvs *ipvs) atomic_set(&ipvs->conn_count, 0); for (idx = 0; idx < IP_VS_AF_MAX; idx++) atomic_set(&ipvs->no_cport_conns[idx], 0); + INIT_DELAYED_WORK(&ipvs->conn_resize_work, conn_resize_work_handler); + RCU_INIT_POINTER(ipvs->conn_tab, NULL); + atomic_set(&ipvs->conn_tab_changes, 0); + ipvs->sysctl_conn_lfactor = ip_vs_conn_default_load_factor(ipvs); #ifdef CONFIG_PROC_FS if (!proc_create_net("ip_vs_conn", 0, ipvs->net->proc_net, @@ -1500,56 +1998,36 @@ void __net_exit ip_vs_conn_net_cleanup(struct netns_ipvs *ipvs) int __init ip_vs_conn_init(void) { + int min = IP_VS_CONN_TAB_MIN_BITS; + int max = IP_VS_CONN_TAB_MAX_BITS; size_t tab_array_size; int max_avail; -#if BITS_PER_LONG > 32 - int max = 27; -#else - int max = 20; -#endif - int min = 8; - int idx; max_avail = order_base_2(totalram_pages()) + PAGE_SHIFT; - max_avail -= 2; /* ~4 in hash row */ + /* 64-bit: 27 bits at 64GB, 32-bit: 20 bits at 512MB */ + max_avail += 1; /* hash table loaded at 50% */ max_avail -= 1; /* IPVS up to 1/2 of mem */ max_avail -= order_base_2(sizeof(struct ip_vs_conn)); max = clamp(max_avail, min, max); ip_vs_conn_tab_bits = clamp(ip_vs_conn_tab_bits, min, max); ip_vs_conn_tab_size = 1 << ip_vs_conn_tab_bits; - ip_vs_conn_tab_mask = ip_vs_conn_tab_size - 1; /* * Allocate the connection hash table and initialize its list heads */ tab_array_size = array_size(ip_vs_conn_tab_size, - sizeof(*ip_vs_conn_tab)); - ip_vs_conn_tab = kvmalloc_objs(*ip_vs_conn_tab, ip_vs_conn_tab_size); - if (!ip_vs_conn_tab) - return -ENOMEM; + sizeof(struct hlist_bl_head)); /* Allocate ip_vs_conn slab cache */ ip_vs_conn_cachep = KMEM_CACHE(ip_vs_conn, SLAB_HWCACHE_ALIGN); - if (!ip_vs_conn_cachep) { - kvfree(ip_vs_conn_tab); + if (!ip_vs_conn_cachep) return -ENOMEM; - } pr_info("Connection hash table configured (size=%d, memory=%zdKbytes)\n", ip_vs_conn_tab_size, tab_array_size / 1024); IP_VS_DBG(0, "Each connection entry needs %zd bytes at least\n", sizeof(struct ip_vs_conn)); - for (idx = 0; idx < ip_vs_conn_tab_size; idx++) - INIT_HLIST_HEAD(&ip_vs_conn_tab[idx]); - - for (idx = 0; idx < CT_LOCKARRAY_SIZE; idx++) { - spin_lock_init(&__ip_vs_conntbl_lock_array[idx].l); - } - - /* calculate the random value for connection hash */ - get_random_bytes(&ip_vs_conn_rnd, sizeof(ip_vs_conn_rnd)); - return 0; } @@ -1559,5 +2037,4 @@ void ip_vs_conn_cleanup(void) rcu_barrier(); /* Release the empty cache */ kmem_cache_destroy(ip_vs_conn_cachep); - kvfree(ip_vs_conn_tab); } diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c index 869f18e0e835..f5b7a2047291 100644 --- a/net/netfilter/ipvs/ip_vs_core.c +++ b/net/netfilter/ipvs/ip_vs_core.c @@ -117,6 +117,185 @@ void ip_vs_init_hash_table(struct list_head *table, int rows) INIT_LIST_HEAD(&table[rows]); } +/* IPVS Resizable Hash Tables: + * - list_bl buckets with bit lock + * + * Goals: + * - RCU lookup for entry can run in parallel with add/del/move operations + * - hash keys can be on non-contiguous memory + * - support entries with duplicate keys + * - unlink entries without lookup, use the saved table and bucket id + * - resizing can trigger on load change or depending on key refresh period + * - customizable load factor to balance between speed and memory usage + * - 
add/del/move operations should be allowed for any context + * + * Resizing: + * - new table is attached to the current table and all entries are moved + * with new hash key. Finally, the new table is installed as current one and + * the old table is released after RCU grace period. + * - RCU read-side critical sections will walk two tables while resizing is + * in progress + * - new entries are added to the new table + * - entries will be deleted from the old or from the new table, the table_id + * can be saved into entry as part of the hash key to know where the entry is + * hashed + * - move operations may delay readers or to cause retry for the modified + * bucket. As result, searched entry will be found but walkers that operate + * on multiple entries may see same entry twice if bucket walking is retried. + * - for fast path the number of entries (load) can be compared to u_thresh + * and l_thresh to decide when to trigger table growing/shrinking. They + * are calculated based on load factor (shift count), negative value allows + * load to be below 100% to reduce collisions by maintaining larger table + * while positive value tolerates collisions by using smaller table and load + * above 100%: u_thresh(load) = size * (2 ^ lfactor) + * + * Locking: + * - lock: protect seqc if other context except resizer can move entries + * - seqc: seqcount_t, delay/retry readers while entries are moved to + * new table on resizing + * - bit lock: serialize bucket modifications + * - writers may use other locking mechanisms to serialize operations for + * resizing, moving and installing new tables + */ + +void ip_vs_rht_free(struct ip_vs_rht *t) +{ + kvfree(t->buckets); + kvfree(t->seqc); + kvfree(t->lock); + kfree(t); +} + +void ip_vs_rht_rcu_free(struct rcu_head *head) +{ + struct ip_vs_rht *t; + + t = container_of(head, struct ip_vs_rht, rcu_head); + ip_vs_rht_free(t); +} + +struct ip_vs_rht *ip_vs_rht_alloc(int buckets, int scounts, int locks) +{ + struct ip_vs_rht *t = kzalloc(sizeof(*t), GFP_KERNEL); + int i; + + if (!t) + return NULL; + if (scounts) { + int ml = roundup_pow_of_two(nr_cpu_ids); + + scounts = min(scounts, buckets); + scounts = min(scounts, ml); + t->seqc = kvmalloc_array(scounts, sizeof(*t->seqc), GFP_KERNEL); + if (!t->seqc) + goto err; + for (i = 0; i < scounts; i++) + seqcount_init(&t->seqc[i]); + + if (locks) { + locks = min(locks, scounts); + t->lock = kvmalloc_array(locks, sizeof(*t->lock), + GFP_KERNEL); + if (!t->lock) + goto err; + for (i = 0; i < locks; i++) + spin_lock_init(&t->lock[i].l); + } + } + + t->buckets = kvmalloc_array(buckets, sizeof(*t->buckets), GFP_KERNEL); + if (!t->buckets) + goto err; + for (i = 0; i < buckets; i++) + INIT_HLIST_BL_HEAD(&t->buckets[i]); + t->mask = buckets - 1; + t->size = buckets; + t->seqc_mask = scounts - 1; + t->lock_mask = locks - 1; + t->u_thresh = buckets; + t->l_thresh = buckets >> 4; + t->bits = order_base_2(buckets); + /* new_tbl points to self if no new table is filled */ + RCU_INIT_POINTER(t->new_tbl, t); + get_random_bytes(&t->hash_key, sizeof(t->hash_key)); + return t; + +err: + ip_vs_rht_free(t); + return NULL; +} + +/* Get the desired table size for n entries based on current table size and + * by using the formula size = n / (2^lfactor) + * lfactor: shift value for the load factor: + * - >0: u_thresh=size << lfactor, for load factor above 100% + * - <0: u_thresh=size >> -lfactor, for load factor below 100% + * - 0: for load factor of 100% + */ +int ip_vs_rht_desired_size(struct netns_ipvs *ipvs, struct ip_vs_rht *t, int 
n, + int lfactor, int min_bits, int max_bits) +{ + if (!t) + return 1 << min_bits; + n = roundup_pow_of_two(n); + if (lfactor < 0) { + int factor = min(-lfactor, max_bits); + + n = min(n, 1 << (max_bits - factor)); + n <<= factor; + } else { + n = min(n >> lfactor, 1 << max_bits); + } + if (lfactor != t->lfactor) + return clamp(n, 1 << min_bits, 1 << max_bits); + if (n > t->size) + return n; + if (n > t->size >> 4) + return t->size; + /* Shrink but keep it n * 2 to prevent frequent resizing */ + return clamp(n << 1, 1 << min_bits, 1 << max_bits); +} + +/* Set thresholds based on table size and load factor: + * u_thresh = size * (2^lfactor) + * l_thresh = u_thresh / 16 + * u_thresh/l_thresh can be used to check if load triggers a table grow/shrink + */ +void ip_vs_rht_set_thresholds(struct ip_vs_rht *t, int size, int lfactor, + int min_bits, int max_bits) +{ + if (size >= 1 << max_bits) + t->u_thresh = INT_MAX; /* stop growing */ + else if (lfactor <= 0) + t->u_thresh = size >> min(-lfactor, max_bits); + else + t->u_thresh = min(size, 1 << (30 - lfactor)) << lfactor; + + /* l_thresh: shrink when load is 16 times lower, can be 0 */ + if (size >= 1 << max_bits) + t->l_thresh = (1 << max_bits) >> 4; + else if (size > 1 << min_bits) + t->l_thresh = t->u_thresh >> 4; + else + t->l_thresh = 0; /* stop shrinking */ +} + +/* Return hash value for local info (fast, insecure) */ +u32 ip_vs_rht_hash_linfo(struct ip_vs_rht *t, int af, + const union nf_inet_addr *addr, u32 v1, u32 v2) +{ + u32 v3; + +#ifdef CONFIG_IP_VS_IPV6 + if (af == AF_INET6) + v3 = ipv6_addr_hash(&addr->in6); + else +#endif + v3 = addr->all[0]; + + return jhash_3words(v1, v2, v3, (u32)t->hash_key.key[0]); +} + static inline void ip_vs_in_stats(struct ip_vs_conn *cp, struct sk_buff *skb) { diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index f7d454df2b58..032425025d88 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -29,6 +29,7 @@ #include <linux/netfilter.h> #include <linux/netfilter_ipv4.h> #include <linux/mutex.h> +#include <linux/rcupdate_wait.h> #include <net/net_namespace.h> #include <linux/nsproxy.h> @@ -293,47 +294,59 @@ ip_vs_use_count_dec(void) } - +/* Service hashing: + * Operation Locking order + * --------------------------------------------------------------------------- + * add table service_mutex, svc_resize_sem(W) + * del table service_mutex + * move between tables svc_resize_sem(W), seqcount_t(W), bit lock + * add/del service service_mutex, bit lock + * find service RCU, seqcount_t(R) + * walk services(blocking) service_mutex, svc_resize_sem(R) + * walk services(non-blocking) RCU, seqcount_t(R) + * + * - new tables are linked/unlinked under service_mutex and svc_resize_sem + * - new table is linked on resizing and all operations can run in parallel + * in 2 tables until the new table is registered as current one + * - two contexts can modify buckets: config and table resize, both in + * process context + * - only table resizer can move entries, so we do not protect t->seqc[] + * items with t->lock[] + * - lookups occur under RCU lock and seqcount reader lock to detect if + * services are moved to new table + * - move operations may disturb readers: find operation will not miss entries + * but walkers may see same entry twice if they are forced to retry chains + * - walkers using cond_resched_rcu() on !PREEMPT_RCU may need to hold + * service_mutex to disallow new tables to be installed or to check + * svc_table_changes and repeat the RCU read section 
if new table is installed + */ /* * Returns hash value for virtual service */ -static inline unsigned int -ip_vs_svc_hashkey(struct netns_ipvs *ipvs, int af, unsigned int proto, +static inline u32 +ip_vs_svc_hashval(struct ip_vs_rht *t, int af, unsigned int proto, const union nf_inet_addr *addr, __be16 port) { - unsigned int porth = ntohs(port); - __be32 addr_fold = addr->ip; - __u32 ahash; - -#ifdef CONFIG_IP_VS_IPV6 - if (af == AF_INET6) - addr_fold = addr->ip6[0]^addr->ip6[1]^ - addr->ip6[2]^addr->ip6[3]; -#endif - ahash = ntohl(addr_fold); - ahash ^= ((size_t) ipvs >> 8); - - return (proto ^ ahash ^ (porth >> IP_VS_SVC_TAB_BITS) ^ porth) & - IP_VS_SVC_TAB_MASK; + return ip_vs_rht_hash_linfo(t, af, addr, ntohs(port), proto); } /* * Returns hash value of fwmark for virtual service lookup */ -static inline unsigned int ip_vs_svc_fwm_hashkey(struct netns_ipvs *ipvs, __u32 fwmark) +static inline u32 ip_vs_svc_fwm_hashval(struct ip_vs_rht *t, int af, + __u32 fwmark) { - return (((size_t)ipvs>>8) ^ fwmark) & IP_VS_SVC_TAB_MASK; + return jhash_2words(fwmark, af, (u32)t->hash_key.key[0]); } -/* - * Hashes a service in the svc_table by <netns,proto,addr,port> - * or by fwmark. - * Should be called with locked tables. - */ +/* Hashes a service in the svc_table by <proto,addr,port> or by fwmark */ static int ip_vs_svc_hash(struct ip_vs_service *svc) { - unsigned int hash; + struct netns_ipvs *ipvs = svc->ipvs; + struct hlist_bl_head *head; + struct ip_vs_rht *t; + u32 hash; if (svc->flags & IP_VS_SVC_F_HASHED) { pr_err("%s(): request for already hashed, called from %pS\n", @@ -341,23 +354,32 @@ static int ip_vs_svc_hash(struct ip_vs_service *svc) return 0; } + /* increase its refcnt because it is referenced by the svc table */ + atomic_inc(&svc->refcnt); + + /* New entries go into recent table */ + t = rcu_dereference_protected(ipvs->svc_table, 1); + t = rcu_dereference_protected(t->new_tbl, 1); + if (svc->fwmark == 0) { /* - * Hash it by <netns,protocol,addr,port> + * Hash it by <protocol,addr,port> */ - hash = ip_vs_svc_hashkey(svc->ipvs, svc->af, svc->protocol, + hash = ip_vs_svc_hashval(t, svc->af, svc->protocol, &svc->addr, svc->port); } else { /* * Hash it by fwmark */ - hash = ip_vs_svc_fwm_hashkey(svc->ipvs, svc->fwmark); + hash = ip_vs_svc_fwm_hashval(t, svc->af, svc->fwmark); } - hlist_add_head_rcu(&svc->s_list, &svc->ipvs->svc_table[hash]); - + head = t->buckets + (hash & t->mask); + hlist_bl_lock(head); + WRITE_ONCE(svc->hash_key, ip_vs_rht_build_hash_key(t, hash)); svc->flags |= IP_VS_SVC_F_HASHED; - /* increase its refcnt because it is referenced by the svc table */ - atomic_inc(&svc->refcnt); + hlist_bl_add_head_rcu(&svc->s_list, head); + hlist_bl_unlock(head); + return 1; } @@ -368,17 +390,45 @@ static int ip_vs_svc_hash(struct ip_vs_service *svc) */ static int ip_vs_svc_unhash(struct ip_vs_service *svc) { + struct netns_ipvs *ipvs = svc->ipvs; + struct hlist_bl_head *head; + struct ip_vs_rht *t; + u32 hash_key2; + u32 hash_key; + if (!(svc->flags & IP_VS_SVC_F_HASHED)) { pr_err("%s(): request for unhash flagged, called from %pS\n", __func__, __builtin_return_address(0)); return 0; } + t = rcu_dereference_protected(ipvs->svc_table, 1); + hash_key = READ_ONCE(svc->hash_key); + /* We need to lock the bucket in the right table */ + if (ip_vs_rht_same_table(t, hash_key)) { + head = t->buckets + (hash_key & t->mask); + hlist_bl_lock(head); + /* Ensure hash_key is read under lock */ + hash_key2 = READ_ONCE(svc->hash_key); + /* Moved to new table ? 
*/ + if (hash_key != hash_key2) { + hlist_bl_unlock(head); + t = rcu_dereference_protected(t->new_tbl, 1); + head = t->buckets + (hash_key2 & t->mask); + hlist_bl_lock(head); + } + } else { + /* It is already moved to new table */ + t = rcu_dereference_protected(t->new_tbl, 1); + head = t->buckets + (hash_key & t->mask); + hlist_bl_lock(head); + } /* Remove it from svc_table */ - hlist_del_rcu(&svc->s_list); + hlist_bl_del_rcu(&svc->s_list); svc->flags &= ~IP_VS_SVC_F_HASHED; atomic_dec(&svc->refcnt); + hlist_bl_unlock(head); return 1; } @@ -390,18 +440,29 @@ static inline struct ip_vs_service * __ip_vs_service_find(struct netns_ipvs *ipvs, int af, __u16 protocol, const union nf_inet_addr *vaddr, __be16 vport) { - unsigned int hash; + DECLARE_IP_VS_RHT_WALK_BUCKET_RCU(); + struct hlist_bl_head *head; struct ip_vs_service *svc; - - /* Check for "full" addressed entries */ - hash = ip_vs_svc_hashkey(ipvs, af, protocol, vaddr, vport); - - hlist_for_each_entry_rcu(svc, &ipvs->svc_table[hash], s_list) { - if (svc->af == af && ip_vs_addr_equal(af, &svc->addr, vaddr) && - svc->port == vport && svc->protocol == protocol && - !svc->fwmark) { - /* HIT */ - return svc; + struct ip_vs_rht *t, *p; + struct hlist_bl_node *e; + u32 hash, hash_key; + + ip_vs_rht_for_each_table_rcu(ipvs->svc_table, t, p) { + /* Check for "full" addressed entries */ + hash = ip_vs_svc_hashval(t, af, protocol, vaddr, vport); + + hash_key = ip_vs_rht_build_hash_key(t, hash); + ip_vs_rht_walk_bucket_rcu(t, hash_key, head) { + hlist_bl_for_each_entry_rcu(svc, e, head, s_list) { + if (READ_ONCE(svc->hash_key) == hash_key && + svc->af == af && + ip_vs_addr_equal(af, &svc->addr, vaddr) && + svc->port == vport && + svc->protocol == protocol && !svc->fwmark) { + /* HIT */ + return svc; + } + } } } @@ -415,16 +476,26 @@ __ip_vs_service_find(struct netns_ipvs *ipvs, int af, __u16 protocol, static inline struct ip_vs_service * __ip_vs_svc_fwm_find(struct netns_ipvs *ipvs, int af, __u32 fwmark) { - unsigned int hash; + DECLARE_IP_VS_RHT_WALK_BUCKET_RCU(); + struct hlist_bl_head *head; struct ip_vs_service *svc; - - /* Check for fwmark addressed entries */ - hash = ip_vs_svc_fwm_hashkey(ipvs, fwmark); - - hlist_for_each_entry_rcu(svc, &ipvs->svc_table[hash], s_list) { - if (svc->fwmark == fwmark && svc->af == af) { - /* HIT */ - return svc; + struct ip_vs_rht *t, *p; + struct hlist_bl_node *e; + u32 hash, hash_key; + + ip_vs_rht_for_each_table_rcu(ipvs->svc_table, t, p) { + /* Check for fwmark addressed entries */ + hash = ip_vs_svc_fwm_hashval(t, af, fwmark); + + hash_key = ip_vs_rht_build_hash_key(t, hash); + ip_vs_rht_walk_bucket_rcu(t, hash_key, head) { + hlist_bl_for_each_entry_rcu(svc, e, head, s_list) { + if (READ_ONCE(svc->hash_key) == hash_key && + svc->fwmark == fwmark && svc->af == af) { + /* HIT */ + return svc; + } + } } } @@ -487,6 +558,220 @@ ip_vs_service_find(struct netns_ipvs *ipvs, int af, __u32 fwmark, __u16 protocol return svc; } +/* Return the number of registered services */ +static int ip_vs_get_num_services(struct netns_ipvs *ipvs) +{ + int ns = 0, ni = IP_VS_AF_MAX; + + while (--ni >= 0) + ns += atomic_read(&ipvs->num_services[ni]); + return ns; +} + +/* Get default load factor to map num_services/u_thresh to t->size */ +static int ip_vs_svc_default_load_factor(struct netns_ipvs *ipvs) +{ + int factor; + + if (net_eq(ipvs->net, &init_net)) + factor = -3; /* grow if load is above 12.5% */ + else + factor = -2; /* grow if load is above 25% */ + return factor; +} + +/* Get the desired svc_table size */ +static int 
ip_vs_svc_desired_size(struct netns_ipvs *ipvs, struct ip_vs_rht *t, + int lfactor) +{ + return ip_vs_rht_desired_size(ipvs, t, ip_vs_get_num_services(ipvs), + lfactor, IP_VS_SVC_TAB_MIN_BITS, + IP_VS_SVC_TAB_MAX_BITS); +} + +/* Allocate svc_table */ +static struct ip_vs_rht *ip_vs_svc_table_alloc(struct netns_ipvs *ipvs, + int buckets, int lfactor) +{ + struct ip_vs_rht *t; + int scounts, locks; + + /* No frequent lookups to race with resizing, so use max of 64 + * seqcounts. Only resizer moves entries, so use 0 locks. + */ + scounts = clamp(buckets >> 4, 1, 64); + locks = 0; + + t = ip_vs_rht_alloc(buckets, scounts, locks); + if (!t) + return NULL; + t->lfactor = lfactor; + ip_vs_rht_set_thresholds(t, t->size, lfactor, IP_VS_SVC_TAB_MIN_BITS, + IP_VS_SVC_TAB_MAX_BITS); + return t; +} + +/* svc_table resizer work */ +static void svc_resize_work_handler(struct work_struct *work) +{ + struct hlist_bl_head *head, *head2; + struct ip_vs_rht *t_free = NULL; + unsigned int resched_score = 0; + struct hlist_bl_node *cn, *nn; + struct ip_vs_rht *t, *t_new; + struct ip_vs_service *svc; + struct netns_ipvs *ipvs; + bool more_work = true; + seqcount_t *sc; + int limit = 0; + int new_size; + int lfactor; + u32 bucket; + + ipvs = container_of(work, struct netns_ipvs, svc_resize_work.work); + + if (!down_write_trylock(&ipvs->svc_resize_sem)) + goto out; + if (!mutex_trylock(&ipvs->service_mutex)) + goto unlock_sem; + more_work = false; + clear_bit(IP_VS_WORK_SVC_RESIZE, &ipvs->work_flags); + if (!READ_ONCE(ipvs->enable) || + test_bit(IP_VS_WORK_SVC_NORESIZE, &ipvs->work_flags)) + goto unlock_m; + t = rcu_dereference_protected(ipvs->svc_table, 1); + /* Do nothing if table is removed */ + if (!t) + goto unlock_m; + /* New table needs to be registered? BUG! */ + if (t != rcu_dereference_protected(t->new_tbl, 1)) + goto unlock_m; + + lfactor = sysctl_svc_lfactor(ipvs); + /* Should we resize ? */ + new_size = ip_vs_svc_desired_size(ipvs, t, lfactor); + if (new_size == t->size && lfactor == t->lfactor) + goto unlock_m; + + t_new = ip_vs_svc_table_alloc(ipvs, new_size, lfactor); + if (!t_new) { + more_work = true; + goto unlock_m; + } + /* Flip the table_id */ + t_new->table_id = t->table_id ^ IP_VS_RHT_TABLE_ID_MASK; + + rcu_assign_pointer(t->new_tbl, t_new); + /* Allow add/del to new_tbl while moving from old table */ + mutex_unlock(&ipvs->service_mutex); + + ip_vs_rht_for_each_bucket(t, bucket, head) { +same_bucket: + if (++limit >= 16) { + if (!READ_ONCE(ipvs->enable) || + test_bit(IP_VS_WORK_SVC_NORESIZE, + &ipvs->work_flags)) + goto unlock_sem; + if (resched_score >= 100) { + resched_score = 0; + cond_resched(); + } + limit = 0; + } + if (hlist_bl_empty(head)) { + resched_score++; + continue; + } + /* Preemption calls ahead... 
*/ + resched_score = 0; + + sc = &t->seqc[bucket & t->seqc_mask]; + /* seqcount_t usage considering PREEMPT_RT rules: + * - we are the only writer => preemption can be allowed + * - readers (SoftIRQ) => disable BHs + * - readers (processes) => preemption should be disabled + */ + local_bh_disable(); + preempt_disable_nested(); + write_seqcount_begin(sc); + hlist_bl_lock(head); + + hlist_bl_for_each_entry_safe(svc, cn, nn, head, s_list) { + u32 hash; + + /* New hash for the new table */ + if (svc->fwmark == 0) { + /* Hash it by <protocol,addr,port> */ + hash = ip_vs_svc_hashval(t_new, svc->af, + svc->protocol, + &svc->addr, svc->port); + } else { + /* Hash it by fwmark */ + hash = ip_vs_svc_fwm_hashval(t_new, svc->af, + svc->fwmark); + } + hlist_bl_del_rcu(&svc->s_list); + head2 = t_new->buckets + (hash & t_new->mask); + + hlist_bl_lock(head2); + WRITE_ONCE(svc->hash_key, + ip_vs_rht_build_hash_key(t_new, hash)); + /* t_new->seqc are not used at this stage, we race + * only with add/del, so only lock the bucket. + */ + hlist_bl_add_head_rcu(&svc->s_list, head2); + hlist_bl_unlock(head2); + /* Too long chain? Do it in steps */ + if (++limit >= 64) + break; + } + + hlist_bl_unlock(head); + write_seqcount_end(sc); + preempt_enable_nested(); + local_bh_enable(); + if (limit >= 64) + goto same_bucket; + } + + /* Tables can be switched only under service_mutex */ + while (!mutex_trylock(&ipvs->service_mutex)) { + cond_resched(); + if (!READ_ONCE(ipvs->enable) || + test_bit(IP_VS_WORK_SVC_NORESIZE, &ipvs->work_flags)) + goto unlock_sem; + } + if (!READ_ONCE(ipvs->enable) || + test_bit(IP_VS_WORK_SVC_NORESIZE, &ipvs->work_flags)) + goto unlock_m; + + rcu_assign_pointer(ipvs->svc_table, t_new); + /* Inform readers that new table is installed */ + smp_mb__before_atomic(); + atomic_inc(&ipvs->svc_table_changes); + t_free = t; + +unlock_m: + mutex_unlock(&ipvs->service_mutex); + +unlock_sem: + up_write(&ipvs->svc_resize_sem); + + if (t_free) { + /* RCU readers should not see more than two tables in chain. + * To prevent new table to be attached wait here instead of + * freeing the old table in RCU callback. 
+ */ + synchronize_rcu(); + ip_vs_rht_free(t_free); + } + +out: + if (!READ_ONCE(ipvs->enable) || !more_work || + test_bit(IP_VS_WORK_SVC_NORESIZE, &ipvs->work_flags)) + return; + queue_delayed_work(system_unbound_wq, &ipvs->svc_resize_work, 1); +} static inline void __ip_vs_bind_svc(struct ip_vs_dest *dest, struct ip_vs_service *svc) @@ -1357,12 +1642,14 @@ static int ip_vs_add_service(struct netns_ipvs *ipvs, struct ip_vs_service_user_kern *u, struct ip_vs_service **svc_p) { - int ret = 0; struct ip_vs_scheduler *sched = NULL; + struct ip_vs_rht *tc_new = NULL; + struct ip_vs_rht *t, *t_new = NULL; int af_id = ip_vs_af_index(u->af); - struct ip_vs_pe *pe = NULL; struct ip_vs_service *svc = NULL; + struct ip_vs_pe *pe = NULL; int ret_hooks = -1; + int ret = 0; /* increase the module use count */ if (!ip_vs_use_count_inc()) @@ -1404,6 +1691,29 @@ ip_vs_add_service(struct netns_ipvs *ipvs, struct ip_vs_service_user_kern *u, } #endif + t = rcu_dereference_protected(ipvs->svc_table, 1); + if (!t) { + int lfactor = sysctl_svc_lfactor(ipvs); + int new_size = ip_vs_svc_desired_size(ipvs, NULL, lfactor); + + t_new = ip_vs_svc_table_alloc(ipvs, new_size, lfactor); + if (!t_new) { + ret = -ENOMEM; + goto out_err; + } + } + + if (!rcu_dereference_protected(ipvs->conn_tab, 1)) { + int lfactor = sysctl_conn_lfactor(ipvs); + int new_size = ip_vs_conn_desired_size(ipvs, NULL, lfactor); + + tc_new = ip_vs_conn_tab_alloc(ipvs, new_size, lfactor); + if (!tc_new) { + ret = -ENOMEM; + goto out_err; + } + } + if (!atomic_read(&ipvs->num_services[af_id])) { ret = ip_vs_register_hooks(ipvs, u->af); if (ret < 0) @@ -1449,6 +1759,16 @@ ip_vs_add_service(struct netns_ipvs *ipvs, struct ip_vs_service_user_kern *u, if (ret < 0) goto out_err; + if (t_new) { + clear_bit(IP_VS_WORK_SVC_NORESIZE, &ipvs->work_flags); + rcu_assign_pointer(ipvs->svc_table, t_new); + t_new = NULL; + } + if (tc_new) { + rcu_assign_pointer(ipvs->conn_tab, tc_new); + tc_new = NULL; + } + /* Update the virtual service counters */ if (svc->port == FTPPORT) atomic_inc(&ipvs->ftpsvc_counter[af_id]); @@ -1470,6 +1790,12 @@ ip_vs_add_service(struct netns_ipvs *ipvs, struct ip_vs_service_user_kern *u, /* Hash the service into the service table */ ip_vs_svc_hash(svc); + /* Schedule resize work */ + if (t && ip_vs_get_num_services(ipvs) > t->u_thresh && + !test_and_set_bit(IP_VS_WORK_SVC_RESIZE, &ipvs->work_flags)) + queue_delayed_work(system_unbound_wq, &ipvs->svc_resize_work, + 1); + *svc_p = svc; if (!READ_ONCE(ipvs->enable)) { @@ -1484,6 +1810,10 @@ ip_vs_add_service(struct netns_ipvs *ipvs, struct ip_vs_service_user_kern *u, out_err: + if (tc_new) + ip_vs_rht_free(tc_new); + if (t_new) + ip_vs_rht_free(t_new); if (ret_hooks >= 0) ip_vs_unregister_hooks(ipvs, u->af); if (svc != NULL) { @@ -1671,10 +2001,38 @@ static void ip_vs_unlink_service(struct ip_vs_service *svc, bool cleanup) */ static int ip_vs_del_service(struct ip_vs_service *svc) { + struct netns_ipvs *ipvs; + struct ip_vs_rht *t, *p; + int ns; + if (svc == NULL) return -EEXIST; + ipvs = svc->ipvs; ip_vs_unlink_service(svc, false); - + t = rcu_dereference_protected(ipvs->svc_table, 1); + + /* Drop the table if no more services */ + ns = ip_vs_get_num_services(ipvs); + if (!ns) { + /* Stop the resizer and drop the tables */ + set_bit(IP_VS_WORK_SVC_NORESIZE, &ipvs->work_flags); + cancel_delayed_work_sync(&ipvs->svc_resize_work); + if (t) { + rcu_assign_pointer(ipvs->svc_table, NULL); + while (1) { + p = rcu_dereference_protected(t->new_tbl, 1); + call_rcu(&t->rcu_head, ip_vs_rht_rcu_free); 
+ if (p == t) + break; + t = p; + } + } + } else if (ns <= t->l_thresh && + !test_and_set_bit(IP_VS_WORK_SVC_RESIZE, + &ipvs->work_flags)) { + queue_delayed_work(system_unbound_wq, &ipvs->svc_resize_work, + 1); + } return 0; } @@ -1684,14 +2042,36 @@ static int ip_vs_del_service(struct ip_vs_service *svc) */ static int ip_vs_flush(struct netns_ipvs *ipvs, bool cleanup) { - int idx; + DECLARE_IP_VS_RHT_WALK_BUCKETS(); + struct hlist_bl_head *head; struct ip_vs_service *svc; - struct hlist_node *n; + struct hlist_bl_node *ne; + struct hlist_bl_node *e; + struct ip_vs_rht *t, *p; + + /* Stop the resizer and drop the tables */ + if (!test_and_set_bit(IP_VS_WORK_SVC_NORESIZE, &ipvs->work_flags)) + cancel_delayed_work_sync(&ipvs->svc_resize_work); + /* No resizer, so now we have exclusive write access */ + + if (ip_vs_get_num_services(ipvs)) { + ip_vs_rht_walk_buckets(ipvs->svc_table, head) { + hlist_bl_for_each_entry_safe(svc, e, ne, head, s_list) + ip_vs_unlink_service(svc, cleanup); + } + } - for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { - hlist_for_each_entry_safe(svc, n, &ipvs->svc_table[idx], - s_list) - ip_vs_unlink_service(svc, cleanup); + /* Unregister the hash table and release it after RCU grace period */ + t = rcu_dereference_protected(ipvs->svc_table, 1); + if (t) { + rcu_assign_pointer(ipvs->svc_table, NULL); + while (1) { + p = rcu_dereference_protected(t->new_tbl, 1); + call_rcu(&t->rcu_head, ip_vs_rht_rcu_free); + if (p == t) + break; + t = p; + } } return 0; } @@ -1742,19 +2122,44 @@ static int ip_vs_dst_event(struct notifier_block *this, unsigned long event, struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct net *net = dev_net(dev); struct netns_ipvs *ipvs = net_ipvs(net); + DECLARE_IP_VS_RHT_WALK_BUCKETS_RCU(); + unsigned int resched_score = 0; + struct hlist_bl_head *head; struct ip_vs_service *svc; + struct hlist_bl_node *e; struct ip_vs_dest *dest; - unsigned int idx; + int old_gen, new_gen; if (event != NETDEV_DOWN || !ipvs) return NOTIFY_DONE; IP_VS_DBG(3, "%s() dev=%s\n", __func__, dev->name); + + old_gen = atomic_read(&ipvs->svc_table_changes); + rcu_read_lock(); - for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { - hlist_for_each_entry_rcu(svc, &ipvs->svc_table[idx], s_list) + +repeat: + smp_rmb(); /* ipvs->svc_table and svc_table_changes */ + ip_vs_rht_walk_buckets_rcu(ipvs->svc_table, head) { + hlist_bl_for_each_entry_rcu(svc, e, head, s_list) { list_for_each_entry_rcu(dest, &svc->destinations, - n_list) + n_list) { ip_vs_forget_dev(dest, dev); + resched_score += 10; + } + resched_score++; + } + resched_score++; + if (resched_score >= 100) { + resched_score = 0; + cond_resched_rcu(); + new_gen = atomic_read(&ipvs->svc_table_changes); + /* New table installed ? 
*/ + if (old_gen != new_gen) { + old_gen = new_gen; + goto repeat; + } + } } rcu_read_unlock(); @@ -1777,14 +2182,28 @@ static int ip_vs_zero_service(struct ip_vs_service *svc) static int ip_vs_zero_all(struct netns_ipvs *ipvs) { - int idx; + DECLARE_IP_VS_RHT_WALK_BUCKETS_RCU(); + unsigned int resched_score = 0; + struct hlist_bl_head *head; struct ip_vs_service *svc; + struct hlist_bl_node *e; + + rcu_read_lock(); - for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { - hlist_for_each_entry(svc, &ipvs->svc_table[idx], s_list) + ip_vs_rht_walk_buckets_rcu(ipvs->svc_table, head) { + hlist_bl_for_each_entry_rcu(svc, e, head, s_list) { ip_vs_zero_service(svc); + resched_score += 10; + } + resched_score++; + if (resched_score >= 100) { + resched_score = 0; + cond_resched_rcu(); + } } + rcu_read_unlock(); + ip_vs_zero_stats(&ipvs->tot_stats->s); return 0; } @@ -2218,7 +2637,8 @@ static struct ctl_table vs_vars[] = { struct ip_vs_iter { struct seq_net_private p; /* Do not move this, netns depends upon it*/ - int bucket; + struct ip_vs_rht *t; + u32 bucket; }; /* @@ -2239,17 +2659,23 @@ static inline const char *ip_vs_fwd_name(unsigned int flags) } } - +/* Do not expect consistent view during add, del and move(table resize). + * We may miss entries and even show duplicates. + */ static struct ip_vs_service *ip_vs_info_array(struct seq_file *seq, loff_t pos) { - struct net *net = seq_file_net(seq); - struct netns_ipvs *ipvs = net_ipvs(net); struct ip_vs_iter *iter = seq->private; - int idx; + struct ip_vs_rht *t = iter->t; struct ip_vs_service *svc; + struct hlist_bl_node *e; + int idx; - for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { - hlist_for_each_entry_rcu(svc, &ipvs->svc_table[idx], s_list) { + if (!t) + return NULL; + for (idx = 0; idx < t->size; idx++) { + hlist_bl_for_each_entry_rcu(svc, e, &t->buckets[idx], s_list) { + if (!ip_vs_rht_same_table(t, READ_ONCE(svc->hash_key))) + break; if (pos-- == 0) { iter->bucket = idx; return svc; @@ -2262,18 +2688,22 @@ static struct ip_vs_service *ip_vs_info_array(struct seq_file *seq, loff_t pos) static void *ip_vs_info_seq_start(struct seq_file *seq, loff_t *pos) __acquires(RCU) { + struct ip_vs_iter *iter = seq->private; + struct net *net = seq_file_net(seq); + struct netns_ipvs *ipvs = net_ipvs(net); + rcu_read_lock(); + iter->t = rcu_dereference(ipvs->svc_table); return *pos ? ip_vs_info_array(seq, *pos - 1) : SEQ_START_TOKEN; } static void *ip_vs_info_seq_next(struct seq_file *seq, void *v, loff_t *pos) { - struct hlist_node *e; - struct ip_vs_iter *iter; struct ip_vs_service *svc; - struct net *net = seq_file_net(seq); - struct netns_ipvs *ipvs = net_ipvs(net); + struct ip_vs_iter *iter; + struct hlist_bl_node *e; + struct ip_vs_rht *t; ++*pos; if (v == SEQ_START_TOKEN) @@ -2281,15 +2711,22 @@ static void *ip_vs_info_seq_next(struct seq_file *seq, void *v, loff_t *pos) svc = v; iter = seq->private; + t = iter->t; + if (!t) + return NULL; - e = rcu_dereference(hlist_next_rcu(&svc->s_list)); - if (e) - return hlist_entry(e, struct ip_vs_service, s_list); + hlist_bl_for_each_entry_continue_rcu(svc, e, s_list) { + /* Our cursor was moved to new table ? 
*/ + if (!ip_vs_rht_same_table(t, READ_ONCE(svc->hash_key))) + break; + return svc; + } - while (++iter->bucket < IP_VS_SVC_TAB_SIZE) { - hlist_for_each_entry_rcu(svc, - &ipvs->svc_table[iter->bucket], - s_list) { + while (++iter->bucket < t->size) { + hlist_bl_for_each_entry_rcu(svc, e, &t->buckets[iter->bucket], + s_list) { + if (!ip_vs_rht_same_table(t, READ_ONCE(svc->hash_key))) + break; return svc; } } @@ -2770,13 +3207,18 @@ __ip_vs_get_service_entries(struct netns_ipvs *ipvs, const struct ip_vs_get_services *get, struct ip_vs_get_services __user *uptr) { - int idx, count=0; - struct ip_vs_service *svc; struct ip_vs_service_entry entry; + DECLARE_IP_VS_RHT_WALK_BUCKETS(); + struct hlist_bl_head *head; + struct ip_vs_service *svc; + struct hlist_bl_node *e; + int count = 0; int ret = 0; - for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { - hlist_for_each_entry(svc, &ipvs->svc_table[idx], s_list) { + lockdep_assert_held(&ipvs->svc_resize_sem); + /* All service modifications are disabled, go ahead */ + ip_vs_rht_walk_buckets(ipvs->svc_table, head) { + hlist_bl_for_each_entry(svc, e, head, s_list) { /* Only expose IPv4 entries to old interface */ if (svc->af != AF_INET) continue; @@ -2948,6 +3390,35 @@ do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) return ret; } + if (cmd == IP_VS_SO_GET_SERVICES) { + struct ip_vs_get_services *get; + size_t size; + + get = (struct ip_vs_get_services *)arg; + size = struct_size(get, entrytable, get->num_services); + if (*len != size) { + pr_err("length: %u != %zu\n", *len, size); + return -EINVAL; + } + /* Protect against table resizer moving the entries. + * Try reverse locking, so that we do not hold the mutex + * while waiting for semaphore. + */ + while (1) { + ret = down_read_killable(&ipvs->svc_resize_sem); + if (ret < 0) + return ret; + if (mutex_trylock(&ipvs->service_mutex)) + break; + up_read(&ipvs->svc_resize_sem); + cond_resched(); + } + ret = __ip_vs_get_service_entries(ipvs, get, user); + up_read(&ipvs->svc_resize_sem); + mutex_unlock(&ipvs->service_mutex); + return ret; + } + mutex_lock(&ipvs->service_mutex); switch (cmd) { case IP_VS_SO_GET_VERSION: @@ -2976,22 +3447,6 @@ do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) } break; - case IP_VS_SO_GET_SERVICES: - { - struct ip_vs_get_services *get; - size_t size; - - get = (struct ip_vs_get_services *)arg; - size = struct_size(get, entrytable, get->num_services); - if (*len != size) { - pr_err("length: %u != %zu\n", *len, size); - ret = -EINVAL; - goto out; - } - ret = __ip_vs_get_service_entries(ipvs, get, user); - } - break; - case IP_VS_SO_GET_SERVICE: { struct ip_vs_service_entry *entry; @@ -3277,15 +3732,19 @@ nla_put_failure: static int ip_vs_genl_dump_services(struct sk_buff *skb, struct netlink_callback *cb) { - int idx = 0, i; - int start = cb->args[0]; - struct ip_vs_service *svc; + DECLARE_IP_VS_RHT_WALK_BUCKETS_SAFE_RCU(); struct net *net = sock_net(skb->sk); struct netns_ipvs *ipvs = net_ipvs(net); + struct hlist_bl_head *head; + struct ip_vs_service *svc; + struct hlist_bl_node *e; + int start = cb->args[0]; + int idx = 0; + down_read(&ipvs->svc_resize_sem); rcu_read_lock(); - for (i = 0; i < IP_VS_SVC_TAB_SIZE; i++) { - hlist_for_each_entry_rcu(svc, &ipvs->svc_table[i], s_list) { + ip_vs_rht_walk_buckets_safe_rcu(ipvs->svc_table, head) { + hlist_bl_for_each_entry_rcu(svc, e, head, s_list) { if (++idx <= start) continue; if (ip_vs_genl_dump_service(skb, svc, cb) < 0) { @@ -3297,6 +3756,7 @@ static int 
ip_vs_genl_dump_services(struct sk_buff *skb, nla_put_failure: rcu_read_unlock(); + up_read(&ipvs->svc_resize_sem); cb->args[0] = idx; return skb->len; @@ -4306,8 +4766,10 @@ int __net_init ip_vs_control_net_init(struct netns_ipvs *ipvs) /* Initialize service_mutex, svc_table per netns */ __mutex_init(&ipvs->service_mutex, "ipvs->service_mutex", &__ipvs_service_key); - for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) - INIT_HLIST_HEAD(&ipvs->svc_table[idx]); + init_rwsem(&ipvs->svc_resize_sem); + INIT_DELAYED_WORK(&ipvs->svc_resize_work, svc_resize_work_handler); + atomic_set(&ipvs->svc_table_changes, 0); + RCU_INIT_POINTER(ipvs->svc_table, NULL); /* Initialize rs_table */ for (idx = 0; idx < IP_VS_RTAB_SIZE; idx++) @@ -4326,6 +4788,7 @@ int __net_init ip_vs_control_net_init(struct netns_ipvs *ipvs) } INIT_DELAYED_WORK(&ipvs->est_reload_work, est_reload_work_handler); + ipvs->sysctl_svc_lfactor = ip_vs_svc_default_load_factor(ipvs); /* procfs stats */ ipvs->tot_stats = kzalloc_obj(*ipvs->tot_stats); diff --git a/net/netfilter/ipvs/ip_vs_pe_sip.c b/net/netfilter/ipvs/ip_vs_pe_sip.c index 85f31d71e29a..0c83c7b69581 100644 --- a/net/netfilter/ipvs/ip_vs_pe_sip.c +++ b/net/netfilter/ipvs/ip_vs_pe_sip.c @@ -132,9 +132,9 @@ static bool ip_vs_sip_ct_match(const struct ip_vs_conn_param *p, } static u32 ip_vs_sip_hashkey_raw(const struct ip_vs_conn_param *p, - u32 initval, bool inverse) + struct ip_vs_rht *t, bool inverse) { - return jhash(p->pe_data, p->pe_data_len, initval); + return jhash(p->pe_data, p->pe_data_len, (u32)t->hash_key.key[0]); } static int ip_vs_sip_show_pe_data(const struct ip_vs_conn *cp, char *buf) diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c index b2ba3befbd55..93038abbf5e0 100644 --- a/net/netfilter/ipvs/ip_vs_sync.c +++ b/net/netfilter/ipvs/ip_vs_sync.c @@ -1755,6 +1755,28 @@ int start_sync_thread(struct netns_ipvs *ipvs, struct ipvs_sync_daemon_cfg *c, if (!ip_vs_use_count_inc()) return -ENOPROTOOPT; + /* Backup server can be started without services just to sync conns, + * make sure conn_tab is created even if ipvs->enable is 0. 
+ */ + if (state == IP_VS_STATE_BACKUP) { + mutex_lock(&ipvs->service_mutex); + if (!rcu_dereference_protected(ipvs->conn_tab, 1)) { + int lfactor = sysctl_conn_lfactor(ipvs); + int new_size = ip_vs_conn_desired_size(ipvs, NULL, + lfactor); + struct ip_vs_rht *tc_new; + + tc_new = ip_vs_conn_tab_alloc(ipvs, new_size, lfactor); + if (!tc_new) { + mutex_unlock(&ipvs->service_mutex); + result = -ENOMEM; + goto out_module; + } + rcu_assign_pointer(ipvs->conn_tab, tc_new); + } + mutex_unlock(&ipvs->service_mutex); + } + /* Do not hold one mutex and then to block on another */ for (;;) { rtnl_lock(); @@ -1922,6 +1944,7 @@ out_early: mutex_unlock(&ipvs->sync_mutex); rtnl_unlock(); +out_module: /* decrease the module use count */ ip_vs_use_count_dec(); return result; diff --git a/net/netfilter/nf_log_syslog.c b/net/netfilter/nf_log_syslog.c index 41503847d9d7..0507d67cad27 100644 --- a/net/netfilter/nf_log_syslog.c +++ b/net/netfilter/nf_log_syslog.c @@ -165,18 +165,26 @@ static struct nf_logger nf_arp_logger __read_mostly = { static void nf_log_dump_sk_uid_gid(struct net *net, struct nf_log_buf *m, struct sock *sk) { + const struct socket *sock; + const struct file *file; + if (!sk || !sk_fullsock(sk) || !net_eq(net, sock_net(sk))) return; - read_lock_bh(&sk->sk_callback_lock); - if (sk->sk_socket && sk->sk_socket->file) { - const struct cred *cred = sk->sk_socket->file->f_cred; + /* The sk pointer remains valid as long as the skb is. The sk_socket and + * file pointer may become NULL if the socket is closed. Both structures + * (including file->cred) are RCU freed which means they can be accessed + * within a RCU read section. + */ + sock = READ_ONCE(sk->sk_socket); + file = sock ? READ_ONCE(sock->file) : NULL; + if (file) { + const struct cred *cred = file->f_cred; nf_log_buf_add(m, "UID=%u GID=%u ", from_kuid_munged(&init_user_ns, cred->fsuid), from_kgid_munged(&init_user_ns, cred->fsgid)); } - read_unlock_bh(&sk->sk_callback_lock); } static noinline_for_stack int diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c index b35a90955e2e..730c2541a849 100644 --- a/net/netfilter/nfnetlink_log.c +++ b/net/netfilter/nfnetlink_log.c @@ -611,19 +611,26 @@ __build_packet_message(struct nfnl_log_net *log, /* UID */ sk = skb->sk; if (sk && sk_fullsock(sk)) { - read_lock_bh(&sk->sk_callback_lock); - if (sk->sk_socket && sk->sk_socket->file) { - struct file *file = sk->sk_socket->file; + const struct socket *sock; + const struct file *file; + + /* The sk pointer remains valid as long as the skb is. + * The sk_socket and file pointer may become NULL + * if the socket is closed. + * Both structures (including file->cred) are RCU freed + * which means they can be accessed within a RCU read section. + */ + sock = READ_ONCE(sk->sk_socket); + file = sock ? 
READ_ONCE(sock->file) : NULL; + if (file) { const struct cred *cred = file->f_cred; struct user_namespace *user_ns = inst->peer_user_ns; __be32 uid = htonl(from_kuid_munged(user_ns, cred->fsuid)); __be32 gid = htonl(from_kgid_munged(user_ns, cred->fsgid)); - read_unlock_bh(&sk->sk_callback_lock); if (nla_put_be32(inst->skb, NFULA_UID, uid) || nla_put_be32(inst->skb, NFULA_GID, gid)) goto nla_put_failure; - } else - read_unlock_bh(&sk->sk_callback_lock); + } } /* local sequence number */ diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c index 7f5248b5f1ee..5379d8ff39c0 100644 --- a/net/netfilter/nfnetlink_queue.c +++ b/net/netfilter/nfnetlink_queue.c @@ -545,14 +545,23 @@ nfqnl_put_packet_info(struct sk_buff *nlskb, struct sk_buff *packet, static int nfqnl_put_sk_uidgid(struct sk_buff *skb, struct sock *sk) { + const struct socket *sock; + const struct file *file; const struct cred *cred; if (!sk_fullsock(sk)) return 0; - read_lock_bh(&sk->sk_callback_lock); - if (sk->sk_socket && sk->sk_socket->file) { - cred = sk->sk_socket->file->f_cred; + /* The sk pointer remains valid as long as the skb is. + * The sk_socket and file pointer may become NULL + * if the socket is closed. + * Both structures (including file->cred) are RCU freed + * which means they can be accessed within a RCU read section. + */ + sock = READ_ONCE(sk->sk_socket); + file = sock ? READ_ONCE(sock->file) : NULL; + if (file) { + cred = file->f_cred; if (nla_put_be32(skb, NFQA_UID, htonl(from_kuid_munged(&init_user_ns, cred->fsuid)))) goto nla_put_failure; @@ -560,11 +569,9 @@ static int nfqnl_put_sk_uidgid(struct sk_buff *skb, struct sock *sk) htonl(from_kgid_munged(&init_user_ns, cred->fsgid)))) goto nla_put_failure; } - read_unlock_bh(&sk->sk_callback_lock); return 0; nla_put_failure: - read_unlock_bh(&sk->sk_callback_lock); return -1; } @@ -585,15 +592,8 @@ static int nfqnl_get_sk_secctx(struct sk_buff *skb, struct lsm_context *ctx) { int seclen = 0; #if IS_ENABLED(CONFIG_NETWORK_SECMARK) - - if (!skb || !sk_fullsock(skb->sk)) - return 0; - - read_lock_bh(&skb->sk->sk_callback_lock); - if (skb->secmark) seclen = security_secid_to_secctx(skb->secmark, ctx); - read_unlock_bh(&skb->sk->sk_callback_lock); #endif return seclen; } diff --git a/net/netfilter/nft_meta.c b/net/netfilter/nft_meta.c index 983158274c68..d0df6cf374d1 100644 --- a/net/netfilter/nft_meta.c +++ b/net/netfilter/nft_meta.c @@ -131,33 +131,36 @@ nft_meta_get_eval_skugid(enum nft_meta_keys key, u32 *dest, const struct nft_pktinfo *pkt) { - struct sock *sk = skb_to_full_sk(pkt->skb); - struct socket *sock; + const struct sock *sk = skb_to_full_sk(pkt->skb); + const struct socket *sock; + const struct file *file; if (!sk || !sk_fullsock(sk) || !net_eq(nft_net(pkt), sock_net(sk))) return false; - read_lock_bh(&sk->sk_callback_lock); - sock = sk->sk_socket; - if (!sock || !sock->file) { - read_unlock_bh(&sk->sk_callback_lock); + /* The sk pointer remains valid as long as the skb is. The sk_socket and + * file pointer may become NULL if the socket is closed. Both structures + * (including file->cred) are RCU freed which means they can be accessed + * within a RCU read section. + */ + sock = READ_ONCE(sk->sk_socket); + file = sock ? 
READ_ONCE(sock->file) : NULL; + if (!file) return false; - } switch (key) { case NFT_META_SKUID: *dest = from_kuid_munged(sock_net(sk)->user_ns, - sock->file->f_cred->fsuid); + file->f_cred->fsuid); break; case NFT_META_SKGID: *dest = from_kgid_munged(sock_net(sk)->user_ns, - sock->file->f_cred->fsgid); + file->f_cred->fsgid); break; default: break; } - read_unlock_bh(&sk->sk_callback_lock); return true; } diff --git a/net/netfilter/xt_owner.c b/net/netfilter/xt_owner.c index 50332888c8d2..5bfb4843df66 100644 --- a/net/netfilter/xt_owner.c +++ b/net/netfilter/xt_owner.c @@ -63,11 +63,12 @@ static bool owner_mt(const struct sk_buff *skb, struct xt_action_param *par) { const struct xt_owner_match_info *info = par->matchinfo; - const struct file *filp; struct sock *sk = skb_to_full_sk(skb); struct net *net = xt_net(par); + const struct socket *sock; + const struct file *filp; - if (!sk || !sk->sk_socket || !net_eq(net, sock_net(sk))) + if (!sk || !READ_ONCE(sk->sk_socket) || !net_eq(net, sock_net(sk))) return (info->match ^ info->invert) == 0; else if (info->match & info->invert & XT_OWNER_SOCKET) /* @@ -76,23 +77,25 @@ owner_mt(const struct sk_buff *skb, struct xt_action_param *par) */ return false; - read_lock_bh(&sk->sk_callback_lock); - filp = sk->sk_socket ? sk->sk_socket->file : NULL; - if (filp == NULL) { - read_unlock_bh(&sk->sk_callback_lock); + /* The sk pointer remains valid as long as the skb is. The sk_socket and + * file pointer may become NULL if the socket is closed. Both structures + * (including file->cred) are RCU freed which means they can be accessed + * within a RCU read section. + */ + sock = READ_ONCE(sk->sk_socket); + filp = sock ? READ_ONCE(sock->file) : NULL; + if (filp == NULL) return ((info->match ^ info->invert) & (XT_OWNER_UID | XT_OWNER_GID)) == 0; - } if (info->match & XT_OWNER_UID) { kuid_t uid_min = make_kuid(net->user_ns, info->uid_min); kuid_t uid_max = make_kuid(net->user_ns, info->uid_max); + if ((uid_gte(filp->f_cred->fsuid, uid_min) && uid_lte(filp->f_cred->fsuid, uid_max)) ^ - !(info->invert & XT_OWNER_UID)) { - read_unlock_bh(&sk->sk_callback_lock); + !(info->invert & XT_OWNER_UID)) return false; - } } if (info->match & XT_OWNER_GID) { @@ -117,13 +120,10 @@ owner_mt(const struct sk_buff *skb, struct xt_action_param *par) } } - if (match ^ !(info->invert & XT_OWNER_GID)) { - read_unlock_bh(&sk->sk_callback_lock); + if (match ^ !(info->invert & XT_OWNER_GID)) return false; - } } - read_unlock_bh(&sk->sk_callback_lock); return true; } |
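
The resize decisions in this series hang off two per-table watermarks derived from a signed load-factor shift: u_thresh = size * 2^lfactor (negative lfactor keeps the load below 100% to cut collisions), and l_thresh = u_thresh / 16 as the shrink trigger. The standalone user-space sketch below only mirrors the arithmetic of ip_vs_rht_set_thresholds() as it appears in the ip_vs_core.c hunk so the grow/shrink points can be eyeballed; the min_bits/max_bits values, sample sizes and the imin() helper are illustrative choices, not the kernel's configuration.

```c
/* Sketch of the u_thresh/l_thresh arithmetic from ip_vs_rht_set_thresholds();
 * not kernel code, limits and sample values below are made up.
 */
#include <limits.h>
#include <stdio.h>

static int imin(int a, int b) { return a < b ? a : b; }

static void set_thresholds(int size, int lfactor, int min_bits, int max_bits,
			   int *u_thresh, int *l_thresh)
{
	if (size >= 1 << max_bits)
		*u_thresh = INT_MAX;			/* stop growing */
	else if (lfactor <= 0)
		*u_thresh = size >> imin(-lfactor, max_bits);
	else
		*u_thresh = imin(size, 1 << (30 - lfactor)) << lfactor;

	if (size >= 1 << max_bits)
		*l_thresh = (1 << max_bits) >> 4;
	else if (size > 1 << min_bits)
		*l_thresh = *u_thresh >> 4;		/* shrink at 1/16 load */
	else
		*l_thresh = 0;				/* stop shrinking */
}

int main(void)
{
	const int min_bits = 8, max_bits = 20;		/* example limits only */
	const int sizes[] = { 256, 4096, 65536 };
	const int lfactors[] = { -3, 0, 2 };

	for (unsigned i = 0; i < sizeof(sizes) / sizeof(sizes[0]); i++) {
		for (unsigned j = 0; j < sizeof(lfactors) / sizeof(lfactors[0]); j++) {
			int u, l;

			set_thresholds(sizes[i], lfactors[j], min_bits,
				       max_bits, &u, &l);
			printf("size=%6d lfactor=%+d -> u_thresh=%10d l_thresh=%6d\n",
			       sizes[i], lfactors[j], u, l);
		}
	}
	return 0;
}
```

With the default load factors chosen in the patch (-3/-4 for init_net, -1/-2 otherwise) this means the table is kept several times larger than the entry count, and a shrink is only considered once the load drops sixteen times below the grow threshold, which avoids resize ping-pong.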
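
Several of the walkers above tell which table an entry currently belongs to purely from its saved hash_key (ip_vs_rht_same_table(), and the table_id flip done on each resize). Those helpers live in include/net/ip_vs.h, which is not among the hunks shown here, so the following is only a guessed encoding under the assumption that the table generation occupies one high bit of the key while the low bits index the bucket; the real layout may differ.

```c
/* Hypothetical sketch of hash_key/table_id handling; the real
 * ip_vs_rht_build_hash_key()/ip_vs_rht_same_table() are in
 * include/net/ip_vs.h and may use a different layout.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define RHT_TABLE_ID_MASK	0x80000000u	/* assumed generation bit */

struct rht_stub {
	uint32_t table_id;	/* 0 or RHT_TABLE_ID_MASK, flipped on resize */
	uint32_t mask;		/* buckets - 1 */
};

static uint32_t build_hash_key(const struct rht_stub *t, uint32_t hash)
{
	/* bucket index in the low bits, tagged with the owning table */
	return (hash & ~RHT_TABLE_ID_MASK) | t->table_id;
}

static bool same_table(const struct rht_stub *t, uint32_t hash_key)
{
	return (hash_key & RHT_TABLE_ID_MASK) == t->table_id;
}

int main(void)
{
	struct rht_stub old = { .table_id = 0, .mask = 255 };
	struct rht_stub new = { .table_id = old.table_id ^ RHT_TABLE_ID_MASK,
				.mask = 1023 };
	uint32_t key = build_hash_key(&old, 0x1234abcdu);

	/* An entry keyed for the old table is not "ours" in the new one,
	 * so lookups walking both tables can stop at the first mismatch.
	 */
	printf("bucket=%u in_old=%d in_new=%d\n",
	       (unsigned)(key & old.mask),
	       (int)same_table(&old, key), (int)same_table(&new, key));
	return 0;
}
```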
