| author | Julian Anastasov <ja@ssi.bg> | 2026-03-04 00:04:05 +0300 |
|---|---|---|
| committer | Florian Westphal <fw@strlen.de> | 2026-03-04 13:45:45 +0300 |
| commit | b655388111cf7e43f70e49db64bdaa42bcb8a038 (patch) | |
| tree | a50a74e2d242fcd8e94617386d7f8313f425f564 /net | |
| parent | 1ac252ad036cdb18f5fb7f76bb6061adfed9cedf (diff) | |
| download | linux-b655388111cf7e43f70e49db64bdaa42bcb8a038.tar.xz | |
ipvs: add resizable hash tables
Add infrastructure for resizable hash tables based on hlist_bl,
which we will use in followup patches.

The tables allow RCU lookups while resizing is in progress. Bucket
modifications are protected by a per-bucket bit lock and additional
custom locking, and a table is resized when its load crosses
thresholds derived from a load factor parameter (a worked example of
the threshold math follows).
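As a rough illustration of the load factor (not part of the patch), the
thresholds follow the formulas used by ip_vs_rht_set_thresholds() in the
diff below; the sizes and factors here are example values only:

```c
/* Standalone sketch of the threshold math:
 *   u_thresh = size * 2^lfactor   (grow once the load exceeds this)
 *   l_thresh = u_thresh / 16      (shrink once the load drops below this)
 * Assumes the table is between its minimum and maximum size.
 */
#include <stdio.h>

static void show(int size, int lfactor)
{
	int u_thresh = lfactor >= 0 ? size << lfactor : size >> -lfactor;

	printf("size=%d lfactor=%d u_thresh=%d l_thresh=%d\n",
	       size, lfactor, u_thresh, u_thresh >> 4);
}

int main(void)
{
	show(256, -2);	/* keep load under 25%: grow past 64 entries */
	show(256, 0);	/* 100% load: grow past 256 entries */
	show(256, 1);	/* tolerate 200% load: grow past 512 entries */
	return 0;
}
```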
Compared to other implementations, we rely on:
* fast entry removal by unlinking the node without a prior lookup
* entry rehashing when the hash key changes
* entries that can contain multiple hash nodes (see the sketch after
  this list)
* custom locking depending on the context
* an adjustable load factor to tune the grow/shrink process
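For example, an entry can embed one hash node per table it participates
in. The structure below is only an illustration of that idea; the field
and type names are hypothetical and not part of this patch:

```c
#include <linux/list_bl.h>
#include <linux/netfilter.h>
#include <linux/types.h>

/* Hypothetical entry carrying two hash nodes, so one object can sit in
 * two resizable tables at once (say, one keyed by address and one by
 * port).  A table generation id kept next to each node lets the entry
 * be unlinked later from the right table without a pre-lookup.
 */
struct demo_entry {
	struct hlist_bl_node	addr_node;	/* node in the address table */
	u32			addr_tbl_id;	/* table that holds addr_node */
	struct hlist_bl_node	port_node;	/* node in the port table */
	u32			port_tbl_id;	/* table that holds port_node */
	union nf_inet_addr	addr;
	__be16			port;
};
```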
Signed-off-by: Julian Anastasov <ja@ssi.bg>
Signed-off-by: Florian Westphal <fw@strlen.de>
Diffstat (limited to 'net')
| -rw-r--r-- | net/netfilter/ipvs/ip_vs_conn.c | 5 |
| -rw-r--r-- | net/netfilter/ipvs/ip_vs_core.c | 179 |
2 files changed, 179 insertions, 5 deletions
```diff
diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c
index deaf16a90e2f..a6fd3b64428f 100644
--- a/net/netfilter/ipvs/ip_vs_conn.c
+++ b/net/netfilter/ipvs/ip_vs_conn.c
@@ -76,11 +76,6 @@ static unsigned int ip_vs_conn_rnd __read_mostly;
 #define IP_VS_ADDRSTRLEN (8+1)
 #endif
 
-struct ip_vs_aligned_lock
-{
-	spinlock_t	l;
-} __attribute__((__aligned__(SMP_CACHE_BYTES)));
-
 /* lock array for conn table */
 static struct ip_vs_aligned_lock
 __ip_vs_conntbl_lock_array[CT_LOCKARRAY_SIZE] __cacheline_aligned;
diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c
index 869f18e0e835..f5b7a2047291 100644
--- a/net/netfilter/ipvs/ip_vs_core.c
+++ b/net/netfilter/ipvs/ip_vs_core.c
@@ -117,6 +117,185 @@ void ip_vs_init_hash_table(struct list_head *table, int rows)
 	INIT_LIST_HEAD(&table[rows]);
 }
 
+/* IPVS Resizable Hash Tables:
+ * - list_bl buckets with bit lock
+ *
+ * Goals:
+ * - RCU lookups can run in parallel with add/del/move operations
+ * - hash keys can be on non-contiguous memory
+ * - support entries with duplicate keys
+ * - unlink entries without lookup, using the saved table and bucket id
+ * - resizing can trigger on load change or depending on key refresh period
+ * - customizable load factor to balance between speed and memory usage
+ * - add/del/move operations should be allowed from any context
+ *
+ * Resizing:
+ * - a new table is attached to the current table and all entries are moved
+ * with a new hash key. Finally, the new table is installed as the current
+ * one and the old table is released after an RCU grace period.
+ * - RCU read-side critical sections walk two tables while resizing is
+ * in progress
+ * - new entries are added to the new table
+ * - entries can be deleted from the old or from the new table; the table_id
+ * can be saved into the entry as part of the hash key to know where the
+ * entry is hashed
+ * - move operations may delay readers or cause a retry for the modified
+ * bucket. As a result, a searched entry will be found, but walkers that
+ * operate on multiple entries may see the same entry twice after a retry.
+ * - for the fast path the number of entries (load) can be compared to
+ * u_thresh and l_thresh to decide when to trigger table growing/shrinking.
+ * They are calculated from the load factor (a shift count): a negative
+ * value keeps the load below 100% to reduce collisions by maintaining a
+ * larger table, while a positive value tolerates collisions by using a
+ * smaller table and a load above 100%: u_thresh(load) = size * (2 ^ lfactor)
+ *
+ * Locking:
+ * - lock: protects seqc if a context other than the resizer can move entries
+ * - seqc: seqcount_t, delays/retries readers while entries are moved to
+ * the new table on resizing
+ * - bit lock: serializes bucket modifications
+ * - writers may use other locking mechanisms to serialize operations for
+ * resizing, moving and installing new tables
+ */
+
+void ip_vs_rht_free(struct ip_vs_rht *t)
+{
+	kvfree(t->buckets);
+	kvfree(t->seqc);
+	kvfree(t->lock);
+	kfree(t);
+}
+
+void ip_vs_rht_rcu_free(struct rcu_head *head)
+{
+	struct ip_vs_rht *t;
+
+	t = container_of(head, struct ip_vs_rht, rcu_head);
+	ip_vs_rht_free(t);
+}
+
+struct ip_vs_rht *ip_vs_rht_alloc(int buckets, int scounts, int locks)
+{
+	struct ip_vs_rht *t = kzalloc(sizeof(*t), GFP_KERNEL);
+	int i;
+
+	if (!t)
+		return NULL;
+	if (scounts) {
+		int ml = roundup_pow_of_two(nr_cpu_ids);
+
+		scounts = min(scounts, buckets);
+		scounts = min(scounts, ml);
+		t->seqc = kvmalloc_array(scounts, sizeof(*t->seqc), GFP_KERNEL);
+		if (!t->seqc)
+			goto err;
+		for (i = 0; i < scounts; i++)
+			seqcount_init(&t->seqc[i]);
+
+		if (locks) {
+			locks = min(locks, scounts);
+			t->lock = kvmalloc_array(locks, sizeof(*t->lock),
+						 GFP_KERNEL);
+			if (!t->lock)
+				goto err;
+			for (i = 0; i < locks; i++)
+				spin_lock_init(&t->lock[i].l);
+		}
+	}
+
+	t->buckets = kvmalloc_array(buckets, sizeof(*t->buckets), GFP_KERNEL);
+	if (!t->buckets)
+		goto err;
+	for (i = 0; i < buckets; i++)
+		INIT_HLIST_BL_HEAD(&t->buckets[i]);
+	t->mask = buckets - 1;
+	t->size = buckets;
+	t->seqc_mask = scounts - 1;
+	t->lock_mask = locks - 1;
+	t->u_thresh = buckets;
+	t->l_thresh = buckets >> 4;
+	t->bits = order_base_2(buckets);
+	/* new_tbl points to self while no new table is being filled */
+	RCU_INIT_POINTER(t->new_tbl, t);
+	get_random_bytes(&t->hash_key, sizeof(t->hash_key));
+	return t;
+
+err:
+	ip_vs_rht_free(t);
+	return NULL;
+}
+
+/* Get the desired table size for n entries based on the current table size,
+ * using the formula size = n / (2^lfactor)
+ * lfactor: shift value for the load factor:
+ * - >0: u_thresh = size << lfactor, for a load factor above 100%
+ * - <0: u_thresh = size >> -lfactor, for a load factor below 100%
+ * - 0: for a load factor of 100%
+ */
+int ip_vs_rht_desired_size(struct netns_ipvs *ipvs, struct ip_vs_rht *t, int n,
+			   int lfactor, int min_bits, int max_bits)
+{
+	if (!t)
+		return 1 << min_bits;
+	n = roundup_pow_of_two(n);
+	if (lfactor < 0) {
+		int factor = min(-lfactor, max_bits);
+
+		n = min(n, 1 << (max_bits - factor));
+		n <<= factor;
+	} else {
+		n = min(n >> lfactor, 1 << max_bits);
+	}
+	if (lfactor != t->lfactor)
+		return clamp(n, 1 << min_bits, 1 << max_bits);
+	if (n > t->size)
+		return n;
+	if (n > t->size >> 4)
+		return t->size;
+	/* Shrink, but keep it at n * 2 to prevent frequent resizing */
+	return clamp(n << 1, 1 << min_bits, 1 << max_bits);
+}
+
+/* Set thresholds based on table size and load factor:
+ * u_thresh = size * (2^lfactor)
+ * l_thresh = u_thresh / 16
+ * u_thresh/l_thresh can be used to check if load triggers a table grow/shrink
+ */
+void ip_vs_rht_set_thresholds(struct ip_vs_rht *t, int size, int lfactor,
+			      int min_bits, int max_bits)
+{
+	if (size >= 1 << max_bits)
+		t->u_thresh = INT_MAX;	/* stop growing */
+	else if (lfactor <= 0)
+		t->u_thresh = size >> min(-lfactor, max_bits);
+	else
+		t->u_thresh = min(size, 1 << (30 - lfactor)) << lfactor;
+
+	/* l_thresh: shrink when load is 16 times lower, can be 0 */
+	if (size >= 1 << max_bits)
+		t->l_thresh = (1 << max_bits) >> 4;
+	else if (size > 1 << min_bits)
+		t->l_thresh = t->u_thresh >> 4;
+	else
+		t->l_thresh = 0;	/* stop shrinking */
+}
+
+/* Return hash value for local info (fast, insecure) */
+u32 ip_vs_rht_hash_linfo(struct ip_vs_rht *t, int af,
+			 const union nf_inet_addr *addr, u32 v1, u32 v2)
+{
+	u32 v3;
+
+#ifdef CONFIG_IP_VS_IPV6
+	if (af == AF_INET6)
+		v3 = ipv6_addr_hash(&addr->in6);
+	else
+#endif
+		v3 = addr->all[0];
+
+	return jhash_3words(v1, v2, v3, (u32)t->hash_key.key[0]);
+}
+
 static inline void
 ip_vs_in_stats(struct ip_vs_conn *cp, struct sk_buff *skb)
 {
```
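To make the two-table walk described in the comment above concrete, here
is a reader-side sketch. It relies only on the ip_vs_rht fields visible in
this patch (buckets, mask, seqc, seqc_mask, new_tbl, hash_key) and assumes
the table was allocated with seqcounts; struct demo_entry, demo_hash() and
the overall flow are assumptions for illustration, not the lookup helpers
added by the followup patches:

```c
#include <linux/jhash.h>
#include <linux/rculist_bl.h>
#include <linux/seqlock.h>
#include <net/ip_vs.h>		/* assumed home of struct ip_vs_rht */

/* Hypothetical entry type; real entry types come with later patches. */
struct demo_entry {
	struct hlist_bl_node	hnode;
	u32			key;
};

/* Per-table hash of the search key: each table has its own random
 * hash_key, so the hash must be recomputed for every table walked.
 */
static u32 demo_hash(struct ip_vs_rht *t, u32 key)
{
	return jhash_1word(key, (u32)t->hash_key.key[0]);
}

/* Must be called under rcu_read_lock().  Walks at most two tables: the
 * current one and, while a resize is in flight, the new table attached
 * to it (new_tbl points back to the same table otherwise).  The
 * per-bucket seqcount makes the reader retry a bucket whose entries
 * were moved away while it was scanning.
 */
static struct demo_entry *demo_lookup_rcu(struct ip_vs_rht *t, u32 key)
{
	struct ip_vs_rht *prev = NULL;

	while (t != prev) {
		u32 hash = demo_hash(t, key) & t->mask;
		seqcount_t *sc = &t->seqc[hash & t->seqc_mask];
		struct hlist_bl_node *pos;
		struct demo_entry *e;
		unsigned int seq;

		do {
			seq = read_seqcount_begin(sc);
			hlist_bl_for_each_entry_rcu(e, pos, &t->buckets[hash],
						    hnode) {
				if (e->key == key)
					return e;
			}
		} while (read_seqcount_retry(sc, seq));

		prev = t;
		t = rcu_dereference(t->new_tbl);
	}
	return NULL;
}
```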
