diff options
-rw-r--r-- | Documentation/bpf/map_hash.rst | 8 | ||||
-rw-r--r-- | Documentation/bpf/map_lru_hash_update.dot | 6 | ||||
-rw-r--r-- | kernel/bpf/bpf_lru_list.c | 9 | ||||
-rw-r--r-- | kernel/bpf/bpf_lru_list.h | 1 | ||||
-rw-r--r-- | tools/testing/selftests/bpf/test_lru_map.c | 72 |
5 files changed, 52 insertions, 44 deletions
diff --git a/Documentation/bpf/map_hash.rst b/Documentation/bpf/map_hash.rst index d2343952f2cb..8606bf958a8c 100644 --- a/Documentation/bpf/map_hash.rst +++ b/Documentation/bpf/map_hash.rst @@ -233,10 +233,16 @@ attempts in order to enforce the LRU property which have increasing impacts on other CPUs involved in the following operation attempts: - Attempt to use CPU-local state to batch operations -- Attempt to fetch free nodes from global lists +- Attempt to fetch ``target_free`` free nodes from global lists - Attempt to pull any node from a global list and remove it from the hashmap - Attempt to pull any node from any CPU's list and remove it from the hashmap +The number of nodes to borrow from the global list in a batch, ``target_free``, +depends on the size of the map. Larger batch size reduces lock contention, but +may also exhaust the global structure. The value is computed at map init to +avoid exhaustion, by limiting aggregate reservation by all CPUs to half the map +size. With a minimum of a single element and maximum budget of 128 at a time. + This algorithm is described visually in the following diagram. See the description in commit 3a08c2fd7634 ("bpf: LRU List") for a full explanation of the corresponding operations: diff --git a/Documentation/bpf/map_lru_hash_update.dot b/Documentation/bpf/map_lru_hash_update.dot index a0fee349d29c..ab10058f5b79 100644 --- a/Documentation/bpf/map_lru_hash_update.dot +++ b/Documentation/bpf/map_lru_hash_update.dot @@ -35,18 +35,18 @@ digraph { fn_bpf_lru_list_pop_free_to_local [shape=rectangle,fillcolor=2, label="Flush local pending, Rotate Global list, move - LOCAL_FREE_TARGET + target_free from global -> local"] // Also corresponds to: // fn__local_list_flush() // fn_bpf_lru_list_rotate() fn___bpf_lru_node_move_to_free[shape=diamond,fillcolor=2, - label="Able to free\nLOCAL_FREE_TARGET\nnodes?"] + label="Able to free\ntarget_free\nnodes?"] fn___bpf_lru_list_shrink_inactive [shape=rectangle,fillcolor=3, label="Shrink inactive list up to remaining - LOCAL_FREE_TARGET + target_free (global LRU -> local)"] fn___bpf_lru_list_shrink [shape=diamond,fillcolor=2, label="> 0 entries in\nlocal free list?"] diff --git a/kernel/bpf/bpf_lru_list.c b/kernel/bpf/bpf_lru_list.c index 3dabdd137d10..2d6e1c98d8ad 100644 --- a/kernel/bpf/bpf_lru_list.c +++ b/kernel/bpf/bpf_lru_list.c @@ -337,12 +337,12 @@ static void bpf_lru_list_pop_free_to_local(struct bpf_lru *lru, list) { __bpf_lru_node_move_to_free(l, node, local_free_list(loc_l), BPF_LRU_LOCAL_LIST_T_FREE); - if (++nfree == LOCAL_FREE_TARGET) + if (++nfree == lru->target_free) break; } - if (nfree < LOCAL_FREE_TARGET) - __bpf_lru_list_shrink(lru, l, LOCAL_FREE_TARGET - nfree, + if (nfree < lru->target_free) + __bpf_lru_list_shrink(lru, l, lru->target_free - nfree, local_free_list(loc_l), BPF_LRU_LOCAL_LIST_T_FREE); @@ -577,6 +577,9 @@ static void bpf_common_lru_populate(struct bpf_lru *lru, void *buf, list_add(&node->list, &l->lists[BPF_LRU_LIST_T_FREE]); buf += elem_size; } + + lru->target_free = clamp((nr_elems / num_possible_cpus()) / 2, + 1, LOCAL_FREE_TARGET); } static void bpf_percpu_lru_populate(struct bpf_lru *lru, void *buf, diff --git a/kernel/bpf/bpf_lru_list.h b/kernel/bpf/bpf_lru_list.h index cbd8d3720c2b..fe2661a58ea9 100644 --- a/kernel/bpf/bpf_lru_list.h +++ b/kernel/bpf/bpf_lru_list.h @@ -58,6 +58,7 @@ struct bpf_lru { del_from_htab_func del_from_htab; void *del_arg; unsigned int hash_offset; + unsigned int target_free; unsigned int nr_scans; bool percpu; }; diff --git a/tools/testing/selftests/bpf/test_lru_map.c b/tools/testing/selftests/bpf/test_lru_map.c index fda7589c5023..4ae83f4b7fc7 100644 --- a/tools/testing/selftests/bpf/test_lru_map.c +++ b/tools/testing/selftests/bpf/test_lru_map.c @@ -138,6 +138,12 @@ static int sched_next_online(int pid, int *next_to_try) return ret; } +/* Inverse of how bpf_common_lru_populate derives target_free from map_size. */ +static unsigned int __map_size(unsigned int tgt_free) +{ + return tgt_free * nr_cpus * 2; +} + /* Size of the LRU map is 2 * Add key=1 (+1 key) * Add key=2 (+1 key) @@ -231,11 +237,11 @@ static void test_lru_sanity0(int map_type, int map_flags) printf("Pass\n"); } -/* Size of the LRU map is 1.5*tgt_free - * Insert 1 to tgt_free (+tgt_free keys) - * Lookup 1 to tgt_free/2 - * Insert 1+tgt_free to 2*tgt_free (+tgt_free keys) - * => 1+tgt_free/2 to LOCALFREE_TARGET will be removed by LRU +/* Verify that unreferenced elements are recycled before referenced ones. + * Insert elements. + * Reference a subset of these. + * Insert more, enough to trigger recycling. + * Verify that unreferenced are recycled. */ static void test_lru_sanity1(int map_type, int map_flags, unsigned int tgt_free) { @@ -257,7 +263,7 @@ static void test_lru_sanity1(int map_type, int map_flags, unsigned int tgt_free) batch_size = tgt_free / 2; assert(batch_size * 2 == tgt_free); - map_size = tgt_free + batch_size; + map_size = __map_size(tgt_free) + batch_size; lru_map_fd = create_map(map_type, map_flags, map_size); assert(lru_map_fd != -1); @@ -266,13 +272,13 @@ static void test_lru_sanity1(int map_type, int map_flags, unsigned int tgt_free) value[0] = 1234; - /* Insert 1 to tgt_free (+tgt_free keys) */ - end_key = 1 + tgt_free; + /* Insert map_size - batch_size keys */ + end_key = 1 + __map_size(tgt_free); for (key = 1; key < end_key; key++) assert(!bpf_map_update_elem(lru_map_fd, &key, value, BPF_NOEXIST)); - /* Lookup 1 to tgt_free/2 */ + /* Lookup 1 to batch_size */ end_key = 1 + batch_size; for (key = 1; key < end_key; key++) { assert(!bpf_map_lookup_elem_with_ref_bit(lru_map_fd, key, value)); @@ -280,12 +286,13 @@ static void test_lru_sanity1(int map_type, int map_flags, unsigned int tgt_free) BPF_NOEXIST)); } - /* Insert 1+tgt_free to 2*tgt_free - * => 1+tgt_free/2 to LOCALFREE_TARGET will be + /* Insert another map_size - batch_size keys + * Map will contain 1 to batch_size plus these latest, i.e., + * => previous 1+batch_size to map_size - batch_size will have been * removed by LRU */ - key = 1 + tgt_free; - end_key = key + tgt_free; + key = 1 + __map_size(tgt_free); + end_key = key + __map_size(tgt_free); for (; key < end_key; key++) { assert(!bpf_map_update_elem(lru_map_fd, &key, value, BPF_NOEXIST)); @@ -301,17 +308,8 @@ static void test_lru_sanity1(int map_type, int map_flags, unsigned int tgt_free) printf("Pass\n"); } -/* Size of the LRU map 1.5 * tgt_free - * Insert 1 to tgt_free (+tgt_free keys) - * Update 1 to tgt_free/2 - * => The original 1 to tgt_free/2 will be removed due to - * the LRU shrink process - * Re-insert 1 to tgt_free/2 again and do a lookup immeidately - * Insert 1+tgt_free to tgt_free*3/2 - * Insert 1+tgt_free*3/2 to tgt_free*5/2 - * => Key 1+tgt_free to tgt_free*3/2 - * will be removed from LRU because it has never - * been lookup and ref bit is not set +/* Verify that insertions exceeding map size will recycle the oldest. + * Verify that unreferenced elements are recycled before referenced. */ static void test_lru_sanity2(int map_type, int map_flags, unsigned int tgt_free) { @@ -334,7 +332,7 @@ static void test_lru_sanity2(int map_type, int map_flags, unsigned int tgt_free) batch_size = tgt_free / 2; assert(batch_size * 2 == tgt_free); - map_size = tgt_free + batch_size; + map_size = __map_size(tgt_free) + batch_size; lru_map_fd = create_map(map_type, map_flags, map_size); assert(lru_map_fd != -1); @@ -343,8 +341,8 @@ static void test_lru_sanity2(int map_type, int map_flags, unsigned int tgt_free) value[0] = 1234; - /* Insert 1 to tgt_free (+tgt_free keys) */ - end_key = 1 + tgt_free; + /* Insert map_size - batch_size keys */ + end_key = 1 + __map_size(tgt_free); for (key = 1; key < end_key; key++) assert(!bpf_map_update_elem(lru_map_fd, &key, value, BPF_NOEXIST)); @@ -357,8 +355,7 @@ static void test_lru_sanity2(int map_type, int map_flags, unsigned int tgt_free) * shrink the inactive list to get tgt_free * number of free nodes. * - * Hence, the oldest key 1 to tgt_free/2 - * are removed from the LRU list. + * Hence, the oldest key is removed from the LRU list. */ key = 1; if (map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) { @@ -370,8 +367,7 @@ static void test_lru_sanity2(int map_type, int map_flags, unsigned int tgt_free) BPF_EXIST)); } - /* Re-insert 1 to tgt_free/2 again and do a lookup - * immeidately. + /* Re-insert 1 to batch_size again and do a lookup immediately. */ end_key = 1 + batch_size; value[0] = 4321; @@ -387,17 +383,18 @@ static void test_lru_sanity2(int map_type, int map_flags, unsigned int tgt_free) value[0] = 1234; - /* Insert 1+tgt_free to tgt_free*3/2 */ - end_key = 1 + tgt_free + batch_size; - for (key = 1 + tgt_free; key < end_key; key++) + /* Insert batch_size new elements */ + key = 1 + __map_size(tgt_free); + end_key = key + batch_size; + for (; key < end_key; key++) /* These newly added but not referenced keys will be * gone during the next LRU shrink. */ assert(!bpf_map_update_elem(lru_map_fd, &key, value, BPF_NOEXIST)); - /* Insert 1+tgt_free*3/2 to tgt_free*5/2 */ - end_key = key + tgt_free; + /* Insert map_size - batch_size elements */ + end_key += __map_size(tgt_free); for (; key < end_key; key++) { assert(!bpf_map_update_elem(lru_map_fd, &key, value, BPF_NOEXIST)); @@ -500,7 +497,8 @@ static void test_lru_sanity4(int map_type, int map_flags, unsigned int tgt_free) lru_map_fd = create_map(map_type, map_flags, 3 * tgt_free * nr_cpus); else - lru_map_fd = create_map(map_type, map_flags, 3 * tgt_free); + lru_map_fd = create_map(map_type, map_flags, + 3 * __map_size(tgt_free)); assert(lru_map_fd != -1); expected_map_fd = create_map(BPF_MAP_TYPE_HASH, 0, |