From b168ed458ddecc176f3b9a1f4bcd83d7a4541c14 Mon Sep 17 00:00:00 2001 From: Maarten Lankhorst Date: Wed, 4 Dec 2024 15:31:11 +0100 Subject: kernel/cgroup: Add "dmem" memory accounting cgroup This code is based on the RDMA and misc cgroup initially, but now uses page_counter. It uses the same min/low/max semantics as the memory cgroup as a result. There's a small mismatch as TTM uses u64, and page_counter long pages. In practice it's not a problem. 32-bits systems don't really come with >=4GB cards and as long as we're consistently wrong with units, it's fine. The device page size may not be in the same units as kernel page size, and each region might also have a different page size (VRAM vs GART for example). The interface is simple: - Call dmem_cgroup_register_region() - Use dmem_cgroup_try_charge to check if you can allocate a chunk of memory, use dmem_cgroup__uncharge when freeing it. This may return an error code, or -EAGAIN when the cgroup limit is reached. In that case a reference to the limiting pool is returned. - The limiting cs can be used as compare function for dmem_cgroup_state_evict_valuable. - After having evicted enough, drop reference to limiting cs with dmem_cgroup_pool_state_put. This API allows you to limit device resources with cgroups. You can see the supported cards in /sys/fs/cgroup/dmem.capacity You need to echo +dmem to cgroup.subtree_control, and then you can partition device memory. Co-developed-by: Friedrich Vock Signed-off-by: Friedrich Vock Co-developed-by: Maxime Ripard Signed-off-by: Maarten Lankhorst Acked-by: Tejun Heo Link: https://lore.kernel.org/r/20241204143112.1250983-1-dev@lankhorst.se Signed-off-by: Maxime Ripard --- include/linux/cgroup_dmem.h | 66 +++++++++++++++++++++++++++++++++++++++++++ include/linux/cgroup_subsys.h | 4 +++ include/linux/page_counter.h | 2 +- 3 files changed, 71 insertions(+), 1 deletion(-) create mode 100644 include/linux/cgroup_dmem.h (limited to 'include/linux') diff --git a/include/linux/cgroup_dmem.h b/include/linux/cgroup_dmem.h new file mode 100644 index 000000000000..dd4869f1d736 --- /dev/null +++ b/include/linux/cgroup_dmem.h @@ -0,0 +1,66 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2023-2024 Intel Corporation + */ + +#ifndef _CGROUP_DMEM_H +#define _CGROUP_DMEM_H + +#include +#include + +struct dmem_cgroup_pool_state; + +/* Opaque definition of a cgroup region, used internally */ +struct dmem_cgroup_region; + +#if IS_ENABLED(CONFIG_CGROUP_DMEM) +struct dmem_cgroup_region *dmem_cgroup_register_region(u64 size, const char *name_fmt, ...) __printf(2,3); +void dmem_cgroup_unregister_region(struct dmem_cgroup_region *region); +int dmem_cgroup_try_charge(struct dmem_cgroup_region *region, u64 size, + struct dmem_cgroup_pool_state **ret_pool, + struct dmem_cgroup_pool_state **ret_limit_pool); +void dmem_cgroup_uncharge(struct dmem_cgroup_pool_state *pool, u64 size); +bool dmem_cgroup_state_evict_valuable(struct dmem_cgroup_pool_state *limit_pool, + struct dmem_cgroup_pool_state *test_pool, + bool ignore_low, bool *ret_hit_low); + +void dmem_cgroup_pool_state_put(struct dmem_cgroup_pool_state *pool); +#else +static inline __printf(2,3) struct dmem_cgroup_region * +dmem_cgroup_register_region(u64 size, const char *name_fmt, ...) +{ + return NULL; +} + +static inline void dmem_cgroup_unregister_region(struct dmem_cgroup_region *region) +{ } + +static inline int dmem_cgroup_try_charge(struct dmem_cgroup_region *region, u64 size, + struct dmem_cgroup_pool_state **ret_pool, + struct dmem_cgroup_pool_state **ret_limit_pool) +{ + *ret_pool = NULL; + + if (ret_limit_pool) + *ret_limit_pool = NULL; + + return 0; +} + +static inline void dmem_cgroup_uncharge(struct dmem_cgroup_pool_state *pool, u64 size) +{ } + +static inline +bool dmem_cgroup_state_evict_valuable(struct dmem_cgroup_pool_state *limit_pool, + struct dmem_cgroup_pool_state *test_pool, + bool ignore_low, bool *ret_hit_low) +{ + return true; +} + +static inline void dmem_cgroup_pool_state_put(struct dmem_cgroup_pool_state *pool) +{ } + +#endif +#endif /* _CGROUP_DMEM_H */ diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h index 445235487230..3fd0bcbf3080 100644 --- a/include/linux/cgroup_subsys.h +++ b/include/linux/cgroup_subsys.h @@ -65,6 +65,10 @@ SUBSYS(rdma) SUBSYS(misc) #endif +#if IS_ENABLED(CONFIG_CGROUP_DMEM) +SUBSYS(dmem) +#endif + /* * The following subsystems are not supported on the default hierarchy. */ diff --git a/include/linux/page_counter.h b/include/linux/page_counter.h index 79dbd8bc35a7..46406f3fe34d 100644 --- a/include/linux/page_counter.h +++ b/include/linux/page_counter.h @@ -96,7 +96,7 @@ static inline void page_counter_reset_watermark(struct page_counter *counter) counter->watermark = usage; } -#ifdef CONFIG_MEMCG +#if IS_ENABLED(CONFIG_MEMCG) || IS_ENABLED(CONFIG_CGROUP_DMEM) void page_counter_calculate_protection(struct page_counter *root, struct page_counter *counter, bool recursive_protection); -- cgit v1.2.3