From ceddae22cd08ba9f52a995cfb573fee89fa4afc4 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Thu, 13 Jul 2023 19:59:38 +0800 Subject: cgroup: remove obsolete comment above struct cgroupstats There's no flag in the delay accounting structure that indicates that the task is waiting on IO since commit 1193829da1a6 ("delayacct: cleanup flags in struct task_delay_info and functions use it"). So remove the comment. Signed-off-by: Miaohe Lin Signed-off-by: Tejun Heo --- include/uapi/linux/cgroupstats.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/cgroupstats.h b/include/uapi/linux/cgroupstats.h index aa306e4cd6c1..80b2c8594480 100644 --- a/include/uapi/linux/cgroupstats.h +++ b/include/uapi/linux/cgroupstats.h @@ -24,8 +24,6 @@ * basis. This data is shared using taskstats. * * Most of these states are derived by looking at the task->state value - * For the nr_io_wait state, a flag in the delay accounting structure - * indicates that the task is waiting on IO * * Each member is aligned to a 8 byte boundary. */ -- cgit v1.2.3 From 62157e11d9a4ca7210bb2b0e8fa0557a6ada7fad Mon Sep 17 00:00:00 2001 From: Kamalesh Babulal Date: Tue, 18 Jul 2023 14:38:34 +0530 Subject: cgroup/misc: update struct members descriptions Update the member descriptions of the miscellaneous controller's struct misc_res and struct misc_cg. Signed-off-by: Kamalesh Babulal Signed-off-by: Tejun Heo --- include/linux/misc_cgroup.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/misc_cgroup.h b/include/linux/misc_cgroup.h index c238207d1615..6555c0f57158 100644 --- a/include/linux/misc_cgroup.h +++ b/include/linux/misc_cgroup.h @@ -31,7 +31,7 @@ struct misc_cg; * struct misc_res: Per cgroup per misc type resource * @max: Maximum limit on the resource. * @usage: Current usage of the resource. - * @failed: True if charged failed for the resource in a cgroup. 
+ * @events: Number of times, the resource limit exceeded. */ struct misc_res { unsigned long max; @@ -42,6 +42,7 @@ struct misc_res { /** * struct misc_cg - Miscellaneous controller's cgroup structure. * @css: cgroup subsys state object. + * @events_file: Handle for the misc resources events file. * @res: Array of misc resources usage in the cgroup. */ struct misc_cg { -- cgit v1.2.3 From 32bf85c60ca3584a7ba3bef19da2779b73b2e7d6 Mon Sep 17 00:00:00 2001 From: Haitao Huang Date: Mon, 17 Jul 2023 18:08:45 -0700 Subject: cgroup/misc: Change counters to be explicit 64bit types So the variables can account for resources of huge quantities even on 32-bit machines. Signed-off-by: Haitao Huang Signed-off-by: Tejun Heo --- include/linux/misc_cgroup.h | 25 +++++++++------------ kernel/cgroup/misc.c | 55 ++++++++++++++++++++++----------------------- 2 files changed, 38 insertions(+), 42 deletions(-) (limited to 'include') diff --git a/include/linux/misc_cgroup.h b/include/linux/misc_cgroup.h index 6555c0f57158..e799b1f8d05b 100644 --- a/include/linux/misc_cgroup.h +++ b/include/linux/misc_cgroup.h @@ -34,9 +34,9 @@ struct misc_cg; * @events: Number of times, the resource limit exceeded. 
*/ struct misc_res { - unsigned long max; - atomic_long_t usage; - atomic_long_t events; + u64 max; + atomic64_t usage; + atomic64_t events; }; /** @@ -54,12 +54,10 @@ struct misc_cg { struct misc_res res[MISC_CG_RES_TYPES]; }; -unsigned long misc_cg_res_total_usage(enum misc_res_type type); -int misc_cg_set_capacity(enum misc_res_type type, unsigned long capacity); -int misc_cg_try_charge(enum misc_res_type type, struct misc_cg *cg, - unsigned long amount); -void misc_cg_uncharge(enum misc_res_type type, struct misc_cg *cg, - unsigned long amount); +u64 misc_cg_res_total_usage(enum misc_res_type type); +int misc_cg_set_capacity(enum misc_res_type type, u64 capacity); +int misc_cg_try_charge(enum misc_res_type type, struct misc_cg *cg, u64 amount); +void misc_cg_uncharge(enum misc_res_type type, struct misc_cg *cg, u64 amount); /** * css_misc() - Get misc cgroup from the css. @@ -100,27 +98,26 @@ static inline void put_misc_cg(struct misc_cg *cg) #else /* !CONFIG_CGROUP_MISC */ -static inline unsigned long misc_cg_res_total_usage(enum misc_res_type type) +static inline u64 misc_cg_res_total_usage(enum misc_res_type type) { return 0; } -static inline int misc_cg_set_capacity(enum misc_res_type type, - unsigned long capacity) +static inline int misc_cg_set_capacity(enum misc_res_type type, u64 capacity) { return 0; } static inline int misc_cg_try_charge(enum misc_res_type type, struct misc_cg *cg, - unsigned long amount) + u64 amount) { return 0; } static inline void misc_cg_uncharge(enum misc_res_type type, struct misc_cg *cg, - unsigned long amount) + u64 amount) { } diff --git a/kernel/cgroup/misc.c b/kernel/cgroup/misc.c index ae2f4dd47508..abbe9aa5cdd1 100644 --- a/kernel/cgroup/misc.c +++ b/kernel/cgroup/misc.c @@ -14,7 +14,7 @@ #include #define MAX_STR "max" -#define MAX_NUM ULONG_MAX +#define MAX_NUM U64_MAX /* Miscellaneous res name, keep it in sync with enum misc_res_type */ static const char *const misc_res_name[] = { @@ -37,7 +37,7 @@ static struct 
misc_cg root_cg; * more than the actual capacity. We are using Limits resource distribution * model of cgroup for miscellaneous controller. */ -static unsigned long misc_res_capacity[MISC_CG_RES_TYPES]; +static u64 misc_res_capacity[MISC_CG_RES_TYPES]; /** * parent_misc() - Get the parent of the passed misc cgroup. @@ -74,10 +74,10 @@ static inline bool valid_type(enum misc_res_type type) * Context: Any context. * Return: Current total usage of the resource. */ -unsigned long misc_cg_res_total_usage(enum misc_res_type type) +u64 misc_cg_res_total_usage(enum misc_res_type type) { if (valid_type(type)) - return atomic_long_read(&root_cg.res[type].usage); + return atomic64_read(&root_cg.res[type].usage); return 0; } @@ -95,7 +95,7 @@ EXPORT_SYMBOL_GPL(misc_cg_res_total_usage); * * %0 - Successfully registered the capacity. * * %-EINVAL - If @type is invalid. */ -int misc_cg_set_capacity(enum misc_res_type type, unsigned long capacity) +int misc_cg_set_capacity(enum misc_res_type type, u64 capacity) { if (!valid_type(type)) return -EINVAL; @@ -114,9 +114,9 @@ EXPORT_SYMBOL_GPL(misc_cg_set_capacity); * Context: Any context. */ static void misc_cg_cancel_charge(enum misc_res_type type, struct misc_cg *cg, - unsigned long amount) + u64 amount) { - WARN_ONCE(atomic_long_add_negative(-amount, &cg->res[type].usage), + WARN_ONCE(atomic64_add_negative(-amount, &cg->res[type].usage), "misc cgroup resource %s became less than 0", misc_res_name[type]); } @@ -137,13 +137,12 @@ static void misc_cg_cancel_charge(enum misc_res_type type, struct misc_cg *cg, * * -EBUSY - If max limit will be crossed or total usage will be more than the * capacity. 
*/ -int misc_cg_try_charge(enum misc_res_type type, struct misc_cg *cg, - unsigned long amount) +int misc_cg_try_charge(enum misc_res_type type, struct misc_cg *cg, u64 amount) { struct misc_cg *i, *j; int ret; struct misc_res *res; - int new_usage; + s64 new_usage; if (!(valid_type(type) && cg && READ_ONCE(misc_res_capacity[type]))) return -EINVAL; @@ -154,7 +153,7 @@ int misc_cg_try_charge(enum misc_res_type type, struct misc_cg *cg, for (i = cg; i; i = parent_misc(i)) { res = &i->res[type]; - new_usage = atomic_long_add_return(amount, &res->usage); + new_usage = atomic64_add_return(amount, &res->usage); if (new_usage > READ_ONCE(res->max) || new_usage > READ_ONCE(misc_res_capacity[type])) { ret = -EBUSY; @@ -165,7 +164,7 @@ int misc_cg_try_charge(enum misc_res_type type, struct misc_cg *cg, err_charge: for (j = i; j; j = parent_misc(j)) { - atomic_long_inc(&j->res[type].events); + atomic64_inc(&j->res[type].events); cgroup_file_notify(&j->events_file); } @@ -184,8 +183,7 @@ EXPORT_SYMBOL_GPL(misc_cg_try_charge); * * Context: Any context. */ -void misc_cg_uncharge(enum misc_res_type type, struct misc_cg *cg, - unsigned long amount) +void misc_cg_uncharge(enum misc_res_type type, struct misc_cg *cg, u64 amount) { struct misc_cg *i; @@ -209,7 +207,7 @@ static int misc_cg_max_show(struct seq_file *sf, void *v) { int i; struct misc_cg *cg = css_misc(seq_css(sf)); - unsigned long max; + u64 max; for (i = 0; i < MISC_CG_RES_TYPES; i++) { if (READ_ONCE(misc_res_capacity[i])) { @@ -217,7 +215,7 @@ static int misc_cg_max_show(struct seq_file *sf, void *v) if (max == MAX_NUM) seq_printf(sf, "%s max\n", misc_res_name[i]); else - seq_printf(sf, "%s %lu\n", misc_res_name[i], + seq_printf(sf, "%s %llu\n", misc_res_name[i], max); } } @@ -241,13 +239,13 @@ static int misc_cg_max_show(struct seq_file *sf, void *v) * Return: * * >= 0 - Number of bytes processed in the input. * * -EINVAL - If buf is not valid. - * * -ERANGE - If number is bigger than the unsigned long capacity. 
+ * * -ERANGE - If number is bigger than the u64 capacity. */ static ssize_t misc_cg_max_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { struct misc_cg *cg; - unsigned long max; + u64 max; int ret = 0, i; enum misc_res_type type = MISC_CG_RES_TYPES; char *token; @@ -271,7 +269,7 @@ static ssize_t misc_cg_max_write(struct kernfs_open_file *of, char *buf, if (!strcmp(MAX_STR, buf)) { max = MAX_NUM; } else { - ret = kstrtoul(buf, 0, &max); + ret = kstrtou64(buf, 0, &max); if (ret) return ret; } @@ -297,13 +295,13 @@ static ssize_t misc_cg_max_write(struct kernfs_open_file *of, char *buf, static int misc_cg_current_show(struct seq_file *sf, void *v) { int i; - unsigned long usage; + u64 usage; struct misc_cg *cg = css_misc(seq_css(sf)); for (i = 0; i < MISC_CG_RES_TYPES; i++) { - usage = atomic_long_read(&cg->res[i].usage); + usage = atomic64_read(&cg->res[i].usage); if (READ_ONCE(misc_res_capacity[i]) || usage) - seq_printf(sf, "%s %lu\n", misc_res_name[i], usage); + seq_printf(sf, "%s %llu\n", misc_res_name[i], usage); } return 0; @@ -322,12 +320,12 @@ static int misc_cg_current_show(struct seq_file *sf, void *v) static int misc_cg_capacity_show(struct seq_file *sf, void *v) { int i; - unsigned long cap; + u64 cap; for (i = 0; i < MISC_CG_RES_TYPES; i++) { cap = READ_ONCE(misc_res_capacity[i]); if (cap) - seq_printf(sf, "%s %lu\n", misc_res_name[i], cap); + seq_printf(sf, "%s %llu\n", misc_res_name[i], cap); } return 0; @@ -336,12 +334,13 @@ static int misc_cg_capacity_show(struct seq_file *sf, void *v) static int misc_events_show(struct seq_file *sf, void *v) { struct misc_cg *cg = css_misc(seq_css(sf)); - unsigned long events, i; + u64 events; + int i; for (i = 0; i < MISC_CG_RES_TYPES; i++) { - events = atomic_long_read(&cg->res[i].events); + events = atomic64_read(&cg->res[i].events); if (READ_ONCE(misc_res_capacity[i]) || events) - seq_printf(sf, "%s.max %lu\n", misc_res_name[i], events); + seq_printf(sf, "%s.max %llu\n", 
misc_res_name[i], events); } return 0; } @@ -397,7 +396,7 @@ misc_cg_alloc(struct cgroup_subsys_state *parent_css) for (i = 0; i < MISC_CG_RES_TYPES; i++) { WRITE_ONCE(cg->res[i].max, MAX_NUM); - atomic_long_set(&cg->res[i].usage, 0); + atomic64_set(&cg->res[i].usage, 0); } return &cg->css; -- cgit v1.2.3 From 0437719c1a97791481c5fd59642494f2108701a8 Mon Sep 17 00:00:00 2001 From: Hao Jia Date: Mon, 7 Aug 2023 11:29:30 +0800 Subject: cgroup/rstat: Record the cumulative per-cpu time of cgroup and its descendants The member variable bstat of the structure cgroup_rstat_cpu records the per-cpu time of the cgroup itself, but does not include the per-cpu time of its descendants. The per-cpu time including descendants is very useful for calculating the per-cpu usage of cgroups. We can indirectly obtain the total per-cpu time of the cgroup and its descendants by accumulating the per-cpu bstat of each descendant of the cgroup, but after a child cgroup is removed, we lose its bstat information. This will cause the cumulative value to be non-monotonic, thus affecting the accuracy of cgroup per-cpu usage. So we add the subtree_bstat variable to record the total per-cpu time of this cgroup and its descendants, which is similar to "cpuacct.usage*" in cgroup v1. And this is also helpful for the migration from cgroup v1 to cgroup v2. After adding this variable, we can obtain the per-cpu time of cgroup and its descendants in user mode through eBPF/drgn, etc. And we are still trying to determine how to expose it in the cgroupfs interface. 
Suggested-by: Tejun Heo Signed-off-by: Hao Jia Signed-off-by: Tejun Heo --- include/linux/cgroup-defs.h | 14 ++++++++++++++ kernel/cgroup/rstat.c | 12 ++++++++++-- 2 files changed, 24 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index 8a0d5466c7be..7a2862172f51 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -341,6 +341,20 @@ struct cgroup_rstat_cpu { */ struct cgroup_base_stat last_bstat; + /* + * This field is used to record the cumulative per-cpu time of + * the cgroup and its descendants. Currently it can be read via + * eBPF/drgn etc, and we are still trying to determine how to + * expose it in the cgroupfs interface. + */ + struct cgroup_base_stat subtree_bstat; + + /* + * Snapshots at the last reading. These are used to calculate the + * deltas to propagate to the per-cpu subtree_bstat. + */ + struct cgroup_base_stat last_subtree_bstat; + /* * Child cgroups with stat updates on this cpu since the last read * are linked on the parent's ->updated_children through diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c index 2542c21b6b6d..d80d7a608141 100644 --- a/kernel/cgroup/rstat.c +++ b/kernel/cgroup/rstat.c @@ -344,6 +344,7 @@ static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu) { struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu); struct cgroup *parent = cgroup_parent(cgrp); + struct cgroup_rstat_cpu *prstatc; struct cgroup_base_stat delta; unsigned seq; @@ -357,17 +358,24 @@ static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu) delta = rstatc->bstat; } while (__u64_stats_fetch_retry(&rstatc->bsync, seq)); - /* propagate percpu delta to global */ + /* propagate per-cpu delta to cgroup and per-cpu global statistics */ cgroup_base_stat_sub(&delta, &rstatc->last_bstat); cgroup_base_stat_add(&cgrp->bstat, &delta); cgroup_base_stat_add(&rstatc->last_bstat, &delta); + cgroup_base_stat_add(&rstatc->subtree_bstat, 
&delta); - /* propagate global delta to parent (unless that's root) */ + /* propagate cgroup and per-cpu global delta to parent (unless that's root) */ if (cgroup_parent(parent)) { delta = cgrp->bstat; cgroup_base_stat_sub(&delta, &cgrp->last_bstat); cgroup_base_stat_add(&parent->bstat, &delta); cgroup_base_stat_add(&cgrp->last_bstat, &delta); + + delta = rstatc->subtree_bstat; + prstatc = cgroup_rstat_cpu(parent, cpu); + cgroup_base_stat_sub(&delta, &rstatc->last_subtree_bstat); + cgroup_base_stat_add(&prstatc->subtree_bstat, &delta); + cgroup_base_stat_add(&rstatc->last_subtree_bstat, &delta); } } -- cgit v1.2.3