/* SPDX-License-Identifier: GPL-2.0 */
#include <errno.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>

#include "dwarf-regs.h" /* for EM_HOST */
#include "syscalltbl.h"
#include "util/cgroup.h"
#include "util/hashmap.h"
#include "util/trace.h"
#include "util/util.h"
#include <bpf/libbpf.h>
#include <linux/rbtree.h>
#include <linux/time64.h>
#include <tools/libc_compat.h> /* reallocarray */

#include "bpf_skel/syscall_summary.h"
#include "bpf_skel/syscall_summary.skel.h"

static struct syscall_summary_bpf *skel;
static struct rb_root cgroups = RB_ROOT;

int trace_prepare_bpf_summary(enum trace_summary_mode mode)
{
	skel = syscall_summary_bpf__open();
	if (skel == NULL) {
		fprintf(stderr, "failed to open syscall summary bpf skeleton\n");
		return -1;
	}

	if (mode == SUMMARY__BY_THREAD)
		skel->rodata->aggr_mode = SYSCALL_AGGR_THREAD;
	else if (mode == SUMMARY__BY_CGROUP)
		skel->rodata->aggr_mode = SYSCALL_AGGR_CGROUP;
	else
		skel->rodata->aggr_mode = SYSCALL_AGGR_CPU;

	if (cgroup_is_v2("perf_event") > 0)
		skel->rodata->use_cgroup_v2 = 1;

	if (syscall_summary_bpf__load(skel) < 0) {
		fprintf(stderr, "failed to load syscall summary bpf skeleton\n");
		return -1;
	}

	if (syscall_summary_bpf__attach(skel) < 0) {
		fprintf(stderr, "failed to attach syscall summary bpf skeleton\n");
		return -1;
	}

	if (mode == SUMMARY__BY_CGROUP)
		read_all_cgroups(&cgroups);

	return 0;
}

void trace_start_bpf_summary(void)
{
	skel->bss->enabled = 1;
}

void trace_end_bpf_summary(void)
{
	skel->bss->enabled = 0;
}

struct syscall_node {
	int syscall_nr;
	struct syscall_stats stats;
};

static double rel_stddev(struct syscall_stats *stat)
{
	double variance, average;

	if (stat->count < 2)
		return 0;

	average = (double)stat->total_time / stat->count;

	variance = stat->squared_sum;
	variance -= (stat->total_time * stat->total_time) / stat->count;
	variance /= stat->count - 1;

	return 100 * sqrt(variance / stat->count) / average;
}

/*
 * The syscall_data struct maintains syscall stats ordered by total time.
 * It supports different summary modes like per-thread or global.
 *
 * For per-thread stats, it uses a two-level data structure -
 * syscall_data is keyed by TID and has an array of nodes, each of
 * which represents a syscall made by the thread.
 *
 * For global stats, it's technically still two-level, but per-CPU
 * analysis isn't needed, so it's keyed by the syscall number to combine
 * stats from different CPUs.  syscall_data then always has a single
 * syscall_node, so it effectively works as a flat hierarchy.
 *
 * For per-cgroup stats, it uses a two-level data structure like the
 * per-thread case - syscall_data is keyed by cgroup id and has an array
 * of nodes, each of which represents a syscall made by the cgroup.
 */
struct syscall_data {
	u64 key; /* tid if AGGR_THREAD, syscall-nr if AGGR_CPU, cgroup if AGGR_CGROUP */
	int nr_events;
	int nr_nodes;
	u64 total_time;
	struct syscall_node *nodes;
};

static int datacmp(const void *a, const void *b)
{
	const struct syscall_data * const *sa = a;
	const struct syscall_data * const *sb = b;

	return (*sa)->total_time > (*sb)->total_time ? -1 : 1;
}

static int nodecmp(const void *a, const void *b)
{
	const struct syscall_node *na = a;
	const struct syscall_node *nb = b;

	return na->stats.total_time > nb->stats.total_time ? -1 : 1;
}
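/*
 * Hashmap callbacks: the key (TID, syscall number, or cgroup id) is
 * already a unique integer, so it is used directly as the hash value
 * and keys are compared with plain equality.
 */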
static size_t sc_node_hash(long key, void *ctx __maybe_unused)
{
	return key;
}

static bool sc_node_equal(long key1, long key2, void *ctx __maybe_unused)
{
	return key1 == key2;
}

static int print_common_stats(struct syscall_data *data, FILE *fp)
{
	int printed = 0;

	for (int i = 0; i < data->nr_nodes; i++) {
		struct syscall_node *node = &data->nodes[i];
		struct syscall_stats *stat = &node->stats;
		double total = (double)(stat->total_time) / NSEC_PER_MSEC;
		double min = (double)(stat->min_time) / NSEC_PER_MSEC;
		double max = (double)(stat->max_time) / NSEC_PER_MSEC;
		double avg = total / stat->count;
		const char *name;

		/* TODO: support other ABIs */
		name = syscalltbl__name(EM_HOST, node->syscall_nr);
		if (name)
			printed += fprintf(fp, "   %-15s", name);
		else
			printed += fprintf(fp, "   syscall:%-7d", node->syscall_nr);

		printed += fprintf(fp, " %8u %6u %9.3f %9.3f %9.3f %9.3f %9.2f%%\n",
				   stat->count, stat->error, total, min, avg, max,
				   rel_stddev(stat));
	}
	return printed;
}

static int update_thread_stats(struct hashmap *hash, struct syscall_key *map_key,
			       struct syscall_stats *map_data)
{
	struct syscall_data *data;
	struct syscall_node *nodes;

	if (!hashmap__find(hash, map_key->cpu_or_tid, &data)) {
		data = zalloc(sizeof(*data));
		if (data == NULL)
			return -ENOMEM;

		data->key = map_key->cpu_or_tid;
		if (hashmap__add(hash, data->key, data) < 0) {
			free(data);
			return -ENOMEM;
		}
	}

	/* update thread total stats */
	data->nr_events += map_data->count;
	data->total_time += map_data->total_time;

	nodes = reallocarray(data->nodes, data->nr_nodes + 1, sizeof(*nodes));
	if (nodes == NULL)
		return -ENOMEM;

	data->nodes = nodes;
	nodes = &data->nodes[data->nr_nodes++];
	nodes->syscall_nr = map_key->nr;

	/* each thread has an entry for each syscall, just use the stat */
	memcpy(&nodes->stats, map_data, sizeof(*map_data));
	return 0;
}

static int print_thread_stat(struct syscall_data *data, FILE *fp)
{
	int printed = 0;

	qsort(data->nodes, data->nr_nodes, sizeof(*data->nodes), nodecmp);

	printed += fprintf(fp, " thread (%d), ", (int)data->key);
	printed += fprintf(fp, "%d events\n\n", data->nr_events);

	printed += fprintf(fp, "   syscall            calls  errors  total       min       avg       max      stddev\n");
	printed += fprintf(fp, "                                     (msec)    (msec)    (msec)    (msec)        (%%)\n");
	printed += fprintf(fp, "   --------------- --------  ------ -------- --------- --------- ---------     ------\n");

	printed += print_common_stats(data, fp);
	printed += fprintf(fp, "\n\n");

	return printed;
}

static int print_thread_stats(struct syscall_data **data, int nr_data, FILE *fp)
{
	int printed = 0;

	for (int i = 0; i < nr_data; i++)
		printed += print_thread_stat(data[i], fp);

	return printed;
}

static int update_total_stats(struct hashmap *hash, struct syscall_key *map_key,
			      struct syscall_stats *map_data)
{
	struct syscall_data *data;
	struct syscall_stats *stat;

	if (!hashmap__find(hash, map_key->nr, &data)) {
		data = zalloc(sizeof(*data));
		if (data == NULL)
			return -ENOMEM;

		data->nodes = zalloc(sizeof(*data->nodes));
		if (data->nodes == NULL) {
			free(data);
			return -ENOMEM;
		}

		data->nr_nodes = 1;
		data->key = map_key->nr;
		data->nodes->syscall_nr = data->key;

		if (hashmap__add(hash, data->key, data) < 0) {
			free(data->nodes);
			free(data);
			return -ENOMEM;
		}
	}

	/* update total stats for this syscall */
	data->nr_events += map_data->count;
	data->total_time += map_data->total_time;

	/* This is sum of the same syscall from different CPUs */
	stat = &data->nodes->stats;

	stat->total_time += map_data->total_time;
	stat->squared_sum += map_data->squared_sum;
	stat->count += map_data->count;
	stat->error += map_data->error;
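	/*
	 * Merge per-CPU min/max into the combined stats; min_time == 0
	 * means no sample has been accumulated for this entry yet.
	 */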
	if (stat->max_time < map_data->max_time)
		stat->max_time = map_data->max_time;
	if (stat->min_time > map_data->min_time || stat->min_time == 0)
		stat->min_time = map_data->min_time;

	return 0;
}

static int print_total_stats(struct syscall_data **data, int nr_data, FILE *fp)
{
	int printed = 0;
	int nr_events = 0;

	for (int i = 0; i < nr_data; i++)
		nr_events += data[i]->nr_events;

	printed += fprintf(fp, " total, %d events\n\n", nr_events);

	printed += fprintf(fp, "   syscall            calls  errors  total       min       avg       max      stddev\n");
	printed += fprintf(fp, "                                     (msec)    (msec)    (msec)    (msec)        (%%)\n");
	printed += fprintf(fp, "   --------------- --------  ------ -------- --------- --------- ---------     ------\n");

	for (int i = 0; i < nr_data; i++)
		printed += print_common_stats(data[i], fp);

	printed += fprintf(fp, "\n\n");
	return printed;
}

static int update_cgroup_stats(struct hashmap *hash, struct syscall_key *map_key,
			       struct syscall_stats *map_data)
{
	struct syscall_data *data;
	struct syscall_node *nodes;

	if (!hashmap__find(hash, map_key->cgroup, &data)) {
		data = zalloc(sizeof(*data));
		if (data == NULL)
			return -ENOMEM;

		data->key = map_key->cgroup;
		if (hashmap__add(hash, data->key, data) < 0) {
			free(data);
			return -ENOMEM;
		}
	}

	/* update cgroup total stats */
	data->nr_events += map_data->count;
	data->total_time += map_data->total_time;

	nodes = reallocarray(data->nodes, data->nr_nodes + 1, sizeof(*nodes));
	if (nodes == NULL)
		return -ENOMEM;

	data->nodes = nodes;
	nodes = &data->nodes[data->nr_nodes++];
	nodes->syscall_nr = map_key->nr;

	/* each cgroup has an entry for each syscall, just use the stat */
	memcpy(&nodes->stats, map_data, sizeof(*map_data));
	return 0;
}

static int print_cgroup_stat(struct syscall_data *data, FILE *fp)
{
	int printed = 0;
	struct cgroup *cgrp = __cgroup__find(&cgroups, data->key);

	qsort(data->nodes, data->nr_nodes, sizeof(*data->nodes), nodecmp);

	if (cgrp)
		printed += fprintf(fp, " cgroup %s,", cgrp->name);
	else
		printed += fprintf(fp, " cgroup id:%lu,", (unsigned long)data->key);

	printed += fprintf(fp, " %d events\n\n", data->nr_events);

	printed += fprintf(fp, "   syscall            calls  errors  total       min       avg       max      stddev\n");
	printed += fprintf(fp, "                                     (msec)    (msec)    (msec)    (msec)        (%%)\n");
	printed += fprintf(fp, "   --------------- --------  ------ -------- --------- --------- ---------     ------\n");

	printed += print_common_stats(data, fp);
	printed += fprintf(fp, "\n\n");

	return printed;
}

static int print_cgroup_stats(struct syscall_data **data, int nr_data, FILE *fp)
{
	int printed = 0;

	for (int i = 0; i < nr_data; i++)
		printed += print_cgroup_stat(data[i], fp);

	return printed;
}

int trace_print_bpf_summary(FILE *fp)
{
	struct bpf_map *map = skel->maps.syscall_stats_map;
	struct syscall_key *prev_key, key;
	struct syscall_data **data = NULL;
	struct hashmap schash;
	struct hashmap_entry *entry;
	int nr_data = 0;
	int printed = 0;
	int i;
	size_t bkt;

	hashmap__init(&schash, sc_node_hash, sc_node_equal, /*ctx=*/NULL);

	printed = fprintf(fp, "\n Summary of events:\n\n");

	/* get stats from the bpf map */
	prev_key = NULL;
	while (!bpf_map__get_next_key(map, prev_key, &key, sizeof(key))) {
		struct syscall_stats stat;

		if (!bpf_map__lookup_elem(map, &key, sizeof(key), &stat, sizeof(stat), 0)) {
			switch (skel->rodata->aggr_mode) {
			case SYSCALL_AGGR_THREAD:
				update_thread_stats(&schash, &key, &stat);
				break;
			case SYSCALL_AGGR_CPU:
				update_total_stats(&schash, &key, &stat);
				break;
			case SYSCALL_AGGR_CGROUP:
				update_cgroup_stats(&schash, &key, &stat);
				break;
			default:
				break;
			}
		}

		prev_key = &key;
	}

	nr_data = hashmap__size(&schash);
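	/*
	 * Flatten the hashmap entries into an array so that they can be
	 * sorted by total time with qsort() below.
	 */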
	data = calloc(nr_data, sizeof(*data));
	if (data == NULL)
		goto out;

	i = 0;
	hashmap__for_each_entry(&schash, entry, bkt)
		data[i++] = entry->pvalue;

	qsort(data, nr_data, sizeof(*data), datacmp);

	switch (skel->rodata->aggr_mode) {
	case SYSCALL_AGGR_THREAD:
		printed += print_thread_stats(data, nr_data, fp);
		break;
	case SYSCALL_AGGR_CPU:
		printed += print_total_stats(data, nr_data, fp);
		break;
	case SYSCALL_AGGR_CGROUP:
		printed += print_cgroup_stats(data, nr_data, fp);
		break;
	default:
		break;
	}

	for (i = 0; i < nr_data && data; i++) {
		free(data[i]->nodes);
		free(data[i]);
	}
	free(data);

out:
	hashmap__clear(&schash);
	return printed;
}

void trace_cleanup_bpf_summary(void)
{
	if (!RB_EMPTY_ROOT(&cgroups)) {
		struct cgroup *cgrp, *tmp;

		rbtree_postorder_for_each_entry_safe(cgrp, tmp, &cgroups, node)
			cgroup__put(cgrp);

		cgroups = RB_ROOT;
	}
	syscall_summary_bpf__destroy(skel);
}