summary refs log tree commit diff
path: root/tools
diff options
context:
space:
mode:
Diffstat (limited to 'tools')
-rw-r--r-- tools/bpf/resolve_btfids/Makefile 2
-rw-r--r-- tools/iio/iio_generic_buffer.c 2
-rw-r--r-- tools/lib/bpf/btf.c 6
-rw-r--r-- tools/perf/util/symbol-elf.c 6
-rw-r--r-- tools/power/x86/turbostat/turbostat.8 6
-rw-r--r-- tools/power/x86/turbostat/turbostat.c 468
-rw-r--r-- tools/testing/selftests/mm/ksm_tests.c 32
-rw-r--r-- tools/testing/selftests/mm/merge.c 43
-rw-r--r-- tools/testing/selftests/mm/thuge-gen.c 6
-rw-r--r-- tools/testing/selftests/mm/vm_util.c 38
-rw-r--r-- tools/testing/selftests/mm/vm_util.h 2
-rw-r--r-- tools/testing/selftests/mount_setattr/mount_setattr_test.c 17
-rw-r--r-- tools/testing/selftests/ublk/Makefile 1
-rw-r--r-- tools/testing/selftests/ublk/fault_inject.c 4
-rw-r--r-- tools/testing/selftests/ublk/file_backed.c 20
-rw-r--r-- tools/testing/selftests/ublk/kublk.c 374
-rw-r--r-- tools/testing/selftests/ublk/kublk.h 73
-rw-r--r-- tools/testing/selftests/ublk/null.c 22
-rw-r--r-- tools/testing/selftests/ublk/stripe.c 17
-rwxr-xr-x tools/testing/selftests/ublk/test_common.sh 5
-rwxr-xr-x tools/testing/selftests/ublk/test_generic_12.sh 55
-rwxr-xr-x tools/testing/selftests/ublk/test_stress_03.sh 8
-rwxr-xr-x tools/testing/selftests/ublk/test_stress_04.sh 7
-rwxr-xr-x tools/testing/selftests/ublk/test_stress_05.sh 7
-rw-r--r-- tools/testing/selftests/ublk/trace/count_ios_per_tid.bt 11
-rw-r--r-- tools/testing/selftests/vDSO/vgetrandom-chacha.S 2
-rw-r--r-- tools/testing/vma/vma_internal.h 2
27 files changed, 897 insertions, 339 deletions
diff --git a/tools/bpf/resolve_btfids/Makefile b/tools/bpf/resolve_btfids/Makefile
index afbddea3a39c..ce1b556dfa90 100644
--- a/tools/bpf/resolve_btfids/Makefile
+++ b/tools/bpf/resolve_btfids/Makefile
@@ -17,7 +17,7 @@ endif
# Overrides for the prepare step libraries.
HOST_OVERRIDES := AR="$(HOSTAR)" CC="$(HOSTCC)" LD="$(HOSTLD)" ARCH="$(HOSTARCH)" \
- CROSS_COMPILE="" EXTRA_CFLAGS="$(HOSTCFLAGS)"
+ CROSS_COMPILE="" CLANG_CROSS_FLAGS="" EXTRA_CFLAGS="$(HOSTCFLAGS)"
RM ?= rm
HOSTCC ?= gcc
diff --git a/tools/iio/iio_generic_buffer.c b/tools/iio/iio_generic_buffer.c
index 9ef5ee087eda..bc82bb6a7a2a 100644
--- a/tools/iio/iio_generic_buffer.c
+++ b/tools/iio/iio_generic_buffer.c
@@ -335,7 +335,7 @@ static const struct option longopts[] = {
{ "device-num", 1, 0, 'N' },
{ "trigger-name", 1, 0, 't' },
{ "trigger-num", 1, 0, 'T' },
- { },
+ { }
};
int main(int argc, char **argv)
diff --git a/tools/lib/bpf/btf.c b/tools/lib/bpf/btf.c
index f1d495dc66bb..37682908cb0f 100644
--- a/tools/lib/bpf/btf.c
+++ b/tools/lib/bpf/btf.c
@@ -1384,12 +1384,12 @@ static struct btf *btf_parse_raw_mmap(const char *path, struct btf *base_btf)
fd = open(path, O_RDONLY);
if (fd < 0)
- return libbpf_err_ptr(-errno);
+ return ERR_PTR(-errno);
if (fstat(fd, &st) < 0) {
err = -errno;
close(fd);
- return libbpf_err_ptr(err);
+ return ERR_PTR(err);
}
data = mmap(NULL, st.st_size, PROT_READ, MAP_PRIVATE, fd, 0);
@@ -1397,7 +1397,7 @@ static struct btf *btf_parse_raw_mmap(const char *path, struct btf *base_btf)
close(fd);
if (data == MAP_FAILED)
- return libbpf_err_ptr(err);
+ return ERR_PTR(err);
btf = btf_new(data, st.st_size, base_btf, true);
if (IS_ERR(btf))
diff --git a/tools/perf/util/symbol-elf.c b/tools/perf/util/symbol-elf.c
index 01818abd02df..6d2c280a1730 100644
--- a/tools/perf/util/symbol-elf.c
+++ b/tools/perf/util/symbol-elf.c
@@ -1668,6 +1668,12 @@ dso__load_sym_internal(struct dso *dso, struct map *map, struct symsrc *syms_ss,
continue;
}
+ /* Reject RISCV ELF "mapping symbols" */
+ if (ehdr.e_machine == EM_RISCV) {
+ if (elf_name[0] == '$' && strchr("dx", elf_name[1]))
+ continue;
+ }
+
if (runtime_ss->opdsec && sym.st_shndx == runtime_ss->opdidx) {
u32 offset = sym.st_value - syms_ss->opdshdr.sh_addr;
u64 *opd = opddata->d_buf + offset;
diff --git a/tools/power/x86/turbostat/turbostat.8 b/tools/power/x86/turbostat/turbostat.8
index b74ed916057e..fb11108aaf42 100644
--- a/tools/power/x86/turbostat/turbostat.8
+++ b/tools/power/x86/turbostat/turbostat.8
@@ -100,7 +100,7 @@ The column name "all" can be used to enable all disabled-by-default built-in cou
.PP
\fB--show column\fP show only the specified built-in columns. May be invoked multiple times, or with a comma-separated list of column names.
.PP
-\fB--show CATEGORY --hide CATEGORY\fP Show and hide also accept a single CATEGORY of columns: "all", "topology", "idle", "frequency", "power", "cpuidle", "hwidle", "swidle", "other". "idle" (enabled by default), includes "hwidle" and "idle_pct". "cpuidle" (default disabled) includes cpuidle software invocation counters. "swidle" includes "cpuidle" plus "idle_pct". "hwidle" includes only hardware based idle residency counters. Older versions of turbostat used the term "sysfs" for what is now "swidle".
+\fB--show CATEGORY --hide CATEGORY\fP Show and hide also accept a single CATEGORY of columns: "all", "topology", "idle", "frequency", "power", "cpuidle", "hwidle", "swidle", "other". "idle" (enabled by default), includes "hwidle" and "pct_idle". "cpuidle" (default disabled) includes cpuidle software invocation counters. "swidle" includes "cpuidle" plus "pct_idle". "hwidle" includes only hardware based idle residency counters. Older versions of turbostat used the term "sysfs" for what is now "swidle".
.PP
\fB--Dump\fP displays the raw counter values.
.PP
@@ -204,8 +204,8 @@ The system configuration dump (if --quiet is not used) is followed by statistics
.PP
\fBUncMHz\fP per-package uncore MHz, instantaneous sample.
.PP
-\fBUMHz1.0\fP per-package uncore MHz for domain=1 and fabric_cluster=0, instantaneous sample. System summary is the average of all packages.
-Intel Granite Rapids systems use domains 0-2 for CPUs, and 3-4 for IO, with cluster always 0.
+\fBUMHz1.0\fP per-package uncore MHz for pm_domain=1 and fabric_cluster=0, instantaneous sample. System summary is the average of all packages.
+Intel Granite Rapids systems use pm_domains 0-2 for CPUs, and 3-4 for IO, with cluster always 0.
For the "--show" and "--hide" options, use "UncMHz" to operate on all UMHz*.* as a group.
.SH TOO MUCH INFORMATION EXAMPLE
By default, turbostat dumps all possible information -- a system configuration header, followed by columns for all counters.
diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c
index 0170d3cc6819..5230e072e414 100644
--- a/tools/power/x86/turbostat/turbostat.c
+++ b/tools/power/x86/turbostat/turbostat.c
@@ -280,7 +280,7 @@ struct msr_counter bic[] = {
#define BIC_GROUP_FREQUENCY (BIC_Avg_MHz | BIC_Busy | BIC_Bzy_MHz | BIC_TSC_MHz | BIC_GFXMHz | BIC_GFXACTMHz | BIC_SAMMHz | BIC_SAMACTMHz | BIC_UNCORE_MHZ)
#define BIC_GROUP_HW_IDLE (BIC_Busy | BIC_CPU_c1 | BIC_CPU_c3 | BIC_CPU_c6 | BIC_CPU_c7 | BIC_GFX_rc6 | BIC_Pkgpc2 | BIC_Pkgpc3 | BIC_Pkgpc6 | BIC_Pkgpc7 | BIC_Pkgpc8 | BIC_Pkgpc9 | BIC_Pkgpc10 | BIC_CPU_LPI | BIC_SYS_LPI | BIC_Mod_c6 | BIC_Totl_c0 | BIC_Any_c0 | BIC_GFX_c0 | BIC_CPUGFX | BIC_SAM_mc6 | BIC_Diec6)
#define BIC_GROUP_SW_IDLE (BIC_Busy | BIC_cpuidle | BIC_pct_idle )
-#define BIC_GROUP_IDLE (BIC_GROUP_HW_IDLE | BIC_pct_idle)
+#define BIC_GROUP_IDLE (BIC_GROUP_HW_IDLE | BIC_pct_idle)
#define BIC_OTHER (BIC_IRQ | BIC_NMI | BIC_SMI | BIC_ThreadC | BIC_CoreTmp | BIC_IPC)
#define BIC_DISABLED_BY_DEFAULT (BIC_USEC | BIC_TOD | BIC_APIC | BIC_X2APIC | BIC_cpuidle)
@@ -539,7 +539,7 @@ enum rapl_msrs {
#define RAPL_PKG_ALL (RAPL_PKG | RAPL_PKG_PERF_STATUS | RAPL_PKG_POWER_INFO)
#define RAPL_DRAM_ALL (RAPL_DRAM | RAPL_DRAM_PERF_STATUS | RAPL_DRAM_POWER_INFO)
#define RAPL_CORE_ALL (RAPL_CORE | RAPL_CORE_POLICY)
-#define RAPL_GFX_ALL (RAPL_GFX | RAPL_GFX_POLIGY)
+#define RAPL_GFX_ALL (RAPL_GFX | RAPL_GFX_POLICY)
#define RAPL_AMD_F17H (RAPL_AMD_PWR_UNIT | RAPL_AMD_CORE_ENERGY_STAT | RAPL_AMD_PKG_ENERGY_STAT)
@@ -839,6 +839,23 @@ static const struct platform_features spr_features = {
.rapl_msrs = RAPL_PKG_ALL | RAPL_DRAM_ALL | RAPL_PSYS,
};
+/*
+ * Platform feature set used for INTEL_PANTHERCOVE_X in turbostat_pdata[].
+ *
+ * Every field named below is copied from spr_features, except:
+ *   - has_msr_module_c6_res_ms, which is forced to 1
+ *   - rapl_msrs, which is forced to 0
+ * Fields that are not named here are zero-initialized; they are NOT
+ * inherited from spr_features.
+ */
+static const struct platform_features dmr_features = {
+ .has_msr_misc_feature_control = spr_features.has_msr_misc_feature_control,
+ .has_msr_misc_pwr_mgmt = spr_features.has_msr_misc_pwr_mgmt,
+ .has_nhm_msrs = spr_features.has_nhm_msrs,
+ .has_config_tdp = spr_features.has_config_tdp,
+ .bclk_freq = spr_features.bclk_freq,
+ .supported_cstates = spr_features.supported_cstates,
+ .cst_limit = spr_features.cst_limit,
+ .has_msr_core_c1_res = spr_features.has_msr_core_c1_res,
+ .has_msr_module_c6_res_ms = 1, /* DMR has Dual Core Module and MC6 MSR */
+ .has_irtl_msrs = spr_features.has_irtl_msrs,
+ .has_cst_prewake_bit = spr_features.has_cst_prewake_bit,
+ .has_fixed_rapl_psys_unit = spr_features.has_fixed_rapl_psys_unit,
+ .trl_msrs = spr_features.trl_msrs,
+ .rapl_msrs = 0, /* DMR does not have RAPL MSRs */
+};
+
static const struct platform_features srf_features = {
.has_msr_misc_feature_control = 1,
.has_msr_misc_pwr_mgmt = 1,
@@ -1028,12 +1045,14 @@ static const struct platform_data turbostat_pdata[] = {
{ INTEL_EMERALDRAPIDS_X, &spr_features },
{ INTEL_GRANITERAPIDS_X, &spr_features },
{ INTEL_GRANITERAPIDS_D, &spr_features },
+ { INTEL_PANTHERCOVE_X, &dmr_features },
{ INTEL_LAKEFIELD, &cnl_features },
{ INTEL_ALDERLAKE, &adl_features },
{ INTEL_ALDERLAKE_L, &adl_features },
{ INTEL_RAPTORLAKE, &adl_features },
{ INTEL_RAPTORLAKE_P, &adl_features },
{ INTEL_RAPTORLAKE_S, &adl_features },
+ { INTEL_BARTLETTLAKE, &adl_features },
{ INTEL_METEORLAKE, &adl_features },
{ INTEL_METEORLAKE_L, &adl_features },
{ INTEL_ARROWLAKE_H, &adl_features },
@@ -1072,7 +1091,6 @@ void probe_platform_features(unsigned int family, unsigned int model)
{
int i;
-
if (authentic_amd || hygon_genuine) {
/* fallback to default features on unsupported models */
force_load++;
@@ -1106,8 +1124,7 @@ end:
if (platform)
return;
- fprintf(stderr, "Unsupported platform detected.\n"
- "\tSee RUN THE LATEST VERSION on turbostat(8)\n");
+ fprintf(stderr, "Unsupported platform detected.\n\tSee RUN THE LATEST VERSION on turbostat(8)\n");
exit(1);
}
@@ -1127,7 +1144,8 @@ char *progname;
#define CPU_SUBSET_MAXCPUS 8192 /* need to use before probe... */
cpu_set_t *cpu_present_set, *cpu_possible_set, *cpu_effective_set, *cpu_allowed_set, *cpu_affinity_set, *cpu_subset;
-size_t cpu_present_setsize, cpu_possible_setsize, cpu_effective_setsize, cpu_allowed_setsize, cpu_affinity_setsize, cpu_subset_size;
+size_t cpu_present_setsize, cpu_possible_setsize, cpu_effective_setsize, cpu_allowed_setsize, cpu_affinity_setsize,
+ cpu_subset_size;
#define MAX_ADDED_THREAD_COUNTERS 24
#define MAX_ADDED_CORE_COUNTERS 8
#define MAX_ADDED_PACKAGE_COUNTERS 16
@@ -2140,13 +2158,20 @@ int get_msr_fd(int cpu)
if (fd)
return fd;
-
+#if defined(ANDROID)
+ sprintf(pathname, "/dev/msr%d", cpu);
+#else
sprintf(pathname, "/dev/cpu/%d/msr", cpu);
+#endif
fd = open(pathname, O_RDONLY);
if (fd < 0)
+#if defined(ANDROID)
+ err(-1, "%s open failed, try chown or chmod +r /dev/msr*, "
+ "or run with --no-msr, or run as root", pathname);
+#else
err(-1, "%s open failed, try chown or chmod +r /dev/cpu/*/msr, "
"or run with --no-msr, or run as root", pathname);
-
+#endif
fd_percpu[cpu] = fd;
return fd;
@@ -2215,32 +2240,52 @@ int get_msr(int cpu, off_t offset, unsigned long long *msr)
return 0;
}
-int probe_rapl_msr(int cpu, off_t offset, int index)
+/*
+ * add_msr_counter() - probe whether the MSR at @offset can be read on @cpu.
+ *
+ * Returns:
+ *   -1  MSR access is disabled (no_msr), @offset is 0, or pread() did not
+ *       return a full sizeof(value) bytes
+ *    0  the MSR was read and its value is 0
+ *    1  the MSR was read and its value is non-zero
+ */
+int add_msr_counter(int cpu, off_t offset)
{
ssize_t retval;
unsigned long long value;
- assert(!no_msr);
+ if (no_msr)
+ return -1;
+
+ if (!offset)
+ return -1;
retval = pread(get_msr_fd(cpu), &value, sizeof(value), offset);
/* if the read failed, the probe fails */
if (retval != sizeof(value))
- return 1;
+ return -1;
+
+ if (value == 0)
+ return 0;
+
+ return 1;
+}
+
+/*
+ * add_rapl_msr_counter() - probe the MSR source of the RAPL counter @cai on @cpu.
+ *
+ * Returns 1 when the MSR can be used for this counter, and -1 when
+ *   - platform->rapl_msrs does not contain cai->feature_mask,
+ *   - add_msr_counter() fails for cai->msr, or
+ *   - the counter is one of the energy status counters listed in the
+ *     switch below and its MSR reads back as 0.
+ *
+ * Callers test the result with ">= 0", so the zero-energy case has to be
+ * reported as -1; returning 1 there would make the switch a no-op.
+ */
+int add_rapl_msr_counter(int cpu, const struct rapl_counter_arch_info *cai)
+{
+	int ret;
+
+	if (!(platform->rapl_msrs & cai->feature_mask))
+		return -1;
+
+	ret = add_msr_counter(cpu, cai->msr);
+	if (ret < 0)
+		return -1;
+
 	/* If an Energy Status Counter MSR returns 0, the probe fails */
-	switch (index) {
+	switch (cai->rci_index) {
 	case RAPL_RCI_INDEX_ENERGY_PKG:
 	case RAPL_RCI_INDEX_ENERGY_CORES:
 	case RAPL_RCI_INDEX_DRAM:
 	case RAPL_RCI_INDEX_GFX:
 	case RAPL_RCI_INDEX_ENERGY_PLATFORM:
-		if (value == 0)
-			return 1;
+		if (ret == 0)
+			return -1;
 	}
 	/* PKG,DRAM_PERF_STATUS MSRs, can return any value */
-	return 0;
+	return 1;
 }
/* Convert CPU ID to domain ID for given added perf counter. */
@@ -2327,8 +2372,7 @@ void help(void)
" degrees Celsius\n"
" -h, --help\n"
" print this help message\n"
- " -v, --version\n"
- " print version information\n\nFor more help, run \"man turbostat\"\n");
+ " -v, --version\n\t\tprint version information\n\nFor more help, run \"man turbostat\"\n");
}
/*
@@ -2644,7 +2688,7 @@ void print_header(char *delim)
if (DO_BIC(BIC_SYS_LPI))
outp += sprintf(outp, "%sSYS%%LPI", (printed++ ? delim : ""));
- if (platform->rapl_msrs && !rapl_joules) {
+ if (!rapl_joules) {
if (DO_BIC(BIC_PkgWatt))
outp += sprintf(outp, "%sPkgWatt", (printed++ ? delim : ""));
if (DO_BIC(BIC_CorWatt) && !platform->has_per_core_rapl)
@@ -2657,7 +2701,7 @@ void print_header(char *delim)
outp += sprintf(outp, "%sPKG_%%", (printed++ ? delim : ""));
if (DO_BIC(BIC_RAM__))
outp += sprintf(outp, "%sRAM_%%", (printed++ ? delim : ""));
- } else if (platform->rapl_msrs && rapl_joules) {
+ } else {
if (DO_BIC(BIC_Pkg_J))
outp += sprintf(outp, "%sPkg_J", (printed++ ? delim : ""));
if (DO_BIC(BIC_Cor_J) && !platform->has_per_core_rapl)
@@ -3943,7 +3987,6 @@ void compute_average(struct thread_data *t, struct core_data *c, struct pkg_data
if (average.threads.nmi_count > 9999999)
sums_need_wide_columns = 1;
-
average.cores.c3 /= topo.allowed_cores;
average.cores.c6 /= topo.allowed_cores;
average.cores.c7 /= topo.allowed_cores;
@@ -4766,6 +4809,37 @@ unsigned long pmt_read_counter(struct pmt_counter *ppmt, unsigned int domain_id)
return (value & value_mask) >> value_shift;
}
+/* Rapl domain enumeration helpers */
+/*
+ * Number of RAPL domains: (max_package_id + 1) packages, multiplied by
+ * (max_core_id + 1) when the platform has per-core RAPL.
+ */
+static inline int get_rapl_num_domains(void)
+{
+	const int nr_packages = topo.max_package_id + 1;
+
+	if (platform->has_per_core_rapl)
+		return (topo.max_core_id + 1) * nr_packages;
+
+	return nr_packages;
+}
+
+/*
+ * RAPL domain index of @cpu: its package id, or with per-core RAPL its
+ * core id offset by package id * (max_core_id + 1).
+ */
+static inline int get_rapl_domain_id(int cpu)
+{
+	const int pkg_id = cpus[cpu].physical_package_id;
+	int domain_id;
+
+	if (!platform->has_per_core_rapl)
+		return pkg_id;
+
+	/* Compute the system-wide unique core-id for @cpu */
+	domain_id = cpus[cpu].physical_core_id;
+	domain_id += pkg_id * (topo.max_core_id + 1);
+
+	return domain_id;
+}
+
/*
* get_counters(...)
* migrate to cpu
@@ -4821,7 +4895,7 @@ int get_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p)
goto done;
if (platform->has_per_core_rapl) {
- status = get_rapl_counters(cpu, c->core_id, c, p);
+ status = get_rapl_counters(cpu, get_rapl_domain_id(cpu), c, p);
if (status != 0)
return status;
}
@@ -4887,7 +4961,7 @@ int get_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p)
p->sys_lpi = cpuidle_cur_sys_lpi_us;
if (!platform->has_per_core_rapl) {
- status = get_rapl_counters(cpu, p->package_id, c, p);
+ status = get_rapl_counters(cpu, get_rapl_domain_id(cpu), c, p);
if (status != 0)
return status;
}
@@ -6476,8 +6550,11 @@ void check_dev_msr()
if (no_msr)
return;
-
+#if defined(ANDROID)
+ sprintf(pathname, "/dev/msr%d", base_cpu);
+#else
sprintf(pathname, "/dev/cpu/%d/msr", base_cpu);
+#endif
if (stat(pathname, &sb))
if (system("/sbin/modprobe msr > /dev/null 2>&1"))
no_msr = 1;
@@ -6527,7 +6604,11 @@ void check_msr_permission(void)
failed += check_for_cap_sys_rawio();
/* test file permissions */
+#if defined(ANDROID)
+ sprintf(pathname, "/dev/msr%d", base_cpu);
+#else
sprintf(pathname, "/dev/cpu/%d/msr", base_cpu);
+#endif
if (euidaccess(pathname, R_OK)) {
failed++;
}
@@ -6737,8 +6818,10 @@ static void probe_intel_uncore_frequency_cluster(void)
* This allows "--show/--hide UncMHz" to be effective for
* the clustered MHz counters, as a group.
*/
- if BIC_IS_ENABLED(BIC_UNCORE_MHZ)
- add_counter(0, path, name_buf, 0, SCOPE_PACKAGE, COUNTER_K2M, FORMAT_AVERAGE, 0, package_id);
+ if BIC_IS_ENABLED
+ (BIC_UNCORE_MHZ)
+ add_counter(0, path, name_buf, 0, SCOPE_PACKAGE, COUNTER_K2M, FORMAT_AVERAGE, 0,
+ package_id);
if (quiet)
continue;
@@ -6810,17 +6893,21 @@ static void probe_graphics(void)
else
goto next;
- set_graphics_fp("/sys/class/drm/card0/device/tile0/gt0/gtidle/idle_residency_ms", gt0_is_gt ? GFX_rc6 : SAM_mc6);
+ set_graphics_fp("/sys/class/drm/card0/device/tile0/gt0/gtidle/idle_residency_ms",
+ gt0_is_gt ? GFX_rc6 : SAM_mc6);
set_graphics_fp("/sys/class/drm/card0/device/tile0/gt0/freq0/cur_freq", gt0_is_gt ? GFX_MHz : SAM_MHz);
- set_graphics_fp("/sys/class/drm/card0/device/tile0/gt0/freq0/act_freq", gt0_is_gt ? GFX_ACTMHz : SAM_ACTMHz);
+ set_graphics_fp("/sys/class/drm/card0/device/tile0/gt0/freq0/act_freq",
+ gt0_is_gt ? GFX_ACTMHz : SAM_ACTMHz);
- set_graphics_fp("/sys/class/drm/card0/device/tile0/gt1/gtidle/idle_residency_ms", gt0_is_gt ? SAM_mc6 : GFX_rc6);
+ set_graphics_fp("/sys/class/drm/card0/device/tile0/gt1/gtidle/idle_residency_ms",
+ gt0_is_gt ? SAM_mc6 : GFX_rc6);
set_graphics_fp("/sys/class/drm/card0/device/tile0/gt1/freq0/cur_freq", gt0_is_gt ? SAM_MHz : GFX_MHz);
- set_graphics_fp("/sys/class/drm/card0/device/tile0/gt1/freq0/act_freq", gt0_is_gt ? SAM_ACTMHz : GFX_ACTMHz);
+ set_graphics_fp("/sys/class/drm/card0/device/tile0/gt1/freq0/act_freq",
+ gt0_is_gt ? SAM_ACTMHz : GFX_ACTMHz);
goto end;
}
@@ -7256,6 +7343,9 @@ void rapl_probe_intel(void)
else
bic_enabled &= ~bic_joules_bits;
+ if (!platform->rapl_msrs || no_msr)
+ return;
+
if (!(platform->rapl_msrs & RAPL_PKG_PERF_STATUS))
bic_enabled &= ~BIC_PKG__;
if (!(platform->rapl_msrs & RAPL_DRAM_PERF_STATUS))
@@ -7306,6 +7396,9 @@ void rapl_probe_amd(void)
else
bic_enabled &= ~bic_joules_bits;
+ if (!platform->rapl_msrs || no_msr)
+ return;
+
if (get_msr(base_cpu, MSR_RAPL_PWR_UNIT, &msr))
return;
@@ -7332,6 +7425,158 @@ void print_power_limit_msr(int cpu, unsigned long long msr, char *label)
return;
}
+/*
+ * fread_int() - parse one decimal int from the start of the file @path into @val.
+ *
+ * Returns -1 if @path cannot be opened, otherwise the return value of
+ * fscanf() (1 when a value was stored in @val).
+ */
+static int fread_int(char *path, int *val)
+{
+	int nr_parsed = -1;
+	FILE *fp = fopen(path, "r");
+
+	if (fp) {
+		nr_parsed = fscanf(fp, "%d", val);
+		fclose(fp);
+	}
+
+	return nr_parsed;
+}
+
+/*
+ * fread_ull() - parse one decimal unsigned long long from the start of the
+ * file @path into @val.
+ *
+ * Returns -1 if @path cannot be opened, otherwise the return value of
+ * fscanf() (1 when a value was stored in @val).
+ */
+static int fread_ull(char *path, unsigned long long *val)
+{
+	int nr_parsed = -1;
+	FILE *fp = fopen(path, "r");
+
+	if (fp) {
+		nr_parsed = fscanf(fp, "%llu", val);
+		fclose(fp);
+	}
+
+	return nr_parsed;
+}
+
+/*
+ * fread_str() - read the start of the file @path into @buf as a C string.
+ *
+ * At most @size - 1 bytes are read so that @buf can always be
+ * NUL-terminated; the string is then cut at the first '\n', if any.
+ *
+ * Returns the number of bytes read from the file (0 for an empty file),
+ * or -1 if @size is not positive or @path cannot be opened.
+ */
+static int fread_str(char *path, char *buf, int size)
+{
+	FILE *filep;
+	size_t nread;
+	char *cp;
+
+	if (size <= 0)
+		return -1;
+
+	filep = fopen(path, "r");
+	if (!filep)
+		return -1;
+
+	/* fread() does not NUL-terminate: reserve one byte and do it here */
+	nread = fread(buf, 1, size - 1, filep);
+	fclose(filep);
+	buf[nread] = '\0';
+
+	/* replace '\n' with '\0' */
+	cp = strchr(buf, '\n');
+	if (cp != NULL)
+		*cp = '\0';
+
+	return (int)nread;
+}
+
+#define PATH_RAPL_SYSFS "/sys/class/powercap"
+
+/*
+ * dump_one_domain() - print a one-line summary of one powercap domain to outf.
+ *
+ * @domain_path: sysfs directory of the domain.
+ *
+ * Prints "<label>: <name>", followed by either " disabled" or, for each
+ * constraint_<N>_time_window_us attribute that can be read (N = 0, 1, ...),
+ * the time window plus the power limit and max power when those read back
+ * non-zero.
+ *
+ * Returns 0 on success, -1 if the "name" or "enabled" attribute cannot be read.
+ */
+static int dump_one_domain(char *domain_path)
+{
+ char path[PATH_MAX];
+ char str[PATH_MAX];
+ unsigned long long val;
+ int constraint;
+ int enable;
+ int ret;
+
+ snprintf(path, PATH_MAX, "%s/name", domain_path);
+ ret = fread_str(path, str, PATH_MAX);
+ if (ret <= 0)
+ return -1;
+
+ /*
+ * The label skips PATH_RAPL_SYSFS and the following '/'; this assumes
+ * @domain_path starts with that prefix, as the paths built in
+ * print_rapl_sysfs() do.
+ */
+ fprintf(outf, "%s: %s", domain_path + strlen(PATH_RAPL_SYSFS) + 1, str);
+
+ snprintf(path, PATH_MAX, "%s/enabled", domain_path);
+ ret = fread_int(path, &enable);
+ /* NOTE(review): on this failure the text printed above is left without a trailing newline */
+ if (ret <= 0)
+ return -1;
+
+ if (!enable) {
+ fputs(" disabled\n", outf);
+ return 0;
+ }
+
+ /* Walk constraint_0, constraint_1, ... until a time window attribute cannot be read */
+ for (constraint = 0;; constraint++) {
+ snprintf(path, PATH_MAX, "%s/constraint_%d_time_window_us", domain_path, constraint);
+ ret = fread_ull(path, &val);
+ if (ret <= 0)
+ break;
+
+ /* val is in microseconds; pick s, ms or us for display */
+ if (val > 1000000)
+ fprintf(outf, " %0.1fs", (double)val / 1000000);
+ else if (val > 1000)
+ fprintf(outf, " %0.1fms", (double)val / 1000);
+ else
+ fprintf(outf, " %0.1fus", (double)val);
+
+ snprintf(path, PATH_MAX, "%s/constraint_%d_power_limit_uw", domain_path, constraint);
+ ret = fread_ull(path, &val);
+ /* microwatts -> watts; integer division truncates */
+ if (ret > 0 && val)
+ fprintf(outf, ":%lluW", val / 1000000);
+
+ snprintf(path, PATH_MAX, "%s/constraint_%d_max_power_uw", domain_path, constraint);
+ ret = fread_ull(path, &val);
+ if (ret > 0 && val)
+ fprintf(outf, ",max:%lluW", val / 1000000);
+ }
+ fputc('\n', outf);
+
+ return 0;
+}
+
+/*
+ * print_rapl_sysfs() - dump the "intel-rapl*" powercap domains found under
+ * PATH_RAPL_SYSFS.
+ *
+ * For every top-level entry whose name attribute starts with "package" or
+ * "psys", the domain itself is dumped, followed by each "intel-rapl*" entry
+ * inside its directory.
+ *
+ * Returns 0 on success, 1 if a directory cannot be opened.
+ */
+static int print_rapl_sysfs(void)
+{
+	DIR *dir, *cdir;
+	struct dirent *entry, *centry;
+	char path[PATH_MAX];
+	char str[PATH_MAX];
+
+	if ((dir = opendir(PATH_RAPL_SYSFS)) == NULL) {
+		warn("open %s failed", PATH_RAPL_SYSFS);
+		return 1;
+	}
+
+	while ((entry = readdir(dir)) != NULL) {
+		if (strlen(entry->d_name) > 100)
+			continue;
+
+		if (strncmp(entry->d_name, "intel-rapl", strlen("intel-rapl")))
+			continue;
+
+		snprintf(path, PATH_MAX, "%s/%s/name", PATH_RAPL_SYSFS, entry->d_name);
+
+		/* Parse top level domains first, including package and psys */
+		/* Without a readable name, str is uninitialized or stale from a previous entry */
+		if (fread_str(path, str, PATH_MAX) <= 0)
+			continue;
+		if (strncmp(str, "package", strlen("package")) && strncmp(str, "psys", strlen("psys")))
+			continue;
+
+		snprintf(path, PATH_MAX, "%s/%s", PATH_RAPL_SYSFS, entry->d_name);
+		if ((cdir = opendir(path)) == NULL) {
+			perror("opendir() error");
+			/* do not leak the outer directory stream on the error path */
+			closedir(dir);
+			return 1;
+		}
+
+		dump_one_domain(path);
+
+		while ((centry = readdir(cdir)) != NULL) {
+			if (strncmp(centry->d_name, "intel-rapl", strlen("intel-rapl")))
+				continue;
+			snprintf(path, PATH_MAX, "%s/%s/%s", PATH_RAPL_SYSFS, entry->d_name, centry->d_name);
+			dump_one_domain(path);
+		}
+		closedir(cdir);
+	}
+
+	closedir(dir);
+	return 0;
+}
+
int print_rapl(struct thread_data *t, struct core_data *c, struct pkg_data *p)
{
unsigned long long msr;
@@ -7458,9 +7703,6 @@ int print_rapl(struct thread_data *t, struct core_data *c, struct pkg_data *p)
*/
void probe_rapl(void)
{
- if (!platform->rapl_msrs || no_msr)
- return;
-
if (genuine_intel)
rapl_probe_intel();
if (authentic_amd || hygon_genuine)
@@ -7469,6 +7711,11 @@ void probe_rapl(void)
if (quiet)
return;
+ print_rapl_sysfs();
+
+ if (!platform->rapl_msrs || no_msr)
+ return;
+
for_all_cpus(print_rapl, ODD_COUNTERS);
}
@@ -7801,44 +8048,42 @@ static int has_instr_count_access(void)
return has_access;
}
-int add_rapl_perf_counter_(int cpu, struct rapl_counter_info_t *rci, const struct rapl_counter_arch_info *cai,
- double *scale_, enum rapl_unit *unit_)
+int add_rapl_perf_counter(int cpu, struct rapl_counter_info_t *rci, const struct rapl_counter_arch_info *cai,
+ double *scale_, enum rapl_unit *unit_)
{
+ int ret = -1;
+
if (no_perf)
return -1;
+ if (!cai->perf_name)
+ return -1;
+
const double scale = read_perf_scale(cai->perf_subsys, cai->perf_name);
if (scale == 0.0)
- return -1;
+ goto end;
const enum rapl_unit unit = read_perf_rapl_unit(cai->perf_subsys, cai->perf_name);
if (unit == RAPL_UNIT_INVALID)
- return -1;
+ goto end;
const unsigned int rapl_type = read_perf_type(cai->perf_subsys);
const unsigned int rapl_energy_pkg_config = read_perf_config(cai->perf_subsys, cai->perf_name);
- const int fd_counter =
- open_perf_counter(cpu, rapl_type, rapl_energy_pkg_config, rci->fd_perf, PERF_FORMAT_GROUP);
- if (fd_counter == -1)
- return -1;
+ ret = open_perf_counter(cpu, rapl_type, rapl_energy_pkg_config, rci->fd_perf, PERF_FORMAT_GROUP);
+ if (ret == -1)
+ goto end;
/* If it's the first counter opened, make it a group descriptor */
if (rci->fd_perf == -1)
- rci->fd_perf = fd_counter;
+ rci->fd_perf = ret;
*scale_ = scale;
*unit_ = unit;
- return fd_counter;
-}
-
-int add_rapl_perf_counter(int cpu, struct rapl_counter_info_t *rci, const struct rapl_counter_arch_info *cai,
- double *scale, enum rapl_unit *unit)
-{
- int ret = add_rapl_perf_counter_(cpu, rci, cai, scale, unit);
+end:
if (debug >= 2)
fprintf(stderr, "%s: %d (cpu: %d)\n", __func__, ret, cpu);
@@ -7863,7 +8108,7 @@ void linux_perf_init(void)
void rapl_perf_init(void)
{
- const unsigned int num_domains = (platform->has_per_core_rapl ? topo.max_core_id : topo.max_package_id) + 1;
+ const unsigned int num_domains = get_rapl_num_domains();
bool *domain_visited = calloc(num_domains, sizeof(bool));
rapl_counter_info_perdomain = calloc(num_domains, sizeof(*rapl_counter_info_perdomain));
@@ -7896,6 +8141,9 @@ void rapl_perf_init(void)
enum rapl_unit unit;
unsigned int next_domain;
+ if (!BIC_IS_ENABLED(cai->bic))
+ continue;
+
memset(domain_visited, 0, num_domains * sizeof(*domain_visited));
for (int cpu = 0; cpu < topo.max_cpu_num + 1; ++cpu) {
@@ -7904,8 +8152,7 @@ void rapl_perf_init(void)
continue;
/* Skip already seen and handled RAPL domains */
- next_domain =
- platform->has_per_core_rapl ? cpus[cpu].physical_core_id : cpus[cpu].physical_package_id;
+ next_domain = get_rapl_domain_id(cpu);
assert(next_domain < num_domains);
@@ -7919,27 +8166,37 @@ void rapl_perf_init(void)
struct rapl_counter_info_t *rci = &rapl_counter_info_perdomain[next_domain];
- /* Check if the counter is enabled and accessible */
- if (BIC_IS_ENABLED(cai->bic) && (platform->rapl_msrs & cai->feature_mask)) {
+ /*
+ * rapl_counter_arch_infos[] can have multiple entries describing the same
+ * counter, due to the difference from different platforms/Vendors.
+ * E.g. rapl_counter_arch_infos[0] and rapl_counter_arch_infos[1] share the
+ * same perf_subsys and perf_name, but with different MSR address.
+ * rapl_counter_arch_infos[0] is for Intel and rapl_counter_arch_infos[1]
+ * is for AMD.
+ * In this case, it is possible that multiple rapl_counter_arch_infos[]
+ * entries are probed just because their perf/msr is duplicate and valid.
+ *
+ * Thus need a check to avoid re-probe the same counters.
+ */
+ if (rci->source[cai->rci_index] != COUNTER_SOURCE_NONE)
+ break;
- /* Use perf API for this counter */
- if (!no_perf && cai->perf_name
- && add_rapl_perf_counter(cpu, rci, cai, &scale, &unit) != -1) {
- rci->source[cai->rci_index] = COUNTER_SOURCE_PERF;
- rci->scale[cai->rci_index] = scale * cai->compat_scale;
- rci->unit[cai->rci_index] = unit;
- rci->flags[cai->rci_index] = cai->flags;
-
- /* Use MSR for this counter */
- } else if (!no_msr && cai->msr && probe_rapl_msr(cpu, cai->msr, cai->rci_index) == 0) {
- rci->source[cai->rci_index] = COUNTER_SOURCE_MSR;
- rci->msr[cai->rci_index] = cai->msr;
- rci->msr_mask[cai->rci_index] = cai->msr_mask;
- rci->msr_shift[cai->rci_index] = cai->msr_shift;
- rci->unit[cai->rci_index] = RAPL_UNIT_JOULES;
- rci->scale[cai->rci_index] = *cai->platform_rapl_msr_scale * cai->compat_scale;
- rci->flags[cai->rci_index] = cai->flags;
- }
+ /* Use perf API for this counter */
+ if (add_rapl_perf_counter(cpu, rci, cai, &scale, &unit) != -1) {
+ rci->source[cai->rci_index] = COUNTER_SOURCE_PERF;
+ rci->scale[cai->rci_index] = scale * cai->compat_scale;
+ rci->unit[cai->rci_index] = unit;
+ rci->flags[cai->rci_index] = cai->flags;
+
+ /* Use MSR for this counter */
+ } else if (add_rapl_msr_counter(cpu, cai) >= 0) {
+ rci->source[cai->rci_index] = COUNTER_SOURCE_MSR;
+ rci->msr[cai->rci_index] = cai->msr;
+ rci->msr_mask[cai->rci_index] = cai->msr_mask;
+ rci->msr_shift[cai->rci_index] = cai->msr_shift;
+ rci->unit[cai->rci_index] = RAPL_UNIT_JOULES;
+ rci->scale[cai->rci_index] = *cai->platform_rapl_msr_scale * cai->compat_scale;
+ rci->flags[cai->rci_index] = cai->flags;
}
if (rci->source[cai->rci_index] != COUNTER_SOURCE_NONE)
@@ -7972,65 +8229,63 @@ int *get_cstate_perf_group_fd(struct cstate_counter_info_t *cci, const char *gro
return NULL;
}
-int add_cstate_perf_counter_(int cpu, struct cstate_counter_info_t *cci, const struct cstate_counter_arch_info *cai)
+int add_cstate_perf_counter(int cpu, struct cstate_counter_info_t *cci, const struct cstate_counter_arch_info *cai)
{
+ int ret = -1;
+
if (no_perf)
return -1;
+ if (!cai->perf_name)
+ return -1;
+
int *pfd_group = get_cstate_perf_group_fd(cci, cai->perf_subsys);
if (pfd_group == NULL)
- return -1;
+ goto end;
const unsigned int type = read_perf_type(cai->perf_subsys);
const unsigned int config = read_perf_config(cai->perf_subsys, cai->perf_name);
- const int fd_counter = open_perf_counter(cpu, type, config, *pfd_group, PERF_FORMAT_GROUP);
+ ret = open_perf_counter(cpu, type, config, *pfd_group, PERF_FORMAT_GROUP);
- if (fd_counter == -1)
- return -1;
+ if (ret == -1)
+ goto end;
/* If it's the first counter opened, make it a group descriptor */
if (*pfd_group == -1)
- *pfd_group = fd_counter;
-
- return fd_counter;
-}
-
-int add_cstate_perf_counter(int cpu, struct cstate_counter_info_t *cci, const struct cstate_counter_arch_info *cai)
-{
- int ret = add_cstate_perf_counter_(cpu, cci, cai);
+ *pfd_group = ret;
+end:
if (debug >= 2)
fprintf(stderr, "%s: %d (cpu: %d)\n", __func__, ret, cpu);
return ret;
}
-int add_msr_perf_counter_(int cpu, struct msr_counter_info_t *cci, const struct msr_counter_arch_info *cai)
+int add_msr_perf_counter(int cpu, struct msr_counter_info_t *cci, const struct msr_counter_arch_info *cai)
{
+ int ret = -1;
+
if (no_perf)
return -1;
+ if (!cai->perf_name)
+ return -1;
+
const unsigned int type = read_perf_type(cai->perf_subsys);
const unsigned int config = read_perf_config(cai->perf_subsys, cai->perf_name);
- const int fd_counter = open_perf_counter(cpu, type, config, cci->fd_perf, PERF_FORMAT_GROUP);
+ ret = open_perf_counter(cpu, type, config, cci->fd_perf, PERF_FORMAT_GROUP);
- if (fd_counter == -1)
- return -1;
+ if (ret == -1)
+ goto end;
/* If it's the first counter opened, make it a group descriptor */
if (cci->fd_perf == -1)
- cci->fd_perf = fd_counter;
-
- return fd_counter;
-}
-
-int add_msr_perf_counter(int cpu, struct msr_counter_info_t *cci, const struct msr_counter_arch_info *cai)
-{
- int ret = add_msr_perf_counter_(cpu, cci, cai);
+ cci->fd_perf = ret;
+end:
if (debug)
fprintf(stderr, "%s: %s/%s: %d (cpu: %d)\n", __func__, cai->perf_subsys, cai->perf_name, ret, cpu);
@@ -8064,12 +8319,12 @@ void msr_perf_init_(void)
if (cai->needed) {
/* Use perf API for this counter */
- if (!no_perf && cai->perf_name && add_msr_perf_counter(cpu, cci, cai) != -1) {
+ if (add_msr_perf_counter(cpu, cci, cai) != -1) {
cci->source[cai->rci_index] = COUNTER_SOURCE_PERF;
cai->present = true;
/* User MSR for this counter */
- } else if (!no_msr && cai->msr && probe_rapl_msr(cpu, cai->msr, cai->rci_index) == 0) {
+ } else if (add_msr_counter(cpu, cai->msr) >= 0) {
cci->source[cai->rci_index] = COUNTER_SOURCE_MSR;
cci->msr[cai->rci_index] = cai->msr;
cci->msr_mask[cai->rci_index] = cai->msr_mask;
@@ -8177,13 +8432,13 @@ void cstate_perf_init_(bool soft_c1)
if (counter_needed && counter_supported) {
/* Use perf API for this counter */
- if (!no_perf && cai->perf_name && add_cstate_perf_counter(cpu, cci, cai) != -1) {
+ if (add_cstate_perf_counter(cpu, cci, cai) != -1) {
cci->source[cai->rci_index] = COUNTER_SOURCE_PERF;
/* User MSR for this counter */
- } else if (!no_msr && cai->msr && pkg_cstate_limit >= cai->pkg_cstate_limit
- && probe_rapl_msr(cpu, cai->msr, cai->rci_index) == 0) {
+ } else if (pkg_cstate_limit >= cai->pkg_cstate_limit
+ && add_msr_counter(cpu, cai->msr) >= 0) {
cci->source[cai->rci_index] = COUNTER_SOURCE_MSR;
cci->msr[cai->rci_index] = cai->msr;
}
@@ -9044,15 +9299,14 @@ int added_perf_counters_init_(struct perf_counter_info *pinfo)
perf_device = "cpu_atom";
break;
- default: /* Don't change, we will probably fail and report a problem soon. */
+ default: /* Don't change, we will probably fail and report a problem soon. */
break;
}
}
perf_type = read_perf_type(perf_device);
if (perf_type == (unsigned int)-1) {
- warnx("%s: perf/%s/%s: failed to read %s",
- __func__, perf_device, pinfo->event, "type");
+ warnx("%s: perf/%s/%s: failed to read %s", __func__, perf_device, pinfo->event, "type");
continue;
}
@@ -9154,7 +9408,7 @@ struct pmt_mmio *pmt_mmio_open(unsigned int target_guid)
return NULL;
}
- for ( ; entry != NULL; entry = pmt_diriter_next(&pmt_iter)) {
+ for (; entry != NULL; entry = pmt_diriter_next(&pmt_iter)) {
if (fstatat(dirfd(pmt_iter.dir), entry->d_name, &st, 0) == -1)
break;
@@ -9594,7 +9848,7 @@ int get_and_dump_counters(void)
void print_version()
{
- fprintf(outf, "turbostat version 2025.04.06 - Len Brown <lenb@kernel.org>\n");
+ fprintf(outf, "turbostat version 2025.06.08 - Len Brown <lenb@kernel.org>\n");
}
#define COMMAND_LINE_SIZE 2048
@@ -10050,7 +10304,7 @@ void parse_add_command_pmt(char *add_command)
unsigned int lsb;
unsigned int msb;
unsigned int guid;
- unsigned int seq = 0; /* By default, pick first file in a sequence with a given GUID. */
+ unsigned int seq = 0; /* By default, pick first file in a sequence with a given GUID. */
unsigned int domain_id;
enum counter_scope scope = 0;
enum pmt_datatype type = PMT_TYPE_RAW;
diff --git a/tools/testing/selftests/mm/ksm_tests.c b/tools/testing/selftests/mm/ksm_tests.c
index dcdd5bb20f3d..e80deac1436b 100644
--- a/tools/testing/selftests/mm/ksm_tests.c
+++ b/tools/testing/selftests/mm/ksm_tests.c
@@ -58,40 +58,12 @@ int debug;
static int ksm_write_sysfs(const char *file_path, unsigned long val)
{
- FILE *f = fopen(file_path, "w");
-
- if (!f) {
- fprintf(stderr, "f %s\n", file_path);
- perror("fopen");
- return 1;
- }
- if (fprintf(f, "%lu", val) < 0) {
- perror("fprintf");
- fclose(f);
- return 1;
- }
- fclose(f);
-
- return 0;
+ return write_sysfs(file_path, val);
}
static int ksm_read_sysfs(const char *file_path, unsigned long *val)
{
- FILE *f = fopen(file_path, "r");
-
- if (!f) {
- fprintf(stderr, "f %s\n", file_path);
- perror("fopen");
- return 1;
- }
- if (fscanf(f, "%lu", val) != 1) {
- perror("fscanf");
- fclose(f);
- return 1;
- }
- fclose(f);
-
- return 0;
+ return read_sysfs(file_path, val);
}
static void ksm_print_sysfs(void)
diff --git a/tools/testing/selftests/mm/merge.c b/tools/testing/selftests/mm/merge.c
index c76646cdf6e6..bbae66fc5038 100644
--- a/tools/testing/selftests/mm/merge.c
+++ b/tools/testing/selftests/mm/merge.c
@@ -2,11 +2,14 @@
#define _GNU_SOURCE
#include "../kselftest_harness.h"
+#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <sys/mman.h>
+#include <sys/syscall.h>
#include <sys/wait.h>
+#include <linux/perf_event.h>
#include "vm_util.h"
FIXTURE(merge)
@@ -452,4 +455,44 @@ TEST_F(merge, forked_source_vma)
ASSERT_EQ(procmap->query.vma_end, (unsigned long)ptr2 + 5 * page_size);
}
+TEST_F(merge, handle_uprobe_upon_merged_vma)
+{
+ const size_t attr_sz = sizeof(struct perf_event_attr);
+ unsigned int page_size = self->page_size;
+ const char *probe_file = "./foo";
+ char *carveout = self->carveout;
+ struct perf_event_attr attr;
+ unsigned long type;
+ void *ptr1, *ptr2;
+ int fd;
+
+ fd = open(probe_file, O_RDWR|O_CREAT, 0600);
+ ASSERT_GE(fd, 0);
+
+ ASSERT_EQ(ftruncate(fd, page_size), 0);
+ ASSERT_EQ(read_sysfs("/sys/bus/event_source/devices/uprobe/type", &type), 0);
+
+ memset(&attr, 0, attr_sz);
+ attr.size = attr_sz;
+ attr.type = type;
+ attr.config1 = (__u64)(long)probe_file;
+ attr.config2 = 0x0;
+
+ ASSERT_GE(syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0), 0);
+
+ ptr1 = mmap(&carveout[page_size], 10 * page_size, PROT_EXEC,
+ MAP_PRIVATE | MAP_FIXED, fd, 0);
+ ASSERT_NE(ptr1, MAP_FAILED);
+
+ ptr2 = mremap(ptr1, page_size, 2 * page_size,
+ MREMAP_MAYMOVE | MREMAP_FIXED, ptr1 + 5 * page_size);
+ ASSERT_NE(ptr2, MAP_FAILED);
+
+ ASSERT_NE(mremap(ptr2, page_size, page_size,
+ MREMAP_MAYMOVE | MREMAP_FIXED, ptr1), MAP_FAILED);
+
+ close(fd);
+ remove(probe_file);
+}
+
TEST_HARNESS_MAIN
diff --git a/tools/testing/selftests/mm/thuge-gen.c b/tools/testing/selftests/mm/thuge-gen.c
index a41bc1234b37..95b6f043a3cb 100644
--- a/tools/testing/selftests/mm/thuge-gen.c
+++ b/tools/testing/selftests/mm/thuge-gen.c
@@ -77,7 +77,7 @@ void show(unsigned long ps)
system(buf);
}
-unsigned long read_sysfs(int warn, char *fmt, ...)
+unsigned long thuge_read_sysfs(int warn, char *fmt, ...)
{
char *line = NULL;
size_t linelen = 0;
@@ -106,7 +106,7 @@ unsigned long read_sysfs(int warn, char *fmt, ...)
unsigned long read_free(unsigned long ps)
{
- return read_sysfs(ps != getpagesize(),
+ return thuge_read_sysfs(ps != getpagesize(),
"/sys/kernel/mm/hugepages/hugepages-%lukB/free_hugepages",
ps >> 10);
}
@@ -195,7 +195,7 @@ void find_pagesizes(void)
}
globfree(&g);
- if (read_sysfs(0, "/proc/sys/kernel/shmmax") < NUM_PAGES * largest)
+ if (thuge_read_sysfs(0, "/proc/sys/kernel/shmmax") < NUM_PAGES * largest)
ksft_exit_fail_msg("Please do echo %lu > /proc/sys/kernel/shmmax",
largest * NUM_PAGES);
diff --git a/tools/testing/selftests/mm/vm_util.c b/tools/testing/selftests/mm/vm_util.c
index 61d7bf1f8c62..5492e3f784df 100644
--- a/tools/testing/selftests/mm/vm_util.c
+++ b/tools/testing/selftests/mm/vm_util.c
@@ -486,3 +486,41 @@ int close_procmap(struct procmap_fd *procmap)
{
return close(procmap->fd);
}
+
+int write_sysfs(const char *file_path, unsigned long val)
+{
+ FILE *f = fopen(file_path, "w");
+
+ if (!f) {
+ fprintf(stderr, "f %s\n", file_path);
+ perror("fopen");
+ return 1;
+ }
+ if (fprintf(f, "%lu", val) < 0) {
+ perror("fprintf");
+ fclose(f);
+ return 1;
+ }
+ fclose(f);
+
+ return 0;
+}
+
+int read_sysfs(const char *file_path, unsigned long *val)
+{
+ FILE *f = fopen(file_path, "r");
+
+ if (!f) {
+ fprintf(stderr, "f %s\n", file_path);
+ perror("fopen");
+ return 1;
+ }
+ if (fscanf(f, "%lu", val) != 1) {
+ perror("fscanf");
+ fclose(f);
+ return 1;
+ }
+ fclose(f);
+
+ return 0;
+}
diff --git a/tools/testing/selftests/mm/vm_util.h b/tools/testing/selftests/mm/vm_util.h
index adb5d294a220..b8136d12a0f8 100644
--- a/tools/testing/selftests/mm/vm_util.h
+++ b/tools/testing/selftests/mm/vm_util.h
@@ -88,6 +88,8 @@ int open_procmap(pid_t pid, struct procmap_fd *procmap_out);
int query_procmap(struct procmap_fd *procmap);
bool find_vma_procmap(struct procmap_fd *procmap, void *address);
int close_procmap(struct procmap_fd *procmap);
+int write_sysfs(const char *file_path, unsigned long val);
+int read_sysfs(const char *file_path, unsigned long *val);
static inline int open_self_procmap(struct procmap_fd *procmap_out)
{
diff --git a/tools/testing/selftests/mount_setattr/mount_setattr_test.c b/tools/testing/selftests/mount_setattr/mount_setattr_test.c
index 8b378c91debf..b1e4618399be 100644
--- a/tools/testing/selftests/mount_setattr/mount_setattr_test.c
+++ b/tools/testing/selftests/mount_setattr/mount_setattr_test.c
@@ -2079,24 +2079,9 @@ TEST_F(mount_setattr, detached_tree_propagation)
* means that the device information will be different for any
* statx() that was taken from /mnt/A before the mount compared
* to one after the mount.
- *
- * Since we already now that the device information between the
- * stx1 and stx2 samples are identical we also now that stx2 and
- * stx3 device information will necessarily differ.
*/
ASSERT_NE(stx1.stx_dev_minor, stx3.stx_dev_minor);
-
- /*
- * If mount propagation worked correctly then the tmpfs mount
- * that was created after the mount namespace was unshared will
- * have propagated onto /mnt/A in the detached mount tree.
- *
- * Verify that the device information for stx3 and stx4 are
- * identical. It is already established that stx3 is different
- * from both stx1 and stx2 sampled before the tmpfs mount was
- * done so if stx3 and stx4 are identical the proof is done.
- */
- ASSERT_EQ(stx3.stx_dev_minor, stx4.stx_dev_minor);
+ ASSERT_EQ(stx1.stx_dev_minor, stx4.stx_dev_minor);
EXPECT_EQ(close(fd_tree), 0);
}
diff --git a/tools/testing/selftests/ublk/Makefile b/tools/testing/selftests/ublk/Makefile
index 4dde8838261d..5d7f4ecfb816 100644
--- a/tools/testing/selftests/ublk/Makefile
+++ b/tools/testing/selftests/ublk/Makefile
@@ -19,6 +19,7 @@ TEST_PROGS += test_generic_08.sh
TEST_PROGS += test_generic_09.sh
TEST_PROGS += test_generic_10.sh
TEST_PROGS += test_generic_11.sh
+TEST_PROGS += test_generic_12.sh
TEST_PROGS += test_null_01.sh
TEST_PROGS += test_null_02.sh
diff --git a/tools/testing/selftests/ublk/fault_inject.c b/tools/testing/selftests/ublk/fault_inject.c
index 5421774d7867..6e60f7d97125 100644
--- a/tools/testing/selftests/ublk/fault_inject.c
+++ b/tools/testing/selftests/ublk/fault_inject.c
@@ -46,9 +46,9 @@ static int ublk_fault_inject_queue_io(struct ublk_queue *q, int tag)
.tv_nsec = (long long)q->dev->private_data,
};
- ublk_queue_alloc_sqes(q, &sqe, 1);
+ ublk_io_alloc_sqes(ublk_get_io(q, tag), &sqe, 1);
io_uring_prep_timeout(sqe, &ts, 1, 0);
- sqe->user_data = build_user_data(tag, ublksrv_get_op(iod), 0, 1);
+ sqe->user_data = build_user_data(tag, ublksrv_get_op(iod), 0, q->q_id, 1);
ublk_queued_tgt_io(q, tag, 1);
diff --git a/tools/testing/selftests/ublk/file_backed.c b/tools/testing/selftests/ublk/file_backed.c
index 509842df9bee..cfa59b631693 100644
--- a/tools/testing/selftests/ublk/file_backed.c
+++ b/tools/testing/selftests/ublk/file_backed.c
@@ -18,11 +18,11 @@ static int loop_queue_flush_io(struct ublk_queue *q, const struct ublksrv_io_des
unsigned ublk_op = ublksrv_get_op(iod);
struct io_uring_sqe *sqe[1];
- ublk_queue_alloc_sqes(q, sqe, 1);
+ ublk_io_alloc_sqes(ublk_get_io(q, tag), sqe, 1);
io_uring_prep_fsync(sqe[0], 1 /*fds[1]*/, IORING_FSYNC_DATASYNC);
io_uring_sqe_set_flags(sqe[0], IOSQE_FIXED_FILE);
/* bit63 marks us as tgt io */
- sqe[0]->user_data = build_user_data(tag, ublk_op, 0, 1);
+ sqe[0]->user_data = build_user_data(tag, ublk_op, 0, q->q_id, 1);
return 1;
}
@@ -36,7 +36,7 @@ static int loop_queue_tgt_rw_io(struct ublk_queue *q, const struct ublksrv_io_de
void *addr = (zc | auto_zc) ? NULL : (void *)iod->addr;
if (!zc || auto_zc) {
- ublk_queue_alloc_sqes(q, sqe, 1);
+ ublk_io_alloc_sqes(ublk_get_io(q, tag), sqe, 1);
if (!sqe[0])
return -ENOMEM;
@@ -48,26 +48,26 @@ static int loop_queue_tgt_rw_io(struct ublk_queue *q, const struct ublksrv_io_de
sqe[0]->buf_index = tag;
io_uring_sqe_set_flags(sqe[0], IOSQE_FIXED_FILE);
/* bit63 marks us as tgt io */
- sqe[0]->user_data = build_user_data(tag, ublk_op, 0, 1);
+ sqe[0]->user_data = build_user_data(tag, ublk_op, 0, q->q_id, 1);
return 1;
}
- ublk_queue_alloc_sqes(q, sqe, 3);
+ ublk_io_alloc_sqes(ublk_get_io(q, tag), sqe, 3);
- io_uring_prep_buf_register(sqe[0], 0, tag, q->q_id, tag);
+ io_uring_prep_buf_register(sqe[0], 0, tag, q->q_id, ublk_get_io(q, tag)->buf_index);
sqe[0]->flags |= IOSQE_CQE_SKIP_SUCCESS | IOSQE_IO_HARDLINK;
sqe[0]->user_data = build_user_data(tag,
- ublk_cmd_op_nr(sqe[0]->cmd_op), 0, 1);
+ ublk_cmd_op_nr(sqe[0]->cmd_op), 0, q->q_id, 1);
io_uring_prep_rw(op, sqe[1], 1 /*fds[1]*/, 0,
iod->nr_sectors << 9,
iod->start_sector << 9);
sqe[1]->buf_index = tag;
sqe[1]->flags |= IOSQE_FIXED_FILE | IOSQE_IO_HARDLINK;
- sqe[1]->user_data = build_user_data(tag, ublk_op, 0, 1);
+ sqe[1]->user_data = build_user_data(tag, ublk_op, 0, q->q_id, 1);
- io_uring_prep_buf_unregister(sqe[2], 0, tag, q->q_id, tag);
- sqe[2]->user_data = build_user_data(tag, ublk_cmd_op_nr(sqe[2]->cmd_op), 0, 1);
+ io_uring_prep_buf_unregister(sqe[2], 0, tag, q->q_id, ublk_get_io(q, tag)->buf_index);
+ sqe[2]->user_data = build_user_data(tag, ublk_cmd_op_nr(sqe[2]->cmd_op), 0, q->q_id, 1);
return 2;
}
diff --git a/tools/testing/selftests/ublk/kublk.c b/tools/testing/selftests/ublk/kublk.c
index b5131a000795..e2d2042810d4 100644
--- a/tools/testing/selftests/ublk/kublk.c
+++ b/tools/testing/selftests/ublk/kublk.c
@@ -348,8 +348,8 @@ static void ublk_ctrl_dump(struct ublk_dev *dev)
for (i = 0; i < info->nr_hw_queues; i++) {
ublk_print_cpu_set(&affinity[i], buf, sizeof(buf));
- printf("\tqueue %u: tid %d affinity(%s)\n",
- i, dev->q[i].tid, buf);
+ printf("\tqueue %u: affinity(%s)\n",
+ i, buf);
}
free(affinity);
}
@@ -412,16 +412,6 @@ static void ublk_queue_deinit(struct ublk_queue *q)
int i;
int nr_ios = q->q_depth;
- io_uring_unregister_buffers(&q->ring);
-
- io_uring_unregister_ring_fd(&q->ring);
-
- if (q->ring.ring_fd > 0) {
- io_uring_unregister_files(&q->ring);
- close(q->ring.ring_fd);
- q->ring.ring_fd = -1;
- }
-
if (q->io_cmd_buf)
munmap(q->io_cmd_buf, ublk_queue_cmd_buf_sz(q));
@@ -429,20 +419,30 @@ static void ublk_queue_deinit(struct ublk_queue *q)
free(q->ios[i].buf_addr);
}
+static void ublk_thread_deinit(struct ublk_thread *t)
+{
+ io_uring_unregister_buffers(&t->ring);
+
+ io_uring_unregister_ring_fd(&t->ring);
+
+ if (t->ring.ring_fd > 0) {
+ io_uring_unregister_files(&t->ring);
+ close(t->ring.ring_fd);
+ t->ring.ring_fd = -1;
+ }
+}
+
static int ublk_queue_init(struct ublk_queue *q, unsigned extra_flags)
{
struct ublk_dev *dev = q->dev;
int depth = dev->dev_info.queue_depth;
- int i, ret = -1;
+ int i;
int cmd_buf_size, io_buf_size;
unsigned long off;
- int ring_depth = dev->tgt.sq_depth, cq_depth = dev->tgt.cq_depth;
q->tgt_ops = dev->tgt.ops;
q->state = 0;
q->q_depth = depth;
- q->cmd_inflight = 0;
- q->tid = gettid();
if (dev->dev_info.flags & (UBLK_F_SUPPORT_ZERO_COPY | UBLK_F_AUTO_BUF_REG)) {
q->state |= UBLKSRV_NO_BUF;
@@ -467,6 +467,7 @@ static int ublk_queue_init(struct ublk_queue *q, unsigned extra_flags)
for (i = 0; i < q->q_depth; i++) {
q->ios[i].buf_addr = NULL;
q->ios[i].flags = UBLKSRV_NEED_FETCH_RQ | UBLKSRV_IO_FREE;
+ q->ios[i].tag = i;
if (q->state & UBLKSRV_NO_BUF)
continue;
@@ -479,39 +480,57 @@ static int ublk_queue_init(struct ublk_queue *q, unsigned extra_flags)
}
}
- ret = ublk_setup_ring(&q->ring, ring_depth, cq_depth,
+ return 0;
+ fail:
+ ublk_queue_deinit(q);
+ ublk_err("ublk dev %d queue %d failed\n",
+ dev->dev_info.dev_id, q->q_id);
+ return -ENOMEM;
+}
+
+static int ublk_thread_init(struct ublk_thread *t)
+{
+ struct ublk_dev *dev = t->dev;
+ int ring_depth = dev->tgt.sq_depth, cq_depth = dev->tgt.cq_depth;
+ int ret;
+
+ ret = ublk_setup_ring(&t->ring, ring_depth, cq_depth,
IORING_SETUP_COOP_TASKRUN |
IORING_SETUP_SINGLE_ISSUER |
IORING_SETUP_DEFER_TASKRUN);
if (ret < 0) {
- ublk_err("ublk dev %d queue %d setup io_uring failed %d\n",
- q->dev->dev_info.dev_id, q->q_id, ret);
+ ublk_err("ublk dev %d thread %d setup io_uring failed %d\n",
+ dev->dev_info.dev_id, t->idx, ret);
goto fail;
}
if (dev->dev_info.flags & (UBLK_F_SUPPORT_ZERO_COPY | UBLK_F_AUTO_BUF_REG)) {
- ret = io_uring_register_buffers_sparse(&q->ring, q->q_depth);
+ unsigned nr_ios = dev->dev_info.queue_depth * dev->dev_info.nr_hw_queues;
+ unsigned max_nr_ios_per_thread = nr_ios / dev->nthreads;
+ max_nr_ios_per_thread += !!(nr_ios % dev->nthreads);
+ ret = io_uring_register_buffers_sparse(
+ &t->ring, max_nr_ios_per_thread);
if (ret) {
- ublk_err("ublk dev %d queue %d register spare buffers failed %d",
- dev->dev_info.dev_id, q->q_id, ret);
+ ublk_err("ublk dev %d thread %d register spare buffers failed %d",
+ dev->dev_info.dev_id, t->idx, ret);
goto fail;
}
}
- io_uring_register_ring_fd(&q->ring);
+ io_uring_register_ring_fd(&t->ring);
- ret = io_uring_register_files(&q->ring, dev->fds, dev->nr_fds);
+ ret = io_uring_register_files(&t->ring, dev->fds, dev->nr_fds);
if (ret) {
- ublk_err("ublk dev %d queue %d register files failed %d\n",
- q->dev->dev_info.dev_id, q->q_id, ret);
+ ublk_err("ublk dev %d thread %d register files failed %d\n",
+ t->dev->dev_info.dev_id, t->idx, ret);
goto fail;
}
return 0;
- fail:
- ublk_queue_deinit(q);
- ublk_err("ublk dev %d queue %d failed\n",
- dev->dev_info.dev_id, q->q_id);
+fail:
+ ublk_thread_deinit(t);
+ ublk_err("ublk dev %d thread %d init failed\n",
+ dev->dev_info.dev_id, t->idx);
return -ENOMEM;
}
@@ -562,7 +581,7 @@ static void ublk_set_auto_buf_reg(const struct ublk_queue *q,
if (q->tgt_ops->buf_index)
buf.index = q->tgt_ops->buf_index(q, tag);
else
- buf.index = tag;
+ buf.index = q->ios[tag].buf_index;
if (q->state & UBLKSRV_AUTO_BUF_REG_FALLBACK)
buf.flags = UBLK_AUTO_BUF_REG_FALLBACK;
@@ -570,8 +589,10 @@ static void ublk_set_auto_buf_reg(const struct ublk_queue *q,
sqe->addr = ublk_auto_buf_reg_to_sqe_addr(&buf);
}
-int ublk_queue_io_cmd(struct ublk_queue *q, struct ublk_io *io, unsigned tag)
+int ublk_queue_io_cmd(struct ublk_io *io)
{
+ struct ublk_thread *t = io->t;
+ struct ublk_queue *q = ublk_io_to_queue(io);
struct ublksrv_io_cmd *cmd;
struct io_uring_sqe *sqe[1];
unsigned int cmd_op = 0;
@@ -596,13 +617,13 @@ int ublk_queue_io_cmd(struct ublk_queue *q, struct ublk_io *io, unsigned tag)
else if (io->flags & UBLKSRV_NEED_FETCH_RQ)
cmd_op = UBLK_U_IO_FETCH_REQ;
- if (io_uring_sq_space_left(&q->ring) < 1)
- io_uring_submit(&q->ring);
+ if (io_uring_sq_space_left(&t->ring) < 1)
+ io_uring_submit(&t->ring);
- ublk_queue_alloc_sqes(q, sqe, 1);
+ ublk_io_alloc_sqes(io, sqe, 1);
if (!sqe[0]) {
- ublk_err("%s: run out of sqe %d, tag %d\n",
- __func__, q->q_id, tag);
+ ublk_err("%s: run out of sqe. thread %u, tag %d\n",
+ __func__, t->idx, io->tag);
return -1;
}
@@ -617,7 +638,7 @@ int ublk_queue_io_cmd(struct ublk_queue *q, struct ublk_io *io, unsigned tag)
sqe[0]->opcode = IORING_OP_URING_CMD;
sqe[0]->flags = IOSQE_FIXED_FILE;
sqe[0]->rw_flags = 0;
- cmd->tag = tag;
+ cmd->tag = io->tag;
cmd->q_id = q->q_id;
if (!(q->state & UBLKSRV_NO_BUF))
cmd->addr = (__u64) (uintptr_t) io->buf_addr;
@@ -625,37 +646,72 @@ int ublk_queue_io_cmd(struct ublk_queue *q, struct ublk_io *io, unsigned tag)
cmd->addr = 0;
if (q->state & UBLKSRV_AUTO_BUF_REG)
- ublk_set_auto_buf_reg(q, sqe[0], tag);
+ ublk_set_auto_buf_reg(q, sqe[0], io->tag);
- user_data = build_user_data(tag, _IOC_NR(cmd_op), 0, 0);
+ user_data = build_user_data(io->tag, _IOC_NR(cmd_op), 0, q->q_id, 0);
io_uring_sqe_set_data64(sqe[0], user_data);
io->flags = 0;
- q->cmd_inflight += 1;
+ t->cmd_inflight += 1;
- ublk_dbg(UBLK_DBG_IO_CMD, "%s: (qid %d tag %u cmd_op %u) iof %x stopping %d\n",
- __func__, q->q_id, tag, cmd_op,
- io->flags, !!(q->state & UBLKSRV_QUEUE_STOPPING));
+ ublk_dbg(UBLK_DBG_IO_CMD, "%s: (thread %u qid %d tag %u cmd_op %u) iof %x stopping %d\n",
+ __func__, t->idx, q->q_id, io->tag, cmd_op,
+ io->flags, !!(t->state & UBLKSRV_THREAD_STOPPING));
return 1;
}
-static void ublk_submit_fetch_commands(struct ublk_queue *q)
+static void ublk_submit_fetch_commands(struct ublk_thread *t)
{
- int i = 0;
+ struct ublk_queue *q;
+ struct ublk_io *io;
+ int i = 0, j = 0;
- for (i = 0; i < q->q_depth; i++)
- ublk_queue_io_cmd(q, &q->ios[i], i);
+ if (t->dev->per_io_tasks) {
+ /*
+ * Lexicographically order all the (qid,tag) pairs, with
+ * qid taking priority (so (1,0) > (0,1)). Then make
+ * this thread the daemon for every Nth entry in this
+ * list (N is the number of threads), starting at this
+ * thread's index. This ensures that each queue is
+ * handled by as many ublk server threads as possible,
+ * so that load that is concentrated on one or a few
+ * queues can make use of all ublk server threads.
+ */
+ const struct ublksrv_ctrl_dev_info *dinfo = &t->dev->dev_info;
+ int nr_ios = dinfo->nr_hw_queues * dinfo->queue_depth;
+ for (i = t->idx; i < nr_ios; i += t->dev->nthreads) {
+ int q_id = i / dinfo->queue_depth;
+ int tag = i % dinfo->queue_depth;
+ q = &t->dev->q[q_id];
+ io = &q->ios[tag];
+ io->t = t;
+ io->buf_index = j++;
+ ublk_queue_io_cmd(io);
+ }
+ } else {
+ /*
+ * Service exclusively the queue whose q_id matches our
+ * thread index.
+ */
+ struct ublk_queue *q = &t->dev->q[t->idx];
+ for (i = 0; i < q->q_depth; i++) {
+ io = &q->ios[i];
+ io->t = t;
+ io->buf_index = i;
+ ublk_queue_io_cmd(io);
+ }
+ }
}
-static int ublk_queue_is_idle(struct ublk_queue *q)
+static int ublk_thread_is_idle(struct ublk_thread *t)
{
- return !io_uring_sq_ready(&q->ring) && !q->io_inflight;
+ return !io_uring_sq_ready(&t->ring) && !t->io_inflight;
}
-static int ublk_queue_is_done(struct ublk_queue *q)
+static int ublk_thread_is_done(struct ublk_thread *t)
{
- return (q->state & UBLKSRV_QUEUE_STOPPING) && ublk_queue_is_idle(q);
+ return (t->state & UBLKSRV_THREAD_STOPPING) && ublk_thread_is_idle(t);
}
static inline void ublksrv_handle_tgt_cqe(struct ublk_queue *q,
@@ -673,14 +729,16 @@ static inline void ublksrv_handle_tgt_cqe(struct ublk_queue *q,
q->tgt_ops->tgt_io_done(q, tag, cqe);
}
-static void ublk_handle_cqe(struct io_uring *r,
+static void ublk_handle_cqe(struct ublk_thread *t,
struct io_uring_cqe *cqe, void *data)
{
- struct ublk_queue *q = container_of(r, struct ublk_queue, ring);
+ struct ublk_dev *dev = t->dev;
+ unsigned q_id = user_data_to_q_id(cqe->user_data);
+ struct ublk_queue *q = &dev->q[q_id];
unsigned tag = user_data_to_tag(cqe->user_data);
unsigned cmd_op = user_data_to_op(cqe->user_data);
int fetch = (cqe->res != UBLK_IO_RES_ABORT) &&
- !(q->state & UBLKSRV_QUEUE_STOPPING);
+ !(t->state & UBLKSRV_THREAD_STOPPING);
struct ublk_io *io;
if (cqe->res < 0 && cqe->res != -ENODEV)
@@ -691,7 +749,7 @@ static void ublk_handle_cqe(struct io_uring *r,
__func__, cqe->res, q->q_id, tag, cmd_op,
is_target_io(cqe->user_data),
user_data_to_tgt_data(cqe->user_data),
- (q->state & UBLKSRV_QUEUE_STOPPING));
+ (t->state & UBLKSRV_THREAD_STOPPING));
/* Don't retrieve io in case of target io */
if (is_target_io(cqe->user_data)) {
@@ -700,10 +758,10 @@ static void ublk_handle_cqe(struct io_uring *r,
}
io = &q->ios[tag];
- q->cmd_inflight--;
+ t->cmd_inflight--;
if (!fetch) {
- q->state |= UBLKSRV_QUEUE_STOPPING;
+ t->state |= UBLKSRV_THREAD_STOPPING;
io->flags &= ~UBLKSRV_NEED_FETCH_RQ;
}
@@ -713,7 +771,7 @@ static void ublk_handle_cqe(struct io_uring *r,
q->tgt_ops->queue_io(q, tag);
} else if (cqe->res == UBLK_IO_RES_NEED_GET_DATA) {
io->flags |= UBLKSRV_NEED_GET_DATA | UBLKSRV_IO_FREE;
- ublk_queue_io_cmd(q, io, tag);
+ ublk_queue_io_cmd(io);
} else {
/*
* COMMIT_REQ will be completed immediately since no fetching
@@ -727,92 +785,93 @@ static void ublk_handle_cqe(struct io_uring *r,
}
}
-static int ublk_reap_events_uring(struct io_uring *r)
+static int ublk_reap_events_uring(struct ublk_thread *t)
{
struct io_uring_cqe *cqe;
unsigned head;
int count = 0;
- io_uring_for_each_cqe(r, head, cqe) {
- ublk_handle_cqe(r, cqe, NULL);
+ io_uring_for_each_cqe(&t->ring, head, cqe) {
+ ublk_handle_cqe(t, cqe, NULL);
count += 1;
}
- io_uring_cq_advance(r, count);
+ io_uring_cq_advance(&t->ring, count);
return count;
}
-static int ublk_process_io(struct ublk_queue *q)
+static int ublk_process_io(struct ublk_thread *t)
{
int ret, reapped;
- ublk_dbg(UBLK_DBG_QUEUE, "dev%d-q%d: to_submit %d inflight cmd %u stopping %d\n",
- q->dev->dev_info.dev_id,
- q->q_id, io_uring_sq_ready(&q->ring),
- q->cmd_inflight,
- (q->state & UBLKSRV_QUEUE_STOPPING));
+ ublk_dbg(UBLK_DBG_THREAD, "dev%d-t%u: to_submit %d inflight cmd %u stopping %d\n",
+ t->dev->dev_info.dev_id,
+ t->idx, io_uring_sq_ready(&t->ring),
+ t->cmd_inflight,
+ (t->state & UBLKSRV_THREAD_STOPPING));
- if (ublk_queue_is_done(q))
+ if (ublk_thread_is_done(t))
return -ENODEV;
- ret = io_uring_submit_and_wait(&q->ring, 1);
- reapped = ublk_reap_events_uring(&q->ring);
+ ret = io_uring_submit_and_wait(&t->ring, 1);
+ reapped = ublk_reap_events_uring(t);
- ublk_dbg(UBLK_DBG_QUEUE, "submit result %d, reapped %d stop %d idle %d\n",
- ret, reapped, (q->state & UBLKSRV_QUEUE_STOPPING),
- (q->state & UBLKSRV_QUEUE_IDLE));
+ ublk_dbg(UBLK_DBG_THREAD, "submit result %d, reapped %d stop %d idle %d\n",
+ ret, reapped, (t->state & UBLKSRV_THREAD_STOPPING),
+ (t->state & UBLKSRV_THREAD_IDLE));
return reapped;
}
-static void ublk_queue_set_sched_affinity(const struct ublk_queue *q,
+static void ublk_thread_set_sched_affinity(const struct ublk_thread *t,
cpu_set_t *cpuset)
{
if (sched_setaffinity(0, sizeof(*cpuset), cpuset) < 0)
- ublk_err("ublk dev %u queue %u set affinity failed",
- q->dev->dev_info.dev_id, q->q_id);
+ ublk_err("ublk dev %u thread %u set affinity failed",
+ t->dev->dev_info.dev_id, t->idx);
}
-struct ublk_queue_info {
- struct ublk_queue *q;
- sem_t *queue_sem;
+struct ublk_thread_info {
+ struct ublk_dev *dev;
+ unsigned idx;
+ sem_t *ready;
cpu_set_t *affinity;
- unsigned char auto_zc_fallback;
};
static void *ublk_io_handler_fn(void *data)
{
- struct ublk_queue_info *info = data;
- struct ublk_queue *q = info->q;
- int dev_id = q->dev->dev_info.dev_id;
- unsigned extra_flags = 0;
+ struct ublk_thread_info *info = data;
+ struct ublk_thread *t = &info->dev->threads[info->idx];
+ int dev_id = info->dev->dev_info.dev_id;
int ret;
- if (info->auto_zc_fallback)
- extra_flags = UBLKSRV_AUTO_BUF_REG_FALLBACK;
+ t->dev = info->dev;
+ t->idx = info->idx;
- ret = ublk_queue_init(q, extra_flags);
+ ret = ublk_thread_init(t);
if (ret) {
- ublk_err("ublk dev %d queue %d init queue failed\n",
- dev_id, q->q_id);
+ ublk_err("ublk dev %d thread %u init failed\n",
+ dev_id, t->idx);
return NULL;
}
/* IO perf is sensitive with queue pthread affinity on NUMA machine*/
- ublk_queue_set_sched_affinity(q, info->affinity);
- sem_post(info->queue_sem);
+ if (info->affinity)
+ ublk_thread_set_sched_affinity(t, info->affinity);
+ sem_post(info->ready);
- ublk_dbg(UBLK_DBG_QUEUE, "tid %d: ublk dev %d queue %d started\n",
- q->tid, dev_id, q->q_id);
+ ublk_dbg(UBLK_DBG_THREAD, "tid %d: ublk dev %d thread %u started\n",
+ gettid(), dev_id, t->idx);
/* submit all io commands to ublk driver */
- ublk_submit_fetch_commands(q);
+ ublk_submit_fetch_commands(t);
do {
- if (ublk_process_io(q) < 0)
+ if (ublk_process_io(t) < 0)
break;
} while (1);
- ublk_dbg(UBLK_DBG_QUEUE, "ublk dev %d queue %d exited\n", dev_id, q->q_id);
- ublk_queue_deinit(q);
+ ublk_dbg(UBLK_DBG_THREAD, "tid %d: ublk dev %d thread %d exiting\n",
+ gettid(), dev_id, t->idx);
+ ublk_thread_deinit(t);
return NULL;
}
@@ -855,20 +914,20 @@ static int ublk_send_dev_event(const struct dev_ctx *ctx, struct ublk_dev *dev,
static int ublk_start_daemon(const struct dev_ctx *ctx, struct ublk_dev *dev)
{
const struct ublksrv_ctrl_dev_info *dinfo = &dev->dev_info;
- struct ublk_queue_info *qinfo;
+ struct ublk_thread_info *tinfo;
+ unsigned extra_flags = 0;
cpu_set_t *affinity_buf;
void *thread_ret;
- sem_t queue_sem;
+ sem_t ready;
int ret, i;
ublk_dbg(UBLK_DBG_DEV, "%s enter\n", __func__);
- qinfo = (struct ublk_queue_info *)calloc(sizeof(struct ublk_queue_info),
- dinfo->nr_hw_queues);
- if (!qinfo)
+ tinfo = calloc(sizeof(struct ublk_thread_info), dev->nthreads);
+ if (!tinfo)
return -ENOMEM;
- sem_init(&queue_sem, 0, 0);
+ sem_init(&ready, 0, 0);
ret = ublk_dev_prep(ctx, dev);
if (ret)
return ret;
@@ -877,22 +936,44 @@ static int ublk_start_daemon(const struct dev_ctx *ctx, struct ublk_dev *dev)
if (ret)
return ret;
+ if (ctx->auto_zc_fallback)
+ extra_flags = UBLKSRV_AUTO_BUF_REG_FALLBACK;
+
for (i = 0; i < dinfo->nr_hw_queues; i++) {
dev->q[i].dev = dev;
dev->q[i].q_id = i;
- qinfo[i].q = &dev->q[i];
- qinfo[i].queue_sem = &queue_sem;
- qinfo[i].affinity = &affinity_buf[i];
- qinfo[i].auto_zc_fallback = ctx->auto_zc_fallback;
- pthread_create(&dev->q[i].thread, NULL,
+ ret = ublk_queue_init(&dev->q[i], extra_flags);
+ if (ret) {
+ ublk_err("ublk dev %d queue %d init queue failed\n",
+ dinfo->dev_id, i);
+ goto fail;
+ }
+ }
+
+ for (i = 0; i < dev->nthreads; i++) {
+ tinfo[i].dev = dev;
+ tinfo[i].idx = i;
+ tinfo[i].ready = &ready;
+
+ /*
+ * If threads are not tied 1:1 to queues, setting thread
+ * affinity based on queue affinity makes little sense.
+ * However, thread CPU affinity has significant impact
+ * on performance, so to compare fairly, we'll still set
+ * thread CPU affinity based on queue affinity where
+ * possible.
+ */
+ if (dev->nthreads == dinfo->nr_hw_queues)
+ tinfo[i].affinity = &affinity_buf[i];
+ pthread_create(&dev->threads[i].thread, NULL,
ublk_io_handler_fn,
- &qinfo[i]);
+ &tinfo[i]);
}
- for (i = 0; i < dinfo->nr_hw_queues; i++)
- sem_wait(&queue_sem);
- free(qinfo);
+ for (i = 0; i < dev->nthreads; i++)
+ sem_wait(&ready);
+ free(tinfo);
free(affinity_buf);
/* everything is fine now, start us */
@@ -914,9 +995,11 @@ static int ublk_start_daemon(const struct dev_ctx *ctx, struct ublk_dev *dev)
ublk_send_dev_event(ctx, dev, dev->dev_info.dev_id);
/* wait until we are terminated */
- for (i = 0; i < dinfo->nr_hw_queues; i++)
- pthread_join(dev->q[i].thread, &thread_ret);
+ for (i = 0; i < dev->nthreads; i++)
+ pthread_join(dev->threads[i].thread, &thread_ret);
fail:
+ for (i = 0; i < dinfo->nr_hw_queues; i++)
+ ublk_queue_deinit(&dev->q[i]);
ublk_dev_unprep(dev);
ublk_dbg(UBLK_DBG_DEV, "%s exit\n", __func__);
@@ -1022,13 +1105,14 @@ wait:
static int __cmd_dev_add(const struct dev_ctx *ctx)
{
+ unsigned nthreads = ctx->nthreads;
unsigned nr_queues = ctx->nr_hw_queues;
const char *tgt_type = ctx->tgt_type;
unsigned depth = ctx->queue_depth;
__u64 features;
const struct ublk_tgt_ops *ops;
struct ublksrv_ctrl_dev_info *info;
- struct ublk_dev *dev;
+ struct ublk_dev *dev = NULL;
int dev_id = ctx->dev_id;
int ret, i;
@@ -1036,29 +1120,55 @@ static int __cmd_dev_add(const struct dev_ctx *ctx)
if (!ops) {
ublk_err("%s: no such tgt type, type %s\n",
__func__, tgt_type);
- return -ENODEV;
+ ret = -ENODEV;
+ goto fail;
}
if (nr_queues > UBLK_MAX_QUEUES || depth > UBLK_QUEUE_DEPTH) {
ublk_err("%s: invalid nr_queues or depth queues %u depth %u\n",
__func__, nr_queues, depth);
- return -EINVAL;
+ ret = -EINVAL;
+ goto fail;
+ }
+
+ /* default to 1:1 threads:queues if nthreads is unspecified */
+ if (!nthreads)
+ nthreads = nr_queues;
+
+ if (nthreads > UBLK_MAX_THREADS) {
+ ublk_err("%s: %u is too many threads (max %u)\n",
+ __func__, nthreads, UBLK_MAX_THREADS);
+ ret = -EINVAL;
+ goto fail;
+ }
+
+ if (nthreads != nr_queues && !ctx->per_io_tasks) {
+ ublk_err("%s: threads %u must be same as queues %u if "
+ "not using per_io_tasks\n",
+ __func__, nthreads, nr_queues);
+ ret = -EINVAL;
+ goto fail;
}
dev = ublk_ctrl_init();
if (!dev) {
ublk_err("%s: can't alloc dev id %d, type %s\n",
__func__, dev_id, tgt_type);
- return -ENOMEM;
+ ret = -ENOMEM;
+ goto fail;
}
/* kernel doesn't support get_features */
ret = ublk_ctrl_get_features(dev, &features);
- if (ret < 0)
- return -EINVAL;
+ if (ret < 0) {
+ ret = -EINVAL;
+ goto fail;
+ }
- if (!(features & UBLK_F_CMD_IOCTL_ENCODE))
- return -ENOTSUP;
+ if (!(features & UBLK_F_CMD_IOCTL_ENCODE)) {
+ ret = -ENOTSUP;
+ goto fail;
+ }
info = &dev->dev_info;
info->dev_id = ctx->dev_id;
@@ -1068,6 +1178,8 @@ static int __cmd_dev_add(const struct dev_ctx *ctx)
if ((features & UBLK_F_QUIESCE) &&
(info->flags & UBLK_F_USER_RECOVERY))
info->flags |= UBLK_F_QUIESCE;
+ dev->nthreads = nthreads;
+ dev->per_io_tasks = ctx->per_io_tasks;
dev->tgt.ops = ops;
dev->tgt.sq_depth = depth;
dev->tgt.cq_depth = depth;
@@ -1097,7 +1209,8 @@ static int __cmd_dev_add(const struct dev_ctx *ctx)
fail:
if (ret < 0)
ublk_send_dev_event(ctx, dev, -1);
- ublk_ctrl_deinit(dev);
+ if (dev)
+ ublk_ctrl_deinit(dev);
return ret;
}
@@ -1159,6 +1272,8 @@ run:
shmctl(ctx->_shmid, IPC_RMID, NULL);
/* wait for child and detach from it */
wait(NULL);
+ if (exit_code == EXIT_FAILURE)
+ ublk_err("%s: command failed\n", __func__);
exit(exit_code);
} else {
exit(EXIT_FAILURE);
@@ -1266,6 +1381,7 @@ static int cmd_dev_get_features(void)
[const_ilog2(UBLK_F_UPDATE_SIZE)] = "UPDATE_SIZE",
[const_ilog2(UBLK_F_AUTO_BUF_REG)] = "AUTO_BUF_REG",
[const_ilog2(UBLK_F_QUIESCE)] = "QUIESCE",
+ [const_ilog2(UBLK_F_PER_IO_DAEMON)] = "PER_IO_DAEMON",
};
struct ublk_dev *dev;
__u64 features = 0;
@@ -1360,8 +1476,10 @@ static void __cmd_create_help(char *exe, bool recovery)
exe, recovery ? "recover" : "add");
printf("\t[--foreground] [--quiet] [-z] [--auto_zc] [--auto_zc_fallback] [--debug_mask mask] [-r 0|1 ] [-g]\n");
printf("\t[-e 0|1 ] [-i 0|1]\n");
+ printf("\t[--nthreads threads] [--per_io_tasks]\n");
printf("\t[target options] [backfile1] [backfile2] ...\n");
printf("\tdefault: nr_queues=2(max 32), depth=128(max 1024), dev_id=-1(auto allocation)\n");
+ printf("\tdefault: nthreads=nr_queues");
for (i = 0; i < sizeof(tgt_ops_list) / sizeof(tgt_ops_list[0]); i++) {
const struct ublk_tgt_ops *ops = tgt_ops_list[i];
@@ -1418,6 +1536,8 @@ int main(int argc, char *argv[])
{ "auto_zc", 0, NULL, 0 },
{ "auto_zc_fallback", 0, NULL, 0 },
{ "size", 1, NULL, 's'},
+ { "nthreads", 1, NULL, 0 },
+ { "per_io_tasks", 0, NULL, 0 },
{ 0, 0, 0, 0 }
};
const struct ublk_tgt_ops *ops = NULL;
@@ -1493,6 +1613,10 @@ int main(int argc, char *argv[])
ctx.flags |= UBLK_F_AUTO_BUF_REG;
if (!strcmp(longopts[option_idx].name, "auto_zc_fallback"))
ctx.auto_zc_fallback = 1;
+ if (!strcmp(longopts[option_idx].name, "nthreads"))
+ ctx.nthreads = strtol(optarg, NULL, 10);
+ if (!strcmp(longopts[option_idx].name, "per_io_tasks"))
+ ctx.per_io_tasks = 1;
break;
case '?':
/*
diff --git a/tools/testing/selftests/ublk/kublk.h b/tools/testing/selftests/ublk/kublk.h
index e34508bf5798..6be601536b3d 100644
--- a/tools/testing/selftests/ublk/kublk.h
+++ b/tools/testing/selftests/ublk/kublk.h
@@ -49,11 +49,14 @@
#define UBLKSRV_IO_IDLE_SECS 20
#define UBLK_IO_MAX_BYTES (1 << 20)
-#define UBLK_MAX_QUEUES 32
+#define UBLK_MAX_QUEUES_SHIFT 5
+#define UBLK_MAX_QUEUES (1 << UBLK_MAX_QUEUES_SHIFT)
+#define UBLK_MAX_THREADS_SHIFT 5
+#define UBLK_MAX_THREADS (1 << UBLK_MAX_THREADS_SHIFT)
#define UBLK_QUEUE_DEPTH 1024
#define UBLK_DBG_DEV (1U << 0)
-#define UBLK_DBG_QUEUE (1U << 1)
+#define UBLK_DBG_THREAD (1U << 1)
#define UBLK_DBG_IO_CMD (1U << 2)
#define UBLK_DBG_IO (1U << 3)
#define UBLK_DBG_CTRL_CMD (1U << 4)
@@ -61,6 +64,7 @@
struct ublk_dev;
struct ublk_queue;
+struct ublk_thread;
struct stripe_ctx {
/* stripe */
@@ -76,6 +80,7 @@ struct dev_ctx {
char tgt_type[16];
unsigned long flags;
unsigned nr_hw_queues;
+ unsigned short nthreads;
unsigned queue_depth;
int dev_id;
int nr_files;
@@ -85,6 +90,7 @@ struct dev_ctx {
unsigned int fg:1;
unsigned int recovery:1;
unsigned int auto_zc_fallback:1;
+ unsigned int per_io_tasks:1;
int _evtfd;
int _shmid;
@@ -123,10 +129,14 @@ struct ublk_io {
unsigned short flags;
unsigned short refs; /* used by target code only */
+ int tag;
+
int result;
+ unsigned short buf_index;
unsigned short tgt_ios;
void *private_data;
+ struct ublk_thread *t;
};
struct ublk_tgt_ops {
@@ -165,28 +175,39 @@ struct ublk_tgt {
+/*
+ * Per-queue state. With this change the io_uring instance, the in-flight
+ * counters, the tid and the STOPPING/IDLE flags are removed from the
+ * queue; the ring, the counters and the flags reappear in
+ * struct ublk_thread.
+ */
struct ublk_queue {
int q_id;
int q_depth;
- unsigned int cmd_inflight;
- unsigned int io_inflight;
struct ublk_dev *dev;
const struct ublk_tgt_ops *tgt_ops;
struct ublksrv_io_desc *io_cmd_buf;
- struct io_uring ring;
+
struct ublk_io ios[UBLK_QUEUE_DEPTH];
-#define UBLKSRV_QUEUE_STOPPING (1U << 0)
-#define UBLKSRV_QUEUE_IDLE (1U << 1)
+ /* bits 0 and 1 of the queue state are no longer defined here */
#define UBLKSRV_NO_BUF (1U << 2)
#define UBLKSRV_ZC (1U << 3)
#define UBLKSRV_AUTO_BUF_REG (1U << 4)
#define UBLKSRV_AUTO_BUF_REG_FALLBACK (1U << 5)
unsigned state;
- pid_t tid;
+};
+
+/*
+ * Per-thread state: each thread carries its own io_uring, in-flight
+ * counters and pthread handle. io_inflight is adjusted by the target-I/O
+ * helpers through io->t; cmd_inflight presumably counts ublk commands
+ * (TODO confirm against kublk.c).
+ */
+struct ublk_thread {
+ struct ublk_dev *dev;
+ struct io_uring ring;
+ unsigned int cmd_inflight;
+ unsigned int io_inflight;
+
pthread_t thread;
+ /* presumably this thread's slot in dev->threads[] (TODO confirm) */
+ unsigned idx;
+
+#define UBLKSRV_THREAD_STOPPING (1U << 0)
+#define UBLKSRV_THREAD_IDLE (1U << 1)
+ unsigned state;
};
struct ublk_dev {
struct ublk_tgt tgt;
struct ublksrv_ctrl_dev_info dev_info;
struct ublk_queue q[UBLK_MAX_QUEUES];
+ struct ublk_thread threads[UBLK_MAX_THREADS];
+ unsigned nthreads;
+ unsigned per_io_tasks;
int fds[MAX_BACK_FILES + 1]; /* fds[0] points to /dev/ublkcN */
int nr_fds;
@@ -211,7 +232,7 @@ struct ublk_dev {
extern unsigned int ublk_dbg_mask;
-extern int ublk_queue_io_cmd(struct ublk_queue *q, struct ublk_io *io, unsigned tag);
+extern int ublk_queue_io_cmd(struct ublk_io *io);
static inline int ublk_io_auto_zc_fallback(const struct ublksrv_io_desc *iod)
@@ -225,11 +246,14 @@ static inline int is_target_io(__u64 user_data)
}
+/*
+ * Pack the fields identifying an SQE into one 64-bit user_data value:
+ *   bits  0-15  tag
+ *   bits 16-23  op
+ *   bits 24-39  tgt_data
+ *   bits 56-62  q_id
+ *   bit  63     is_target_io
+ * Each field is widened to __u64 before it is shifted: tgt_data << 24
+ * reaches past bit 31, so shifting it as a 32-bit unsigned would drop
+ * the upper bits that user_data_to_tgt_data() later reads back.
+ */
static inline __u64 build_user_data(unsigned tag, unsigned op,
- unsigned tgt_data, unsigned is_target_io)
+ unsigned tgt_data, unsigned q_id, unsigned is_target_io)
{
- assert(!(tag >> 16) && !(op >> 8) && !(tgt_data >> 16));
+ /* we only have 7 bits to encode q_id */
+ _Static_assert(UBLK_MAX_QUEUES_SHIFT <= 7,
+ "q_id must fit in the 7 bits reserved in user_data");
+ assert(!(tag >> 16) && !(op >> 8) && !(tgt_data >> 16) && !(q_id >> 7));
- return tag | (op << 16) | (tgt_data << 24) | (__u64)is_target_io << 63;
+ return (__u64)tag | ((__u64)op << 16) | ((__u64)tgt_data << 24) |
+ ((__u64)q_id << 56) | ((__u64)is_target_io << 63);
}
static inline unsigned int user_data_to_tag(__u64 user_data)
@@ -247,6 +271,11 @@ static inline unsigned int user_data_to_tgt_data(__u64 user_data)
return (user_data >> 24) & 0xffff;
}
+/* Extract the queue id from user_data: the 7 bits starting at bit 56. */
+static inline unsigned int user_data_to_q_id(__u64 user_data)
+{
+ const __u64 shifted = user_data >> 56;
+
+ return shifted & 0x7f;
+}
+
static inline unsigned short ublk_cmd_op_nr(unsigned int op)
{
return _IOC_NR(op);
@@ -280,17 +309,23 @@ static inline void ublk_dbg(int level, const char *fmt, ...)
}
}
-static inline int ublk_queue_alloc_sqes(struct ublk_queue *q,
+/*
+ * Recover the queue that embeds @io: io->tag is used as the index of @io
+ * inside that queue's ios[] array.
+ */
+static inline struct ublk_queue *ublk_io_to_queue(const struct ublk_io *io)
+{
+ const int slot = io->tag;
+
+ return container_of(io, struct ublk_queue, ios[slot]);
+}
+
+static inline int ublk_io_alloc_sqes(struct ublk_io *io,
struct io_uring_sqe *sqes[], int nr_sqes)
{
- unsigned left = io_uring_sq_space_left(&q->ring);
+ struct io_uring *ring = &io->t->ring;
+ unsigned left = io_uring_sq_space_left(ring);
int i;
if (left < nr_sqes)
- io_uring_submit(&q->ring);
+ io_uring_submit(ring);
for (i = 0; i < nr_sqes; i++) {
- sqes[i] = io_uring_get_sqe(&q->ring);
+ sqes[i] = io_uring_get_sqe(ring);
if (!sqes[i])
return i;
}
@@ -373,7 +408,7 @@ static inline int ublk_complete_io(struct ublk_queue *q, unsigned tag, int res)
ublk_mark_io_done(io, res);
- return ublk_queue_io_cmd(q, io, tag);
+ return ublk_queue_io_cmd(io);
}
static inline void ublk_queued_tgt_io(struct ublk_queue *q, unsigned tag, int queued)
@@ -383,7 +418,7 @@ static inline void ublk_queued_tgt_io(struct ublk_queue *q, unsigned tag, int qu
else {
struct ublk_io *io = ublk_get_io(q, tag);
- q->io_inflight += queued;
+ io->t->io_inflight += queued;
io->tgt_ios = queued;
io->result = 0;
}
@@ -393,7 +428,7 @@ static inline int ublk_completed_tgt_io(struct ublk_queue *q, unsigned tag)
{
struct ublk_io *io = ublk_get_io(q, tag);
- q->io_inflight--;
+ io->t->io_inflight--;
return --io->tgt_ios == 0;
}
diff --git a/tools/testing/selftests/ublk/null.c b/tools/testing/selftests/ublk/null.c
index 44aca31cf2b0..afe0b99d77ee 100644
--- a/tools/testing/selftests/ublk/null.c
+++ b/tools/testing/selftests/ublk/null.c
@@ -43,7 +43,7 @@ static int ublk_null_tgt_init(const struct dev_ctx *ctx, struct ublk_dev *dev)
}
static void __setup_nop_io(int tag, const struct ublksrv_io_desc *iod,
- struct io_uring_sqe *sqe)
+ struct io_uring_sqe *sqe, int q_id)
{
unsigned ublk_op = ublksrv_get_op(iod);
@@ -52,7 +52,7 @@ static void __setup_nop_io(int tag, const struct ublksrv_io_desc *iod,
sqe->flags |= IOSQE_FIXED_FILE;
sqe->rw_flags = IORING_NOP_FIXED_BUFFER | IORING_NOP_INJECT_RESULT;
sqe->len = iod->nr_sectors << 9; /* injected result */
- sqe->user_data = build_user_data(tag, ublk_op, 0, 1);
+ sqe->user_data = build_user_data(tag, ublk_op, 0, q_id, 1);
}
static int null_queue_zc_io(struct ublk_queue *q, int tag)
@@ -60,18 +60,18 @@ static int null_queue_zc_io(struct ublk_queue *q, int tag)
const struct ublksrv_io_desc *iod = ublk_get_iod(q, tag);
struct io_uring_sqe *sqe[3];
- ublk_queue_alloc_sqes(q, sqe, 3);
+ ublk_io_alloc_sqes(ublk_get_io(q, tag), sqe, 3);
- io_uring_prep_buf_register(sqe[0], 0, tag, q->q_id, tag);
+ io_uring_prep_buf_register(sqe[0], 0, tag, q->q_id, ublk_get_io(q, tag)->buf_index);
sqe[0]->user_data = build_user_data(tag,
- ublk_cmd_op_nr(sqe[0]->cmd_op), 0, 1);
+ ublk_cmd_op_nr(sqe[0]->cmd_op), 0, q->q_id, 1);
sqe[0]->flags |= IOSQE_CQE_SKIP_SUCCESS | IOSQE_IO_HARDLINK;
- __setup_nop_io(tag, iod, sqe[1]);
+ __setup_nop_io(tag, iod, sqe[1], q->q_id);
sqe[1]->flags |= IOSQE_IO_HARDLINK;
- io_uring_prep_buf_unregister(sqe[2], 0, tag, q->q_id, tag);
- sqe[2]->user_data = build_user_data(tag, ublk_cmd_op_nr(sqe[2]->cmd_op), 0, 1);
+ io_uring_prep_buf_unregister(sqe[2], 0, tag, q->q_id, ublk_get_io(q, tag)->buf_index);
+ sqe[2]->user_data = build_user_data(tag, ublk_cmd_op_nr(sqe[2]->cmd_op), 0, q->q_id, 1);
// buf register is marked as IOSQE_CQE_SKIP_SUCCESS
return 2;
@@ -82,8 +82,8 @@ static int null_queue_auto_zc_io(struct ublk_queue *q, int tag)
const struct ublksrv_io_desc *iod = ublk_get_iod(q, tag);
struct io_uring_sqe *sqe[1];
- ublk_queue_alloc_sqes(q, sqe, 1);
- __setup_nop_io(tag, iod, sqe[0]);
+ ublk_io_alloc_sqes(ublk_get_io(q, tag), sqe, 1);
+ __setup_nop_io(tag, iod, sqe[0], q->q_id);
return 1;
}
@@ -136,7 +136,7 @@ static unsigned short ublk_null_buf_index(const struct ublk_queue *q, int tag)
{
if (q->state & UBLKSRV_AUTO_BUF_REG_FALLBACK)
return (unsigned short)-1;
- return tag;
+ return q->ios[tag].buf_index;
}
const struct ublk_tgt_ops null_tgt_ops = {
diff --git a/tools/testing/selftests/ublk/stripe.c b/tools/testing/selftests/ublk/stripe.c
index 404a143bf3d6..37d50bbf5f5e 100644
--- a/tools/testing/selftests/ublk/stripe.c
+++ b/tools/testing/selftests/ublk/stripe.c
@@ -138,13 +138,13 @@ static int stripe_queue_tgt_rw_io(struct ublk_queue *q, const struct ublksrv_io_
io->private_data = s;
calculate_stripe_array(conf, iod, s, base);
- ublk_queue_alloc_sqes(q, sqe, s->nr + extra);
+ ublk_io_alloc_sqes(ublk_get_io(q, tag), sqe, s->nr + extra);
if (zc) {
- io_uring_prep_buf_register(sqe[0], 0, tag, q->q_id, tag);
+ io_uring_prep_buf_register(sqe[0], 0, tag, q->q_id, io->buf_index);
sqe[0]->flags |= IOSQE_CQE_SKIP_SUCCESS | IOSQE_IO_HARDLINK;
sqe[0]->user_data = build_user_data(tag,
- ublk_cmd_op_nr(sqe[0]->cmd_op), 0, 1);
+ ublk_cmd_op_nr(sqe[0]->cmd_op), 0, q->q_id, 1);
}
for (i = zc; i < s->nr + extra - zc; i++) {
@@ -162,13 +162,14 @@ static int stripe_queue_tgt_rw_io(struct ublk_queue *q, const struct ublksrv_io_
sqe[i]->flags |= IOSQE_IO_HARDLINK;
}
/* bit63 marks us as tgt io */
- sqe[i]->user_data = build_user_data(tag, ublksrv_get_op(iod), i - zc, 1);
+ sqe[i]->user_data = build_user_data(tag, ublksrv_get_op(iod), i - zc, q->q_id, 1);
}
if (zc) {
struct io_uring_sqe *unreg = sqe[s->nr + 1];
- io_uring_prep_buf_unregister(unreg, 0, tag, q->q_id, tag);
- unreg->user_data = build_user_data(tag, ublk_cmd_op_nr(unreg->cmd_op), 0, 1);
+ io_uring_prep_buf_unregister(unreg, 0, tag, q->q_id, io->buf_index);
+ unreg->user_data = build_user_data(
+ tag, ublk_cmd_op_nr(unreg->cmd_op), 0, q->q_id, 1);
}
/* register buffer is skip_success */
@@ -181,11 +182,11 @@ static int handle_flush(struct ublk_queue *q, const struct ublksrv_io_desc *iod,
struct io_uring_sqe *sqe[NR_STRIPE];
int i;
- ublk_queue_alloc_sqes(q, sqe, conf->nr_files);
+ ublk_io_alloc_sqes(ublk_get_io(q, tag), sqe, conf->nr_files);
for (i = 0; i < conf->nr_files; i++) {
io_uring_prep_fsync(sqe[i], i + 1, IORING_FSYNC_DATASYNC);
io_uring_sqe_set_flags(sqe[i], IOSQE_FIXED_FILE);
- sqe[i]->user_data = build_user_data(tag, UBLK_IO_OP_FLUSH, 0, 1);
+ sqe[i]->user_data = build_user_data(tag, UBLK_IO_OP_FLUSH, 0, q->q_id, 1);
}
return conf->nr_files;
}
diff --git a/tools/testing/selftests/ublk/test_common.sh b/tools/testing/selftests/ublk/test_common.sh
index 0145569ee7e9..8a4dbd09feb0 100755
--- a/tools/testing/selftests/ublk/test_common.sh
+++ b/tools/testing/selftests/ublk/test_common.sh
@@ -278,6 +278,11 @@ __run_io_and_remove()
fio --name=job1 --filename=/dev/ublkb"${dev_id}" --ioengine=libaio \
--rw=randrw --norandommap --iodepth=256 --size="${size}" --numjobs="$(nproc)" \
--runtime=20 --time_based > /dev/null 2>&1 &
+ fio --name=batchjob --filename=/dev/ublkb"${dev_id}" --ioengine=io_uring \
+ --rw=randrw --norandommap --iodepth=256 --size="${size}" \
+ --numjobs="$(nproc)" --runtime=20 --time_based \
+ --iodepth_batch_submit=32 --iodepth_batch_complete_min=32 \
+ --force_async=7 > /dev/null 2>&1 &
sleep 2
if [ "${kill_server}" = "yes" ]; then
local state
diff --git a/tools/testing/selftests/ublk/test_generic_12.sh b/tools/testing/selftests/ublk/test_generic_12.sh
new file mode 100755
index 000000000000..7abbb00d251d
--- /dev/null
+++ b/tools/testing/selftests/ublk/test_generic_12.sh
@@ -0,0 +1,55 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+#
+# Issue all I/O from a single CPU against a null ublk device created with
+# --per_io_tasks and check, via a bpftrace per-tid completion count, that
+# every one of the $NTHREADS server threads handled some I/O.
+
+. "$(cd "$(dirname "$0")" && pwd)"/test_common.sh
+
+TID="generic_12"
+ERR_CODE=0
+
+if ! _have_program bpftrace; then
+ exit "$UBLK_SKIP_CODE"
+fi
+
+_prep_test "null" "do imbalanced load, it should be balanced over I/O threads"
+
+# --per_io_tasks depends on the PER_IO_DAEMON feature; skip (rather than
+# fail at device creation) when the running kernel does not report it
+if ! _have_feature "PER_IO_DAEMON"; then
+ _cleanup_test "null"
+ exit "$UBLK_SKIP_CODE"
+fi
+
+NTHREADS=6
+dev_id=$(_add_ublk_dev -t null -q 4 -d 16 --nthreads "$NTHREADS" --per_io_tasks)
+_check_add_dev $TID $?
+
+dev_t=$(_get_disk_dev_t "$dev_id")
+bpftrace trace/count_ios_per_tid.bt "$dev_t" > "$UBLK_TMP" 2>&1 &
+btrace_pid=$!
+sleep 2
+
+# bpftrace exited already (e.g. it could not attach): nothing to measure
+if ! kill -0 "$btrace_pid" > /dev/null 2>&1; then
+ _cleanup_test "null"
+ exit "$UBLK_SKIP_CODE"
+fi
+
+# do imbalanced I/O on the ublk device
+# pin to cpu 0 to prevent migration/only target one queue
+fio --name=write_seq \
+ --filename=/dev/ublkb"${dev_id}" \
+ --ioengine=libaio --iodepth=16 \
+ --rw=write \
+ --size=512M \
+ --direct=1 \
+ --bs=4k \
+ --cpus_allowed=0 > /dev/null 2>&1
+ERR_CODE=$?
+kill "$btrace_pid"
+wait
+
+# check that every task handles some I/O, even though all I/O was issued
+# from a single CPU. when ublk gets support for round-robin tag
+# allocation, this check can be strengthened to assert that every thread
+# handles the same number of I/Os
+NR_THREADS_THAT_HANDLED_IO=$(grep -c '@' "${UBLK_TMP}")
+if [[ $NR_THREADS_THAT_HANDLED_IO -ne $NTHREADS ]]; then
+ echo "only $NR_THREADS_THAT_HANDLED_IO handled I/O! expected $NTHREADS"
+ cat "$UBLK_TMP"
+ ERR_CODE=255
+fi
+
+_cleanup_test "null"
+_show_result $TID $ERR_CODE
diff --git a/tools/testing/selftests/ublk/test_stress_03.sh b/tools/testing/selftests/ublk/test_stress_03.sh
index 7d728ce50774..6eef282d569f 100755
--- a/tools/testing/selftests/ublk/test_stress_03.sh
+++ b/tools/testing/selftests/ublk/test_stress_03.sh
@@ -41,5 +41,13 @@ if _have_feature "AUTO_BUF_REG"; then
fi
wait
+if _have_feature "PER_IO_DAEMON"; then
+ ublk_io_and_remove 8G -t null -q 4 --auto_zc --nthreads 8 --per_io_tasks &
+ ublk_io_and_remove 256M -t loop -q 4 --auto_zc --nthreads 8 --per_io_tasks "${UBLK_BACKFILES[0]}" &
+ ublk_io_and_remove 256M -t stripe -q 4 --auto_zc --nthreads 8 --per_io_tasks "${UBLK_BACKFILES[1]}" "${UBLK_BACKFILES[2]}" &
+ ublk_io_and_remove 8G -t null -q 4 -z --auto_zc --auto_zc_fallback --nthreads 8 --per_io_tasks &
+fi
+wait
+
_cleanup_test "stress"
_show_result $TID $ERR_CODE
diff --git a/tools/testing/selftests/ublk/test_stress_04.sh b/tools/testing/selftests/ublk/test_stress_04.sh
index 9bcfa64ea1f0..40d1437ca298 100755
--- a/tools/testing/selftests/ublk/test_stress_04.sh
+++ b/tools/testing/selftests/ublk/test_stress_04.sh
@@ -38,6 +38,13 @@ if _have_feature "AUTO_BUF_REG"; then
ublk_io_and_kill_daemon 256M -t stripe -q 4 --auto_zc "${UBLK_BACKFILES[1]}" "${UBLK_BACKFILES[2]}" &
ublk_io_and_kill_daemon 8G -t null -q 4 -z --auto_zc --auto_zc_fallback &
fi
+
+if _have_feature "PER_IO_DAEMON"; then
+ ublk_io_and_kill_daemon 8G -t null -q 4 --nthreads 8 --per_io_tasks &
+ ublk_io_and_kill_daemon 256M -t loop -q 4 --nthreads 8 --per_io_tasks "${UBLK_BACKFILES[0]}" &
+ ublk_io_and_kill_daemon 256M -t stripe -q 4 --nthreads 8 --per_io_tasks "${UBLK_BACKFILES[1]}" "${UBLK_BACKFILES[2]}" &
+ ublk_io_and_kill_daemon 8G -t null -q 4 --nthreads 8 --per_io_tasks &
+fi
wait
_cleanup_test "stress"
diff --git a/tools/testing/selftests/ublk/test_stress_05.sh b/tools/testing/selftests/ublk/test_stress_05.sh
index bcfc904cefc6..566cfd90d192 100755
--- a/tools/testing/selftests/ublk/test_stress_05.sh
+++ b/tools/testing/selftests/ublk/test_stress_05.sh
@@ -69,5 +69,12 @@ if _have_feature "AUTO_BUF_REG"; then
done
fi
+if _have_feature "PER_IO_DAEMON"; then
+ ublk_io_and_remove 8G -t null -q 4 --nthreads 8 --per_io_tasks -r 1 -i "$reissue" &
+ ublk_io_and_remove 256M -t loop -q 4 --nthreads 8 --per_io_tasks -r 1 -i "$reissue" "${UBLK_BACKFILES[0]}" &
+ ublk_io_and_remove 8G -t null -q 4 --nthreads 8 --per_io_tasks -r 1 -i "$reissue" &
+fi
+wait
+
_cleanup_test "stress"
_show_result $TID $ERR_CODE
diff --git a/tools/testing/selftests/ublk/trace/count_ios_per_tid.bt b/tools/testing/selftests/ublk/trace/count_ios_per_tid.bt
new file mode 100644
index 000000000000..f4aa63ff2938
--- /dev/null
+++ b/tools/testing/selftests/ublk/trace/count_ios_per_tid.bt
@@ -0,0 +1,11 @@
+/*
+ * Tabulates and prints I/O completions per thread for the given device
+ *
+ * $1: dev_t
+ *
+ * The script contains no explicit print; it relies on bpftrace dumping
+ * its maps when it exits.
+*/
+tracepoint:block:block_rq_complete
+{
+ /* ignore completions that belong to other block devices */
+ if (args.dev == $1) {
+ /* one map entry per thread id seen in this probe */
+ @[tid] = count();
+ }
+}
diff --git a/tools/testing/selftests/vDSO/vgetrandom-chacha.S b/tools/testing/selftests/vDSO/vgetrandom-chacha.S
index d6e09af7c0a9..a4a82e1c28a9 100644
--- a/tools/testing/selftests/vDSO/vgetrandom-chacha.S
+++ b/tools/testing/selftests/vDSO/vgetrandom-chacha.S
@@ -11,6 +11,8 @@
#include "../../../../arch/loongarch/vdso/vgetrandom-chacha.S"
#elif defined(__powerpc__) || defined(__powerpc64__)
#include "../../../../arch/powerpc/kernel/vdso/vgetrandom-chacha.S"
+#elif defined(__riscv) && __riscv_xlen == 64
+#include "../../../../arch/riscv/kernel/vdso/vgetrandom-chacha.S"
#elif defined(__s390x__)
#include "../../../../arch/s390/kernel/vdso64/vgetrandom-chacha.S"
#elif defined(__x86_64__)
diff --git a/tools/testing/vma/vma_internal.h b/tools/testing/vma/vma_internal.h
index 441feb21aa5a..4505b1c31be1 100644
--- a/tools/testing/vma/vma_internal.h
+++ b/tools/testing/vma/vma_internal.h
@@ -932,6 +932,8 @@ static inline void vma_adjust_trans_huge(struct vm_area_struct *vma,
(void)next;
}
+/*
+ * No-op stub of hugetlb_split() for this test header. The parameters are
+ * named because unnamed parameters in a function definition are only
+ * valid from C23 on, and cast to void like the neighbouring stub does to
+ * mark them intentionally unused.
+ */
+static inline void hugetlb_split(struct vm_area_struct *vma, unsigned long addr)
+{
+ (void)vma;
+ (void)addr;
+}
+
static inline void vma_iter_free(struct vma_iterator *vmi)
{
mas_destroy(&vmi->mas);