27 files changed, 897 insertions, 339 deletions
diff --git a/tools/bpf/resolve_btfids/Makefile b/tools/bpf/resolve_btfids/Makefile
index afbddea3a39c..ce1b556dfa90 100644
--- a/tools/bpf/resolve_btfids/Makefile
+++ b/tools/bpf/resolve_btfids/Makefile
@@ -17,7 +17,7 @@ endif
 
 # Overrides for the prepare step libraries.
 HOST_OVERRIDES := AR="$(HOSTAR)" CC="$(HOSTCC)" LD="$(HOSTLD)" ARCH="$(HOSTARCH)" \
-		  CROSS_COMPILE="" EXTRA_CFLAGS="$(HOSTCFLAGS)"
+		  CROSS_COMPILE="" CLANG_CROSS_FLAGS="" EXTRA_CFLAGS="$(HOSTCFLAGS)"
 
 RM      ?= rm
 HOSTCC  ?= gcc
diff --git a/tools/iio/iio_generic_buffer.c b/tools/iio/iio_generic_buffer.c
index 9ef5ee087eda..bc82bb6a7a2a 100644
--- a/tools/iio/iio_generic_buffer.c
+++ b/tools/iio/iio_generic_buffer.c
@@ -335,7 +335,7 @@ static const struct option longopts[] = {
 	{ "device-num",		1, 0, 'N' },
 	{ "trigger-name",	1, 0, 't' },
 	{ "trigger-num",	1, 0, 'T' },
-	{ },
+	{ }
 };
 
 int main(int argc, char **argv)
diff --git a/tools/lib/bpf/btf.c b/tools/lib/bpf/btf.c
index f1d495dc66bb..37682908cb0f 100644
--- a/tools/lib/bpf/btf.c
+++ b/tools/lib/bpf/btf.c
@@ -1384,12 +1384,12 @@ static struct btf *btf_parse_raw_mmap(const char *path, struct btf *base_btf)
 
 	fd = open(path, O_RDONLY);
 	if (fd < 0)
-		return libbpf_err_ptr(-errno);
+		return ERR_PTR(-errno);
 
 	if (fstat(fd, &st) < 0) {
 		err = -errno;
 		close(fd);
-		return libbpf_err_ptr(err);
+		return ERR_PTR(err);
 	}
 
 	data = mmap(NULL, st.st_size, PROT_READ, MAP_PRIVATE, fd, 0);
@@ -1397,7 +1397,7 @@ static struct btf *btf_parse_raw_mmap(const char *path, struct btf *base_btf)
 	close(fd);
 
 	if (data == MAP_FAILED)
-		return libbpf_err_ptr(err);
+		return ERR_PTR(err);
 
 	btf = btf_new(data, st.st_size, base_btf, true);
 	if (IS_ERR(btf))
diff --git a/tools/perf/util/symbol-elf.c b/tools/perf/util/symbol-elf.c
index 01818abd02df..6d2c280a1730 100644
--- a/tools/perf/util/symbol-elf.c
+++ b/tools/perf/util/symbol-elf.c
@@ -1668,6 +1668,12 @@ dso__load_sym_internal(struct dso *dso, struct map *map, struct symsrc *syms_ss,
 				continue;
 		}
 
+		/* Reject RISCV ELF "mapping symbols" */
+		if (ehdr.e_machine == EM_RISCV) {
+			if (elf_name[0] == '$' && strchr("dx", elf_name[1]))
+				continue;
+		}
+
 		if (runtime_ss->opdsec && sym.st_shndx == runtime_ss->opdidx) {
 			u32 offset = sym.st_value - syms_ss->opdshdr.sh_addr;
 			u64 *opd = opddata->d_buf + offset;
diff --git a/tools/power/x86/turbostat/turbostat.8 b/tools/power/x86/turbostat/turbostat.8
index b74ed916057e..fb11108aaf42 100644
--- a/tools/power/x86/turbostat/turbostat.8
+++ b/tools/power/x86/turbostat/turbostat.8
@@ -100,7 +100,7 @@ The column name "all" can be used to enable all disabled-by-default built-in cou
 .PP
 \fB--show column\fP show only the specified built-in columns.  May be invoked multiple times, or with a comma-separated list of column names.
 .PP
-\fB--show CATEGORY --hide CATEGORY\fP  Show and hide also accept a single CATEGORY of columns: "all", "topology", "idle", "frequency", "power", "cpuidle", "hwidle", "swidle", "other".  "idle" (enabled by default), includes "hwidle" and "idle_pct".  "cpuidle" (default disabled) includes cpuidle software invocation counters.  "swidle" includes "cpuidle" plus "idle_pct".  "hwidle" includes only hardware based idle residency counters.  Older versions of turbostat used the term "sysfs" for what is now "swidle".
+\fB--show CATEGORY --hide CATEGORY\fP  Show and hide also accept a single CATEGORY of columns: "all", "topology", "idle", "frequency", "power", "cpuidle", "hwidle", "swidle", "other".  "idle" (enabled by default), includes "hwidle" and "pct_idle".  "cpuidle" (default disabled) includes cpuidle software invocation counters.  "swidle" includes "cpuidle" plus "pct_idle".  "hwidle" includes only hardware based idle residency counters.  Older versions of turbostat used the term "sysfs" for what is now "swidle".
 .PP
 \fB--Dump\fP displays the raw counter values.
 .PP
@@ -204,8 +204,8 @@ The system configuration dump (if --quiet is not used) is followed by statistics
 .PP
 \fBUncMHz\fP per-package uncore MHz, instantaneous sample.
 .PP
-\fBUMHz1.0\fP per-package uncore MHz for domain=1 and fabric_cluster=0, instantaneous sample.  System summary is the average of all packages.
-Intel Granite Rapids systems use domains 0-2 for CPUs, and 3-4 for IO, with cluster always 0.
+\fBUMHz1.0\fP per-package uncore MHz for pm_domain=1 and fabric_cluster=0, instantaneous sample.  System summary is the average of all packages.
+Intel Granite Rapids systems use pm_domains 0-2 for CPUs, and 3-4 for IO, with cluster always 0.
 For the "--show" and "--hide" options, use "UncMHz" to operate on all UMHz*.* as a group.
 .SH TOO MUCH INFORMATION EXAMPLE
 By default, turbostat dumps all possible information -- a system configuration header, followed by columns for all counters.
diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c
index 0170d3cc6819..5230e072e414 100644
--- a/tools/power/x86/turbostat/turbostat.c
+++ b/tools/power/x86/turbostat/turbostat.c
@@ -280,7 +280,7 @@ struct msr_counter bic[] = {
 #define BIC_GROUP_FREQUENCY (BIC_Avg_MHz | BIC_Busy | BIC_Bzy_MHz | BIC_TSC_MHz | BIC_GFXMHz | BIC_GFXACTMHz | BIC_SAMMHz | BIC_SAMACTMHz | BIC_UNCORE_MHZ)
 #define BIC_GROUP_HW_IDLE (BIC_Busy | BIC_CPU_c1 | BIC_CPU_c3 | BIC_CPU_c6 | BIC_CPU_c7 | BIC_GFX_rc6 | BIC_Pkgpc2 | BIC_Pkgpc3 | BIC_Pkgpc6 | BIC_Pkgpc7 | BIC_Pkgpc8 | BIC_Pkgpc9 | BIC_Pkgpc10 | BIC_CPU_LPI | BIC_SYS_LPI | BIC_Mod_c6 | BIC_Totl_c0 | BIC_Any_c0 | BIC_GFX_c0 | BIC_CPUGFX | BIC_SAM_mc6 | BIC_Diec6)
 #define BIC_GROUP_SW_IDLE (BIC_Busy | BIC_cpuidle | BIC_pct_idle )
-#define BIC_GROUP_IDLE (BIC_GROUP_HW_IDLE | BIC_pct_idle) 
+#define BIC_GROUP_IDLE (BIC_GROUP_HW_IDLE | BIC_pct_idle)
 #define BIC_OTHER (BIC_IRQ | BIC_NMI | BIC_SMI | BIC_ThreadC | BIC_CoreTmp | BIC_IPC)
 
 #define BIC_DISABLED_BY_DEFAULT	(BIC_USEC | BIC_TOD | BIC_APIC | BIC_X2APIC | BIC_cpuidle)
@@ -539,7 +539,7 @@ enum rapl_msrs {
 #define RAPL_PKG_ALL	(RAPL_PKG | RAPL_PKG_PERF_STATUS | RAPL_PKG_POWER_INFO)
 #define RAPL_DRAM_ALL	(RAPL_DRAM | RAPL_DRAM_PERF_STATUS | RAPL_DRAM_POWER_INFO)
 #define RAPL_CORE_ALL	(RAPL_CORE | RAPL_CORE_POLICY)
-#define RAPL_GFX_ALL	(RAPL_GFX | RAPL_GFX_POLIGY)
+#define RAPL_GFX_ALL	(RAPL_GFX | RAPL_GFX_POLICY)
 
 #define RAPL_AMD_F17H	(RAPL_AMD_PWR_UNIT | RAPL_AMD_CORE_ENERGY_STAT | RAPL_AMD_PKG_ENERGY_STAT)
 
@@ -839,6 +839,23 @@ static const struct platform_features spr_features = {
 	.rapl_msrs = RAPL_PKG_ALL | RAPL_DRAM_ALL | RAPL_PSYS,
 };
 
+static const struct platform_features dmr_features = {
+	.has_msr_misc_feature_control = spr_features.has_msr_misc_feature_control,
+	.has_msr_misc_pwr_mgmt = spr_features.has_msr_misc_pwr_mgmt,
+	.has_nhm_msrs = spr_features.has_nhm_msrs,
+	.has_config_tdp = spr_features.has_config_tdp,
+	.bclk_freq = spr_features.bclk_freq,
+	.supported_cstates = spr_features.supported_cstates,
+	.cst_limit = spr_features.cst_limit,
+	.has_msr_core_c1_res = spr_features.has_msr_core_c1_res,
+	.has_msr_module_c6_res_ms = 1,	/* DMR has Dual Core Module and MC6 MSR */
+	.has_irtl_msrs = spr_features.has_irtl_msrs,
+	.has_cst_prewake_bit = spr_features.has_cst_prewake_bit,
+	.has_fixed_rapl_psys_unit = spr_features.has_fixed_rapl_psys_unit,
+	.trl_msrs = spr_features.trl_msrs,
+	.rapl_msrs = 0,		/* DMR does not have RAPL MSRs */
+};
+
 static const struct platform_features srf_features = {
 	.has_msr_misc_feature_control = 1,
 	.has_msr_misc_pwr_mgmt = 1,
@@ -1028,12 +1045,14 @@ static const struct platform_data turbostat_pdata[] = {
 	{ INTEL_EMERALDRAPIDS_X, &spr_features },
 	{ INTEL_GRANITERAPIDS_X, &spr_features },
 	{ INTEL_GRANITERAPIDS_D, &spr_features },
+	{ INTEL_PANTHERCOVE_X, &dmr_features },
 	{ INTEL_LAKEFIELD, &cnl_features },
 	{ INTEL_ALDERLAKE, &adl_features },
 	{ INTEL_ALDERLAKE_L, &adl_features },
 	{ INTEL_RAPTORLAKE, &adl_features },
 	{ INTEL_RAPTORLAKE_P, &adl_features },
 	{ INTEL_RAPTORLAKE_S, &adl_features },
+	{ INTEL_BARTLETTLAKE, &adl_features },
 	{ INTEL_METEORLAKE, &adl_features },
 	{ INTEL_METEORLAKE_L, &adl_features },
 	{ INTEL_ARROWLAKE_H, &adl_features },
@@ -1072,7 +1091,6 @@ void probe_platform_features(unsigned int family, unsigned int model)
 {
 	int i;
 
-
 	if (authentic_amd || hygon_genuine) {
 		/* fallback to default features on unsupported models */
 		force_load++;
@@ -1106,8 +1124,7 @@ end:
 	if (platform)
 		return;
 
-	fprintf(stderr, "Unsupported platform detected.\n"
-		"\tSee RUN THE LATEST VERSION on turbostat(8)\n");
+	fprintf(stderr, "Unsupported platform detected.\n\tSee RUN THE LATEST VERSION on turbostat(8)\n");
 	exit(1);
 }
 
@@ -1127,7 +1144,8 @@ char *progname;
 
 #define CPU_SUBSET_MAXCPUS	8192	/* need to use before probe... */
 cpu_set_t *cpu_present_set, *cpu_possible_set, *cpu_effective_set, *cpu_allowed_set, *cpu_affinity_set, *cpu_subset;
-size_t cpu_present_setsize, cpu_possible_setsize, cpu_effective_setsize, cpu_allowed_setsize, cpu_affinity_setsize, cpu_subset_size;
+size_t cpu_present_setsize, cpu_possible_setsize, cpu_effective_setsize, cpu_allowed_setsize, cpu_affinity_setsize,
+    cpu_subset_size;
 #define MAX_ADDED_THREAD_COUNTERS 24
 #define MAX_ADDED_CORE_COUNTERS 8
 #define MAX_ADDED_PACKAGE_COUNTERS 16
@@ -2140,13 +2158,20 @@ int get_msr_fd(int cpu)
 
 	if (fd)
 		return fd;
-
+#if defined(ANDROID)
+	sprintf(pathname, "/dev/msr%d", cpu);
+#else
 	sprintf(pathname, "/dev/cpu/%d/msr", cpu);
+#endif
 	fd = open(pathname, O_RDONLY);
 	if (fd < 0)
+#if defined(ANDROID)
+		err(-1, "%s open failed, try chown or chmod +r /dev/msr*, "
+		    "or run with --no-msr, or run as root", pathname);
+#else
 		err(-1, "%s open failed, try chown or chmod +r /dev/cpu/*/msr, "
 		    "or run with --no-msr, or run as root", pathname);
-
+#endif
 	fd_percpu[cpu] = fd;
 
 	return fd;
@@ -2215,32 +2240,52 @@ int get_msr(int cpu, off_t offset, unsigned long long *msr)
 	return 0;
 }
 
-int probe_rapl_msr(int cpu, off_t offset, int index)
+int add_msr_counter(int cpu, off_t offset)
 {
 	ssize_t retval;
 	unsigned long long value;
 
-	assert(!no_msr);
+	if (no_msr)
+		return -1;
+
+	if (!offset)
+		return -1;
 
 	retval = pread(get_msr_fd(cpu), &value, sizeof(value), offset);
 
 	/* if the read failed, the probe fails */
 	if (retval != sizeof(value))
-		return 1;
+		return -1;
+
+	if (value == 0)
+		return 0;
+
+	return 1;
+}
+
+int add_rapl_msr_counter(int cpu, const struct rapl_counter_arch_info *cai)
+{
+	int ret;
+
+	if (!(platform->rapl_msrs & cai->feature_mask))
+		return -1;
+
+	ret = add_msr_counter(cpu, cai->msr);
+	if (ret < 0)
+		return -1;
 
-	/* If an Energy Status Counter MSR returns 0, the probe fails */
-	switch (index) {
+	switch (cai->rci_index) {
 	case RAPL_RCI_INDEX_ENERGY_PKG:
 	case RAPL_RCI_INDEX_ENERGY_CORES:
 	case RAPL_RCI_INDEX_DRAM:
 	case RAPL_RCI_INDEX_GFX:
 	case RAPL_RCI_INDEX_ENERGY_PLATFORM:
-		if (value == 0)
+		if (ret == 0)
 			return 1;
 	}
 
 	/* PKG,DRAM_PERF_STATUS MSRs, can return any value */
-	return 0;
+	return 1;
 }
 
 /* Convert CPU ID to domain ID for given added perf counter. */
@@ -2327,8 +2372,7 @@ void help(void)
 		"		  degrees Celsius\n"
 		"  -h, --help\n"
 		"		print this help message\n"
-		"  -v, --version\n"
-		"		print version information\n\nFor more help, run \"man turbostat\"\n");
+		"  -v, --version\n\t\tprint version information\n\nFor more help, run \"man turbostat\"\n");
 }
 
 /*
@@ -2644,7 +2688,7 @@ void print_header(char *delim)
 	if (DO_BIC(BIC_SYS_LPI))
 		outp += sprintf(outp, "%sSYS%%LPI", (printed++ ? delim : ""));
 
-	if (platform->rapl_msrs && !rapl_joules) {
+	if (!rapl_joules) {
 		if (DO_BIC(BIC_PkgWatt))
 			outp += sprintf(outp, "%sPkgWatt", (printed++ ? delim : ""));
 		if (DO_BIC(BIC_CorWatt) && !platform->has_per_core_rapl)
@@ -2657,7 +2701,7 @@ void print_header(char *delim)
 			outp += sprintf(outp, "%sPKG_%%", (printed++ ? delim : ""));
 		if (DO_BIC(BIC_RAM__))
 			outp += sprintf(outp, "%sRAM_%%", (printed++ ? delim : ""));
-	} else if (platform->rapl_msrs && rapl_joules) {
+	} else {
 		if (DO_BIC(BIC_Pkg_J))
 			outp += sprintf(outp, "%sPkg_J", (printed++ ? delim : ""));
 		if (DO_BIC(BIC_Cor_J) && !platform->has_per_core_rapl)
@@ -3943,7 +3987,6 @@ void compute_average(struct thread_data *t, struct core_data *c, struct pkg_data
 	if (average.threads.nmi_count > 9999999)
 		sums_need_wide_columns = 1;
 
-
 	average.cores.c3 /= topo.allowed_cores;
 	average.cores.c6 /= topo.allowed_cores;
 	average.cores.c7 /= topo.allowed_cores;
@@ -4766,6 +4809,37 @@ unsigned long pmt_read_counter(struct pmt_counter *ppmt, unsigned int domain_id)
 	return (value & value_mask) >> value_shift;
 }
 
+/* Rapl domain enumeration helpers */
+static inline int get_rapl_num_domains(void)
+{
+	int num_packages = topo.max_package_id + 1;
+	int num_cores_per_package;
+	int num_cores;
+
+	if (!platform->has_per_core_rapl)
+		return num_packages;
+
+	num_cores_per_package = topo.max_core_id + 1;
+	num_cores = num_cores_per_package * num_packages;
+
+	return num_cores;
+}
+
+static inline int get_rapl_domain_id(int cpu)
+{
+	int nr_cores_per_package = topo.max_core_id + 1;
+	int rapl_core_id;
+
+	if (!platform->has_per_core_rapl)
+		return cpus[cpu].physical_package_id;
+
+	/* Compute the system-wide unique core-id for @cpu */
+	rapl_core_id = cpus[cpu].physical_core_id;
+	rapl_core_id += cpus[cpu].physical_package_id * nr_cores_per_package;
+
+	return rapl_core_id;
+}
+
 /*
  * get_counters(...)
  * migrate to cpu
@@ -4821,7 +4895,7 @@ int get_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p)
 		goto done;
 
 	if (platform->has_per_core_rapl) {
-		status = get_rapl_counters(cpu, c->core_id, c, p);
+		status = get_rapl_counters(cpu, get_rapl_domain_id(cpu), c, p);
 		if (status != 0)
 			return status;
 	}
@@ -4887,7 +4961,7 @@ int get_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p)
 		p->sys_lpi = cpuidle_cur_sys_lpi_us;
 
 	if (!platform->has_per_core_rapl) {
-		status = get_rapl_counters(cpu, p->package_id, c, p);
+		status = get_rapl_counters(cpu, get_rapl_domain_id(cpu), c, p);
 		if (status != 0)
 			return status;
 	}
@@ -6476,8 +6550,11 @@ void check_dev_msr()
 
 	if (no_msr)
 		return;
-
+#if defined(ANDROID)
+	sprintf(pathname, "/dev/msr%d", base_cpu);
+#else
 	sprintf(pathname, "/dev/cpu/%d/msr", base_cpu);
+#endif
 	if (stat(pathname, &sb))
 		if (system("/sbin/modprobe msr > /dev/null 2>&1"))
 			no_msr = 1;
@@ -6527,7 +6604,11 @@ void check_msr_permission(void)
 	failed += check_for_cap_sys_rawio();
 
 	/* test file permissions */
+#if defined(ANDROID)
+	sprintf(pathname, "/dev/msr%d", base_cpu);
+#else
 	sprintf(pathname, "/dev/cpu/%d/msr", base_cpu);
+#endif
 	if (euidaccess(pathname, R_OK)) {
 		failed++;
 	}
@@ -6737,8 +6818,10 @@ static void probe_intel_uncore_frequency_cluster(void)
 		 * This allows "--show/--hide UncMHz" to be effective for
 		 * the clustered MHz counters, as a group.
 		 */
-		if BIC_IS_ENABLED(BIC_UNCORE_MHZ)
-			add_counter(0, path, name_buf, 0, SCOPE_PACKAGE, COUNTER_K2M, FORMAT_AVERAGE, 0, package_id);
+		if BIC_IS_ENABLED
+			(BIC_UNCORE_MHZ)
+			    add_counter(0, path, name_buf, 0, SCOPE_PACKAGE, COUNTER_K2M, FORMAT_AVERAGE, 0,
+					package_id);
 
 		if (quiet)
 			continue;
@@ -6810,17 +6893,21 @@ static void probe_graphics(void)
 		else
 			goto next;
 
-		set_graphics_fp("/sys/class/drm/card0/device/tile0/gt0/gtidle/idle_residency_ms", gt0_is_gt ? GFX_rc6 : SAM_mc6);
+		set_graphics_fp("/sys/class/drm/card0/device/tile0/gt0/gtidle/idle_residency_ms",
+				gt0_is_gt ? GFX_rc6 : SAM_mc6);
 
 		set_graphics_fp("/sys/class/drm/card0/device/tile0/gt0/freq0/cur_freq", gt0_is_gt ? GFX_MHz : SAM_MHz);
 
-		set_graphics_fp("/sys/class/drm/card0/device/tile0/gt0/freq0/act_freq", gt0_is_gt ? GFX_ACTMHz : SAM_ACTMHz);
+		set_graphics_fp("/sys/class/drm/card0/device/tile0/gt0/freq0/act_freq",
+				gt0_is_gt ? GFX_ACTMHz : SAM_ACTMHz);
 
-		set_graphics_fp("/sys/class/drm/card0/device/tile0/gt1/gtidle/idle_residency_ms", gt0_is_gt ? SAM_mc6 : GFX_rc6);
+		set_graphics_fp("/sys/class/drm/card0/device/tile0/gt1/gtidle/idle_residency_ms",
+				gt0_is_gt ? SAM_mc6 : GFX_rc6);
 
 		set_graphics_fp("/sys/class/drm/card0/device/tile0/gt1/freq0/cur_freq", gt0_is_gt ? SAM_MHz : GFX_MHz);
 
-		set_graphics_fp("/sys/class/drm/card0/device/tile0/gt1/freq0/act_freq", gt0_is_gt ? SAM_ACTMHz : GFX_ACTMHz);
+		set_graphics_fp("/sys/class/drm/card0/device/tile0/gt1/freq0/act_freq",
+				gt0_is_gt ? SAM_ACTMHz : GFX_ACTMHz);
 
 		goto end;
 	}
@@ -7256,6 +7343,9 @@ void rapl_probe_intel(void)
 	else
 		bic_enabled &= ~bic_joules_bits;
 
+	if (!platform->rapl_msrs || no_msr)
+		return;
+
 	if (!(platform->rapl_msrs & RAPL_PKG_PERF_STATUS))
 		bic_enabled &= ~BIC_PKG__;
 	if (!(platform->rapl_msrs & RAPL_DRAM_PERF_STATUS))
@@ -7306,6 +7396,9 @@ void rapl_probe_amd(void)
 	else
 		bic_enabled &= ~bic_joules_bits;
 
+	if (!platform->rapl_msrs || no_msr)
+		return;
+
 	if (get_msr(base_cpu, MSR_RAPL_PWR_UNIT, &msr))
 		return;
 
@@ -7332,6 +7425,158 @@ void print_power_limit_msr(int cpu, unsigned long long msr, char *label)
 	return;
 }
 
+static int fread_int(char *path, int *val)
+{
+	FILE *filep;
+	int ret;
+
+	filep = fopen(path, "r");
+	if (!filep)
+		return -1;
+
+	ret = fscanf(filep, "%d", val);
+	fclose(filep);
+	return ret;
+}
+
+static int fread_ull(char *path, unsigned long long *val)
+{
+	FILE *filep;
+	int ret;
+
+	filep = fopen(path, "r");
+	if (!filep)
+		return -1;
+
+	ret = fscanf(filep, "%llu", val);
+	fclose(filep);
+	return ret;
+}
+
+static int fread_str(char *path, char *buf, int size)
+{
+	FILE *filep;
+	int ret;
+	char *cp;
+
+	filep = fopen(path, "r");
+	if (!filep)
+		return -1;
+
+	ret = fread(buf, 1, size, filep);
+	fclose(filep);
+
+	/* replace '\n' with '\0' */
+	cp = strchr(buf, '\n');
+	if (cp != NULL)
+		*cp = '\0';
+
+	return ret;
+}
+
+#define PATH_RAPL_SYSFS	"/sys/class/powercap"
+
+static int dump_one_domain(char *domain_path)
+{
+	char path[PATH_MAX];
+	char str[PATH_MAX];
+	unsigned long long val;
+	int constraint;
+	int enable;
+	int ret;
+
+	snprintf(path, PATH_MAX, "%s/name", domain_path);
+	ret = fread_str(path, str, PATH_MAX);
+	if (ret <= 0)
+		return -1;
+
+	fprintf(outf, "%s: %s", domain_path + strlen(PATH_RAPL_SYSFS) + 1, str);
+
+	snprintf(path, PATH_MAX, "%s/enabled", domain_path);
+	ret = fread_int(path, &enable);
+	if (ret <= 0)
+		return -1;
+
+	if (!enable) {
+		fputs(" disabled\n", outf);
+		return 0;
+	}
+
+	for (constraint = 0;; constraint++) {
+		snprintf(path, PATH_MAX, "%s/constraint_%d_time_window_us", domain_path, constraint);
+		ret = fread_ull(path, &val);
+		if (ret <= 0)
+			break;
+
+		if (val > 1000000)
+			fprintf(outf, " %0.1fs", (double)val / 1000000);
+		else if (val > 1000)
+			fprintf(outf, " %0.1fms", (double)val / 1000);
+		else
+			fprintf(outf, " %0.1fus", (double)val);
+
+		snprintf(path, PATH_MAX, "%s/constraint_%d_power_limit_uw", domain_path, constraint);
+		ret = fread_ull(path, &val);
+		if (ret > 0 && val)
+			fprintf(outf, ":%lluW", val / 1000000);
+
+		snprintf(path, PATH_MAX, "%s/constraint_%d_max_power_uw", domain_path, constraint);
+		ret = fread_ull(path, &val);
+		if (ret > 0 && val)
+			fprintf(outf, ",max:%lluW", val / 1000000);
+	}
+	fputc('\n', outf);
+
+	return 0;
+}
+
+static int print_rapl_sysfs(void)
+{
+	DIR *dir, *cdir;
+	struct dirent *entry, *centry;
+	char path[PATH_MAX];
+	char str[PATH_MAX];
+
+	if ((dir = opendir(PATH_RAPL_SYSFS)) == NULL) {
+		warn("open %s failed", PATH_RAPL_SYSFS);
+		return 1;
+	}
+
+	while ((entry = readdir(dir)) != NULL) {
+		if (strlen(entry->d_name) > 100)
+			continue;
+
+		if (strncmp(entry->d_name, "intel-rapl", strlen("intel-rapl")))
+			continue;
+
+		snprintf(path, PATH_MAX, "%s/%s/name", PATH_RAPL_SYSFS, entry->d_name);
+
+		/* Parse top level domains first, including package and psys */
+		fread_str(path, str, PATH_MAX);
+		if (strncmp(str, "package", strlen("package")) && strncmp(str, "psys", strlen("psys")))
+			continue;
+
+		snprintf(path, PATH_MAX, "%s/%s", PATH_RAPL_SYSFS, entry->d_name);
+		if ((cdir = opendir(path)) == NULL) {
+			perror("opendir() error");
+			return 1;
+		}
+
+		dump_one_domain(path);
+
+		while ((centry = readdir(cdir)) != NULL) {
+			if (strncmp(centry->d_name, "intel-rapl", strlen("intel-rapl")))
+				continue;
+			snprintf(path, PATH_MAX, "%s/%s/%s", PATH_RAPL_SYSFS, entry->d_name, centry->d_name);
+			dump_one_domain(path);
+		}
+		closedir(cdir);
+	}
+
+	closedir(dir);
+	return 0;
+}
+
 int print_rapl(struct thread_data *t, struct core_data *c, struct pkg_data *p)
 {
 	unsigned long long msr;
@@ -7458,9 +7703,6 @@ int print_rapl(struct thread_data *t, struct core_data *c, struct pkg_data *p)
  */
 void probe_rapl(void)
 {
-	if (!platform->rapl_msrs || no_msr)
-		return;
-
 	if (genuine_intel)
 		rapl_probe_intel();
 	if (authentic_amd || hygon_genuine)
@@ -7469,6 +7711,11 @@ void probe_rapl(void)
 	if (quiet)
 		return;
 
+	print_rapl_sysfs();
+
+	if (!platform->rapl_msrs || no_msr)
+		return;
+
 	for_all_cpus(print_rapl, ODD_COUNTERS);
 }
 
@@ -7801,44 +8048,42 @@ static int has_instr_count_access(void)
 	return has_access;
 }
 
-int add_rapl_perf_counter_(int cpu, struct rapl_counter_info_t *rci, const struct rapl_counter_arch_info *cai,
-			   double *scale_, enum rapl_unit *unit_)
+int add_rapl_perf_counter(int cpu, struct rapl_counter_info_t *rci, const struct rapl_counter_arch_info *cai,
+			  double *scale_, enum rapl_unit *unit_)
 {
+	int ret = -1;
+
 	if (no_perf)
 		return -1;
 
+	if (!cai->perf_name)
+		return -1;
+
 	const double scale = read_perf_scale(cai->perf_subsys, cai->perf_name);
 
 	if (scale == 0.0)
-		return -1;
+		goto end;
 
 	const enum rapl_unit unit = read_perf_rapl_unit(cai->perf_subsys, cai->perf_name);
 
 	if (unit == RAPL_UNIT_INVALID)
-		return -1;
+		goto end;
 
 	const unsigned int rapl_type = read_perf_type(cai->perf_subsys);
 	const unsigned int rapl_energy_pkg_config = read_perf_config(cai->perf_subsys, cai->perf_name);
 
-	const int fd_counter =
-	    open_perf_counter(cpu, rapl_type, rapl_energy_pkg_config, rci->fd_perf, PERF_FORMAT_GROUP);
-	if (fd_counter == -1)
-		return -1;
+	ret = open_perf_counter(cpu, rapl_type, rapl_energy_pkg_config, rci->fd_perf, PERF_FORMAT_GROUP);
+	if (ret == -1)
+		goto end;
 
 	/* If it's the first counter opened, make it a group descriptor */
 	if (rci->fd_perf == -1)
-		rci->fd_perf = fd_counter;
+		rci->fd_perf = ret;
 
 	*scale_ = scale;
 	*unit_ = unit;
-	return fd_counter;
-}
-
-int add_rapl_perf_counter(int cpu, struct rapl_counter_info_t *rci, const struct rapl_counter_arch_info *cai,
-			  double *scale, enum rapl_unit *unit)
-{
-	int ret = add_rapl_perf_counter_(cpu, rci, cai, scale, unit);
 
+end:
 	if (debug >= 2)
 		fprintf(stderr, "%s: %d (cpu: %d)\n", __func__, ret, cpu);
 
@@ -7863,7 +8108,7 @@ void linux_perf_init(void)
 
 void rapl_perf_init(void)
 {
-	const unsigned int num_domains = (platform->has_per_core_rapl ? topo.max_core_id : topo.max_package_id) + 1;
+	const unsigned int num_domains = get_rapl_num_domains();
 	bool *domain_visited = calloc(num_domains, sizeof(bool));
 
 	rapl_counter_info_perdomain = calloc(num_domains, sizeof(*rapl_counter_info_perdomain));
@@ -7896,6 +8141,9 @@ void rapl_perf_init(void)
 		enum rapl_unit unit;
 		unsigned int next_domain;
 
+		if (!BIC_IS_ENABLED(cai->bic))
+			continue;
+
 		memset(domain_visited, 0, num_domains * sizeof(*domain_visited));
 
 		for (int cpu = 0; cpu < topo.max_cpu_num + 1; ++cpu) {
@@ -7904,8 +8152,7 @@ void rapl_perf_init(void)
 				continue;
 
 			/* Skip already seen and handled RAPL domains */
-			next_domain =
-			    platform->has_per_core_rapl ? cpus[cpu].physical_core_id : cpus[cpu].physical_package_id;
+			next_domain = get_rapl_domain_id(cpu);
 
 			assert(next_domain < num_domains);
 
@@ -7919,27 +8166,37 @@ void rapl_perf_init(void)
 
 			struct rapl_counter_info_t *rci = &rapl_counter_info_perdomain[next_domain];
 
-			/* Check if the counter is enabled and accessible */
-			if (BIC_IS_ENABLED(cai->bic) && (platform->rapl_msrs & cai->feature_mask)) {
+			/*
+			 * rapl_counter_arch_infos[] can have multiple entries describing the same
+			 * counter, due to the difference from different platforms/Vendors.
+			 * E.g. rapl_counter_arch_infos[0] and rapl_counter_arch_infos[1] share the
+			 * same perf_subsys and perf_name, but with different MSR address.
+			 * rapl_counter_arch_infos[0] is for Intel and rapl_counter_arch_infos[1]
+			 * is for AMD.
+			 * In this case, it is possible that multiple rapl_counter_arch_infos[]
+			 * entries are probed just because their perf/msr is duplicate and valid.
+			 *
+			 * Thus need a check to avoid re-probe the same counters.
+			 */
+			if (rci->source[cai->rci_index] != COUNTER_SOURCE_NONE)
+				break;
 
-				/* Use perf API for this counter */
-				if (!no_perf && cai->perf_name
-				    && add_rapl_perf_counter(cpu, rci, cai, &scale, &unit) != -1) {
-					rci->source[cai->rci_index] = COUNTER_SOURCE_PERF;
-					rci->scale[cai->rci_index] = scale * cai->compat_scale;
-					rci->unit[cai->rci_index] = unit;
-					rci->flags[cai->rci_index] = cai->flags;
-
-					/* Use MSR for this counter */
-				} else if (!no_msr && cai->msr && probe_rapl_msr(cpu, cai->msr, cai->rci_index) == 0) {
-					rci->source[cai->rci_index] = COUNTER_SOURCE_MSR;
-					rci->msr[cai->rci_index] = cai->msr;
-					rci->msr_mask[cai->rci_index] = cai->msr_mask;
-					rci->msr_shift[cai->rci_index] = cai->msr_shift;
-					rci->unit[cai->rci_index] = RAPL_UNIT_JOULES;
-					rci->scale[cai->rci_index] = *cai->platform_rapl_msr_scale * cai->compat_scale;
-					rci->flags[cai->rci_index] = cai->flags;
-				}
+			/* Use perf API for this counter */
+			if (add_rapl_perf_counter(cpu, rci, cai, &scale, &unit) != -1) {
+				rci->source[cai->rci_index] = COUNTER_SOURCE_PERF;
+				rci->scale[cai->rci_index] = scale * cai->compat_scale;
+				rci->unit[cai->rci_index] = unit;
+				rci->flags[cai->rci_index] = cai->flags;
+
+				/* Use MSR for this counter */
+			} else if (add_rapl_msr_counter(cpu, cai) >= 0) {
+				rci->source[cai->rci_index] = COUNTER_SOURCE_MSR;
+				rci->msr[cai->rci_index] = cai->msr;
+				rci->msr_mask[cai->rci_index] = cai->msr_mask;
+				rci->msr_shift[cai->rci_index] = cai->msr_shift;
+				rci->unit[cai->rci_index] = RAPL_UNIT_JOULES;
+				rci->scale[cai->rci_index] = *cai->platform_rapl_msr_scale * cai->compat_scale;
+				rci->flags[cai->rci_index] = cai->flags;
 			}
 
 			if (rci->source[cai->rci_index] != COUNTER_SOURCE_NONE)
@@ -7972,65 +8229,63 @@ int *get_cstate_perf_group_fd(struct cstate_counter_info_t *cci, const char *gro
 	return NULL;
 }
 
-int add_cstate_perf_counter_(int cpu, struct cstate_counter_info_t *cci, const struct cstate_counter_arch_info *cai)
+int add_cstate_perf_counter(int cpu, struct cstate_counter_info_t *cci, const struct cstate_counter_arch_info *cai)
 {
+	int ret = -1;
+
 	if (no_perf)
 		return -1;
 
+	if (!cai->perf_name)
+		return -1;
+
 	int *pfd_group = get_cstate_perf_group_fd(cci, cai->perf_subsys);
 
 	if (pfd_group == NULL)
-		return -1;
+		goto end;
 
 	const unsigned int type = read_perf_type(cai->perf_subsys);
 	const unsigned int config = read_perf_config(cai->perf_subsys, cai->perf_name);
 
-	const int fd_counter = open_perf_counter(cpu, type, config, *pfd_group, PERF_FORMAT_GROUP);
+	ret = open_perf_counter(cpu, type, config, *pfd_group, PERF_FORMAT_GROUP);
 
-	if (fd_counter == -1)
-		return -1;
+	if (ret == -1)
+		goto end;
 
 	/* If it's the first counter opened, make it a group descriptor */
 	if (*pfd_group == -1)
-		*pfd_group = fd_counter;
-
-	return fd_counter;
-}
-
-int add_cstate_perf_counter(int cpu, struct cstate_counter_info_t *cci, const struct cstate_counter_arch_info *cai)
-{
-	int ret = add_cstate_perf_counter_(cpu, cci, cai);
+		*pfd_group = ret;
 
+end:
 	if (debug >= 2)
 		fprintf(stderr, "%s: %d (cpu: %d)\n", __func__, ret, cpu);
 
 	return ret;
 }
 
-int add_msr_perf_counter_(int cpu, struct msr_counter_info_t *cci, const struct msr_counter_arch_info *cai)
+int add_msr_perf_counter(int cpu, struct msr_counter_info_t *cci, const struct msr_counter_arch_info *cai)
 {
+	int ret = -1;
+
 	if (no_perf)
 		return -1;
 
+	if (!cai->perf_name)
+		return -1;
+
 	const unsigned int type = read_perf_type(cai->perf_subsys);
 	const unsigned int config = read_perf_config(cai->perf_subsys, cai->perf_name);
 
-	const int fd_counter = open_perf_counter(cpu, type, config, cci->fd_perf, PERF_FORMAT_GROUP);
+	ret = open_perf_counter(cpu, type, config, cci->fd_perf, PERF_FORMAT_GROUP);
 
-	if (fd_counter == -1)
-		return -1;
+	if (ret == -1)
+		goto end;
 
 	/* If it's the first counter opened, make it a group descriptor */
 	if (cci->fd_perf == -1)
-		cci->fd_perf = fd_counter;
-
-	return fd_counter;
-}
-
-int add_msr_perf_counter(int cpu, struct msr_counter_info_t *cci, const struct msr_counter_arch_info *cai)
-{
-	int ret = add_msr_perf_counter_(cpu, cci, cai);
+		cci->fd_perf = ret;
 
+end:
 	if (debug)
 		fprintf(stderr, "%s: %s/%s: %d (cpu: %d)\n", __func__, cai->perf_subsys, cai->perf_name, ret, cpu);
 
@@ -8064,12 +8319,12 @@ void msr_perf_init_(void)
 
 			if (cai->needed) {
 				/* Use perf API for this counter */
-				if (!no_perf && cai->perf_name && add_msr_perf_counter(cpu, cci, cai) != -1) {
+				if (add_msr_perf_counter(cpu, cci, cai) != -1) {
 					cci->source[cai->rci_index] = COUNTER_SOURCE_PERF;
 					cai->present = true;
 
 					/* User MSR for this counter */
-				} else if (!no_msr && cai->msr && probe_rapl_msr(cpu, cai->msr, cai->rci_index) == 0) {
+				} else if (add_msr_counter(cpu, cai->msr) >= 0) {
 					cci->source[cai->rci_index] = COUNTER_SOURCE_MSR;
 					cci->msr[cai->rci_index] = cai->msr;
 					cci->msr_mask[cai->rci_index] = cai->msr_mask;
@@ -8177,13 +8432,13 @@ void cstate_perf_init_(bool soft_c1)
 
 			if (counter_needed && counter_supported) {
 				/* Use perf API for this counter */
-				if (!no_perf && cai->perf_name && add_cstate_perf_counter(cpu, cci, cai) != -1) {
+				if (add_cstate_perf_counter(cpu, cci, cai) != -1) {
 
 					cci->source[cai->rci_index] = COUNTER_SOURCE_PERF;
 
 					/* User MSR for this counter */
-				} else if (!no_msr && cai->msr && pkg_cstate_limit >= cai->pkg_cstate_limit
-					   && probe_rapl_msr(cpu, cai->msr, cai->rci_index) == 0) {
+				} else if (pkg_cstate_limit >= cai->pkg_cstate_limit
+					   && add_msr_counter(cpu, cai->msr) >= 0) {
 					cci->source[cai->rci_index] = COUNTER_SOURCE_MSR;
 					cci->msr[cai->rci_index] = cai->msr;
 				}
@@ -9044,15 +9299,14 @@ int added_perf_counters_init_(struct perf_counter_info *pinfo)
 					perf_device = "cpu_atom";
 					break;
 
-				default: /* Don't change, we will probably fail and report a problem soon. */
+				default:	/* Don't change, we will probably fail and report a problem soon. */
 					break;
 				}
 			}
 
 			perf_type = read_perf_type(perf_device);
 			if (perf_type == (unsigned int)-1) {
-				warnx("%s: perf/%s/%s: failed to read %s",
-				      __func__, perf_device, pinfo->event, "type");
+				warnx("%s: perf/%s/%s: failed to read %s", __func__, perf_device, pinfo->event, "type");
 				continue;
 			}
 
@@ -9154,7 +9408,7 @@ struct pmt_mmio *pmt_mmio_open(unsigned int target_guid)
 		return NULL;
 	}
 
-	for ( ; entry != NULL; entry = pmt_diriter_next(&pmt_iter)) {
+	for (; entry != NULL; entry = pmt_diriter_next(&pmt_iter)) {
 		if (fstatat(dirfd(pmt_iter.dir), entry->d_name, &st, 0) == -1)
 			break;
 
@@ -9594,7 +9848,7 @@ int get_and_dump_counters(void)
 
 void print_version()
 {
-	fprintf(outf, "turbostat version 2025.04.06 - Len Brown <lenb@kernel.org>\n");
+	fprintf(outf, "turbostat version 2025.06.08 - Len Brown <lenb@kernel.org>\n");
 }
 
 #define COMMAND_LINE_SIZE 2048
@@ -10050,7 +10304,7 @@ void parse_add_command_pmt(char *add_command)
 	unsigned int lsb;
 	unsigned int msb;
 	unsigned int guid;
-	unsigned int seq = 0; /* By default, pick first file in a sequence with a given GUID. */
+	unsigned int seq = 0;	/* By default, pick first file in a sequence with a given GUID. */
 	unsigned int domain_id;
 	enum counter_scope scope = 0;
 	enum pmt_datatype type = PMT_TYPE_RAW;
diff --git a/tools/testing/selftests/mm/ksm_tests.c b/tools/testing/selftests/mm/ksm_tests.c
index dcdd5bb20f3d..e80deac1436b 100644
--- a/tools/testing/selftests/mm/ksm_tests.c
+++ b/tools/testing/selftests/mm/ksm_tests.c
@@ -58,40 +58,12 @@ int debug;
 
 static int ksm_write_sysfs(const char *file_path, unsigned long val)
 {
-	FILE *f = fopen(file_path, "w");
-
-	if (!f) {
-		fprintf(stderr, "f %s\n", file_path);
-		perror("fopen");
-		return 1;
-	}
-	if (fprintf(f, "%lu", val) < 0) {
-		perror("fprintf");
-		fclose(f);
-		return 1;
-	}
-	fclose(f);
-
-	return 0;
+	return write_sysfs(file_path, val);
 }
 
 static int ksm_read_sysfs(const char *file_path, unsigned long *val)
 {
-	FILE *f = fopen(file_path, "r");
-
-	if (!f) {
-		fprintf(stderr, "f %s\n", file_path);
-		perror("fopen");
-		return 1;
-	}
-	if (fscanf(f, "%lu", val) != 1) {
-		perror("fscanf");
-		fclose(f);
-		return 1;
-	}
-	fclose(f);
-
-	return 0;
+	return read_sysfs(file_path, val);
 }
 
 static void ksm_print_sysfs(void)
diff --git a/tools/testing/selftests/mm/merge.c b/tools/testing/selftests/mm/merge.c
index c76646cdf6e6..bbae66fc5038 100644
--- a/tools/testing/selftests/mm/merge.c
+++ b/tools/testing/selftests/mm/merge.c
@@ -2,11 +2,14 @@
 
 #define _GNU_SOURCE
 #include "../kselftest_harness.h"
+#include <fcntl.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <unistd.h>
 #include <sys/mman.h>
+#include <sys/syscall.h>
 #include <sys/wait.h>
+#include <linux/perf_event.h>
 #include "vm_util.h"
 
 FIXTURE(merge)
@@ -452,4 +455,44 @@ TEST_F(merge, forked_source_vma)
 	ASSERT_EQ(procmap->query.vma_end, (unsigned long)ptr2 + 5 * page_size);
 }
 
+TEST_F(merge, handle_uprobe_upon_merged_vma)
+{
+	const size_t attr_sz = sizeof(struct perf_event_attr);
+	unsigned int page_size = self->page_size;
+	const char *probe_file = "./foo";
+	char *carveout = self->carveout;
+	struct perf_event_attr attr;
+	unsigned long type;
+	void *ptr1, *ptr2;
+	int fd;
+
+	fd = open(probe_file, O_RDWR|O_CREAT, 0600);
+	ASSERT_GE(fd, 0);
+
+	ASSERT_EQ(ftruncate(fd, page_size), 0);
+	ASSERT_EQ(read_sysfs("/sys/bus/event_source/devices/uprobe/type", &type), 0);
+
+	memset(&attr, 0, attr_sz);
+	attr.size = attr_sz;
+	attr.type = type;
+	attr.config1 = (__u64)(long)probe_file;
+	attr.config2 = 0x0;
+
+	ASSERT_GE(syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0), 0);
+
+	ptr1 = mmap(&carveout[page_size], 10 * page_size, PROT_EXEC,
+		    MAP_PRIVATE | MAP_FIXED, fd, 0);
+	ASSERT_NE(ptr1, MAP_FAILED);
+
+	ptr2 = mremap(ptr1, page_size, 2 * page_size,
+		      MREMAP_MAYMOVE | MREMAP_FIXED, ptr1 + 5 * page_size);
+	ASSERT_NE(ptr2, MAP_FAILED);
+
+	ASSERT_NE(mremap(ptr2, page_size, page_size,
+			 MREMAP_MAYMOVE | MREMAP_FIXED, ptr1), MAP_FAILED);
+
+	close(fd);
+	remove(probe_file);
+}
+
 TEST_HARNESS_MAIN
diff --git a/tools/testing/selftests/mm/thuge-gen.c b/tools/testing/selftests/mm/thuge-gen.c
index a41bc1234b37..95b6f043a3cb 100644
--- a/tools/testing/selftests/mm/thuge-gen.c
+++ b/tools/testing/selftests/mm/thuge-gen.c
@@ -77,7 +77,7 @@ void show(unsigned long ps)
 	system(buf);
 }
 
-unsigned long read_sysfs(int warn, char *fmt, ...)
+unsigned long thuge_read_sysfs(int warn, char *fmt, ...)
 {
 	char *line = NULL;
 	size_t linelen = 0;
@@ -106,7 +106,7 @@ unsigned long read_sysfs(int warn, char *fmt, ...)
 
 unsigned long read_free(unsigned long ps)
 {
-	return read_sysfs(ps != getpagesize(),
+	return thuge_read_sysfs(ps != getpagesize(),
 			  "/sys/kernel/mm/hugepages/hugepages-%lukB/free_hugepages",
 			  ps >> 10);
 }
@@ -195,7 +195,7 @@ void find_pagesizes(void)
 	}
 	globfree(&g);
 
-	if (read_sysfs(0, "/proc/sys/kernel/shmmax") < NUM_PAGES * largest)
+	if (thuge_read_sysfs(0, "/proc/sys/kernel/shmmax") < NUM_PAGES * largest)
 		ksft_exit_fail_msg("Please do echo %lu > /proc/sys/kernel/shmmax",
 				   largest * NUM_PAGES);
 
diff --git a/tools/testing/selftests/mm/vm_util.c b/tools/testing/selftests/mm/vm_util.c
index 61d7bf1f8c62..5492e3f784df 100644
--- a/tools/testing/selftests/mm/vm_util.c
+++ b/tools/testing/selftests/mm/vm_util.c
@@ -486,3 +486,41 @@ int close_procmap(struct procmap_fd *procmap)
 {
 	return close(procmap->fd);
 }
+
+int write_sysfs(const char *file_path, unsigned long val)
+{
+	FILE *f = fopen(file_path, "w");
+
+	if (!f) {
+		fprintf(stderr, "f %s\n", file_path);
+		perror("fopen");
+		return 1;
+	}
+	if (fprintf(f, "%lu", val) < 0) {
+		perror("fprintf");
+		fclose(f);
+		return 1;
+	}
+	fclose(f);
+
+	return 0;
+}
+
+int read_sysfs(const char *file_path, unsigned long *val)
+{
+	FILE *f = fopen(file_path, "r");
+
+	if (!f) {
+		fprintf(stderr, "f %s\n", file_path);
+		perror("fopen");
+		return 1;
+	}
+	if (fscanf(f, "%lu", val) != 1) {
+		perror("fscanf");
+		fclose(f);
+		return 1;
+	}
+	fclose(f);
+
+	return 0;
+}
diff --git a/tools/testing/selftests/mm/vm_util.h b/tools/testing/selftests/mm/vm_util.h
index adb5d294a220..b8136d12a0f8 100644
--- a/tools/testing/selftests/mm/vm_util.h
+++ b/tools/testing/selftests/mm/vm_util.h
@@ -88,6 +88,8 @@ int open_procmap(pid_t pid, struct procmap_fd *procmap_out);
 int query_procmap(struct procmap_fd *procmap);
 bool find_vma_procmap(struct procmap_fd *procmap, void *address);
 int close_procmap(struct procmap_fd *procmap);
+int write_sysfs(const char *file_path, unsigned long val);
+int read_sysfs(const char *file_path, unsigned long *val);
 
 static inline int open_self_procmap(struct procmap_fd *procmap_out)
 {
diff --git a/tools/testing/selftests/mount_setattr/mount_setattr_test.c b/tools/testing/selftests/mount_setattr/mount_setattr_test.c
index 8b378c91debf..b1e4618399be 100644
--- a/tools/testing/selftests/mount_setattr/mount_setattr_test.c
+++ b/tools/testing/selftests/mount_setattr/mount_setattr_test.c
@@ -2079,24 +2079,9 @@ TEST_F(mount_setattr, detached_tree_propagation)
 	 * means that the device information will be different for any
 	 * statx() that was taken from /mnt/A before the mount compared
 	 * to one after the mount.
-	 *
-	 * Since we already now that the device information between the
-	 * stx1 and stx2 samples are identical we also now that stx2 and
-	 * stx3 device information will necessarily differ.
 	 */
 	ASSERT_NE(stx1.stx_dev_minor, stx3.stx_dev_minor);
-
-	/*
-	 * If mount propagation worked correctly then the tmpfs mount
-	 * that was created after the mount namespace was unshared will
-	 * have propagated onto /mnt/A in the detached mount tree.
-	 *
-	 * Verify that the device information for stx3 and stx4 are
-	 * identical. It is already established that stx3 is different
-	 * from both stx1 and stx2 sampled before the tmpfs mount was
-	 * done so if stx3 and stx4 are identical the proof is done.
-	 */
-	ASSERT_EQ(stx3.stx_dev_minor, stx4.stx_dev_minor);
+	ASSERT_EQ(stx1.stx_dev_minor, stx4.stx_dev_minor);
 
 	EXPECT_EQ(close(fd_tree), 0);
 }
diff --git a/tools/testing/selftests/ublk/Makefile b/tools/testing/selftests/ublk/Makefile
index 4dde8838261d..5d7f4ecfb816 100644
--- a/tools/testing/selftests/ublk/Makefile
+++ b/tools/testing/selftests/ublk/Makefile
@@ -19,6 +19,7 @@ TEST_PROGS += test_generic_08.sh
 TEST_PROGS += test_generic_09.sh
 TEST_PROGS += test_generic_10.sh
 TEST_PROGS += test_generic_11.sh
+TEST_PROGS += test_generic_12.sh
 
 TEST_PROGS += test_null_01.sh
 TEST_PROGS += test_null_02.sh
diff --git a/tools/testing/selftests/ublk/fault_inject.c b/tools/testing/selftests/ublk/fault_inject.c
index 5421774d7867..6e60f7d97125 100644
--- a/tools/testing/selftests/ublk/fault_inject.c
+++ b/tools/testing/selftests/ublk/fault_inject.c
@@ -46,9 +46,9 @@ static int ublk_fault_inject_queue_io(struct ublk_queue *q, int tag)
 		.tv_nsec = (long long)q->dev->private_data,
 	};
 
-	ublk_queue_alloc_sqes(q, &sqe, 1);
+	ublk_io_alloc_sqes(ublk_get_io(q, tag), &sqe, 1);
 	io_uring_prep_timeout(sqe, &ts, 1, 0);
-	sqe->user_data = build_user_data(tag, ublksrv_get_op(iod), 0, 1);
+	sqe->user_data = build_user_data(tag, ublksrv_get_op(iod), 0, q->q_id, 1);
 
 	ublk_queued_tgt_io(q, tag, 1);
 
diff --git a/tools/testing/selftests/ublk/file_backed.c b/tools/testing/selftests/ublk/file_backed.c
index 509842df9bee..cfa59b631693 100644
--- a/tools/testing/selftests/ublk/file_backed.c
+++ b/tools/testing/selftests/ublk/file_backed.c
@@ -18,11 +18,11 @@ static int loop_queue_flush_io(struct ublk_queue *q, const struct ublksrv_io_des
 	unsigned ublk_op = ublksrv_get_op(iod);
 	struct io_uring_sqe *sqe[1];
 
-	ublk_queue_alloc_sqes(q, sqe, 1);
+	ublk_io_alloc_sqes(ublk_get_io(q, tag), sqe, 1);
 	io_uring_prep_fsync(sqe[0], 1 /*fds[1]*/, IORING_FSYNC_DATASYNC);
 	io_uring_sqe_set_flags(sqe[0], IOSQE_FIXED_FILE);
 	/* bit63 marks us as tgt io */
-	sqe[0]->user_data = build_user_data(tag, ublk_op, 0, 1);
+	sqe[0]->user_data = build_user_data(tag, ublk_op, 0, q->q_id, 1);
 	return 1;
 }
 
@@ -36,7 +36,7 @@ static int loop_queue_tgt_rw_io(struct ublk_queue *q, const struct ublksrv_io_de
 	void *addr = (zc | auto_zc) ? NULL : (void *)iod->addr;
 
 	if (!zc || auto_zc) {
-		ublk_queue_alloc_sqes(q, sqe, 1);
+		ublk_io_alloc_sqes(ublk_get_io(q, tag), sqe, 1);
 		if (!sqe[0])
 			return -ENOMEM;
 
@@ -48,26 +48,26 @@ static int loop_queue_tgt_rw_io(struct ublk_queue *q, const struct ublksrv_io_de
 			sqe[0]->buf_index = tag;
 		io_uring_sqe_set_flags(sqe[0], IOSQE_FIXED_FILE);
 		/* bit63 marks us as tgt io */
-		sqe[0]->user_data = build_user_data(tag, ublk_op, 0, 1);
+		sqe[0]->user_data = build_user_data(tag, ublk_op, 0, q->q_id, 1);
 		return 1;
 	}
 
-	ublk_queue_alloc_sqes(q, sqe, 3);
+	ublk_io_alloc_sqes(ublk_get_io(q, tag), sqe, 3);
 
-	io_uring_prep_buf_register(sqe[0], 0, tag, q->q_id, tag);
+	io_uring_prep_buf_register(sqe[0], 0, tag, q->q_id, ublk_get_io(q, tag)->buf_index);
 	sqe[0]->flags |= IOSQE_CQE_SKIP_SUCCESS | IOSQE_IO_HARDLINK;
 	sqe[0]->user_data = build_user_data(tag,
-			ublk_cmd_op_nr(sqe[0]->cmd_op), 0, 1);
+			ublk_cmd_op_nr(sqe[0]->cmd_op), 0, q->q_id, 1);
 
 	io_uring_prep_rw(op, sqe[1], 1 /*fds[1]*/, 0,
 		iod->nr_sectors << 9,
 		iod->start_sector << 9);
 	sqe[1]->buf_index = tag;
 	sqe[1]->flags |= IOSQE_FIXED_FILE | IOSQE_IO_HARDLINK;
-	sqe[1]->user_data = build_user_data(tag, ublk_op, 0, 1);
+	sqe[1]->user_data = build_user_data(tag, ublk_op, 0, q->q_id, 1);
 
-	io_uring_prep_buf_unregister(sqe[2], 0, tag, q->q_id, tag);
-	sqe[2]->user_data = build_user_data(tag, ublk_cmd_op_nr(sqe[2]->cmd_op), 0, 1);
+	io_uring_prep_buf_unregister(sqe[2], 0, tag, q->q_id, ublk_get_io(q, tag)->buf_index);
+	sqe[2]->user_data = build_user_data(tag, ublk_cmd_op_nr(sqe[2]->cmd_op), 0, q->q_id, 1);
 
 	return 2;
 }
diff --git a/tools/testing/selftests/ublk/kublk.c b/tools/testing/selftests/ublk/kublk.c
index b5131a000795..e2d2042810d4 100644
--- a/tools/testing/selftests/ublk/kublk.c
+++ b/tools/testing/selftests/ublk/kublk.c
@@ -348,8 +348,8 @@ static void ublk_ctrl_dump(struct ublk_dev *dev)
 
 		for (i = 0; i < info->nr_hw_queues; i++) {
 			ublk_print_cpu_set(&affinity[i], buf, sizeof(buf));
-			printf("\tqueue %u: tid %d affinity(%s)\n",
-					i, dev->q[i].tid, buf);
+			printf("\tqueue %u: affinity(%s)\n",
+					i, buf);
 		}
 		free(affinity);
 	}
@@ -412,16 +412,6 @@ static void ublk_queue_deinit(struct ublk_queue *q)
 	int i;
 	int nr_ios = q->q_depth;
 
-	io_uring_unregister_buffers(&q->ring);
-
-	io_uring_unregister_ring_fd(&q->ring);
-
-	if (q->ring.ring_fd > 0) {
-		io_uring_unregister_files(&q->ring);
-		close(q->ring.ring_fd);
-		q->ring.ring_fd = -1;
-	}
-
 	if (q->io_cmd_buf)
 		munmap(q->io_cmd_buf, ublk_queue_cmd_buf_sz(q));
 
@@ -429,20 +419,30 @@ static void ublk_queue_deinit(struct ublk_queue *q)
 		free(q->ios[i].buf_addr);
 }
 
+static void ublk_thread_deinit(struct ublk_thread *t)
+{
+	io_uring_unregister_buffers(&t->ring);
+
+	io_uring_unregister_ring_fd(&t->ring);
+
+	if (t->ring.ring_fd > 0) {
+		io_uring_unregister_files(&t->ring);
+		close(t->ring.ring_fd);
+		t->ring.ring_fd = -1;
+	}
+}
+
 static int ublk_queue_init(struct ublk_queue *q, unsigned extra_flags)
 {
 	struct ublk_dev *dev = q->dev;
 	int depth = dev->dev_info.queue_depth;
-	int i, ret = -1;
+	int i;
 	int cmd_buf_size, io_buf_size;
 	unsigned long off;
-	int ring_depth = dev->tgt.sq_depth, cq_depth = dev->tgt.cq_depth;
 
 	q->tgt_ops = dev->tgt.ops;
 	q->state = 0;
 	q->q_depth = depth;
-	q->cmd_inflight = 0;
-	q->tid = gettid();
 
 	if (dev->dev_info.flags & (UBLK_F_SUPPORT_ZERO_COPY | UBLK_F_AUTO_BUF_REG)) {
 		q->state |= UBLKSRV_NO_BUF;
@@ -467,6 +467,7 @@ static int ublk_queue_init(struct ublk_queue *q, unsigned extra_flags)
 	for (i = 0; i < q->q_depth; i++) {
 		q->ios[i].buf_addr = NULL;
 		q->ios[i].flags = UBLKSRV_NEED_FETCH_RQ | UBLKSRV_IO_FREE;
+		q->ios[i].tag = i;
 
 		if (q->state & UBLKSRV_NO_BUF)
 			continue;
@@ -479,39 +480,57 @@ static int ublk_queue_init(struct ublk_queue *q, unsigned extra_flags)
 		}
 	}
 
-	ret = ublk_setup_ring(&q->ring, ring_depth, cq_depth,
+	return 0;
+ fail:
+	ublk_queue_deinit(q);
+	ublk_err("ublk dev %d queue %d failed\n",
+			dev->dev_info.dev_id, q->q_id);
+	return -ENOMEM;
+}
+
+static int ublk_thread_init(struct ublk_thread *t)
+{
+	struct ublk_dev *dev = t->dev;
+	int ring_depth = dev->tgt.sq_depth, cq_depth = dev->tgt.cq_depth;
+	int ret;
+
+	ret = ublk_setup_ring(&t->ring, ring_depth, cq_depth,
 			IORING_SETUP_COOP_TASKRUN |
 			IORING_SETUP_SINGLE_ISSUER |
 			IORING_SETUP_DEFER_TASKRUN);
 	if (ret < 0) {
-		ublk_err("ublk dev %d queue %d setup io_uring failed %d\n",
-				q->dev->dev_info.dev_id, q->q_id, ret);
+		ublk_err("ublk dev %d thread %d setup io_uring failed %d\n",
+				dev->dev_info.dev_id, t->idx, ret);
 		goto fail;
 	}
 
 	if (dev->dev_info.flags & (UBLK_F_SUPPORT_ZERO_COPY | UBLK_F_AUTO_BUF_REG)) {
-		ret = io_uring_register_buffers_sparse(&q->ring, q->q_depth);
+		unsigned nr_ios = dev->dev_info.queue_depth * dev->dev_info.nr_hw_queues;
+		unsigned max_nr_ios_per_thread = nr_ios / dev->nthreads;
+		max_nr_ios_per_thread += !!(nr_ios % dev->nthreads);
+		ret = io_uring_register_buffers_sparse(
+			&t->ring, max_nr_ios_per_thread);
 		if (ret) {
-			ublk_err("ublk dev %d queue %d register spare buffers failed %d",
-					dev->dev_info.dev_id, q->q_id, ret);
+			ublk_err("ublk dev %d thread %d register spare buffers failed %d",
+					dev->dev_info.dev_id, t->idx, ret);
 			goto fail;
 		}
 	}
 
-	io_uring_register_ring_fd(&q->ring);
+	io_uring_register_ring_fd(&t->ring);
 
-	ret = io_uring_register_files(&q->ring, dev->fds, dev->nr_fds);
+	ret = io_uring_register_files(&t->ring, dev->fds, dev->nr_fds);
 	if (ret) {
-		ublk_err("ublk dev %d queue %d register files failed %d\n",
-				q->dev->dev_info.dev_id, q->q_id, ret);
+		ublk_err("ublk dev %d thread %d register files failed %d\n",
+				t->dev->dev_info.dev_id, t->idx, ret);
 		goto fail;
 	}
 
 	return 0;
- fail:
-	ublk_queue_deinit(q);
-	ublk_err("ublk dev %d queue %d failed\n",
-			dev->dev_info.dev_id, q->q_id);
+fail:
+	ublk_thread_deinit(t);
+	ublk_err("ublk dev %d thread %d init failed\n",
+			dev->dev_info.dev_id, t->idx);
 	return -ENOMEM;
 }
 
@@ -562,7 +581,7 @@ static void ublk_set_auto_buf_reg(const struct ublk_queue *q,
 	if (q->tgt_ops->buf_index)
 		buf.index = q->tgt_ops->buf_index(q, tag);
 	else
-		buf.index = tag;
+		buf.index = q->ios[tag].buf_index;
 
 	if (q->state & UBLKSRV_AUTO_BUF_REG_FALLBACK)
 		buf.flags = UBLK_AUTO_BUF_REG_FALLBACK;
@@ -570,8 +589,10 @@ static void ublk_set_auto_buf_reg(const struct ublk_queue *q,
 	sqe->addr = ublk_auto_buf_reg_to_sqe_addr(&buf);
 }
 
-int ublk_queue_io_cmd(struct ublk_queue *q, struct ublk_io *io, unsigned tag)
+int ublk_queue_io_cmd(struct ublk_io *io)
 {
+	struct ublk_thread *t = io->t;
+	struct ublk_queue *q = ublk_io_to_queue(io);
 	struct ublksrv_io_cmd *cmd;
 	struct io_uring_sqe *sqe[1];
 	unsigned int cmd_op = 0;
@@ -596,13 +617,13 @@ int ublk_queue_io_cmd(struct ublk_queue *q, struct ublk_io *io, unsigned tag)
 	else if (io->flags & UBLKSRV_NEED_FETCH_RQ)
 		cmd_op = UBLK_U_IO_FETCH_REQ;
 
-	if (io_uring_sq_space_left(&q->ring) < 1)
-		io_uring_submit(&q->ring);
+	if (io_uring_sq_space_left(&t->ring) < 1)
+		io_uring_submit(&t->ring);
 
-	ublk_queue_alloc_sqes(q, sqe, 1);
+	ublk_io_alloc_sqes(io, sqe, 1);
 	if (!sqe[0]) {
-		ublk_err("%s: run out of sqe %d, tag %d\n",
-				__func__, q->q_id, tag);
+		ublk_err("%s: run out of sqe. thread %u, tag %d\n",
+				__func__, t->idx, io->tag);
 		return -1;
 	}
 
@@ -617,7 +638,7 @@ int ublk_queue_io_cmd(struct ublk_queue *q, struct ublk_io *io, unsigned tag)
 	sqe[0]->opcode	= IORING_OP_URING_CMD;
 	sqe[0]->flags	= IOSQE_FIXED_FILE;
 	sqe[0]->rw_flags	= 0;
-	cmd->tag	= tag;
+	cmd->tag	= io->tag;
 	cmd->q_id	= q->q_id;
 	if (!(q->state & UBLKSRV_NO_BUF))
 		cmd->addr	= (__u64) (uintptr_t) io->buf_addr;
@@ -625,37 +646,72 @@ int ublk_queue_io_cmd(struct ublk_queue *q, struct ublk_io *io, unsigned tag)
 		cmd->addr	= 0;
 
 	if (q->state & UBLKSRV_AUTO_BUF_REG)
-		ublk_set_auto_buf_reg(q, sqe[0], tag);
+		ublk_set_auto_buf_reg(q, sqe[0], io->tag);
 
-	user_data = build_user_data(tag, _IOC_NR(cmd_op), 0, 0);
+	user_data = build_user_data(io->tag, _IOC_NR(cmd_op), 0, q->q_id, 0);
 	io_uring_sqe_set_data64(sqe[0], user_data);
 
 	io->flags = 0;
 
-	q->cmd_inflight += 1;
+	t->cmd_inflight += 1;
 
-	ublk_dbg(UBLK_DBG_IO_CMD, "%s: (qid %d tag %u cmd_op %u) iof %x stopping %d\n",
-			__func__, q->q_id, tag, cmd_op,
-			io->flags, !!(q->state & UBLKSRV_QUEUE_STOPPING));
+	ublk_dbg(UBLK_DBG_IO_CMD, "%s: (thread %u qid %d tag %u cmd_op %u) iof %x stopping %d\n",
+			__func__, t->idx, q->q_id, io->tag, cmd_op,
+			io->flags, !!(t->state & UBLKSRV_THREAD_STOPPING));
 	return 1;
 }
 
-static void ublk_submit_fetch_commands(struct ublk_queue *q)
+static void ublk_submit_fetch_commands(struct ublk_thread *t)
 {
-	int i = 0;
+	struct ublk_queue *q;
+	struct ublk_io *io;
+	int i = 0, j = 0;
 
-	for (i = 0; i < q->q_depth; i++)
-		ublk_queue_io_cmd(q, &q->ios[i], i);
+	if (t->dev->per_io_tasks) {
+		/*
+		 * Lexicographically order all the (qid,tag) pairs, with
+		 * qid taking priority (so (1,0) > (0,1)). Then make
+		 * this thread the daemon for every Nth entry in this
+		 * list (N is the number of threads), starting at this
+		 * thread's index. This ensures that each queue is
+		 * handled by as many ublk server threads as possible,
+		 * so that load that is concentrated on one or a few
+		 * queues can make use of all ublk server threads.
+		 */
+		const struct ublksrv_ctrl_dev_info *dinfo = &t->dev->dev_info;
+		int nr_ios = dinfo->nr_hw_queues * dinfo->queue_depth;
+		for (i = t->idx; i < nr_ios; i += t->dev->nthreads) {
+			int q_id = i / dinfo->queue_depth;
+			int tag = i % dinfo->queue_depth;
+			q = &t->dev->q[q_id];
+			io = &q->ios[tag];
+			io->t = t;
+			io->buf_index = j++;
+			ublk_queue_io_cmd(io);
+		}
+	} else {
+		/*
+		 * Service exclusively the queue whose q_id matches our
+		 * thread index.
+		 */
+		struct ublk_queue *q = &t->dev->q[t->idx];
+		for (i = 0; i < q->q_depth; i++) {
+			io = &q->ios[i];
+			io->t = t;
+			io->buf_index = i;
+			ublk_queue_io_cmd(io);
+		}
+	}
 }
 
-static int ublk_queue_is_idle(struct ublk_queue *q)
+static int ublk_thread_is_idle(struct ublk_thread *t)
 {
-	return !io_uring_sq_ready(&q->ring) && !q->io_inflight;
+	return !io_uring_sq_ready(&t->ring) && !t->io_inflight;
 }
 
-static int ublk_queue_is_done(struct ublk_queue *q)
+static int ublk_thread_is_done(struct ublk_thread *t)
 {
-	return (q->state & UBLKSRV_QUEUE_STOPPING) && ublk_queue_is_idle(q);
+	return (t->state & UBLKSRV_THREAD_STOPPING) && ublk_thread_is_idle(t);
 }
 
 static inline void ublksrv_handle_tgt_cqe(struct ublk_queue *q,
@@ -673,14 +729,16 @@ static inline void ublksrv_handle_tgt_cqe(struct ublk_queue *q,
 		q->tgt_ops->tgt_io_done(q, tag, cqe);
 }
 
-static void ublk_handle_cqe(struct io_uring *r,
+static void ublk_handle_cqe(struct ublk_thread *t,
 		struct io_uring_cqe *cqe, void *data)
 {
-	struct ublk_queue *q = container_of(r, struct ublk_queue, ring);
+	struct ublk_dev *dev = t->dev;
+	unsigned q_id = user_data_to_q_id(cqe->user_data);
+	struct ublk_queue *q = &dev->q[q_id];
 	unsigned tag = user_data_to_tag(cqe->user_data);
 	unsigned cmd_op = user_data_to_op(cqe->user_data);
 	int fetch = (cqe->res != UBLK_IO_RES_ABORT) &&
-		!(q->state & UBLKSRV_QUEUE_STOPPING);
+		!(t->state & UBLKSRV_THREAD_STOPPING);
 	struct ublk_io *io;
 
 	if (cqe->res < 0 && cqe->res != -ENODEV)
@@ -691,7 +749,7 @@ static void ublk_handle_cqe(struct io_uring *r,
 			__func__, cqe->res, q->q_id, tag, cmd_op,
 			is_target_io(cqe->user_data),
 			user_data_to_tgt_data(cqe->user_data),
-			(q->state & UBLKSRV_QUEUE_STOPPING));
+			(t->state & UBLKSRV_THREAD_STOPPING));
 
 	/* Don't retrieve io in case of target io */
 	if (is_target_io(cqe->user_data)) {
@@ -700,10 +758,10 @@ static void ublk_handle_cqe(struct io_uring *r,
 	}
 
 	io = &q->ios[tag];
-	q->cmd_inflight--;
+	t->cmd_inflight--;
 
 	if (!fetch) {
-		q->state |= UBLKSRV_QUEUE_STOPPING;
+		t->state |= UBLKSRV_THREAD_STOPPING;
 		io->flags &= ~UBLKSRV_NEED_FETCH_RQ;
 	}
 
@@ -713,7 +771,7 @@ static void ublk_handle_cqe(struct io_uring *r,
 			q->tgt_ops->queue_io(q, tag);
 	} else if (cqe->res == UBLK_IO_RES_NEED_GET_DATA) {
 		io->flags |= UBLKSRV_NEED_GET_DATA | UBLKSRV_IO_FREE;
-		ublk_queue_io_cmd(q, io, tag);
+		ublk_queue_io_cmd(io);
 	} else {
 		/*
 		 * COMMIT_REQ will be completed immediately since no fetching
@@ -727,92 +785,93 @@ static void ublk_handle_cqe(struct io_uring *r,
 	}
 }
 
-static int ublk_reap_events_uring(struct io_uring *r)
+static int ublk_reap_events_uring(struct ublk_thread *t)
 {
 	struct io_uring_cqe *cqe;
 	unsigned head;
 	int count = 0;
 
-	io_uring_for_each_cqe(r, head, cqe) {
-		ublk_handle_cqe(r, cqe, NULL);
+	io_uring_for_each_cqe(&t->ring, head, cqe) {
+		ublk_handle_cqe(t, cqe, NULL);
 		count += 1;
 	}
-	io_uring_cq_advance(r, count);
+	io_uring_cq_advance(&t->ring, count);
 
 	return count;
 }
 
-static int ublk_process_io(struct ublk_queue *q)
+static int ublk_process_io(struct ublk_thread *t)
 {
 	int ret, reapped;
 
-	ublk_dbg(UBLK_DBG_QUEUE, "dev%d-q%d: to_submit %d inflight cmd %u stopping %d\n",
-				q->dev->dev_info.dev_id,
-				q->q_id, io_uring_sq_ready(&q->ring),
-				q->cmd_inflight,
-				(q->state & UBLKSRV_QUEUE_STOPPING));
+	ublk_dbg(UBLK_DBG_THREAD, "dev%d-t%u: to_submit %d inflight cmd %u stopping %d\n",
+				t->dev->dev_info.dev_id,
+				t->idx, io_uring_sq_ready(&t->ring),
+				t->cmd_inflight,
+				(t->state & UBLKSRV_THREAD_STOPPING));
 
-	if (ublk_queue_is_done(q))
+	if (ublk_thread_is_done(t))
 		return -ENODEV;
 
-	ret = io_uring_submit_and_wait(&q->ring, 1);
-	reapped = ublk_reap_events_uring(&q->ring);
+	ret = io_uring_submit_and_wait(&t->ring, 1);
+	reapped = ublk_reap_events_uring(t);
 
-	ublk_dbg(UBLK_DBG_QUEUE, "submit result %d, reapped %d stop %d idle %d\n",
-			ret, reapped, (q->state & UBLKSRV_QUEUE_STOPPING),
-			(q->state & UBLKSRV_QUEUE_IDLE));
+	ublk_dbg(UBLK_DBG_THREAD, "submit result %d, reapped %d stop %d idle %d\n",
+			ret, reapped, (t->state & UBLKSRV_THREAD_STOPPING),
+			(t->state & UBLKSRV_THREAD_IDLE));
 
 	return reapped;
 }
 
-static void ublk_queue_set_sched_affinity(const struct ublk_queue *q,
+static void ublk_thread_set_sched_affinity(const struct ublk_thread *t,
 		cpu_set_t *cpuset)
 {
         if (sched_setaffinity(0, sizeof(*cpuset), cpuset) < 0)
-                ublk_err("ublk dev %u queue %u set affinity failed",
-                                q->dev->dev_info.dev_id, q->q_id);
+		ublk_err("ublk dev %u thread %u set affinity failed",
+				t->dev->dev_info.dev_id, t->idx);
 }
 
-struct ublk_queue_info {
-	struct ublk_queue 	*q;
-	sem_t 			*queue_sem;
+struct ublk_thread_info {
+	struct ublk_dev 	*dev;
+	unsigned		idx;
+	sem_t 			*ready;
 	cpu_set_t 		*affinity;
-	unsigned char 		auto_zc_fallback;
 };
 
 static void *ublk_io_handler_fn(void *data)
 {
-	struct ublk_queue_info *info = data;
-	struct ublk_queue *q = info->q;
-	int dev_id = q->dev->dev_info.dev_id;
-	unsigned extra_flags = 0;
+	struct ublk_thread_info *info = data;
+	struct ublk_thread *t = &info->dev->threads[info->idx];
+	int dev_id = info->dev->dev_info.dev_id;
 	int ret;
 
-	if (info->auto_zc_fallback)
-		extra_flags = UBLKSRV_AUTO_BUF_REG_FALLBACK;
+	t->dev = info->dev;
+	t->idx = info->idx;
 
-	ret = ublk_queue_init(q, extra_flags);
+	ret = ublk_thread_init(t);
 	if (ret) {
-		ublk_err("ublk dev %d queue %d init queue failed\n",
-				dev_id, q->q_id);
+		ublk_err("ublk dev %d thread %u init failed\n",
+				dev_id, t->idx);
 		return NULL;
 	}
 	/* IO perf is sensitive with queue pthread affinity on NUMA machine*/
-	ublk_queue_set_sched_affinity(q, info->affinity);
-	sem_post(info->queue_sem);
+	if (info->affinity)
+		ublk_thread_set_sched_affinity(t, info->affinity);
+	sem_post(info->ready);
 
-	ublk_dbg(UBLK_DBG_QUEUE, "tid %d: ublk dev %d queue %d started\n",
-			q->tid, dev_id, q->q_id);
+	ublk_dbg(UBLK_DBG_THREAD, "tid %d: ublk dev %d thread %u started\n",
+			gettid(), dev_id, t->idx);
 
 	/* submit all io commands to ublk driver */
-	ublk_submit_fetch_commands(q);
+	ublk_submit_fetch_commands(t);
 	do {
-		if (ublk_process_io(q) < 0)
+		if (ublk_process_io(t) < 0)
 			break;
 	} while (1);
 
-	ublk_dbg(UBLK_DBG_QUEUE, "ublk dev %d queue %d exited\n", dev_id, q->q_id);
-	ublk_queue_deinit(q);
+	ublk_dbg(UBLK_DBG_THREAD, "tid %d: ublk dev %d thread %d exiting\n",
+		 gettid(), dev_id, t->idx);
+	ublk_thread_deinit(t);
 	return NULL;
 }
 
@@ -855,20 +914,20 @@ static int ublk_send_dev_event(const struct dev_ctx *ctx, struct ublk_dev *dev,
 static int ublk_start_daemon(const struct dev_ctx *ctx, struct ublk_dev *dev)
 {
 	const struct ublksrv_ctrl_dev_info *dinfo = &dev->dev_info;
-	struct ublk_queue_info *qinfo;
+	struct ublk_thread_info *tinfo;
+	unsigned extra_flags = 0;
 	cpu_set_t *affinity_buf;
 	void *thread_ret;
-	sem_t queue_sem;
+	sem_t ready;
 	int ret, i;
 
 	ublk_dbg(UBLK_DBG_DEV, "%s enter\n", __func__);
 
-	qinfo = (struct ublk_queue_info *)calloc(sizeof(struct ublk_queue_info),
-			dinfo->nr_hw_queues);
-	if (!qinfo)
+	tinfo = calloc(sizeof(struct ublk_thread_info), dev->nthreads);
+	if (!tinfo)
 		return -ENOMEM;
 
-	sem_init(&queue_sem, 0, 0);
+	sem_init(&ready, 0, 0);
 	ret = ublk_dev_prep(ctx, dev);
 	if (ret)
 		return ret;
@@ -877,22 +936,44 @@ static int ublk_start_daemon(const struct dev_ctx *ctx, struct ublk_dev *dev)
 	if (ret)
 		return ret;
 
+	if (ctx->auto_zc_fallback)
+		extra_flags = UBLKSRV_AUTO_BUF_REG_FALLBACK;
+
 	for (i = 0; i < dinfo->nr_hw_queues; i++) {
 		dev->q[i].dev = dev;
 		dev->q[i].q_id = i;
 
-		qinfo[i].q = &dev->q[i];
-		qinfo[i].queue_sem = &queue_sem;
-		qinfo[i].affinity = &affinity_buf[i];
-		qinfo[i].auto_zc_fallback = ctx->auto_zc_fallback;
-		pthread_create(&dev->q[i].thread, NULL,
+		ret = ublk_queue_init(&dev->q[i], extra_flags);
+		if (ret) {
+			ublk_err("ublk dev %d queue %d init queue failed\n",
+				 dinfo->dev_id, i);
+			goto fail;
+		}
+	}
+
+	for (i = 0; i < dev->nthreads; i++) {
+		tinfo[i].dev = dev;
+		tinfo[i].idx = i;
+		tinfo[i].ready = &ready;
+
+		/*
+		 * If threads are not tied 1:1 to queues, setting thread
+		 * affinity based on queue affinity makes little sense.
+		 * However, thread CPU affinity has significant impact
+		 * on performance, so to compare fairly, we'll still set
+		 * thread CPU affinity based on queue affinity where
+		 * possible.
+		 */
+		if (dev->nthreads == dinfo->nr_hw_queues)
+			tinfo[i].affinity = &affinity_buf[i];
+		pthread_create(&dev->threads[i].thread, NULL,
 				ublk_io_handler_fn,
-				&qinfo[i]);
+				&tinfo[i]);
 	}
 
-	for (i = 0; i < dinfo->nr_hw_queues; i++)
-		sem_wait(&queue_sem);
-	free(qinfo);
+	for (i = 0; i < dev->nthreads; i++)
+		sem_wait(&ready);
+	free(tinfo);
 	free(affinity_buf);
 
 	/* everything is fine now, start us */
@@ -914,9 +995,11 @@ static int ublk_start_daemon(const struct dev_ctx *ctx, struct ublk_dev *dev)
 		ublk_send_dev_event(ctx, dev, dev->dev_info.dev_id);
 
 	/* wait until we are terminated */
-	for (i = 0; i < dinfo->nr_hw_queues; i++)
-		pthread_join(dev->q[i].thread, &thread_ret);
+	for (i = 0; i < dev->nthreads; i++)
+		pthread_join(dev->threads[i].thread, &thread_ret);
  fail:
+	for (i = 0; i < dinfo->nr_hw_queues; i++)
+		ublk_queue_deinit(&dev->q[i]);
 	ublk_dev_unprep(dev);
 	ublk_dbg(UBLK_DBG_DEV, "%s exit\n", __func__);
 
@@ -1022,13 +1105,14 @@ wait:
 
 static int __cmd_dev_add(const struct dev_ctx *ctx)
 {
+	unsigned nthreads = ctx->nthreads;
 	unsigned nr_queues = ctx->nr_hw_queues;
 	const char *tgt_type = ctx->tgt_type;
 	unsigned depth = ctx->queue_depth;
 	__u64 features;
 	const struct ublk_tgt_ops *ops;
 	struct ublksrv_ctrl_dev_info *info;
-	struct ublk_dev *dev;
+	struct ublk_dev *dev = NULL;
 	int dev_id = ctx->dev_id;
 	int ret, i;
 
@@ -1036,29 +1120,55 @@ static int __cmd_dev_add(const struct dev_ctx *ctx)
 	if (!ops) {
 		ublk_err("%s: no such tgt type, type %s\n",
 				__func__, tgt_type);
-		return -ENODEV;
+		ret = -ENODEV;
+		goto fail;
 	}
 
 	if (nr_queues > UBLK_MAX_QUEUES || depth > UBLK_QUEUE_DEPTH) {
 		ublk_err("%s: invalid nr_queues or depth queues %u depth %u\n",
 				__func__, nr_queues, depth);
-		return -EINVAL;
+		ret = -EINVAL;
+		goto fail;
+	}
+
+	/* default to 1:1 threads:queues if nthreads is unspecified */
+	if (!nthreads)
+		nthreads = nr_queues;
+
+	if (nthreads > UBLK_MAX_THREADS) {
+		ublk_err("%s: %u is too many threads (max %u)\n",
+				__func__, nthreads, UBLK_MAX_THREADS);
+		ret = -EINVAL;
+		goto fail;
+	}
+
+	if (nthreads != nr_queues && !ctx->per_io_tasks) {
+		ublk_err("%s: threads %u must be same as queues %u if "
+			"not using per_io_tasks\n",
+			__func__, nthreads, nr_queues);
+		ret = -EINVAL;
+		goto fail;
 	}
 
 	dev = ublk_ctrl_init();
 	if (!dev) {
 		ublk_err("%s: can't alloc dev id %d, type %s\n",
 				__func__, dev_id, tgt_type);
-		return -ENOMEM;
+		ret = -ENOMEM;
+		goto fail;
 	}
 
 	/* kernel doesn't support get_features */
 	ret = ublk_ctrl_get_features(dev, &features);
-	if (ret < 0)
-		return -EINVAL;
+	if (ret < 0) {
+		ret = -EINVAL;
+		goto fail;
+	}
 
-	if (!(features & UBLK_F_CMD_IOCTL_ENCODE))
-		return -ENOTSUP;
+	if (!(features & UBLK_F_CMD_IOCTL_ENCODE)) {
+		ret = -ENOTSUP;
+		goto fail;
+	}
 
 	info = &dev->dev_info;
 	info->dev_id = ctx->dev_id;
@@ -1068,6 +1178,8 @@ static int __cmd_dev_add(const struct dev_ctx *ctx)
 	if ((features & UBLK_F_QUIESCE) &&
 			(info->flags & UBLK_F_USER_RECOVERY))
 		info->flags |= UBLK_F_QUIESCE;
+	dev->nthreads = nthreads;
+	dev->per_io_tasks = ctx->per_io_tasks;
 	dev->tgt.ops = ops;
 	dev->tgt.sq_depth = depth;
 	dev->tgt.cq_depth = depth;
@@ -1097,7 +1209,8 @@ static int __cmd_dev_add(const struct dev_ctx *ctx)
 fail:
 	if (ret < 0)
 		ublk_send_dev_event(ctx, dev, -1);
-	ublk_ctrl_deinit(dev);
+	if (dev)
+		ublk_ctrl_deinit(dev);
 	return ret;
 }
 
@@ -1159,6 +1272,8 @@ run:
 		shmctl(ctx->_shmid, IPC_RMID, NULL);
 		/* wait for child and detach from it */
 		wait(NULL);
+		if (exit_code == EXIT_FAILURE)
+			ublk_err("%s: command failed\n", __func__);
 		exit(exit_code);
 	} else {
 		exit(EXIT_FAILURE);
@@ -1266,6 +1381,7 @@ static int cmd_dev_get_features(void)
 		[const_ilog2(UBLK_F_UPDATE_SIZE)] = "UPDATE_SIZE",
 		[const_ilog2(UBLK_F_AUTO_BUF_REG)] = "AUTO_BUF_REG",
 		[const_ilog2(UBLK_F_QUIESCE)] = "QUIESCE",
+		[const_ilog2(UBLK_F_PER_IO_DAEMON)] = "PER_IO_DAEMON",
 	};
 	struct ublk_dev *dev;
 	__u64 features = 0;
@@ -1360,8 +1476,10 @@ static void __cmd_create_help(char *exe, bool recovery)
 			exe, recovery ? "recover" : "add");
 	printf("\t[--foreground] [--quiet] [-z] [--auto_zc] [--auto_zc_fallback] [--debug_mask mask] [-r 0|1 ] [-g]\n");
 	printf("\t[-e 0|1 ] [-i 0|1]\n");
+	printf("\t[--nthreads threads] [--per_io_tasks]\n");
 	printf("\t[target options] [backfile1] [backfile2] ...\n");
 	printf("\tdefault: nr_queues=2(max 32), depth=128(max 1024), dev_id=-1(auto allocation)\n");
+	printf("\tdefault: nthreads=nr_queues");
 
 	for (i = 0; i < sizeof(tgt_ops_list) / sizeof(tgt_ops_list[0]); i++) {
 		const struct ublk_tgt_ops *ops = tgt_ops_list[i];
@@ -1418,6 +1536,8 @@ int main(int argc, char *argv[])
 		{ "auto_zc",		0,	NULL,  0 },
 		{ "auto_zc_fallback", 	0,	NULL,  0 },
 		{ "size",		1,	NULL, 's'},
+		{ "nthreads",		1,	NULL,  0 },
+		{ "per_io_tasks",	0,	NULL,  0 },
 		{ 0, 0, 0, 0 }
 	};
 	const struct ublk_tgt_ops *ops = NULL;
@@ -1493,6 +1613,10 @@ int main(int argc, char *argv[])
 				ctx.flags |= UBLK_F_AUTO_BUF_REG;
 			if (!strcmp(longopts[option_idx].name, "auto_zc_fallback"))
 				ctx.auto_zc_fallback = 1;
+			if (!strcmp(longopts[option_idx].name, "nthreads"))
+				ctx.nthreads = strtol(optarg, NULL, 10);
+			if (!strcmp(longopts[option_idx].name, "per_io_tasks"))
+				ctx.per_io_tasks = 1;
 			break;
 		case '?':
 			/*
diff --git a/tools/testing/selftests/ublk/kublk.h b/tools/testing/selftests/ublk/kublk.h
index e34508bf5798..6be601536b3d 100644
--- a/tools/testing/selftests/ublk/kublk.h
+++ b/tools/testing/selftests/ublk/kublk.h
@@ -49,11 +49,14 @@
 #define UBLKSRV_IO_IDLE_SECS		20
 
 #define UBLK_IO_MAX_BYTES               (1 << 20)
-#define UBLK_MAX_QUEUES                 32
+#define UBLK_MAX_QUEUES_SHIFT		5
+#define UBLK_MAX_QUEUES                 (1 << UBLK_MAX_QUEUES_SHIFT)
+#define UBLK_MAX_THREADS_SHIFT		5
+#define UBLK_MAX_THREADS		(1 << UBLK_MAX_THREADS_SHIFT)
 #define UBLK_QUEUE_DEPTH                1024
 
 #define UBLK_DBG_DEV            (1U << 0)
-#define UBLK_DBG_QUEUE          (1U << 1)
+#define UBLK_DBG_THREAD         (1U << 1)
 #define UBLK_DBG_IO_CMD         (1U << 2)
 #define UBLK_DBG_IO             (1U << 3)
 #define UBLK_DBG_CTRL_CMD       (1U << 4)
@@ -61,6 +64,7 @@
 
 struct ublk_dev;
 struct ublk_queue;
+struct ublk_thread;
 
 struct stripe_ctx {
 	/* stripe */
@@ -76,6 +80,7 @@ struct dev_ctx {
 	char tgt_type[16];
 	unsigned long flags;
 	unsigned nr_hw_queues;
+	unsigned short nthreads;
 	unsigned queue_depth;
 	int dev_id;
 	int nr_files;
@@ -85,6 +90,7 @@ struct dev_ctx {
 	unsigned int	fg:1;
 	unsigned int	recovery:1;
 	unsigned int	auto_zc_fallback:1;
+	unsigned int	per_io_tasks:1;
 
 	int _evtfd;
 	int _shmid;
@@ -123,10 +129,14 @@ struct ublk_io {
 	unsigned short flags;
 	unsigned short refs;		/* used by target code only */
 
+	int tag;
+
 	int result;
 
+	unsigned short buf_index;
 	unsigned short tgt_ios;
 	void *private_data;
+	struct ublk_thread *t;
 };
 
 struct ublk_tgt_ops {
@@ -165,28 +175,39 @@ struct ublk_tgt {
 struct ublk_queue {
 	int q_id;
 	int q_depth;
-	unsigned int cmd_inflight;
-	unsigned int io_inflight;
 	struct ublk_dev *dev;
 	const struct ublk_tgt_ops *tgt_ops;
 	struct ublksrv_io_desc *io_cmd_buf;
-	struct io_uring ring;
+
 	struct ublk_io ios[UBLK_QUEUE_DEPTH];
-#define UBLKSRV_QUEUE_STOPPING	(1U << 0)
-#define UBLKSRV_QUEUE_IDLE	(1U << 1)
 #define UBLKSRV_NO_BUF		(1U << 2)
 #define UBLKSRV_ZC		(1U << 3)
 #define UBLKSRV_AUTO_BUF_REG		(1U << 4)
 #define UBLKSRV_AUTO_BUF_REG_FALLBACK	(1U << 5)
 	unsigned state;
-	pid_t tid;
+};
+
+struct ublk_thread {
+	struct ublk_dev *dev;
+	struct io_uring ring;
+	unsigned int cmd_inflight;
+	unsigned int io_inflight;
+
 	pthread_t thread;
+	unsigned idx;
+
+#define UBLKSRV_THREAD_STOPPING	(1U << 0)
+#define UBLKSRV_THREAD_IDLE	(1U << 1)
+	unsigned state;
 };
 
 struct ublk_dev {
 	struct ublk_tgt tgt;
 	struct ublksrv_ctrl_dev_info  dev_info;
 	struct ublk_queue q[UBLK_MAX_QUEUES];
+	struct ublk_thread threads[UBLK_MAX_THREADS];
+	unsigned nthreads;
+	unsigned per_io_tasks;
 
 	int fds[MAX_BACK_FILES + 1];	/* fds[0] points to /dev/ublkcN */
 	int nr_fds;
@@ -211,7 +232,7 @@ struct ublk_dev {
 
 
 extern unsigned int ublk_dbg_mask;
-extern int ublk_queue_io_cmd(struct ublk_queue *q, struct ublk_io *io, unsigned tag);
+extern int ublk_queue_io_cmd(struct ublk_io *io);
 
 
 static inline int ublk_io_auto_zc_fallback(const struct ublksrv_io_desc *iod)
@@ -225,11 +246,14 @@ static inline int is_target_io(__u64 user_data)
 }
 
 static inline __u64 build_user_data(unsigned tag, unsigned op,
-		unsigned tgt_data, unsigned is_target_io)
+		unsigned tgt_data, unsigned q_id, unsigned is_target_io)
 {
-	assert(!(tag >> 16) && !(op >> 8) && !(tgt_data >> 16));
+	/* we only have 7 bits to encode q_id */
+	_Static_assert(UBLK_MAX_QUEUES_SHIFT <= 7);
+	assert(!(tag >> 16) && !(op >> 8) && !(tgt_data >> 16) && !(q_id >> 7));
 
-	return tag | (op << 16) | (tgt_data << 24) | (__u64)is_target_io << 63;
+	return tag | (op << 16) | (tgt_data << 24) |
+		(__u64)q_id << 56 | (__u64)is_target_io << 63;
 }
 
 static inline unsigned int user_data_to_tag(__u64 user_data)
@@ -247,6 +271,11 @@ static inline unsigned int user_data_to_tgt_data(__u64 user_data)
 	return (user_data >> 24) & 0xffff;
 }
 
+static inline unsigned int user_data_to_q_id(__u64 user_data)
+{
+	return (user_data >> 56) & 0x7f;
+}
+
 static inline unsigned short ublk_cmd_op_nr(unsigned int op)
 {
 	return _IOC_NR(op);
@@ -280,17 +309,23 @@ static inline void ublk_dbg(int level, const char *fmt, ...)
 	}
 }
 
-static inline int ublk_queue_alloc_sqes(struct ublk_queue *q,
+static inline struct ublk_queue *ublk_io_to_queue(const struct ublk_io *io)
+{
+	return container_of(io, struct ublk_queue, ios[io->tag]);
+}
+
+static inline int ublk_io_alloc_sqes(struct ublk_io *io,
 		struct io_uring_sqe *sqes[], int nr_sqes)
 {
-	unsigned left = io_uring_sq_space_left(&q->ring);
+	struct io_uring *ring = &io->t->ring;
+	unsigned left = io_uring_sq_space_left(ring);
 	int i;
 
 	if (left < nr_sqes)
-		io_uring_submit(&q->ring);
+		io_uring_submit(ring);
 
 	for (i = 0; i < nr_sqes; i++) {
-		sqes[i] = io_uring_get_sqe(&q->ring);
+		sqes[i] = io_uring_get_sqe(ring);
 		if (!sqes[i])
 			return i;
 	}
@@ -373,7 +408,7 @@ static inline int ublk_complete_io(struct ublk_queue *q, unsigned tag, int res)
 
 	ublk_mark_io_done(io, res);
 
-	return ublk_queue_io_cmd(q, io, tag);
+	return ublk_queue_io_cmd(io);
 }
 
 static inline void ublk_queued_tgt_io(struct ublk_queue *q, unsigned tag, int queued)
@@ -383,7 +418,7 @@ static inline void ublk_queued_tgt_io(struct ublk_queue *q, unsigned tag, int qu
 	else {
 		struct ublk_io *io = ublk_get_io(q, tag);
 
-		q->io_inflight += queued;
+		io->t->io_inflight += queued;
 		io->tgt_ios = queued;
 		io->result = 0;
 	}
@@ -393,7 +428,7 @@ static inline int ublk_completed_tgt_io(struct ublk_queue *q, unsigned tag)
 {
 	struct ublk_io *io = ublk_get_io(q, tag);
 
-	q->io_inflight--;
+	io->t->io_inflight--;
 
 	return --io->tgt_ios == 0;
 }
diff --git a/tools/testing/selftests/ublk/null.c b/tools/testing/selftests/ublk/null.c
index 44aca31cf2b0..afe0b99d77ee 100644
--- a/tools/testing/selftests/ublk/null.c
+++ b/tools/testing/selftests/ublk/null.c
@@ -43,7 +43,7 @@ static int ublk_null_tgt_init(const struct dev_ctx *ctx, struct ublk_dev *dev)
 }
 
 static void __setup_nop_io(int tag, const struct ublksrv_io_desc *iod,
-		struct io_uring_sqe *sqe)
+		struct io_uring_sqe *sqe, int q_id)
 {
 	unsigned ublk_op = ublksrv_get_op(iod);
 
@@ -52,7 +52,7 @@ static void __setup_nop_io(int tag, const struct ublksrv_io_desc *iod,
 	sqe->flags |= IOSQE_FIXED_FILE;
 	sqe->rw_flags = IORING_NOP_FIXED_BUFFER | IORING_NOP_INJECT_RESULT;
 	sqe->len = iod->nr_sectors << 9; 	/* injected result */
-	sqe->user_data = build_user_data(tag, ublk_op, 0, 1);
+	sqe->user_data = build_user_data(tag, ublk_op, 0, q_id, 1);
 }
 
 static int null_queue_zc_io(struct ublk_queue *q, int tag)
@@ -60,18 +60,18 @@ static int null_queue_zc_io(struct ublk_queue *q, int tag)
 	const struct ublksrv_io_desc *iod = ublk_get_iod(q, tag);
 	struct io_uring_sqe *sqe[3];
 
-	ublk_queue_alloc_sqes(q, sqe, 3);
+	ublk_io_alloc_sqes(ublk_get_io(q, tag), sqe, 3);
 
-	io_uring_prep_buf_register(sqe[0], 0, tag, q->q_id, tag);
+	io_uring_prep_buf_register(sqe[0], 0, tag, q->q_id, ublk_get_io(q, tag)->buf_index);
 	sqe[0]->user_data = build_user_data(tag,
-			ublk_cmd_op_nr(sqe[0]->cmd_op), 0, 1);
+			ublk_cmd_op_nr(sqe[0]->cmd_op), 0, q->q_id, 1);
 	sqe[0]->flags |= IOSQE_CQE_SKIP_SUCCESS | IOSQE_IO_HARDLINK;
 
-	__setup_nop_io(tag, iod, sqe[1]);
+	__setup_nop_io(tag, iod, sqe[1], q->q_id);
 	sqe[1]->flags |= IOSQE_IO_HARDLINK;
 
-	io_uring_prep_buf_unregister(sqe[2], 0, tag, q->q_id, tag);
-	sqe[2]->user_data = build_user_data(tag, ublk_cmd_op_nr(sqe[2]->cmd_op), 0, 1);
+	io_uring_prep_buf_unregister(sqe[2], 0, tag, q->q_id, ublk_get_io(q, tag)->buf_index);
+	sqe[2]->user_data = build_user_data(tag, ublk_cmd_op_nr(sqe[2]->cmd_op), 0, q->q_id, 1);
 
 	// buf register is marked as IOSQE_CQE_SKIP_SUCCESS
 	return 2;
@@ -82,8 +82,8 @@ static int null_queue_auto_zc_io(struct ublk_queue *q, int tag)
 	const struct ublksrv_io_desc *iod = ublk_get_iod(q, tag);
 	struct io_uring_sqe *sqe[1];
 
-	ublk_queue_alloc_sqes(q, sqe, 1);
-	__setup_nop_io(tag, iod, sqe[0]);
+	ublk_io_alloc_sqes(ublk_get_io(q, tag), sqe, 1);
+	__setup_nop_io(tag, iod, sqe[0], q->q_id);
 	return 1;
 }
 
@@ -136,7 +136,7 @@ static unsigned short ublk_null_buf_index(const struct ublk_queue *q, int tag)
 {
 	if (q->state & UBLKSRV_AUTO_BUF_REG_FALLBACK)
 		return (unsigned short)-1;
-	return tag;
+	return q->ios[tag].buf_index;
 }
 
 const struct ublk_tgt_ops null_tgt_ops = {
diff --git a/tools/testing/selftests/ublk/stripe.c b/tools/testing/selftests/ublk/stripe.c
index 404a143bf3d6..37d50bbf5f5e 100644
--- a/tools/testing/selftests/ublk/stripe.c
+++ b/tools/testing/selftests/ublk/stripe.c
@@ -138,13 +138,13 @@ static int stripe_queue_tgt_rw_io(struct ublk_queue *q, const struct ublksrv_io_
 	io->private_data = s;
 	calculate_stripe_array(conf, iod, s, base);
 
-	ublk_queue_alloc_sqes(q, sqe, s->nr + extra);
+	ublk_io_alloc_sqes(ublk_get_io(q, tag), sqe, s->nr + extra);
 
 	if (zc) {
-		io_uring_prep_buf_register(sqe[0], 0, tag, q->q_id, tag);
+		io_uring_prep_buf_register(sqe[0], 0, tag, q->q_id, io->buf_index);
 		sqe[0]->flags |= IOSQE_CQE_SKIP_SUCCESS | IOSQE_IO_HARDLINK;
 		sqe[0]->user_data = build_user_data(tag,
-			ublk_cmd_op_nr(sqe[0]->cmd_op), 0, 1);
+			ublk_cmd_op_nr(sqe[0]->cmd_op), 0, q->q_id, 1);
 	}
 
 	for (i = zc; i < s->nr + extra - zc; i++) {
@@ -162,13 +162,14 @@ static int stripe_queue_tgt_rw_io(struct ublk_queue *q, const struct ublksrv_io_
 				sqe[i]->flags |= IOSQE_IO_HARDLINK;
 		}
 		/* bit63 marks us as tgt io */
-		sqe[i]->user_data = build_user_data(tag, ublksrv_get_op(iod), i - zc, 1);
+		sqe[i]->user_data = build_user_data(tag, ublksrv_get_op(iod), i - zc, q->q_id, 1);
 	}
 	if (zc) {
 		struct io_uring_sqe *unreg = sqe[s->nr + 1];
 
-		io_uring_prep_buf_unregister(unreg, 0, tag, q->q_id, tag);
-		unreg->user_data = build_user_data(tag, ublk_cmd_op_nr(unreg->cmd_op), 0, 1);
+		io_uring_prep_buf_unregister(unreg, 0, tag, q->q_id, io->buf_index);
+		unreg->user_data = build_user_data(
+			tag, ublk_cmd_op_nr(unreg->cmd_op), 0, q->q_id, 1);
 	}
 
 	/* register buffer is skip_success */
@@ -181,11 +182,11 @@ static int handle_flush(struct ublk_queue *q, const struct ublksrv_io_desc *iod,
 	struct io_uring_sqe *sqe[NR_STRIPE];
 	int i;
 
-	ublk_queue_alloc_sqes(q, sqe, conf->nr_files);
+	ublk_io_alloc_sqes(ublk_get_io(q, tag), sqe, conf->nr_files);
 	for (i = 0; i < conf->nr_files; i++) {
 		io_uring_prep_fsync(sqe[i], i + 1, IORING_FSYNC_DATASYNC);
 		io_uring_sqe_set_flags(sqe[i], IOSQE_FIXED_FILE);
-		sqe[i]->user_data = build_user_data(tag, UBLK_IO_OP_FLUSH, 0, 1);
+		sqe[i]->user_data = build_user_data(tag, UBLK_IO_OP_FLUSH, 0, q->q_id, 1);
 	}
 	return conf->nr_files;
 }
diff --git a/tools/testing/selftests/ublk/test_common.sh b/tools/testing/selftests/ublk/test_common.sh
index 0145569ee7e9..8a4dbd09feb0 100755
--- a/tools/testing/selftests/ublk/test_common.sh
+++ b/tools/testing/selftests/ublk/test_common.sh
@@ -278,6 +278,11 @@ __run_io_and_remove()
 	fio --name=job1 --filename=/dev/ublkb"${dev_id}" --ioengine=libaio \
 		--rw=randrw --norandommap --iodepth=256 --size="${size}" --numjobs="$(nproc)" \
 		--runtime=20 --time_based > /dev/null 2>&1 &
+	fio --name=batchjob --filename=/dev/ublkb"${dev_id}" --ioengine=io_uring \
+		--rw=randrw --norandommap --iodepth=256 --size="${size}" \
+		--numjobs="$(nproc)" --runtime=20 --time_based \
+		--iodepth_batch_submit=32 --iodepth_batch_complete_min=32 \
+		--force_async=7 > /dev/null 2>&1 &
 	sleep 2
 	if [ "${kill_server}" = "yes" ]; then
 		local state
diff --git a/tools/testing/selftests/ublk/test_generic_12.sh b/tools/testing/selftests/ublk/test_generic_12.sh
new file mode 100755
index 000000000000..7abbb00d251d
--- /dev/null
+++ b/tools/testing/selftests/ublk/test_generic_12.sh
@@ -0,0 +1,55 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+. "$(cd "$(dirname "$0")" && pwd)"/test_common.sh
+
+TID="generic_12"
+ERR_CODE=0
+
+if ! _have_program bpftrace; then
+	exit "$UBLK_SKIP_CODE"
+fi
+
+_prep_test "null" "do imbalanced load, it should be balanced over I/O threads"
+
+NTHREADS=6
+dev_id=$(_add_ublk_dev -t null -q 4 -d 16 --nthreads $NTHREADS --per_io_tasks)
+_check_add_dev $TID $?
+
+dev_t=$(_get_disk_dev_t "$dev_id")
+bpftrace trace/count_ios_per_tid.bt "$dev_t" > "$UBLK_TMP" 2>&1 &
+btrace_pid=$!
+sleep 2
+
+if ! kill -0 "$btrace_pid" > /dev/null 2>&1; then
+	_cleanup_test "null"
+	exit "$UBLK_SKIP_CODE"
+fi
+
+# do imbalanced I/O on the ublk device
+# pin to cpu 0 to prevent migration/only target one queue
+fio --name=write_seq \
+    --filename=/dev/ublkb"${dev_id}" \
+    --ioengine=libaio --iodepth=16 \
+    --rw=write \
+    --size=512M \
+    --direct=1 \
+    --bs=4k \
+    --cpus_allowed=0 > /dev/null 2>&1
+ERR_CODE=$?
+kill "$btrace_pid"
+wait
+
+# check that every task handles some I/O, even though all I/O was issued
+# from a single CPU. when ublk gets support for round-robin tag
+# allocation, this check can be strengthened to assert that every thread
+# handles the same number of I/Os
+NR_THREADS_THAT_HANDLED_IO=$(grep -c '@' ${UBLK_TMP})
+if [[ $NR_THREADS_THAT_HANDLED_IO -ne $NTHREADS ]]; then
+        echo "only $NR_THREADS_THAT_HANDLED_IO handled I/O! expected $NTHREADS"
+        cat "$UBLK_TMP"
+        ERR_CODE=255
+fi
+
+_cleanup_test "null"
+_show_result $TID $ERR_CODE
diff --git a/tools/testing/selftests/ublk/test_stress_03.sh b/tools/testing/selftests/ublk/test_stress_03.sh
index 7d728ce50774..6eef282d569f 100755
--- a/tools/testing/selftests/ublk/test_stress_03.sh
+++ b/tools/testing/selftests/ublk/test_stress_03.sh
@@ -41,5 +41,13 @@ if _have_feature "AUTO_BUF_REG"; then
 fi
 wait
 
+if _have_feature "PER_IO_DAEMON"; then
+	ublk_io_and_remove 8G -t null -q 4 --auto_zc --nthreads 8 --per_io_tasks &
+	ublk_io_and_remove 256M -t loop -q 4 --auto_zc --nthreads 8 --per_io_tasks "${UBLK_BACKFILES[0]}" &
+	ublk_io_and_remove 256M -t stripe -q 4 --auto_zc --nthreads 8 --per_io_tasks "${UBLK_BACKFILES[1]}" "${UBLK_BACKFILES[2]}" &
+	ublk_io_and_remove 8G -t null -q 4 -z --auto_zc --auto_zc_fallback --nthreads 8 --per_io_tasks &
+fi
+wait
+
 _cleanup_test "stress"
 _show_result $TID $ERR_CODE
diff --git a/tools/testing/selftests/ublk/test_stress_04.sh b/tools/testing/selftests/ublk/test_stress_04.sh
index 9bcfa64ea1f0..40d1437ca298 100755
--- a/tools/testing/selftests/ublk/test_stress_04.sh
+++ b/tools/testing/selftests/ublk/test_stress_04.sh
@@ -38,6 +38,13 @@ if _have_feature "AUTO_BUF_REG"; then
 	ublk_io_and_kill_daemon 256M -t stripe -q 4 --auto_zc "${UBLK_BACKFILES[1]}" "${UBLK_BACKFILES[2]}" &
 	ublk_io_and_kill_daemon 8G -t null -q 4 -z --auto_zc --auto_zc_fallback &
 fi
+
+if _have_feature "PER_IO_DAEMON"; then
+	ublk_io_and_kill_daemon 8G -t null -q 4 --nthreads 8 --per_io_tasks &
+	ublk_io_and_kill_daemon 256M -t loop -q 4 --nthreads 8 --per_io_tasks "${UBLK_BACKFILES[0]}" &
+	ublk_io_and_kill_daemon 256M -t stripe -q 4 --nthreads 8 --per_io_tasks "${UBLK_BACKFILES[1]}" "${UBLK_BACKFILES[2]}" &
+	ublk_io_and_kill_daemon 8G -t null -q 4 --nthreads 8 --per_io_tasks &
+fi
 wait
 
 _cleanup_test "stress"
diff --git a/tools/testing/selftests/ublk/test_stress_05.sh b/tools/testing/selftests/ublk/test_stress_05.sh
index bcfc904cefc6..566cfd90d192 100755
--- a/tools/testing/selftests/ublk/test_stress_05.sh
+++ b/tools/testing/selftests/ublk/test_stress_05.sh
@@ -69,5 +69,12 @@ if _have_feature "AUTO_BUF_REG"; then
 	done
 fi
 
+if _have_feature "PER_IO_DAEMON"; then
+	ublk_io_and_remove 8G -t null -q 4 --nthreads 8 --per_io_tasks -r 1 -i "$reissue" &
+	ublk_io_and_remove 256M -t loop -q 4 --nthreads 8 --per_io_tasks -r 1 -i "$reissue" "${UBLK_BACKFILES[0]}" &
+	ublk_io_and_remove 8G -t null -q 4 --nthreads 8 --per_io_tasks -r 1 -i "$reissue"  &
+fi
+wait
+
 _cleanup_test "stress"
 _show_result $TID $ERR_CODE
diff --git a/tools/testing/selftests/ublk/trace/count_ios_per_tid.bt b/tools/testing/selftests/ublk/trace/count_ios_per_tid.bt
new file mode 100644
index 000000000000..f4aa63ff2938
--- /dev/null
+++ b/tools/testing/selftests/ublk/trace/count_ios_per_tid.bt
@@ -0,0 +1,11 @@
+/*
+ * Tabulates and prints I/O completions per thread for the given device
+ *
+ * $1: dev_t
+*/
+tracepoint:block:block_rq_complete
+{
+	if (args.dev == $1) {
+		@[tid] = count();
+	}
+}
diff --git a/tools/testing/selftests/vDSO/vgetrandom-chacha.S b/tools/testing/selftests/vDSO/vgetrandom-chacha.S
index d6e09af7c0a9..a4a82e1c28a9 100644
--- a/tools/testing/selftests/vDSO/vgetrandom-chacha.S
+++ b/tools/testing/selftests/vDSO/vgetrandom-chacha.S
@@ -11,6 +11,8 @@
 #include "../../../../arch/loongarch/vdso/vgetrandom-chacha.S"
 #elif defined(__powerpc__) || defined(__powerpc64__)
 #include "../../../../arch/powerpc/kernel/vdso/vgetrandom-chacha.S"
+#elif defined(__riscv) && __riscv_xlen == 64
+#include "../../../../arch/riscv/kernel/vdso/vgetrandom-chacha.S"
 #elif defined(__s390x__)
 #include "../../../../arch/s390/kernel/vdso64/vgetrandom-chacha.S"
 #elif defined(__x86_64__)
diff --git a/tools/testing/vma/vma_internal.h b/tools/testing/vma/vma_internal.h
index 441feb21aa5a..4505b1c31be1 100644
--- a/tools/testing/vma/vma_internal.h
+++ b/tools/testing/vma/vma_internal.h
@@ -932,6 +932,8 @@ static inline void vma_adjust_trans_huge(struct vm_area_struct *vma,
 	(void)next;
 }
 
+static inline void hugetlb_split(struct vm_area_struct *, unsigned long) {}
+
 static inline void vma_iter_free(struct vma_iterator *vmi)
 {
 	mas_destroy(&vmi->mas);