drm/i915: Squash repeated awaits on the same fence

Track the latest fence waited upon on each context, and only add a new asynchronous wait if the new fence is more recent than the recorded fence for that context. This requires us to filter out unordered timelines, which are noted by DMA_FENCE_NO_CONTEXT. However, in the absence of a universal identifier, we have to use our own i915->mm.unordered_timeline token. v2: Throw around the debug crutches v3: Inline the likely case of the pre-allocation cache being full. v4: Drop the pre-allocation support, we can lose the most recent fence in case of allocation failure -- it just means we may emit more awaits than strictly necessary but will not break. v5: Trim allocation size for leaf nodes, they only need an array of u32 not pointers. v6: Create mock_timeline to tidy selftest writing v7: s/intel_timeline_sync_get/intel_timeline_sync_is_later/ (Tvrtko) v8: Prune the stale sync points when we idle. v9: Include a small benchmark in the kselftests v10: Separate the idr implementation into its own compartment. (Tvrkto) v11: Refactor igt_sync kselftests to avoid deep nesting (Tvrkto) v12: __sync_leaf_idx() to assert that p->height is 0 when checking leaves v13: kselftests to investigate struct i915_syncmap itself (Tvrtko) v14: Foray into ascii art graphs v15: Take into account that the random lookup/insert does 2 prng calls, not 1, when benchmarking, and use for_each_set_bit() (Tvrtko) v16: Improved ascii art Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk> Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com> Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com> Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com> Link: http://patchwork.freedesktop.org/patch/msgid/20170503093924.5320-4-chris@chris-wilson.co.uk
author: Chris Wilson <chris@chris-wilson.co.uk> 2017-05-03 12:39:21 +0300
committer: Chris Wilson <chris@chris-wilson.co.uk> 2017-05-03 13:08:48 +0300
commit: 4797948071f607c5b43753cb8f1b7ddcf22e146d (patch)
tree: 65ee87bef56977d4f056947aaa664fe7b47c11bd /drivers/gpu/drm/i915/selftests/i915_gem_timeline.c
parent: ceae14bd4cc4333b9a3b0b6b9457bb16e7ca410a (diff)
download: linux-4797948071f607c5b43753cb8f1b7ddcf22e146d.tar.xz
1 files changed, 301 insertions, 0 deletions
diff --git a/drivers/gpu/drm/i915/selftests/i915_gem_timeline.c b/drivers/gpu/drm/i915/selftests/i915_gem_timeline.c
new file mode 100644
index 000000000000..6df00cc02c12
--- /dev/null
+++ b/drivers/gpu/drm/i915/selftests/i915_gem_timeline.c
@@ -0,0 +1,301 @@
+/*
+ * Copyright © 2017 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ */
+
+#include "../i915_selftest.h"
+#include "i915_random.h"
+
+#include "mock_gem_device.h"
+#include "mock_timeline.h"
+
+struct __igt_sync {
+	const char *name;
+	u32 seqno;
+	bool expected;
+	bool set;
+};
+
+static int __igt_sync(struct intel_timeline *tl,
+		      u64 ctx,
+		      const struct __igt_sync *p,
+		      const char *name)
+{
+	int ret;
+
+	if (__intel_timeline_sync_is_later(tl, ctx, p->seqno) != p->expected) {
+		pr_err("%s: %s(ctx=%llu, seqno=%u) expected passed %s but failed\n",
+		       name, p->name, ctx, p->seqno, yesno(p->expected));
+		return -EINVAL;
+	}
+
+	if (p->set) {
+		ret = __intel_timeline_sync_set(tl, ctx, p->seqno);
+		if (ret)
+			return ret;
+	}
+
+	return 0;
+}
+
+static int igt_sync(void *arg)
+{
+	const struct __igt_sync pass[] = {
+		{ "unset", 0, false, false },
+		{ "new", 0, false, true },
+		{ "0a", 0, true, true },
+		{ "1a", 1, false, true },
+		{ "1b", 1, true, true },
+		{ "0b", 0, true, false },
+		{ "2a", 2, false, true },
+		{ "4", 4, false, true },
+		{ "INT_MAX", INT_MAX, false, true },
+		{ "INT_MAX-1", INT_MAX-1, true, false },
+		{ "INT_MAX+1", (u32)INT_MAX+1, false, true },
+		{ "INT_MAX", INT_MAX, true, false },
+		{ "UINT_MAX", UINT_MAX, false, true },
+		{ "wrap", 0, false, true },
+		{ "unwrap", UINT_MAX, true, false },
+		{},
+	}, *p;
+	struct intel_timeline *tl;
+	int order, offset;
+	int ret;
+
+	tl = mock_timeline(0);
+	if (!tl)
+		return -ENOMEM;
+
+	for (p = pass; p->name; p++) {
+		for (order = 1; order < 64; order++) {
+			for (offset = -1; offset <= (order > 1); offset++) {
+				u64 ctx = BIT_ULL(order) + offset;
+
+				ret = __igt_sync(tl, ctx, p, "1");
+				if (ret)
+					goto out;
+			}
+		}
+	}
+	mock_timeline_destroy(tl);
+
+	tl = mock_timeline(0);
+	if (!tl)
+		return -ENOMEM;
+
+	for (order = 1; order < 64; order++) {
+		for (offset = -1; offset <= (order > 1); offset++) {
+			u64 ctx = BIT_ULL(order) + offset;
+
+			for (p = pass; p->name; p++) {
+				ret = __igt_sync(tl, ctx, p, "2");
+				if (ret)
+					goto out;
+			}
+		}
+	}
+
+out:
+	mock_timeline_destroy(tl);
+	return ret;
+}
+
+static unsigned int random_engine(struct rnd_state *rnd)
+{
+	return ((u64)prandom_u32_state(rnd) * I915_NUM_ENGINES) >> 32;
+}
+
+static int bench_sync(void *arg)
+{
+#define M (1 << 20)
+	struct rnd_state prng;
+	struct intel_timeline *tl;
+	unsigned long end_time, count;
+	u64 prng32_1M;
+	ktime_t kt;
+	int order, last_order;
+
+	tl = mock_timeline(0);
+	if (!tl)
+		return -ENOMEM;
+
+	/* Lookups from cache are very fast and so the random number generation
+	 * and the loop itself becomes a significant factor in the per-iteration
+	 * timings. We try to compensate the results by measuring the overhead
+	 * of the prng and subtract it from the reported results.
+	 */
+	prandom_seed_state(&prng, i915_selftest.random_seed);
+	count = 0;
+	kt = ktime_get();
+	end_time = jiffies + HZ/10;
+	do {
+		u32 x;
+
+		/* Make sure the compiler doesn't optimise away the prng call */
+		WRITE_ONCE(x, prandom_u32_state(&prng));
+
+		count++;
+	} while (!time_after(jiffies, end_time));
+	kt = ktime_sub(ktime_get(), kt);
+	pr_debug("%s: %lu random evaluations, %lluns/prng\n",
+		 __func__, count, (long long)div64_ul(ktime_to_ns(kt), count));
+	prng32_1M = ktime_to_ns(kt) * M / count;
+
+	/* Benchmark (only) setting random context ids */
+	prandom_seed_state(&prng, i915_selftest.random_seed);
+	count = 0;
+	kt = ktime_get();
+	end_time = jiffies + HZ/10;
+	do {
+		u64 id = i915_prandom_u64_state(&prng);
+
+		__intel_timeline_sync_set(tl, id, 0);
+		count++;
+	} while (!time_after(jiffies, end_time));
+	kt = ktime_sub(ktime_get(), kt);
+	kt = ktime_sub_ns(kt, count * prng32_1M * 2 / M);
+	pr_info("%s: %lu random insertions, %lluns/insert\n",
+		__func__, count, (long long)div64_ul(ktime_to_ns(kt), count));
+
+	/* Benchmark looking up the exact same context ids as we just set */
+	prandom_seed_state(&prng, i915_selftest.random_seed);
+	end_time = count;
+	kt = ktime_get();
+	while (end_time--) {
+		u64 id = i915_prandom_u64_state(&prng);
+
+		if (!__intel_timeline_sync_is_later(tl, id, 0)) {
+			mock_timeline_destroy(tl);
+			pr_err("Lookup of %llu failed\n", id);
+			return -EINVAL;
+		}
+	}
+	kt = ktime_sub(ktime_get(), kt);
+	kt = ktime_sub_ns(kt, count * prng32_1M * 2 / M);
+	pr_info("%s: %lu random lookups, %lluns/lookup\n",
+		__func__, count, (long long)div64_ul(ktime_to_ns(kt), count));
+
+	mock_timeline_destroy(tl);
+	cond_resched();
+
+	tl = mock_timeline(0);
+	if (!tl)
+		return -ENOMEM;
+
+	/* Benchmark setting the first N (in order) contexts */
+	count = 0;
+	kt = ktime_get();
+	end_time = jiffies + HZ/10;
+	do {
+		__intel_timeline_sync_set(tl, count++, 0);
+	} while (!time_after(jiffies, end_time));
+	kt = ktime_sub(ktime_get(), kt);
+	pr_info("%s: %lu in-order insertions, %lluns/insert\n",
+		__func__, count, (long long)div64_ul(ktime_to_ns(kt), count));
+
+	/* Benchmark looking up the exact same context ids as we just set */
+	end_time = count;
+	kt = ktime_get();
+	while (end_time--) {
+		if (!__intel_timeline_sync_is_later(tl, end_time, 0)) {
+			pr_err("Lookup of %lu failed\n", end_time);
+			mock_timeline_destroy(tl);
+			return -EINVAL;
+		}
+	}
+	kt = ktime_sub(ktime_get(), kt);
+	pr_info("%s: %lu in-order lookups, %lluns/lookup\n",
+		__func__, count, (long long)div64_ul(ktime_to_ns(kt), count));
+
+	mock_timeline_destroy(tl);
+	cond_resched();
+
+	tl = mock_timeline(0);
+	if (!tl)
+		return -ENOMEM;
+
+	/* Benchmark searching for a random context id and maybe changing it */
+	prandom_seed_state(&prng, i915_selftest.random_seed);
+	count = 0;
+	kt = ktime_get();
+	end_time = jiffies + HZ/10;
+	do {
+		u32 id = random_engine(&prng);
+		u32 seqno = prandom_u32_state(&prng);
+
+		if (!__intel_timeline_sync_is_later(tl, id, seqno))
+			__intel_timeline_sync_set(tl, id, seqno);
+
+		count++;
+	} while (!time_after(jiffies, end_time));
+	kt = ktime_sub(ktime_get(), kt);
+	kt = ktime_sub_ns(kt, count * prng32_1M * 2 / M);
+	pr_info("%s: %lu repeated insert/lookups, %lluns/op\n",
+		__func__, count, (long long)div64_ul(ktime_to_ns(kt), count));
+	mock_timeline_destroy(tl);
+	cond_resched();
+
+	/* Benchmark searching for a known context id and changing the seqno */
+	for (last_order = 1, order = 1; order < 32;
+	     ({ int tmp = last_order; last_order = order; order += tmp; })) {
+		unsigned int mask = BIT(order) - 1;
+
+		tl = mock_timeline(0);
+		if (!tl)
+			return -ENOMEM;
+
+		count = 0;
+		kt = ktime_get();
+		end_time = jiffies + HZ/10;
+		do {
+			/* Without assuming too many details of the underlying
+			 * implementation, try to identify its phase-changes
+			 * (if any)!
+			 */
+			u64 id = (u64)(count & mask) << order;
+
+			__intel_timeline_sync_is_later(tl, id, 0);
+			__intel_timeline_sync_set(tl, id, 0);
+
+			count++;
+		} while (!time_after(jiffies, end_time));
+		kt = ktime_sub(ktime_get(), kt);
+		pr_info("%s: %lu cyclic/%d insert/lookups, %lluns/op\n",
+			__func__, count, order,
+			(long long)div64_ul(ktime_to_ns(kt), count));
+		mock_timeline_destroy(tl);
+		cond_resched();
+	}
+
+	return 0;
+#undef M
+}
+
+int i915_gem_timeline_mock_selftests(void)
+{
+	static const struct i915_subtest tests[] = {
+		SUBTEST(igt_sync),
+		SUBTEST(bench_sync),
+	};
+
+	return i915_subtests(tests, NULL);
+}
author	Chris Wilson <chris@chris-wilson.co.uk>	2017-05-03 12:39:21 +0300
committer	Chris Wilson <chris@chris-wilson.co.uk>	2017-05-03 13:08:48 +0300
commit	4797948071f607c5b43753cb8f1b7ddcf22e146d (patch)
tree	65ee87bef56977d4f056947aaa664fe7b47c11bd /drivers/gpu/drm/i915/selftests/i915_gem_timeline.c
parent	ceae14bd4cc4333b9a3b0b6b9457bb16e7ca410a (diff)
download	linux-4797948071f607c5b43753cb8f1b7ddcf22e146d.tar.xz