From c39a0e2c8850f08249383f2425dbd8dbe4baad69 Mon Sep 17 00:00:00 2001
From: Vikas Shivappa
Date: Tue, 25 Jul 2017 14:14:20 -0700
Subject: x86/perf/cqm: Wipe out perf based cqm

'perf cqm' never worked due to the incompatibility between the perf
infrastructure and the cqm hardware support. The hardware uses RMIDs to
track the llc occupancy of tasks and these RMIDs are per package. This
makes monitoring a hierarchy like cgroup along with monitoring of tasks
separately difficult, and several patches sent to lkml to fix this were
NACKed. Furthermore, the following issues in the current perf cqm make
it almost unusable:

1. No support to monitor the same group of tasks for which we do
   allocation using resctrl.

2. It gives random and inaccurate data (mostly 0s) once we run out of
   RMIDs due to issues in recycling.

3. Recycling results in inaccuracy of data because we cannot guarantee
   that the RMID was stolen from a task when it was not pulling data
   into cache, or even when it pulled the least data. Also, for
   monitoring llc_occupancy, if we stop using an RMID_x and then start
   using an RMID_y after we reclaim an RMID from another event, we miss
   accounting all the occupancy that was tagged to RMID_x at a later
   perf_count.

4. The recycling code makes the monitoring code complex, including the
   scheduling, because an event can lose its RMID at any time. Since
   MBM counters count bandwidth over a period of time by taking
   snapshots of total bytes at two different times, recycling
   complicates the way we count MBM in a hierarchy. We also need a spin
   lock while we do the processing to account for MBM counter overflow,
   and we currently use a spin lock in scheduling to prevent the RMID
   from being taken away.

5. Lack of support when we run different kinds of events, like task,
   system-wide and cgroup events, together. Data mostly prints 0s. This
   is also because we can have only one RMID tied to a cpu as defined
   by the cqm hardware, but perf can tie multiple events to that cpu
   during one sched_in.

6. No support for monitoring a group of tasks. There is partial support
   for cgroups, but it does not work once there is a hierarchy of
   cgroups or if we want to monitor a task in a cgroup and the cgroup
   itself.

7. No support for monitoring tasks for their lifetime without perf
   overhead.

8. It reported the aggregate cache occupancy or memory bandwidth over
   all sockets, but most cloud and VMM based use cases want to know the
   individual per-socket usage.
Signed-off-by: Vikas Shivappa Signed-off-by: Thomas Gleixner Cc: ravi.v.shankar@intel.com Cc: tony.luck@intel.com Cc: fenghua.yu@intel.com Cc: peterz@infradead.org Cc: eranian@google.com Cc: vikas.shivappa@intel.com Cc: ak@linux.intel.com Cc: davidcc@google.com Cc: reinette.chatre@intel.com Link: http://lkml.kernel.org/r/1501017287-28083-2-git-send-email-vikas.shivappa@linux.intel.com --- arch/x86/events/intel/Makefile | 2 +- arch/x86/events/intel/cqm.c | 1766 ------------------------------- arch/x86/include/asm/intel_rdt_common.h | 2 - arch/x86/kernel/cpu/intel_rdt.c | 8 + 4 files changed, 9 insertions(+), 1769 deletions(-) delete mode 100644 arch/x86/events/intel/cqm.c (limited to 'arch') diff --git a/arch/x86/events/intel/Makefile b/arch/x86/events/intel/Makefile index 06c2baa51814..e9d8520a801a 100644 --- a/arch/x86/events/intel/Makefile +++ b/arch/x86/events/intel/Makefile @@ -1,4 +1,4 @@ -obj-$(CONFIG_CPU_SUP_INTEL) += core.o bts.o cqm.o +obj-$(CONFIG_CPU_SUP_INTEL) += core.o bts.o obj-$(CONFIG_CPU_SUP_INTEL) += ds.o knc.o obj-$(CONFIG_CPU_SUP_INTEL) += lbr.o p4.o p6.o pt.o obj-$(CONFIG_PERF_EVENTS_INTEL_RAPL) += intel-rapl-perf.o diff --git a/arch/x86/events/intel/cqm.c b/arch/x86/events/intel/cqm.c deleted file mode 100644 index 2521f771f2f5..000000000000 --- a/arch/x86/events/intel/cqm.c +++ /dev/null @@ -1,1766 +0,0 @@ -/* - * Intel Cache Quality-of-Service Monitoring (CQM) support. - * - * Based very, very heavily on work by Peter Zijlstra. - */ - -#include -#include -#include -#include -#include "../perf_event.h" - -#define MSR_IA32_QM_CTR 0x0c8e -#define MSR_IA32_QM_EVTSEL 0x0c8d - -#define MBM_CNTR_WIDTH 24 -/* - * Guaranteed time in ms as per SDM where MBM counters will not overflow. - */ -#define MBM_CTR_OVERFLOW_TIME 1000 - -static u32 cqm_max_rmid = -1; -static unsigned int cqm_l3_scale; /* supposedly cacheline size */ -static bool cqm_enabled, mbm_enabled; -unsigned int mbm_socket_max; - -/* - * The cached intel_pqr_state is strictly per CPU and can never be - * updated from a remote CPU. Both functions which modify the state - * (intel_cqm_event_start and intel_cqm_event_stop) are called with - * interrupts disabled, which is sufficient for the protection. - */ -DEFINE_PER_CPU(struct intel_pqr_state, pqr_state); -static struct hrtimer *mbm_timers; -/** - * struct sample - mbm event's (local or total) data - * @total_bytes #bytes since we began monitoring - * @prev_msr previous value of MSR - */ -struct sample { - u64 total_bytes; - u64 prev_msr; -}; - -/* - * samples profiled for total memory bandwidth type events - */ -static struct sample *mbm_total; -/* - * samples profiled for local memory bandwidth type events - */ -static struct sample *mbm_local; - -#define pkg_id topology_physical_package_id(smp_processor_id()) -/* - * rmid_2_index returns the index for the rmid in mbm_local/mbm_total array. - * mbm_total[] and mbm_local[] are linearly indexed by socket# * max number of - * rmids per socket, an example is given below - * RMID1 of Socket0: vrmid = 1 - * RMID1 of Socket1: vrmid = 1 * (cqm_max_rmid + 1) + 1 - * RMID1 of Socket2: vrmid = 2 * (cqm_max_rmid + 1) + 1 - */ -#define rmid_2_index(rmid) ((pkg_id * (cqm_max_rmid + 1)) + rmid) -/* - * Protects cache_cgroups and cqm_rmid_free_lru and cqm_rmid_limbo_lru. - * Also protects event->hw.cqm_rmid - * - * Hold either for stability, both for modification of ->hw.cqm_rmid. 
- */ -static DEFINE_MUTEX(cache_mutex); -static DEFINE_RAW_SPINLOCK(cache_lock); - -/* - * Groups of events that have the same target(s), one RMID per group. - */ -static LIST_HEAD(cache_groups); - -/* - * Mask of CPUs for reading CQM values. We only need one per-socket. - */ -static cpumask_t cqm_cpumask; - -#define RMID_VAL_ERROR (1ULL << 63) -#define RMID_VAL_UNAVAIL (1ULL << 62) - -/* - * Event IDs are used to program IA32_QM_EVTSEL before reading event - * counter from IA32_QM_CTR - */ -#define QOS_L3_OCCUP_EVENT_ID 0x01 -#define QOS_MBM_TOTAL_EVENT_ID 0x02 -#define QOS_MBM_LOCAL_EVENT_ID 0x03 - -/* - * This is central to the rotation algorithm in __intel_cqm_rmid_rotate(). - * - * This rmid is always free and is guaranteed to have an associated - * near-zero occupancy value, i.e. no cachelines are tagged with this - * RMID, once __intel_cqm_rmid_rotate() returns. - */ -static u32 intel_cqm_rotation_rmid; - -#define INVALID_RMID (-1) - -/* - * Is @rmid valid for programming the hardware? - * - * rmid 0 is reserved by the hardware for all non-monitored tasks, which - * means that we should never come across an rmid with that value. - * Likewise, an rmid value of -1 is used to indicate "no rmid currently - * assigned" and is used as part of the rotation code. - */ -static inline bool __rmid_valid(u32 rmid) -{ - if (!rmid || rmid == INVALID_RMID) - return false; - - return true; -} - -static u64 __rmid_read(u32 rmid) -{ - u64 val; - - /* - * Ignore the SDM, this thing is _NOTHING_ like a regular perfcnt, - * it just says that to increase confusion. - */ - wrmsr(MSR_IA32_QM_EVTSEL, QOS_L3_OCCUP_EVENT_ID, rmid); - rdmsrl(MSR_IA32_QM_CTR, val); - - /* - * Aside from the ERROR and UNAVAIL bits, assume this thing returns - * the number of cachelines tagged with @rmid. - */ - return val; -} - -enum rmid_recycle_state { - RMID_YOUNG = 0, - RMID_AVAILABLE, - RMID_DIRTY, -}; - -struct cqm_rmid_entry { - u32 rmid; - enum rmid_recycle_state state; - struct list_head list; - unsigned long queue_time; -}; - -/* - * cqm_rmid_free_lru - A least recently used list of RMIDs. - * - * Oldest entry at the head, newest (most recently used) entry at the - * tail. This list is never traversed, it's only used to keep track of - * the lru order. That is, we only pick entries of the head or insert - * them on the tail. - * - * All entries on the list are 'free', and their RMIDs are not currently - * in use. To mark an RMID as in use, remove its entry from the lru - * list. - * - * - * cqm_rmid_limbo_lru - list of currently unused but (potentially) dirty RMIDs. - * - * This list is contains RMIDs that no one is currently using but that - * may have a non-zero occupancy value associated with them. The - * rotation worker moves RMIDs from the limbo list to the free list once - * the occupancy value drops below __intel_cqm_threshold. - * - * Both lists are protected by cache_mutex. - */ -static LIST_HEAD(cqm_rmid_free_lru); -static LIST_HEAD(cqm_rmid_limbo_lru); - -/* - * We use a simple array of pointers so that we can lookup a struct - * cqm_rmid_entry in O(1). This alleviates the callers of __get_rmid() - * and __put_rmid() from having to worry about dealing with struct - * cqm_rmid_entry - they just deal with rmids, i.e. integers. - * - * Once this array is initialized it is read-only. No locks are required - * to access it. - * - * All entries for all RMIDs can be looked up in the this array at all - * times. 
- */ -static struct cqm_rmid_entry **cqm_rmid_ptrs; - -static inline struct cqm_rmid_entry *__rmid_entry(u32 rmid) -{ - struct cqm_rmid_entry *entry; - - entry = cqm_rmid_ptrs[rmid]; - WARN_ON(entry->rmid != rmid); - - return entry; -} - -/* - * Returns < 0 on fail. - * - * We expect to be called with cache_mutex held. - */ -static u32 __get_rmid(void) -{ - struct cqm_rmid_entry *entry; - - lockdep_assert_held(&cache_mutex); - - if (list_empty(&cqm_rmid_free_lru)) - return INVALID_RMID; - - entry = list_first_entry(&cqm_rmid_free_lru, struct cqm_rmid_entry, list); - list_del(&entry->list); - - return entry->rmid; -} - -static void __put_rmid(u32 rmid) -{ - struct cqm_rmid_entry *entry; - - lockdep_assert_held(&cache_mutex); - - WARN_ON(!__rmid_valid(rmid)); - entry = __rmid_entry(rmid); - - entry->queue_time = jiffies; - entry->state = RMID_YOUNG; - - list_add_tail(&entry->list, &cqm_rmid_limbo_lru); -} - -static void cqm_cleanup(void) -{ - int i; - - if (!cqm_rmid_ptrs) - return; - - for (i = 0; i < cqm_max_rmid; i++) - kfree(cqm_rmid_ptrs[i]); - - kfree(cqm_rmid_ptrs); - cqm_rmid_ptrs = NULL; - cqm_enabled = false; -} - -static int intel_cqm_setup_rmid_cache(void) -{ - struct cqm_rmid_entry *entry; - unsigned int nr_rmids; - int r = 0; - - nr_rmids = cqm_max_rmid + 1; - cqm_rmid_ptrs = kzalloc(sizeof(struct cqm_rmid_entry *) * - nr_rmids, GFP_KERNEL); - if (!cqm_rmid_ptrs) - return -ENOMEM; - - for (; r <= cqm_max_rmid; r++) { - struct cqm_rmid_entry *entry; - - entry = kmalloc(sizeof(*entry), GFP_KERNEL); - if (!entry) - goto fail; - - INIT_LIST_HEAD(&entry->list); - entry->rmid = r; - cqm_rmid_ptrs[r] = entry; - - list_add_tail(&entry->list, &cqm_rmid_free_lru); - } - - /* - * RMID 0 is special and is always allocated. It's used for all - * tasks that are not monitored. - */ - entry = __rmid_entry(0); - list_del(&entry->list); - - mutex_lock(&cache_mutex); - intel_cqm_rotation_rmid = __get_rmid(); - mutex_unlock(&cache_mutex); - - return 0; - -fail: - cqm_cleanup(); - return -ENOMEM; -} - -/* - * Determine if @a and @b measure the same set of tasks. - * - * If @a and @b measure the same set of tasks then we want to share a - * single RMID. - */ -static bool __match_event(struct perf_event *a, struct perf_event *b) -{ - /* Per-cpu and task events don't mix */ - if ((a->attach_state & PERF_ATTACH_TASK) != - (b->attach_state & PERF_ATTACH_TASK)) - return false; - -#ifdef CONFIG_CGROUP_PERF - if (a->cgrp != b->cgrp) - return false; -#endif - - /* If not task event, we're machine wide */ - if (!(b->attach_state & PERF_ATTACH_TASK)) - return true; - - /* - * Events that target same task are placed into the same cache group. - * Mark it as a multi event group, so that we update ->count - * for every event rather than just the group leader later. - */ - if (a->hw.target == b->hw.target) { - b->hw.is_group_event = true; - return true; - } - - /* - * Are we an inherited event? 
- */ - if (b->parent == a) - return true; - - return false; -} - -#ifdef CONFIG_CGROUP_PERF -static inline struct perf_cgroup *event_to_cgroup(struct perf_event *event) -{ - if (event->attach_state & PERF_ATTACH_TASK) - return perf_cgroup_from_task(event->hw.target, event->ctx); - - return event->cgrp; -} -#endif - -/* - * Determine if @a's tasks intersect with @b's tasks - * - * There are combinations of events that we explicitly prohibit, - * - * PROHIBITS - * system-wide -> cgroup and task - * cgroup -> system-wide - * -> task in cgroup - * task -> system-wide - * -> task in cgroup - * - * Call this function before allocating an RMID. - */ -static bool __conflict_event(struct perf_event *a, struct perf_event *b) -{ -#ifdef CONFIG_CGROUP_PERF - /* - * We can have any number of cgroups but only one system-wide - * event at a time. - */ - if (a->cgrp && b->cgrp) { - struct perf_cgroup *ac = a->cgrp; - struct perf_cgroup *bc = b->cgrp; - - /* - * This condition should have been caught in - * __match_event() and we should be sharing an RMID. - */ - WARN_ON_ONCE(ac == bc); - - if (cgroup_is_descendant(ac->css.cgroup, bc->css.cgroup) || - cgroup_is_descendant(bc->css.cgroup, ac->css.cgroup)) - return true; - - return false; - } - - if (a->cgrp || b->cgrp) { - struct perf_cgroup *ac, *bc; - - /* - * cgroup and system-wide events are mutually exclusive - */ - if ((a->cgrp && !(b->attach_state & PERF_ATTACH_TASK)) || - (b->cgrp && !(a->attach_state & PERF_ATTACH_TASK))) - return true; - - /* - * Ensure neither event is part of the other's cgroup - */ - ac = event_to_cgroup(a); - bc = event_to_cgroup(b); - if (ac == bc) - return true; - - /* - * Must have cgroup and non-intersecting task events. - */ - if (!ac || !bc) - return false; - - /* - * We have cgroup and task events, and the task belongs - * to a cgroup. Check for for overlap. - */ - if (cgroup_is_descendant(ac->css.cgroup, bc->css.cgroup) || - cgroup_is_descendant(bc->css.cgroup, ac->css.cgroup)) - return true; - - return false; - } -#endif - /* - * If one of them is not a task, same story as above with cgroups. - */ - if (!(a->attach_state & PERF_ATTACH_TASK) || - !(b->attach_state & PERF_ATTACH_TASK)) - return true; - - /* - * Must be non-overlapping. - */ - return false; -} - -struct rmid_read { - u32 rmid; - u32 evt_type; - atomic64_t value; -}; - -static void __intel_cqm_event_count(void *info); -static void init_mbm_sample(u32 rmid, u32 evt_type); -static void __intel_mbm_event_count(void *info); - -static bool is_cqm_event(int e) -{ - return (e == QOS_L3_OCCUP_EVENT_ID); -} - -static bool is_mbm_event(int e) -{ - return (e >= QOS_MBM_TOTAL_EVENT_ID && e <= QOS_MBM_LOCAL_EVENT_ID); -} - -static void cqm_mask_call(struct rmid_read *rr) -{ - if (is_mbm_event(rr->evt_type)) - on_each_cpu_mask(&cqm_cpumask, __intel_mbm_event_count, rr, 1); - else - on_each_cpu_mask(&cqm_cpumask, __intel_cqm_event_count, rr, 1); -} - -/* - * Exchange the RMID of a group of events. - */ -static u32 intel_cqm_xchg_rmid(struct perf_event *group, u32 rmid) -{ - struct perf_event *event; - struct list_head *head = &group->hw.cqm_group_entry; - u32 old_rmid = group->hw.cqm_rmid; - - lockdep_assert_held(&cache_mutex); - - /* - * If our RMID is being deallocated, perform a read now. 
- */ - if (__rmid_valid(old_rmid) && !__rmid_valid(rmid)) { - struct rmid_read rr = { - .rmid = old_rmid, - .evt_type = group->attr.config, - .value = ATOMIC64_INIT(0), - }; - - cqm_mask_call(&rr); - local64_set(&group->count, atomic64_read(&rr.value)); - } - - raw_spin_lock_irq(&cache_lock); - - group->hw.cqm_rmid = rmid; - list_for_each_entry(event, head, hw.cqm_group_entry) - event->hw.cqm_rmid = rmid; - - raw_spin_unlock_irq(&cache_lock); - - /* - * If the allocation is for mbm, init the mbm stats. - * Need to check if each event in the group is mbm event - * because there could be multiple type of events in the same group. - */ - if (__rmid_valid(rmid)) { - event = group; - if (is_mbm_event(event->attr.config)) - init_mbm_sample(rmid, event->attr.config); - - list_for_each_entry(event, head, hw.cqm_group_entry) { - if (is_mbm_event(event->attr.config)) - init_mbm_sample(rmid, event->attr.config); - } - } - - return old_rmid; -} - -/* - * If we fail to assign a new RMID for intel_cqm_rotation_rmid because - * cachelines are still tagged with RMIDs in limbo, we progressively - * increment the threshold until we find an RMID in limbo with <= - * __intel_cqm_threshold lines tagged. This is designed to mitigate the - * problem where cachelines tagged with an RMID are not steadily being - * evicted. - * - * On successful rotations we decrease the threshold back towards zero. - * - * __intel_cqm_max_threshold provides an upper bound on the threshold, - * and is measured in bytes because it's exposed to userland. - */ -static unsigned int __intel_cqm_threshold; -static unsigned int __intel_cqm_max_threshold; - -/* - * Test whether an RMID has a zero occupancy value on this cpu. - */ -static void intel_cqm_stable(void *arg) -{ - struct cqm_rmid_entry *entry; - - list_for_each_entry(entry, &cqm_rmid_limbo_lru, list) { - if (entry->state != RMID_AVAILABLE) - break; - - if (__rmid_read(entry->rmid) > __intel_cqm_threshold) - entry->state = RMID_DIRTY; - } -} - -/* - * If we have group events waiting for an RMID that don't conflict with - * events already running, assign @rmid. - */ -static bool intel_cqm_sched_in_event(u32 rmid) -{ - struct perf_event *leader, *event; - - lockdep_assert_held(&cache_mutex); - - leader = list_first_entry(&cache_groups, struct perf_event, - hw.cqm_groups_entry); - event = leader; - - list_for_each_entry_continue(event, &cache_groups, - hw.cqm_groups_entry) { - if (__rmid_valid(event->hw.cqm_rmid)) - continue; - - if (__conflict_event(event, leader)) - continue; - - intel_cqm_xchg_rmid(event, rmid); - return true; - } - - return false; -} - -/* - * Initially use this constant for both the limbo queue time and the - * rotation timer interval, pmu::hrtimer_interval_ms. - * - * They don't need to be the same, but the two are related since if you - * rotate faster than you recycle RMIDs, you may run out of available - * RMIDs. - */ -#define RMID_DEFAULT_QUEUE_TIME 250 /* ms */ - -static unsigned int __rmid_queue_time_ms = RMID_DEFAULT_QUEUE_TIME; - -/* - * intel_cqm_rmid_stabilize - move RMIDs from limbo to free list - * @nr_available: number of freeable RMIDs on the limbo list - * - * Quiescent state; wait for all 'freed' RMIDs to become unused, i.e. no - * cachelines are tagged with those RMIDs. After this we can reuse them - * and know that the current set of active RMIDs is stable. - * - * Return %true or %false depending on whether stabilization needs to be - * reattempted. 
- * - * If we return %true then @nr_available is updated to indicate the - * number of RMIDs on the limbo list that have been queued for the - * minimum queue time (RMID_AVAILABLE), but whose data occupancy values - * are above __intel_cqm_threshold. - */ -static bool intel_cqm_rmid_stabilize(unsigned int *available) -{ - struct cqm_rmid_entry *entry, *tmp; - - lockdep_assert_held(&cache_mutex); - - *available = 0; - list_for_each_entry(entry, &cqm_rmid_limbo_lru, list) { - unsigned long min_queue_time; - unsigned long now = jiffies; - - /* - * We hold RMIDs placed into limbo for a minimum queue - * time. Before the minimum queue time has elapsed we do - * not recycle RMIDs. - * - * The reasoning is that until a sufficient time has - * passed since we stopped using an RMID, any RMID - * placed onto the limbo list will likely still have - * data tagged in the cache, which means we'll probably - * fail to recycle it anyway. - * - * We can save ourselves an expensive IPI by skipping - * any RMIDs that have not been queued for the minimum - * time. - */ - min_queue_time = entry->queue_time + - msecs_to_jiffies(__rmid_queue_time_ms); - - if (time_after(min_queue_time, now)) - break; - - entry->state = RMID_AVAILABLE; - (*available)++; - } - - /* - * Fast return if none of the RMIDs on the limbo list have been - * sitting on the queue for the minimum queue time. - */ - if (!*available) - return false; - - /* - * Test whether an RMID is free for each package. - */ - on_each_cpu_mask(&cqm_cpumask, intel_cqm_stable, NULL, true); - - list_for_each_entry_safe(entry, tmp, &cqm_rmid_limbo_lru, list) { - /* - * Exhausted all RMIDs that have waited min queue time. - */ - if (entry->state == RMID_YOUNG) - break; - - if (entry->state == RMID_DIRTY) - continue; - - list_del(&entry->list); /* remove from limbo */ - - /* - * The rotation RMID gets priority if it's - * currently invalid. In which case, skip adding - * the RMID to the the free lru. - */ - if (!__rmid_valid(intel_cqm_rotation_rmid)) { - intel_cqm_rotation_rmid = entry->rmid; - continue; - } - - /* - * If we have groups waiting for RMIDs, hand - * them one now provided they don't conflict. - */ - if (intel_cqm_sched_in_event(entry->rmid)) - continue; - - /* - * Otherwise place it onto the free list. - */ - list_add_tail(&entry->list, &cqm_rmid_free_lru); - } - - - return __rmid_valid(intel_cqm_rotation_rmid); -} - -/* - * Pick a victim group and move it to the tail of the group list. - * @next: The first group without an RMID - */ -static void __intel_cqm_pick_and_rotate(struct perf_event *next) -{ - struct perf_event *rotor; - u32 rmid; - - lockdep_assert_held(&cache_mutex); - - rotor = list_first_entry(&cache_groups, struct perf_event, - hw.cqm_groups_entry); - - /* - * The group at the front of the list should always have a valid - * RMID. If it doesn't then no groups have RMIDs assigned and we - * don't need to rotate the list. - */ - if (next == rotor) - return; - - rmid = intel_cqm_xchg_rmid(rotor, INVALID_RMID); - __put_rmid(rmid); - - list_rotate_left(&cache_groups); -} - -/* - * Deallocate the RMIDs from any events that conflict with @event, and - * place them on the back of the group list. 
- */ -static void intel_cqm_sched_out_conflicting_events(struct perf_event *event) -{ - struct perf_event *group, *g; - u32 rmid; - - lockdep_assert_held(&cache_mutex); - - list_for_each_entry_safe(group, g, &cache_groups, hw.cqm_groups_entry) { - if (group == event) - continue; - - rmid = group->hw.cqm_rmid; - - /* - * Skip events that don't have a valid RMID. - */ - if (!__rmid_valid(rmid)) - continue; - - /* - * No conflict? No problem! Leave the event alone. - */ - if (!__conflict_event(group, event)) - continue; - - intel_cqm_xchg_rmid(group, INVALID_RMID); - __put_rmid(rmid); - } -} - -/* - * Attempt to rotate the groups and assign new RMIDs. - * - * We rotate for two reasons, - * 1. To handle the scheduling of conflicting events - * 2. To recycle RMIDs - * - * Rotating RMIDs is complicated because the hardware doesn't give us - * any clues. - * - * There's problems with the hardware interface; when you change the - * task:RMID map cachelines retain their 'old' tags, giving a skewed - * picture. In order to work around this, we must always keep one free - * RMID - intel_cqm_rotation_rmid. - * - * Rotation works by taking away an RMID from a group (the old RMID), - * and assigning the free RMID to another group (the new RMID). We must - * then wait for the old RMID to not be used (no cachelines tagged). - * This ensure that all cachelines are tagged with 'active' RMIDs. At - * this point we can start reading values for the new RMID and treat the - * old RMID as the free RMID for the next rotation. - * - * Return %true or %false depending on whether we did any rotating. - */ -static bool __intel_cqm_rmid_rotate(void) -{ - struct perf_event *group, *start = NULL; - unsigned int threshold_limit; - unsigned int nr_needed = 0; - unsigned int nr_available; - bool rotated = false; - - mutex_lock(&cache_mutex); - -again: - /* - * Fast path through this function if there are no groups and no - * RMIDs that need cleaning. - */ - if (list_empty(&cache_groups) && list_empty(&cqm_rmid_limbo_lru)) - goto out; - - list_for_each_entry(group, &cache_groups, hw.cqm_groups_entry) { - if (!__rmid_valid(group->hw.cqm_rmid)) { - if (!start) - start = group; - nr_needed++; - } - } - - /* - * We have some event groups, but they all have RMIDs assigned - * and no RMIDs need cleaning. - */ - if (!nr_needed && list_empty(&cqm_rmid_limbo_lru)) - goto out; - - if (!nr_needed) - goto stabilize; - - /* - * We have more event groups without RMIDs than available RMIDs, - * or we have event groups that conflict with the ones currently - * scheduled. - * - * We force deallocate the rmid of the group at the head of - * cache_groups. The first event group without an RMID then gets - * assigned intel_cqm_rotation_rmid. This ensures we always make - * forward progress. - * - * Rotate the cache_groups list so the previous head is now the - * tail. - */ - __intel_cqm_pick_and_rotate(start); - - /* - * If the rotation is going to succeed, reduce the threshold so - * that we don't needlessly reuse dirty RMIDs. - */ - if (__rmid_valid(intel_cqm_rotation_rmid)) { - intel_cqm_xchg_rmid(start, intel_cqm_rotation_rmid); - intel_cqm_rotation_rmid = __get_rmid(); - - intel_cqm_sched_out_conflicting_events(start); - - if (__intel_cqm_threshold) - __intel_cqm_threshold--; - } - - rotated = true; - -stabilize: - /* - * We now need to stablize the RMID we freed above (if any) to - * ensure that the next time we rotate we have an RMID with zero - * occupancy value. 
- * - * Alternatively, if we didn't need to perform any rotation, - * we'll have a bunch of RMIDs in limbo that need stabilizing. - */ - threshold_limit = __intel_cqm_max_threshold / cqm_l3_scale; - - while (intel_cqm_rmid_stabilize(&nr_available) && - __intel_cqm_threshold < threshold_limit) { - unsigned int steal_limit; - - /* - * Don't spin if nobody is actively waiting for an RMID, - * the rotation worker will be kicked as soon as an - * event needs an RMID anyway. - */ - if (!nr_needed) - break; - - /* Allow max 25% of RMIDs to be in limbo. */ - steal_limit = (cqm_max_rmid + 1) / 4; - - /* - * We failed to stabilize any RMIDs so our rotation - * logic is now stuck. In order to make forward progress - * we have a few options: - * - * 1. rotate ("steal") another RMID - * 2. increase the threshold - * 3. do nothing - * - * We do both of 1. and 2. until we hit the steal limit. - * - * The steal limit prevents all RMIDs ending up on the - * limbo list. This can happen if every RMID has a - * non-zero occupancy above threshold_limit, and the - * occupancy values aren't dropping fast enough. - * - * Note that there is prioritisation at work here - we'd - * rather increase the number of RMIDs on the limbo list - * than increase the threshold, because increasing the - * threshold skews the event data (because we reuse - * dirty RMIDs) - threshold bumps are a last resort. - */ - if (nr_available < steal_limit) - goto again; - - __intel_cqm_threshold++; - } - -out: - mutex_unlock(&cache_mutex); - return rotated; -} - -static void intel_cqm_rmid_rotate(struct work_struct *work); - -static DECLARE_DELAYED_WORK(intel_cqm_rmid_work, intel_cqm_rmid_rotate); - -static struct pmu intel_cqm_pmu; - -static void intel_cqm_rmid_rotate(struct work_struct *work) -{ - unsigned long delay; - - __intel_cqm_rmid_rotate(); - - delay = msecs_to_jiffies(intel_cqm_pmu.hrtimer_interval_ms); - schedule_delayed_work(&intel_cqm_rmid_work, delay); -} - -static u64 update_sample(unsigned int rmid, u32 evt_type, int first) -{ - struct sample *mbm_current; - u32 vrmid = rmid_2_index(rmid); - u64 val, bytes, shift; - u32 eventid; - - if (evt_type == QOS_MBM_LOCAL_EVENT_ID) { - mbm_current = &mbm_local[vrmid]; - eventid = QOS_MBM_LOCAL_EVENT_ID; - } else { - mbm_current = &mbm_total[vrmid]; - eventid = QOS_MBM_TOTAL_EVENT_ID; - } - - wrmsr(MSR_IA32_QM_EVTSEL, eventid, rmid); - rdmsrl(MSR_IA32_QM_CTR, val); - if (val & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL)) - return mbm_current->total_bytes; - - if (first) { - mbm_current->prev_msr = val; - mbm_current->total_bytes = 0; - return mbm_current->total_bytes; - } - - /* - * The h/w guarantees that counters will not overflow - * so long as we poll them at least once per second. - */ - shift = 64 - MBM_CNTR_WIDTH; - bytes = (val << shift) - (mbm_current->prev_msr << shift); - bytes >>= shift; - - bytes *= cqm_l3_scale; - - mbm_current->total_bytes += bytes; - mbm_current->prev_msr = val; - - return mbm_current->total_bytes; -} - -static u64 rmid_read_mbm(unsigned int rmid, u32 evt_type) -{ - return update_sample(rmid, evt_type, 0); -} - -static void __intel_mbm_event_init(void *info) -{ - struct rmid_read *rr = info; - - update_sample(rr->rmid, rr->evt_type, 1); -} - -static void init_mbm_sample(u32 rmid, u32 evt_type) -{ - struct rmid_read rr = { - .rmid = rmid, - .evt_type = evt_type, - .value = ATOMIC64_INIT(0), - }; - - /* on each socket, init sample */ - on_each_cpu_mask(&cqm_cpumask, __intel_mbm_event_init, &rr, 1); -} - -/* - * Find a group and setup RMID. 
- * - * If we're part of a group, we use the group's RMID. - */ -static void intel_cqm_setup_event(struct perf_event *event, - struct perf_event **group) -{ - struct perf_event *iter; - bool conflict = false; - u32 rmid; - - event->hw.is_group_event = false; - list_for_each_entry(iter, &cache_groups, hw.cqm_groups_entry) { - rmid = iter->hw.cqm_rmid; - - if (__match_event(iter, event)) { - /* All tasks in a group share an RMID */ - event->hw.cqm_rmid = rmid; - *group = iter; - if (is_mbm_event(event->attr.config) && __rmid_valid(rmid)) - init_mbm_sample(rmid, event->attr.config); - return; - } - - /* - * We only care about conflicts for events that are - * actually scheduled in (and hence have a valid RMID). - */ - if (__conflict_event(iter, event) && __rmid_valid(rmid)) - conflict = true; - } - - if (conflict) - rmid = INVALID_RMID; - else - rmid = __get_rmid(); - - if (is_mbm_event(event->attr.config) && __rmid_valid(rmid)) - init_mbm_sample(rmid, event->attr.config); - - event->hw.cqm_rmid = rmid; -} - -static void intel_cqm_event_read(struct perf_event *event) -{ - unsigned long flags; - u32 rmid; - u64 val; - - /* - * Task events are handled by intel_cqm_event_count(). - */ - if (event->cpu == -1) - return; - - raw_spin_lock_irqsave(&cache_lock, flags); - rmid = event->hw.cqm_rmid; - - if (!__rmid_valid(rmid)) - goto out; - - if (is_mbm_event(event->attr.config)) - val = rmid_read_mbm(rmid, event->attr.config); - else - val = __rmid_read(rmid); - - /* - * Ignore this reading on error states and do not update the value. - */ - if (val & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL)) - goto out; - - local64_set(&event->count, val); -out: - raw_spin_unlock_irqrestore(&cache_lock, flags); -} - -static void __intel_cqm_event_count(void *info) -{ - struct rmid_read *rr = info; - u64 val; - - val = __rmid_read(rr->rmid); - - if (val & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL)) - return; - - atomic64_add(val, &rr->value); -} - -static inline bool cqm_group_leader(struct perf_event *event) -{ - return !list_empty(&event->hw.cqm_groups_entry); -} - -static void __intel_mbm_event_count(void *info) -{ - struct rmid_read *rr = info; - u64 val; - - val = rmid_read_mbm(rr->rmid, rr->evt_type); - if (val & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL)) - return; - atomic64_add(val, &rr->value); -} - -static enum hrtimer_restart mbm_hrtimer_handle(struct hrtimer *hrtimer) -{ - struct perf_event *iter, *iter1; - int ret = HRTIMER_RESTART; - struct list_head *head; - unsigned long flags; - u32 grp_rmid; - - /* - * Need to cache_lock as the timer Event Select MSR reads - * can race with the mbm/cqm count() and mbm_init() reads. 
- */ - raw_spin_lock_irqsave(&cache_lock, flags); - - if (list_empty(&cache_groups)) { - ret = HRTIMER_NORESTART; - goto out; - } - - list_for_each_entry(iter, &cache_groups, hw.cqm_groups_entry) { - grp_rmid = iter->hw.cqm_rmid; - if (!__rmid_valid(grp_rmid)) - continue; - if (is_mbm_event(iter->attr.config)) - update_sample(grp_rmid, iter->attr.config, 0); - - head = &iter->hw.cqm_group_entry; - if (list_empty(head)) - continue; - list_for_each_entry(iter1, head, hw.cqm_group_entry) { - if (!iter1->hw.is_group_event) - break; - if (is_mbm_event(iter1->attr.config)) - update_sample(iter1->hw.cqm_rmid, - iter1->attr.config, 0); - } - } - - hrtimer_forward_now(hrtimer, ms_to_ktime(MBM_CTR_OVERFLOW_TIME)); -out: - raw_spin_unlock_irqrestore(&cache_lock, flags); - - return ret; -} - -static void __mbm_start_timer(void *info) -{ - hrtimer_start(&mbm_timers[pkg_id], ms_to_ktime(MBM_CTR_OVERFLOW_TIME), - HRTIMER_MODE_REL_PINNED); -} - -static void __mbm_stop_timer(void *info) -{ - hrtimer_cancel(&mbm_timers[pkg_id]); -} - -static void mbm_start_timers(void) -{ - on_each_cpu_mask(&cqm_cpumask, __mbm_start_timer, NULL, 1); -} - -static void mbm_stop_timers(void) -{ - on_each_cpu_mask(&cqm_cpumask, __mbm_stop_timer, NULL, 1); -} - -static void mbm_hrtimer_init(void) -{ - struct hrtimer *hr; - int i; - - for (i = 0; i < mbm_socket_max; i++) { - hr = &mbm_timers[i]; - hrtimer_init(hr, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - hr->function = mbm_hrtimer_handle; - } -} - -static u64 intel_cqm_event_count(struct perf_event *event) -{ - unsigned long flags; - struct rmid_read rr = { - .evt_type = event->attr.config, - .value = ATOMIC64_INIT(0), - }; - - /* - * We only need to worry about task events. System-wide events - * are handled like usual, i.e. entirely with - * intel_cqm_event_read(). - */ - if (event->cpu != -1) - return __perf_event_count(event); - - /* - * Only the group leader gets to report values except in case of - * multiple events in the same group, we still need to read the - * other events.This stops us - * reporting duplicate values to userspace, and gives us a clear - * rule for which task gets to report the values. - * - * Note that it is impossible to attribute these values to - * specific packages - we forfeit that ability when we create - * task events. - */ - if (!cqm_group_leader(event) && !event->hw.is_group_event) - return 0; - - /* - * Getting up-to-date values requires an SMP IPI which is not - * possible if we're being called in interrupt context. Return - * the cached values instead. - */ - if (unlikely(in_interrupt())) - goto out; - - /* - * Notice that we don't perform the reading of an RMID - * atomically, because we can't hold a spin lock across the - * IPIs. - * - * Speculatively perform the read, since @event might be - * assigned a different (possibly invalid) RMID while we're - * busying performing the IPI calls. It's therefore necessary to - * check @event's RMID afterwards, and if it has changed, - * discard the result of the read. 
- */ - rr.rmid = ACCESS_ONCE(event->hw.cqm_rmid); - - if (!__rmid_valid(rr.rmid)) - goto out; - - cqm_mask_call(&rr); - - raw_spin_lock_irqsave(&cache_lock, flags); - if (event->hw.cqm_rmid == rr.rmid) - local64_set(&event->count, atomic64_read(&rr.value)); - raw_spin_unlock_irqrestore(&cache_lock, flags); -out: - return __perf_event_count(event); -} - -static void intel_cqm_event_start(struct perf_event *event, int mode) -{ - struct intel_pqr_state *state = this_cpu_ptr(&pqr_state); - u32 rmid = event->hw.cqm_rmid; - - if (!(event->hw.cqm_state & PERF_HES_STOPPED)) - return; - - event->hw.cqm_state &= ~PERF_HES_STOPPED; - - if (state->rmid_usecnt++) { - if (!WARN_ON_ONCE(state->rmid != rmid)) - return; - } else { - WARN_ON_ONCE(state->rmid); - } - - state->rmid = rmid; - wrmsr(MSR_IA32_PQR_ASSOC, rmid, state->closid); -} - -static void intel_cqm_event_stop(struct perf_event *event, int mode) -{ - struct intel_pqr_state *state = this_cpu_ptr(&pqr_state); - - if (event->hw.cqm_state & PERF_HES_STOPPED) - return; - - event->hw.cqm_state |= PERF_HES_STOPPED; - - intel_cqm_event_read(event); - - if (!--state->rmid_usecnt) { - state->rmid = 0; - wrmsr(MSR_IA32_PQR_ASSOC, 0, state->closid); - } else { - WARN_ON_ONCE(!state->rmid); - } -} - -static int intel_cqm_event_add(struct perf_event *event, int mode) -{ - unsigned long flags; - u32 rmid; - - raw_spin_lock_irqsave(&cache_lock, flags); - - event->hw.cqm_state = PERF_HES_STOPPED; - rmid = event->hw.cqm_rmid; - - if (__rmid_valid(rmid) && (mode & PERF_EF_START)) - intel_cqm_event_start(event, mode); - - raw_spin_unlock_irqrestore(&cache_lock, flags); - - return 0; -} - -static void intel_cqm_event_destroy(struct perf_event *event) -{ - struct perf_event *group_other = NULL; - unsigned long flags; - - mutex_lock(&cache_mutex); - /* - * Hold the cache_lock as mbm timer handlers could be - * scanning the list of events. - */ - raw_spin_lock_irqsave(&cache_lock, flags); - - /* - * If there's another event in this group... - */ - if (!list_empty(&event->hw.cqm_group_entry)) { - group_other = list_first_entry(&event->hw.cqm_group_entry, - struct perf_event, - hw.cqm_group_entry); - list_del(&event->hw.cqm_group_entry); - } - - /* - * And we're the group leader.. - */ - if (cqm_group_leader(event)) { - /* - * If there was a group_other, make that leader, otherwise - * destroy the group and return the RMID. - */ - if (group_other) { - list_replace(&event->hw.cqm_groups_entry, - &group_other->hw.cqm_groups_entry); - } else { - u32 rmid = event->hw.cqm_rmid; - - if (__rmid_valid(rmid)) - __put_rmid(rmid); - list_del(&event->hw.cqm_groups_entry); - } - } - - raw_spin_unlock_irqrestore(&cache_lock, flags); - - /* - * Stop the mbm overflow timers when the last event is destroyed. 
- */ - if (mbm_enabled && list_empty(&cache_groups)) - mbm_stop_timers(); - - mutex_unlock(&cache_mutex); -} - -static int intel_cqm_event_init(struct perf_event *event) -{ - struct perf_event *group = NULL; - bool rotate = false; - unsigned long flags; - - if (event->attr.type != intel_cqm_pmu.type) - return -ENOENT; - - if ((event->attr.config < QOS_L3_OCCUP_EVENT_ID) || - (event->attr.config > QOS_MBM_LOCAL_EVENT_ID)) - return -EINVAL; - - if ((is_cqm_event(event->attr.config) && !cqm_enabled) || - (is_mbm_event(event->attr.config) && !mbm_enabled)) - return -EINVAL; - - /* unsupported modes and filters */ - if (event->attr.exclude_user || - event->attr.exclude_kernel || - event->attr.exclude_hv || - event->attr.exclude_idle || - event->attr.exclude_host || - event->attr.exclude_guest || - event->attr.sample_period) /* no sampling */ - return -EINVAL; - - INIT_LIST_HEAD(&event->hw.cqm_group_entry); - INIT_LIST_HEAD(&event->hw.cqm_groups_entry); - - event->destroy = intel_cqm_event_destroy; - - mutex_lock(&cache_mutex); - - /* - * Start the mbm overflow timers when the first event is created. - */ - if (mbm_enabled && list_empty(&cache_groups)) - mbm_start_timers(); - - /* Will also set rmid */ - intel_cqm_setup_event(event, &group); - - /* - * Hold the cache_lock as mbm timer handlers be - * scanning the list of events. - */ - raw_spin_lock_irqsave(&cache_lock, flags); - - if (group) { - list_add_tail(&event->hw.cqm_group_entry, - &group->hw.cqm_group_entry); - } else { - list_add_tail(&event->hw.cqm_groups_entry, - &cache_groups); - - /* - * All RMIDs are either in use or have recently been - * used. Kick the rotation worker to clean/free some. - * - * We only do this for the group leader, rather than for - * every event in a group to save on needless work. 
- */ - if (!__rmid_valid(event->hw.cqm_rmid)) - rotate = true; - } - - raw_spin_unlock_irqrestore(&cache_lock, flags); - mutex_unlock(&cache_mutex); - - if (rotate) - schedule_delayed_work(&intel_cqm_rmid_work, 0); - - return 0; -} - -EVENT_ATTR_STR(llc_occupancy, intel_cqm_llc, "event=0x01"); -EVENT_ATTR_STR(llc_occupancy.per-pkg, intel_cqm_llc_pkg, "1"); -EVENT_ATTR_STR(llc_occupancy.unit, intel_cqm_llc_unit, "Bytes"); -EVENT_ATTR_STR(llc_occupancy.scale, intel_cqm_llc_scale, NULL); -EVENT_ATTR_STR(llc_occupancy.snapshot, intel_cqm_llc_snapshot, "1"); - -EVENT_ATTR_STR(total_bytes, intel_cqm_total_bytes, "event=0x02"); -EVENT_ATTR_STR(total_bytes.per-pkg, intel_cqm_total_bytes_pkg, "1"); -EVENT_ATTR_STR(total_bytes.unit, intel_cqm_total_bytes_unit, "MB"); -EVENT_ATTR_STR(total_bytes.scale, intel_cqm_total_bytes_scale, "1e-6"); - -EVENT_ATTR_STR(local_bytes, intel_cqm_local_bytes, "event=0x03"); -EVENT_ATTR_STR(local_bytes.per-pkg, intel_cqm_local_bytes_pkg, "1"); -EVENT_ATTR_STR(local_bytes.unit, intel_cqm_local_bytes_unit, "MB"); -EVENT_ATTR_STR(local_bytes.scale, intel_cqm_local_bytes_scale, "1e-6"); - -static struct attribute *intel_cqm_events_attr[] = { - EVENT_PTR(intel_cqm_llc), - EVENT_PTR(intel_cqm_llc_pkg), - EVENT_PTR(intel_cqm_llc_unit), - EVENT_PTR(intel_cqm_llc_scale), - EVENT_PTR(intel_cqm_llc_snapshot), - NULL, -}; - -static struct attribute *intel_mbm_events_attr[] = { - EVENT_PTR(intel_cqm_total_bytes), - EVENT_PTR(intel_cqm_local_bytes), - EVENT_PTR(intel_cqm_total_bytes_pkg), - EVENT_PTR(intel_cqm_local_bytes_pkg), - EVENT_PTR(intel_cqm_total_bytes_unit), - EVENT_PTR(intel_cqm_local_bytes_unit), - EVENT_PTR(intel_cqm_total_bytes_scale), - EVENT_PTR(intel_cqm_local_bytes_scale), - NULL, -}; - -static struct attribute *intel_cmt_mbm_events_attr[] = { - EVENT_PTR(intel_cqm_llc), - EVENT_PTR(intel_cqm_total_bytes), - EVENT_PTR(intel_cqm_local_bytes), - EVENT_PTR(intel_cqm_llc_pkg), - EVENT_PTR(intel_cqm_total_bytes_pkg), - EVENT_PTR(intel_cqm_local_bytes_pkg), - EVENT_PTR(intel_cqm_llc_unit), - EVENT_PTR(intel_cqm_total_bytes_unit), - EVENT_PTR(intel_cqm_local_bytes_unit), - EVENT_PTR(intel_cqm_llc_scale), - EVENT_PTR(intel_cqm_total_bytes_scale), - EVENT_PTR(intel_cqm_local_bytes_scale), - EVENT_PTR(intel_cqm_llc_snapshot), - NULL, -}; - -static struct attribute_group intel_cqm_events_group = { - .name = "events", - .attrs = NULL, -}; - -PMU_FORMAT_ATTR(event, "config:0-7"); -static struct attribute *intel_cqm_formats_attr[] = { - &format_attr_event.attr, - NULL, -}; - -static struct attribute_group intel_cqm_format_group = { - .name = "format", - .attrs = intel_cqm_formats_attr, -}; - -static ssize_t -max_recycle_threshold_show(struct device *dev, struct device_attribute *attr, - char *page) -{ - ssize_t rv; - - mutex_lock(&cache_mutex); - rv = snprintf(page, PAGE_SIZE-1, "%u\n", __intel_cqm_max_threshold); - mutex_unlock(&cache_mutex); - - return rv; -} - -static ssize_t -max_recycle_threshold_store(struct device *dev, - struct device_attribute *attr, - const char *buf, size_t count) -{ - unsigned int bytes, cachelines; - int ret; - - ret = kstrtouint(buf, 0, &bytes); - if (ret) - return ret; - - mutex_lock(&cache_mutex); - - __intel_cqm_max_threshold = bytes; - cachelines = bytes / cqm_l3_scale; - - /* - * The new maximum takes effect immediately. 
- */ - if (__intel_cqm_threshold > cachelines) - __intel_cqm_threshold = cachelines; - - mutex_unlock(&cache_mutex); - - return count; -} - -static DEVICE_ATTR_RW(max_recycle_threshold); - -static struct attribute *intel_cqm_attrs[] = { - &dev_attr_max_recycle_threshold.attr, - NULL, -}; - -static const struct attribute_group intel_cqm_group = { - .attrs = intel_cqm_attrs, -}; - -static const struct attribute_group *intel_cqm_attr_groups[] = { - &intel_cqm_events_group, - &intel_cqm_format_group, - &intel_cqm_group, - NULL, -}; - -static struct pmu intel_cqm_pmu = { - .hrtimer_interval_ms = RMID_DEFAULT_QUEUE_TIME, - .attr_groups = intel_cqm_attr_groups, - .task_ctx_nr = perf_sw_context, - .event_init = intel_cqm_event_init, - .add = intel_cqm_event_add, - .del = intel_cqm_event_stop, - .start = intel_cqm_event_start, - .stop = intel_cqm_event_stop, - .read = intel_cqm_event_read, - .count = intel_cqm_event_count, -}; - -static inline void cqm_pick_event_reader(int cpu) -{ - int reader; - - /* First online cpu in package becomes the reader */ - reader = cpumask_any_and(&cqm_cpumask, topology_core_cpumask(cpu)); - if (reader >= nr_cpu_ids) - cpumask_set_cpu(cpu, &cqm_cpumask); -} - -static int intel_cqm_cpu_starting(unsigned int cpu) -{ - struct intel_pqr_state *state = &per_cpu(pqr_state, cpu); - struct cpuinfo_x86 *c = &cpu_data(cpu); - - state->rmid = 0; - state->closid = 0; - state->rmid_usecnt = 0; - - WARN_ON(c->x86_cache_max_rmid != cqm_max_rmid); - WARN_ON(c->x86_cache_occ_scale != cqm_l3_scale); - - cqm_pick_event_reader(cpu); - return 0; -} - -static int intel_cqm_cpu_exit(unsigned int cpu) -{ - int target; - - /* Is @cpu the current cqm reader for this package ? */ - if (!cpumask_test_and_clear_cpu(cpu, &cqm_cpumask)) - return 0; - - /* Find another online reader in this package */ - target = cpumask_any_but(topology_core_cpumask(cpu), cpu); - - if (target < nr_cpu_ids) - cpumask_set_cpu(target, &cqm_cpumask); - - return 0; -} - -static const struct x86_cpu_id intel_cqm_match[] = { - { .vendor = X86_VENDOR_INTEL, .feature = X86_FEATURE_CQM_OCCUP_LLC }, - {} -}; - -static void mbm_cleanup(void) -{ - if (!mbm_enabled) - return; - - kfree(mbm_local); - kfree(mbm_total); - mbm_enabled = false; -} - -static const struct x86_cpu_id intel_mbm_local_match[] = { - { .vendor = X86_VENDOR_INTEL, .feature = X86_FEATURE_CQM_MBM_LOCAL }, - {} -}; - -static const struct x86_cpu_id intel_mbm_total_match[] = { - { .vendor = X86_VENDOR_INTEL, .feature = X86_FEATURE_CQM_MBM_TOTAL }, - {} -}; - -static int intel_mbm_init(void) -{ - int ret = 0, array_size, maxid = cqm_max_rmid + 1; - - mbm_socket_max = topology_max_packages(); - array_size = sizeof(struct sample) * maxid * mbm_socket_max; - mbm_local = kmalloc(array_size, GFP_KERNEL); - if (!mbm_local) - return -ENOMEM; - - mbm_total = kmalloc(array_size, GFP_KERNEL); - if (!mbm_total) { - ret = -ENOMEM; - goto out; - } - - array_size = sizeof(struct hrtimer) * mbm_socket_max; - mbm_timers = kmalloc(array_size, GFP_KERNEL); - if (!mbm_timers) { - ret = -ENOMEM; - goto out; - } - mbm_hrtimer_init(); - -out: - if (ret) - mbm_cleanup(); - - return ret; -} - -static int __init intel_cqm_init(void) -{ - char *str = NULL, scale[20]; - int cpu, ret; - - if (x86_match_cpu(intel_cqm_match)) - cqm_enabled = true; - - if (x86_match_cpu(intel_mbm_local_match) && - x86_match_cpu(intel_mbm_total_match)) - mbm_enabled = true; - - if (!cqm_enabled && !mbm_enabled) - return -ENODEV; - - cqm_l3_scale = boot_cpu_data.x86_cache_occ_scale; - - /* - * It's possible that 
not all resources support the same number - * of RMIDs. Instead of making scheduling much more complicated - * (where we have to match a task's RMID to a cpu that supports - * that many RMIDs) just find the minimum RMIDs supported across - * all cpus. - * - * Also, check that the scales match on all cpus. - */ - cpus_read_lock(); - for_each_online_cpu(cpu) { - struct cpuinfo_x86 *c = &cpu_data(cpu); - - if (c->x86_cache_max_rmid < cqm_max_rmid) - cqm_max_rmid = c->x86_cache_max_rmid; - - if (c->x86_cache_occ_scale != cqm_l3_scale) { - pr_err("Multiple LLC scale values, disabling\n"); - ret = -EINVAL; - goto out; - } - } - - /* - * A reasonable upper limit on the max threshold is the number - * of lines tagged per RMID if all RMIDs have the same number of - * lines tagged in the LLC. - * - * For a 35MB LLC and 56 RMIDs, this is ~1.8% of the LLC. - */ - __intel_cqm_max_threshold = - boot_cpu_data.x86_cache_size * 1024 / (cqm_max_rmid + 1); - - snprintf(scale, sizeof(scale), "%u", cqm_l3_scale); - str = kstrdup(scale, GFP_KERNEL); - if (!str) { - ret = -ENOMEM; - goto out; - } - - event_attr_intel_cqm_llc_scale.event_str = str; - - ret = intel_cqm_setup_rmid_cache(); - if (ret) - goto out; - - if (mbm_enabled) - ret = intel_mbm_init(); - if (ret && !cqm_enabled) - goto out; - - if (cqm_enabled && mbm_enabled) - intel_cqm_events_group.attrs = intel_cmt_mbm_events_attr; - else if (!cqm_enabled && mbm_enabled) - intel_cqm_events_group.attrs = intel_mbm_events_attr; - else if (cqm_enabled && !mbm_enabled) - intel_cqm_events_group.attrs = intel_cqm_events_attr; - - ret = perf_pmu_register(&intel_cqm_pmu, "intel_cqm", -1); - if (ret) { - pr_err("Intel CQM perf registration failed: %d\n", ret); - goto out; - } - - if (cqm_enabled) - pr_info("Intel CQM monitoring enabled\n"); - if (mbm_enabled) - pr_info("Intel MBM enabled\n"); - - /* - * Setup the hot cpu notifier once we are sure cqm - * is enabled to avoid notifier leak. - */ - cpuhp_setup_state_cpuslocked(CPUHP_AP_PERF_X86_CQM_STARTING, - "perf/x86/cqm:starting", - intel_cqm_cpu_starting, NULL); - cpuhp_setup_state_cpuslocked(CPUHP_AP_PERF_X86_CQM_ONLINE, - "perf/x86/cqm:online", - NULL, intel_cqm_cpu_exit); -out: - cpus_read_unlock(); - - if (ret) { - kfree(str); - cqm_cleanup(); - mbm_cleanup(); - } - - return ret; -} -device_initcall(intel_cqm_init); diff --git a/arch/x86/include/asm/intel_rdt_common.h b/arch/x86/include/asm/intel_rdt_common.h index b31081b89407..c95321870842 100644 --- a/arch/x86/include/asm/intel_rdt_common.h +++ b/arch/x86/include/asm/intel_rdt_common.h @@ -7,7 +7,6 @@ * struct intel_pqr_state - State cache for the PQR MSR * @rmid: The cached Resource Monitoring ID * @closid: The cached Class Of Service ID - * @rmid_usecnt: The usage counter for rmid * * The upper 32 bits of MSR_IA32_PQR_ASSOC contain closid and the * lower 10 bits rmid. The update to MSR_IA32_PQR_ASSOC always @@ -19,7 +18,6 @@ struct intel_pqr_state { u32 rmid; u32 closid; - int rmid_usecnt; }; DECLARE_PER_CPU(struct intel_pqr_state, pqr_state); diff --git a/arch/x86/kernel/cpu/intel_rdt.c b/arch/x86/kernel/cpu/intel_rdt.c index 5b366462f579..989a997fce47 100644 --- a/arch/x86/kernel/cpu/intel_rdt.c +++ b/arch/x86/kernel/cpu/intel_rdt.c @@ -40,6 +40,14 @@ DEFINE_MUTEX(rdtgroup_mutex); DEFINE_PER_CPU_READ_MOSTLY(int, cpu_closid); +/* + * The cached intel_pqr_state is strictly per CPU and can never be + * updated from a remote CPU. 
Functions which modify the state + * are called with interrupts disabled and no preemption, which + * is sufficient for the protection. + */ +DEFINE_PER_CPU(struct intel_pqr_state, pqr_state); + /* * Used to store the max resource name width and max resource data width to display the schemata in a tabular format -- cgit v1.2.3 From f01d7d51f577b5dc0fa5919ab8a9228e2bf49f3e Mon Sep 17 00:00:00 2001 From: Vikas Shivappa Date: Tue, 25 Jul 2017 14:14:22 -0700 Subject: x86/intel_rdt: Introduce a common compile option for RDT We currently have CONFIG_INTEL_RDT_A, which is for the RDT (Resource Director Technology) allocation based resctrl filesystem interface. As a preparation to add support for RDT monitoring as well into the same resctrl filesystem, change the config option to CONFIG_INTEL_RDT, which includes both the RDT allocation and monitoring code. No functional change. Signed-off-by: Vikas Shivappa Signed-off-by: Thomas Gleixner Cc: ravi.v.shankar@intel.com Cc: tony.luck@intel.com Cc: fenghua.yu@intel.com Cc: peterz@infradead.org Cc: eranian@google.com Cc: vikas.shivappa@intel.com Cc: ak@linux.intel.com Cc: davidcc@google.com Cc: reinette.chatre@intel.com Link: http://lkml.kernel.org/r/1501017287-28083-4-git-send-email-vikas.shivappa@linux.intel.com --- arch/x86/Kconfig | 12 ++++++------ arch/x86/include/asm/intel_rdt.h | 4 ++-- arch/x86/kernel/cpu/Makefile | 2 +- include/linux/sched.h | 2 +- 4 files changed, 10 insertions(+), 10 deletions(-) (limited to 'arch') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 781521b7cf9e..b1abda70c2d1 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -424,16 +424,16 @@ config GOLDFISH def_bool y depends on X86_GOLDFISH -config INTEL_RDT_A - bool "Intel Resource Director Technology Allocation support" +config INTEL_RDT + bool "Intel Resource Director Technology support" default n depends on X86 && CPU_SUP_INTEL select KERNFS help - Select to enable resource allocation which is a sub-feature of - Intel Resource Director Technology(RDT). More information about - RDT can be found in the Intel x86 Architecture Software - Developer Manual. + Select to enable resource allocation and monitoring which are + sub-features of Intel Resource Director Technology(RDT). More + information about RDT can be found in the Intel x86 + Architecture Software Developer Manual. Say N if unsure.
diff --git a/arch/x86/include/asm/intel_rdt.h b/arch/x86/include/asm/intel_rdt.h index 597dc4995678..ae1efc3609d3 100644 --- a/arch/x86/include/asm/intel_rdt.h +++ b/arch/x86/include/asm/intel_rdt.h @@ -1,7 +1,7 @@ #ifndef _ASM_X86_INTEL_RDT_H #define _ASM_X86_INTEL_RDT_H -#ifdef CONFIG_INTEL_RDT_A +#ifdef CONFIG_INTEL_RDT #include #include @@ -282,5 +282,5 @@ static inline void intel_rdt_sched_in(void) static inline void intel_rdt_sched_in(void) {} -#endif /* CONFIG_INTEL_RDT_A */ +#endif /* CONFIG_INTEL_RDT */ #endif /* _ASM_X86_INTEL_RDT_H */ diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index cdf82492b770..3622ca2b1555 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -33,7 +33,7 @@ obj-$(CONFIG_CPU_SUP_CENTAUR) += centaur.o obj-$(CONFIG_CPU_SUP_TRANSMETA_32) += transmeta.o obj-$(CONFIG_CPU_SUP_UMC_32) += umc.o -obj-$(CONFIG_INTEL_RDT_A) += intel_rdt.o intel_rdt_rdtgroup.o intel_rdt_schemata.o +obj-$(CONFIG_INTEL_RDT) += intel_rdt.o intel_rdt_rdtgroup.o intel_rdt_schemata.o obj-$(CONFIG_X86_MCE) += mcheck/ obj-$(CONFIG_MTRR) += mtrr/ diff --git a/include/linux/sched.h b/include/linux/sched.h index 8337e2db0bb2..20b2ff2f4fde 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -898,7 +898,7 @@ struct task_struct { /* cg_list protected by css_set_lock and tsk->alloc_lock: */ struct list_head cg_list; #endif -#ifdef CONFIG_INTEL_RDT_A +#ifdef CONFIG_INTEL_RDT int closid; #endif #ifdef CONFIG_FUTEX -- cgit v1.2.3 From 0583020456cea9fcf43b84bb13a41eab059ae0a8 Mon Sep 17 00:00:00 2001 From: Vikas Shivappa Date: Tue, 25 Jul 2017 14:14:23 -0700 Subject: x86/intel_rdt: Change file names to accommodate RDT monitor code Because the "perf cqm" and resctrl code were separately added and individually configurable, there is separate context switch code and there are also things in the global .h which are not really needed. Move only the scheduling specific code and definitions to <asm/intel_rdt_sched.h> and put all the other declarations in a local intel_rdt.h. h/t to Reinette Chatre for pointing out that we should separate the public interfaces used by other parts of the kernel from private objects shared between the various files comprising RDT. No functional change.
Signed-off-by: Vikas Shivappa Signed-off-by: Thomas Gleixner Cc: ravi.v.shankar@intel.com Cc: tony.luck@intel.com Cc: fenghua.yu@intel.com Cc: peterz@infradead.org Cc: eranian@google.com Cc: vikas.shivappa@intel.com Cc: ak@linux.intel.com Cc: davidcc@google.com Cc: reinette.chatre@intel.com Link: http://lkml.kernel.org/r/1501017287-28083-5-git-send-email-vikas.shivappa@linux.intel.com --- MAINTAINERS | 2 +- arch/x86/include/asm/intel_rdt.h | 286 ------------------------------- arch/x86/include/asm/intel_rdt_common.h | 25 --- arch/x86/include/asm/intel_rdt_sched.h | 72 ++++++++ arch/x86/kernel/cpu/intel_rdt.c | 5 +- arch/x86/kernel/cpu/intel_rdt.h | 243 ++++++++++++++++++++++++++ arch/x86/kernel/cpu/intel_rdt_rdtgroup.c | 4 +- arch/x86/kernel/cpu/intel_rdt_schemata.c | 2 +- arch/x86/kernel/process_32.c | 2 +- arch/x86/kernel/process_64.c | 2 +- 10 files changed, 324 insertions(+), 319 deletions(-) delete mode 100644 arch/x86/include/asm/intel_rdt.h delete mode 100644 arch/x86/include/asm/intel_rdt_common.h create mode 100644 arch/x86/include/asm/intel_rdt_sched.h create mode 100644 arch/x86/kernel/cpu/intel_rdt.h (limited to 'arch') diff --git a/MAINTAINERS b/MAINTAINERS index f66488dfdbc9..cbbc79a81dea 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -11103,7 +11103,7 @@ M: Fenghua Yu L: linux-kernel@vger.kernel.org S: Supported F: arch/x86/kernel/cpu/intel_rdt* -F: arch/x86/include/asm/intel_rdt* +F: arch/x86/include/asm/intel_rdt_sched.h F: Documentation/x86/intel_rdt* READ-COPY UPDATE (RCU) diff --git a/arch/x86/include/asm/intel_rdt.h b/arch/x86/include/asm/intel_rdt.h deleted file mode 100644 index ae1efc3609d3..000000000000 --- a/arch/x86/include/asm/intel_rdt.h +++ /dev/null @@ -1,286 +0,0 @@ -#ifndef _ASM_X86_INTEL_RDT_H -#define _ASM_X86_INTEL_RDT_H - -#ifdef CONFIG_INTEL_RDT - -#include -#include -#include - -#include - -#define IA32_L3_QOS_CFG 0xc81 -#define IA32_L3_CBM_BASE 0xc90 -#define IA32_L2_CBM_BASE 0xd10 -#define IA32_MBA_THRTL_BASE 0xd50 - -#define L3_QOS_CDP_ENABLE 0x01ULL - -/** - * struct rdtgroup - store rdtgroup's data in resctrl file system. - * @kn: kernfs node - * @rdtgroup_list: linked list for all rdtgroups - * @closid: closid for this rdtgroup - * @cpu_mask: CPUs assigned to this rdtgroup - * @flags: status bits - * @waitcount: how many cpus expect to find this - * group when they acquire rdtgroup_mutex - */ -struct rdtgroup { - struct kernfs_node *kn; - struct list_head rdtgroup_list; - int closid; - struct cpumask cpu_mask; - int flags; - atomic_t waitcount; -}; - -/* rdtgroup.flags */ -#define RDT_DELETED 1 - -/* rftype.flags */ -#define RFTYPE_FLAGS_CPUS_LIST 1 - -/* List of all resource groups */ -extern struct list_head rdt_all_groups; - -extern int max_name_width, max_data_width; - -int __init rdtgroup_init(void); - -/** - * struct rftype - describe each file in the resctrl file system - * @name: File name - * @mode: Access mode - * @kf_ops: File operations - * @flags: File specific RFTYPE_FLAGS_* flags - * @seq_show: Show content of the file - * @write: Write to the file - */ -struct rftype { - char *name; - umode_t mode; - struct kernfs_ops *kf_ops; - unsigned long flags; - - int (*seq_show)(struct kernfs_open_file *of, - struct seq_file *sf, void *v); - /* - * write() is the generic write callback which maps directly to - * kernfs write operation and overrides all other operations. - * Maximum write size is determined by ->max_write_len. 
- */ - ssize_t (*write)(struct kernfs_open_file *of, - char *buf, size_t nbytes, loff_t off); -}; - -/** - * struct rdt_domain - group of cpus sharing an RDT resource - * @list: all instances of this resource - * @id: unique id for this instance - * @cpu_mask: which cpus share this resource - * @ctrl_val: array of cache or mem ctrl values (indexed by CLOSID) - * @new_ctrl: new ctrl value to be loaded - * @have_new_ctrl: did user provide new_ctrl for this domain - */ -struct rdt_domain { - struct list_head list; - int id; - struct cpumask cpu_mask; - u32 *ctrl_val; - u32 new_ctrl; - bool have_new_ctrl; -}; - -/** - * struct msr_param - set a range of MSRs from a domain - * @res: The resource to use - * @low: Beginning index from base MSR - * @high: End index - */ -struct msr_param { - struct rdt_resource *res; - int low; - int high; -}; - -/** - * struct rdt_cache - Cache allocation related data - * @cbm_len: Length of the cache bit mask - * @min_cbm_bits: Minimum number of consecutive bits to be set - * @cbm_idx_mult: Multiplier of CBM index - * @cbm_idx_offset: Offset of CBM index. CBM index is computed by: - * closid * cbm_idx_multi + cbm_idx_offset - * in a cache bit mask - */ -struct rdt_cache { - unsigned int cbm_len; - unsigned int min_cbm_bits; - unsigned int cbm_idx_mult; - unsigned int cbm_idx_offset; -}; - -/** - * struct rdt_membw - Memory bandwidth allocation related data - * @max_delay: Max throttle delay. Delay is the hardware - * representation for memory bandwidth. - * @min_bw: Minimum memory bandwidth percentage user can request - * @bw_gran: Granularity at which the memory bandwidth is allocated - * @delay_linear: True if memory B/W delay is in linear scale - * @mb_map: Mapping of memory B/W percentage to memory B/W delay - */ -struct rdt_membw { - u32 max_delay; - u32 min_bw; - u32 bw_gran; - u32 delay_linear; - u32 *mb_map; -}; - -/** - * struct rdt_resource - attributes of an RDT resource - * @enabled: Is this feature enabled on this machine - * @capable: Is this feature available on this machine - * @name: Name to use in "schemata" file - * @num_closid: Number of CLOSIDs available - * @cache_level: Which cache level defines scope of this resource - * @default_ctrl: Specifies default cache cbm or memory B/W percent. 
- * @msr_base: Base MSR address for CBMs - * @msr_update: Function pointer to update QOS MSRs - * @data_width: Character width of data when displaying - * @domains: All domains for this resource - * @cache: Cache allocation related data - * @info_files: resctrl info files for the resource - * @nr_info_files: Number of info files - * @format_str: Per resource format string to show domain value - * @parse_ctrlval: Per resource function pointer to parse control values - */ -struct rdt_resource { - bool enabled; - bool capable; - char *name; - int num_closid; - int cache_level; - u32 default_ctrl; - unsigned int msr_base; - void (*msr_update) (struct rdt_domain *d, struct msr_param *m, - struct rdt_resource *r); - int data_width; - struct list_head domains; - struct rdt_cache cache; - struct rdt_membw membw; - struct rftype *info_files; - int nr_info_files; - const char *format_str; - int (*parse_ctrlval) (char *buf, struct rdt_resource *r, - struct rdt_domain *d); -}; - -void rdt_get_cache_infofile(struct rdt_resource *r); -void rdt_get_mba_infofile(struct rdt_resource *r); -int parse_cbm(char *buf, struct rdt_resource *r, struct rdt_domain *d); -int parse_bw(char *buf, struct rdt_resource *r, struct rdt_domain *d); - -extern struct mutex rdtgroup_mutex; - -extern struct rdt_resource rdt_resources_all[]; -extern struct rdtgroup rdtgroup_default; -DECLARE_STATIC_KEY_FALSE(rdt_enable_key); - -int __init rdtgroup_init(void); - -enum { - RDT_RESOURCE_L3, - RDT_RESOURCE_L3DATA, - RDT_RESOURCE_L3CODE, - RDT_RESOURCE_L2, - RDT_RESOURCE_MBA, - - /* Must be the last */ - RDT_NUM_RESOURCES, -}; - -#define for_each_capable_rdt_resource(r) \ - for (r = rdt_resources_all; r < rdt_resources_all + RDT_NUM_RESOURCES;\ - r++) \ - if (r->capable) - -#define for_each_enabled_rdt_resource(r) \ - for (r = rdt_resources_all; r < rdt_resources_all + RDT_NUM_RESOURCES;\ - r++) \ - if (r->enabled) - -/* CPUID.(EAX=10H, ECX=ResID=1).EAX */ -union cpuid_0x10_1_eax { - struct { - unsigned int cbm_len:5; - } split; - unsigned int full; -}; - -/* CPUID.(EAX=10H, ECX=ResID=3).EAX */ -union cpuid_0x10_3_eax { - struct { - unsigned int max_delay:12; - } split; - unsigned int full; -}; - -/* CPUID.(EAX=10H, ECX=ResID).EDX */ -union cpuid_0x10_x_edx { - struct { - unsigned int cos_max:16; - } split; - unsigned int full; -}; - -DECLARE_PER_CPU_READ_MOSTLY(int, cpu_closid); - -void rdt_ctrl_update(void *arg); -struct rdtgroup *rdtgroup_kn_lock_live(struct kernfs_node *kn); -void rdtgroup_kn_unlock(struct kernfs_node *kn); -ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of, - char *buf, size_t nbytes, loff_t off); -int rdtgroup_schemata_show(struct kernfs_open_file *of, - struct seq_file *s, void *v); - -/* - * intel_rdt_sched_in() - Writes the task's CLOSid to IA32_PQR_MSR - * - * Following considerations are made so that this has minimal impact - * on scheduler hot path: - * - This will stay as no-op unless we are running on an Intel SKU - * which supports resource control and we enable by mounting the - * resctrl file system. - * - Caches the per cpu CLOSid values and does the MSR write only - * when a task with a different CLOSid is scheduled in. - * - * Must be called with preemption disabled. - */ -static inline void intel_rdt_sched_in(void) -{ - if (static_branch_likely(&rdt_enable_key)) { - struct intel_pqr_state *state = this_cpu_ptr(&pqr_state); - int closid; - - /* - * If this task has a closid assigned, use it. - * Else use the closid assigned to this cpu. 
- */ - closid = current->closid; - if (closid == 0) - closid = this_cpu_read(cpu_closid); - - if (closid != state->closid) { - state->closid = closid; - wrmsr(MSR_IA32_PQR_ASSOC, state->rmid, closid); - } - } -} - -#else - -static inline void intel_rdt_sched_in(void) {} - -#endif /* CONFIG_INTEL_RDT */ -#endif /* _ASM_X86_INTEL_RDT_H */ diff --git a/arch/x86/include/asm/intel_rdt_common.h b/arch/x86/include/asm/intel_rdt_common.h deleted file mode 100644 index c95321870842..000000000000 --- a/arch/x86/include/asm/intel_rdt_common.h +++ /dev/null @@ -1,25 +0,0 @@ -#ifndef _ASM_X86_INTEL_RDT_COMMON_H -#define _ASM_X86_INTEL_RDT_COMMON_H - -#define MSR_IA32_PQR_ASSOC 0x0c8f - -/** - * struct intel_pqr_state - State cache for the PQR MSR - * @rmid: The cached Resource Monitoring ID - * @closid: The cached Class Of Service ID - * - * The upper 32 bits of MSR_IA32_PQR_ASSOC contain closid and the - * lower 10 bits rmid. The update to MSR_IA32_PQR_ASSOC always - * contains both parts, so we need to cache them. - * - * The cache also helps to avoid pointless updates if the value does - * not change. - */ -struct intel_pqr_state { - u32 rmid; - u32 closid; -}; - -DECLARE_PER_CPU(struct intel_pqr_state, pqr_state); - -#endif /* _ASM_X86_INTEL_RDT_COMMON_H */ diff --git a/arch/x86/include/asm/intel_rdt_sched.h b/arch/x86/include/asm/intel_rdt_sched.h new file mode 100644 index 000000000000..62a70bc85bce --- /dev/null +++ b/arch/x86/include/asm/intel_rdt_sched.h @@ -0,0 +1,72 @@ +#ifndef _ASM_X86_INTEL_RDT_SCHED_H +#define _ASM_X86_INTEL_RDT_SCHED_H + +#ifdef CONFIG_INTEL_RDT + +#include +#include + +#define IA32_PQR_ASSOC 0x0c8f + +/** + * struct intel_pqr_state - State cache for the PQR MSR + * @rmid: The cached Resource Monitoring ID + * @closid: The cached Class Of Service ID + * + * The upper 32 bits of IA32_PQR_ASSOC contain closid and the + * lower 10 bits rmid. The update to IA32_PQR_ASSOC always + * contains both parts, so we need to cache them. + * + * The cache also helps to avoid pointless updates if the value does + * not change. + */ +struct intel_pqr_state { + u32 rmid; + u32 closid; +}; + +DECLARE_PER_CPU(struct intel_pqr_state, pqr_state); +DECLARE_PER_CPU_READ_MOSTLY(int, cpu_closid); +DECLARE_STATIC_KEY_FALSE(rdt_enable_key); + +/* + * intel_rdt_sched_in() - Writes the task's CLOSid to IA32_PQR_MSR + * + * Following considerations are made so that this has minimal impact + * on scheduler hot path: + * - This will stay as no-op unless we are running on an Intel SKU + * which supports resource control and we enable by mounting the + * resctrl file system. + * - Caches the per cpu CLOSid values and does the MSR write only + * when a task with a different CLOSid is scheduled in. + * + * Must be called with preemption disabled. + */ +static inline void intel_rdt_sched_in(void) +{ + if (static_branch_likely(&rdt_enable_key)) { + struct intel_pqr_state *state = this_cpu_ptr(&pqr_state); + int closid; + + /* + * If this task has a closid assigned, use it. + * Else use the closid assigned to this cpu. 
+ */ + closid = current->closid; + if (closid == 0) + closid = this_cpu_read(cpu_closid); + + if (closid != state->closid) { + state->closid = closid; + wrmsr(IA32_PQR_ASSOC, state->rmid, closid); + } + } +} + +#else + +static inline void intel_rdt_sched_in(void) {} + +#endif /* CONFIG_INTEL_RDT */ + +#endif /* _ASM_X86_INTEL_RDT_SCHED_H */ diff --git a/arch/x86/kernel/cpu/intel_rdt.c b/arch/x86/kernel/cpu/intel_rdt.c index 989a997fce47..08872e9e09c3 100644 --- a/arch/x86/kernel/cpu/intel_rdt.c +++ b/arch/x86/kernel/cpu/intel_rdt.c @@ -30,7 +30,8 @@ #include #include -#include +#include +#include "intel_rdt.h" #define MAX_MBA_BW 100u #define MBA_IS_LINEAR 0x4 @@ -455,7 +456,7 @@ static void clear_closid(int cpu) per_cpu(cpu_closid, cpu) = 0; state->closid = 0; - wrmsr(MSR_IA32_PQR_ASSOC, state->rmid, 0); + wrmsr(IA32_PQR_ASSOC, state->rmid, 0); } static int intel_rdt_online_cpu(unsigned int cpu) diff --git a/arch/x86/kernel/cpu/intel_rdt.h b/arch/x86/kernel/cpu/intel_rdt.h new file mode 100644 index 000000000000..0e4852d68faf --- /dev/null +++ b/arch/x86/kernel/cpu/intel_rdt.h @@ -0,0 +1,243 @@ +#ifndef _ASM_X86_INTEL_RDT_H +#define _ASM_X86_INTEL_RDT_H + +#include +#include +#include + +#define IA32_L3_QOS_CFG 0xc81 +#define IA32_L3_CBM_BASE 0xc90 +#define IA32_L2_CBM_BASE 0xd10 +#define IA32_MBA_THRTL_BASE 0xd50 + +#define L3_QOS_CDP_ENABLE 0x01ULL + +/** + * struct rdtgroup - store rdtgroup's data in resctrl file system. + * @kn: kernfs node + * @rdtgroup_list: linked list for all rdtgroups + * @closid: closid for this rdtgroup + * @cpu_mask: CPUs assigned to this rdtgroup + * @flags: status bits + * @waitcount: how many cpus expect to find this + * group when they acquire rdtgroup_mutex + */ +struct rdtgroup { + struct kernfs_node *kn; + struct list_head rdtgroup_list; + int closid; + struct cpumask cpu_mask; + int flags; + atomic_t waitcount; +}; + +/* rdtgroup.flags */ +#define RDT_DELETED 1 + +/* rftype.flags */ +#define RFTYPE_FLAGS_CPUS_LIST 1 + +/* List of all resource groups */ +extern struct list_head rdt_all_groups; + +extern int max_name_width, max_data_width; + +int __init rdtgroup_init(void); + +/** + * struct rftype - describe each file in the resctrl file system + * @name: File name + * @mode: Access mode + * @kf_ops: File operations + * @flags: File specific RFTYPE_FLAGS_* flags + * @seq_show: Show content of the file + * @write: Write to the file + */ +struct rftype { + char *name; + umode_t mode; + struct kernfs_ops *kf_ops; + unsigned long flags; + + int (*seq_show)(struct kernfs_open_file *of, + struct seq_file *sf, void *v); + /* + * write() is the generic write callback which maps directly to + * kernfs write operation and overrides all other operations. + * Maximum write size is determined by ->max_write_len. 
+ */ + ssize_t (*write)(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off); +}; + +/** + * struct rdt_domain - group of cpus sharing an RDT resource + * @list: all instances of this resource + * @id: unique id for this instance + * @cpu_mask: which cpus share this resource + * @ctrl_val: array of cache or mem ctrl values (indexed by CLOSID) + * @new_ctrl: new ctrl value to be loaded + * @have_new_ctrl: did user provide new_ctrl for this domain + */ +struct rdt_domain { + struct list_head list; + int id; + struct cpumask cpu_mask; + u32 *ctrl_val; + u32 new_ctrl; + bool have_new_ctrl; +}; + +/** + * struct msr_param - set a range of MSRs from a domain + * @res: The resource to use + * @low: Beginning index from base MSR + * @high: End index + */ +struct msr_param { + struct rdt_resource *res; + int low; + int high; +}; + +/** + * struct rdt_cache - Cache allocation related data + * @cbm_len: Length of the cache bit mask + * @min_cbm_bits: Minimum number of consecutive bits to be set + * @cbm_idx_mult: Multiplier of CBM index + * @cbm_idx_offset: Offset of CBM index. CBM index is computed by: + * closid * cbm_idx_multi + cbm_idx_offset + * in a cache bit mask + */ +struct rdt_cache { + unsigned int cbm_len; + unsigned int min_cbm_bits; + unsigned int cbm_idx_mult; + unsigned int cbm_idx_offset; +}; + +/** + * struct rdt_membw - Memory bandwidth allocation related data + * @max_delay: Max throttle delay. Delay is the hardware + * representation for memory bandwidth. + * @min_bw: Minimum memory bandwidth percentage user can request + * @bw_gran: Granularity at which the memory bandwidth is allocated + * @delay_linear: True if memory B/W delay is in linear scale + * @mb_map: Mapping of memory B/W percentage to memory B/W delay + */ +struct rdt_membw { + u32 max_delay; + u32 min_bw; + u32 bw_gran; + u32 delay_linear; + u32 *mb_map; +}; + +/** + * struct rdt_resource - attributes of an RDT resource + * @enabled: Is this feature enabled on this machine + * @capable: Is this feature available on this machine + * @name: Name to use in "schemata" file + * @num_closid: Number of CLOSIDs available + * @cache_level: Which cache level defines scope of this resource + * @default_ctrl: Specifies default cache cbm or memory B/W percent. 
+ * @msr_base: Base MSR address for CBMs + * @msr_update: Function pointer to update QOS MSRs + * @data_width: Character width of data when displaying + * @domains: All domains for this resource + * @cache: Cache allocation related data + * @info_files: resctrl info files for the resource + * @nr_info_files: Number of info files + * @format_str: Per resource format string to show domain value + * @parse_ctrlval: Per resource function pointer to parse control values + */ +struct rdt_resource { + bool enabled; + bool capable; + char *name; + int num_closid; + int cache_level; + u32 default_ctrl; + unsigned int msr_base; + void (*msr_update) (struct rdt_domain *d, struct msr_param *m, + struct rdt_resource *r); + int data_width; + struct list_head domains; + struct rdt_cache cache; + struct rdt_membw membw; + struct rftype *info_files; + int nr_info_files; + const char *format_str; + int (*parse_ctrlval) (char *buf, struct rdt_resource *r, + struct rdt_domain *d); +}; + +void rdt_get_cache_infofile(struct rdt_resource *r); +void rdt_get_mba_infofile(struct rdt_resource *r); +int parse_cbm(char *buf, struct rdt_resource *r, struct rdt_domain *d); +int parse_bw(char *buf, struct rdt_resource *r, struct rdt_domain *d); + +extern struct mutex rdtgroup_mutex; + +extern struct rdt_resource rdt_resources_all[]; +extern struct rdtgroup rdtgroup_default; +DECLARE_STATIC_KEY_FALSE(rdt_enable_key); + +int __init rdtgroup_init(void); + +enum { + RDT_RESOURCE_L3, + RDT_RESOURCE_L3DATA, + RDT_RESOURCE_L3CODE, + RDT_RESOURCE_L2, + RDT_RESOURCE_MBA, + + /* Must be the last */ + RDT_NUM_RESOURCES, +}; + +#define for_each_capable_rdt_resource(r) \ + for (r = rdt_resources_all; r < rdt_resources_all + RDT_NUM_RESOURCES;\ + r++) \ + if (r->capable) + +#define for_each_enabled_rdt_resource(r) \ + for (r = rdt_resources_all; r < rdt_resources_all + RDT_NUM_RESOURCES;\ + r++) \ + if (r->enabled) + +/* CPUID.(EAX=10H, ECX=ResID=1).EAX */ +union cpuid_0x10_1_eax { + struct { + unsigned int cbm_len:5; + } split; + unsigned int full; +}; + +/* CPUID.(EAX=10H, ECX=ResID=3).EAX */ +union cpuid_0x10_3_eax { + struct { + unsigned int max_delay:12; + } split; + unsigned int full; +}; + +/* CPUID.(EAX=10H, ECX=ResID).EDX */ +union cpuid_0x10_x_edx { + struct { + unsigned int cos_max:16; + } split; + unsigned int full; +}; + +DECLARE_PER_CPU_READ_MOSTLY(int, cpu_closid); + +void rdt_ctrl_update(void *arg); +struct rdtgroup *rdtgroup_kn_lock_live(struct kernfs_node *kn); +void rdtgroup_kn_unlock(struct kernfs_node *kn); +ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off); +int rdtgroup_schemata_show(struct kernfs_open_file *of, + struct seq_file *s, void *v); + +#endif /* _ASM_X86_INTEL_RDT_H */ diff --git a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c index 9257bd9dc664..fab88117ae5f 100644 --- a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c +++ b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c @@ -32,8 +32,8 @@ #include -#include -#include +#include +#include "intel_rdt.h" DEFINE_STATIC_KEY_FALSE(rdt_enable_key); struct kernfs_root *rdt_root; diff --git a/arch/x86/kernel/cpu/intel_rdt_schemata.c b/arch/x86/kernel/cpu/intel_rdt_schemata.c index 406d7a6532f9..8cef1c8e223a 100644 --- a/arch/x86/kernel/cpu/intel_rdt_schemata.c +++ b/arch/x86/kernel/cpu/intel_rdt_schemata.c @@ -26,7 +26,7 @@ #include #include #include -#include +#include "intel_rdt.h" /* * Check whether MBA bandwidth percentage value is correct. 
The value is diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index c6d6dc5f8bb2..22802162eeb9 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -56,7 +56,7 @@ #include #include #include -#include +#include #include void __show_regs(struct pt_regs *regs, int all) diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index c3169be4c596..77a35c817b2b 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -52,7 +52,7 @@ #include #include #include -#include +#include #include #ifdef CONFIG_IA32_EMULATION /* Not included via unistd.h */ -- cgit v1.2.3 From cb2200e967c65519ca6c5426644a49dca65f6294 Mon Sep 17 00:00:00 2001 From: Reinette Chatre Date: Tue, 25 Jul 2017 14:14:24 -0700 Subject: x86/intel_rdt: Mark rdt_root and closid_alloc as static Sparse reports that both of these can be static. Make it so. Signed-off-by: Reinette Chatre Signed-off-by: Vikas Shivappa Signed-off-by: Thomas Gleixner Cc: ravi.v.shankar@intel.com Cc: tony.luck@intel.com Cc: fenghua.yu@intel.com Cc: peterz@infradead.org Cc: eranian@google.com Cc: vikas.shivappa@intel.com Cc: ak@linux.intel.com Cc: davidcc@google.com Link: http://lkml.kernel.org/r/1501017287-28083-6-git-send-email-vikas.shivappa@linux.intel.com --- arch/x86/kernel/cpu/intel_rdt_rdtgroup.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c index fab88117ae5f..3273e88ab29d 100644 --- a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c +++ b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c @@ -36,7 +36,7 @@ #include "intel_rdt.h" DEFINE_STATIC_KEY_FALSE(rdt_enable_key); -struct kernfs_root *rdt_root; +static struct kernfs_root *rdt_root; struct rdtgroup rdtgroup_default; LIST_HEAD(rdt_all_groups); @@ -75,7 +75,7 @@ static void closid_init(void) closid_free_map &= ~1; } -int closid_alloc(void) +static int closid_alloc(void) { int closid = ffs(closid_free_map); -- cgit v1.2.3 From 1b5c0b7583173b787b5c93ff89838a950d0e23ff Mon Sep 17 00:00:00 2001 From: Vikas Shivappa Date: Tue, 25 Jul 2017 14:14:25 -0700 Subject: x86/intel_rdt: Cleanup namespace to support RDT monitoring Few of the data-structures have generic names although they are RDT allocation specific. Rename them to be allocation specific to accommodate RDT monitoring. E.g. s/enabled/alloc_enabled/ No functional change. 
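With the allocation-specific names in place, the later monitoring patches can guard the two capabilities independently at the same call site; domain_add_cpu() ends up doing roughly the following (sketch only, matching the shape of the real code):

        if (r->alloc_capable && domain_setup_ctrlval(r, d)) {
                kfree(d);
                return;
        }

        /* Added by the RDT monitoring series: */
        if (r->mon_capable && domain_setup_mon_state(r, d)) {
                kfree(d);
                return;
        }
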
Signed-off-by: Vikas Shivappa Signed-off-by: Thomas Gleixner Cc: ravi.v.shankar@intel.com Cc: tony.luck@intel.com Cc: fenghua.yu@intel.com Cc: peterz@infradead.org Cc: eranian@google.com Cc: vikas.shivappa@intel.com Cc: ak@linux.intel.com Cc: davidcc@google.com Cc: reinette.chatre@intel.com Link: http://lkml.kernel.org/r/1501017287-28083-7-git-send-email-vikas.shivappa@linux.intel.com --- arch/x86/include/asm/intel_rdt_sched.h | 4 ++-- arch/x86/kernel/cpu/intel_rdt.c | 26 ++++++++++++------------- arch/x86/kernel/cpu/intel_rdt.h | 18 ++++++++--------- arch/x86/kernel/cpu/intel_rdt_rdtgroup.c | 33 ++++++++++++++++---------------- arch/x86/kernel/cpu/intel_rdt_schemata.c | 8 ++++---- 5 files changed, 45 insertions(+), 44 deletions(-) (limited to 'arch') diff --git a/arch/x86/include/asm/intel_rdt_sched.h b/arch/x86/include/asm/intel_rdt_sched.h index 62a70bc85bce..4dee77b6de07 100644 --- a/arch/x86/include/asm/intel_rdt_sched.h +++ b/arch/x86/include/asm/intel_rdt_sched.h @@ -27,7 +27,7 @@ struct intel_pqr_state { DECLARE_PER_CPU(struct intel_pqr_state, pqr_state); DECLARE_PER_CPU_READ_MOSTLY(int, cpu_closid); -DECLARE_STATIC_KEY_FALSE(rdt_enable_key); +DECLARE_STATIC_KEY_FALSE(rdt_alloc_enable_key); /* * intel_rdt_sched_in() - Writes the task's CLOSid to IA32_PQR_MSR @@ -44,7 +44,7 @@ DECLARE_STATIC_KEY_FALSE(rdt_enable_key); */ static inline void intel_rdt_sched_in(void) { - if (static_branch_likely(&rdt_enable_key)) { + if (static_branch_likely(&rdt_alloc_enable_key)) { struct intel_pqr_state *state = this_cpu_ptr(&pqr_state); int closid; diff --git a/arch/x86/kernel/cpu/intel_rdt.c b/arch/x86/kernel/cpu/intel_rdt.c index 08872e9e09c3..835e1ff8449f 100644 --- a/arch/x86/kernel/cpu/intel_rdt.c +++ b/arch/x86/kernel/cpu/intel_rdt.c @@ -173,8 +173,8 @@ static inline bool cache_alloc_hsw_probe(void) r->default_ctrl = max_cbm; r->cache.cbm_len = 20; r->cache.min_cbm_bits = 2; - r->capable = true; - r->enabled = true; + r->alloc_capable = true; + r->alloc_enabled = true; return true; } @@ -224,8 +224,8 @@ static bool rdt_get_mem_config(struct rdt_resource *r) r->data_width = 3; rdt_get_mba_infofile(r); - r->capable = true; - r->enabled = true; + r->alloc_capable = true; + r->alloc_enabled = true; return true; } @@ -242,8 +242,8 @@ static void rdt_get_cache_config(int idx, struct rdt_resource *r) r->default_ctrl = BIT_MASK(eax.split.cbm_len + 1) - 1; r->data_width = (r->cache.cbm_len + 3) / 4; rdt_get_cache_infofile(r); - r->capable = true; - r->enabled = true; + r->alloc_capable = true; + r->alloc_enabled = true; } static void rdt_get_cdp_l3_config(int type) @@ -255,12 +255,12 @@ static void rdt_get_cdp_l3_config(int type) r->cache.cbm_len = r_l3->cache.cbm_len; r->default_ctrl = r_l3->default_ctrl; r->data_width = (r->cache.cbm_len + 3) / 4; - r->capable = true; + r->alloc_capable = true; /* * By default, CDP is disabled. CDP can be enabled by mount parameter * "cdp" during resctrl file system mount time. */ - r->enabled = false; + r->alloc_enabled = false; } static int get_cache_id(int cpu, int level) @@ -422,7 +422,7 @@ static void domain_add_cpu(int cpu, struct rdt_resource *r) d->id = id; - if (domain_setup_ctrlval(r, d)) { + if (r->alloc_capable && domain_setup_ctrlval(r, d)) { kfree(d); return; } @@ -464,7 +464,7 @@ static int intel_rdt_online_cpu(unsigned int cpu) struct rdt_resource *r; mutex_lock(&rdtgroup_mutex); - for_each_capable_rdt_resource(r) + for_each_alloc_capable_rdt_resource(r) domain_add_cpu(cpu, r); /* The cpu is set in default rdtgroup after online. 
*/ cpumask_set_cpu(cpu, &rdtgroup_default.cpu_mask); @@ -480,7 +480,7 @@ static int intel_rdt_offline_cpu(unsigned int cpu) struct rdt_resource *r; mutex_lock(&rdtgroup_mutex); - for_each_capable_rdt_resource(r) + for_each_alloc_capable_rdt_resource(r) domain_remove_cpu(cpu, r); list_for_each_entry(rdtgrp, &rdt_all_groups, rdtgroup_list) { if (cpumask_test_and_clear_cpu(cpu, &rdtgrp->cpu_mask)) @@ -501,7 +501,7 @@ static __init void rdt_init_padding(void) struct rdt_resource *r; int cl; - for_each_capable_rdt_resource(r) { + for_each_alloc_capable_rdt_resource(r) { cl = strlen(r->name); if (cl > max_name_width) max_name_width = cl; @@ -565,7 +565,7 @@ static int __init intel_rdt_late_init(void) return ret; } - for_each_capable_rdt_resource(r) + for_each_alloc_capable_rdt_resource(r) pr_info("Intel RDT %s allocation detected\n", r->name); return 0; diff --git a/arch/x86/kernel/cpu/intel_rdt.h b/arch/x86/kernel/cpu/intel_rdt.h index 0e4852d68faf..29630af30c38 100644 --- a/arch/x86/kernel/cpu/intel_rdt.h +++ b/arch/x86/kernel/cpu/intel_rdt.h @@ -135,8 +135,8 @@ struct rdt_membw { /** * struct rdt_resource - attributes of an RDT resource - * @enabled: Is this feature enabled on this machine - * @capable: Is this feature available on this machine + * @alloc_enabled: Is allocation enabled on this machine + * @alloc_capable: Is allocation available on this machine * @name: Name to use in "schemata" file * @num_closid: Number of CLOSIDs available * @cache_level: Which cache level defines scope of this resource @@ -152,8 +152,8 @@ struct rdt_membw { * @parse_ctrlval: Per resource function pointer to parse control values */ struct rdt_resource { - bool enabled; - bool capable; + bool alloc_enabled; + bool alloc_capable; char *name; int num_closid; int cache_level; @@ -181,7 +181,7 @@ extern struct mutex rdtgroup_mutex; extern struct rdt_resource rdt_resources_all[]; extern struct rdtgroup rdtgroup_default; -DECLARE_STATIC_KEY_FALSE(rdt_enable_key); +DECLARE_STATIC_KEY_FALSE(rdt_alloc_enable_key); int __init rdtgroup_init(void); @@ -196,15 +196,15 @@ enum { RDT_NUM_RESOURCES, }; -#define for_each_capable_rdt_resource(r) \ +#define for_each_alloc_capable_rdt_resource(r) \ for (r = rdt_resources_all; r < rdt_resources_all + RDT_NUM_RESOURCES;\ r++) \ - if (r->capable) + if (r->alloc_capable) -#define for_each_enabled_rdt_resource(r) \ +#define for_each_alloc_enabled_rdt_resource(r) \ for (r = rdt_resources_all; r < rdt_resources_all + RDT_NUM_RESOURCES;\ r++) \ - if (r->enabled) + if (r->alloc_enabled) /* CPUID.(EAX=10H, ECX=ResID=1).EAX */ union cpuid_0x10_1_eax { diff --git a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c index 3273e88ab29d..ce63d1011e03 100644 --- a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c +++ b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c @@ -35,7 +35,7 @@ #include #include "intel_rdt.h" -DEFINE_STATIC_KEY_FALSE(rdt_enable_key); +DEFINE_STATIC_KEY_FALSE(rdt_alloc_enable_key); static struct kernfs_root *rdt_root; struct rdtgroup rdtgroup_default; LIST_HEAD(rdt_all_groups); @@ -66,7 +66,7 @@ static void closid_init(void) int rdt_min_closid = 32; /* Compute rdt_min_closid across all resources */ - for_each_enabled_rdt_resource(r) + for_each_alloc_enabled_rdt_resource(r) rdt_min_closid = min(rdt_min_closid, r->num_closid); closid_free_map = BIT_MASK(rdt_min_closid) - 1; @@ -638,7 +638,7 @@ static int rdtgroup_create_info_dir(struct kernfs_node *parent_kn) return PTR_ERR(kn_info); kernfs_get(kn_info); - for_each_enabled_rdt_resource(r) { + 
for_each_alloc_enabled_rdt_resource(r) { kn_subdir = kernfs_create_dir(kn_info, r->name, kn_info->mode, r); if (IS_ERR(kn_subdir)) { @@ -718,14 +718,15 @@ static int cdp_enable(void) struct rdt_resource *r_l3 = &rdt_resources_all[RDT_RESOURCE_L3]; int ret; - if (!r_l3->capable || !r_l3data->capable || !r_l3code->capable) + if (!r_l3->alloc_capable || !r_l3data->alloc_capable || + !r_l3code->alloc_capable) return -EINVAL; ret = set_l3_qos_cfg(r_l3, true); if (!ret) { - r_l3->enabled = false; - r_l3data->enabled = true; - r_l3code->enabled = true; + r_l3->alloc_enabled = false; + r_l3data->alloc_enabled = true; + r_l3code->alloc_enabled = true; } return ret; } @@ -734,11 +735,11 @@ static void cdp_disable(void) { struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3]; - r->enabled = r->capable; + r->alloc_enabled = r->alloc_capable; - if (rdt_resources_all[RDT_RESOURCE_L3DATA].enabled) { - rdt_resources_all[RDT_RESOURCE_L3DATA].enabled = false; - rdt_resources_all[RDT_RESOURCE_L3CODE].enabled = false; + if (rdt_resources_all[RDT_RESOURCE_L3DATA].alloc_enabled) { + rdt_resources_all[RDT_RESOURCE_L3DATA].alloc_enabled = false; + rdt_resources_all[RDT_RESOURCE_L3CODE].alloc_enabled = false; set_l3_qos_cfg(r, false); } } @@ -834,7 +835,7 @@ static struct dentry *rdt_mount(struct file_system_type *fs_type, /* * resctrl file system can only be mounted once. */ - if (static_branch_unlikely(&rdt_enable_key)) { + if (static_branch_unlikely(&rdt_alloc_enable_key)) { dentry = ERR_PTR(-EBUSY); goto out; } @@ -858,7 +859,7 @@ static struct dentry *rdt_mount(struct file_system_type *fs_type, if (IS_ERR(dentry)) goto out_destroy; - static_branch_enable(&rdt_enable_key); + static_branch_enable(&rdt_alloc_enable_key); goto out; out_destroy: @@ -986,11 +987,11 @@ static void rdt_kill_sb(struct super_block *sb) mutex_lock(&rdtgroup_mutex); /*Put everything back to default values. 
*/ - for_each_enabled_rdt_resource(r) + for_each_alloc_enabled_rdt_resource(r) reset_all_ctrls(r); cdp_disable(); rmdir_all_sub(); - static_branch_disable(&rdt_enable_key); + static_branch_disable(&rdt_alloc_enable_key); kernfs_kill_sb(sb); mutex_unlock(&rdtgroup_mutex); } @@ -1129,7 +1130,7 @@ out: static int rdtgroup_show_options(struct seq_file *seq, struct kernfs_root *kf) { - if (rdt_resources_all[RDT_RESOURCE_L3DATA].enabled) + if (rdt_resources_all[RDT_RESOURCE_L3DATA].alloc_enabled) seq_puts(seq, ",cdp"); return 0; } diff --git a/arch/x86/kernel/cpu/intel_rdt_schemata.c b/arch/x86/kernel/cpu/intel_rdt_schemata.c index 8cef1c8e223a..952156c2688d 100644 --- a/arch/x86/kernel/cpu/intel_rdt_schemata.c +++ b/arch/x86/kernel/cpu/intel_rdt_schemata.c @@ -192,7 +192,7 @@ static int rdtgroup_parse_resource(char *resname, char *tok, int closid) { struct rdt_resource *r; - for_each_enabled_rdt_resource(r) { + for_each_alloc_enabled_rdt_resource(r) { if (!strcmp(resname, r->name) && closid < r->num_closid) return parse_line(tok, r); } @@ -221,7 +221,7 @@ ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of, closid = rdtgrp->closid; - for_each_enabled_rdt_resource(r) { + for_each_alloc_enabled_rdt_resource(r) { list_for_each_entry(dom, &r->domains, list) dom->have_new_ctrl = false; } @@ -237,7 +237,7 @@ ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of, goto out; } - for_each_enabled_rdt_resource(r) { + for_each_alloc_enabled_rdt_resource(r) { ret = update_domains(r, closid); if (ret) goto out; @@ -274,7 +274,7 @@ int rdtgroup_schemata_show(struct kernfs_open_file *of, rdtgrp = rdtgroup_kn_lock_live(of->kn); if (rdtgrp) { closid = rdtgrp->closid; - for_each_enabled_rdt_resource(r) { + for_each_alloc_enabled_rdt_resource(r) { if (closid < r->num_closid) show_doms(s, r, closid); } -- cgit v1.2.3 From dd131853f3fbc1c3aa051c34a2967c2f76309024 Mon Sep 17 00:00:00 2001 From: Vikas Shivappa Date: Tue, 25 Jul 2017 14:14:26 -0700 Subject: x86/intel_rdt: Make rdt_resources_all more readable Change the format of the global rdt_resources_all. This holds all the RDT resource structure initialization values. Make this more readable by using the format: rdt_resources_all[] = { [] = {... } ... 
} Signed-off-by: Vikas Shivappa Signed-off-by: Thomas Gleixner Cc: ravi.v.shankar@intel.com Cc: tony.luck@intel.com Cc: fenghua.yu@intel.com Cc: peterz@infradead.org Cc: eranian@google.com Cc: vikas.shivappa@intel.com Cc: ak@linux.intel.com Cc: davidcc@google.com Cc: reinette.chatre@intel.com Link: http://lkml.kernel.org/r/1501017287-28083-8-git-send-email-vikas.shivappa@linux.intel.com --- arch/x86/kernel/cpu/intel_rdt.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'arch') diff --git a/arch/x86/kernel/cpu/intel_rdt.c b/arch/x86/kernel/cpu/intel_rdt.c index 835e1ff8449f..98715c50ee4c 100644 --- a/arch/x86/kernel/cpu/intel_rdt.c +++ b/arch/x86/kernel/cpu/intel_rdt.c @@ -63,6 +63,7 @@ cat_wrmsr(struct rdt_domain *d, struct msr_param *m, struct rdt_resource *r); #define domain_init(id) LIST_HEAD_INIT(rdt_resources_all[id].domains) struct rdt_resource rdt_resources_all[] = { + [RDT_RESOURCE_L3] = { .name = "L3", .domains = domain_init(RDT_RESOURCE_L3), @@ -77,6 +78,7 @@ struct rdt_resource rdt_resources_all[] = { .parse_ctrlval = parse_cbm, .format_str = "%d=%0*x", }, + [RDT_RESOURCE_L3DATA] = { .name = "L3DATA", .domains = domain_init(RDT_RESOURCE_L3DATA), @@ -91,6 +93,7 @@ struct rdt_resource rdt_resources_all[] = { .parse_ctrlval = parse_cbm, .format_str = "%d=%0*x", }, + [RDT_RESOURCE_L3CODE] = { .name = "L3CODE", .domains = domain_init(RDT_RESOURCE_L3CODE), @@ -105,6 +108,7 @@ struct rdt_resource rdt_resources_all[] = { .parse_ctrlval = parse_cbm, .format_str = "%d=%0*x", }, + [RDT_RESOURCE_L2] = { .name = "L2", .domains = domain_init(RDT_RESOURCE_L2), @@ -119,6 +123,7 @@ struct rdt_resource rdt_resources_all[] = { .parse_ctrlval = parse_cbm, .format_str = "%d=%0*x", }, + [RDT_RESOURCE_MBA] = { .name = "MB", .domains = domain_init(RDT_RESOURCE_MBA), -- cgit v1.2.3 From 6a445edce657810992594c7b9e679219aaf78ad9 Mon Sep 17 00:00:00 2001 From: Vikas Shivappa Date: Tue, 25 Jul 2017 14:14:27 -0700 Subject: x86/intel_rdt/cqm: Add RDT monitoring initialization Add common data structures for RDT resource monitoring and perform RDT monitoring related data structure initializations which include setting up the RMID(Resource monitoring ID) lists and event list which the resource supports. [ tony: some cleanup to make adding MBM easier later, remove "cqm" from some names, make some data structure local to intel_rdt_monitor.c static. 
Add copyright header] [ tglx: Made it readable ] Signed-off-by: Vikas Shivappa Signed-off-by: Tony Luck Signed-off-by: Thomas Gleixner Cc: ravi.v.shankar@intel.com Cc: fenghua.yu@intel.com Cc: peterz@infradead.org Cc: eranian@google.com Cc: vikas.shivappa@intel.com Cc: ak@linux.intel.com Cc: davidcc@google.com Cc: reinette.chatre@intel.com Link: http://lkml.kernel.org/r/1501017287-28083-9-git-send-email-vikas.shivappa@linux.intel.com --- arch/x86/kernel/cpu/Makefile | 2 +- arch/x86/kernel/cpu/intel_rdt.c | 41 +++++++- arch/x86/kernel/cpu/intel_rdt.h | 44 +++++++++ arch/x86/kernel/cpu/intel_rdt_monitor.c | 161 ++++++++++++++++++++++++++++++++ 4 files changed, 242 insertions(+), 6 deletions(-) create mode 100644 arch/x86/kernel/cpu/intel_rdt_monitor.c (limited to 'arch') diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index 3622ca2b1555..7584be0750c4 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -33,7 +33,7 @@ obj-$(CONFIG_CPU_SUP_CENTAUR) += centaur.o obj-$(CONFIG_CPU_SUP_TRANSMETA_32) += transmeta.o obj-$(CONFIG_CPU_SUP_UMC_32) += umc.o -obj-$(CONFIG_INTEL_RDT) += intel_rdt.o intel_rdt_rdtgroup.o intel_rdt_schemata.o +obj-$(CONFIG_INTEL_RDT) += intel_rdt.o intel_rdt_rdtgroup.o intel_rdt_schemata.o intel_rdt_monitor.o obj-$(CONFIG_X86_MCE) += mcheck/ obj-$(CONFIG_MTRR) += mtrr/ diff --git a/arch/x86/kernel/cpu/intel_rdt.c b/arch/x86/kernel/cpu/intel_rdt.c index 98715c50ee4c..cb520dd73bc8 100644 --- a/arch/x86/kernel/cpu/intel_rdt.c +++ b/arch/x86/kernel/cpu/intel_rdt.c @@ -55,6 +55,12 @@ DEFINE_PER_CPU(struct intel_pqr_state, pqr_state); */ int max_name_width, max_data_width; +/* + * Global boolean for rdt_alloc which is true if any + * resource allocation is enabled. + */ +bool rdt_alloc_capable; + static void mba_wrmsr(struct rdt_domain *d, struct msr_param *m, struct rdt_resource *r); static void @@ -235,7 +241,7 @@ static bool rdt_get_mem_config(struct rdt_resource *r) return true; } -static void rdt_get_cache_config(int idx, struct rdt_resource *r) +static void rdt_get_cache_alloc_cfg(int idx, struct rdt_resource *r) { union cpuid_0x10_1_eax eax; union cpuid_0x10_x_edx edx; @@ -516,7 +522,7 @@ static __init void rdt_init_padding(void) } } -static __init bool get_rdt_resources(void) +static __init bool get_rdt_alloc_resources(void) { bool ret = false; @@ -527,7 +533,7 @@ static __init bool get_rdt_resources(void) return false; if (boot_cpu_has(X86_FEATURE_CAT_L3)) { - rdt_get_cache_config(1, &rdt_resources_all[RDT_RESOURCE_L3]); + rdt_get_cache_alloc_cfg(1, &rdt_resources_all[RDT_RESOURCE_L3]); if (boot_cpu_has(X86_FEATURE_CDP_L3)) { rdt_get_cdp_l3_config(RDT_RESOURCE_L3DATA); rdt_get_cdp_l3_config(RDT_RESOURCE_L3CODE); @@ -536,7 +542,7 @@ static __init bool get_rdt_resources(void) } if (boot_cpu_has(X86_FEATURE_CAT_L2)) { /* CPUID 0x10.2 fields are same format at 0x10.1 */ - rdt_get_cache_config(2, &rdt_resources_all[RDT_RESOURCE_L2]); + rdt_get_cache_alloc_cfg(2, &rdt_resources_all[RDT_RESOURCE_L2]); ret = true; } @@ -544,10 +550,32 @@ static __init bool get_rdt_resources(void) if (rdt_get_mem_config(&rdt_resources_all[RDT_RESOURCE_MBA])) ret = true; } - return ret; } +static __init bool get_rdt_mon_resources(void) +{ + if (boot_cpu_has(X86_FEATURE_CQM_OCCUP_LLC)) + rdt_mon_features |= (1 << QOS_L3_OCCUP_EVENT_ID); + if (boot_cpu_has(X86_FEATURE_CQM_MBM_TOTAL)) + rdt_mon_features |= (1 << QOS_L3_MBM_TOTAL_EVENT_ID); + if (boot_cpu_has(X86_FEATURE_CQM_MBM_LOCAL)) + rdt_mon_features |= (1 << QOS_L3_MBM_LOCAL_EVENT_ID); + + if 
(!rdt_mon_features) + return false; + + return !rdt_get_mon_l3_config(&rdt_resources_all[RDT_RESOURCE_L3]); +} + +static __init bool get_rdt_resources(void) +{ + rdt_alloc_capable = get_rdt_alloc_resources(); + rdt_mon_capable = get_rdt_mon_resources(); + + return (rdt_mon_capable || rdt_alloc_capable); +} + static int __init intel_rdt_late_init(void) { struct rdt_resource *r; @@ -573,6 +601,9 @@ static int __init intel_rdt_late_init(void) for_each_alloc_capable_rdt_resource(r) pr_info("Intel RDT %s allocation detected\n", r->name); + for_each_mon_capable_rdt_resource(r) + pr_info("Intel RDT %s monitoring detected\n", r->name); + return 0; } diff --git a/arch/x86/kernel/cpu/intel_rdt.h b/arch/x86/kernel/cpu/intel_rdt.h index 29630af30c38..993ab9d678bc 100644 --- a/arch/x86/kernel/cpu/intel_rdt.h +++ b/arch/x86/kernel/cpu/intel_rdt.h @@ -12,6 +12,29 @@ #define L3_QOS_CDP_ENABLE 0x01ULL +/* + * Event IDs are used to program IA32_QM_EVTSEL before reading event + * counter from IA32_QM_CTR + */ +#define QOS_L3_OCCUP_EVENT_ID 0x01 +#define QOS_L3_MBM_TOTAL_EVENT_ID 0x02 +#define QOS_L3_MBM_LOCAL_EVENT_ID 0x03 + +/** + * struct mon_evt - Entry in the event list of a resource + * @evtid: event id + * @name: name of the event + */ +struct mon_evt { + u32 evtid; + char *name; + struct list_head list; +}; + +extern unsigned int intel_cqm_threshold; +extern bool rdt_alloc_capable; +extern bool rdt_mon_capable; +extern unsigned int rdt_mon_features; /** * struct rdtgroup - store rdtgroup's data in resctrl file system. * @kn: kernfs node @@ -133,10 +156,17 @@ struct rdt_membw { u32 *mb_map; }; +static inline bool is_llc_occupancy_enabled(void) +{ + return (rdt_mon_features & (1 << QOS_L3_OCCUP_EVENT_ID)); +} + /** * struct rdt_resource - attributes of an RDT resource * @alloc_enabled: Is allocation enabled on this machine + * @mon_enabled: Is monitoring enabled for this feature * @alloc_capable: Is allocation available on this machine + * @mon_capable: Is monitor feature available on this machine * @name: Name to use in "schemata" file * @num_closid: Number of CLOSIDs available * @cache_level: Which cache level defines scope of this resource @@ -150,10 +180,15 @@ struct rdt_membw { * @nr_info_files: Number of info files * @format_str: Per resource format string to show domain value * @parse_ctrlval: Per resource function pointer to parse control values + * @evt_list: List of monitoring events + * @num_rmid: Number of RMIDs available + * @mon_scale: cqm counter * mon_scale = occupancy in bytes */ struct rdt_resource { bool alloc_enabled; + bool mon_enabled; bool alloc_capable; + bool mon_capable; char *name; int num_closid; int cache_level; @@ -170,6 +205,9 @@ struct rdt_resource { const char *format_str; int (*parse_ctrlval) (char *buf, struct rdt_resource *r, struct rdt_domain *d); + struct list_head evt_list; + int num_rmid; + unsigned int mon_scale; }; void rdt_get_cache_infofile(struct rdt_resource *r); @@ -201,6 +239,11 @@ enum { r++) \ if (r->alloc_capable) +#define for_each_mon_capable_rdt_resource(r) \ + for (r = rdt_resources_all; r < rdt_resources_all + RDT_NUM_RESOURCES;\ + r++) \ + if (r->mon_capable) + #define for_each_alloc_enabled_rdt_resource(r) \ for (r = rdt_resources_all; r < rdt_resources_all + RDT_NUM_RESOURCES;\ r++) \ @@ -239,5 +282,6 @@ ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off); int rdtgroup_schemata_show(struct kernfs_open_file *of, struct seq_file *s, void *v); +int rdt_get_mon_l3_config(struct rdt_resource *r); #endif 
/* _ASM_X86_INTEL_RDT_H */ diff --git a/arch/x86/kernel/cpu/intel_rdt_monitor.c b/arch/x86/kernel/cpu/intel_rdt_monitor.c new file mode 100644 index 000000000000..f6d43c3eba4e --- /dev/null +++ b/arch/x86/kernel/cpu/intel_rdt_monitor.c @@ -0,0 +1,161 @@ +/* + * Resource Director Technology(RDT) + * - Monitoring code + * + * Copyright (C) 2017 Intel Corporation + * + * Author: + * Vikas Shivappa + * + * This replaces the cqm.c based on perf but we reuse a lot of + * code and datastructures originally from Peter Zijlstra and Matt Fleming. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * More information about RDT be found in the Intel (R) x86 Architecture + * Software Developer Manual June 2016, volume 3, section 17.17. + */ + +#include +#include +#include +#include "intel_rdt.h" + +struct rmid_entry { + u32 rmid; + struct list_head list; +}; + +/** + * @rmid_free_lru A least recently used list of free RMIDs + * These RMIDs are guaranteed to have an occupancy less than the + * threshold occupancy + */ +static LIST_HEAD(rmid_free_lru); + +/** + * @rmid_limbo_lru list of currently unused but (potentially) + * dirty RMIDs. + * This list contains RMIDs that no one is currently using but that + * may have a occupancy value > intel_cqm_threshold. User can change + * the threshold occupancy value. + */ +static LIST_HEAD(rmid_limbo_lru); + +/** + * @rmid_entry - The entry in the limbo and free lists. + */ +static struct rmid_entry *rmid_ptrs; + +/* + * Global boolean for rdt_monitor which is true if any + * resource monitoring is enabled. + */ +bool rdt_mon_capable; + +/* + * Global to indicate which monitoring events are enabled. + */ +unsigned int rdt_mon_features; + +/* + * This is the threshold cache occupancy at which we will consider an + * RMID available for re-allocation. + */ +unsigned int intel_cqm_threshold; + +static inline struct rmid_entry *__rmid_entry(u32 rmid) +{ + struct rmid_entry *entry; + + entry = &rmid_ptrs[rmid]; + WARN_ON(entry->rmid != rmid); + + return entry; +} + +static int dom_data_init(struct rdt_resource *r) +{ + struct rmid_entry *entry = NULL; + int i, nr_rmids; + + nr_rmids = r->num_rmid; + rmid_ptrs = kcalloc(nr_rmids, sizeof(struct rmid_entry), GFP_KERNEL); + if (!rmid_ptrs) + return -ENOMEM; + + for (i = 0; i < nr_rmids; i++) { + entry = &rmid_ptrs[i]; + INIT_LIST_HEAD(&entry->list); + + entry->rmid = i; + list_add_tail(&entry->list, &rmid_free_lru); + } + + /* + * RMID 0 is special and is always allocated. It's used for all + * tasks that are not monitored. + */ + entry = __rmid_entry(0); + list_del(&entry->list); + + return 0; +} + +static struct mon_evt llc_occupancy_event = { + .name = "llc_occupancy", + .evtid = QOS_L3_OCCUP_EVENT_ID, +}; + +/* + * Initialize the event list for the resource. + * + * Note that MBM events are also part of RDT_RESOURCE_L3 resource + * because as per the SDM the total and local memory bandwidth + * are enumerated as part of L3 monitoring. 
+ */ +static void l3_mon_evt_init(struct rdt_resource *r) +{ + INIT_LIST_HEAD(&r->evt_list); + + if (is_llc_occupancy_enabled()) + list_add_tail(&llc_occupancy_event.list, &r->evt_list); +} + +int rdt_get_mon_l3_config(struct rdt_resource *r) +{ + int ret; + + r->mon_scale = boot_cpu_data.x86_cache_occ_scale; + r->num_rmid = boot_cpu_data.x86_cache_max_rmid + 1; + + /* + * A reasonable upper limit on the max threshold is the number + * of lines tagged per RMID if all RMIDs have the same number of + * lines tagged in the LLC. + * + * For a 35MB LLC and 56 RMIDs, this is ~1.8% of the LLC. + */ + intel_cqm_threshold = boot_cpu_data.x86_cache_size * 1024 / r->num_rmid; + + /* h/w works in units of "boot_cpu_data.x86_cache_occ_scale" */ + intel_cqm_threshold /= r->mon_scale; + + ret = dom_data_init(r); + if (ret) + return ret; + + l3_mon_evt_init(r); + + r->mon_capable = true; + r->mon_enabled = true; + + return 0; +} -- cgit v1.2.3 From edf6fa1c4a951b3a03e94b63e6483c5d9da3ab11 Mon Sep 17 00:00:00 2001 From: Vikas Shivappa Date: Tue, 25 Jul 2017 14:14:28 -0700 Subject: x86/intel_rdt/cqm: Add RMID (Resource monitoring ID) management Hardware uses RMID(Resource monitoring ID) to keep track of each of the RDT events associated with tasks. The number of RMIDs is dependent on the SKU and is enumerated via CPUID. We add support to manage the RMIDs which include managing the RMID allocation and reading LLC occupancy for an RMID. RMID allocation is managed by keeping a free list which is initialized to all available RMIDs except for RMID 0 which is always reserved for root group. RMIDs goto a limbo list once they are freed since the RMIDs are still tagged to cache lines of the tasks which were using them - thereby still having some occupancy. They continue to be in limbo list until the occupancy < threshold_occupancy. The threshold_occupancy is a user configurable value. OS uses IA32_QM_CTR MSR to read the occupancy associated with an RMID after programming the IA32_EVENTSEL MSR with the RMID. 
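The occupancy read itself boils down to the following sequence (a condensed sketch of the __rmid_read() helper added below; error handling is elided here):

        u64 val;

        /* Program event id (low bits) and RMID (high bits), then read */
        wrmsr(MSR_IA32_QM_EVTSEL, QOS_L3_OCCUP_EVENT_ID, rmid);
        rdmsrl(MSR_IA32_QM_CTR, val);

        if (val & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL)) {
                /* Hardware reported an error or has no data for this RMID */
        } else {
                /* Bits 61:0 hold the occupancy count in units of mon_scale bytes */
        }

RMIDs are handed out with alloc_rmid() and returned with free_rmid(); a freed RMID that still shows occupancy above the threshold is parked on the limbo list until its reading drops below intel_cqm_threshold.
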
[Tony: Improved limbo search] Signed-off-by: Vikas Shivappa Signed-off-by: Thomas Gleixner Cc: ravi.v.shankar@intel.com Cc: tony.luck@intel.com Cc: fenghua.yu@intel.com Cc: peterz@infradead.org Cc: eranian@google.com Cc: vikas.shivappa@intel.com Cc: ak@linux.intel.com Cc: davidcc@google.com Cc: reinette.chatre@intel.com Link: http://lkml.kernel.org/r/1501017287-28083-10-git-send-email-vikas.shivappa@linux.intel.com --- arch/x86/kernel/cpu/intel_rdt.c | 32 +++++ arch/x86/kernel/cpu/intel_rdt.h | 6 + arch/x86/kernel/cpu/intel_rdt_monitor.c | 213 ++++++++++++++++++++++++++++++++ 3 files changed, 251 insertions(+) (limited to 'arch') diff --git a/arch/x86/kernel/cpu/intel_rdt.c b/arch/x86/kernel/cpu/intel_rdt.c index cb520dd73bc8..d30830a8eafd 100644 --- a/arch/x86/kernel/cpu/intel_rdt.c +++ b/arch/x86/kernel/cpu/intel_rdt.c @@ -320,6 +320,19 @@ cat_wrmsr(struct rdt_domain *d, struct msr_param *m, struct rdt_resource *r) wrmsrl(r->msr_base + cbm_idx(r, i), d->ctrl_val[i]); } +struct rdt_domain *get_domain_from_cpu(int cpu, struct rdt_resource *r) +{ + struct rdt_domain *d; + + list_for_each_entry(d, &r->domains, list) { + /* Find the domain that contains this CPU */ + if (cpumask_test_cpu(cpu, &d->cpu_mask)) + return d; + } + + return NULL; +} + void rdt_ctrl_update(void *arg) { struct msr_param *m = arg; @@ -397,6 +410,19 @@ static int domain_setup_ctrlval(struct rdt_resource *r, struct rdt_domain *d) return 0; } +static int domain_setup_mon_state(struct rdt_resource *r, struct rdt_domain *d) +{ + if (is_llc_occupancy_enabled()) { + d->rmid_busy_llc = kcalloc(BITS_TO_LONGS(r->num_rmid), + sizeof(unsigned long), + GFP_KERNEL); + if (!d->rmid_busy_llc) + return -ENOMEM; + } + + return 0; +} + /* * domain_add_cpu - Add a cpu to a resource's domain list. * @@ -438,6 +464,11 @@ static void domain_add_cpu(int cpu, struct rdt_resource *r) return; } + if (r->mon_capable && domain_setup_mon_state(r, d)) { + kfree(d); + return; + } + cpumask_set_cpu(cpu, &d->cpu_mask); list_add_tail(&d->list, add_pos); } @@ -456,6 +487,7 @@ static void domain_remove_cpu(int cpu, struct rdt_resource *r) cpumask_clear_cpu(cpu, &d->cpu_mask); if (cpumask_empty(&d->cpu_mask)) { kfree(d->ctrl_val); + kfree(d->rmid_busy_llc); list_del(&d->list); kfree(d); } diff --git a/arch/x86/kernel/cpu/intel_rdt.h b/arch/x86/kernel/cpu/intel_rdt.h index 993ab9d678bc..35bf8eb8437c 100644 --- a/arch/x86/kernel/cpu/intel_rdt.h +++ b/arch/x86/kernel/cpu/intel_rdt.h @@ -19,6 +19,8 @@ #define QOS_L3_OCCUP_EVENT_ID 0x01 #define QOS_L3_MBM_TOTAL_EVENT_ID 0x02 #define QOS_L3_MBM_LOCAL_EVENT_ID 0x03 +#define RMID_VAL_ERROR BIT_ULL(63) +#define RMID_VAL_UNAVAIL BIT_ULL(62) /** * struct mon_evt - Entry in the event list of a resource @@ -98,6 +100,8 @@ struct rftype { * @list: all instances of this resource * @id: unique id for this instance * @cpu_mask: which cpus share this resource + * @rmid_busy_llc: + * bitmap of which limbo RMIDs are above threshold * @ctrl_val: array of cache or mem ctrl values (indexed by CLOSID) * @new_ctrl: new ctrl value to be loaded * @have_new_ctrl: did user provide new_ctrl for this domain @@ -106,6 +110,7 @@ struct rdt_domain { struct list_head list; int id; struct cpumask cpu_mask; + unsigned long *rmid_busy_llc; u32 *ctrl_val; u32 new_ctrl; bool have_new_ctrl; @@ -282,6 +287,7 @@ ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off); int rdtgroup_schemata_show(struct kernfs_open_file *of, struct seq_file *s, void *v); +struct rdt_domain *get_domain_from_cpu(int cpu, 
struct rdt_resource *r); int rdt_get_mon_l3_config(struct rdt_resource *r); #endif /* _ASM_X86_INTEL_RDT_H */ diff --git a/arch/x86/kernel/cpu/intel_rdt_monitor.c b/arch/x86/kernel/cpu/intel_rdt_monitor.c index f6d43c3eba4e..b6732c51829b 100644 --- a/arch/x86/kernel/cpu/intel_rdt_monitor.c +++ b/arch/x86/kernel/cpu/intel_rdt_monitor.c @@ -28,8 +28,12 @@ #include #include "intel_rdt.h" +#define MSR_IA32_QM_CTR 0x0c8e +#define MSR_IA32_QM_EVTSEL 0x0c8d + struct rmid_entry { u32 rmid; + atomic_t busy; struct list_head list; }; @@ -81,6 +85,215 @@ static inline struct rmid_entry *__rmid_entry(u32 rmid) return entry; } +static u64 __rmid_read(u32 rmid, u32 eventid) +{ + u64 val; + + /* + * As per the SDM, when IA32_QM_EVTSEL.EvtID (bits 7:0) is configured + * with a valid event code for supported resource type and the bits + * IA32_QM_EVTSEL.RMID (bits 41:32) are configured with valid RMID, + * IA32_QM_CTR.data (bits 61:0) reports the monitored data. + * IA32_QM_CTR.Error (bit 63) and IA32_QM_CTR.Unavailable (bit 62) + * are error bits. + */ + wrmsr(MSR_IA32_QM_EVTSEL, eventid, rmid); + rdmsrl(MSR_IA32_QM_CTR, val); + + return val; +} + +/* + * Walk the limbo list looking at any RMIDs that are flagged in the + * domain rmid_busy_llc bitmap as busy. If the reported LLC occupancy + * is below the threshold clear the busy bit and decrement the count. + * If the busy count gets to zero on an RMID we stop looking. + * This can be called from an IPI. + * We need an atomic for the busy count because multiple CPUs may check + * the same RMID at the same time. + */ +static bool __check_limbo(struct rdt_domain *d) +{ + struct rmid_entry *entry; + u64 val; + + list_for_each_entry(entry, &rmid_limbo_lru, list) { + if (!test_bit(entry->rmid, d->rmid_busy_llc)) + continue; + val = __rmid_read(entry->rmid, QOS_L3_OCCUP_EVENT_ID); + if (val <= intel_cqm_threshold) { + clear_bit(entry->rmid, d->rmid_busy_llc); + if (atomic_dec_and_test(&entry->busy)) + return true; + } + } + return false; +} + +static void check_limbo(void *arg) +{ + struct rdt_domain *d; + + d = get_domain_from_cpu(smp_processor_id(), + &rdt_resources_all[RDT_RESOURCE_L3]); + + if (d) + __check_limbo(d); +} + +static bool has_busy_rmid(struct rdt_resource *r, struct rdt_domain *d) +{ + return find_first_bit(d->rmid_busy_llc, r->num_rmid) != r->num_rmid; +} + +/* + * Scan the limbo list and move all entries that are below the + * intel_cqm_threshold to the free list. + * Return "true" if the limbo list is empty, "false" if there are + * still some RMIDs there. + */ +static bool try_freeing_limbo_rmid(void) +{ + struct rmid_entry *entry, *tmp; + struct rdt_resource *r; + cpumask_var_t cpu_mask; + struct rdt_domain *d; + bool ret = true; + int cpu; + + if (list_empty(&rmid_limbo_lru)) + return ret; + + r = &rdt_resources_all[RDT_RESOURCE_L3]; + + cpu = get_cpu(); + + /* + * First see if we can free up an RMID by checking busy values + * on the local package. 
+ */ + d = get_domain_from_cpu(cpu, r); + if (d && has_busy_rmid(r, d) && __check_limbo(d)) { + list_for_each_entry_safe(entry, tmp, &rmid_limbo_lru, list) { + if (atomic_read(&entry->busy) == 0) { + list_del(&entry->list); + list_add_tail(&entry->list, &rmid_free_lru); + goto done; + } + } + } + + if (!zalloc_cpumask_var(&cpu_mask, GFP_KERNEL)) { + ret = false; + goto done; + } + + /* + * Build a mask of other domains that have busy RMIDs + */ + list_for_each_entry(d, &r->domains, list) { + if (!cpumask_test_cpu(cpu, &d->cpu_mask) && + has_busy_rmid(r, d)) + cpumask_set_cpu(cpumask_any(&d->cpu_mask), cpu_mask); + } + if (cpumask_empty(cpu_mask)) { + ret = false; + goto free_mask; + } + + /* + * Scan domains with busy RMIDs to check if they still are busy + */ + on_each_cpu_mask(cpu_mask, check_limbo, NULL, true); + + /* Walk limbo list moving all free RMIDs to the &rmid_free_lru list */ + list_for_each_entry_safe(entry, tmp, &rmid_limbo_lru, list) { + if (atomic_read(&entry->busy) != 0) { + ret = false; + continue; + } + list_del(&entry->list); + list_add_tail(&entry->list, &rmid_free_lru); + } + +free_mask: + free_cpumask_var(cpu_mask); +done: + put_cpu(); + return ret; +} + +/* + * As of now the RMIDs allocation is global. + * However we keep track of which packages the RMIDs + * are used to optimize the limbo list management. + */ +int alloc_rmid(void) +{ + struct rmid_entry *entry; + bool ret; + + lockdep_assert_held(&rdtgroup_mutex); + + if (list_empty(&rmid_free_lru)) { + ret = try_freeing_limbo_rmid(); + if (list_empty(&rmid_free_lru)) + return ret ? -ENOSPC : -EBUSY; + } + + entry = list_first_entry(&rmid_free_lru, + struct rmid_entry, list); + list_del(&entry->list); + + return entry->rmid; +} + +static void add_rmid_to_limbo(struct rmid_entry *entry) +{ + struct rdt_resource *r; + struct rdt_domain *d; + int cpu, nbusy = 0; + u64 val; + + r = &rdt_resources_all[RDT_RESOURCE_L3]; + + cpu = get_cpu(); + list_for_each_entry(d, &r->domains, list) { + if (cpumask_test_cpu(cpu, &d->cpu_mask)) { + val = __rmid_read(entry->rmid, QOS_L3_OCCUP_EVENT_ID); + if (val <= intel_cqm_threshold) + continue; + } + set_bit(entry->rmid, d->rmid_busy_llc); + nbusy++; + } + put_cpu(); + + if (nbusy) { + atomic_set(&entry->busy, nbusy); + list_add_tail(&entry->list, &rmid_limbo_lru); + } else { + list_add_tail(&entry->list, &rmid_free_lru); + } +} + +void free_rmid(u32 rmid) +{ + struct rmid_entry *entry; + + if (!rmid) + return; + + lockdep_assert_held(&rdtgroup_mutex); + + entry = __rmid_entry(rmid); + + if (is_llc_occupancy_enabled()) + add_rmid_to_limbo(entry); + else + list_add_tail(&entry->list, &rmid_free_lru); +} + static int dom_data_init(struct rdt_resource *r) { struct rmid_entry *entry = NULL; -- cgit v1.2.3 From 5dc1d5c6bac2cfe3420cf353dfb0ef2e543f7c10 Mon Sep 17 00:00:00 2001 From: Tony luck Date: Tue, 25 Jul 2017 14:14:29 -0700 Subject: x86/intel_rdt: Simplify info and base file lists The info directory files and base files need to be different for each resource like cache and Memory bandwidth. With in each resource, the files would be further different for monitoring and ctrl. This leads to a lot of different static array declarations given that we are adding resctrl monitoring. Simplify this to one common list of files and then declare a set of flags to choose the files based on the resource, whether it is info or base and if it is control type file. This is as a preparation to include monitoring based info and base files. No functional change. 
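The selection is a plain mask test against each entry's fflags, as done by the reworked rdtgroup_add_files() below. For example (illustrative values, error handling elided), populating the info directory of a cache resource passes the control-info flag plus the cache resource bit:

        unsigned long fflags = RF_CTRL_INFO | RFTYPE_RES_CACHE;
        struct rftype *rft;

        for (rft = res_common_files;
             rft < res_common_files + ARRAY_SIZE(res_common_files); rft++) {
                /* Create the file only if all of its flag bits were requested */
                if ((fflags & rft->fflags) == rft->fflags)
                        rdtgroup_add_file(kn, rft);
        }

With those flags "num_closids", "cbm_mask" and "min_cbm_bits" are created, while the memory bandwidth info files (RFTYPE_RES_MB) and the per-group base files (RFTYPE_BASE) are skipped.
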
[Vikas: Extended the flags to have few bits per category like resource, info/base etc] Signed-off-by: Tony luck Signed-off-by: Vikas Shivappa Signed-off-by: Thomas Gleixner Cc: ravi.v.shankar@intel.com Cc: fenghua.yu@intel.com Cc: peterz@infradead.org Cc: eranian@google.com Cc: vikas.shivappa@intel.com Cc: ak@linux.intel.com Cc: davidcc@google.com Cc: reinette.chatre@intel.com Link: http://lkml.kernel.org/r/1501017287-28083-11-git-send-email-vikas.shivappa@linux.intel.com --- arch/x86/kernel/cpu/intel_rdt.c | 7 +- arch/x86/kernel/cpu/intel_rdt.h | 22 +++- arch/x86/kernel/cpu/intel_rdt_rdtgroup.c | 187 +++++++++++++++---------------- 3 files changed, 112 insertions(+), 104 deletions(-) (limited to 'arch') diff --git a/arch/x86/kernel/cpu/intel_rdt.c b/arch/x86/kernel/cpu/intel_rdt.c index d30830a8eafd..7cae4ec75cfe 100644 --- a/arch/x86/kernel/cpu/intel_rdt.c +++ b/arch/x86/kernel/cpu/intel_rdt.c @@ -83,6 +83,7 @@ struct rdt_resource rdt_resources_all[] = { }, .parse_ctrlval = parse_cbm, .format_str = "%d=%0*x", + .fflags = RFTYPE_RES_CACHE, }, [RDT_RESOURCE_L3DATA] = { @@ -98,6 +99,7 @@ struct rdt_resource rdt_resources_all[] = { }, .parse_ctrlval = parse_cbm, .format_str = "%d=%0*x", + .fflags = RFTYPE_RES_CACHE, }, [RDT_RESOURCE_L3CODE] = { @@ -113,6 +115,7 @@ struct rdt_resource rdt_resources_all[] = { }, .parse_ctrlval = parse_cbm, .format_str = "%d=%0*x", + .fflags = RFTYPE_RES_CACHE, }, [RDT_RESOURCE_L2] = { @@ -128,6 +131,7 @@ struct rdt_resource rdt_resources_all[] = { }, .parse_ctrlval = parse_cbm, .format_str = "%d=%0*x", + .fflags = RFTYPE_RES_CACHE, }, [RDT_RESOURCE_MBA] = { @@ -138,6 +142,7 @@ struct rdt_resource rdt_resources_all[] = { .cache_level = 3, .parse_ctrlval = parse_bw, .format_str = "%d=%*d", + .fflags = RFTYPE_RES_MB, }, }; @@ -233,7 +238,6 @@ static bool rdt_get_mem_config(struct rdt_resource *r) return false; } r->data_width = 3; - rdt_get_mba_infofile(r); r->alloc_capable = true; r->alloc_enabled = true; @@ -252,7 +256,6 @@ static void rdt_get_cache_alloc_cfg(int idx, struct rdt_resource *r) r->cache.cbm_len = eax.split.cbm_len + 1; r->default_ctrl = BIT_MASK(eax.split.cbm_len + 1) - 1; r->data_width = (r->cache.cbm_len + 3) / 4; - rdt_get_cache_infofile(r); r->alloc_capable = true; r->alloc_enabled = true; } diff --git a/arch/x86/kernel/cpu/intel_rdt.h b/arch/x86/kernel/cpu/intel_rdt.h index 35bf8eb8437c..aecbe77c8354 100644 --- a/arch/x86/kernel/cpu/intel_rdt.h +++ b/arch/x86/kernel/cpu/intel_rdt.h @@ -62,6 +62,18 @@ struct rdtgroup { /* rftype.flags */ #define RFTYPE_FLAGS_CPUS_LIST 1 +/* + * Define the file type flags for base and info directories. 
+ */ +#define RFTYPE_INFO BIT(0) +#define RFTYPE_BASE BIT(1) +#define RF_CTRLSHIFT 4 +#define RFTYPE_CTRL BIT(RF_CTRLSHIFT) +#define RFTYPE_RES_CACHE BIT(8) +#define RFTYPE_RES_MB BIT(9) +#define RF_CTRL_INFO (RFTYPE_INFO | RFTYPE_CTRL) +#define RF_CTRL_BASE (RFTYPE_BASE | RFTYPE_CTRL) + /* List of all resource groups */ extern struct list_head rdt_all_groups; @@ -75,6 +87,7 @@ int __init rdtgroup_init(void); * @mode: Access mode * @kf_ops: File operations * @flags: File specific RFTYPE_FLAGS_* flags + * @fflags: File specific RF_* or RFTYPE_* flags * @seq_show: Show content of the file * @write: Write to the file */ @@ -83,6 +96,7 @@ struct rftype { umode_t mode; struct kernfs_ops *kf_ops; unsigned long flags; + unsigned long fflags; int (*seq_show)(struct kernfs_open_file *of, struct seq_file *sf, void *v); @@ -181,13 +195,12 @@ static inline bool is_llc_occupancy_enabled(void) * @data_width: Character width of data when displaying * @domains: All domains for this resource * @cache: Cache allocation related data - * @info_files: resctrl info files for the resource - * @nr_info_files: Number of info files * @format_str: Per resource format string to show domain value * @parse_ctrlval: Per resource function pointer to parse control values * @evt_list: List of monitoring events * @num_rmid: Number of RMIDs available * @mon_scale: cqm counter * mon_scale = occupancy in bytes + * @fflags: flags to choose base and info files */ struct rdt_resource { bool alloc_enabled; @@ -205,18 +218,15 @@ struct rdt_resource { struct list_head domains; struct rdt_cache cache; struct rdt_membw membw; - struct rftype *info_files; - int nr_info_files; const char *format_str; int (*parse_ctrlval) (char *buf, struct rdt_resource *r, struct rdt_domain *d); struct list_head evt_list; int num_rmid; unsigned int mon_scale; + unsigned long fflags; }; -void rdt_get_cache_infofile(struct rdt_resource *r); -void rdt_get_mba_infofile(struct rdt_resource *r); int parse_cbm(char *buf, struct rdt_resource *r, struct rdt_domain *d); int parse_bw(char *buf, struct rdt_resource *r, struct rdt_domain *d); diff --git a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c index ce63d1011e03..7627b31937e9 100644 --- a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c +++ b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c @@ -125,28 +125,6 @@ static int rdtgroup_add_file(struct kernfs_node *parent_kn, struct rftype *rft) return 0; } -static int rdtgroup_add_files(struct kernfs_node *kn, struct rftype *rfts, - int len) -{ - struct rftype *rft; - int ret; - - lockdep_assert_held(&rdtgroup_mutex); - - for (rft = rfts; rft < rfts + len; rft++) { - ret = rdtgroup_add_file(kn, rft); - if (ret) - goto error; - } - - return 0; -error: - pr_warn("Failed to add %s, err=%d\n", rft->name, ret); - while (--rft >= rfts) - kernfs_remove_by_name(kn, rft->name); - return ret; -} - static int rdtgroup_seqfile_show(struct seq_file *m, void *arg) { struct kernfs_open_file *of = m->private; @@ -476,39 +454,6 @@ static int rdtgroup_tasks_show(struct kernfs_open_file *of, return ret; } -/* Files in each rdtgroup */ -static struct rftype rdtgroup_base_files[] = { - { - .name = "cpus", - .mode = 0644, - .kf_ops = &rdtgroup_kf_single_ops, - .write = rdtgroup_cpus_write, - .seq_show = rdtgroup_cpus_show, - }, - { - .name = "cpus_list", - .mode = 0644, - .kf_ops = &rdtgroup_kf_single_ops, - .write = rdtgroup_cpus_write, - .seq_show = rdtgroup_cpus_show, - .flags = RFTYPE_FLAGS_CPUS_LIST, - }, - { - .name = "tasks", - .mode = 0644, - .kf_ops = 
&rdtgroup_kf_single_ops, - .write = rdtgroup_tasks_write, - .seq_show = rdtgroup_tasks_show, - }, - { - .name = "schemata", - .mode = 0644, - .kf_ops = &rdtgroup_kf_single_ops, - .write = rdtgroup_schemata_write, - .seq_show = rdtgroup_schemata_show, - }, -}; - static int rdt_num_closids_show(struct kernfs_open_file *of, struct seq_file *seq, void *v) { @@ -564,73 +509,140 @@ static int rdt_delay_linear_show(struct kernfs_open_file *of, } /* rdtgroup information files for one cache resource. */ -static struct rftype res_cache_info_files[] = { +static struct rftype res_common_files[] = { { .name = "num_closids", .mode = 0444, .kf_ops = &rdtgroup_kf_single_ops, .seq_show = rdt_num_closids_show, + .fflags = RF_CTRL_INFO, }, { .name = "cbm_mask", .mode = 0444, .kf_ops = &rdtgroup_kf_single_ops, .seq_show = rdt_default_ctrl_show, + .fflags = RF_CTRL_INFO | RFTYPE_RES_CACHE, }, { .name = "min_cbm_bits", .mode = 0444, .kf_ops = &rdtgroup_kf_single_ops, .seq_show = rdt_min_cbm_bits_show, - }, -}; - -/* rdtgroup information files for memory bandwidth. */ -static struct rftype res_mba_info_files[] = { - { - .name = "num_closids", - .mode = 0444, - .kf_ops = &rdtgroup_kf_single_ops, - .seq_show = rdt_num_closids_show, + .fflags = RF_CTRL_INFO | RFTYPE_RES_CACHE, }, { .name = "min_bandwidth", .mode = 0444, .kf_ops = &rdtgroup_kf_single_ops, .seq_show = rdt_min_bw_show, + .fflags = RF_CTRL_INFO | RFTYPE_RES_MB, }, { .name = "bandwidth_gran", .mode = 0444, .kf_ops = &rdtgroup_kf_single_ops, .seq_show = rdt_bw_gran_show, + .fflags = RF_CTRL_INFO | RFTYPE_RES_MB, }, { .name = "delay_linear", .mode = 0444, .kf_ops = &rdtgroup_kf_single_ops, .seq_show = rdt_delay_linear_show, + .fflags = RF_CTRL_INFO | RFTYPE_RES_MB, + }, + { + .name = "cpus", + .mode = 0644, + .kf_ops = &rdtgroup_kf_single_ops, + .write = rdtgroup_cpus_write, + .seq_show = rdtgroup_cpus_show, + .fflags = RFTYPE_BASE, + }, + { + .name = "cpus_list", + .mode = 0644, + .kf_ops = &rdtgroup_kf_single_ops, + .write = rdtgroup_cpus_write, + .seq_show = rdtgroup_cpus_show, + .flags = RFTYPE_FLAGS_CPUS_LIST, + .fflags = RFTYPE_BASE, + }, + { + .name = "tasks", + .mode = 0644, + .kf_ops = &rdtgroup_kf_single_ops, + .write = rdtgroup_tasks_write, + .seq_show = rdtgroup_tasks_show, + .fflags = RFTYPE_BASE, + }, + { + .name = "schemata", + .mode = 0644, + .kf_ops = &rdtgroup_kf_single_ops, + .write = rdtgroup_schemata_write, + .seq_show = rdtgroup_schemata_show, + .fflags = RF_CTRL_BASE, }, }; -void rdt_get_mba_infofile(struct rdt_resource *r) +static int rdtgroup_add_files(struct kernfs_node *kn, unsigned long fflags) { - r->info_files = res_mba_info_files; - r->nr_info_files = ARRAY_SIZE(res_mba_info_files); + struct rftype *rfts, *rft; + int ret, len; + + rfts = res_common_files; + len = ARRAY_SIZE(res_common_files); + + lockdep_assert_held(&rdtgroup_mutex); + + for (rft = rfts; rft < rfts + len; rft++) { + if ((fflags & rft->fflags) == rft->fflags) { + ret = rdtgroup_add_file(kn, rft); + if (ret) + goto error; + } + } + + return 0; +error: + pr_warn("Failed to add %s, err=%d\n", rft->name, ret); + while (--rft >= rfts) { + if ((fflags & rft->fflags) == rft->fflags) + kernfs_remove_by_name(kn, rft->name); + } + return ret; } -void rdt_get_cache_infofile(struct rdt_resource *r) +static int rdtgroup_mkdir_info_resdir(struct rdt_resource *r, char *name, + unsigned long fflags) { - r->info_files = res_cache_info_files; - r->nr_info_files = ARRAY_SIZE(res_cache_info_files); + struct kernfs_node *kn_subdir; + int ret; + + kn_subdir = 
kernfs_create_dir(kn_info, name, + kn_info->mode, r); + if (IS_ERR(kn_subdir)) + return PTR_ERR(kn_subdir); + + kernfs_get(kn_subdir); + ret = rdtgroup_kn_set_ugid(kn_subdir); + if (ret) + return ret; + + ret = rdtgroup_add_files(kn_subdir, fflags); + if (!ret) + kernfs_activate(kn_subdir); + + return ret; } static int rdtgroup_create_info_dir(struct kernfs_node *parent_kn) { - struct kernfs_node *kn_subdir; - struct rftype *res_info_files; struct rdt_resource *r; - int ret, len; + unsigned long fflags; + int ret; /* create the directory */ kn_info = kernfs_create_dir(parent_kn, "info", parent_kn->mode, NULL); @@ -639,26 +651,11 @@ static int rdtgroup_create_info_dir(struct kernfs_node *parent_kn) kernfs_get(kn_info); for_each_alloc_enabled_rdt_resource(r) { - kn_subdir = kernfs_create_dir(kn_info, r->name, - kn_info->mode, r); - if (IS_ERR(kn_subdir)) { - ret = PTR_ERR(kn_subdir); - goto out_destroy; - } - kernfs_get(kn_subdir); - ret = rdtgroup_kn_set_ugid(kn_subdir); + fflags = r->fflags | RF_CTRL_INFO; + ret = rdtgroup_mkdir_info_resdir(r, r->name, fflags); if (ret) goto out_destroy; - - res_info_files = r->info_files; - len = r->nr_info_files; - - ret = rdtgroup_add_files(kn_subdir, res_info_files, len); - if (ret) - goto out_destroy; - kernfs_activate(kn_subdir); } - /* * This extra ref will be put in kernfs_remove() and guarantees * that @rdtgrp->kn is always accessible. @@ -1057,8 +1054,7 @@ static int rdtgroup_mkdir(struct kernfs_node *parent_kn, const char *name, if (ret) goto out_destroy; - ret = rdtgroup_add_files(kn, rdtgroup_base_files, - ARRAY_SIZE(rdtgroup_base_files)); + ret = rdtgroup_add_files(kn, RF_CTRL_BASE); if (ret) goto out_destroy; @@ -1156,8 +1152,7 @@ static int __init rdtgroup_setup_root(void) rdtgroup_default.closid = 0; list_add(&rdtgroup_default.rdtgroup_list, &rdt_all_groups); - ret = rdtgroup_add_files(rdt_root->kn, rdtgroup_base_files, - ARRAY_SIZE(rdtgroup_base_files)); + ret = rdtgroup_add_files(rdt_root->kn, RF_CTRL_BASE); if (ret) { kernfs_destroy_root(rdt_root); goto out; -- cgit v1.2.3 From d4ab33201029913b594ae785a9665f45040396ab Mon Sep 17 00:00:00 2001 From: Vikas Shivappa Date: Tue, 25 Jul 2017 14:14:30 -0700 Subject: x86/intel_rdt/cqm: Add info files for RDT monitoring Add info directory files specific to RDT monitoring. num_rmids: The number of RMIDs which are valid for the resource. mon_features: Lists the monitoring events if monitoring is enabled for the resource. max_threshold_occupancy: This is specific to llc_occupancy monitoring and is used to determine if an RMID can be reused. Provides an upper bound on the threshold and is shown to the user in bytes though the internal value will be rounded to the scaling factor supported by the h/w. 
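To illustrate the rounding mentioned above, here is a minimal userspace sketch of the arithmetic (not the kernel code; the mon_scale value is made up for the example, the real factor is enumerated by CPUID): the value written in bytes is converted to counter units by dividing by mon_scale, so the value read back is rounded down to a multiple of mon_scale.

#include <stdio.h>

int main(void)
{
	/* Illustrative scaling factor only; the real value comes from
	 * CPUID and differs between processors. */
	unsigned int mon_scale = 65536;

	unsigned int bytes_written = 1000000;                 /* user writes bytes        */
	unsigned int threshold = bytes_written / mon_scale;   /* value stored internally  */
	unsigned int bytes_shown = threshold * mon_scale;     /* value shown on next read */

	printf("written %u bytes, stored %u counter units, shown %u bytes\n",
	       bytes_written, threshold, bytes_shown);
	return 0;
}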
Signed-off-by: Vikas Shivappa Signed-off-by: Thomas Gleixner Cc: ravi.v.shankar@intel.com Cc: tony.luck@intel.com Cc: fenghua.yu@intel.com Cc: peterz@infradead.org Cc: eranian@google.com Cc: vikas.shivappa@intel.com Cc: ak@linux.intel.com Cc: davidcc@google.com Cc: reinette.chatre@intel.com Link: http://lkml.kernel.org/r/1501017287-28083-12-git-send-email-vikas.shivappa@linux.intel.com --- arch/x86/kernel/cpu/intel_rdt.h | 8 +++ arch/x86/kernel/cpu/intel_rdt_rdtgroup.c | 83 ++++++++++++++++++++++++++++++++ 2 files changed, 91 insertions(+) (limited to 'arch') diff --git a/arch/x86/kernel/cpu/intel_rdt.h b/arch/x86/kernel/cpu/intel_rdt.h index aecbe77c8354..4051f5e2d5fc 100644 --- a/arch/x86/kernel/cpu/intel_rdt.h +++ b/arch/x86/kernel/cpu/intel_rdt.h @@ -68,10 +68,13 @@ struct rdtgroup { #define RFTYPE_INFO BIT(0) #define RFTYPE_BASE BIT(1) #define RF_CTRLSHIFT 4 +#define RF_MONSHIFT 5 #define RFTYPE_CTRL BIT(RF_CTRLSHIFT) +#define RFTYPE_MON BIT(RF_MONSHIFT) #define RFTYPE_RES_CACHE BIT(8) #define RFTYPE_RES_MB BIT(9) #define RF_CTRL_INFO (RFTYPE_INFO | RFTYPE_CTRL) +#define RF_MON_INFO (RFTYPE_INFO | RFTYPE_MON) #define RF_CTRL_BASE (RFTYPE_BASE | RFTYPE_CTRL) /* List of all resource groups */ @@ -264,6 +267,11 @@ enum { r++) \ if (r->alloc_enabled) +#define for_each_mon_enabled_rdt_resource(r) \ + for (r = rdt_resources_all; r < rdt_resources_all + RDT_NUM_RESOURCES;\ + r++) \ + if (r->mon_enabled) + /* CPUID.(EAX=10H, ECX=ResID=1).EAX */ union cpuid_0x10_1_eax { struct { diff --git a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c index 7627b31937e9..14489a5ecf12 100644 --- a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c +++ b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c @@ -490,6 +490,28 @@ static int rdt_min_bw_show(struct kernfs_open_file *of, return 0; } +static int rdt_num_rmids_show(struct kernfs_open_file *of, + struct seq_file *seq, void *v) +{ + struct rdt_resource *r = of->kn->parent->priv; + + seq_printf(seq, "%d\n", r->num_rmid); + + return 0; +} + +static int rdt_mon_features_show(struct kernfs_open_file *of, + struct seq_file *seq, void *v) +{ + struct rdt_resource *r = of->kn->parent->priv; + struct mon_evt *mevt; + + list_for_each_entry(mevt, &r->evt_list, list) + seq_printf(seq, "%s\n", mevt->name); + + return 0; +} + static int rdt_bw_gran_show(struct kernfs_open_file *of, struct seq_file *seq, void *v) { @@ -508,6 +530,35 @@ static int rdt_delay_linear_show(struct kernfs_open_file *of, return 0; } +static int max_threshold_occ_show(struct kernfs_open_file *of, + struct seq_file *seq, void *v) +{ + struct rdt_resource *r = of->kn->parent->priv; + + seq_printf(seq, "%u\n", intel_cqm_threshold * r->mon_scale); + + return 0; +} + +static ssize_t max_threshold_occ_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + struct rdt_resource *r = of->kn->parent->priv; + unsigned int bytes; + int ret; + + ret = kstrtouint(buf, 0, &bytes); + if (ret) + return ret; + + if (bytes > (boot_cpu_data.x86_cache_size * 1024)) + return -EINVAL; + + intel_cqm_threshold = bytes / r->mon_scale; + + return ret ?: nbytes; +} + /* rdtgroup information files for one cache resource. 
*/ static struct rftype res_common_files[] = { { @@ -517,6 +568,20 @@ static struct rftype res_common_files[] = { .seq_show = rdt_num_closids_show, .fflags = RF_CTRL_INFO, }, + { + .name = "mon_features", + .mode = 0444, + .kf_ops = &rdtgroup_kf_single_ops, + .seq_show = rdt_mon_features_show, + .fflags = RF_MON_INFO, + }, + { + .name = "num_rmids", + .mode = 0444, + .kf_ops = &rdtgroup_kf_single_ops, + .seq_show = rdt_num_rmids_show, + .fflags = RF_MON_INFO, + }, { .name = "cbm_mask", .mode = 0444, @@ -552,6 +617,14 @@ static struct rftype res_common_files[] = { .seq_show = rdt_delay_linear_show, .fflags = RF_CTRL_INFO | RFTYPE_RES_MB, }, + { + .name = "max_threshold_occupancy", + .mode = 0644, + .kf_ops = &rdtgroup_kf_single_ops, + .write = max_threshold_occ_write, + .seq_show = max_threshold_occ_show, + .fflags = RF_MON_INFO | RFTYPE_RES_CACHE, + }, { .name = "cpus", .mode = 0644, @@ -642,6 +715,7 @@ static int rdtgroup_create_info_dir(struct kernfs_node *parent_kn) { struct rdt_resource *r; unsigned long fflags; + char name[32]; int ret; /* create the directory */ @@ -656,6 +730,15 @@ static int rdtgroup_create_info_dir(struct kernfs_node *parent_kn) if (ret) goto out_destroy; } + + for_each_mon_enabled_rdt_resource(r) { + fflags = r->fflags | RF_MON_INFO; + sprintf(name, "%s_MON", r->name); + ret = rdtgroup_mkdir_info_resdir(r, name, fflags); + if (ret) + goto out_destroy; + } + /* * This extra ref will be put in kernfs_remove() and guarantees * that @rdtgrp->kn is always accessible. -- cgit v1.2.3 From 65b4f403057e32da73c36e33d403890173c4c324 Mon Sep 17 00:00:00 2001 From: Vikas Shivappa Date: Tue, 25 Jul 2017 14:14:31 -0700 Subject: x86/intel_rdt: Prepare for RDT monitoring mkdir support Separate the ctrl mkdir code from the rest in order to prepare for adding support for RDT monitoring mkdir support as well. Signed-off-by: Vikas Shivappa Signed-off-by: Thomas Gleixner Cc: ravi.v.shankar@intel.com Cc: tony.luck@intel.com Cc: fenghua.yu@intel.com Cc: peterz@infradead.org Cc: eranian@google.com Cc: vikas.shivappa@intel.com Cc: ak@linux.intel.com Cc: davidcc@google.com Cc: reinette.chatre@intel.com Link: http://lkml.kernel.org/r/1501017287-28083-13-git-send-email-vikas.shivappa@linux.intel.com --- arch/x86/kernel/cpu/intel_rdt_rdtgroup.c | 111 ++++++++++++++++++++++--------- 1 file changed, 79 insertions(+), 32 deletions(-) (limited to 'arch') diff --git a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c index 14489a5ecf12..b3c901a8295e 100644 --- a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c +++ b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c @@ -1082,46 +1082,35 @@ static struct file_system_type rdt_fs_type = { .kill_sb = rdt_kill_sb, }; -static int rdtgroup_mkdir(struct kernfs_node *parent_kn, const char *name, - umode_t mode) +static int mkdir_rdt_prepare(struct kernfs_node *parent_kn, + struct kernfs_node *prgrp_kn, + const char *name, umode_t mode, + struct rdtgroup **r) { - struct rdtgroup *parent, *rdtgrp; + struct rdtgroup *prdtgrp, *rdtgrp; struct kernfs_node *kn; - int ret, closid; - - /* Only allow mkdir in the root directory */ - if (parent_kn != rdtgroup_default.kn) - return -EPERM; - - /* Do not accept '\n' to avoid unparsable situation. 
*/ - if (strchr(name, '\n')) - return -EINVAL; + uint files = 0; + int ret; - parent = rdtgroup_kn_lock_live(parent_kn); - if (!parent) { + prdtgrp = rdtgroup_kn_lock_live(prgrp_kn); + if (!prdtgrp) { ret = -ENODEV; goto out_unlock; } - ret = closid_alloc(); - if (ret < 0) - goto out_unlock; - closid = ret; - /* allocate the rdtgroup. */ rdtgrp = kzalloc(sizeof(*rdtgrp), GFP_KERNEL); if (!rdtgrp) { ret = -ENOSPC; - goto out_closid_free; + goto out_unlock; } - rdtgrp->closid = closid; - list_add(&rdtgrp->rdtgroup_list, &rdt_all_groups); + *r = rdtgrp; /* kernfs creates the directory for rdtgrp */ - kn = kernfs_create_dir(parent->kn, name, mode, rdtgrp); + kn = kernfs_create_dir(parent_kn, name, mode, rdtgrp); if (IS_ERR(kn)) { ret = PTR_ERR(kn); - goto out_cancel_ref; + goto out_free_rgrp; } rdtgrp->kn = kn; @@ -1137,27 +1126,85 @@ static int rdtgroup_mkdir(struct kernfs_node *parent_kn, const char *name, if (ret) goto out_destroy; - ret = rdtgroup_add_files(kn, RF_CTRL_BASE); + files = RFTYPE_BASE | RFTYPE_CTRL; + ret = rdtgroup_add_files(kn, files); if (ret) goto out_destroy; kernfs_activate(kn); - ret = 0; - goto out_unlock; + /* + * The caller unlocks the prgrp_kn upon success. + */ + return 0; out_destroy: kernfs_remove(rdtgrp->kn); -out_cancel_ref: - list_del(&rdtgrp->rdtgroup_list); +out_free_rgrp: kfree(rdtgrp); -out_closid_free: - closid_free(closid); out_unlock: - rdtgroup_kn_unlock(parent_kn); + rdtgroup_kn_unlock(prgrp_kn); + return ret; +} + +static void mkdir_rdt_prepare_clean(struct rdtgroup *rgrp) +{ + kernfs_remove(rgrp->kn); + kfree(rgrp); +} + +/* + * These are rdtgroups created under the root directory. Can be used + * to allocate resources. + */ +static int rdtgroup_mkdir_ctrl(struct kernfs_node *parent_kn, + struct kernfs_node *prgrp_kn, + const char *name, umode_t mode) +{ + struct rdtgroup *rdtgrp; + struct kernfs_node *kn; + u32 closid; + int ret; + + ret = mkdir_rdt_prepare(parent_kn, prgrp_kn, name, mode, &rdtgrp); + if (ret) + return ret; + + kn = rdtgrp->kn; + ret = closid_alloc(); + if (ret < 0) + goto out_common_fail; + closid = ret; + + rdtgrp->closid = closid; + list_add(&rdtgrp->rdtgroup_list, &rdt_all_groups); + + goto out_unlock; + +out_common_fail: + mkdir_rdt_prepare_clean(rdtgrp); +out_unlock: + rdtgroup_kn_unlock(prgrp_kn); return ret; } +static int rdtgroup_mkdir(struct kernfs_node *parent_kn, const char *name, + umode_t mode) +{ + /* Do not accept '\n' to avoid unparsable situation. */ + if (strchr(name, '\n')) + return -EINVAL; + + /* + * If the parent directory is the root directory and RDT + * allocation is supported, add a control rdtgroup. + */ + if (rdt_alloc_capable && parent_kn == rdtgroup_default.kn) + return rdtgroup_mkdir_ctrl(parent_kn, parent_kn, name, mode); + + return -EPERM; +} + static int rdtgroup_rmdir(struct kernfs_node *kn) { int ret, cpu, closid = rdtgroup_default.closid; -- cgit v1.2.3 From c7d9aac6131148abe29ed1dc6bd73ad1213d1f56 Mon Sep 17 00:00:00 2001 From: Vikas Shivappa Date: Tue, 25 Jul 2017 14:14:32 -0700 Subject: x86/intel_rdt/cqm: Add mkdir support for RDT monitoring Resource control groups can be created using mkdir in resctrl fs(rdtgroup). In order to extend the resctrl interface to support monitoring the control groups, extend the current mkdir to support resource monitoring also. This allows the rdtgroup created under the root directory to be able to both control and monitor resources (ctrl_mon group). 
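As a concrete picture of the layout this enables, here is a hedged userspace sketch (the group names are made up and the mount point is assumed to be the conventional /sys/fs/resctrl): a ctrl_mon group is created by mkdir directly under the root, and a monitor group by mkdir under that group's mon_groups directory. The description below continues with how CLOSIDs and RMIDs are attached to these groups.

#include <stdio.h>
#include <errno.h>
#include <sys/stat.h>
#include <sys/types.h>

int main(void)
{
	/* Create a ctrl_mon group directly under the resctrl root. */
	if (mkdir("/sys/fs/resctrl/grp_a", 0755) && errno != EEXIST)
		perror("mkdir ctrl_mon group grp_a");

	/* Create a monitor group under the ctrl_mon group's mon_groups dir. */
	if (mkdir("/sys/fs/resctrl/grp_a/mon_groups/mon_a", 0755) && errno != EEXIST)
		perror("mkdir monitor group mon_a");

	return 0;
}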
The ctrl_mon groups are associated with one CLOSID like the legacy rdtgroups and one RMID(Resource monitoring ID) as well. Hardware uses RMID to track the resource usage. Once either of the CLOSID or RMID are exhausted, the mkdir fails with -ENOSPC. If there are RMIDs in limbo list but not free an -EBUSY is returned. User can also monitor a subset of the ctrl_mon rdtgroup's tasks/cpus using the monitor groups. The monitor groups are created using mkdir under the "mon_groups" directory in every ctrl_mon group. [Merged Tony's code: Removed a lot of common mkdir code, a fix to handling of the list of the child rdtgroups and some cleanups in list traversal. Also the changes to have similar alloc and free for CLOS/RMID and return -EBUSY when RMIDs are in limbo and not free] Signed-off-by: Tony Luck Signed-off-by: Vikas Shivappa Signed-off-by: Thomas Gleixner Cc: ravi.v.shankar@intel.com Cc: fenghua.yu@intel.com Cc: peterz@infradead.org Cc: eranian@google.com Cc: vikas.shivappa@intel.com Cc: ak@linux.intel.com Cc: davidcc@google.com Cc: reinette.chatre@intel.com Link: http://lkml.kernel.org/r/1501017287-28083-14-git-send-email-vikas.shivappa@linux.intel.com --- arch/x86/kernel/cpu/intel_rdt.h | 26 ++++++ arch/x86/kernel/cpu/intel_rdt_rdtgroup.c | 133 +++++++++++++++++++++++++++++-- 2 files changed, 151 insertions(+), 8 deletions(-) (limited to 'arch') diff --git a/arch/x86/kernel/cpu/intel_rdt.h b/arch/x86/kernel/cpu/intel_rdt.h index 4051f5e2d5fc..e8fb08f9ea89 100644 --- a/arch/x86/kernel/cpu/intel_rdt.h +++ b/arch/x86/kernel/cpu/intel_rdt.h @@ -37,6 +37,25 @@ extern unsigned int intel_cqm_threshold; extern bool rdt_alloc_capable; extern bool rdt_mon_capable; extern unsigned int rdt_mon_features; + +enum rdt_group_type { + RDTCTRL_GROUP = 0, + RDTMON_GROUP, + RDT_NUM_GROUP, +}; + +/** + * struct mongroup - store mon group's data in resctrl fs. + * @parent: parent rdtgrp + * @crdtgrp_list: child rdtgroup node list + * @rmid: rmid for this rdtgroup + */ +struct mongroup { + struct rdtgroup *parent; + struct list_head crdtgrp_list; + u32 rmid; +}; + /** * struct rdtgroup - store rdtgroup's data in resctrl file system. 
* @kn: kernfs node @@ -46,6 +65,9 @@ extern unsigned int rdt_mon_features; * @flags: status bits * @waitcount: how many cpus expect to find this * group when they acquire rdtgroup_mutex + * @type: indicates type of this rdtgroup - either + * monitor only or ctrl_mon group + * @mon: mongroup related data */ struct rdtgroup { struct kernfs_node *kn; @@ -54,6 +76,8 @@ struct rdtgroup { struct cpumask cpu_mask; int flags; atomic_t waitcount; + enum rdt_group_type type; + struct mongroup mon; }; /* rdtgroup.flags */ @@ -306,6 +330,8 @@ ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of, int rdtgroup_schemata_show(struct kernfs_open_file *of, struct seq_file *s, void *v); struct rdt_domain *get_domain_from_cpu(int cpu, struct rdt_resource *r); +int alloc_rmid(void); +void free_rmid(u32 rmid); int rdt_get_mon_l3_config(struct rdt_resource *r); #endif /* _ASM_X86_INTEL_RDT_H */ diff --git a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c index b3c901a8295e..bee9f885dd20 100644 --- a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c +++ b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c @@ -758,6 +758,39 @@ out_destroy: return ret; } +static int +mongroup_create_dir(struct kernfs_node *parent_kn, struct rdtgroup *prgrp, + char *name, struct kernfs_node **dest_kn) +{ + struct kernfs_node *kn; + int ret; + + /* create the directory */ + kn = kernfs_create_dir(parent_kn, name, parent_kn->mode, prgrp); + if (IS_ERR(kn)) + return PTR_ERR(kn); + + if (dest_kn) + *dest_kn = kn; + + /* + * This extra ref will be put in kernfs_remove() and guarantees + * that @rdtgrp->kn is always accessible. + */ + kernfs_get(kn); + + ret = rdtgroup_kn_set_ugid(kn); + if (ret) + goto out_destroy; + + kernfs_activate(kn); + + return 0; + +out_destroy: + kernfs_remove(kn); + return ret; +} static void l3_qos_cfg_update(void *arg) { bool *enable = arg; @@ -1085,7 +1118,7 @@ static struct file_system_type rdt_fs_type = { static int mkdir_rdt_prepare(struct kernfs_node *parent_kn, struct kernfs_node *prgrp_kn, const char *name, umode_t mode, - struct rdtgroup **r) + enum rdt_group_type rtype, struct rdtgroup **r) { struct rdtgroup *prdtgrp, *rdtgrp; struct kernfs_node *kn; @@ -1105,6 +1138,9 @@ static int mkdir_rdt_prepare(struct kernfs_node *parent_kn, goto out_unlock; } *r = rdtgrp; + rdtgrp->mon.parent = prdtgrp; + rdtgrp->type = rtype; + INIT_LIST_HEAD(&rdtgrp->mon.crdtgrp_list); /* kernfs creates the directory for rdtgrp */ kn = kernfs_create_dir(parent_kn, name, mode, rdtgrp); @@ -1127,10 +1163,17 @@ static int mkdir_rdt_prepare(struct kernfs_node *parent_kn, goto out_destroy; files = RFTYPE_BASE | RFTYPE_CTRL; + files = RFTYPE_BASE | BIT(RF_CTRLSHIFT + rtype); ret = rdtgroup_add_files(kn, files); if (ret) goto out_destroy; + if (rdt_mon_capable) { + ret = alloc_rmid(); + if (ret < 0) + goto out_destroy; + rdtgrp->mon.rmid = ret; + } kernfs_activate(kn); /* @@ -1150,23 +1193,56 @@ out_unlock: static void mkdir_rdt_prepare_clean(struct rdtgroup *rgrp) { kernfs_remove(rgrp->kn); + free_rmid(rgrp->mon.rmid); kfree(rgrp); } +/* + * Create a monitor group under "mon_groups" directory of a control + * and monitor group(ctrl_mon). This is a resource group + * to monitor a subset of tasks and cpus in its parent ctrl_mon group. 
+ */ +static int rdtgroup_mkdir_mon(struct kernfs_node *parent_kn, + struct kernfs_node *prgrp_kn, + const char *name, + umode_t mode) +{ + struct rdtgroup *rdtgrp, *prgrp; + int ret; + + ret = mkdir_rdt_prepare(parent_kn, prgrp_kn, name, mode, RDTMON_GROUP, + &rdtgrp); + if (ret) + return ret; + + prgrp = rdtgrp->mon.parent; + rdtgrp->closid = prgrp->closid; + + /* + * Add the rdtgrp to the list of rdtgrps the parent + * ctrl_mon group has to track. + */ + list_add_tail(&rdtgrp->mon.crdtgrp_list, &prgrp->mon.crdtgrp_list); + + rdtgroup_kn_unlock(prgrp_kn); + return ret; +} + /* * These are rdtgroups created under the root directory. Can be used - * to allocate resources. + * to allocate and monitor resources. */ -static int rdtgroup_mkdir_ctrl(struct kernfs_node *parent_kn, - struct kernfs_node *prgrp_kn, - const char *name, umode_t mode) +static int rdtgroup_mkdir_ctrl_mon(struct kernfs_node *parent_kn, + struct kernfs_node *prgrp_kn, + const char *name, umode_t mode) { struct rdtgroup *rdtgrp; struct kernfs_node *kn; u32 closid; int ret; - ret = mkdir_rdt_prepare(parent_kn, prgrp_kn, name, mode, &rdtgrp); + ret = mkdir_rdt_prepare(parent_kn, prgrp_kn, name, mode, RDTCTRL_GROUP, + &rdtgrp); if (ret) return ret; @@ -1179,8 +1255,21 @@ static int rdtgroup_mkdir_ctrl(struct kernfs_node *parent_kn, rdtgrp->closid = closid; list_add(&rdtgrp->rdtgroup_list, &rdt_all_groups); + if (rdt_mon_capable) { + /* + * Create an empty mon_groups directory to hold the subset + * of tasks and cpus to monitor. + */ + ret = mongroup_create_dir(kn, NULL, "mon_groups", NULL); + if (ret) + goto out_id_free; + } + goto out_unlock; +out_id_free: + closid_free(closid); + list_del(&rdtgrp->rdtgroup_list); out_common_fail: mkdir_rdt_prepare_clean(rdtgrp); out_unlock: @@ -1188,6 +1277,22 @@ out_unlock: return ret; } +/* + * We allow creating mon groups only with in a directory called "mon_groups" + * which is present in every ctrl_mon group. Check if this is a valid + * "mon_groups" directory. + * + * 1. The directory should be named "mon_groups". + * 2. The mon group itself should "not" be named "mon_groups". + * This makes sure "mon_groups" directory always has a ctrl_mon group + * as parent. + */ +static bool is_mon_groups(struct kernfs_node *kn, const char *name) +{ + return (!strcmp(kn->name, "mon_groups") && + strcmp(name, "mon_groups")); +} + static int rdtgroup_mkdir(struct kernfs_node *parent_kn, const char *name, umode_t mode) { @@ -1197,10 +1302,18 @@ static int rdtgroup_mkdir(struct kernfs_node *parent_kn, const char *name, /* * If the parent directory is the root directory and RDT - * allocation is supported, add a control rdtgroup. + * allocation is supported, add a control and monitoring + * subdirectory */ if (rdt_alloc_capable && parent_kn == rdtgroup_default.kn) - return rdtgroup_mkdir_ctrl(parent_kn, parent_kn, name, mode); + return rdtgroup_mkdir_ctrl_mon(parent_kn, parent_kn, name, mode); + + /* + * If RDT monitoring is supported and the parent directory is a valid + * "mon_groups" directory, add a monitoring subdirectory. 
+ */ + if (rdt_mon_capable && is_mon_groups(parent_kn, name)) + return rdtgroup_mkdir_mon(parent_kn, parent_kn->parent, name, mode); return -EPERM; } @@ -1280,6 +1393,10 @@ static int __init rdtgroup_setup_root(void) mutex_lock(&rdtgroup_mutex); rdtgroup_default.closid = 0; + rdtgroup_default.mon.rmid = 0; + rdtgroup_default.type = RDTCTRL_GROUP; + INIT_LIST_HEAD(&rdtgroup_default.mon.crdtgrp_list); + list_add(&rdtgroup_default.rdtgroup_list, &rdt_all_groups); ret = rdtgroup_add_files(rdt_root->kn, RF_CTRL_BASE); -- cgit v1.2.3 From 0734ded1abee9439b0c5d7b62af1ead78aab895b Mon Sep 17 00:00:00 2001 From: Vikas Shivappa Date: Tue, 25 Jul 2017 14:14:33 -0700 Subject: x86/intel_rdt: Change closid type from int to u32 OS associates a CLOSid(Class of service id) to a task by writing the high 32 bits of per CPU IA32_PQR_ASSOC MSR when a task is scheduled in. CPUID.(EAX=10H, ECX=1):EDX[15:0] enumerates the max CLOSID supported and it is zero indexed. Hence change the type to u32 from int. Signed-off-by: Vikas Shivappa Signed-off-by: Thomas Gleixner Cc: ravi.v.shankar@intel.com Cc: tony.luck@intel.com Cc: fenghua.yu@intel.com Cc: peterz@infradead.org Cc: eranian@google.com Cc: vikas.shivappa@intel.com Cc: ak@linux.intel.com Cc: davidcc@google.com Cc: reinette.chatre@intel.com Link: http://lkml.kernel.org/r/1501017287-28083-15-git-send-email-vikas.shivappa@linux.intel.com --- arch/x86/include/asm/intel_rdt_sched.h | 2 +- arch/x86/kernel/cpu/intel_rdt.h | 2 +- arch/x86/kernel/cpu/intel_rdt_rdtgroup.c | 2 +- include/linux/sched.h | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) (limited to 'arch') diff --git a/arch/x86/include/asm/intel_rdt_sched.h b/arch/x86/include/asm/intel_rdt_sched.h index 4dee77b6de07..1d8d45a773c2 100644 --- a/arch/x86/include/asm/intel_rdt_sched.h +++ b/arch/x86/include/asm/intel_rdt_sched.h @@ -46,7 +46,7 @@ static inline void intel_rdt_sched_in(void) { if (static_branch_likely(&rdt_alloc_enable_key)) { struct intel_pqr_state *state = this_cpu_ptr(&pqr_state); - int closid; + u32 closid; /* * If this task has a closid assigned, use it. diff --git a/arch/x86/kernel/cpu/intel_rdt.h b/arch/x86/kernel/cpu/intel_rdt.h index e8fb08f9ea89..b2a2de360961 100644 --- a/arch/x86/kernel/cpu/intel_rdt.h +++ b/arch/x86/kernel/cpu/intel_rdt.h @@ -72,7 +72,7 @@ struct mongroup { struct rdtgroup { struct kernfs_node *kn; struct list_head rdtgroup_list; - int closid; + u32 closid; struct cpumask cpu_mask; int flags; atomic_t waitcount; diff --git a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c index bee9f885dd20..840405421841 100644 --- a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c +++ b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c @@ -77,7 +77,7 @@ static void closid_init(void) static int closid_alloc(void) { - int closid = ffs(closid_free_map); + u32 closid = ffs(closid_free_map); if (closid == 0) return -ENOSPC; diff --git a/include/linux/sched.h b/include/linux/sched.h index 20b2ff2f4fde..8839fd09540c 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -899,7 +899,7 @@ struct task_struct { struct list_head cg_list; #endif #ifdef CONFIG_INTEL_RDT - int closid; + u32 closid; #endif #ifdef CONFIG_FUTEX struct robust_list_head __user *robust_list; -- cgit v1.2.3 From d6aaba615a482ce7d3ec218cf7b8d02d0d5753b8 Mon Sep 17 00:00:00 2001 From: Vikas Shivappa Date: Tue, 25 Jul 2017 14:14:34 -0700 Subject: x86/intel_rdt/cqm: Add tasks file support The root directory, ctrl_mon and monitor groups are populated with a read/write file named "tasks". 
When read, it shows all the task IDs assigned to the resource group. Tasks can be added to groups by writing the PID to the file. A task can be present in one "ctrl_mon" group "and" one "monitor" group. IOW a PID_x can be seen in a ctrl_mon group and a monitor group at the same time. When a task is added to a ctrl_mon group, it is automatically removed from the previous ctrl_mon group where it belonged. Similarly if a task is moved to a monitor group it is removed from the previous monitor group . Also since the monitor groups can only have subset of tasks of parent ctrl_mon group, a task can be moved to a monitor group only if its already present in the parent ctrl_mon group. Task membership is indicated by a new field in the task_struct "u32 rmid" which holds the RMID for the task. RMID=0 is reserved for the default root group where the tasks belong to at mount. [tony: zero the rmid if rdtgroup was deleted when task was being moved] Signed-off-by: Tony Luck Signed-off-by: Vikas Shivappa Signed-off-by: Thomas Gleixner Cc: ravi.v.shankar@intel.com Cc: tony.luck@intel.com Cc: fenghua.yu@intel.com Cc: peterz@infradead.org Cc: eranian@google.com Cc: vikas.shivappa@intel.com Cc: ak@linux.intel.com Cc: davidcc@google.com Cc: reinette.chatre@intel.com Link: http://lkml.kernel.org/r/1501017287-28083-16-git-send-email-vikas.shivappa@linux.intel.com --- arch/x86/kernel/cpu/intel_rdt_rdtgroup.c | 19 +++++++++++++++++-- include/linux/sched.h | 1 + 2 files changed, 18 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c index 840405421841..aedad1d17880 100644 --- a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c +++ b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c @@ -314,6 +314,7 @@ static void move_myself(struct callback_head *head) if (atomic_dec_and_test(&rdtgrp->waitcount) && (rdtgrp->flags & RDT_DELETED)) { current->closid = 0; + current->rmid = 0; kfree(rdtgrp); } @@ -352,7 +353,20 @@ static int __rdtgroup_move_task(struct task_struct *tsk, atomic_dec(&rdtgrp->waitcount); kfree(callback); } else { - tsk->closid = rdtgrp->closid; + /* + * For ctrl_mon groups move both closid and rmid. + * For monitor groups, can move the tasks only from + * their parent CTRL group. 
+ */ + if (rdtgrp->type == RDTCTRL_GROUP) { + tsk->closid = rdtgrp->closid; + tsk->rmid = rdtgrp->mon.rmid; + } else if (rdtgrp->type == RDTMON_GROUP) { + if (rdtgrp->mon.parent->closid == tsk->closid) + tsk->rmid = rdtgrp->mon.rmid; + else + ret = -EINVAL; + } } return ret; } @@ -432,7 +446,8 @@ static void show_rdt_tasks(struct rdtgroup *r, struct seq_file *s) rcu_read_lock(); for_each_process_thread(p, t) { - if (t->closid == r->closid) + if ((r->type == RDTCTRL_GROUP && t->closid == r->closid) || + (r->type == RDTMON_GROUP && t->rmid == r->mon.rmid)) seq_printf(s, "%d\n", t->pid); } rcu_read_unlock(); diff --git a/include/linux/sched.h b/include/linux/sched.h index 8839fd09540c..067a41ac5fd4 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -900,6 +900,7 @@ struct task_struct { #endif #ifdef CONFIG_INTEL_RDT u32 closid; + u32 rmid; #endif #ifdef CONFIG_FUTEX struct robust_list_head __user *robust_list; -- cgit v1.2.3 From b09d981b3f346690dafa3e4ebedfcf3e44b68e83 Mon Sep 17 00:00:00 2001 From: Vikas Shivappa Date: Tue, 25 Jul 2017 14:14:35 -0700 Subject: x86/intel_rdt: Prepare to add RDT monitor cpus file support Separate the ctrl cpus file handling from the generic cpus file handling and convert the per cpu closid from u32 to a struct which will be used later to add rmid to the same struct. Also cleanup some name space. Signed-off-by: Vikas Shivappa Signed-off-by: Thomas Gleixner Cc: ravi.v.shankar@intel.com Cc: tony.luck@intel.com Cc: fenghua.yu@intel.com Cc: peterz@infradead.org Cc: eranian@google.com Cc: vikas.shivappa@intel.com Cc: ak@linux.intel.com Cc: davidcc@google.com Cc: reinette.chatre@intel.com Link: http://lkml.kernel.org/r/1501017287-28083-17-git-send-email-vikas.shivappa@linux.intel.com --- arch/x86/include/asm/intel_rdt_sched.h | 4 +- arch/x86/kernel/cpu/intel_rdt.c | 6 +- arch/x86/kernel/cpu/intel_rdt.h | 2 - arch/x86/kernel/cpu/intel_rdt_rdtgroup.c | 103 +++++++++++++++++-------------- 4 files changed, 62 insertions(+), 53 deletions(-) (limited to 'arch') diff --git a/arch/x86/include/asm/intel_rdt_sched.h b/arch/x86/include/asm/intel_rdt_sched.h index 1d8d45a773c2..2c704d205e2f 100644 --- a/arch/x86/include/asm/intel_rdt_sched.h +++ b/arch/x86/include/asm/intel_rdt_sched.h @@ -26,7 +26,7 @@ struct intel_pqr_state { }; DECLARE_PER_CPU(struct intel_pqr_state, pqr_state); -DECLARE_PER_CPU_READ_MOSTLY(int, cpu_closid); +DECLARE_PER_CPU_READ_MOSTLY(struct intel_pqr_state, rdt_cpu_default); DECLARE_STATIC_KEY_FALSE(rdt_alloc_enable_key); /* @@ -54,7 +54,7 @@ static inline void intel_rdt_sched_in(void) */ closid = current->closid; if (closid == 0) - closid = this_cpu_read(cpu_closid); + closid = this_cpu_read(rdt_cpu_default.closid); if (closid != state->closid) { state->closid = closid; diff --git a/arch/x86/kernel/cpu/intel_rdt.c b/arch/x86/kernel/cpu/intel_rdt.c index 7cae4ec75cfe..303b085b969f 100644 --- a/arch/x86/kernel/cpu/intel_rdt.c +++ b/arch/x86/kernel/cpu/intel_rdt.c @@ -39,8 +39,6 @@ /* Mutex to protect rdtgroup access. */ DEFINE_MUTEX(rdtgroup_mutex); -DEFINE_PER_CPU_READ_MOSTLY(int, cpu_closid); - /* * The cached intel_pqr_state is strictly per CPU and can never be * updated from a remote CPU. 
Functions which modify the state @@ -49,6 +47,8 @@ DEFINE_PER_CPU_READ_MOSTLY(int, cpu_closid); */ DEFINE_PER_CPU(struct intel_pqr_state, pqr_state); +DEFINE_PER_CPU_READ_MOSTLY(struct intel_pqr_state, rdt_cpu_default); + /* * Used to store the max resource name width and max resource data width * to display the schemata in a tabular format @@ -500,7 +500,7 @@ static void clear_closid(int cpu) { struct intel_pqr_state *state = this_cpu_ptr(&pqr_state); - per_cpu(cpu_closid, cpu) = 0; + per_cpu(rdt_cpu_default.closid, cpu) = 0; state->closid = 0; wrmsr(IA32_PQR_ASSOC, state->rmid, 0); } diff --git a/arch/x86/kernel/cpu/intel_rdt.h b/arch/x86/kernel/cpu/intel_rdt.h index b2a2de360961..6f070476416f 100644 --- a/arch/x86/kernel/cpu/intel_rdt.h +++ b/arch/x86/kernel/cpu/intel_rdt.h @@ -320,8 +320,6 @@ union cpuid_0x10_x_edx { unsigned int full; }; -DECLARE_PER_CPU_READ_MOSTLY(int, cpu_closid); - void rdt_ctrl_update(void *arg); struct rdtgroup *rdtgroup_kn_lock_live(struct kernfs_node *kn); void rdtgroup_kn_unlock(struct kernfs_node *kn); diff --git a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c index aedad1d17880..d11f4629a702 100644 --- a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c +++ b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c @@ -181,13 +181,16 @@ static int rdtgroup_cpus_show(struct kernfs_open_file *of, /* * This is safe against intel_rdt_sched_in() called from __switch_to() * because __switch_to() is executed with interrupts disabled. A local call - * from rdt_update_closid() is proteced against __switch_to() because + * from update_closid() is proteced against __switch_to() because * preemption is disabled. */ -static void rdt_update_cpu_closid(void *closid) +static void update_cpu_closid(void *info) { - if (closid) - this_cpu_write(cpu_closid, *(int *)closid); + struct rdtgroup *r = info; + + if (r) + this_cpu_write(rdt_cpu_default.closid, r->closid); + /* * We cannot unconditionally write the MSR because the current * executing task might have its own closid selected. Just reuse @@ -199,28 +202,62 @@ static void rdt_update_cpu_closid(void *closid) /* * Update the PGR_ASSOC MSR on all cpus in @cpu_mask, * - * Per task closids must have been set up before calling this function. - * - * The per cpu closids are updated with the smp function call, when @closid - * is not NULL. If @closid is NULL then all affected percpu closids must - * have been set up before calling this function. + * Per task closids/rmids must have been set up before calling this function. 
*/ static void -rdt_update_closid(const struct cpumask *cpu_mask, int *closid) +update_closid(const struct cpumask *cpu_mask, struct rdtgroup *r) { int cpu = get_cpu(); if (cpumask_test_cpu(cpu, cpu_mask)) - rdt_update_cpu_closid(closid); - smp_call_function_many(cpu_mask, rdt_update_cpu_closid, closid, 1); + update_cpu_closid(r); + smp_call_function_many(cpu_mask, update_cpu_closid, r, 1); put_cpu(); } +static int cpus_ctrl_write(struct rdtgroup *rdtgrp, cpumask_var_t newmask, + cpumask_var_t tmpmask) +{ + struct rdtgroup *r; + + /* Check whether cpus are dropped from this group */ + cpumask_andnot(tmpmask, &rdtgrp->cpu_mask, newmask); + if (cpumask_weight(tmpmask)) { + /* Can't drop from default group */ + if (rdtgrp == &rdtgroup_default) + return -EINVAL; + + /* Give any dropped cpus to rdtgroup_default */ + cpumask_or(&rdtgroup_default.cpu_mask, + &rdtgroup_default.cpu_mask, tmpmask); + update_closid(tmpmask, &rdtgroup_default); + } + + /* + * If we added cpus, remove them from previous group that owned them + * and update per-cpu closid + */ + cpumask_andnot(tmpmask, newmask, &rdtgrp->cpu_mask); + if (cpumask_weight(tmpmask)) { + list_for_each_entry(r, &rdt_all_groups, rdtgroup_list) { + if (r == rdtgrp) + continue; + cpumask_andnot(&r->cpu_mask, &r->cpu_mask, tmpmask); + } + update_closid(tmpmask, rdtgrp); + } + + /* Done pushing/pulling - update this group with new mask */ + cpumask_copy(&rdtgrp->cpu_mask, newmask); + + return 0; +} + static ssize_t rdtgroup_cpus_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { cpumask_var_t tmpmask, newmask; - struct rdtgroup *rdtgrp, *r; + struct rdtgroup *rdtgrp; int ret; if (!buf) @@ -254,36 +291,10 @@ static ssize_t rdtgroup_cpus_write(struct kernfs_open_file *of, goto unlock; } - /* Check whether cpus are dropped from this group */ - cpumask_andnot(tmpmask, &rdtgrp->cpu_mask, newmask); - if (cpumask_weight(tmpmask)) { - /* Can't drop from default group */ - if (rdtgrp == &rdtgroup_default) { - ret = -EINVAL; - goto unlock; - } - /* Give any dropped cpus to rdtgroup_default */ - cpumask_or(&rdtgroup_default.cpu_mask, - &rdtgroup_default.cpu_mask, tmpmask); - rdt_update_closid(tmpmask, &rdtgroup_default.closid); - } - - /* - * If we added cpus, remove them from previous group that owned them - * and update per-cpu closid - */ - cpumask_andnot(tmpmask, newmask, &rdtgrp->cpu_mask); - if (cpumask_weight(tmpmask)) { - list_for_each_entry(r, &rdt_all_groups, rdtgroup_list) { - if (r == rdtgrp) - continue; - cpumask_andnot(&r->cpu_mask, &r->cpu_mask, tmpmask); - } - rdt_update_closid(tmpmask, &rdtgrp->closid); - } - - /* Done pushing/pulling - update this group with new mask */ - cpumask_copy(&rdtgrp->cpu_mask, newmask); + if (rdtgrp->type == RDTCTRL_GROUP) + ret = cpus_ctrl_write(rdtgrp, newmask, tmpmask); + else + ret = -EINVAL; unlock: rdtgroup_kn_unlock(of->kn); @@ -1102,7 +1113,7 @@ static void rmdir_all_sub(void) } /* Notify online CPUs to update per cpu storage and PQR_ASSOC MSR */ get_online_cpus(); - rdt_update_closid(cpu_online_mask, &rdtgroup_default.closid); + update_closid(cpu_online_mask, &rdtgroup_default); put_online_cpus(); kernfs_remove(kn_info); @@ -1357,13 +1368,13 @@ static int rdtgroup_rmdir(struct kernfs_node *kn) /* Update per cpu closid of the moved CPUs first */ for_each_cpu(cpu, &rdtgrp->cpu_mask) - per_cpu(cpu_closid, cpu) = closid; + per_cpu(rdt_cpu_default.closid, cpu) = closid; /* * Update the MSR on moved CPUs and CPUs which have moved * task running on them. 
*/ cpumask_or(tmpmask, tmpmask, &rdtgrp->cpu_mask); - rdt_update_closid(tmpmask, NULL); + update_closid(tmpmask, NULL); rdtgrp->flags = RDT_DELETED; closid_free(rdtgrp->closid); -- cgit v1.2.3 From a9fcf8627dc01049c390023bbb0323db3c785b91 Mon Sep 17 00:00:00 2001 From: Vikas Shivappa Date: Tue, 25 Jul 2017 14:14:36 -0700 Subject: x86/intel_rdt/cqm: Add cpus file support The cpus file is extended to support resource monitoring. This is used to over-ride the RMID of the default group when running on specific CPUs. It works similar to the resource control. The "cpus" and "cpus_list" file is present in default group, ctrl_mon groups and monitor groups. Each "cpus" file or cpu_list file reads a cpumask or list showing which CPUs belong to the resource group. By default all online cpus belong to the default root group. A CPU can be present in one "ctrl_mon" and one "monitor" group simultaneously. They can be added to a resource group by writing the CPU to the file. When a CPU is added to a ctrl_mon group it is automatically removed from the previous ctrl_mon group. A CPU can be added to a monitor group only if it is present in the parent ctrl_mon group and when a CPU is added to a monitor group, it is automatically removed from the previous monitor group. When CPUs go offline, they are automatically removed from the ctrl_mon and monitor groups. Signed-off-by: Vikas Shivappa Signed-off-by: Thomas Gleixner Cc: ravi.v.shankar@intel.com Cc: tony.luck@intel.com Cc: fenghua.yu@intel.com Cc: peterz@infradead.org Cc: eranian@google.com Cc: vikas.shivappa@intel.com Cc: ak@linux.intel.com Cc: davidcc@google.com Cc: reinette.chatre@intel.com Link: http://lkml.kernel.org/r/1501017287-28083-18-git-send-email-vikas.shivappa@linux.intel.com --- arch/x86/kernel/cpu/intel_rdt_rdtgroup.c | 110 ++++++++++++++++++++++++++----- 1 file changed, 93 insertions(+), 17 deletions(-) (limited to 'arch') diff --git a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c index d11f4629a702..89457bba457a 100644 --- a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c +++ b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c @@ -181,15 +181,17 @@ static int rdtgroup_cpus_show(struct kernfs_open_file *of, /* * This is safe against intel_rdt_sched_in() called from __switch_to() * because __switch_to() is executed with interrupts disabled. A local call - * from update_closid() is proteced against __switch_to() because + * from update_closid_rmid() is proteced against __switch_to() because * preemption is disabled. */ -static void update_cpu_closid(void *info) +static void update_cpu_closid_rmid(void *info) { struct rdtgroup *r = info; - if (r) + if (r) { this_cpu_write(rdt_cpu_default.closid, r->closid); + this_cpu_write(rdt_cpu_default.rmid, r->mon.rmid); + } /* * We cannot unconditionally write the MSR because the current @@ -205,20 +207,72 @@ static void update_cpu_closid(void *info) * Per task closids/rmids must have been set up before calling this function. 
*/ static void -update_closid(const struct cpumask *cpu_mask, struct rdtgroup *r) +update_closid_rmid(const struct cpumask *cpu_mask, struct rdtgroup *r) { int cpu = get_cpu(); if (cpumask_test_cpu(cpu, cpu_mask)) - update_cpu_closid(r); - smp_call_function_many(cpu_mask, update_cpu_closid, r, 1); + update_cpu_closid_rmid(r); + smp_call_function_many(cpu_mask, update_cpu_closid_rmid, r, 1); put_cpu(); } +static int cpus_mon_write(struct rdtgroup *rdtgrp, cpumask_var_t newmask, + cpumask_var_t tmpmask) +{ + struct rdtgroup *prgrp = rdtgrp->mon.parent, *crgrp; + struct list_head *head; + + /* Check whether cpus belong to parent ctrl group */ + cpumask_andnot(tmpmask, newmask, &prgrp->cpu_mask); + if (cpumask_weight(tmpmask)) + return -EINVAL; + + /* Check whether cpus are dropped from this group */ + cpumask_andnot(tmpmask, &rdtgrp->cpu_mask, newmask); + if (cpumask_weight(tmpmask)) { + /* Give any dropped cpus to parent rdtgroup */ + cpumask_or(&prgrp->cpu_mask, &prgrp->cpu_mask, tmpmask); + update_closid_rmid(tmpmask, prgrp); + } + + /* + * If we added cpus, remove them from previous group that owned them + * and update per-cpu rmid + */ + cpumask_andnot(tmpmask, newmask, &rdtgrp->cpu_mask); + if (cpumask_weight(tmpmask)) { + head = &prgrp->mon.crdtgrp_list; + list_for_each_entry(crgrp, head, mon.crdtgrp_list) { + if (crgrp == rdtgrp) + continue; + cpumask_andnot(&crgrp->cpu_mask, &crgrp->cpu_mask, + tmpmask); + } + update_closid_rmid(tmpmask, rdtgrp); + } + + /* Done pushing/pulling - update this group with new mask */ + cpumask_copy(&rdtgrp->cpu_mask, newmask); + + return 0; +} + +static void cpumask_rdtgrp_clear(struct rdtgroup *r, struct cpumask *m) +{ + struct rdtgroup *crgrp; + + cpumask_andnot(&r->cpu_mask, &r->cpu_mask, m); + /* update the child mon group masks as well*/ + list_for_each_entry(crgrp, &r->mon.crdtgrp_list, mon.crdtgrp_list) + cpumask_and(&crgrp->cpu_mask, &r->cpu_mask, &crgrp->cpu_mask); +} + static int cpus_ctrl_write(struct rdtgroup *rdtgrp, cpumask_var_t newmask, - cpumask_var_t tmpmask) + cpumask_var_t tmpmask, cpumask_var_t tmpmask1) { - struct rdtgroup *r; + struct rdtgroup *r, *crgrp; + struct list_head *head; /* Check whether cpus are dropped from this group */ cpumask_andnot(tmpmask, &rdtgrp->cpu_mask, newmask); @@ -230,33 +284,47 @@ static int cpus_ctrl_write(struct rdtgroup *rdtgrp, cpumask_var_t newmask, /* Give any dropped cpus to rdtgroup_default */ cpumask_or(&rdtgroup_default.cpu_mask, &rdtgroup_default.cpu_mask, tmpmask); - update_closid(tmpmask, &rdtgroup_default); + update_closid_rmid(tmpmask, &rdtgroup_default); } /* - * If we added cpus, remove them from previous group that owned them - * and update per-cpu closid + * If we added cpus, remove them from previous group and + * the prev group's child groups that owned them + * and update per-cpu closid/rmid. */ cpumask_andnot(tmpmask, newmask, &rdtgrp->cpu_mask); if (cpumask_weight(tmpmask)) { list_for_each_entry(r, &rdt_all_groups, rdtgroup_list) { if (r == rdtgrp) continue; - cpumask_andnot(&r->cpu_mask, &r->cpu_mask, tmpmask); + cpumask_and(tmpmask1, &r->cpu_mask, tmpmask); + if (cpumask_weight(tmpmask1)) + cpumask_rdtgrp_clear(r, tmpmask1); } - update_closid(tmpmask, rdtgrp); + update_closid_rmid(tmpmask, rdtgrp); } /* Done pushing/pulling - update this group with new mask */ cpumask_copy(&rdtgrp->cpu_mask, newmask); + /* + * Clear child mon group masks since there is a new parent mask + * now and update the rmid for the cpus the child lost. 
+ */ + head = &rdtgrp->mon.crdtgrp_list; + list_for_each_entry(crgrp, head, mon.crdtgrp_list) { + cpumask_and(tmpmask, &rdtgrp->cpu_mask, &crgrp->cpu_mask); + update_closid_rmid(tmpmask, rdtgrp); + cpumask_clear(&crgrp->cpu_mask); + } + return 0; } static ssize_t rdtgroup_cpus_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { - cpumask_var_t tmpmask, newmask; + cpumask_var_t tmpmask, newmask, tmpmask1; struct rdtgroup *rdtgrp; int ret; @@ -269,6 +337,11 @@ static ssize_t rdtgroup_cpus_write(struct kernfs_open_file *of, free_cpumask_var(tmpmask); return -ENOMEM; } + if (!zalloc_cpumask_var(&tmpmask1, GFP_KERNEL)) { + free_cpumask_var(tmpmask); + free_cpumask_var(newmask); + return -ENOMEM; + } rdtgrp = rdtgroup_kn_lock_live(of->kn); if (!rdtgrp) { @@ -292,7 +365,9 @@ static ssize_t rdtgroup_cpus_write(struct kernfs_open_file *of, } if (rdtgrp->type == RDTCTRL_GROUP) - ret = cpus_ctrl_write(rdtgrp, newmask, tmpmask); + ret = cpus_ctrl_write(rdtgrp, newmask, tmpmask, tmpmask1); + else if (rdtgrp->type == RDTMON_GROUP) + ret = cpus_mon_write(rdtgrp, newmask, tmpmask); else ret = -EINVAL; @@ -300,6 +375,7 @@ unlock: rdtgroup_kn_unlock(of->kn); free_cpumask_var(tmpmask); free_cpumask_var(newmask); + free_cpumask_var(tmpmask1); return ret ?: nbytes; } @@ -1113,7 +1189,7 @@ static void rmdir_all_sub(void) } /* Notify online CPUs to update per cpu storage and PQR_ASSOC MSR */ get_online_cpus(); - update_closid(cpu_online_mask, &rdtgroup_default); + update_closid_rmid(cpu_online_mask, &rdtgroup_default); put_online_cpus(); kernfs_remove(kn_info); @@ -1374,7 +1450,7 @@ static int rdtgroup_rmdir(struct kernfs_node *kn) * task running on them. */ cpumask_or(tmpmask, tmpmask, &rdtgrp->cpu_mask); - update_closid(tmpmask, NULL); + update_closid_rmid(tmpmask, NULL); rdtgrp->flags = RDT_DELETED; closid_free(rdtgrp->closid); -- cgit v1.2.3 From 90c403e83101c87ee9e6df8c8d30ea8628ff8bfc Mon Sep 17 00:00:00 2001 From: Vikas Shivappa Date: Tue, 25 Jul 2017 14:14:37 -0700 Subject: x86/intel_rdt: Prepare for RDT monitor data support Rename the intel_rdt_schemata file to intel_rdt_ctrlmondata as we now want to add support for RDT monitoring data for the events that are supported in later patches. 
Signed-off-by: Vikas Shivappa Signed-off-by: Thomas Gleixner Cc: ravi.v.shankar@intel.com Cc: tony.luck@intel.com Cc: fenghua.yu@intel.com Cc: peterz@infradead.org Cc: eranian@google.com Cc: vikas.shivappa@intel.com Cc: ak@linux.intel.com Cc: davidcc@google.com Cc: reinette.chatre@intel.com Link: http://lkml.kernel.org/r/1501017287-28083-19-git-send-email-vikas.shivappa@linux.intel.com --- arch/x86/kernel/cpu/Makefile | 2 +- arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c | 286 ++++++++++++++++++++++++++++ arch/x86/kernel/cpu/intel_rdt_schemata.c | 286 ---------------------------- 3 files changed, 287 insertions(+), 287 deletions(-) create mode 100644 arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c delete mode 100644 arch/x86/kernel/cpu/intel_rdt_schemata.c (limited to 'arch') diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index 7584be0750c4..e17942c131c8 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -33,7 +33,7 @@ obj-$(CONFIG_CPU_SUP_CENTAUR) += centaur.o obj-$(CONFIG_CPU_SUP_TRANSMETA_32) += transmeta.o obj-$(CONFIG_CPU_SUP_UMC_32) += umc.o -obj-$(CONFIG_INTEL_RDT) += intel_rdt.o intel_rdt_rdtgroup.o intel_rdt_schemata.o intel_rdt_monitor.o +obj-$(CONFIG_INTEL_RDT) += intel_rdt.o intel_rdt_rdtgroup.o intel_rdt_monitor.o intel_rdt_ctrlmondata.o obj-$(CONFIG_X86_MCE) += mcheck/ obj-$(CONFIG_MTRR) += mtrr/ diff --git a/arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c b/arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c new file mode 100644 index 000000000000..952156c2688d --- /dev/null +++ b/arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c @@ -0,0 +1,286 @@ +/* + * Resource Director Technology(RDT) + * - Cache Allocation code. + * + * Copyright (C) 2016 Intel Corporation + * + * Authors: + * Fenghua Yu + * Tony Luck + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * More information about RDT be found in the Intel (R) x86 Architecture + * Software Developer Manual June 2016, volume 3, section 17.17. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include "intel_rdt.h" + +/* + * Check whether MBA bandwidth percentage value is correct. The value is + * checked against the minimum and max bandwidth values specified by the + * hardware. The allocated bandwidth percentage is rounded to the next + * control step available on the hardware. + */ +static bool bw_validate(char *buf, unsigned long *data, struct rdt_resource *r) +{ + unsigned long bw; + int ret; + + /* + * Only linear delay values is supported for current Intel SKUs. + */ + if (!r->membw.delay_linear) + return false; + + ret = kstrtoul(buf, 10, &bw); + if (ret) + return false; + + if (bw < r->membw.min_bw || bw > r->default_ctrl) + return false; + + *data = roundup(bw, (unsigned long)r->membw.bw_gran); + return true; +} + +int parse_bw(char *buf, struct rdt_resource *r, struct rdt_domain *d) +{ + unsigned long data; + + if (d->have_new_ctrl) + return -EINVAL; + + if (!bw_validate(buf, &data, r)) + return -EINVAL; + d->new_ctrl = data; + d->have_new_ctrl = true; + + return 0; +} + +/* + * Check whether a cache bit mask is valid. 
The SDM says: + * Please note that all (and only) contiguous '1' combinations + * are allowed (e.g. FFFFH, 0FF0H, 003CH, etc.). + * Additionally Haswell requires at least two bits set. + */ +static bool cbm_validate(char *buf, unsigned long *data, struct rdt_resource *r) +{ + unsigned long first_bit, zero_bit, val; + unsigned int cbm_len = r->cache.cbm_len; + int ret; + + ret = kstrtoul(buf, 16, &val); + if (ret) + return false; + + if (val == 0 || val > r->default_ctrl) + return false; + + first_bit = find_first_bit(&val, cbm_len); + zero_bit = find_next_zero_bit(&val, cbm_len, first_bit); + + if (find_next_bit(&val, cbm_len, zero_bit) < cbm_len) + return false; + + if ((zero_bit - first_bit) < r->cache.min_cbm_bits) + return false; + + *data = val; + return true; +} + +/* + * Read one cache bit mask (hex). Check that it is valid for the current + * resource type. + */ +int parse_cbm(char *buf, struct rdt_resource *r, struct rdt_domain *d) +{ + unsigned long data; + + if (d->have_new_ctrl) + return -EINVAL; + + if(!cbm_validate(buf, &data, r)) + return -EINVAL; + d->new_ctrl = data; + d->have_new_ctrl = true; + + return 0; +} + +/* + * For each domain in this resource we expect to find a series of: + * id=mask + * separated by ";". The "id" is in decimal, and must match one of + * the "id"s for this resource. + */ +static int parse_line(char *line, struct rdt_resource *r) +{ + char *dom = NULL, *id; + struct rdt_domain *d; + unsigned long dom_id; + +next: + if (!line || line[0] == '\0') + return 0; + dom = strsep(&line, ";"); + id = strsep(&dom, "="); + if (!dom || kstrtoul(id, 10, &dom_id)) + return -EINVAL; + dom = strim(dom); + list_for_each_entry(d, &r->domains, list) { + if (d->id == dom_id) { + if (r->parse_ctrlval(dom, r, d)) + return -EINVAL; + goto next; + } + } + return -EINVAL; +} + +static int update_domains(struct rdt_resource *r, int closid) +{ + struct msr_param msr_param; + cpumask_var_t cpu_mask; + struct rdt_domain *d; + int cpu; + + if (!zalloc_cpumask_var(&cpu_mask, GFP_KERNEL)) + return -ENOMEM; + + msr_param.low = closid; + msr_param.high = msr_param.low + 1; + msr_param.res = r; + + list_for_each_entry(d, &r->domains, list) { + if (d->have_new_ctrl && d->new_ctrl != d->ctrl_val[closid]) { + cpumask_set_cpu(cpumask_any(&d->cpu_mask), cpu_mask); + d->ctrl_val[closid] = d->new_ctrl; + } + } + if (cpumask_empty(cpu_mask)) + goto done; + cpu = get_cpu(); + /* Update CBM on this cpu if it's in cpu_mask. */ + if (cpumask_test_cpu(cpu, cpu_mask)) + rdt_ctrl_update(&msr_param); + /* Update CBM on other cpus. 
*/ + smp_call_function_many(cpu_mask, rdt_ctrl_update, &msr_param, 1); + put_cpu(); + +done: + free_cpumask_var(cpu_mask); + + return 0; +} + +static int rdtgroup_parse_resource(char *resname, char *tok, int closid) +{ + struct rdt_resource *r; + + for_each_alloc_enabled_rdt_resource(r) { + if (!strcmp(resname, r->name) && closid < r->num_closid) + return parse_line(tok, r); + } + return -EINVAL; +} + +ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + struct rdtgroup *rdtgrp; + struct rdt_domain *dom; + struct rdt_resource *r; + char *tok, *resname; + int closid, ret = 0; + + /* Valid input requires a trailing newline */ + if (nbytes == 0 || buf[nbytes - 1] != '\n') + return -EINVAL; + buf[nbytes - 1] = '\0'; + + rdtgrp = rdtgroup_kn_lock_live(of->kn); + if (!rdtgrp) { + rdtgroup_kn_unlock(of->kn); + return -ENOENT; + } + + closid = rdtgrp->closid; + + for_each_alloc_enabled_rdt_resource(r) { + list_for_each_entry(dom, &r->domains, list) + dom->have_new_ctrl = false; + } + + while ((tok = strsep(&buf, "\n")) != NULL) { + resname = strim(strsep(&tok, ":")); + if (!tok) { + ret = -EINVAL; + goto out; + } + ret = rdtgroup_parse_resource(resname, tok, closid); + if (ret) + goto out; + } + + for_each_alloc_enabled_rdt_resource(r) { + ret = update_domains(r, closid); + if (ret) + goto out; + } + +out: + rdtgroup_kn_unlock(of->kn); + return ret ?: nbytes; +} + +static void show_doms(struct seq_file *s, struct rdt_resource *r, int closid) +{ + struct rdt_domain *dom; + bool sep = false; + + seq_printf(s, "%*s:", max_name_width, r->name); + list_for_each_entry(dom, &r->domains, list) { + if (sep) + seq_puts(s, ";"); + seq_printf(s, r->format_str, dom->id, max_data_width, + dom->ctrl_val[closid]); + sep = true; + } + seq_puts(s, "\n"); +} + +int rdtgroup_schemata_show(struct kernfs_open_file *of, + struct seq_file *s, void *v) +{ + struct rdtgroup *rdtgrp; + struct rdt_resource *r; + int closid, ret = 0; + + rdtgrp = rdtgroup_kn_lock_live(of->kn); + if (rdtgrp) { + closid = rdtgrp->closid; + for_each_alloc_enabled_rdt_resource(r) { + if (closid < r->num_closid) + show_doms(s, r, closid); + } + } else { + ret = -ENOENT; + } + rdtgroup_kn_unlock(of->kn); + return ret; +} diff --git a/arch/x86/kernel/cpu/intel_rdt_schemata.c b/arch/x86/kernel/cpu/intel_rdt_schemata.c deleted file mode 100644 index 952156c2688d..000000000000 --- a/arch/x86/kernel/cpu/intel_rdt_schemata.c +++ /dev/null @@ -1,286 +0,0 @@ -/* - * Resource Director Technology(RDT) - * - Cache Allocation code. - * - * Copyright (C) 2016 Intel Corporation - * - * Authors: - * Fenghua Yu - * Tony Luck - * - * This program is free software; you can redistribute it and/or modify it - * under the terms and conditions of the GNU General Public License, - * version 2, as published by the Free Software Foundation. - * - * This program is distributed in the hope it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. - * - * More information about RDT be found in the Intel (R) x86 Architecture - * Software Developer Manual June 2016, volume 3, section 17.17. - */ - -#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt - -#include -#include -#include -#include "intel_rdt.h" - -/* - * Check whether MBA bandwidth percentage value is correct. The value is - * checked against the minimum and max bandwidth values specified by the - * hardware. 
The allocated bandwidth percentage is rounded to the next - * control step available on the hardware. - */ -static bool bw_validate(char *buf, unsigned long *data, struct rdt_resource *r) -{ - unsigned long bw; - int ret; - - /* - * Only linear delay values is supported for current Intel SKUs. - */ - if (!r->membw.delay_linear) - return false; - - ret = kstrtoul(buf, 10, &bw); - if (ret) - return false; - - if (bw < r->membw.min_bw || bw > r->default_ctrl) - return false; - - *data = roundup(bw, (unsigned long)r->membw.bw_gran); - return true; -} - -int parse_bw(char *buf, struct rdt_resource *r, struct rdt_domain *d) -{ - unsigned long data; - - if (d->have_new_ctrl) - return -EINVAL; - - if (!bw_validate(buf, &data, r)) - return -EINVAL; - d->new_ctrl = data; - d->have_new_ctrl = true; - - return 0; -} - -/* - * Check whether a cache bit mask is valid. The SDM says: - * Please note that all (and only) contiguous '1' combinations - * are allowed (e.g. FFFFH, 0FF0H, 003CH, etc.). - * Additionally Haswell requires at least two bits set. - */ -static bool cbm_validate(char *buf, unsigned long *data, struct rdt_resource *r) -{ - unsigned long first_bit, zero_bit, val; - unsigned int cbm_len = r->cache.cbm_len; - int ret; - - ret = kstrtoul(buf, 16, &val); - if (ret) - return false; - - if (val == 0 || val > r->default_ctrl) - return false; - - first_bit = find_first_bit(&val, cbm_len); - zero_bit = find_next_zero_bit(&val, cbm_len, first_bit); - - if (find_next_bit(&val, cbm_len, zero_bit) < cbm_len) - return false; - - if ((zero_bit - first_bit) < r->cache.min_cbm_bits) - return false; - - *data = val; - return true; -} - -/* - * Read one cache bit mask (hex). Check that it is valid for the current - * resource type. - */ -int parse_cbm(char *buf, struct rdt_resource *r, struct rdt_domain *d) -{ - unsigned long data; - - if (d->have_new_ctrl) - return -EINVAL; - - if(!cbm_validate(buf, &data, r)) - return -EINVAL; - d->new_ctrl = data; - d->have_new_ctrl = true; - - return 0; -} - -/* - * For each domain in this resource we expect to find a series of: - * id=mask - * separated by ";". The "id" is in decimal, and must match one of - * the "id"s for this resource. - */ -static int parse_line(char *line, struct rdt_resource *r) -{ - char *dom = NULL, *id; - struct rdt_domain *d; - unsigned long dom_id; - -next: - if (!line || line[0] == '\0') - return 0; - dom = strsep(&line, ";"); - id = strsep(&dom, "="); - if (!dom || kstrtoul(id, 10, &dom_id)) - return -EINVAL; - dom = strim(dom); - list_for_each_entry(d, &r->domains, list) { - if (d->id == dom_id) { - if (r->parse_ctrlval(dom, r, d)) - return -EINVAL; - goto next; - } - } - return -EINVAL; -} - -static int update_domains(struct rdt_resource *r, int closid) -{ - struct msr_param msr_param; - cpumask_var_t cpu_mask; - struct rdt_domain *d; - int cpu; - - if (!zalloc_cpumask_var(&cpu_mask, GFP_KERNEL)) - return -ENOMEM; - - msr_param.low = closid; - msr_param.high = msr_param.low + 1; - msr_param.res = r; - - list_for_each_entry(d, &r->domains, list) { - if (d->have_new_ctrl && d->new_ctrl != d->ctrl_val[closid]) { - cpumask_set_cpu(cpumask_any(&d->cpu_mask), cpu_mask); - d->ctrl_val[closid] = d->new_ctrl; - } - } - if (cpumask_empty(cpu_mask)) - goto done; - cpu = get_cpu(); - /* Update CBM on this cpu if it's in cpu_mask. */ - if (cpumask_test_cpu(cpu, cpu_mask)) - rdt_ctrl_update(&msr_param); - /* Update CBM on other cpus. 
*/ - smp_call_function_many(cpu_mask, rdt_ctrl_update, &msr_param, 1); - put_cpu(); - -done: - free_cpumask_var(cpu_mask); - - return 0; -} - -static int rdtgroup_parse_resource(char *resname, char *tok, int closid) -{ - struct rdt_resource *r; - - for_each_alloc_enabled_rdt_resource(r) { - if (!strcmp(resname, r->name) && closid < r->num_closid) - return parse_line(tok, r); - } - return -EINVAL; -} - -ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of, - char *buf, size_t nbytes, loff_t off) -{ - struct rdtgroup *rdtgrp; - struct rdt_domain *dom; - struct rdt_resource *r; - char *tok, *resname; - int closid, ret = 0; - - /* Valid input requires a trailing newline */ - if (nbytes == 0 || buf[nbytes - 1] != '\n') - return -EINVAL; - buf[nbytes - 1] = '\0'; - - rdtgrp = rdtgroup_kn_lock_live(of->kn); - if (!rdtgrp) { - rdtgroup_kn_unlock(of->kn); - return -ENOENT; - } - - closid = rdtgrp->closid; - - for_each_alloc_enabled_rdt_resource(r) { - list_for_each_entry(dom, &r->domains, list) - dom->have_new_ctrl = false; - } - - while ((tok = strsep(&buf, "\n")) != NULL) { - resname = strim(strsep(&tok, ":")); - if (!tok) { - ret = -EINVAL; - goto out; - } - ret = rdtgroup_parse_resource(resname, tok, closid); - if (ret) - goto out; - } - - for_each_alloc_enabled_rdt_resource(r) { - ret = update_domains(r, closid); - if (ret) - goto out; - } - -out: - rdtgroup_kn_unlock(of->kn); - return ret ?: nbytes; -} - -static void show_doms(struct seq_file *s, struct rdt_resource *r, int closid) -{ - struct rdt_domain *dom; - bool sep = false; - - seq_printf(s, "%*s:", max_name_width, r->name); - list_for_each_entry(dom, &r->domains, list) { - if (sep) - seq_puts(s, ";"); - seq_printf(s, r->format_str, dom->id, max_data_width, - dom->ctrl_val[closid]); - sep = true; - } - seq_puts(s, "\n"); -} - -int rdtgroup_schemata_show(struct kernfs_open_file *of, - struct seq_file *s, void *v) -{ - struct rdtgroup *rdtgrp; - struct rdt_resource *r; - int closid, ret = 0; - - rdtgrp = rdtgroup_kn_lock_live(of->kn); - if (rdtgrp) { - closid = rdtgrp->closid; - for_each_alloc_enabled_rdt_resource(r) { - if (closid < r->num_closid) - show_doms(s, r, closid); - } - } else { - ret = -ENOENT; - } - rdtgroup_kn_unlock(of->kn); - return ret; -} -- cgit v1.2.3 From d89b7379015fc561060a4094676d143e6ed264e7 Mon Sep 17 00:00:00 2001 From: Vikas Shivappa Date: Tue, 25 Jul 2017 14:14:38 -0700 Subject: x86/intel_rdt/cqm: Add mon_data Add a mon_data directory for the root rdtgroup and all other rdtgroups. The directory holds all of the monitored data for all domains and events of all resources being monitored. The mon_data itself has a list of directories in the format mon__. Each of these subdirectories contain one file per event in the mode "0444". Reading the file displays a snapshot of the monitored data for the event the file represents. 
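Each event file is self-describing: when the file is created, the (resource, domain, event) triple is packed into the kernfs node's private pointer, and a single seq_file show handler unpacks it again to decide which counter to read. A minimal sketch of that encoding, with the field widths taken from the patch (the encode/decode helpers here are illustrative only and are not part of the patch):

/* Illustrative sketch: pack an event file's identity into one pointer-sized word. */
union mon_data_bits {
	void *priv;
	struct {
		unsigned int rid	: 10;	/* resource index */
		unsigned int evtid	:  8;	/* monitor event id */
		unsigned int domid	: 14;	/* domain (cache) id */
	} u;
};

static void *mon_file_encode(unsigned int rid, unsigned int domid,
			     unsigned int evtid)
{
	union mon_data_bits md = { 0 };

	md.u.rid = rid;
	md.u.evtid = evtid;
	md.u.domid = domid;

	return md.priv;		/* stored as kn->priv when the file is created */
}

static void mon_file_decode(void *priv, unsigned int *rid,
			    unsigned int *domid, unsigned int *evtid)
{
	union mon_data_bits md;

	md.priv = priv;		/* recovered in the seq_file show handler */
	*rid = md.u.rid;
	*domid = md.u.domid;
	*evtid = md.u.evtid;
}

With the identity carried in kn->priv, every event file can share one set of kernfs_ops and the show handler needs no parsing of the file path.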
For ex, on a 2 socket Broadwell with llc_occupancy being monitored the mon_data contents look as below: $ ls /sys/fs/resctrl/p1/mon_data/ mon_L3_00 mon_L3_01 Each domain directory has one file per event: $ ls /sys/fs/resctrl/p1/mon_data/mon_L3_00/ llc_occupancy To read current llc_occupancy of ctrl_mon group p1 $ cat /sys/fs/resctrl/p1/mon_data/mon_L3_00/llc_occupancy 33789096 [This patch idea is based on Tony's sample patches to organise data in a per domain directory and have one file per event (and use the fp->priv to store mon data bits)] Signed-off-by: Vikas Shivappa Signed-off-by: Thomas Gleixner Cc: ravi.v.shankar@intel.com Cc: tony.luck@intel.com Cc: fenghua.yu@intel.com Cc: peterz@infradead.org Cc: eranian@google.com Cc: vikas.shivappa@intel.com Cc: ak@linux.intel.com Cc: davidcc@google.com Cc: reinette.chatre@intel.com Link: http://lkml.kernel.org/r/1501017287-28083-20-git-send-email-vikas.shivappa@linux.intel.com --- arch/x86/kernel/cpu/intel_rdt.c | 9 +- arch/x86/kernel/cpu/intel_rdt.h | 29 ++++++ arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c | 55 ++++++++++- arch/x86/kernel/cpu/intel_rdt_monitor.c | 49 ++++++++++ arch/x86/kernel/cpu/intel_rdt_rdtgroup.c | 145 ++++++++++++++++++++++++++++ 5 files changed, 284 insertions(+), 3 deletions(-) (limited to 'arch') diff --git a/arch/x86/kernel/cpu/intel_rdt.c b/arch/x86/kernel/cpu/intel_rdt.c index 303b085b969f..050ea15259ea 100644 --- a/arch/x86/kernel/cpu/intel_rdt.c +++ b/arch/x86/kernel/cpu/intel_rdt.c @@ -71,6 +71,7 @@ cat_wrmsr(struct rdt_domain *d, struct msr_param *m, struct rdt_resource *r); struct rdt_resource rdt_resources_all[] = { [RDT_RESOURCE_L3] = { + .rid = RDT_RESOURCE_L3, .name = "L3", .domains = domain_init(RDT_RESOURCE_L3), .msr_base = IA32_L3_CBM_BASE, @@ -87,6 +88,7 @@ struct rdt_resource rdt_resources_all[] = { }, [RDT_RESOURCE_L3DATA] = { + .rid = RDT_RESOURCE_L3DATA, .name = "L3DATA", .domains = domain_init(RDT_RESOURCE_L3DATA), .msr_base = IA32_L3_CBM_BASE, @@ -103,6 +105,7 @@ struct rdt_resource rdt_resources_all[] = { }, [RDT_RESOURCE_L3CODE] = { + .rid = RDT_RESOURCE_L3CODE, .name = "L3CODE", .domains = domain_init(RDT_RESOURCE_L3CODE), .msr_base = IA32_L3_CBM_BASE, @@ -119,6 +122,7 @@ struct rdt_resource rdt_resources_all[] = { }, [RDT_RESOURCE_L2] = { + .rid = RDT_RESOURCE_L2, .name = "L2", .domains = domain_init(RDT_RESOURCE_L2), .msr_base = IA32_L2_CBM_BASE, @@ -135,6 +139,7 @@ struct rdt_resource rdt_resources_all[] = { }, [RDT_RESOURCE_MBA] = { + .rid = RDT_RESOURCE_MBA, .name = "MB", .domains = domain_init(RDT_RESOURCE_MBA), .msr_base = IA32_MBA_THRTL_BASE, @@ -362,8 +367,8 @@ void rdt_ctrl_update(void *arg) * caller, return the first domain whose id is bigger than the input id. * The domain list is sorted by id in ascending order. */ -static struct rdt_domain *rdt_find_domain(struct rdt_resource *r, int id, - struct list_head **pos) +struct rdt_domain *rdt_find_domain(struct rdt_resource *r, int id, + struct list_head **pos) { struct rdt_domain *d; struct list_head *l; diff --git a/arch/x86/kernel/cpu/intel_rdt.h b/arch/x86/kernel/cpu/intel_rdt.h index 6f070476416f..7fcaa5f82bd0 100644 --- a/arch/x86/kernel/cpu/intel_rdt.h +++ b/arch/x86/kernel/cpu/intel_rdt.h @@ -33,6 +33,27 @@ struct mon_evt { struct list_head list; }; +/** + * struct mon_data_bits - Monitoring details for each event file + * @rid: Resource id associated with the event file. 
+ * @evtid: Event id associated with the event file + * @domid: The domain to which the event file belongs + */ +union mon_data_bits { + void *priv; + struct { + unsigned int rid : 10; + unsigned int evtid : 8; + unsigned int domid : 14; + } u; +}; + +struct rmid_read { + struct rdtgroup *rgrp; + int evtid; + u64 val; +}; + extern unsigned int intel_cqm_threshold; extern bool rdt_alloc_capable; extern bool rdt_mon_capable; @@ -46,11 +67,13 @@ enum rdt_group_type { /** * struct mongroup - store mon group's data in resctrl fs. + * @mon_data_kn kernlfs node for the mon_data directory * @parent: parent rdtgrp * @crdtgrp_list: child rdtgroup node list * @rmid: rmid for this rdtgroup */ struct mongroup { + struct kernfs_node *mon_data_kn; struct rdtgroup *parent; struct list_head crdtgrp_list; u32 rmid; @@ -209,6 +232,7 @@ static inline bool is_llc_occupancy_enabled(void) /** * struct rdt_resource - attributes of an RDT resource + * @rid: The index of the resource * @alloc_enabled: Is allocation enabled on this machine * @mon_enabled: Is monitoring enabled for this feature * @alloc_capable: Is allocation available on this machine @@ -230,6 +254,7 @@ static inline bool is_llc_occupancy_enabled(void) * @fflags: flags to choose base and info files */ struct rdt_resource { + int rid; bool alloc_enabled; bool mon_enabled; bool alloc_capable; @@ -323,6 +348,8 @@ union cpuid_0x10_x_edx { void rdt_ctrl_update(void *arg); struct rdtgroup *rdtgroup_kn_lock_live(struct kernfs_node *kn); void rdtgroup_kn_unlock(struct kernfs_node *kn); +struct rdt_domain *rdt_find_domain(struct rdt_resource *r, int id, + struct list_head **pos); ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off); int rdtgroup_schemata_show(struct kernfs_open_file *of, @@ -331,5 +358,7 @@ struct rdt_domain *get_domain_from_cpu(int cpu, struct rdt_resource *r); int alloc_rmid(void); void free_rmid(u32 rmid); int rdt_get_mon_l3_config(struct rdt_resource *r); +void mon_event_count(void *info); +int rdtgroup_mondata_show(struct seq_file *m, void *arg); #endif /* _ASM_X86_INTEL_RDT_H */ diff --git a/arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c b/arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c index 952156c2688d..f44e77dc568b 100644 --- a/arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c +++ b/arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c @@ -269,7 +269,8 @@ int rdtgroup_schemata_show(struct kernfs_open_file *of, { struct rdtgroup *rdtgrp; struct rdt_resource *r; - int closid, ret = 0; + int ret = 0; + u32 closid; rdtgrp = rdtgroup_kn_lock_live(of->kn); if (rdtgrp) { @@ -284,3 +285,55 @@ int rdtgroup_schemata_show(struct kernfs_open_file *of, rdtgroup_kn_unlock(of->kn); return ret; } + +void mon_event_read(struct rmid_read *rr, struct rdt_domain *d, + struct rdtgroup *rdtgrp, int evtid) +{ + /* + * setup the parameters to send to the IPI to read the data. 
+ */ + rr->rgrp = rdtgrp; + rr->evtid = evtid; + rr->val = 0; + + smp_call_function_any(&d->cpu_mask, mon_event_count, rr, 1); +} + +int rdtgroup_mondata_show(struct seq_file *m, void *arg) +{ + struct kernfs_open_file *of = m->private; + u32 resid, evtid, domid; + struct rdtgroup *rdtgrp; + struct rdt_resource *r; + union mon_data_bits md; + struct rdt_domain *d; + struct rmid_read rr; + int ret = 0; + + rdtgrp = rdtgroup_kn_lock_live(of->kn); + + md.priv = of->kn->priv; + resid = md.u.rid; + domid = md.u.domid; + evtid = md.u.evtid; + + r = &rdt_resources_all[resid]; + d = rdt_find_domain(r, domid, NULL); + if (!d) { + ret = -ENOENT; + goto out; + } + + mon_event_read(&rr, d, rdtgrp, evtid); + + if (rr.val & RMID_VAL_ERROR) + seq_puts(m, "Error\n"); + else if (rr.val & RMID_VAL_UNAVAIL) + seq_puts(m, "Unavailable\n"); + else + seq_printf(m, "%llu\n", rr.val * r->mon_scale); + +out: + rdtgroup_kn_unlock(of->kn); + return ret; +} diff --git a/arch/x86/kernel/cpu/intel_rdt_monitor.c b/arch/x86/kernel/cpu/intel_rdt_monitor.c index b6732c51829b..f8f06f5b7fc3 100644 --- a/arch/x86/kernel/cpu/intel_rdt_monitor.c +++ b/arch/x86/kernel/cpu/intel_rdt_monitor.c @@ -294,6 +294,55 @@ void free_rmid(u32 rmid) list_add_tail(&entry->list, &rmid_free_lru); } +static int __mon_event_count(u32 rmid, struct rmid_read *rr) +{ + u64 tval; + + tval = __rmid_read(rmid, rr->evtid); + if (tval & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL)) { + rr->val = tval; + return -EINVAL; + } + switch (rr->evtid) { + case QOS_L3_OCCUP_EVENT_ID: + rr->val += tval; + return 0; + default: + /* + * Code would never reach here because + * an invalid event id would fail the __rmid_read. + */ + return -EINVAL; + } +} + +/* + * This is called via IPI to read the CQM/MBM counters + * on a domain. + */ +void mon_event_count(void *info) +{ + struct rdtgroup *rdtgrp, *entry; + struct rmid_read *rr = info; + struct list_head *head; + + rdtgrp = rr->rgrp; + + if (__mon_event_count(rdtgrp->mon.rmid, rr)) + return; + + /* + * For Ctrl groups read data from child monitor groups. 
+ */ + head = &rdtgrp->mon.crdtgrp_list; + + if (rdtgrp->type == RDTCTRL_GROUP) { + list_for_each_entry(entry, head, mon.crdtgrp_list) { + if (__mon_event_count(entry->mon.rmid, rr)) + return; + } + } +} static int dom_data_init(struct rdt_resource *r) { struct rmid_entry *entry = NULL; diff --git a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c index 89457bba457a..59de36dd534f 100644 --- a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c +++ b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c @@ -152,6 +152,11 @@ static struct kernfs_ops rdtgroup_kf_single_ops = { .seq_show = rdtgroup_seqfile_show, }; +static struct kernfs_ops kf_mondata_ops = { + .atomic_write_len = PAGE_SIZE, + .seq_show = rdtgroup_mondata_show, +}; + static bool is_cpu_list(struct kernfs_open_file *of) { struct rftype *rft = of->kn->priv; @@ -1217,6 +1222,140 @@ static struct file_system_type rdt_fs_type = { .kill_sb = rdt_kill_sb, }; +static int mon_addfile(struct kernfs_node *parent_kn, const char *name, + void *priv) +{ + struct kernfs_node *kn; + int ret = 0; + + kn = __kernfs_create_file(parent_kn, name, 0444, 0, + &kf_mondata_ops, priv, NULL, NULL); + if (IS_ERR(kn)) + return PTR_ERR(kn); + + ret = rdtgroup_kn_set_ugid(kn); + if (ret) { + kernfs_remove(kn); + return ret; + } + + return ret; +} + +static int mkdir_mondata_subdir(struct kernfs_node *parent_kn, + struct rdt_domain *d, + struct rdt_resource *r, struct rdtgroup *prgrp) +{ + union mon_data_bits priv; + struct kernfs_node *kn; + struct mon_evt *mevt; + char name[32]; + int ret; + + sprintf(name, "mon_%s_%02d", r->name, d->id); + /* create the directory */ + kn = kernfs_create_dir(parent_kn, name, parent_kn->mode, prgrp); + if (IS_ERR(kn)) + return PTR_ERR(kn); + + /* + * This extra ref will be put in kernfs_remove() and guarantees + * that kn is always accessible. + */ + kernfs_get(kn); + ret = rdtgroup_kn_set_ugid(kn); + if (ret) + goto out_destroy; + + if (WARN_ON(list_empty(&r->evt_list))) { + ret = -EPERM; + goto out_destroy; + } + + priv.u.rid = r->rid; + priv.u.domid = d->id; + list_for_each_entry(mevt, &r->evt_list, list) { + priv.u.evtid = mevt->evtid; + ret = mon_addfile(kn, mevt->name, priv.priv); + if (ret) + goto out_destroy; + } + kernfs_activate(kn); + return 0; + +out_destroy: + kernfs_remove(kn); + return ret; +} + +static int mkdir_mondata_subdir_alldom(struct kernfs_node *parent_kn, + struct rdt_resource *r, + struct rdtgroup *prgrp) +{ + struct rdt_domain *dom; + int ret; + + list_for_each_entry(dom, &r->domains, list) { + ret = mkdir_mondata_subdir(parent_kn, dom, r, prgrp); + if (ret) + return ret; + } + + return 0; +} + +/* + * This creates a directory mon_data which contains the monitored data. + * + * mon_data has one directory for each domain whic are named + * in the format mon__. For ex: A mon_data + * with L3 domain looks as below: + * ./mon_data: + * mon_L3_00 + * mon_L3_01 + * mon_L3_02 + * ... + * + * Each domain directory has one file per event: + * ./mon_L3_00/: + * llc_occupancy + * + */ +static int mkdir_mondata_all(struct kernfs_node *parent_kn, + struct rdtgroup *prgrp, + struct kernfs_node **dest_kn) +{ + struct rdt_resource *r; + struct kernfs_node *kn; + int ret; + + /* + * Create the mon_data directory first. + */ + ret = mongroup_create_dir(parent_kn, NULL, "mon_data", &kn); + if (ret) + return ret; + + if (dest_kn) + *dest_kn = kn; + + /* + * Create the subdirectories for each domain. 
Note that all events + * in a domain like L3 are grouped into a resource whose domain is L3 + */ + for_each_mon_enabled_rdt_resource(r) { + ret = mkdir_mondata_subdir_alldom(kn, r, prgrp); + if (ret) + goto out_destroy; + } + + return 0; + +out_destroy: + kernfs_remove(kn); + return ret; +} + static int mkdir_rdt_prepare(struct kernfs_node *parent_kn, struct kernfs_node *prgrp_kn, const char *name, umode_t mode, @@ -1275,6 +1414,10 @@ static int mkdir_rdt_prepare(struct kernfs_node *parent_kn, if (ret < 0) goto out_destroy; rdtgrp->mon.rmid = ret; + + ret = mkdir_mondata_all(kn, rdtgrp, &rdtgrp->mon.mon_data_kn); + if (ret) + goto out_idfree; } kernfs_activate(kn); @@ -1283,6 +1426,8 @@ static int mkdir_rdt_prepare(struct kernfs_node *parent_kn, */ return 0; +out_idfree: + free_rmid(rdtgrp->mon.rmid); out_destroy: kernfs_remove(rdtgrp->kn); out_free_rgrp: -- cgit v1.2.3 From f9049547f7e72377049d717354b2f56f36a5854a Mon Sep 17 00:00:00 2001 From: Vikas Shivappa Date: Tue, 25 Jul 2017 14:14:39 -0700 Subject: x86/intel_rdt: Separate the ctrl bits from rmdir Re-factor the code to separate the ctrl group removal from the rmdir to prepare to add RDT monitoring group removal. Signed-off-by: Vikas Shivappa Signed-off-by: Thomas Gleixner Cc: ravi.v.shankar@intel.com Cc: tony.luck@intel.com Cc: fenghua.yu@intel.com Cc: peterz@infradead.org Cc: eranian@google.com Cc: vikas.shivappa@intel.com Cc: ak@linux.intel.com Cc: davidcc@google.com Cc: reinette.chatre@intel.com Link: http://lkml.kernel.org/r/1501017287-28083-21-git-send-email-vikas.shivappa@linux.intel.com --- arch/x86/kernel/cpu/intel_rdt_rdtgroup.c | 48 ++++++++++++++++++++++---------- 1 file changed, 33 insertions(+), 15 deletions(-) (limited to 'arch') diff --git a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c index 59de36dd534f..3b62287a2f01 100644 --- a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c +++ b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c @@ -1565,20 +1565,10 @@ static int rdtgroup_mkdir(struct kernfs_node *parent_kn, const char *name, return -EPERM; } -static int rdtgroup_rmdir(struct kernfs_node *kn) +static int rdtgroup_rmdir_ctrl(struct kernfs_node *kn, struct rdtgroup *rdtgrp, + cpumask_var_t tmpmask) { - int ret, cpu, closid = rdtgroup_default.closid; - struct rdtgroup *rdtgrp; - cpumask_var_t tmpmask; - - if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL)) - return -ENOMEM; - - rdtgrp = rdtgroup_kn_lock_live(kn); - if (!rdtgrp) { - ret = -EPERM; - goto out; - } + int cpu; /* Give any tasks back to the default group */ rdt_move_group_tasks(rdtgrp, &rdtgroup_default, tmpmask); @@ -1589,7 +1579,7 @@ static int rdtgroup_rmdir(struct kernfs_node *kn) /* Update per cpu closid of the moved CPUs first */ for_each_cpu(cpu, &rdtgrp->cpu_mask) - per_cpu(rdt_cpu_default.closid, cpu) = closid; + per_cpu(rdt_cpu_default.closid, cpu) = rdtgroup_default.closid; /* * Update the MSR on moved CPUs and CPUs which have moved * task running on them. 
@@ -1607,7 +1597,35 @@ static int rdtgroup_rmdir(struct kernfs_node *kn) */ kernfs_get(kn); kernfs_remove(rdtgrp->kn); - ret = 0; + + return 0; +} + +static int rdtgroup_rmdir(struct kernfs_node *kn) +{ + struct kernfs_node *parent_kn = kn->parent; + struct rdtgroup *rdtgrp; + cpumask_var_t tmpmask; + int ret = 0; + + if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL)) + return -ENOMEM; + + rdtgrp = rdtgroup_kn_lock_live(kn); + if (!rdtgrp) { + ret = -EPERM; + goto out; + } + + /* + * If the rdtgroup is a ctrl_mon group and parent directory + * is the root directory, remove the ctrl group. + */ + if (rdtgrp->type == RDTCTRL_GROUP && parent_kn == rdtgroup_default.kn) + ret = rdtgroup_rmdir_ctrl(kn, rdtgrp, tmpmask); + else + ret = -EPERM; + out: rdtgroup_kn_unlock(kn); free_cpumask_var(tmpmask); -- cgit v1.2.3 From f3cbeacaa06e2b8c2f3ce8531e9aa3fe1f2762cd Mon Sep 17 00:00:00 2001 From: Vikas Shivappa Date: Tue, 25 Jul 2017 14:14:40 -0700 Subject: x86/intel_rdt/cqm: Add rmdir support Resource groups (ctrl_mon and monitor groups) are represented by directories in resctrl fs. Add support to remove the directories. When a ctrl_mon directory is removed all the cpus and tasks are assigned back to the root rdtgroup. When a monitor group is removed the cpus and tasks are returned to the parent control group. Signed-off-by: Vikas Shivappa Signed-off-by: Thomas Gleixner Cc: ravi.v.shankar@intel.com Cc: tony.luck@intel.com Cc: fenghua.yu@intel.com Cc: peterz@infradead.org Cc: eranian@google.com Cc: vikas.shivappa@intel.com Cc: ak@linux.intel.com Cc: davidcc@google.com Cc: reinette.chatre@intel.com Link: http://lkml.kernel.org/r/1501017287-28083-22-git-send-email-vikas.shivappa@linux.intel.com --- arch/x86/kernel/cpu/intel_rdt_rdtgroup.c | 90 ++++++++++++++++++++++++++++++-- 1 file changed, 86 insertions(+), 4 deletions(-) (limited to 'arch') diff --git a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c index 3b62287a2f01..e61908169dca 100644 --- a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c +++ b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c @@ -1130,6 +1130,18 @@ static int reset_all_ctrls(struct rdt_resource *r) return 0; } +static bool is_closid_match(struct task_struct *t, struct rdtgroup *r) +{ + return (rdt_alloc_capable && + (r->type == RDTCTRL_GROUP) && (t->closid == r->closid)); +} + +static bool is_rmid_match(struct task_struct *t, struct rdtgroup *r) +{ + return (rdt_mon_capable && + (r->type == RDTMON_GROUP) && (t->rmid == r->mon.rmid)); +} + /* * Move tasks from one to the other group. If @from is NULL, then all tasks * in the systems are moved unconditionally (used for teardown). 
@@ -1145,8 +1157,11 @@ static void rdt_move_group_tasks(struct rdtgroup *from, struct rdtgroup *to, read_lock(&tasklist_lock); for_each_process_thread(p, t) { - if (!from || t->closid == from->closid) { + if (!from || is_closid_match(t, from) || + is_rmid_match(t, from)) { t->closid = to->closid; + t->rmid = to->mon.rmid; + #ifdef CONFIG_SMP /* * This is safe on x86 w/o barriers as the ordering @@ -1165,6 +1180,19 @@ static void rdt_move_group_tasks(struct rdtgroup *from, struct rdtgroup *to, read_unlock(&tasklist_lock); } +static void free_all_child_rdtgrp(struct rdtgroup *rdtgrp) +{ + struct rdtgroup *sentry, *stmp; + struct list_head *head; + + head = &rdtgrp->mon.crdtgrp_list; + list_for_each_entry_safe(sentry, stmp, head, mon.crdtgrp_list) { + free_rmid(sentry->mon.rmid); + list_del(&sentry->mon.crdtgrp_list); + kfree(sentry); + } +} + /* * Forcibly remove all of subdirectories under root. */ @@ -1565,6 +1593,44 @@ static int rdtgroup_mkdir(struct kernfs_node *parent_kn, const char *name, return -EPERM; } +static int rdtgroup_rmdir_mon(struct kernfs_node *kn, struct rdtgroup *rdtgrp, + cpumask_var_t tmpmask) +{ + struct rdtgroup *prdtgrp = rdtgrp->mon.parent; + int cpu; + + /* Give any tasks back to the parent group */ + rdt_move_group_tasks(rdtgrp, prdtgrp, tmpmask); + + /* Update per cpu rmid of the moved CPUs first */ + for_each_cpu(cpu, &rdtgrp->cpu_mask) + per_cpu(rdt_cpu_default.rmid, cpu) = prdtgrp->mon.rmid; + /* + * Update the MSR on moved CPUs and CPUs which have moved + * task running on them. + */ + cpumask_or(tmpmask, tmpmask, &rdtgrp->cpu_mask); + update_closid_rmid(tmpmask, NULL); + + rdtgrp->flags = RDT_DELETED; + free_rmid(rdtgrp->mon.rmid); + + /* + * Remove the rdtgrp from the parent ctrl_mon group's list + */ + WARN_ON(list_empty(&prdtgrp->mon.crdtgrp_list)); + list_del(&rdtgrp->mon.crdtgrp_list); + + /* + * one extra hold on this, will drop when we kfree(rdtgrp) + * in rdtgroup_kn_unlock() + */ + kernfs_get(kn); + kernfs_remove(rdtgrp->kn); + + return 0; +} + static int rdtgroup_rmdir_ctrl(struct kernfs_node *kn, struct rdtgroup *rdtgrp, cpumask_var_t tmpmask) { @@ -1577,9 +1643,12 @@ static int rdtgroup_rmdir_ctrl(struct kernfs_node *kn, struct rdtgroup *rdtgrp, cpumask_or(&rdtgroup_default.cpu_mask, &rdtgroup_default.cpu_mask, &rdtgrp->cpu_mask); - /* Update per cpu closid of the moved CPUs first */ - for_each_cpu(cpu, &rdtgrp->cpu_mask) + /* Update per cpu closid and rmid of the moved CPUs first */ + for_each_cpu(cpu, &rdtgrp->cpu_mask) { per_cpu(rdt_cpu_default.closid, cpu) = rdtgroup_default.closid; + per_cpu(rdt_cpu_default.rmid, cpu) = rdtgroup_default.mon.rmid; + } + /* * Update the MSR on moved CPUs and CPUs which have moved * task running on them. @@ -1589,6 +1658,13 @@ static int rdtgroup_rmdir_ctrl(struct kernfs_node *kn, struct rdtgroup *rdtgrp, rdtgrp->flags = RDT_DELETED; closid_free(rdtgrp->closid); + free_rmid(rdtgrp->mon.rmid); + + /* + * Free all the child monitor group rmids. + */ + free_all_child_rdtgrp(rdtgrp); + list_del(&rdtgrp->rdtgroup_list); /* @@ -1619,10 +1695,16 @@ static int rdtgroup_rmdir(struct kernfs_node *kn) /* * If the rdtgroup is a ctrl_mon group and parent directory - * is the root directory, remove the ctrl group. + * is the root directory, remove the ctrl_mon group. + * + * If the rdtgroup is a mon group and parent directory + * is a valid "mon_groups" directory, remove the mon group. 
*/ if (rdtgrp->type == RDTCTRL_GROUP && parent_kn == rdtgroup_default.kn) ret = rdtgroup_rmdir_ctrl(kn, rdtgrp, tmpmask); + else if (rdtgrp->type == RDTMON_GROUP && + is_mon_groups(parent_kn, kn->name)) + ret = rdtgroup_rmdir_mon(kn, rdtgrp, tmpmask); else ret = -EPERM; -- cgit v1.2.3 From 4af4a88e0c9246990f95c88eeba781265f27c58e Mon Sep 17 00:00:00 2001 From: Vikas Shivappa Date: Tue, 25 Jul 2017 14:14:41 -0700 Subject: x86/intel_rdt/cqm: Add mount,umount support Add monitoring support during mount and unmount. Since root directory is a "ctrl_mon" directory which can control and monitor resources create the "mon_groups" directory which can hold monitor groups and a "mon_data" directory which would hold all monitoring data like the rest of resource groups. The mount succeeds if either of monitoring or control/allocation is enabled. If only monitoring is enabled user can still create monitor groups under the "/sys/fs/resctrl/mon_groups/" and any mkdir under root would fail. If only control/allocation is enabled all of the monitoring related directories/files would not exist and resctrl would work in legacy mode. Signed-off-by: Vikas Shivappa Signed-off-by: Thomas Gleixner Cc: ravi.v.shankar@intel.com Cc: tony.luck@intel.com Cc: fenghua.yu@intel.com Cc: peterz@infradead.org Cc: eranian@google.com Cc: vikas.shivappa@intel.com Cc: ak@linux.intel.com Cc: davidcc@google.com Cc: reinette.chatre@intel.com Link: http://lkml.kernel.org/r/1501017287-28083-23-git-send-email-vikas.shivappa@linux.intel.com --- arch/x86/kernel/cpu/intel_rdt.h | 4 +++ arch/x86/kernel/cpu/intel_rdt_monitor.c | 1 + arch/x86/kernel/cpu/intel_rdt_rdtgroup.c | 61 +++++++++++++++++++++++++++++--- 3 files changed, 62 insertions(+), 4 deletions(-) (limited to 'arch') diff --git a/arch/x86/kernel/cpu/intel_rdt.h b/arch/x86/kernel/cpu/intel_rdt.h index 7fcaa5f82bd0..92a5d3072d93 100644 --- a/arch/x86/kernel/cpu/intel_rdt.h +++ b/arch/x86/kernel/cpu/intel_rdt.h @@ -22,6 +22,8 @@ #define RMID_VAL_ERROR BIT_ULL(63) #define RMID_VAL_UNAVAIL BIT_ULL(62) +DECLARE_STATIC_KEY_FALSE(rdt_enable_key); + /** * struct mon_evt - Entry in the event list of a resource * @evtid: event id @@ -59,6 +61,8 @@ extern bool rdt_alloc_capable; extern bool rdt_mon_capable; extern unsigned int rdt_mon_features; +DECLARE_STATIC_KEY_FALSE(rdt_mon_enable_key); + enum rdt_group_type { RDTCTRL_GROUP = 0, RDTMON_GROUP, diff --git a/arch/x86/kernel/cpu/intel_rdt_monitor.c b/arch/x86/kernel/cpu/intel_rdt_monitor.c index f8f06f5b7fc3..6ae5cf58e50a 100644 --- a/arch/x86/kernel/cpu/intel_rdt_monitor.c +++ b/arch/x86/kernel/cpu/intel_rdt_monitor.c @@ -343,6 +343,7 @@ void mon_event_count(void *info) } } } + static int dom_data_init(struct rdt_resource *r) { struct rmid_entry *entry = NULL; diff --git a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c index e61908169dca..0a7be9b7a201 100644 --- a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c +++ b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c @@ -35,6 +35,8 @@ #include #include "intel_rdt.h" +DEFINE_STATIC_KEY_FALSE(rdt_enable_key); +DEFINE_STATIC_KEY_FALSE(rdt_mon_enable_key); DEFINE_STATIC_KEY_FALSE(rdt_alloc_enable_key); static struct kernfs_root *rdt_root; struct rdtgroup rdtgroup_default; @@ -43,6 +45,12 @@ LIST_HEAD(rdt_all_groups); /* Kernel fs node for "info" directory under root */ static struct kernfs_node *kn_info; +/* Kernel fs node for "mon_groups" directory under root */ +static struct kernfs_node *kn_mongrp; + +/* Kernel fs node for "mon_data" directory under root */ +static struct 
kernfs_node *kn_mondata; + /* * Trivial allocator for CLOSIDs. Since h/w only supports a small number, * we can keep a bitmap of free CLOSIDs in a single integer. @@ -1044,6 +1052,10 @@ void rdtgroup_kn_unlock(struct kernfs_node *kn) } } +static int mkdir_mondata_all(struct kernfs_node *parent_kn, + struct rdtgroup *prgrp, + struct kernfs_node **mon_data_kn); + static struct dentry *rdt_mount(struct file_system_type *fs_type, int flags, const char *unused_dev_name, void *data) @@ -1055,7 +1067,7 @@ static struct dentry *rdt_mount(struct file_system_type *fs_type, /* * resctrl file system can only be mounted once. */ - if (static_branch_unlikely(&rdt_alloc_enable_key)) { + if (static_branch_unlikely(&rdt_enable_key)) { dentry = ERR_PTR(-EBUSY); goto out; } @@ -1074,15 +1086,47 @@ static struct dentry *rdt_mount(struct file_system_type *fs_type, goto out_cdp; } + if (rdt_mon_capable) { + ret = mongroup_create_dir(rdtgroup_default.kn, + NULL, "mon_groups", + &kn_mongrp); + if (ret) { + dentry = ERR_PTR(ret); + goto out_info; + } + kernfs_get(kn_mongrp); + + ret = mkdir_mondata_all(rdtgroup_default.kn, + &rdtgroup_default, &kn_mondata); + if (ret) { + dentry = ERR_PTR(ret); + goto out_mongrp; + } + kernfs_get(kn_mondata); + rdtgroup_default.mon.mon_data_kn = kn_mondata; + } + dentry = kernfs_mount(fs_type, flags, rdt_root, RDTGROUP_SUPER_MAGIC, NULL); if (IS_ERR(dentry)) - goto out_destroy; + goto out_mondata; - static_branch_enable(&rdt_alloc_enable_key); + if (rdt_alloc_capable) + static_branch_enable(&rdt_alloc_enable_key); + if (rdt_mon_capable) + static_branch_enable(&rdt_mon_enable_key); + + if (rdt_alloc_capable || rdt_mon_capable) + static_branch_enable(&rdt_enable_key); goto out; -out_destroy: +out_mondata: + if (rdt_mon_capable) + kernfs_remove(kn_mondata); +out_mongrp: + if (rdt_mon_capable) + kernfs_remove(kn_mongrp); +out_info: kernfs_remove(kn_info); out_cdp: cdp_disable(); @@ -1204,6 +1248,9 @@ static void rmdir_all_sub(void) rdt_move_group_tasks(NULL, &rdtgroup_default, NULL); list_for_each_entry_safe(rdtgrp, tmp, &rdt_all_groups, rdtgroup_list) { + /* Free any child rmids */ + free_all_child_rdtgrp(rdtgrp); + /* Remove each rdtgroup other than root */ if (rdtgrp == &rdtgroup_default) continue; @@ -1216,6 +1263,8 @@ static void rmdir_all_sub(void) cpumask_or(&rdtgroup_default.cpu_mask, &rdtgroup_default.cpu_mask, &rdtgrp->cpu_mask); + free_rmid(rdtgrp->mon.rmid); + kernfs_remove(rdtgrp->kn); list_del(&rdtgrp->rdtgroup_list); kfree(rdtgrp); @@ -1226,6 +1275,8 @@ static void rmdir_all_sub(void) put_online_cpus(); kernfs_remove(kn_info); + kernfs_remove(kn_mongrp); + kernfs_remove(kn_mondata); } static void rdt_kill_sb(struct super_block *sb) @@ -1240,6 +1291,8 @@ static void rdt_kill_sb(struct super_block *sb) cdp_disable(); rmdir_all_sub(); static_branch_disable(&rdt_alloc_enable_key); + static_branch_disable(&rdt_mon_enable_key); + static_branch_disable(&rdt_enable_key); kernfs_kill_sb(sb); mutex_unlock(&rdtgroup_mutex); } -- cgit v1.2.3 From 4be6c078428b08d1a948cc09faca8f1326231866 Mon Sep 17 00:00:00 2001 From: Vikas Shivappa Date: Tue, 25 Jul 2017 14:14:42 -0700 Subject: x86/intel_rdt: Introduce rdt_enable_key for scheduling Introduce the usage of rdt_enable_key in sched_in code as a preparation to add RDT monitoring support for sched_in. 
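The point of the new key is to keep the context-switch hot path a patched-out branch on systems that never mount resctrl. A condensed sketch of the wrapper shape (essentially what the hunk below adds; the allocation-specific body of __intel_rdt_sched_in() is elided here):

DECLARE_STATIC_KEY_FALSE(rdt_enable_key);

/*
 * Sketch: the scheduler calls the outer helper on every context switch.
 * The static key compiles to a no-op branch until resctrl is mounted;
 * mounting flips rdt_enable_key, and only then does the real PQR
 * update in __intel_rdt_sched_in() run.
 */
static inline void intel_rdt_sched_in(void)
{
	if (static_branch_likely(&rdt_enable_key))
		__intel_rdt_sched_in();	/* writes IA32_PQR_ASSOC when needed */
}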
Signed-off-by: Vikas Shivappa Signed-off-by: Thomas Gleixner Cc: ravi.v.shankar@intel.com Cc: tony.luck@intel.com Cc: fenghua.yu@intel.com Cc: peterz@infradead.org Cc: eranian@google.com Cc: vikas.shivappa@intel.com Cc: ak@linux.intel.com Cc: davidcc@google.com Cc: reinette.chatre@intel.com Link: http://lkml.kernel.org/r/1501017287-28083-24-git-send-email-vikas.shivappa@linux.intel.com --- arch/x86/include/asm/intel_rdt_sched.h | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/x86/include/asm/intel_rdt_sched.h b/arch/x86/include/asm/intel_rdt_sched.h index 2c704d205e2f..8c5be0117457 100644 --- a/arch/x86/include/asm/intel_rdt_sched.h +++ b/arch/x86/include/asm/intel_rdt_sched.h @@ -27,10 +27,12 @@ struct intel_pqr_state { DECLARE_PER_CPU(struct intel_pqr_state, pqr_state); DECLARE_PER_CPU_READ_MOSTLY(struct intel_pqr_state, rdt_cpu_default); + +DECLARE_STATIC_KEY_FALSE(rdt_enable_key); DECLARE_STATIC_KEY_FALSE(rdt_alloc_enable_key); /* - * intel_rdt_sched_in() - Writes the task's CLOSid to IA32_PQR_MSR + * __intel_rdt_sched_in() - Writes the task's CLOSid to IA32_PQR_MSR * * Following considerations are made so that this has minimal impact * on scheduler hot path: @@ -42,7 +44,7 @@ DECLARE_STATIC_KEY_FALSE(rdt_alloc_enable_key); * * Must be called with preemption disabled. */ -static inline void intel_rdt_sched_in(void) +static inline void __intel_rdt_sched_in(void) { if (static_branch_likely(&rdt_alloc_enable_key)) { struct intel_pqr_state *state = this_cpu_ptr(&pqr_state); @@ -63,6 +65,12 @@ static inline void intel_rdt_sched_in(void) } } +static inline void intel_rdt_sched_in(void) +{ + if (static_branch_likely(&rdt_enable_key)) + __intel_rdt_sched_in(); +} + #else static inline void intel_rdt_sched_in(void) {} -- cgit v1.2.3 From 748b6b881ccdda8f0663c68605f431279e06f49a Mon Sep 17 00:00:00 2001 From: Vikas Shivappa Date: Tue, 25 Jul 2017 14:14:43 -0700 Subject: x86/intel_rdt/cqm: Add sched_in support OS associates an RMID/CLOSid to a task by writing the per CPU IA32_PQR_ASSOC MSR when a task is scheduled in. The sched_in code will stay as no-op unless we are running on Intel SKU which supports either resource control or monitoring and we also enable them by mounting the resctrl fs. The per cpu CLOSid/RMID values are cached and the write is performed only when a task with a different CLOSid/RMID is scheduled in. Signed-off-by: Vikas Shivappa Signed-off-by: Thomas Gleixner Cc: ravi.v.shankar@intel.com Cc: tony.luck@intel.com Cc: fenghua.yu@intel.com Cc: peterz@infradead.org Cc: eranian@google.com Cc: vikas.shivappa@intel.com Cc: ak@linux.intel.com Cc: davidcc@google.com Cc: reinette.chatre@intel.com Link: http://lkml.kernel.org/r/1501017287-28083-25-git-send-email-vikas.shivappa@linux.intel.com --- arch/x86/include/asm/intel_rdt_sched.h | 50 ++++++++++++++++++++-------------- arch/x86/kernel/cpu/intel_rdt.h | 4 --- 2 files changed, 29 insertions(+), 25 deletions(-) (limited to 'arch') diff --git a/arch/x86/include/asm/intel_rdt_sched.h b/arch/x86/include/asm/intel_rdt_sched.h index 8c5be0117457..3badc0a87ef5 100644 --- a/arch/x86/include/asm/intel_rdt_sched.h +++ b/arch/x86/include/asm/intel_rdt_sched.h @@ -15,7 +15,8 @@ * * The upper 32 bits of IA32_PQR_ASSOC contain closid and the * lower 10 bits rmid. The update to IA32_PQR_ASSOC always - * contains both parts, so we need to cache them. + * contains both parts, so we need to cache them. This also + * stores the user configured per cpu CLOSID and RMID. 
* * The cache also helps to avoid pointless updates if the value does * not change. @@ -30,38 +31,45 @@ DECLARE_PER_CPU_READ_MOSTLY(struct intel_pqr_state, rdt_cpu_default); DECLARE_STATIC_KEY_FALSE(rdt_enable_key); DECLARE_STATIC_KEY_FALSE(rdt_alloc_enable_key); +DECLARE_STATIC_KEY_FALSE(rdt_mon_enable_key); /* - * __intel_rdt_sched_in() - Writes the task's CLOSid to IA32_PQR_MSR + * __intel_rdt_sched_in() - Writes the task's CLOSid/RMID to IA32_PQR_MSR * * Following considerations are made so that this has minimal impact * on scheduler hot path: * - This will stay as no-op unless we are running on an Intel SKU - * which supports resource control and we enable by mounting the - * resctrl file system. - * - Caches the per cpu CLOSid values and does the MSR write only - * when a task with a different CLOSid is scheduled in. - * + * which supports resource control or monitoring and we enable by + * mounting the resctrl file system. + * - Caches the per cpu CLOSid/RMID values and does the MSR write only + * when a task with a different CLOSid/RMID is scheduled in. + * - We allocate RMIDs/CLOSids globally in order to keep this as + * simple as possible. * Must be called with preemption disabled. */ -static inline void __intel_rdt_sched_in(void) +static void __intel_rdt_sched_in(void) { + struct intel_pqr_state newstate = this_cpu_read(rdt_cpu_default); + struct intel_pqr_state *curstate = this_cpu_ptr(&pqr_state); + + /* + * If this task has a closid/rmid assigned, use it. + * Else use the closid/rmid assigned to this cpu. + */ if (static_branch_likely(&rdt_alloc_enable_key)) { - struct intel_pqr_state *state = this_cpu_ptr(&pqr_state); - u32 closid; + if (current->closid) + newstate.closid = current->closid; + } - /* - * If this task has a closid assigned, use it. - * Else use the closid assigned to this cpu. - */ - closid = current->closid; - if (closid == 0) - closid = this_cpu_read(rdt_cpu_default.closid); + if (static_branch_likely(&rdt_mon_enable_key)) { + if (current->rmid) + newstate.rmid = current->rmid; + } - if (closid != state->closid) { - state->closid = closid; - wrmsr(IA32_PQR_ASSOC, state->rmid, closid); - } + if (newstate.closid != curstate->closid || + newstate.rmid != curstate->rmid) { + *curstate = newstate; + wrmsr(IA32_PQR_ASSOC, newstate.rmid, newstate.closid); } } diff --git a/arch/x86/kernel/cpu/intel_rdt.h b/arch/x86/kernel/cpu/intel_rdt.h index 92a5d3072d93..7fcaa5f82bd0 100644 --- a/arch/x86/kernel/cpu/intel_rdt.h +++ b/arch/x86/kernel/cpu/intel_rdt.h @@ -22,8 +22,6 @@ #define RMID_VAL_ERROR BIT_ULL(63) #define RMID_VAL_UNAVAIL BIT_ULL(62) -DECLARE_STATIC_KEY_FALSE(rdt_enable_key); - /** * struct mon_evt - Entry in the event list of a resource * @evtid: event id @@ -61,8 +59,6 @@ extern bool rdt_alloc_capable; extern bool rdt_mon_capable; extern unsigned int rdt_mon_features; -DECLARE_STATIC_KEY_FALSE(rdt_mon_enable_key); - enum rdt_group_type { RDTCTRL_GROUP = 0, RDTMON_GROUP, -- cgit v1.2.3 From 895c663ecef16c8138e20a7d5c052e0fcc400241 Mon Sep 17 00:00:00 2001 From: Vikas Shivappa Date: Tue, 25 Jul 2017 14:14:44 -0700 Subject: x86/intel_rdt/cqm: Add CPU hotplug support Resource groups have a per domain directory under "mon_data". Add or remove these directories as and when domains come online and go offline. Also update the per cpu rmids and cache upon onlining and offlining cpus. 
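Domains track cache instances, so a domain and its per-domain monitoring directories exist only while at least one CPU of that instance is online. A simplified rendering of the offline side is below; it is condensed from the patch, uses get_domain_from_cpu() purely for brevity, and omits error paths and the MBM state added by later patches:

/*
 * Simplified sketch (not the exact function): when the last CPU of a
 * domain goes offline, drop the per-domain monitoring directories and
 * free the domain's state.
 */
static void domain_remove_cpu_sketch(int cpu, struct rdt_resource *r)
{
	struct rdt_domain *d = get_domain_from_cpu(cpu, r);

	if (!d)
		return;

	cpumask_clear_cpu(cpu, &d->cpu_mask);
	if (!cpumask_empty(&d->cpu_mask))
		return;

	if (static_branch_unlikely(&rdt_mon_enable_key))
		rmdir_mondata_subdir_allrdtgrp(r, d->id);
	kfree(d->ctrl_val);
	kfree(d->rmid_busy_llc);
	list_del(&d->list);
	kfree(d);
}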
Signed-off-by: Vikas Shivappa Signed-off-by: Thomas Gleixner Cc: ravi.v.shankar@intel.com Cc: tony.luck@intel.com Cc: fenghua.yu@intel.com Cc: peterz@infradead.org Cc: eranian@google.com Cc: vikas.shivappa@intel.com Cc: ak@linux.intel.com Cc: davidcc@google.com Cc: reinette.chatre@intel.com Link: http://lkml.kernel.org/r/1501017287-28083-26-git-send-email-vikas.shivappa@linux.intel.com --- arch/x86/kernel/cpu/intel_rdt.c | 41 +++++++++++++++++++++++----- arch/x86/kernel/cpu/intel_rdt.h | 9 ++++++ arch/x86/kernel/cpu/intel_rdt_rdtgroup.c | 47 ++++++++++++++++++++++++++++++++ 3 files changed, 90 insertions(+), 7 deletions(-) (limited to 'arch') diff --git a/arch/x86/kernel/cpu/intel_rdt.c b/arch/x86/kernel/cpu/intel_rdt.c index 050ea15259ea..f12bb91b8c66 100644 --- a/arch/x86/kernel/cpu/intel_rdt.c +++ b/arch/x86/kernel/cpu/intel_rdt.c @@ -479,6 +479,13 @@ static void domain_add_cpu(int cpu, struct rdt_resource *r) cpumask_set_cpu(cpu, &d->cpu_mask); list_add_tail(&d->list, add_pos); + + /* + * If resctrl is mounted, add + * per domain monitor data directories. + */ + if (static_branch_unlikely(&rdt_mon_enable_key)) + mkdir_mondata_subdir_allrdtgrp(r, d); } static void domain_remove_cpu(int cpu, struct rdt_resource *r) @@ -494,6 +501,12 @@ static void domain_remove_cpu(int cpu, struct rdt_resource *r) cpumask_clear_cpu(cpu, &d->cpu_mask); if (cpumask_empty(&d->cpu_mask)) { + /* + * If resctrl is mounted, remove all the + * per domain monitor data directories. + */ + if (static_branch_unlikely(&rdt_mon_enable_key)) + rmdir_mondata_subdir_allrdtgrp(r, d->id); kfree(d->ctrl_val); kfree(d->rmid_busy_llc); list_del(&d->list); @@ -501,13 +514,14 @@ static void domain_remove_cpu(int cpu, struct rdt_resource *r) } } -static void clear_closid(int cpu) +static void clear_closid_rmid(int cpu) { struct intel_pqr_state *state = this_cpu_ptr(&pqr_state); per_cpu(rdt_cpu_default.closid, cpu) = 0; state->closid = 0; - wrmsr(IA32_PQR_ASSOC, state->rmid, 0); + state->rmid = 0; + wrmsr(IA32_PQR_ASSOC, 0, 0); } static int intel_rdt_online_cpu(unsigned int cpu) @@ -515,29 +529,42 @@ static int intel_rdt_online_cpu(unsigned int cpu) struct rdt_resource *r; mutex_lock(&rdtgroup_mutex); - for_each_alloc_capable_rdt_resource(r) + for_each_capable_rdt_resource(r) domain_add_cpu(cpu, r); /* The cpu is set in default rdtgroup after online. 
*/ cpumask_set_cpu(cpu, &rdtgroup_default.cpu_mask); - clear_closid(cpu); + clear_closid_rmid(cpu); mutex_unlock(&rdtgroup_mutex); return 0; } +static void clear_childcpus(struct rdtgroup *r, unsigned int cpu) +{ + struct rdtgroup *cr; + + list_for_each_entry(cr, &r->mon.crdtgrp_list, mon.crdtgrp_list) { + if (cpumask_test_and_clear_cpu(cpu, &cr->cpu_mask)) { + break; + } + } +} + static int intel_rdt_offline_cpu(unsigned int cpu) { struct rdtgroup *rdtgrp; struct rdt_resource *r; mutex_lock(&rdtgroup_mutex); - for_each_alloc_capable_rdt_resource(r) + for_each_capable_rdt_resource(r) domain_remove_cpu(cpu, r); list_for_each_entry(rdtgrp, &rdt_all_groups, rdtgroup_list) { - if (cpumask_test_and_clear_cpu(cpu, &rdtgrp->cpu_mask)) + if (cpumask_test_and_clear_cpu(cpu, &rdtgrp->cpu_mask)) { + clear_childcpus(rdtgrp, cpu); break; + } } - clear_closid(cpu); + clear_closid_rmid(cpu); mutex_unlock(&rdtgroup_mutex); return 0; diff --git a/arch/x86/kernel/cpu/intel_rdt.h b/arch/x86/kernel/cpu/intel_rdt.h index 7fcaa5f82bd0..2660d15d1712 100644 --- a/arch/x86/kernel/cpu/intel_rdt.h +++ b/arch/x86/kernel/cpu/intel_rdt.h @@ -301,6 +301,11 @@ enum { RDT_NUM_RESOURCES, }; +#define for_each_capable_rdt_resource(r) \ + for (r = rdt_resources_all; r < rdt_resources_all + RDT_NUM_RESOURCES;\ + r++) \ + if (r->alloc_capable || r->mon_capable) + #define for_each_alloc_capable_rdt_resource(r) \ for (r = rdt_resources_all; r < rdt_resources_all + RDT_NUM_RESOURCES;\ r++) \ @@ -360,5 +365,9 @@ void free_rmid(u32 rmid); int rdt_get_mon_l3_config(struct rdt_resource *r); void mon_event_count(void *info); int rdtgroup_mondata_show(struct seq_file *m, void *arg); +void rmdir_mondata_subdir_allrdtgrp(struct rdt_resource *r, + unsigned int dom_id); +void mkdir_mondata_subdir_allrdtgrp(struct rdt_resource *r, + struct rdt_domain *d); #endif /* _ASM_X86_INTEL_RDT_H */ diff --git a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c index 0a7be9b7a201..ea37b972df4f 100644 --- a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c +++ b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c @@ -1323,6 +1323,27 @@ static int mon_addfile(struct kernfs_node *parent_kn, const char *name, return ret; } +/* + * Remove all subdirectories of mon_data of ctrl_mon groups + * and monitor groups with given domain id. + */ +void rmdir_mondata_subdir_allrdtgrp(struct rdt_resource *r, unsigned int dom_id) +{ + struct rdtgroup *prgrp, *crgrp; + char name[32]; + + if (!r->mon_enabled) + return; + + list_for_each_entry(prgrp, &rdt_all_groups, rdtgroup_list) { + sprintf(name, "mon_%s_%02d", r->name, dom_id); + kernfs_remove_by_name(prgrp->mon.mon_data_kn, name); + + list_for_each_entry(crgrp, &prgrp->mon.crdtgrp_list, mon.crdtgrp_list) + kernfs_remove_by_name(crgrp->mon.mon_data_kn, name); + } +} + static int mkdir_mondata_subdir(struct kernfs_node *parent_kn, struct rdt_domain *d, struct rdt_resource *r, struct rdtgroup *prgrp) @@ -1369,6 +1390,32 @@ out_destroy: return ret; } +/* + * Add all subdirectories of mon_data for "ctrl_mon" groups + * and "monitor" groups with given domain id. 
+ */ +void mkdir_mondata_subdir_allrdtgrp(struct rdt_resource *r, + struct rdt_domain *d) +{ + struct kernfs_node *parent_kn; + struct rdtgroup *prgrp, *crgrp; + struct list_head *head; + + if (!r->mon_enabled) + return; + + list_for_each_entry(prgrp, &rdt_all_groups, rdtgroup_list) { + parent_kn = prgrp->mon.mon_data_kn; + mkdir_mondata_subdir(parent_kn, d, r, prgrp); + + head = &prgrp->mon.crdtgrp_list; + list_for_each_entry(crgrp, head, mon.crdtgrp_list) { + parent_kn = crgrp->mon.mon_data_kn; + mkdir_mondata_subdir(parent_kn, d, r, crgrp); + } + } +} + static int mkdir_mondata_subdir_alldom(struct kernfs_node *parent_kn, struct rdt_resource *r, struct rdtgroup *prgrp) -- cgit v1.2.3 From 9f52425ba303d91c8370719e91d7e578bfdf309f Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Tue, 25 Jul 2017 14:14:45 -0700 Subject: x86/intel_rdt/mbm: Basic counting of MBM events (total and local) Check CPUID bits for whether each of the MBM events is supported. Allocate space for each RMID for each counter in each domain to save previous MSR counter value and running total of data. Create files in each of the monitor directories. Signed-off-by: Tony Luck Signed-off-by: Vikas Shivappa Signed-off-by: Thomas Gleixner Cc: ravi.v.shankar@intel.com Cc: fenghua.yu@intel.com Cc: peterz@infradead.org Cc: eranian@google.com Cc: vikas.shivappa@intel.com Cc: ak@linux.intel.com Cc: davidcc@google.com Cc: reinette.chatre@intel.com Link: http://lkml.kernel.org/r/1501017287-28083-27-git-send-email-vikas.shivappa@linux.intel.com --- arch/x86/kernel/cpu/intel_rdt.c | 23 +++++++++++++++++++- arch/x86/kernel/cpu/intel_rdt.h | 33 +++++++++++++++++++++++++++++ arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c | 1 + arch/x86/kernel/cpu/intel_rdt_monitor.c | 31 ++++++++++++++++++++++++++- 4 files changed, 86 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/x86/kernel/cpu/intel_rdt.c b/arch/x86/kernel/cpu/intel_rdt.c index f12bb91b8c66..4c17a7060dfd 100644 --- a/arch/x86/kernel/cpu/intel_rdt.c +++ b/arch/x86/kernel/cpu/intel_rdt.c @@ -420,6 +420,8 @@ static int domain_setup_ctrlval(struct rdt_resource *r, struct rdt_domain *d) static int domain_setup_mon_state(struct rdt_resource *r, struct rdt_domain *d) { + size_t tsize; + if (is_llc_occupancy_enabled()) { d->rmid_busy_llc = kcalloc(BITS_TO_LONGS(r->num_rmid), sizeof(unsigned long), @@ -427,6 +429,23 @@ static int domain_setup_mon_state(struct rdt_resource *r, struct rdt_domain *d) if (!d->rmid_busy_llc) return -ENOMEM; } + if (is_mbm_total_enabled()) { + tsize = sizeof(*d->mbm_total); + d->mbm_total = kcalloc(r->num_rmid, tsize, GFP_KERNEL); + if (!d->mbm_total) { + kfree(d->rmid_busy_llc); + return -ENOMEM; + } + } + if (is_mbm_local_enabled()) { + tsize = sizeof(*d->mbm_local); + d->mbm_local = kcalloc(r->num_rmid, tsize, GFP_KERNEL); + if (!d->mbm_local) { + kfree(d->rmid_busy_llc); + kfree(d->mbm_total); + return -ENOMEM; + } + } return 0; } @@ -466,6 +485,7 @@ static void domain_add_cpu(int cpu, struct rdt_resource *r) return; d->id = id; + cpumask_set_cpu(cpu, &d->cpu_mask); if (r->alloc_capable && domain_setup_ctrlval(r, d)) { kfree(d); @@ -477,7 +497,6 @@ static void domain_add_cpu(int cpu, struct rdt_resource *r) return; } - cpumask_set_cpu(cpu, &d->cpu_mask); list_add_tail(&d->list, add_pos); /* @@ -509,6 +528,8 @@ static void domain_remove_cpu(int cpu, struct rdt_resource *r) rmdir_mondata_subdir_allrdtgrp(r, d->id); kfree(d->ctrl_val); kfree(d->rmid_busy_llc); + kfree(d->mbm_total); + kfree(d->mbm_local); list_del(&d->list); kfree(d); } diff --git 
a/arch/x86/kernel/cpu/intel_rdt.h b/arch/x86/kernel/cpu/intel_rdt.h index 2660d15d1712..2137d5e55803 100644 --- a/arch/x86/kernel/cpu/intel_rdt.h +++ b/arch/x86/kernel/cpu/intel_rdt.h @@ -19,6 +19,9 @@ #define QOS_L3_OCCUP_EVENT_ID 0x01 #define QOS_L3_MBM_TOTAL_EVENT_ID 0x02 #define QOS_L3_MBM_LOCAL_EVENT_ID 0x03 + +#define MBM_CNTR_WIDTH 24 + #define RMID_VAL_ERROR BIT_ULL(63) #define RMID_VAL_UNAVAIL BIT_ULL(62) @@ -50,6 +53,7 @@ union mon_data_bits { struct rmid_read { struct rdtgroup *rgrp; + struct rdt_domain *d; int evtid; u64 val; }; @@ -159,6 +163,16 @@ struct rftype { char *buf, size_t nbytes, loff_t off); }; +/** + * struct mbm_state - status for each MBM counter in each domain + * @chunks: Total data moved (multiply by rdt_group.mon_scale to get bytes) + * @prev_msr Value of IA32_QM_CTR for this RMID last time we read it + */ +struct mbm_state { + u64 chunks; + u64 prev_msr; +}; + /** * struct rdt_domain - group of cpus sharing an RDT resource * @list: all instances of this resource @@ -166,6 +180,8 @@ struct rftype { * @cpu_mask: which cpus share this resource * @rmid_busy_llc: * bitmap of which limbo RMIDs are above threshold + * @mbm_total: saved state for MBM total bandwidth + * @mbm_local: saved state for MBM local bandwidth * @ctrl_val: array of cache or mem ctrl values (indexed by CLOSID) * @new_ctrl: new ctrl value to be loaded * @have_new_ctrl: did user provide new_ctrl for this domain @@ -175,6 +191,8 @@ struct rdt_domain { int id; struct cpumask cpu_mask; unsigned long *rmid_busy_llc; + struct mbm_state *mbm_total; + struct mbm_state *mbm_local; u32 *ctrl_val; u32 new_ctrl; bool have_new_ctrl; @@ -230,6 +248,21 @@ static inline bool is_llc_occupancy_enabled(void) return (rdt_mon_features & (1 << QOS_L3_OCCUP_EVENT_ID)); } +static inline bool is_mbm_total_enabled(void) +{ + return (rdt_mon_features & (1 << QOS_L3_MBM_TOTAL_EVENT_ID)); +} + +static inline bool is_mbm_local_enabled(void) +{ + return (rdt_mon_features & (1 << QOS_L3_MBM_LOCAL_EVENT_ID)); +} + +static inline bool is_mbm_enabled(void) +{ + return (is_mbm_total_enabled() || is_mbm_local_enabled()); +} + /** * struct rdt_resource - attributes of an RDT resource * @rid: The index of the resource diff --git a/arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c b/arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c index f44e77dc568b..cf8e2c776440 100644 --- a/arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c +++ b/arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c @@ -294,6 +294,7 @@ void mon_event_read(struct rmid_read *rr, struct rdt_domain *d, */ rr->rgrp = rdtgrp; rr->evtid = evtid; + rr->d = d; rr->val = 0; smp_call_function_any(&d->cpu_mask, mon_event_count, rr, 1); diff --git a/arch/x86/kernel/cpu/intel_rdt_monitor.c b/arch/x86/kernel/cpu/intel_rdt_monitor.c index 6ae5cf58e50a..ef0358b59102 100644 --- a/arch/x86/kernel/cpu/intel_rdt_monitor.c +++ b/arch/x86/kernel/cpu/intel_rdt_monitor.c @@ -296,7 +296,8 @@ void free_rmid(u32 rmid) static int __mon_event_count(u32 rmid, struct rmid_read *rr) { - u64 tval; + u64 chunks, shift, tval; + struct mbm_state *m; tval = __rmid_read(rmid, rr->evtid); if (tval & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL)) { @@ -307,6 +308,12 @@ static int __mon_event_count(u32 rmid, struct rmid_read *rr) case QOS_L3_OCCUP_EVENT_ID: rr->val += tval; return 0; + case QOS_L3_MBM_TOTAL_EVENT_ID: + m = &rr->d->mbm_total[rmid]; + break; + case QOS_L3_MBM_LOCAL_EVENT_ID: + m = &rr->d->mbm_local[rmid]; + break; default: /* * Code would never reach here because @@ -314,6 +321,14 @@ static int __mon_event_count(u32 rmid, struct 
rmid_read *rr) */ return -EINVAL; } + shift = 64 - MBM_CNTR_WIDTH; + chunks = (tval << shift) - (m->prev_msr << shift); + chunks >>= shift; + m->chunks += chunks; + m->prev_msr = tval; + + rr->val += m->chunks; + return 0; } /* @@ -377,6 +392,16 @@ static struct mon_evt llc_occupancy_event = { .evtid = QOS_L3_OCCUP_EVENT_ID, }; +static struct mon_evt mbm_total_event = { + .name = "mbm_total_bytes", + .evtid = QOS_L3_MBM_TOTAL_EVENT_ID, +}; + +static struct mon_evt mbm_local_event = { + .name = "mbm_local_bytes", + .evtid = QOS_L3_MBM_LOCAL_EVENT_ID, +}; + /* * Initialize the event list for the resource. * @@ -390,6 +415,10 @@ static void l3_mon_evt_init(struct rdt_resource *r) if (is_llc_occupancy_enabled()) list_add_tail(&llc_occupancy_event.list, &r->evt_list); + if (is_mbm_total_enabled()) + list_add_tail(&mbm_total_event.list, &r->evt_list); + if (is_mbm_local_enabled()) + list_add_tail(&mbm_local_event.list, &r->evt_list); } int rdt_get_mon_l3_config(struct rdt_resource *r) -- cgit v1.2.3 From a4de1dfdd726537e2a78b55759fc646d9b0a0be8 Mon Sep 17 00:00:00 2001 From: Vikas Shivappa Date: Tue, 25 Jul 2017 14:14:46 -0700 Subject: x86/intel_rdt/mbm: Add mbm counter initialization MBM counters are monotonically increasing counts representing the total memory bytes at a particular time. In order to calculate total_bytes for an rdtgroup, we store the value of the counter when we create an rdtgroup or when a new domain comes online. When the total_bytes(all memory controller bytes) or local_bytes(local memory controller bytes) file in "mon_data" is read it shows the total bytes for that rdtgroup since its creation. User can snapshot this at different time intervals to obtain bytes/second. Signed-off-by: Vikas Shivappa Signed-off-by: Thomas Gleixner Cc: ravi.v.shankar@intel.com Cc: tony.luck@intel.com Cc: fenghua.yu@intel.com Cc: peterz@infradead.org Cc: eranian@google.com Cc: vikas.shivappa@intel.com Cc: ak@linux.intel.com Cc: davidcc@google.com Cc: reinette.chatre@intel.com Link: http://lkml.kernel.org/r/1501017287-28083-28-git-send-email-vikas.shivappa@linux.intel.com --- arch/x86/kernel/cpu/intel_rdt.h | 9 +++++++++ arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c | 5 +++-- arch/x86/kernel/cpu/intel_rdt_monitor.c | 7 +++++++ arch/x86/kernel/cpu/intel_rdt_rdtgroup.c | 4 ++++ 4 files changed, 23 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/x86/kernel/cpu/intel_rdt.h b/arch/x86/kernel/cpu/intel_rdt.h index 2137d5e55803..f160403324d3 100644 --- a/arch/x86/kernel/cpu/intel_rdt.h +++ b/arch/x86/kernel/cpu/intel_rdt.h @@ -55,6 +55,7 @@ struct rmid_read { struct rdtgroup *rgrp; struct rdt_domain *d; int evtid; + bool first; u64 val; }; @@ -263,6 +264,12 @@ static inline bool is_mbm_enabled(void) return (is_mbm_total_enabled() || is_mbm_local_enabled()); } +static inline bool is_mbm_event(int e) +{ + return (e >= QOS_L3_MBM_TOTAL_EVENT_ID && + e <= QOS_L3_MBM_LOCAL_EVENT_ID); +} + /** * struct rdt_resource - attributes of an RDT resource * @rid: The index of the resource @@ -402,5 +409,7 @@ void rmdir_mondata_subdir_allrdtgrp(struct rdt_resource *r, unsigned int dom_id); void mkdir_mondata_subdir_allrdtgrp(struct rdt_resource *r, struct rdt_domain *d); +void mon_event_read(struct rmid_read *rr, struct rdt_domain *d, + struct rdtgroup *rdtgrp, int evtid, int first); #endif /* _ASM_X86_INTEL_RDT_H */ diff --git a/arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c b/arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c index cf8e2c776440..f6ea94f8954a 100644 --- 
a/arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c +++ b/arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c @@ -287,7 +287,7 @@ int rdtgroup_schemata_show(struct kernfs_open_file *of, } void mon_event_read(struct rmid_read *rr, struct rdt_domain *d, - struct rdtgroup *rdtgrp, int evtid) + struct rdtgroup *rdtgrp, int evtid, int first) { /* * setup the parameters to send to the IPI to read the data. @@ -296,6 +296,7 @@ void mon_event_read(struct rmid_read *rr, struct rdt_domain *d, rr->evtid = evtid; rr->d = d; rr->val = 0; + rr->first = first; smp_call_function_any(&d->cpu_mask, mon_event_count, rr, 1); } @@ -325,7 +326,7 @@ int rdtgroup_mondata_show(struct seq_file *m, void *arg) goto out; } - mon_event_read(&rr, d, rdtgrp, evtid); + mon_event_read(&rr, d, rdtgrp, evtid, false); if (rr.val & RMID_VAL_ERROR) seq_puts(m, "Error\n"); diff --git a/arch/x86/kernel/cpu/intel_rdt_monitor.c b/arch/x86/kernel/cpu/intel_rdt_monitor.c index ef0358b59102..383a023fca35 100644 --- a/arch/x86/kernel/cpu/intel_rdt_monitor.c +++ b/arch/x86/kernel/cpu/intel_rdt_monitor.c @@ -321,6 +321,13 @@ static int __mon_event_count(u32 rmid, struct rmid_read *rr) */ return -EINVAL; } + + if (rr->first) { + m->prev_msr = tval; + m->chunks = 0; + return 0; + } + shift = 64 - MBM_CNTR_WIDTH; chunks = (tval << shift) - (m->prev_msr << shift); chunks >>= shift; diff --git a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c index ea37b972df4f..05088e301ef8 100644 --- a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c +++ b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c @@ -1351,6 +1351,7 @@ static int mkdir_mondata_subdir(struct kernfs_node *parent_kn, union mon_data_bits priv; struct kernfs_node *kn; struct mon_evt *mevt; + struct rmid_read rr; char name[32]; int ret; @@ -1381,6 +1382,9 @@ static int mkdir_mondata_subdir(struct kernfs_node *parent_kn, ret = mon_addfile(kn, mevt->name, priv.priv); if (ret) goto out_destroy; + + if (is_mbm_event(mevt->evtid)) + mon_event_read(&rr, d, prgrp, mevt->evtid, true); } kernfs_activate(kn); return 0; -- cgit v1.2.3 From e33026831bdb5f051499fec6a606f79fe1f94cc8 Mon Sep 17 00:00:00 2001 From: Vikas Shivappa Date: Tue, 25 Jul 2017 14:14:47 -0700 Subject: x86/intel_rdt/mbm: Handle counter overflow Set up a delayed work queue for each domain that will read all the MBM counters of active RMIDs once per second to make sure that they don't wrap around between reads from users. 
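As background for why a once-per-second read is sufficient, here is a minimal userspace sketch of the wrap-safe delta computation used by __mon_event_count in the hunks above, assuming a free-running 24-bit hardware counter. The function name mbm_delta() and the sample values are illustrative only and are not part of the kernel code.

#include <stdint.h>
#include <stdio.h>

#define MBM_CNTR_WIDTH	24

/*
 * Illustrative only: difference of two 24-bit counter snapshots,
 * computed modulo 2^24 so that a single wrap between two reads is
 * still accounted correctly.
 */
static uint64_t mbm_delta(uint64_t cur, uint64_t prev)
{
	unsigned int shift = 64 - MBM_CNTR_WIDTH;

	return ((cur << shift) - (prev << shift)) >> shift;
}

int main(void)
{
	/* prev near the top of the 24-bit range, cur taken after one wrap */
	uint64_t prev = 0xfffff0, cur = 0x000010;

	/* prints 32: the counter advanced by 0x20 chunks across the wrap */
	printf("%llu\n", (unsigned long long)mbm_delta(cur, prev));
	return 0;
}

If a counter were left unread for long enough to wrap more than once, the delta would silently undercount; reading every active RMID once per second from the per-domain worker is what rules that out.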
[Tony: Added the initializations for the work structure and completed the patch] Signed-off-by: Tony Luck Signed-off-by: Vikas Shivappa Signed-off-by: Thomas Gleixner Cc: ravi.v.shankar@intel.com Cc: fenghua.yu@intel.com Cc: peterz@infradead.org Cc: eranian@google.com Cc: vikas.shivappa@intel.com Cc: ak@linux.intel.com Cc: davidcc@google.com Cc: reinette.chatre@intel.com Link: http://lkml.kernel.org/r/1501017287-28083-29-git-send-email-vikas.shivappa@linux.intel.com --- arch/x86/kernel/cpu/intel_rdt.c | 21 ++++++++--- arch/x86/kernel/cpu/intel_rdt.h | 10 +++++ arch/x86/kernel/cpu/intel_rdt_monitor.c | 63 ++++++++++++++++++++++++++++++++ arch/x86/kernel/cpu/intel_rdt_rdtgroup.c | 9 +++++ 4 files changed, 97 insertions(+), 6 deletions(-) (limited to 'arch') diff --git a/arch/x86/kernel/cpu/intel_rdt.c b/arch/x86/kernel/cpu/intel_rdt.c index 4c17a7060dfd..de26aa7971d4 100644 --- a/arch/x86/kernel/cpu/intel_rdt.c +++ b/arch/x86/kernel/cpu/intel_rdt.c @@ -348,12 +348,10 @@ void rdt_ctrl_update(void *arg) int cpu = smp_processor_id(); struct rdt_domain *d; - list_for_each_entry(d, &r->domains, list) { - /* Find the domain that contains this CPU */ - if (cpumask_test_cpu(cpu, &d->cpu_mask)) { - r->msr_update(d, m, r); - return; - } + d = get_domain_from_cpu(cpu, r); + if (d) { + r->msr_update(d, m, r); + return; } pr_warn_once("cpu %d not found in any domain for resource %s\n", cpu, r->name); @@ -447,6 +445,11 @@ static int domain_setup_mon_state(struct rdt_resource *r, struct rdt_domain *d) } } + if (is_mbm_enabled()) { + INIT_DELAYED_WORK(&d->mbm_over, mbm_handle_overflow); + mbm_setup_overflow_handler(d); + } + return 0; } @@ -531,7 +534,13 @@ static void domain_remove_cpu(int cpu, struct rdt_resource *r) kfree(d->mbm_total); kfree(d->mbm_local); list_del(&d->list); + if (is_mbm_enabled()) + cancel_delayed_work(&d->mbm_over); kfree(d); + } else if (r == &rdt_resources_all[RDT_RESOURCE_L3] && + cpu == d->mbm_work_cpu && is_mbm_enabled()) { + cancel_delayed_work(&d->mbm_over); + mbm_setup_overflow_handler(d); } } diff --git a/arch/x86/kernel/cpu/intel_rdt.h b/arch/x86/kernel/cpu/intel_rdt.h index f160403324d3..94e488af082e 100644 --- a/arch/x86/kernel/cpu/intel_rdt.h +++ b/arch/x86/kernel/cpu/intel_rdt.h @@ -21,10 +21,13 @@ #define QOS_L3_MBM_LOCAL_EVENT_ID 0x03 #define MBM_CNTR_WIDTH 24 +#define MBM_OVERFLOW_INTERVAL 1000 #define RMID_VAL_ERROR BIT_ULL(63) #define RMID_VAL_UNAVAIL BIT_ULL(62) +DECLARE_STATIC_KEY_FALSE(rdt_enable_key); + /** * struct mon_evt - Entry in the event list of a resource * @evtid: event id @@ -183,6 +186,9 @@ struct mbm_state { * bitmap of which limbo RMIDs are above threshold * @mbm_total: saved state for MBM total bandwidth * @mbm_local: saved state for MBM local bandwidth + * @mbm_over: worker to periodically read MBM h/w counters + * @mbm_work_cpu: + * worker cpu for MBM h/w counters * @ctrl_val: array of cache or mem ctrl values (indexed by CLOSID) * @new_ctrl: new ctrl value to be loaded * @have_new_ctrl: did user provide new_ctrl for this domain @@ -194,6 +200,8 @@ struct rdt_domain { unsigned long *rmid_busy_llc; struct mbm_state *mbm_total; struct mbm_state *mbm_local; + struct delayed_work mbm_over; + int mbm_work_cpu; u32 *ctrl_val; u32 new_ctrl; bool have_new_ctrl; @@ -411,5 +419,7 @@ void mkdir_mondata_subdir_allrdtgrp(struct rdt_resource *r, struct rdt_domain *d); void mon_event_read(struct rmid_read *rr, struct rdt_domain *d, struct rdtgroup *rdtgrp, int evtid, int first); +void mbm_setup_overflow_handler(struct rdt_domain *dom); +void 
mbm_handle_overflow(struct work_struct *work); #endif /* _ASM_X86_INTEL_RDT_H */ diff --git a/arch/x86/kernel/cpu/intel_rdt_monitor.c b/arch/x86/kernel/cpu/intel_rdt_monitor.c index 383a023fca35..d6bfdfd22abe 100644 --- a/arch/x86/kernel/cpu/intel_rdt_monitor.c +++ b/arch/x86/kernel/cpu/intel_rdt_monitor.c @@ -366,6 +366,69 @@ void mon_event_count(void *info) } } +static void mbm_update(struct rdt_domain *d, int rmid) +{ + struct rmid_read rr; + + rr.first = false; + rr.d = d; + + /* + * This is protected from concurrent reads from user + * as both the user and we hold the global mutex. + */ + if (is_mbm_total_enabled()) { + rr.evtid = QOS_L3_MBM_TOTAL_EVENT_ID; + __mon_event_count(rmid, &rr); + } + if (is_mbm_local_enabled()) { + rr.evtid = QOS_L3_MBM_LOCAL_EVENT_ID; + __mon_event_count(rmid, &rr); + } +} + +void mbm_handle_overflow(struct work_struct *work) +{ + unsigned long delay = msecs_to_jiffies(MBM_OVERFLOW_INTERVAL); + struct rdtgroup *prgrp, *crgrp; + int cpu = smp_processor_id(); + struct list_head *head; + struct rdt_domain *d; + + mutex_lock(&rdtgroup_mutex); + + if (!static_branch_likely(&rdt_enable_key)) + goto out_unlock; + + d = get_domain_from_cpu(cpu, &rdt_resources_all[RDT_RESOURCE_L3]); + if (!d) + goto out_unlock; + + list_for_each_entry(prgrp, &rdt_all_groups, rdtgroup_list) { + mbm_update(d, prgrp->mon.rmid); + + head = &prgrp->mon.crdtgrp_list; + list_for_each_entry(crgrp, head, mon.crdtgrp_list) + mbm_update(d, crgrp->mon.rmid); + } + + schedule_delayed_work_on(cpu, &d->mbm_over, delay); +out_unlock: + mutex_unlock(&rdtgroup_mutex); +} + +void mbm_setup_overflow_handler(struct rdt_domain *dom) +{ + unsigned long delay = msecs_to_jiffies(MBM_OVERFLOW_INTERVAL); + int cpu; + + if (!static_branch_likely(&rdt_enable_key)) + return; + cpu = cpumask_any(&dom->cpu_mask); + dom->mbm_work_cpu = cpu; + schedule_delayed_work_on(cpu, &dom->mbm_over, delay); +} + static int dom_data_init(struct rdt_resource *r) { struct rmid_entry *entry = NULL; diff --git a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c index 05088e301ef8..c24dd067b9c5 100644 --- a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c +++ b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c @@ -1060,6 +1060,8 @@ static struct dentry *rdt_mount(struct file_system_type *fs_type, int flags, const char *unused_dev_name, void *data) { + struct rdt_domain *dom; + struct rdt_resource *r; struct dentry *dentry; int ret; @@ -1118,6 +1120,13 @@ static struct dentry *rdt_mount(struct file_system_type *fs_type, if (rdt_alloc_capable || rdt_mon_capable) static_branch_enable(&rdt_enable_key); + + if (is_mbm_enabled()) { + r = &rdt_resources_all[RDT_RESOURCE_L3]; + list_for_each_entry(dom, &r->domains, list) + mbm_setup_overflow_handler(dom); + } + goto out; out_mondata: -- cgit v1.2.3 From 0dd2d7494cd818d06a2ae1cd840cd62124a2d25e Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Tue, 25 Jul 2017 15:39:04 -0700 Subject: x86/intel_rdt: Show bitmask of shareable resource with other executing units CPUID.(EAX=0x10, ECX=res#):EBX[31:0] reports a bit mask for a resource. Each set bit within the length of the CBM indicates the corresponding unit of the resource allocation may be used by other entities in the platform (e.g. an integrated graphics engine or hardware units outside the processor core and have direct access to the resource). 
Each cleared bit within the length of the CBM indicates the corresponding allocation unit can be configured to implement a priority-based allocation scheme without interference with other hardware agents in the system. Bits outside the length of the CBM are reserved. More details on the bit mask are described in x86 Software Developer's Manual. The bitmask is shown in "info" directory for each resource. It's up to user to decide how to use the bitmask within a CBM in a partition to share or isolate a resource with other executing units. Suggested-by: Reinette Chatre Signed-off-by: Fenghua Yu Signed-off-by: Tony Luck Signed-off-by: Thomas Gleixner Cc: ravi.v.shankar@intel.com Cc: peterz@infradead.org Cc: eranian@google.com Cc: ak@linux.intel.com Cc: davidcc@google.com Cc: vikas.shivappa@linux.intel.com Link: http://lkml.kernel.org/r/20170725223904.12996-1-tony.luck@intel.com --- Documentation/x86/intel_rdt_ui.txt | 7 +++++++ arch/x86/kernel/cpu/intel_rdt.c | 2 ++ arch/x86/kernel/cpu/intel_rdt.h | 3 +++ arch/x86/kernel/cpu/intel_rdt_rdtgroup.c | 16 ++++++++++++++++ 4 files changed, 28 insertions(+) (limited to 'arch') diff --git a/Documentation/x86/intel_rdt_ui.txt b/Documentation/x86/intel_rdt_ui.txt index 76f21e2ac176..4d8848e4e224 100644 --- a/Documentation/x86/intel_rdt_ui.txt +++ b/Documentation/x86/intel_rdt_ui.txt @@ -48,6 +48,13 @@ related to allocation: "min_cbm_bits": The minimum number of consecutive bits which must be set when writing a mask. +"shareable_bits": Bitmask of shareable resource with other executing + entities (e.g. I/O). User can use this when + setting up exclusive cache partitions. Note that + some platforms support devices that have their + own settings for cache use which can over-ride + these bits. + Memory bandwitdh(MB) subdirectory contains the following files with respect to allocation: diff --git a/arch/x86/kernel/cpu/intel_rdt.c b/arch/x86/kernel/cpu/intel_rdt.c index de26aa7971d4..da4f3898d517 100644 --- a/arch/x86/kernel/cpu/intel_rdt.c +++ b/arch/x86/kernel/cpu/intel_rdt.c @@ -193,6 +193,7 @@ static inline bool cache_alloc_hsw_probe(void) r->num_closid = 4; r->default_ctrl = max_cbm; r->cache.cbm_len = 20; + r->cache.shareable_bits = 0xc0000; r->cache.min_cbm_bits = 2; r->alloc_capable = true; r->alloc_enabled = true; @@ -260,6 +261,7 @@ static void rdt_get_cache_alloc_cfg(int idx, struct rdt_resource *r) r->num_closid = edx.split.cos_max + 1; r->cache.cbm_len = eax.split.cbm_len + 1; r->default_ctrl = BIT_MASK(eax.split.cbm_len + 1) - 1; + r->cache.shareable_bits = ebx & r->default_ctrl; r->data_width = (r->cache.cbm_len + 3) / 4; r->alloc_capable = true; r->alloc_enabled = true; diff --git a/arch/x86/kernel/cpu/intel_rdt.h b/arch/x86/kernel/cpu/intel_rdt.h index 94e488af082e..4040bf1a075c 100644 --- a/arch/x86/kernel/cpu/intel_rdt.h +++ b/arch/x86/kernel/cpu/intel_rdt.h @@ -227,12 +227,15 @@ struct msr_param { * @cbm_idx_offset: Offset of CBM index. 
CBM index is computed by: * closid * cbm_idx_multi + cbm_idx_offset * in a cache bit mask + * @shareable_bits: Bitmask of shareable resource with other + * executing entities */ struct rdt_cache { unsigned int cbm_len; unsigned int min_cbm_bits; unsigned int cbm_idx_mult; unsigned int cbm_idx_offset; + unsigned int shareable_bits; }; /** diff --git a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c index c24dd067b9c5..2621ae3f07fc 100644 --- a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c +++ b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c @@ -596,6 +596,15 @@ static int rdt_min_cbm_bits_show(struct kernfs_open_file *of, return 0; } +static int rdt_shareable_bits_show(struct kernfs_open_file *of, + struct seq_file *seq, void *v) +{ + struct rdt_resource *r = of->kn->parent->priv; + + seq_printf(seq, "%x\n", r->cache.shareable_bits); + return 0; +} + static int rdt_min_bw_show(struct kernfs_open_file *of, struct seq_file *seq, void *v) { @@ -711,6 +720,13 @@ static struct rftype res_common_files[] = { .seq_show = rdt_min_cbm_bits_show, .fflags = RF_CTRL_INFO | RFTYPE_RES_CACHE, }, + { + .name = "shareable_bits", + .mode = 0444, + .kf_ops = &rdtgroup_kf_single_ops, + .seq_show = rdt_shareable_bits_show, + .fflags = RF_CTRL_INFO | RFTYPE_RES_CACHE, + }, { .name = "min_bandwidth", .mode = 0444, -- cgit v1.2.3 From eda61c265f3656be8345fdf0334b3a77829437fc Mon Sep 17 00:00:00 2001 From: Vikas Shivappa Date: Wed, 9 Aug 2017 11:46:33 -0700 Subject: x86/intel_rdt/cqm: Clear the default RMID during hotcpu The user configured per cpu default RMID is not cleared during cpu hotplug. This may lead to incorrect RMID values after a cpu goes offline and again comes back online. Clear the per cpu default RMID during cpu offline and online handling. Reported-by: Prakyha Sai Praneeth Signed-off-by: Vikas Shivappa Signed-off-by: Thomas Gleixner Cc: ravi.v.shankar@intel.com Cc: tony.luck@intel.com Cc: fenghua.yu@intel.com Cc: peterz@infradead.org Cc: eranian@google.com Cc: ak@linux.intel.com Cc: davidcc@google.com Link: http://lkml.kernel.org/r/1502304395-7166-2-git-send-email-vikas.shivappa@linux.intel.com --- arch/x86/kernel/cpu/intel_rdt.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch') diff --git a/arch/x86/kernel/cpu/intel_rdt.c b/arch/x86/kernel/cpu/intel_rdt.c index da4f3898d517..4b9edb2617a5 100644 --- a/arch/x86/kernel/cpu/intel_rdt.c +++ b/arch/x86/kernel/cpu/intel_rdt.c @@ -551,6 +551,7 @@ static void clear_closid_rmid(int cpu) struct intel_pqr_state *state = this_cpu_ptr(&pqr_state); per_cpu(rdt_cpu_default.closid, cpu) = 0; + per_cpu(rdt_cpu_default.rmid, cpu) = 0; state->closid = 0; state->rmid = 0; wrmsr(IA32_PQR_ASSOC, 0, 0); -- cgit v1.2.3 From a9110b552d44fedbd1221eb0e5bde81da32d9350 Mon Sep 17 00:00:00 2001 From: Vikas Shivappa Date: Wed, 9 Aug 2017 11:46:34 -0700 Subject: x86/intel_rdt: Modify the intel_pqr_state for better performance Currently we have pqr_state and rdt_default_state which store the cached CLOSID/RMIDs and the user configured cpu default values respectively. We touch both of these during context switch. Put all of them in one structure so that we can spare a cache line. 
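To make the cache-line argument concrete, here is a rough userspace sketch of the consolidated per-CPU state and of how the two IDs map onto IA32_PQR_ASSOC (RMID in the low 10 bits, CLOSID in the upper 32 bits, as the comment in the hunk below states). The names intel_pqr_state_sketch and pqr_assoc_val() are illustrative only.

#include <stdint.h>
#include <stdio.h>

/*
 * Illustrative only: four u32 fields occupy 16 bytes, so the values
 * currently programmed in the MSR and the user-configured per-CPU
 * defaults can share one cache line instead of living in two separate
 * per-CPU objects, which is the point of this patch.
 */
struct intel_pqr_state_sketch {
	uint32_t cur_rmid;	/* RMID currently written to the MSR */
	uint32_t cur_closid;	/* CLOSID currently written to the MSR */
	uint32_t default_rmid;	/* per-CPU default assigned by the user */
	uint32_t default_closid;
};

/* IA32_PQR_ASSOC layout: RMID in bits 9:0, CLOSID in bits 63:32 */
static inline uint64_t pqr_assoc_val(uint32_t rmid, uint32_t closid)
{
	return ((uint64_t)closid << 32) | (rmid & 0x3ff);
}

int main(void)
{
	/* 16 bytes: cached and default IDs fit comfortably in one line */
	printf("%zu\n", sizeof(struct intel_pqr_state_sketch));
	return 0;
}

The context switch path then only has to compare the cur_* fields against the chosen closid/rmid and issue a single wrmsr when they differ, as the __intel_rdt_sched_in() hunk below shows.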
Reported-by: Thomas Gleixner Signed-off-by: Vikas Shivappa Signed-off-by: Thomas Gleixner Cc: ravi.v.shankar@intel.com Cc: tony.luck@intel.com Cc: fenghua.yu@intel.com Cc: peterz@infradead.org Cc: eranian@google.com Cc: sai.praneeth.prakhya@intel.com Cc: ak@linux.intel.com Cc: davidcc@google.com Link: http://lkml.kernel.org/r/1502304395-7166-3-git-send-email-vikas.shivappa@linux.intel.com --- arch/x86/include/asm/intel_rdt_sched.h | 30 +++++++++++++++++------------- arch/x86/kernel/cpu/intel_rdt.c | 10 ++++------ arch/x86/kernel/cpu/intel_rdt_rdtgroup.c | 10 +++++----- 3 files changed, 26 insertions(+), 24 deletions(-) (limited to 'arch') diff --git a/arch/x86/include/asm/intel_rdt_sched.h b/arch/x86/include/asm/intel_rdt_sched.h index 3badc0a87ef5..b4bbf8b21512 100644 --- a/arch/x86/include/asm/intel_rdt_sched.h +++ b/arch/x86/include/asm/intel_rdt_sched.h @@ -10,8 +10,10 @@ /** * struct intel_pqr_state - State cache for the PQR MSR - * @rmid: The cached Resource Monitoring ID - * @closid: The cached Class Of Service ID + * @cur_rmid: The cached Resource Monitoring ID + * @cur_closid: The cached Class Of Service ID + * @default_rmid: The user assigned Resource Monitoring ID + * @default_closid: The user assigned cached Class Of Service ID * * The upper 32 bits of IA32_PQR_ASSOC contain closid and the * lower 10 bits rmid. The update to IA32_PQR_ASSOC always @@ -22,12 +24,13 @@ * not change. */ struct intel_pqr_state { - u32 rmid; - u32 closid; + u32 cur_rmid; + u32 cur_closid; + u32 default_rmid; + u32 default_closid; }; DECLARE_PER_CPU(struct intel_pqr_state, pqr_state); -DECLARE_PER_CPU_READ_MOSTLY(struct intel_pqr_state, rdt_cpu_default); DECLARE_STATIC_KEY_FALSE(rdt_enable_key); DECLARE_STATIC_KEY_FALSE(rdt_alloc_enable_key); @@ -49,8 +52,9 @@ DECLARE_STATIC_KEY_FALSE(rdt_mon_enable_key); */ static void __intel_rdt_sched_in(void) { - struct intel_pqr_state newstate = this_cpu_read(rdt_cpu_default); - struct intel_pqr_state *curstate = this_cpu_ptr(&pqr_state); + struct intel_pqr_state *state = this_cpu_ptr(&pqr_state); + u32 closid = state->default_closid; + u32 rmid = state->default_rmid; /* * If this task has a closid/rmid assigned, use it. 
@@ -58,18 +62,18 @@ static void __intel_rdt_sched_in(void) */ if (static_branch_likely(&rdt_alloc_enable_key)) { if (current->closid) - newstate.closid = current->closid; + closid = current->closid; } if (static_branch_likely(&rdt_mon_enable_key)) { if (current->rmid) - newstate.rmid = current->rmid; + rmid = current->rmid; } - if (newstate.closid != curstate->closid || - newstate.rmid != curstate->rmid) { - *curstate = newstate; - wrmsr(IA32_PQR_ASSOC, newstate.rmid, newstate.closid); + if (closid != state->cur_closid || rmid != state->cur_rmid) { + state->cur_closid = closid; + state->cur_rmid = rmid; + wrmsr(IA32_PQR_ASSOC, rmid, closid); } } diff --git a/arch/x86/kernel/cpu/intel_rdt.c b/arch/x86/kernel/cpu/intel_rdt.c index 4b9edb2617a5..97c8d8321e04 100644 --- a/arch/x86/kernel/cpu/intel_rdt.c +++ b/arch/x86/kernel/cpu/intel_rdt.c @@ -47,8 +47,6 @@ DEFINE_MUTEX(rdtgroup_mutex); */ DEFINE_PER_CPU(struct intel_pqr_state, pqr_state); -DEFINE_PER_CPU_READ_MOSTLY(struct intel_pqr_state, rdt_cpu_default); - /* * Used to store the max resource name width and max resource data width * to display the schemata in a tabular format @@ -550,10 +548,10 @@ static void clear_closid_rmid(int cpu) { struct intel_pqr_state *state = this_cpu_ptr(&pqr_state); - per_cpu(rdt_cpu_default.closid, cpu) = 0; - per_cpu(rdt_cpu_default.rmid, cpu) = 0; - state->closid = 0; - state->rmid = 0; + state->default_closid = 0; + state->default_rmid = 0; + state->cur_closid = 0; + state->cur_rmid = 0; wrmsr(IA32_PQR_ASSOC, 0, 0); } diff --git a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c index 2621ae3f07fc..86a69794d7e4 100644 --- a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c +++ b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c @@ -202,8 +202,8 @@ static void update_cpu_closid_rmid(void *info) struct rdtgroup *r = info; if (r) { - this_cpu_write(rdt_cpu_default.closid, r->closid); - this_cpu_write(rdt_cpu_default.rmid, r->mon.rmid); + this_cpu_write(pqr_state.default_closid, r->closid); + this_cpu_write(pqr_state.default_rmid, r->mon.rmid); } /* @@ -1733,7 +1733,7 @@ static int rdtgroup_rmdir_mon(struct kernfs_node *kn, struct rdtgroup *rdtgrp, /* Update per cpu rmid of the moved CPUs first */ for_each_cpu(cpu, &rdtgrp->cpu_mask) - per_cpu(rdt_cpu_default.rmid, cpu) = prdtgrp->mon.rmid; + per_cpu(pqr_state.default_rmid, cpu) = prdtgrp->mon.rmid; /* * Update the MSR on moved CPUs and CPUs which have moved * task running on them. @@ -1774,8 +1774,8 @@ static int rdtgroup_rmdir_ctrl(struct kernfs_node *kn, struct rdtgroup *rdtgrp, /* Update per cpu closid and rmid of the moved CPUs first */ for_each_cpu(cpu, &rdtgrp->cpu_mask) { - per_cpu(rdt_cpu_default.closid, cpu) = rdtgroup_default.closid; - per_cpu(rdt_cpu_default.rmid, cpu) = rdtgroup_default.mon.rmid; + per_cpu(pqr_state.default_closid, cpu) = rdtgroup_default.closid; + per_cpu(pqr_state.default_rmid, cpu) = rdtgroup_default.mon.rmid; } /* -- cgit v1.2.3 From bbc4615e0b7df5e21d0991adb4b2798508354924 Mon Sep 17 00:00:00 2001 From: Vikas Shivappa Date: Tue, 15 Aug 2017 18:00:42 -0700 Subject: x86/intel_rdt/mbm: Fix MBM overflow handler during CPU hotplug When a CPU is dying, the overflow worker is canceled and rescheduled on a different CPU in the same domain. But if the timer is already about to expire this essentially doubles the interval which might result in a non detected overflow. Cancel the overflow worker and reschedule it immediately on a different CPU in same domain. 
The work could be flushed as well, but that would reschedule it on the same CPU. [ tglx: Rewrote changelog once again ] Reported-by: Thomas Gleixner Signed-off-by: Vikas Shivappa Signed-off-by: Thomas Gleixner Cc: ravi.v.shankar@intel.com Cc: tony.luck@intel.com Cc: fenghua.yu@intel.com Cc: peterz@infradead.org Cc: eranian@google.com Cc: vikas.shivappa@intel.com Cc: ak@linux.intel.com Cc: davidcc@google.com Link: http://lkml.kernel.org/r/1502845243-20454-2-git-send-email-vikas.shivappa@linux.intel.com --- arch/x86/kernel/cpu/intel_rdt.c | 4 ++-- arch/x86/kernel/cpu/intel_rdt.h | 2 +- arch/x86/kernel/cpu/intel_rdt_monitor.c | 4 ++-- arch/x86/kernel/cpu/intel_rdt_rdtgroup.c | 2 +- 4 files changed, 6 insertions(+), 6 deletions(-) (limited to 'arch') diff --git a/arch/x86/kernel/cpu/intel_rdt.c b/arch/x86/kernel/cpu/intel_rdt.c index 97c8d8321e04..b8dc141896b6 100644 --- a/arch/x86/kernel/cpu/intel_rdt.c +++ b/arch/x86/kernel/cpu/intel_rdt.c @@ -447,7 +447,7 @@ static int domain_setup_mon_state(struct rdt_resource *r, struct rdt_domain *d) if (is_mbm_enabled()) { INIT_DELAYED_WORK(&d->mbm_over, mbm_handle_overflow); - mbm_setup_overflow_handler(d); + mbm_setup_overflow_handler(d, MBM_OVERFLOW_INTERVAL); } return 0; @@ -540,7 +540,7 @@ static void domain_remove_cpu(int cpu, struct rdt_resource *r) } else if (r == &rdt_resources_all[RDT_RESOURCE_L3] && cpu == d->mbm_work_cpu && is_mbm_enabled()) { cancel_delayed_work(&d->mbm_over); - mbm_setup_overflow_handler(d); + mbm_setup_overflow_handler(d, 0); } } diff --git a/arch/x86/kernel/cpu/intel_rdt.h b/arch/x86/kernel/cpu/intel_rdt.h index 4040bf1a075c..3e4869390603 100644 --- a/arch/x86/kernel/cpu/intel_rdt.h +++ b/arch/x86/kernel/cpu/intel_rdt.h @@ -422,7 +422,7 @@ void mkdir_mondata_subdir_allrdtgrp(struct rdt_resource *r, struct rdt_domain *d); void mon_event_read(struct rmid_read *rr, struct rdt_domain *d, struct rdtgroup *rdtgrp, int evtid, int first); -void mbm_setup_overflow_handler(struct rdt_domain *dom); +void mbm_setup_overflow_handler(struct rdt_domain *dom, unsigned long delay_ms); void mbm_handle_overflow(struct work_struct *work); #endif /* _ASM_X86_INTEL_RDT_H */ diff --git a/arch/x86/kernel/cpu/intel_rdt_monitor.c b/arch/x86/kernel/cpu/intel_rdt_monitor.c index d6bfdfd22abe..8378785883dc 100644 --- a/arch/x86/kernel/cpu/intel_rdt_monitor.c +++ b/arch/x86/kernel/cpu/intel_rdt_monitor.c @@ -417,9 +417,9 @@ out_unlock: mutex_unlock(&rdtgroup_mutex); } -void mbm_setup_overflow_handler(struct rdt_domain *dom) +void mbm_setup_overflow_handler(struct rdt_domain *dom, unsigned long delay_ms) { - unsigned long delay = msecs_to_jiffies(MBM_OVERFLOW_INTERVAL); + unsigned long delay = msecs_to_jiffies(delay_ms); int cpu; if (!static_branch_likely(&rdt_enable_key)) diff --git a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c index 86a69794d7e4..b529f93e8ed0 100644 --- a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c +++ b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c @@ -1140,7 +1140,7 @@ static struct dentry *rdt_mount(struct file_system_type *fs_type, if (is_mbm_enabled()) { r = &rdt_resources_all[RDT_RESOURCE_L3]; list_for_each_entry(dom, &r->domains, list) - mbm_setup_overflow_handler(dom); + mbm_setup_overflow_handler(dom, MBM_OVERFLOW_INTERVAL); } goto out; -- cgit v1.2.3 From 24247aeeabe99eab13b798ccccc2dec066dd6f07 Mon Sep 17 00:00:00 2001 From: Vikas Shivappa Date: Tue, 15 Aug 2017 18:00:43 -0700 Subject: x86/intel_rdt/cqm: Improve limbo list processing During a mkdir, the entire limbo list is synchronously 
checked on each package for free RMIDs by sending IPIs. With a large number of RMIDs (SKL has 192) this creates an intolerable amount of work in IPIs. Replace the IPI based checking of the limbo list with asynchronous worker threads on each package which periodically scan the limbo list and move the RMIDs that have: llc_occupancy < threshold_occupancy on all packages to the free list. mkdir now returns -ENOSPC if the free list and the limbo list are empty or returns -EBUSY if there are RMIDs on the limbo list and the free list is empty. Getting rid of the IPIs also simplifies the data structures and the serialization required for handling the lists. [ tglx: Rewrote changelog ... ] Signed-off-by: Vikas Shivappa Signed-off-by: Thomas Gleixner Cc: ravi.v.shankar@intel.com Cc: tony.luck@intel.com Cc: fenghua.yu@intel.com Cc: peterz@infradead.org Cc: eranian@google.com Cc: vikas.shivappa@intel.com Cc: ak@linux.intel.com Cc: davidcc@google.com Link: http://lkml.kernel.org/r/1502845243-20454-3-git-send-email-vikas.shivappa@linux.intel.com --- arch/x86/kernel/cpu/intel_rdt.c | 31 ++++- arch/x86/kernel/cpu/intel_rdt.h | 14 ++- arch/x86/kernel/cpu/intel_rdt_monitor.c | 210 ++++++++++++++------------------ 3 files changed, 133 insertions(+), 122 deletions(-) (limited to 'arch') diff --git a/arch/x86/kernel/cpu/intel_rdt.c b/arch/x86/kernel/cpu/intel_rdt.c index b8dc141896b6..6935c8ecad7f 100644 --- a/arch/x86/kernel/cpu/intel_rdt.c +++ b/arch/x86/kernel/cpu/intel_rdt.c @@ -426,6 +426,7 @@ static int domain_setup_mon_state(struct rdt_resource *r, struct rdt_domain *d) GFP_KERNEL); if (!d->rmid_busy_llc) return -ENOMEM; + INIT_DELAYED_WORK(&d->cqm_limbo, cqm_handle_limbo); } if (is_mbm_total_enabled()) { tsize = sizeof(*d->mbm_total); @@ -536,11 +537,33 @@ static void domain_remove_cpu(int cpu, struct rdt_resource *r) list_del(&d->list); if (is_mbm_enabled()) cancel_delayed_work(&d->mbm_over); + if (is_llc_occupancy_enabled() && has_busy_rmid(r, d)) { + /* + * When a package is going down, forcefully + * decrement rmid->ebusy. There is no way to know + * that the L3 was flushed and hence may lead to + * incorrect counts in rare scenarios, but leaving + * the RMID as busy creates RMID leaks if the + * package never comes back.
+ */ + __check_limbo(d, true); + cancel_delayed_work(&d->cqm_limbo); + } + kfree(d); - } else if (r == &rdt_resources_all[RDT_RESOURCE_L3] && - cpu == d->mbm_work_cpu && is_mbm_enabled()) { - cancel_delayed_work(&d->mbm_over); - mbm_setup_overflow_handler(d, 0); + return; + } + + if (r == &rdt_resources_all[RDT_RESOURCE_L3]) { + if (is_mbm_enabled() && cpu == d->mbm_work_cpu) { + cancel_delayed_work(&d->mbm_over); + mbm_setup_overflow_handler(d, 0); + } + if (is_llc_occupancy_enabled() && cpu == d->cqm_work_cpu && + has_busy_rmid(r, d)) { + cancel_delayed_work(&d->cqm_limbo); + cqm_setup_limbo_handler(d, 0); + } } } diff --git a/arch/x86/kernel/cpu/intel_rdt.h b/arch/x86/kernel/cpu/intel_rdt.h index 3e4869390603..ebaddaeef023 100644 --- a/arch/x86/kernel/cpu/intel_rdt.h +++ b/arch/x86/kernel/cpu/intel_rdt.h @@ -20,6 +20,8 @@ #define QOS_L3_MBM_TOTAL_EVENT_ID 0x02 #define QOS_L3_MBM_LOCAL_EVENT_ID 0x03 +#define CQM_LIMBOCHECK_INTERVAL 1000 + #define MBM_CNTR_WIDTH 24 #define MBM_OVERFLOW_INTERVAL 1000 @@ -187,8 +189,11 @@ struct mbm_state { * @mbm_total: saved state for MBM total bandwidth * @mbm_local: saved state for MBM local bandwidth * @mbm_over: worker to periodically read MBM h/w counters + * @cqm_limbo: worker to periodically read CQM h/w counters * @mbm_work_cpu: * worker cpu for MBM h/w counters + * @cqm_work_cpu: + * worker cpu for CQM h/w counters * @ctrl_val: array of cache or mem ctrl values (indexed by CLOSID) * @new_ctrl: new ctrl value to be loaded * @have_new_ctrl: did user provide new_ctrl for this domain @@ -201,7 +206,9 @@ struct rdt_domain { struct mbm_state *mbm_total; struct mbm_state *mbm_local; struct delayed_work mbm_over; + struct delayed_work cqm_limbo; int mbm_work_cpu; + int cqm_work_cpu; u32 *ctrl_val; u32 new_ctrl; bool have_new_ctrl; @@ -422,7 +429,12 @@ void mkdir_mondata_subdir_allrdtgrp(struct rdt_resource *r, struct rdt_domain *d); void mon_event_read(struct rmid_read *rr, struct rdt_domain *d, struct rdtgroup *rdtgrp, int evtid, int first); -void mbm_setup_overflow_handler(struct rdt_domain *dom, unsigned long delay_ms); +void mbm_setup_overflow_handler(struct rdt_domain *dom, + unsigned long delay_ms); void mbm_handle_overflow(struct work_struct *work); +void cqm_setup_limbo_handler(struct rdt_domain *dom, unsigned long delay_ms); +void cqm_handle_limbo(struct work_struct *work); +bool has_busy_rmid(struct rdt_resource *r, struct rdt_domain *d); +void __check_limbo(struct rdt_domain *d, bool force_free); #endif /* _ASM_X86_INTEL_RDT_H */ diff --git a/arch/x86/kernel/cpu/intel_rdt_monitor.c b/arch/x86/kernel/cpu/intel_rdt_monitor.c index 8378785883dc..30827510094b 100644 --- a/arch/x86/kernel/cpu/intel_rdt_monitor.c +++ b/arch/x86/kernel/cpu/intel_rdt_monitor.c @@ -33,7 +33,7 @@ struct rmid_entry { u32 rmid; - atomic_t busy; + int busy; struct list_head list; }; @@ -45,13 +45,13 @@ struct rmid_entry { static LIST_HEAD(rmid_free_lru); /** - * @rmid_limbo_lru list of currently unused but (potentially) + * @rmid_limbo_count count of currently unused but (potentially) * dirty RMIDs. - * This list contains RMIDs that no one is currently using but that + * This counts RMIDs that no one is currently using but that * may have a occupancy value > intel_cqm_threshold. User can change * the threshold occupancy value. */ -static LIST_HEAD(rmid_limbo_lru); +unsigned int rmid_limbo_count; /** * @rmid_entry - The entry in the limbo and free lists. 
@@ -103,124 +103,53 @@ static u64 __rmid_read(u32 rmid, u32 eventid) return val; } -/* - * Walk the limbo list looking at any RMIDs that are flagged in the - * domain rmid_busy_llc bitmap as busy. If the reported LLC occupancy - * is below the threshold clear the busy bit and decrement the count. - * If the busy count gets to zero on an RMID we stop looking. - * This can be called from an IPI. - * We need an atomic for the busy count because multiple CPUs may check - * the same RMID at the same time. - */ -static bool __check_limbo(struct rdt_domain *d) -{ - struct rmid_entry *entry; - u64 val; - - list_for_each_entry(entry, &rmid_limbo_lru, list) { - if (!test_bit(entry->rmid, d->rmid_busy_llc)) - continue; - val = __rmid_read(entry->rmid, QOS_L3_OCCUP_EVENT_ID); - if (val <= intel_cqm_threshold) { - clear_bit(entry->rmid, d->rmid_busy_llc); - if (atomic_dec_and_test(&entry->busy)) - return true; - } - } - return false; -} - -static void check_limbo(void *arg) +static bool rmid_dirty(struct rmid_entry *entry) { - struct rdt_domain *d; - - d = get_domain_from_cpu(smp_processor_id(), - &rdt_resources_all[RDT_RESOURCE_L3]); - - if (d) - __check_limbo(d); -} + u64 val = __rmid_read(entry->rmid, QOS_L3_OCCUP_EVENT_ID); -static bool has_busy_rmid(struct rdt_resource *r, struct rdt_domain *d) -{ - return find_first_bit(d->rmid_busy_llc, r->num_rmid) != r->num_rmid; + return val >= intel_cqm_threshold; } /* - * Scan the limbo list and move all entries that are below the - * intel_cqm_threshold to the free list. - * Return "true" if the limbo list is empty, "false" if there are - * still some RMIDs there. + * Check the RMIDs that are marked as busy for this domain. If the + * reported LLC occupancy is below the threshold clear the busy bit and + * decrement the count. If the busy count gets to zero on an RMID, we + * free the RMID */ -static bool try_freeing_limbo_rmid(void) +void __check_limbo(struct rdt_domain *d, bool force_free) { - struct rmid_entry *entry, *tmp; + struct rmid_entry *entry; struct rdt_resource *r; - cpumask_var_t cpu_mask; - struct rdt_domain *d; - bool ret = true; - int cpu; - - if (list_empty(&rmid_limbo_lru)) - return ret; + u32 crmid = 1, nrmid; r = &rdt_resources_all[RDT_RESOURCE_L3]; - cpu = get_cpu(); - /* - * First see if we can free up an RMID by checking busy values - * on the local package. + * Skip RMID 0 and start from RMID 1 and check all the RMIDs that + * are marked as busy for occupancy < threshold. If the occupancy + * is less than the threshold decrement the busy counter of the + * RMID and move it to the free list when the counter reaches 0. 
*/ - d = get_domain_from_cpu(cpu, r); - if (d && has_busy_rmid(r, d) && __check_limbo(d)) { - list_for_each_entry_safe(entry, tmp, &rmid_limbo_lru, list) { - if (atomic_read(&entry->busy) == 0) { - list_del(&entry->list); + for (;;) { + nrmid = find_next_bit(d->rmid_busy_llc, r->num_rmid, crmid); + if (nrmid >= r->num_rmid) + break; + + entry = __rmid_entry(nrmid); + if (force_free || !rmid_dirty(entry)) { + clear_bit(entry->rmid, d->rmid_busy_llc); + if (!--entry->busy) { + rmid_limbo_count--; list_add_tail(&entry->list, &rmid_free_lru); - goto done; } } + crmid = nrmid + 1; } +} - if (!zalloc_cpumask_var(&cpu_mask, GFP_KERNEL)) { - ret = false; - goto done; - } - - /* - * Build a mask of other domains that have busy RMIDs - */ - list_for_each_entry(d, &r->domains, list) { - if (!cpumask_test_cpu(cpu, &d->cpu_mask) && - has_busy_rmid(r, d)) - cpumask_set_cpu(cpumask_any(&d->cpu_mask), cpu_mask); - } - if (cpumask_empty(cpu_mask)) { - ret = false; - goto free_mask; - } - - /* - * Scan domains with busy RMIDs to check if they still are busy - */ - on_each_cpu_mask(cpu_mask, check_limbo, NULL, true); - - /* Walk limbo list moving all free RMIDs to the &rmid_free_lru list */ - list_for_each_entry_safe(entry, tmp, &rmid_limbo_lru, list) { - if (atomic_read(&entry->busy) != 0) { - ret = false; - continue; - } - list_del(&entry->list); - list_add_tail(&entry->list, &rmid_free_lru); - } - -free_mask: - free_cpumask_var(cpu_mask); -done: - put_cpu(); - return ret; +bool has_busy_rmid(struct rdt_resource *r, struct rdt_domain *d) +{ + return find_first_bit(d->rmid_busy_llc, r->num_rmid) != r->num_rmid; } /* @@ -231,15 +160,11 @@ done: int alloc_rmid(void) { struct rmid_entry *entry; - bool ret; lockdep_assert_held(&rdtgroup_mutex); - if (list_empty(&rmid_free_lru)) { - ret = try_freeing_limbo_rmid(); - if (list_empty(&rmid_free_lru)) - return ret ? -ENOSPC : -EBUSY; - } + if (list_empty(&rmid_free_lru)) + return rmid_limbo_count ? -EBUSY : -ENOSPC; entry = list_first_entry(&rmid_free_lru, struct rmid_entry, list); @@ -252,11 +177,12 @@ static void add_rmid_to_limbo(struct rmid_entry *entry) { struct rdt_resource *r; struct rdt_domain *d; - int cpu, nbusy = 0; + int cpu; u64 val; r = &rdt_resources_all[RDT_RESOURCE_L3]; + entry->busy = 0; cpu = get_cpu(); list_for_each_entry(d, &r->domains, list) { if (cpumask_test_cpu(cpu, &d->cpu_mask)) { @@ -264,17 +190,22 @@ static void add_rmid_to_limbo(struct rmid_entry *entry) if (val <= intel_cqm_threshold) continue; } + + /* + * For the first limbo RMID in the domain, + * setup up the limbo worker. + */ + if (!has_busy_rmid(r, d)) + cqm_setup_limbo_handler(d, CQM_LIMBOCHECK_INTERVAL); set_bit(entry->rmid, d->rmid_busy_llc); - nbusy++; + entry->busy++; } put_cpu(); - if (nbusy) { - atomic_set(&entry->busy, nbusy); - list_add_tail(&entry->list, &rmid_limbo_lru); - } else { + if (entry->busy) + rmid_limbo_count++; + else list_add_tail(&entry->list, &rmid_free_lru); - } } void free_rmid(u32 rmid) @@ -387,6 +318,50 @@ static void mbm_update(struct rdt_domain *d, int rmid) } } +/* + * Handler to scan the limbo list and move the RMIDs + * to free list whose occupancy < threshold_occupancy. 
+ */ +void cqm_handle_limbo(struct work_struct *work) +{ + unsigned long delay = msecs_to_jiffies(CQM_LIMBOCHECK_INTERVAL); + int cpu = smp_processor_id(); + struct rdt_resource *r; + struct rdt_domain *d; + + mutex_lock(&rdtgroup_mutex); + + r = &rdt_resources_all[RDT_RESOURCE_L3]; + d = get_domain_from_cpu(cpu, r); + + if (!d) { + pr_warn_once("Failure to get domain for limbo worker\n"); + goto out_unlock; + } + + __check_limbo(d, false); + + if (has_busy_rmid(r, d)) + schedule_delayed_work_on(cpu, &d->cqm_limbo, delay); + +out_unlock: + mutex_unlock(&rdtgroup_mutex); +} + +void cqm_setup_limbo_handler(struct rdt_domain *dom, unsigned long delay_ms) +{ + unsigned long delay = msecs_to_jiffies(delay_ms); + struct rdt_resource *r; + int cpu; + + r = &rdt_resources_all[RDT_RESOURCE_L3]; + + cpu = cpumask_any(&dom->cpu_mask); + dom->cqm_work_cpu = cpu; + + schedule_delayed_work_on(cpu, &dom->cqm_limbo, delay); +} + void mbm_handle_overflow(struct work_struct *work) { unsigned long delay = msecs_to_jiffies(MBM_OVERFLOW_INTERVAL); @@ -413,6 +388,7 @@ void mbm_handle_overflow(struct work_struct *work) } schedule_delayed_work_on(cpu, &d->mbm_over, delay); + out_unlock: mutex_unlock(&rdtgroup_mutex); } -- cgit v1.2.3 From 5707b46a4206be2068444eb6b514a1ee070651c8 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Tue, 8 Aug 2017 10:28:59 +0100 Subject: x86/intel_rdt: Remove redundant ternary operator on return The use of the ternary operator is redundant as ret can never be non-zero at that point. Instead, just return nbytes. Detected by CoverityScan, CID#1452658 ("Logically dead code") Signed-off-by: Colin Ian King Signed-off-by: Thomas Gleixner Cc: Vikas Shivappa Cc: Fenghua Yu Cc: kernel-janitors@vger.kernel.org Link: http://lkml.kernel.org/r/20170808092859.13021-1-colin.king@canonical.com --- arch/x86/kernel/cpu/intel_rdt_rdtgroup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c index b529f93e8ed0..a869d4a073c5 100644 --- a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c +++ b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c @@ -680,7 +680,7 @@ static ssize_t max_threshold_occ_write(struct kernfs_open_file *of, intel_cqm_threshold = bytes / r->mon_scale; - return ret ?: nbytes; + return nbytes; } /* rdtgroup information files for one cache resource. */ -- cgit v1.2.3 From 0576113a387e0c8a5d9e24b4cd62605d1c9c0db8 Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Thu, 24 Aug 2017 09:26:50 -0700 Subject: x86/intel_rdt: Move special case code for Haswell to a quirk function No functional change, but lay the ground work for other per-model quirks. Signed-off-by: Tony Luck Signed-off-by: Thomas Gleixner Cc: Fenghua" Cc: Ravi V" Cc: "Peter Zijlstra" Cc: "Stephane Eranian" Cc: "Andi Kleen" Cc: "David Carrillo-Cisneros" Cc: Vikas Shivappa Link: http://lkml.kernel.org/r/f195a83751b5f8b1d8a78bd3c1914300c8fa3142.1503512900.git.tony.luck@intel.com --- arch/x86/kernel/cpu/intel_rdt.c | 52 ++++++++++++++++++++++------------------- 1 file changed, 28 insertions(+), 24 deletions(-) (limited to 'arch') diff --git a/arch/x86/kernel/cpu/intel_rdt.c b/arch/x86/kernel/cpu/intel_rdt.c index 6935c8ecad7f..25514cd454b3 100644 --- a/arch/x86/kernel/cpu/intel_rdt.c +++ b/arch/x86/kernel/cpu/intel_rdt.c @@ -172,34 +172,28 @@ static unsigned int cbm_idx(struct rdt_resource *r, unsigned int closid) * is always 20 on hsw server parts. The minimum cache bitmask length * allowed for HSW server is always 2 bits. 
Hardcode all of them. */ -static inline bool cache_alloc_hsw_probe(void) +static inline void cache_alloc_hsw_probe(void) { - if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL && - boot_cpu_data.x86 == 6 && - boot_cpu_data.x86_model == INTEL_FAM6_HASWELL_X) { - struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3]; - u32 l, h, max_cbm = BIT_MASK(20) - 1; + struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3]; + u32 l, h, max_cbm = BIT_MASK(20) - 1; - if (wrmsr_safe(IA32_L3_CBM_BASE, max_cbm, 0)) - return false; - rdmsr(IA32_L3_CBM_BASE, l, h); - - /* If all the bits were set in MSR, return success */ - if (l != max_cbm) - return false; + if (wrmsr_safe(IA32_L3_CBM_BASE, max_cbm, 0)) + return; + rdmsr(IA32_L3_CBM_BASE, l, h); - r->num_closid = 4; - r->default_ctrl = max_cbm; - r->cache.cbm_len = 20; - r->cache.shareable_bits = 0xc0000; - r->cache.min_cbm_bits = 2; - r->alloc_capable = true; - r->alloc_enabled = true; + /* If all the bits were set in MSR, return success */ + if (l != max_cbm) + return; - return true; - } + r->num_closid = 4; + r->default_ctrl = max_cbm; + r->cache.cbm_len = 20; + r->cache.shareable_bits = 0xc0000; + r->cache.min_cbm_bits = 2; + r->alloc_capable = true; + r->alloc_enabled = true; - return false; + rdt_alloc_capable = true; } /* @@ -647,7 +641,7 @@ static __init bool get_rdt_alloc_resources(void) { bool ret = false; - if (cache_alloc_hsw_probe()) + if (rdt_alloc_capable) return true; if (!boot_cpu_has(X86_FEATURE_RDT_A)) @@ -689,8 +683,18 @@ static __init bool get_rdt_mon_resources(void) return !rdt_get_mon_l3_config(&rdt_resources_all[RDT_RESOURCE_L3]); } +static __init void rdt_quirks(void) +{ + switch (boot_cpu_data.x86_model) { + case INTEL_FAM6_HASWELL_X: + cache_alloc_hsw_probe(); + break; + } +} + static __init bool get_rdt_resources(void) { + rdt_quirks(); rdt_alloc_capable = get_rdt_alloc_resources(); rdt_mon_capable = get_rdt_mon_resources(); -- cgit v1.2.3 From 1d9807fc64c131a83a96917f2b2da1c9b00cf127 Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Thu, 24 Aug 2017 09:26:51 -0700 Subject: x86/intel_rdt: Add command line options for resource director technology Command line options allow us to ignore features that we don't want. Also we can re-enable options that have been disabled on a platform (so long as the underlying h/w actually supports the option). [ tglx: Marked the option array __initdata and the helper function __init ] Signed-off-by: Tony Luck Signed-off-by: Thomas Gleixner Cc: Fenghua" Cc: Ravi V" Cc: "Peter Zijlstra" Cc: "Stephane Eranian" Cc: "Andi Kleen" Cc: "David Carrillo-Cisneros" Cc: Vikas Shivappa Link: http://lkml.kernel.org/r/0c37b0d4dbc30977a3c1cee08b66420f83662694.1503512900.git.tony.luck@intel.com --- Documentation/admin-guide/kernel-parameters.rst | 1 + Documentation/admin-guide/kernel-parameters.txt | 6 ++ arch/x86/kernel/cpu/intel_rdt.c | 96 ++++++++++++++++++++++--- 3 files changed, 95 insertions(+), 8 deletions(-) (limited to 'arch') diff --git a/Documentation/admin-guide/kernel-parameters.rst b/Documentation/admin-guide/kernel-parameters.rst index d76ab3907e2b..b2598cc9834c 100644 --- a/Documentation/admin-guide/kernel-parameters.rst +++ b/Documentation/admin-guide/kernel-parameters.rst @@ -138,6 +138,7 @@ parameter is applicable:: PPT Parallel port support is enabled. PS2 Appropriate PS/2 support is enabled. RAM RAM disk support is enabled. + RDT Intel Resource Director Technology. S390 S390 architecture is enabled. SCSI Appropriate SCSI support is enabled. 
A lot of drivers have their options described inside diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index d9c171ce4190..ef52ae4ed6e8 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -3598,6 +3598,12 @@ Run specified binary instead of /init from the ramdisk, used for early userspace startup. See initrd. + rdt= [HW,X86,RDT] + Turn on/off individual RDT features. List is: + cmt, mbmtotal, mbmlocal, l3cat, l3cdp, l2cat, mba. + E.g. to turn on cmt and turn off mba use: + rdt=cmt,!mba + reboot= [KNL] Format (x86 or x86_64): [w[arm] | c[old] | h[ard] | s[oft] | g[pio]] \ diff --git a/arch/x86/kernel/cpu/intel_rdt.c b/arch/x86/kernel/cpu/intel_rdt.c index 25514cd454b3..b641622003cf 100644 --- a/arch/x86/kernel/cpu/intel_rdt.c +++ b/arch/x86/kernel/cpu/intel_rdt.c @@ -637,6 +637,85 @@ static __init void rdt_init_padding(void) } } +enum { + RDT_FLAG_CMT, + RDT_FLAG_MBM_TOTAL, + RDT_FLAG_MBM_LOCAL, + RDT_FLAG_L3_CAT, + RDT_FLAG_L3_CDP, + RDT_FLAG_L2_CAT, + RDT_FLAG_MBA, +}; + +#define RDT_OPT(idx, n, f) \ +[idx] = { \ + .name = n, \ + .flag = f \ +} + +struct rdt_options { + char *name; + int flag; + bool force_off, force_on; +}; + +static struct rdt_options rdt_options[] __initdata = { + RDT_OPT(RDT_FLAG_CMT, "cmt", X86_FEATURE_CQM_OCCUP_LLC), + RDT_OPT(RDT_FLAG_MBM_TOTAL, "mbmtotal", X86_FEATURE_CQM_MBM_TOTAL), + RDT_OPT(RDT_FLAG_MBM_LOCAL, "mbmlocal", X86_FEATURE_CQM_MBM_LOCAL), + RDT_OPT(RDT_FLAG_L3_CAT, "l3cat", X86_FEATURE_CAT_L3), + RDT_OPT(RDT_FLAG_L3_CDP, "l3cdp", X86_FEATURE_CDP_L3), + RDT_OPT(RDT_FLAG_L2_CAT, "l2cat", X86_FEATURE_CAT_L2), + RDT_OPT(RDT_FLAG_MBA, "mba", X86_FEATURE_MBA), +}; +#define NUM_RDT_OPTIONS ARRAY_SIZE(rdt_options) + +static int __init set_rdt_options(char *str) +{ + struct rdt_options *o; + bool force_off; + char *tok; + + if (*str == '=') + str++; + while ((tok = strsep(&str, ",")) != NULL) { + force_off = *tok == '!'; + if (force_off) + tok++; + for (o = rdt_options; o < &rdt_options[NUM_RDT_OPTIONS]; o++) { + if (strcmp(tok, o->name) == 0) { + if (force_off) + o->force_off = true; + else + o->force_on = true; + break; + } + } + } + return 1; +} +__setup("rdt", set_rdt_options); + +static bool __init rdt_cpu_has(int flag) +{ + bool ret = boot_cpu_has(flag); + struct rdt_options *o; + + if (!ret) + return ret; + + for (o = rdt_options; o < &rdt_options[NUM_RDT_OPTIONS]; o++) { + if (flag == o->flag) { + if (o->force_off) + ret = false; + if (o->force_on) + ret = true; + break; + } + } + return ret; +} + static __init bool get_rdt_alloc_resources(void) { bool ret = false; @@ -647,21 +726,21 @@ static __init bool get_rdt_alloc_resources(void) if (!boot_cpu_has(X86_FEATURE_RDT_A)) return false; - if (boot_cpu_has(X86_FEATURE_CAT_L3)) { + if (rdt_cpu_has(X86_FEATURE_CAT_L3)) { rdt_get_cache_alloc_cfg(1, &rdt_resources_all[RDT_RESOURCE_L3]); - if (boot_cpu_has(X86_FEATURE_CDP_L3)) { + if (rdt_cpu_has(X86_FEATURE_CDP_L3)) { rdt_get_cdp_l3_config(RDT_RESOURCE_L3DATA); rdt_get_cdp_l3_config(RDT_RESOURCE_L3CODE); } ret = true; } - if (boot_cpu_has(X86_FEATURE_CAT_L2)) { + if (rdt_cpu_has(X86_FEATURE_CAT_L2)) { /* CPUID 0x10.2 fields are same format at 0x10.1 */ rdt_get_cache_alloc_cfg(2, &rdt_resources_all[RDT_RESOURCE_L2]); ret = true; } - if (boot_cpu_has(X86_FEATURE_MBA)) { + if (rdt_cpu_has(X86_FEATURE_MBA)) { if (rdt_get_mem_config(&rdt_resources_all[RDT_RESOURCE_MBA])) ret = true; } @@ -670,11 +749,11 @@ static __init bool 
get_rdt_alloc_resources(void) static __init bool get_rdt_mon_resources(void) { - if (boot_cpu_has(X86_FEATURE_CQM_OCCUP_LLC)) + if (rdt_cpu_has(X86_FEATURE_CQM_OCCUP_LLC)) rdt_mon_features |= (1 << QOS_L3_OCCUP_EVENT_ID); - if (boot_cpu_has(X86_FEATURE_CQM_MBM_TOTAL)) + if (rdt_cpu_has(X86_FEATURE_CQM_MBM_TOTAL)) rdt_mon_features |= (1 << QOS_L3_MBM_TOTAL_EVENT_ID); - if (boot_cpu_has(X86_FEATURE_CQM_MBM_LOCAL)) + if (rdt_cpu_has(X86_FEATURE_CQM_MBM_LOCAL)) rdt_mon_features |= (1 << QOS_L3_MBM_LOCAL_EVENT_ID); if (!rdt_mon_features) @@ -687,7 +766,8 @@ static __init void rdt_quirks(void) { switch (boot_cpu_data.x86_model) { case INTEL_FAM6_HASWELL_X: - cache_alloc_hsw_probe(); + if (!rdt_options[RDT_FLAG_L3_CAT].force_off) + cache_alloc_hsw_probe(); break; } } -- cgit v1.2.3 From d56593eb5eda8f593db92927059697bbf89bc4b3 Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Thu, 24 Aug 2017 09:26:52 -0700 Subject: x86/intel_rdt: Turn off most RDT features on Skylake Errata list is included in this document: https://www.intel.com/content/dam/www/public/us/en/documents/specification-updates/6th-gen-x-series-spec-update.pdf with more details in: https://www.intel.com/content/www/us/en/processors/xeon/scalable/xeon-scalable-spec-update.html But the tl;dr summary (using tags from the first of the documents) is: SKZ4 MBM does not accurately track write bandwidth SKZ17 CMT counters may not count accurately SKZ18 CAT may not restrict cacheline allocation under certain conditions SKZ19 MBM counters may undercount Disable all these features on Skylake models. Users who understand the errata may re-enable them using boot command line options. Signed-off-by: Tony Luck Cc: Fenghua Yu Cc: Ravi V Shankar Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Andi Kleen Cc: David Carrillo-Cisneros Cc: Vikas Shivappa Link: http://lkml.kernel.org/r/3aea0a3bae219062c812668bd9b7b8f1a25003ba.1503512900.git.tony.luck@intel.com Signed-off-by: Thomas Gleixner --- arch/x86/kernel/cpu/intel_rdt.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch') diff --git a/arch/x86/kernel/cpu/intel_rdt.c b/arch/x86/kernel/cpu/intel_rdt.c index b641622003cf..cd5fc61ba450 100644 --- a/arch/x86/kernel/cpu/intel_rdt.c +++ b/arch/x86/kernel/cpu/intel_rdt.c @@ -769,6 +769,9 @@ static __init void rdt_quirks(void) if (!rdt_options[RDT_FLAG_L3_CAT].force_off) cache_alloc_hsw_probe(); break; + case INTEL_FAM6_SKYLAKE_X: + if (boot_cpu_data.x86_mask <= 4) + set_rdt_options("!cmt,!mbmtotal,!mbmlocal,!l3cat"); } -- cgit v1.2.3
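For readers combining the last two patches, here is a small userspace sketch of the resulting precedence between the CPUID capability, a model quirk such as the Skylake one above, and an explicit rdt= boot option. feature_enabled() is an illustrative stand-in for rdt_cpu_has(); the real code keeps the force_off/force_on bits in rdt_options[].

#include <stdbool.h>
#include <stdio.h>

/*
 * Illustrative only: the CPUID bit is the baseline, a quirk or
 * "rdt=!foo" can force a feature off, and an explicit "rdt=foo" on the
 * command line wins over the quirk. A feature the hardware lacks can
 * never be forced on.
 */
static bool feature_enabled(bool cpuid_has, bool force_off, bool force_on)
{
	bool ret = cpuid_has;

	if (!ret)
		return false;
	if (force_off)
		ret = false;
	if (force_on)
		ret = true;
	return ret;
}

int main(void)
{
	/* Skylake stepping <= 4: quirk forces cmt off, user boots with rdt=cmt */
	printf("%d\n", feature_enabled(true, true, true));	/* 1: explicit option wins */

	/* Same quirk, no override on the command line */
	printf("%d\n", feature_enabled(true, true, false));	/* 0: cmt stays disabled */
	return 0;
}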