From ff303e66c240ba6269e31817a386995440a18c99 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 17 Apr 2015 20:05:30 +0200 Subject: perf: Fix software migrate events Stephane asked about PERF_COUNT_SW_CPU_MIGRATIONS and I realized it was borken: > The problem is that the task isn't actually scheduled while its being > migrated (obviously), and if its not scheduled, the counters aren't > scheduled either, so there's no observing of the fact. > > A further problem with migrations is that many migrations happen from > softirq context, which is nested inside the 'random' task context of > whoemever happens to run at that time, similarly for the wakeup > migrations triggered from (soft)irq context. All those end up being > accounted in the task that's currently running, eg. your 'ls'. The below cures this by marking a task as migrated and accounting it on the subsequent sched_in(). Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Borislav Petkov Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- include/linux/perf_event.h | 24 ++++++++++++++++++++++++ include/linux/sched.h | 7 ++++--- 2 files changed, 28 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 61992cf2e977..e86f85abeda7 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -798,11 +798,33 @@ perf_sw_event_sched(u32 event_id, u64 nr, u64 addr) extern struct static_key_deferred perf_sched_events; +static __always_inline bool +perf_sw_migrate_enabled(void) +{ + if (static_key_false(&perf_swevent_enabled[PERF_COUNT_SW_CPU_MIGRATIONS])) + return true; + return false; +} + +static inline void perf_event_task_migrate(struct task_struct *task) +{ + if (perf_sw_migrate_enabled()) + task->sched_migrated = 1; +} + static inline void perf_event_task_sched_in(struct task_struct *prev, struct task_struct *task) { if (static_key_false(&perf_sched_events.key)) __perf_event_task_sched_in(prev, task); + + if (perf_sw_migrate_enabled() && task->sched_migrated) { + struct pt_regs *regs = this_cpu_ptr(&__perf_regs[0]); + + perf_fetch_caller_regs(regs); + ___perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, regs, 0); + task->sched_migrated = 0; + } } static inline void perf_event_task_sched_out(struct task_struct *prev, @@ -925,6 +947,8 @@ perf_aux_output_skip(struct perf_output_handle *handle, static inline void * perf_get_aux(struct perf_output_handle *handle) { return NULL; } static inline void +perf_event_task_migrate(struct task_struct *task) { } +static inline void perf_event_task_sched_in(struct task_struct *prev, struct task_struct *task) { } static inline void diff --git a/include/linux/sched.h b/include/linux/sched.h index 26a2e6122734..2c5e6c3db654 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1356,9 +1356,6 @@ struct task_struct { #endif struct mm_struct *mm, *active_mm; -#ifdef CONFIG_COMPAT_BRK - unsigned brk_randomized:1; -#endif /* per-thread vma caching */ u32 vmacache_seqnum; struct vm_area_struct *vmacache[VMACACHE_SIZE]; @@ -1381,10 +1378,14 @@ struct task_struct { /* Revert to default priority/policy when forking */ unsigned sched_reset_on_fork:1; unsigned sched_contributes_to_load:1; + unsigned sched_migrated:1; #ifdef CONFIG_MEMCG_KMEM unsigned memcg_kmem_skip_account:1; #endif +#ifdef CONFIG_COMPAT_BRK + unsigned brk_randomized:1; +#endif unsigned long atomic_flags; /* Flags needing atomic access. 
*/ -- cgit v1.2.3 From b3df4ec4424f27e55d754cfe586195fecca1c4e4 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 19 May 2015 00:00:51 +0000 Subject: perf/x86/intel/cqm: Use proper data types 'int' is really not a proper data type for an MSR. Use u32 to make it clear that we are dealing with a 32-bit unsigned hardware value. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Acked-by: Matt Fleming Cc: Kanaka Juvva Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Vikas Shivappa Cc: Will Auld Link: http://lkml.kernel.org/r/20150518235149.919350144@linutronix.de Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_intel_cqm.c | 4 ++-- include/linux/perf_event.h | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/arch/x86/kernel/cpu/perf_event_intel_cqm.c b/arch/x86/kernel/cpu/perf_event_intel_cqm.c index 572582e2143e..3e9a7fbfce58 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_cqm.c +++ b/arch/x86/kernel/cpu/perf_event_intel_cqm.c @@ -18,7 +18,7 @@ static unsigned int cqm_l3_scale; /* supposedly cacheline size */ struct intel_cqm_state { raw_spinlock_t lock; - int rmid; + u32 rmid; int cnt; }; @@ -962,7 +962,7 @@ out: static void intel_cqm_event_start(struct perf_event *event, int mode) { struct intel_cqm_state *state = this_cpu_ptr(&cqm_state); - unsigned int rmid = event->hw.cqm_rmid; + u32 rmid = event->hw.cqm_rmid; unsigned long flags; if (!(event->hw.cqm_state & PERF_HES_STOPPED)) diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 248f7829ce41..06580028cee6 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -120,7 +120,7 @@ struct hw_perf_event { }; struct { /* intel_cqm */ int cqm_state; - int cqm_rmid; + u32 cqm_rmid; struct list_head cqm_events_entry; struct list_head cqm_groups_entry; struct list_head cqm_group_entry; -- cgit v1.2.3 From c9fdfa14c3792c0160849c484e83aa57afd80ccc Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Thu, 14 May 2015 23:09:58 +0200 Subject: perf: add new PERF_SAMPLE_BRANCH_IND_JUMP branch sample type This patch adds a new branch_sample_type flag to enable filtering branch sampling to indirect jumps. The support is subject to hardware or kernel software support on each architecture. Filtering on indirect jump is useful to study the targets of the jump. Signed-off-by: Stephane Eranian Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Andi Kleen Cc: Andrew Morton Cc: H. 
Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: acme@redhat.com Cc: dsahern@gmail.com Cc: jolsa@redhat.com Cc: kan.liang@intel.com Cc: namhyung@kernel.org Link: http://lkml.kernel.org/r/1431637800-31061-2-git-send-email-eranian@google.com Signed-off-by: Ingo Molnar --- include/uapi/linux/perf_event.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index 309211b3eb67..c4622f1ce046 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -167,6 +167,7 @@ enum perf_branch_sample_type_shift { PERF_SAMPLE_BRANCH_COND_SHIFT = 10, /* conditional branches */ PERF_SAMPLE_BRANCH_CALL_STACK_SHIFT = 11, /* call/ret stack */ + PERF_SAMPLE_BRANCH_IND_JUMP_SHIFT = 12, /* indirect jumps */ PERF_SAMPLE_BRANCH_MAX_SHIFT /* non-ABI */ }; @@ -186,6 +187,7 @@ enum perf_branch_sample_type { PERF_SAMPLE_BRANCH_COND = 1U << PERF_SAMPLE_BRANCH_COND_SHIFT, PERF_SAMPLE_BRANCH_CALL_STACK = 1U << PERF_SAMPLE_BRANCH_CALL_STACK_SHIFT, + PERF_SAMPLE_BRANCH_IND_JUMP = 1U << PERF_SAMPLE_BRANCH_IND_JUMP_SHIFT, PERF_SAMPLE_BRANCH_MAX = 1U << PERF_SAMPLE_BRANCH_MAX_SHIFT, }; -- cgit v1.2.3 From 21509084f999d7accd32e45961ef76853112e978 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Wed, 6 May 2015 15:33:49 -0400 Subject: perf/x86/intel: Handle multiple records in the PEBS buffer When the PEBS interrupt threshold is larger than one record and the machine supports multiple PEBS events, the records of these events are mixed up and we need to demultiplex them. Demuxing the records is hard because the hardware is deficient. The hardware has two issues that, when combined, create impossible scenarios to demux. The first issue is that the 'status' field of the PEBS record is a copy of the GLOBAL_STATUS MSR at PEBS assist time. To see why this is a problem let us first describe the regular PEBS cycle: A) the CTRn value reaches 0: - the corresponding bit in GLOBAL_STATUS gets set - we start arming the hardware assist < some unspecified amount of time later -- this could cover multiple events of interest > B) the hardware assist is armed, any next event will trigger it C) a matching event happens: - the hardware assist triggers and generates a PEBS record this includes a copy of GLOBAL_STATUS at this moment - if we auto-reload we (re)set CTRn - we clear the relevant bit in GLOBAL_STATUS Now consider the following chain of events: A0, B0, A1, C0 The event generated for counter 0 will include a status with counter 1 set, even though its not at all related to the record. A similar thing can happen with a !PEBS event if it just happens to overflow at the right moment. The second issue is that the hardware will only emit one record for two or more counters if the event that triggers the assist is 'close'. The 'close' can be several cycles. In some cases even the complete assist, if the event is something that doesn't need retirement. For instance, consider this chain of events: A0, B0, A1, B1, C01 Where C01 is an event that triggers both hardware assists, we will generate but a single record, but again with both counters listed in the status field. This time the record pertains to both events. Note that these two cases are different but undistinguishable with the data as generated. Therefore demuxing records with multiple PEBS bits (we can safely ignore status bits for !PEBS counters) is impossible. 
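
To make the attribution rule concrete before the changelog continues, here is a minimal standalone sketch of the check the patch below performs. It is illustrative only: the names and types are simplified stand-ins (the real code tests the status word of struct pebs_record_nhm against cpuc->pebs_enabled), and MAX_PEBS_EVENTS is hard-coded here for the sketch.

#include <stdbool.h>
#include <stdint.h>

#define MAX_PEBS_EVENTS	8	/* illustrative; mirrors the kernel's limit */

/*
 * A PEBS record may be attributed to counter 'bit' only if, after
 * masking off status bits that do not belong to enabled PEBS counters,
 * exactly that one bit remains set.  Any other pattern is either noise
 * from unrelated (!PEBS) counters, which the mask removes, or a genuine
 * collision between two PEBS events, which has to be dropped.
 */
static bool pebs_record_is_unambiguous(uint64_t record_status,
				       uint64_t pebs_enabled, int bit)
{
	uint64_t status = record_status & pebs_enabled;

	status &= (1ULL << MAX_PEBS_EVENTS) - 1;
	return status == (1ULL << bit);
}

Records that fail this test are the collisions the changelog goes on to discuss.
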
Furthermore we cannot emit the record to both events because that might cause a data leak -- the events might not have the same privileges -- so what this patch does is discard such events. The assumption/hope is that such discards will be rare. Here lists some possible ways you may get high discard rate. - when you count the same thing multiple times. But it is not a useful configuration. - you can be unfortunate if you measure with a userspace only PEBS event along with either a kernel or unrestricted PEBS event. Imagine the event triggering and setting the overflow flag right before entering the kernel. Then all kernel side events will end up with multiple bits set. Signed-off-by: Yan, Zheng Signed-off-by: Kan Liang [ Changelog improvements. ] Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: acme@infradead.org Cc: eranian@google.com Link: http://lkml.kernel.org/r/1430940834-8964-4-git-send-email-kan.liang@intel.com Signed-off-by: Ingo Molnar Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_intel_ds.c | 142 +++++++++++++++++++++--------- include/linux/perf_event.h | 13 +++ kernel/events/core.c | 6 +- kernel/events/internal.h | 9 -- 4 files changed, 116 insertions(+), 54 deletions(-) (limited to 'include') diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c index a5fe561c4902..72529c237e6e 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_ds.c +++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c @@ -872,6 +872,9 @@ static void setup_pebs_sample_data(struct perf_event *event, int fll, fst, dsrc; int fl = event->hw.flags; + if (pebs == NULL) + return; + sample_type = event->attr.sample_type; dsrc = sample_type & PERF_SAMPLE_DATA_SRC; @@ -966,19 +969,68 @@ static void setup_pebs_sample_data(struct perf_event *event, data->br_stack = &cpuc->lbr_stack; } +static inline void * +get_next_pebs_record_by_bit(void *base, void *top, int bit) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + void *at; + u64 pebs_status; + + if (base == NULL) + return NULL; + + for (at = base; at < top; at += x86_pmu.pebs_record_size) { + struct pebs_record_nhm *p = at; + + if (test_bit(bit, (unsigned long *)&p->status)) { + + if (p->status == (1 << bit)) + return at; + + /* clear non-PEBS bit and re-check */ + pebs_status = p->status & cpuc->pebs_enabled; + pebs_status &= (1ULL << MAX_PEBS_EVENTS) - 1; + if (pebs_status == (1 << bit)) + return at; + } + } + return NULL; +} + static void __intel_pmu_pebs_event(struct perf_event *event, - struct pt_regs *iregs, void *__pebs) + struct pt_regs *iregs, + void *base, void *top, + int bit, int count) { struct perf_sample_data data; struct pt_regs regs; + int i; + void *at = get_next_pebs_record_by_bit(base, top, bit); - if (!intel_pmu_save_and_restart(event)) + if (!intel_pmu_save_and_restart(event) && + !(event->hw.flags & PERF_X86_EVENT_AUTO_RELOAD)) return; - setup_pebs_sample_data(event, iregs, __pebs, &data, ®s); + if (count > 1) { + for (i = 0; i < count - 1; i++) { + setup_pebs_sample_data(event, iregs, at, &data, ®s); + perf_event_output(event, &data, ®s); + at += x86_pmu.pebs_record_size; + at = get_next_pebs_record_by_bit(at, top, bit); + } + } + + setup_pebs_sample_data(event, iregs, at, &data, ®s); - if (perf_event_overflow(event, &data, ®s)) + /* + * All but the last records are processed. + * The last one is left to be able to call the overflow handler. 
+ */ + if (perf_event_overflow(event, &data, ®s)) { x86_pmu_stop(event, 0); + return; + } + } static void intel_pmu_drain_pebs_core(struct pt_regs *iregs) @@ -1008,72 +1060,78 @@ static void intel_pmu_drain_pebs_core(struct pt_regs *iregs) if (!event->attr.precise_ip) return; - n = top - at; + n = (top - at) / x86_pmu.pebs_record_size; if (n <= 0) return; - /* - * Should not happen, we program the threshold at 1 and do not - * set a reset value. - */ - WARN_ONCE(n > 1, "bad leftover pebs %d\n", n); - at += n - 1; - - __intel_pmu_pebs_event(event, iregs, at); + __intel_pmu_pebs_event(event, iregs, at, top, 0, n); } static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); struct debug_store *ds = cpuc->ds; - struct perf_event *event = NULL; - void *at, *top; - u64 status = 0; + struct perf_event *event; + void *base, *at, *top; int bit; + short counts[MAX_PEBS_EVENTS] = {}; if (!x86_pmu.pebs_active) return; - at = (struct pebs_record_nhm *)(unsigned long)ds->pebs_buffer_base; + base = (struct pebs_record_nhm *)(unsigned long)ds->pebs_buffer_base; top = (struct pebs_record_nhm *)(unsigned long)ds->pebs_index; ds->pebs_index = ds->pebs_buffer_base; - if (unlikely(at > top)) + if (unlikely(base >= top)) return; - /* - * Should not happen, we program the threshold at 1 and do not - * set a reset value. - */ - WARN_ONCE(top - at > x86_pmu.max_pebs_events * x86_pmu.pebs_record_size, - "Unexpected number of pebs records %ld\n", - (long)(top - at) / x86_pmu.pebs_record_size); - - for (; at < top; at += x86_pmu.pebs_record_size) { + for (at = base; at < top; at += x86_pmu.pebs_record_size) { struct pebs_record_nhm *p = at; - for_each_set_bit(bit, (unsigned long *)&p->status, - x86_pmu.max_pebs_events) { - event = cpuc->events[bit]; - if (!test_bit(bit, cpuc->active_mask)) - continue; - - WARN_ON_ONCE(!event); - - if (!event->attr.precise_ip) - continue; + bit = find_first_bit((unsigned long *)&p->status, + x86_pmu.max_pebs_events); + if (bit >= x86_pmu.max_pebs_events) + continue; + if (!test_bit(bit, cpuc->active_mask)) + continue; + /* + * The PEBS hardware does not deal well with the situation + * when events happen near to each other and multiple bits + * are set. But it should happen rarely. + * + * If these events include one PEBS and multiple non-PEBS + * events, it doesn't impact PEBS record. The record will + * be handled normally. (slow path) + * + * If these events include two or more PEBS events, the + * records for the events can be collapsed into a single + * one, and it's not possible to reconstruct all events + * that caused the PEBS record. It's called collision. + * If collision happened, the record will be dropped. 
+ * + */ + if (p->status != (1 << bit)) { + u64 pebs_status; - if (__test_and_set_bit(bit, (unsigned long *)&status)) + /* slow path */ + pebs_status = p->status & cpuc->pebs_enabled; + pebs_status &= (1ULL << MAX_PEBS_EVENTS) - 1; + if (pebs_status != (1 << bit)) continue; - - break; } + counts[bit]++; + } - if (!event || bit >= x86_pmu.max_pebs_events) + for (bit = 0; bit < x86_pmu.max_pebs_events; bit++) { + if (counts[bit] == 0) continue; + event = cpuc->events[bit]; + WARN_ON_ONCE(!event); + WARN_ON_ONCE(!event->attr.precise_ip); - __intel_pmu_pebs_event(event, iregs, at); + __intel_pmu_pebs_event(event, iregs, base, top, bit, counts[bit]); } } diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 06580028cee6..5f192e1bc98e 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -730,6 +730,19 @@ extern int perf_event_overflow(struct perf_event *event, struct perf_sample_data *data, struct pt_regs *regs); +extern void perf_event_output(struct perf_event *event, + struct perf_sample_data *data, + struct pt_regs *regs); + +extern void +perf_event_header__init_id(struct perf_event_header *header, + struct perf_sample_data *data, + struct perf_event *event); +extern void +perf_event__output_id_sample(struct perf_event *event, + struct perf_output_handle *handle, + struct perf_sample_data *sample); + static inline bool is_sampling_event(struct perf_event *event) { return event->attr.sample_period != 0; diff --git a/kernel/events/core.c b/kernel/events/core.c index eddf1ed4155e..e499b4e43aff 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -5381,9 +5381,9 @@ void perf_prepare_sample(struct perf_event_header *header, } } -static void perf_event_output(struct perf_event *event, - struct perf_sample_data *data, - struct pt_regs *regs) +void perf_event_output(struct perf_event *event, + struct perf_sample_data *data, + struct pt_regs *regs) { struct perf_output_handle handle; struct perf_event_header header; diff --git a/kernel/events/internal.h b/kernel/events/internal.h index 9f6ce9ba4a04..2deb24c7a40d 100644 --- a/kernel/events/internal.h +++ b/kernel/events/internal.h @@ -72,15 +72,6 @@ static inline bool rb_has_aux(struct ring_buffer *rb) void perf_event_aux_event(struct perf_event *event, unsigned long head, unsigned long size, u64 flags); -extern void -perf_event_header__init_id(struct perf_event_header *header, - struct perf_sample_data *data, - struct perf_event *event); -extern void -perf_event__output_id_sample(struct perf_event *event, - struct perf_output_handle *handle, - struct perf_sample_data *sample); - extern struct page * perf_mmap_to_page(struct ring_buffer *rb, unsigned long pgoff); -- cgit v1.2.3 From f38b0dbb491a6987e198aa6b428db8692a6480f8 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Sun, 10 May 2015 15:13:14 -0400 Subject: perf/x86/intel: Introduce PERF_RECORD_LOST_SAMPLES After enlarging the PEBS interrupt threshold, there may be some mixed up PEBS samples which are discarded by the kernel. This patch makes the kernel emit a PERF_RECORD_LOST_SAMPLES record with the number of possible discarded records when it is impossible to demux the samples. It makes sure the user is not left in the dark about such discards. Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: H. 
Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: acme@infradead.org Cc: eranian@google.com Link: http://lkml.kernel.org/r/1431285195-14269-8-git-send-email-kan.liang@intel.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_intel_ds.c | 20 ++++++++++++++++--- include/linux/perf_event.h | 3 +++ include/uapi/linux/perf_event.h | 12 +++++++++++ kernel/events/core.c | 33 +++++++++++++++++++++++++++++++ 4 files changed, 65 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c index 266079a3a646..34d0c4816141 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_ds.c +++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c @@ -1126,6 +1126,7 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs) void *base, *at, *top; int bit; short counts[MAX_PEBS_EVENTS] = {}; + short error[MAX_PEBS_EVENTS] = {}; if (!x86_pmu.pebs_active) return; @@ -1169,20 +1170,33 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs) /* slow path */ pebs_status = p->status & cpuc->pebs_enabled; pebs_status &= (1ULL << MAX_PEBS_EVENTS) - 1; - if (pebs_status != (1 << bit)) + if (pebs_status != (1 << bit)) { + u8 i; + + for_each_set_bit(i, (unsigned long *)&pebs_status, + MAX_PEBS_EVENTS) + error[i]++; continue; + } } counts[bit]++; } for (bit = 0; bit < x86_pmu.max_pebs_events; bit++) { - if (counts[bit] == 0) + if ((counts[bit] == 0) && (error[bit] == 0)) continue; event = cpuc->events[bit]; WARN_ON_ONCE(!event); WARN_ON_ONCE(!event->attr.precise_ip); - __intel_pmu_pebs_event(event, iregs, base, top, bit, counts[bit]); + /* log dropped samples number */ + if (error[bit]) + perf_log_lost_samples(event, error[bit]); + + if (counts[bit]) { + __intel_pmu_pebs_event(event, iregs, base, + top, bit, counts[bit]); + } } } diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 5f192e1bc98e..a204d5266f5f 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -743,6 +743,9 @@ perf_event__output_id_sample(struct perf_event *event, struct perf_output_handle *handle, struct perf_sample_data *sample); +extern void +perf_log_lost_samples(struct perf_event *event, u64 lost); + static inline bool is_sampling_event(struct perf_event *event) { return event->attr.sample_period != 0; diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index c4622f1ce046..613ed9ad588f 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -802,6 +802,18 @@ enum perf_event_type { */ PERF_RECORD_ITRACE_START = 12, + /* + * Records the dropped/lost sample number. 
+ * + * struct { + * struct perf_event_header header; + * + * u64 lost; + * struct sample_id sample_id; + * }; + */ + PERF_RECORD_LOST_SAMPLES = 13, + PERF_RECORD_MAX, /* non-ABI */ }; diff --git a/kernel/events/core.c b/kernel/events/core.c index e499b4e43aff..9e0773d5d110 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -5974,6 +5974,39 @@ void perf_event_aux_event(struct perf_event *event, unsigned long head, perf_output_end(&handle); } +/* + * Lost/dropped samples logging + */ +void perf_log_lost_samples(struct perf_event *event, u64 lost) +{ + struct perf_output_handle handle; + struct perf_sample_data sample; + int ret; + + struct { + struct perf_event_header header; + u64 lost; + } lost_samples_event = { + .header = { + .type = PERF_RECORD_LOST_SAMPLES, + .misc = 0, + .size = sizeof(lost_samples_event), + }, + .lost = lost, + }; + + perf_event_header__init_id(&lost_samples_event.header, &sample, event); + + ret = perf_output_begin(&handle, event, + lost_samples_event.header.size); + if (ret) + return; + + perf_output_put(&handle, lost_samples_event); + perf_event__output_id_sample(event, &handle, &sample); + perf_output_end(&handle); +} + /* * IRQ throttle logging */ -- cgit v1.2.3 From 930e6fcd2bcce9bcd9d4aa7e755678d33f3fe6f4 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Wed, 17 Jun 2015 09:51:10 -0400 Subject: perf tools: Add time out to force stop proc map processing System wide sampling like 'perf top' or 'perf record -a' read all threads /proc/xxx/maps before sampling. If there are any threads which generating a keeping growing huge maps, perf will do infinite loop during synthesizing. Nothing will be sampled. This patch fixes this issue by adding per-thread timeout to force stop this kind of endless proc map processing. PERF_RECORD_MISC_PROC_MAP_PARSE_TIME_OUT is introduced to indicate that the mmap record are truncated by time out. User will get warning notification when truncated mmap records are detected. Reported-by: Ying Huang Signed-off-by: Kan Liang Cc: Andi Kleen Cc: David Ahern Cc: Ying Huang Link: http://lkml.kernel.org/r/1434549071-25611-1-git-send-email-kan.liang@intel.com Signed-off-by: Arnaldo Carvalho de Melo --- include/uapi/linux/perf_event.h | 4 ++++ tools/perf/util/event.c | 18 ++++++++++++++++++ tools/perf/util/event.h | 1 + tools/perf/util/session.c | 11 +++++++++++ 4 files changed, 34 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index 613ed9ad588f..d97f84c080da 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -565,6 +565,10 @@ struct perf_event_mmap_page { #define PERF_RECORD_MISC_GUEST_KERNEL (4 << 0) #define PERF_RECORD_MISC_GUEST_USER (5 << 0) +/* + * Indicates that /proc/PID/maps parsing are truncated by time out. + */ +#define PERF_RECORD_MISC_PROC_MAP_PARSE_TIMEOUT (1 << 12) /* * PERF_RECORD_MISC_MMAP_DATA and PERF_RECORD_MISC_COMM_EXEC are used on * different events so can reuse the same bit position. 
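
Before the tools-side diff, a plain-C rendering of the pattern it introduces may help: parse /proc/<pid>/maps line by line, but stop once a fixed time budget is exhausted and mark the result truncated. This is only a sketch: clock_gettime(CLOCK_MONOTONIC) stands in for perf's internal rdclock(), the 500 ms budget mirrors the patch's PROC_MAP_PARSE_TIMEOUT, and the event-synthesis step is elided.

#include <stdbool.h>
#include <stdio.h>
#include <sys/types.h>
#include <time.h>

#define MAP_PARSE_TIMEOUT_NS	(500ULL * 1000000ULL)	/* 500 ms, as in the patch */

static unsigned long long monotonic_ns(void)
{
	struct timespec ts;

	clock_gettime(CLOCK_MONOTONIC, &ts);
	return (unsigned long long)ts.tv_sec * 1000000000ULL + ts.tv_nsec;
}

/*
 * Returns true if the maps listing was truncated by the timeout, which
 * is the case in which the patch tags the synthesized records with
 * PERF_RECORD_MISC_PROC_MAP_PARSE_TIMEOUT.
 */
static bool parse_proc_maps(pid_t pid)
{
	unsigned long long start = monotonic_ns();
	char path[64], line[BUFSIZ];
	bool truncated = false;
	FILE *fp;

	snprintf(path, sizeof(path), "/proc/%d/maps", (int)pid);
	fp = fopen(path, "r");
	if (!fp)
		return false;

	while (fgets(line, sizeof(line), fp)) {
		if (monotonic_ns() - start > MAP_PARSE_TIMEOUT_NS) {
			truncated = true;	/* stop synthesizing further mmap events */
			break;
		}
		/* ... synthesize one PERF_RECORD_MMAP2 event from 'line' ... */
	}

	fclose(fp);
	return truncated;
}

In the actual patch the truncation additionally sets PERF_RECORD_MISC_PROC_MAP_PARSE_TIMEOUT in the event header so that consumers, as in the session.c hunk at the end of this patch, can count the unprocessed map files and warn the user.
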
diff --git a/tools/perf/util/event.c b/tools/perf/util/event.c index 793b1503d437..416ba80c628f 100644 --- a/tools/perf/util/event.c +++ b/tools/perf/util/event.c @@ -213,6 +213,8 @@ static int perf_event__synthesize_fork(struct perf_tool *tool, return 0; } +#define PROC_MAP_PARSE_TIMEOUT (500 * 1000000ULL) + int perf_event__synthesize_mmap_events(struct perf_tool *tool, union perf_event *event, pid_t pid, pid_t tgid, @@ -222,6 +224,8 @@ int perf_event__synthesize_mmap_events(struct perf_tool *tool, { char filename[PATH_MAX]; FILE *fp; + unsigned long long t; + bool truncation = false; int rc = 0; if (machine__is_default_guest(machine)) @@ -240,6 +244,7 @@ int perf_event__synthesize_mmap_events(struct perf_tool *tool, } event->header.type = PERF_RECORD_MMAP2; + t = rdclock(); while (1) { char bf[BUFSIZ]; @@ -253,6 +258,12 @@ int perf_event__synthesize_mmap_events(struct perf_tool *tool, if (fgets(bf, sizeof(bf), fp) == NULL) break; + if ((rdclock() - t) > PROC_MAP_PARSE_TIMEOUT) { + pr_warning("Reading %s time out.\n", filename); + truncation = true; + goto out; + } + /* ensure null termination since stack will be reused. */ strcpy(execname, ""); @@ -301,6 +312,10 @@ int perf_event__synthesize_mmap_events(struct perf_tool *tool, event->header.misc |= PERF_RECORD_MISC_MMAP_DATA; } +out: + if (truncation) + event->header.misc |= PERF_RECORD_MISC_PROC_MAP_PARSE_TIMEOUT; + if (!strcmp(execname, "")) strcpy(execname, anonstr); @@ -319,6 +334,9 @@ int perf_event__synthesize_mmap_events(struct perf_tool *tool, rc = -1; break; } + + if (truncation) + break; } fclose(fp); diff --git a/tools/perf/util/event.h b/tools/perf/util/event.h index 5dc51ada05df..39868f529cab 100644 --- a/tools/perf/util/event.h +++ b/tools/perf/util/event.h @@ -265,6 +265,7 @@ struct events_stats { u32 nr_unknown_id; u32 nr_unprocessable_samples; u32 nr_auxtrace_errors[PERF_AUXTRACE_ERROR_MAX]; + u32 nr_proc_map_timeout; }; struct attr_event { diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c index c371336d1eb2..2d882fd1f1b9 100644 --- a/tools/perf/util/session.c +++ b/tools/perf/util/session.c @@ -1064,6 +1064,8 @@ static int machines__deliver_event(struct machines *machines, case PERF_RECORD_MMAP: return tool->mmap(tool, event, sample, machine); case PERF_RECORD_MMAP2: + if (event->header.misc & PERF_RECORD_MISC_PROC_MAP_PARSE_TIMEOUT) + ++evlist->stats.nr_proc_map_timeout; return tool->mmap2(tool, event, sample, machine); case PERF_RECORD_COMM: return tool->comm(tool, event, sample, machine); @@ -1360,6 +1362,15 @@ static void perf_session__warn_about_errors(const struct perf_session *session) ui__warning("%u out of order events recorded.\n", oe->nr_unordered_events); events_stats__auxtrace_error_warn(stats); + + if (stats->nr_proc_map_timeout != 0) { + ui__warning("%d map information files for pre-existing threads were\n" + "not processed, if there are samples for addresses they\n" + "will not be resolved, you may find out which are these\n" + "threads by running with -v and redirecting the output\n" + "to a file.\n", + stats->nr_proc_map_timeout); + } } static int perf_session__flush_thread_stack(struct thread *thread, -- cgit v1.2.3
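
Tying the series back to the first patch: once a migration is accounted on the task's next sched_in(), an ordinary software counter opened from userspace observes it again. The following is a minimal, self-contained sketch rather than part of any patch above: it opens a PERF_COUNT_SW_CPU_MIGRATIONS counter on the current task via perf_event_open() (called through syscall(), since glibc provides no wrapper), forces a few migrations by toggling CPU affinity, and reads the count. It assumes at least two online CPUs.

#define _GNU_SOURCE
#include <linux/perf_event.h>
#include <sched.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/syscall.h>
#include <unistd.h>

static int perf_event_open(struct perf_event_attr *attr, pid_t pid,
			   int cpu, int group_fd, unsigned long flags)
{
	return syscall(__NR_perf_event_open, attr, pid, cpu, group_fd, flags);
}

int main(void)
{
	struct perf_event_attr attr;
	uint64_t count;
	cpu_set_t set;
	int fd, i;

	memset(&attr, 0, sizeof(attr));
	attr.type = PERF_TYPE_SOFTWARE;
	attr.size = sizeof(attr);
	attr.config = PERF_COUNT_SW_CPU_MIGRATIONS;
	attr.disabled = 1;

	/* current task (pid 0), any CPU (-1), no group, no flags */
	fd = perf_event_open(&attr, 0, -1, -1, 0);
	if (fd < 0) {
		perror("perf_event_open");
		return 1;
	}

	ioctl(fd, PERF_EVENT_IOC_RESET, 0);
	ioctl(fd, PERF_EVENT_IOC_ENABLE, 0);

	/* bounce between CPU 0 and CPU 1 to provoke migrations */
	for (i = 0; i < 8; i++) {
		CPU_ZERO(&set);
		CPU_SET(i & 1, &set);
		sched_setaffinity(0, sizeof(set), &set);
	}

	ioctl(fd, PERF_EVENT_IOC_DISABLE, 0);
	read(fd, &count, sizeof(count));
	printf("cpu-migrations: %llu\n", (unsigned long long)count);
	close(fd);
	return 0;
}

Before the first patch in this series, counts like this could be missed or charged to whichever task happened to be running when the migration was performed; with the fix, the event is charged to the migrated task on its next sched_in().
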