summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--Documentation/ABI/testing/sysfs-devices-mapping33
-rw-r--r--arch/x86/events/intel/uncore.c26
-rw-r--r--arch/x86/events/intel/uncore.h37
-rw-r--r--arch/x86/events/intel/uncore_snb.c80
-rw-r--r--arch/x86/events/intel/uncore_snbep.c208
-rw-r--r--arch/x86/include/asm/kprobes.h2
-rw-r--r--arch/x86/kernel/alternative.c37
-rw-r--r--arch/x86/kernel/kprobes/core.c15
-rw-r--r--arch/x86/kernel/kprobes/opt.c38
-rw-r--r--include/linux/ftrace.h12
-rw-r--r--include/linux/kprobes.h15
-rw-r--r--include/linux/perf_event.h8
-rw-r--r--include/uapi/linux/perf_event.h26
-rw-r--r--kernel/events/core.c90
-rw-r--r--kernel/kallsyms.c42
-rw-r--r--kernel/kprobes.c57
-rw-r--r--kernel/trace/ftrace.c101
17 files changed, 798 insertions, 29 deletions
diff --git a/Documentation/ABI/testing/sysfs-devices-mapping b/Documentation/ABI/testing/sysfs-devices-mapping
new file mode 100644
index 000000000000..490ccfd67f12
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-devices-mapping
@@ -0,0 +1,33 @@
+What: /sys/devices/uncore_iio_x/dieX
+Date: February 2020
+Contact: Roman Sudarikov <roman.sudarikov@linux.intel.com>
+Description:
+ Each IIO stack (PCIe root port) has its own IIO PMON block, so
+ each dieX file (where X is die number) holds "Segment:Root Bus"
+ for PCIe root port, which can be monitored by that IIO PMON
+ block.
+ For example, on 4-die Xeon platform with up to 6 IIO stacks per
+ die and, therefore, 6 IIO PMON blocks per die, the mapping of
+ IIO PMON block 0 exposes as the following:
+
+ $ ls /sys/devices/uncore_iio_0/die*
+ -r--r--r-- /sys/devices/uncore_iio_0/die0
+ -r--r--r-- /sys/devices/uncore_iio_0/die1
+ -r--r--r-- /sys/devices/uncore_iio_0/die2
+ -r--r--r-- /sys/devices/uncore_iio_0/die3
+
+ $ tail /sys/devices/uncore_iio_0/die*
+ ==> /sys/devices/uncore_iio_0/die0 <==
+ 0000:00
+ ==> /sys/devices/uncore_iio_0/die1 <==
+ 0000:40
+ ==> /sys/devices/uncore_iio_0/die2 <==
+ 0000:80
+ ==> /sys/devices/uncore_iio_0/die3 <==
+ 0000:c0
+
+ Which means:
+ IIO PMU 0 on die 0 belongs to PCI RP on bus 0x00, domain 0x0000
+ IIO PMU 0 on die 1 belongs to PCI RP on bus 0x40, domain 0x0000
+ IIO PMU 0 on die 2 belongs to PCI RP on bus 0x80, domain 0x0000
+ IIO PMU 0 on die 3 belongs to PCI RP on bus 0xc0, domain 0x0000
diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c
index cf76d6631afa..d5c6d3b340c5 100644
--- a/arch/x86/events/intel/uncore.c
+++ b/arch/x86/events/intel/uncore.c
@@ -16,7 +16,7 @@ struct pci_driver *uncore_pci_driver;
DEFINE_RAW_SPINLOCK(pci2phy_map_lock);
struct list_head pci2phy_map_head = LIST_HEAD_INIT(pci2phy_map_head);
struct pci_extra_dev *uncore_extra_pci_dev;
-static int max_dies;
+int __uncore_max_dies;
/* mask of cpus that collect uncore events */
static cpumask_t uncore_cpu_mask;
@@ -108,7 +108,7 @@ struct intel_uncore_box *uncore_pmu_to_box(struct intel_uncore_pmu *pmu, int cpu
* The unsigned check also catches the '-1' return value for non
* existent mappings in the topology map.
*/
- return dieid < max_dies ? pmu->boxes[dieid] : NULL;
+ return dieid < uncore_max_dies() ? pmu->boxes[dieid] : NULL;
}
u64 uncore_msr_read_counter(struct intel_uncore_box *box, struct perf_event *event)
@@ -132,6 +132,9 @@ u64 uncore_mmio_read_counter(struct intel_uncore_box *box,
if (!box->io_addr)
return 0;
+ if (!uncore_mmio_is_valid_offset(box, event->hw.event_base))
+ return 0;
+
return readq(box->io_addr + event->hw.event_base);
}
@@ -843,10 +846,12 @@ static int uncore_pmu_register(struct intel_uncore_pmu *pmu)
.read = uncore_pmu_event_read,
.module = THIS_MODULE,
.capabilities = PERF_PMU_CAP_NO_EXCLUDE,
+ .attr_update = pmu->type->attr_update,
};
} else {
pmu->pmu = *pmu->type->pmu;
pmu->pmu.attr_groups = pmu->type->attr_groups;
+ pmu->pmu.attr_update = pmu->type->attr_update;
}
if (pmu->type->num_boxes == 1) {
@@ -877,7 +882,7 @@ static void uncore_free_boxes(struct intel_uncore_pmu *pmu)
{
int die;
- for (die = 0; die < max_dies; die++)
+ for (die = 0; die < uncore_max_dies(); die++)
kfree(pmu->boxes[die]);
kfree(pmu->boxes);
}
@@ -887,6 +892,9 @@ static void uncore_type_exit(struct intel_uncore_type *type)
struct intel_uncore_pmu *pmu = type->pmus;
int i;
+ if (type->cleanup_mapping)
+ type->cleanup_mapping(type);
+
if (pmu) {
for (i = 0; i < type->num_boxes; i++, pmu++) {
uncore_pmu_unregister(pmu);
@@ -915,7 +923,7 @@ static int __init uncore_type_init(struct intel_uncore_type *type, bool setid)
if (!pmus)
return -ENOMEM;
- size = max_dies * sizeof(struct intel_uncore_box *);
+ size = uncore_max_dies() * sizeof(struct intel_uncore_box *);
for (i = 0; i < type->num_boxes; i++) {
pmus[i].func_id = setid ? i : -1;
@@ -954,6 +962,9 @@ static int __init uncore_type_init(struct intel_uncore_type *type, bool setid)
type->pmu_group = &uncore_pmu_attr_group;
+ if (type->set_mapping)
+ type->set_mapping(type);
+
return 0;
err:
@@ -1112,7 +1123,7 @@ static int __init uncore_pci_init(void)
size_t size;
int ret;
- size = max_dies * sizeof(struct pci_extra_dev);
+ size = uncore_max_dies() * sizeof(struct pci_extra_dev);
uncore_extra_pci_dev = kzalloc(size, GFP_KERNEL);
if (!uncore_extra_pci_dev) {
ret = -ENOMEM;
@@ -1514,6 +1525,8 @@ static const struct x86_cpu_id intel_uncore_match[] __initconst = {
X86_MATCH_INTEL_FAM6_MODEL(SKYLAKE_X, &skx_uncore_init),
X86_MATCH_INTEL_FAM6_MODEL(KABYLAKE_L, &skl_uncore_init),
X86_MATCH_INTEL_FAM6_MODEL(KABYLAKE, &skl_uncore_init),
+ X86_MATCH_INTEL_FAM6_MODEL(COMETLAKE_L, &skl_uncore_init),
+ X86_MATCH_INTEL_FAM6_MODEL(COMETLAKE, &skl_uncore_init),
X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_L, &icl_uncore_init),
X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_NNPI, &icl_uncore_init),
X86_MATCH_INTEL_FAM6_MODEL(ICELAKE, &icl_uncore_init),
@@ -1539,7 +1552,8 @@ static int __init intel_uncore_init(void)
if (boot_cpu_has(X86_FEATURE_HYPERVISOR))
return -ENODEV;
- max_dies = topology_max_packages() * topology_max_die_per_package();
+ __uncore_max_dies =
+ topology_max_packages() * topology_max_die_per_package();
uncore_init = (struct intel_uncore_init_fun *)id->driver_data;
if (uncore_init->pci_init) {
diff --git a/arch/x86/events/intel/uncore.h b/arch/x86/events/intel/uncore.h
index b469ddd45515..105fdc69825e 100644
--- a/arch/x86/events/intel/uncore.h
+++ b/arch/x86/events/intel/uncore.h
@@ -61,6 +61,7 @@ struct intel_uncore_type {
unsigned msr_offset;
unsigned mmio_offset;
};
+ unsigned mmio_map_size;
unsigned num_shared_regs:8;
unsigned single_fixed:1;
unsigned pair_ctr_ctl:1;
@@ -72,7 +73,19 @@ struct intel_uncore_type {
struct uncore_event_desc *event_descs;
struct freerunning_counters *freerunning;
const struct attribute_group *attr_groups[4];
+ const struct attribute_group **attr_update;
struct pmu *pmu; /* for custom pmu ops */
+ /*
+ * Uncore PMU would store relevant platform topology configuration here
+ * to identify which platform component each PMON block of that type is
+ * supposed to monitor.
+ */
+ u64 *topology;
+ /*
+ * Optional callbacks for managing mapping of Uncore units to PMONs
+ */
+ int (*set_mapping)(struct intel_uncore_type *type);
+ void (*cleanup_mapping)(struct intel_uncore_type *type);
};
#define pmu_group attr_groups[0]
@@ -169,6 +182,18 @@ int uncore_pcibus_to_physid(struct pci_bus *bus);
ssize_t uncore_event_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf);
+static inline struct intel_uncore_pmu *dev_to_uncore_pmu(struct device *dev)
+{
+ return container_of(dev_get_drvdata(dev), struct intel_uncore_pmu, pmu);
+}
+
+#define to_device_attribute(n) container_of(n, struct device_attribute, attr)
+#define to_dev_ext_attribute(n) container_of(n, struct dev_ext_attribute, attr)
+#define attr_to_ext_attr(n) to_dev_ext_attribute(to_device_attribute(n))
+
+extern int __uncore_max_dies;
+#define uncore_max_dies() (__uncore_max_dies)
+
#define INTEL_UNCORE_EVENT_DESC(_name, _config) \
{ \
.attr = __ATTR(_name, 0444, uncore_event_show, NULL), \
@@ -196,6 +221,18 @@ static inline bool uncore_pmc_freerunning(int idx)
return idx == UNCORE_PMC_IDX_FREERUNNING;
}
+static inline bool uncore_mmio_is_valid_offset(struct intel_uncore_box *box,
+ unsigned long offset)
+{
+ if (offset < box->pmu->type->mmio_map_size)
+ return true;
+
+ pr_warn_once("perf uncore: Invalid offset 0x%lx exceeds mapped area of %s.\n",
+ offset, box->pmu->type->name);
+
+ return false;
+}
+
static inline
unsigned int uncore_mmio_box_ctl(struct intel_uncore_box *box)
{
diff --git a/arch/x86/events/intel/uncore_snb.c b/arch/x86/events/intel/uncore_snb.c
index 3de1065eefc4..cb94ba86efd2 100644
--- a/arch/x86/events/intel/uncore_snb.c
+++ b/arch/x86/events/intel/uncore_snb.c
@@ -42,6 +42,17 @@
#define PCI_DEVICE_ID_INTEL_WHL_UQ_IMC 0x3ed0
#define PCI_DEVICE_ID_INTEL_WHL_4_UQ_IMC 0x3e34
#define PCI_DEVICE_ID_INTEL_WHL_UD_IMC 0x3e35
+#define PCI_DEVICE_ID_INTEL_CML_H1_IMC 0x9b44
+#define PCI_DEVICE_ID_INTEL_CML_H2_IMC 0x9b54
+#define PCI_DEVICE_ID_INTEL_CML_H3_IMC 0x9b64
+#define PCI_DEVICE_ID_INTEL_CML_U1_IMC 0x9b51
+#define PCI_DEVICE_ID_INTEL_CML_U2_IMC 0x9b61
+#define PCI_DEVICE_ID_INTEL_CML_U3_IMC 0x9b71
+#define PCI_DEVICE_ID_INTEL_CML_S1_IMC 0x9b33
+#define PCI_DEVICE_ID_INTEL_CML_S2_IMC 0x9b43
+#define PCI_DEVICE_ID_INTEL_CML_S3_IMC 0x9b53
+#define PCI_DEVICE_ID_INTEL_CML_S4_IMC 0x9b63
+#define PCI_DEVICE_ID_INTEL_CML_S5_IMC 0x9b73
#define PCI_DEVICE_ID_INTEL_ICL_U_IMC 0x8a02
#define PCI_DEVICE_ID_INTEL_ICL_U2_IMC 0x8a12
#define PCI_DEVICE_ID_INTEL_TGL_U1_IMC 0x9a02
@@ -415,6 +426,7 @@ static const struct attribute_group snb_uncore_imc_format_group = {
static void snb_uncore_imc_init_box(struct intel_uncore_box *box)
{
+ struct intel_uncore_type *type = box->pmu->type;
struct pci_dev *pdev = box->pci_dev;
int where = SNB_UNCORE_PCI_IMC_BAR_OFFSET;
resource_size_t addr;
@@ -430,7 +442,10 @@ static void snb_uncore_imc_init_box(struct intel_uncore_box *box)
addr &= ~(PAGE_SIZE - 1);
- box->io_addr = ioremap(addr, SNB_UNCORE_PCI_IMC_MAP_SIZE);
+ box->io_addr = ioremap(addr, type->mmio_map_size);
+ if (!box->io_addr)
+ pr_warn("perf uncore: Failed to ioremap for %s.\n", type->name);
+
box->hrtimer_duration = UNCORE_SNB_IMC_HRTIMER_INTERVAL;
}
@@ -586,6 +601,7 @@ static struct intel_uncore_type snb_uncore_imc = {
.num_counters = 2,
.num_boxes = 1,
.num_freerunning_types = SNB_PCI_UNCORE_IMC_FREERUNNING_TYPE_MAX,
+ .mmio_map_size = SNB_UNCORE_PCI_IMC_MAP_SIZE,
.freerunning = snb_uncore_imc_freerunning,
.event_descs = snb_uncore_imc_events,
.format_group = &snb_uncore_imc_format_group,
@@ -771,6 +787,50 @@ static const struct pci_device_id skl_uncore_pci_ids[] = {
PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_WHL_UD_IMC),
.driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
},
+ { /* IMC */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CML_H1_IMC),
+ .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
+ },
+ { /* IMC */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CML_H2_IMC),
+ .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
+ },
+ { /* IMC */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CML_H3_IMC),
+ .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
+ },
+ { /* IMC */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CML_U1_IMC),
+ .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
+ },
+ { /* IMC */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CML_U2_IMC),
+ .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
+ },
+ { /* IMC */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CML_U3_IMC),
+ .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
+ },
+ { /* IMC */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CML_S1_IMC),
+ .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
+ },
+ { /* IMC */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CML_S2_IMC),
+ .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
+ },
+ { /* IMC */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CML_S3_IMC),
+ .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
+ },
+ { /* IMC */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CML_S4_IMC),
+ .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
+ },
+ { /* IMC */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CML_S5_IMC),
+ .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
+ },
{ /* end: all zeroes */ },
};
@@ -863,6 +923,17 @@ static const struct imc_uncore_pci_dev desktop_imc_pci_ids[] = {
IMC_DEV(WHL_UQ_IMC, &skl_uncore_pci_driver), /* 8th Gen Core U Mobile Quad Core */
IMC_DEV(WHL_4_UQ_IMC, &skl_uncore_pci_driver), /* 8th Gen Core U Mobile Quad Core */
IMC_DEV(WHL_UD_IMC, &skl_uncore_pci_driver), /* 8th Gen Core U Mobile Dual Core */
+ IMC_DEV(CML_H1_IMC, &skl_uncore_pci_driver),
+ IMC_DEV(CML_H2_IMC, &skl_uncore_pci_driver),
+ IMC_DEV(CML_H3_IMC, &skl_uncore_pci_driver),
+ IMC_DEV(CML_U1_IMC, &skl_uncore_pci_driver),
+ IMC_DEV(CML_U2_IMC, &skl_uncore_pci_driver),
+ IMC_DEV(CML_U3_IMC, &skl_uncore_pci_driver),
+ IMC_DEV(CML_S1_IMC, &skl_uncore_pci_driver),
+ IMC_DEV(CML_S2_IMC, &skl_uncore_pci_driver),
+ IMC_DEV(CML_S3_IMC, &skl_uncore_pci_driver),
+ IMC_DEV(CML_S4_IMC, &skl_uncore_pci_driver),
+ IMC_DEV(CML_S5_IMC, &skl_uncore_pci_driver),
IMC_DEV(ICL_U_IMC, &icl_uncore_pci_driver), /* 10th Gen Core Mobile */
IMC_DEV(ICL_U2_IMC, &icl_uncore_pci_driver), /* 10th Gen Core Mobile */
{ /* end marker */ }
@@ -1085,11 +1156,13 @@ static struct pci_dev *tgl_uncore_get_mc_dev(void)
}
#define TGL_UNCORE_MMIO_IMC_MEM_OFFSET 0x10000
+#define TGL_UNCORE_PCI_IMC_MAP_SIZE 0xe000
static void tgl_uncore_imc_freerunning_init_box(struct intel_uncore_box *box)
{
struct pci_dev *pdev = tgl_uncore_get_mc_dev();
struct intel_uncore_pmu *pmu = box->pmu;
+ struct intel_uncore_type *type = pmu->type;
resource_size_t addr;
u32 mch_bar;
@@ -1112,7 +1185,9 @@ static void tgl_uncore_imc_freerunning_init_box(struct intel_uncore_box *box)
addr |= ((resource_size_t)mch_bar << 32);
#endif
- box->io_addr = ioremap(addr, SNB_UNCORE_PCI_IMC_MAP_SIZE);
+ box->io_addr = ioremap(addr, type->mmio_map_size);
+ if (!box->io_addr)
+ pr_warn("perf uncore: Failed to ioremap for %s.\n", type->name);
}
static struct intel_uncore_ops tgl_uncore_imc_freerunning_ops = {
@@ -1138,6 +1213,7 @@ static struct intel_uncore_type tgl_uncore_imc_free_running = {
.num_counters = 3,
.num_boxes = 2,
.num_freerunning_types = TGL_MMIO_UNCORE_IMC_FREERUNNING_TYPE_MAX,
+ .mmio_map_size = TGL_UNCORE_PCI_IMC_MAP_SIZE,
.freerunning = tgl_uncore_imc_freerunning,
.ops = &tgl_uncore_imc_freerunning_ops,
.event_descs = tgl_uncore_imc_events,
diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c
index 07652fa20ebb..62e88ad919ff 100644
--- a/arch/x86/events/intel/uncore_snbep.c
+++ b/arch/x86/events/intel/uncore_snbep.c
@@ -273,6 +273,30 @@
#define SKX_CPUNODEID 0xc0
#define SKX_GIDNIDMAP 0xd4
+/*
+ * The CPU_BUS_NUMBER MSR returns the values of the respective CPUBUSNO CSR
+ * that BIOS programmed. MSR has package scope.
+ * | Bit | Default | Description
+ * | [63] | 00h | VALID - When set, indicates the CPU bus
+ * numbers have been initialized. (RO)
+ * |[62:48]| --- | Reserved
+ * |[47:40]| 00h | BUS_NUM_5 — Return the bus number BIOS assigned
+ * CPUBUSNO(5). (RO)
+ * |[39:32]| 00h | BUS_NUM_4 — Return the bus number BIOS assigned
+ * CPUBUSNO(4). (RO)
+ * |[31:24]| 00h | BUS_NUM_3 — Return the bus number BIOS assigned
+ * CPUBUSNO(3). (RO)
+ * |[23:16]| 00h | BUS_NUM_2 — Return the bus number BIOS assigned
+ * CPUBUSNO(2). (RO)
+ * |[15:8] | 00h | BUS_NUM_1 — Return the bus number BIOS assigned
+ * CPUBUSNO(1). (RO)
+ * | [7:0] | 00h | BUS_NUM_0 — Return the bus number BIOS assigned
+ * CPUBUSNO(0). (RO)
+ */
+#define SKX_MSR_CPU_BUS_NUMBER 0x300
+#define SKX_MSR_CPU_BUS_VALID_BIT (1ULL << 63)
+#define BUS_NUM_STRIDE 8
+
/* SKX CHA */
#define SKX_CHA_MSR_PMON_BOX_FILTER_TID (0x1ffULL << 0)
#define SKX_CHA_MSR_PMON_BOX_FILTER_LINK (0xfULL << 9)
@@ -3612,6 +3636,170 @@ static struct intel_uncore_ops skx_uncore_iio_ops = {
.read_counter = uncore_msr_read_counter,
};
+static inline u8 skx_iio_stack(struct intel_uncore_pmu *pmu, int die)
+{
+ return pmu->type->topology[die] >> (pmu->pmu_idx * BUS_NUM_STRIDE);
+}
+
+static umode_t
+skx_iio_mapping_visible(struct kobject *kobj, struct attribute *attr, int die)
+{
+ struct intel_uncore_pmu *pmu = dev_to_uncore_pmu(kobj_to_dev(kobj));
+
+ /* Root bus 0x00 is valid only for die 0 AND pmu_idx = 0. */
+ return (!skx_iio_stack(pmu, die) && pmu->pmu_idx) ? 0 : attr->mode;
+}
+
+static ssize_t skx_iio_mapping_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct pci_bus *bus = pci_find_next_bus(NULL);
+ struct intel_uncore_pmu *uncore_pmu = dev_to_uncore_pmu(dev);
+ struct dev_ext_attribute *ea = to_dev_ext_attribute(attr);
+ long die = (long)ea->var;
+
+ /*
+ * Current implementation is for single segment configuration hence it's
+ * safe to take the segment value from the first available root bus.
+ */
+ return sprintf(buf, "%04x:%02x\n", pci_domain_nr(bus),
+ skx_iio_stack(uncore_pmu, die));
+}
+
+static int skx_msr_cpu_bus_read(int cpu, u64 *topology)
+{
+ u64 msr_value;
+
+ if (rdmsrl_on_cpu(cpu, SKX_MSR_CPU_BUS_NUMBER, &msr_value) ||
+ !(msr_value & SKX_MSR_CPU_BUS_VALID_BIT))
+ return -ENXIO;
+
+ *topology = msr_value;
+
+ return 0;
+}
+
+static int die_to_cpu(int die)
+{
+ int res = 0, cpu, current_die;
+ /*
+ * Using cpus_read_lock() to ensure cpu is not going down between
+ * looking at cpu_online_mask.
+ */
+ cpus_read_lock();
+ for_each_online_cpu(cpu) {
+ current_die = topology_logical_die_id(cpu);
+ if (current_die == die) {
+ res = cpu;
+ break;
+ }
+ }
+ cpus_read_unlock();
+ return res;
+}
+
+static int skx_iio_get_topology(struct intel_uncore_type *type)
+{
+ int i, ret;
+ struct pci_bus *bus = NULL;
+
+ /*
+ * Verified single-segment environments only; disabled for multiple
+ * segment topologies for now except VMD domains.
+ * VMD domains start at 0x10000 to not clash with ACPI _SEG domains.
+ */
+ while ((bus = pci_find_next_bus(bus))
+ && (!pci_domain_nr(bus) || pci_domain_nr(bus) > 0xffff))
+ ;
+ if (bus)
+ return -EPERM;
+
+ type->topology = kcalloc(uncore_max_dies(), sizeof(u64), GFP_KERNEL);
+ if (!type->topology)
+ return -ENOMEM;
+
+ for (i = 0; i < uncore_max_dies(); i++) {
+ ret = skx_msr_cpu_bus_read(die_to_cpu(i), &type->topology[i]);
+ if (ret) {
+ kfree(type->topology);
+ type->topology = NULL;
+ return ret;
+ }
+ }
+
+ return 0;
+}
+
+static struct attribute_group skx_iio_mapping_group = {
+ .is_visible = skx_iio_mapping_visible,
+};
+
+static const struct attribute_group *skx_iio_attr_update[] = {
+ &skx_iio_mapping_group,
+ NULL,
+};
+
+static int skx_iio_set_mapping(struct intel_uncore_type *type)
+{
+ char buf[64];
+ int ret;
+ long die = -1;
+ struct attribute **attrs = NULL;
+ struct dev_ext_attribute *eas = NULL;
+
+ ret = skx_iio_get_topology(type);
+ if (ret)
+ return ret;
+
+ /* One more for NULL. */
+ attrs = kcalloc((uncore_max_dies() + 1), sizeof(*attrs), GFP_KERNEL);
+ if (!attrs)
+ goto err;
+
+ eas = kcalloc(uncore_max_dies(), sizeof(*eas), GFP_KERNEL);
+ if (!eas)
+ goto err;
+
+ for (die = 0; die < uncore_max_dies(); die++) {
+ sprintf(buf, "die%ld", die);
+ sysfs_attr_init(&eas[die].attr.attr);
+ eas[die].attr.attr.name = kstrdup(buf, GFP_KERNEL);
+ if (!eas[die].attr.attr.name)
+ goto err;
+ eas[die].attr.attr.mode = 0444;
+ eas[die].attr.show = skx_iio_mapping_show;
+ eas[die].attr.store = NULL;
+ eas[die].var = (void *)die;
+ attrs[die] = &eas[die].attr.attr;
+ }
+ skx_iio_mapping_group.attrs = attrs;
+
+ return 0;
+err:
+ for (; die >= 0; die--)
+ kfree(eas[die].attr.attr.name);
+ kfree(eas);
+ kfree(attrs);
+ kfree(type->topology);
+ type->attr_update = NULL;
+ return -ENOMEM;
+}
+
+static void skx_iio_cleanup_mapping(struct intel_uncore_type *type)
+{
+ struct attribute **attr = skx_iio_mapping_group.attrs;
+
+ if (!attr)
+ return;
+
+ for (; *attr; attr++)
+ kfree((*attr)->name);
+ kfree(attr_to_ext_attr(*skx_iio_mapping_group.attrs));
+ kfree(skx_iio_mapping_group.attrs);
+ skx_iio_mapping_group.attrs = NULL;
+ kfree(type->topology);
+}
+
static struct intel_uncore_type skx_uncore_iio = {
.name = "iio",
.num_counters = 4,
@@ -3626,6 +3814,9 @@ static struct intel_uncore_type skx_uncore_iio = {
.constraints = skx_uncore_iio_constraints,
.ops = &skx_uncore_iio_ops,
.format_group = &skx_uncore_iio_format_group,
+ .attr_update = skx_iio_attr_update,
+ .set_mapping = skx_iio_set_mapping,
+ .cleanup_mapping = skx_iio_cleanup_mapping,
};
enum perf_uncore_iio_freerunning_type_id {
@@ -4421,6 +4612,7 @@ static void __snr_uncore_mmio_init_box(struct intel_uncore_box *box,
unsigned int box_ctl, int mem_offset)
{
struct pci_dev *pdev = snr_uncore_get_mc_dev(box->dieid);
+ struct intel_uncore_type *type = box->pmu->type;
resource_size_t addr;
u32 pci_dword;
@@ -4435,9 +4627,11 @@ static void __snr_uncore_mmio_init_box(struct intel_uncore_box *box,
addr += box_ctl;
- box->io_addr = ioremap(addr, SNR_IMC_MMIO_SIZE);
- if (!box->io_addr)
+ box->io_addr = ioremap(addr, type->mmio_map_size);
+ if (!box->io_addr) {
+ pr_warn("perf uncore: Failed to ioremap for %s.\n", type->name);
return;
+ }
writel(IVBEP_PMON_BOX_CTL_INT, box->io_addr);
}
@@ -4480,6 +4674,9 @@ static void snr_uncore_mmio_enable_event(struct intel_uncore_box *box,
if (!box->io_addr)
return;
+ if (!uncore_mmio_is_valid_offset(box, hwc->config_base))
+ return;
+
writel(hwc->config | SNBEP_PMON_CTL_EN,
box->io_addr + hwc->config_base);
}
@@ -4492,6 +4689,9 @@ static void snr_uncore_mmio_disable_event(struct intel_uncore_box *box,
if (!box->io_addr)
return;
+ if (!uncore_mmio_is_valid_offset(box, hwc->config_base))
+ return;
+
writel(hwc->config, box->io_addr + hwc->config_base);
}
@@ -4530,6 +4730,7 @@ static struct intel_uncore_type snr_uncore_imc = {
.event_mask = SNBEP_PMON_RAW_EVENT_MASK,
.box_ctl = SNR_IMC_MMIO_PMON_BOX_CTL,
.mmio_offset = SNR_IMC_MMIO_OFFSET,
+ .mmio_map_size = SNR_IMC_MMIO_SIZE,
.ops = &snr_uncore_mmio_ops,
.format_group = &skx_uncore_format_group,
};
@@ -4570,6 +4771,7 @@ static struct intel_uncore_type snr_uncore_imc_free_running = {
.num_counters = 3,
.num_boxes = 1,
.num_freerunning_types = SNR_IMC_FREERUNNING_TYPE_MAX,
+ .mmio_map_size = SNR_IMC_MMIO_SIZE,
.freerunning = snr_imc_freerunning,
.ops = &snr_uncore_imc_freerunning_ops,
.event_descs = snr_uncore_imc_freerunning_events,
@@ -4987,6 +5189,7 @@ static struct intel_uncore_type icx_uncore_imc = {
.event_mask = SNBEP_PMON_RAW_EVENT_MASK,
.box_ctl = SNR_IMC_MMIO_PMON_BOX_CTL,
.mmio_offset = SNR_IMC_MMIO_OFFSET,
+ .mmio_map_size = SNR_IMC_MMIO_SIZE,
.ops = &icx_uncore_mmio_ops,
.format_group = &skx_uncore_format_group,
};
@@ -5044,6 +5247,7 @@ static struct intel_uncore_type icx_uncore_imc_free_running = {
.num_counters = 5,
.num_boxes = 4,
.num_freerunning_types = ICX_IMC_FREERUNNING_TYPE_MAX,
+ .mmio_map_size = SNR_IMC_MMIO_SIZE,
.freerunning = icx_imc_freerunning,
.ops = &icx_uncore_imc_freerunning_ops,
.event_descs = icx_uncore_imc_freerunning_events,
diff --git a/arch/x86/include/asm/kprobes.h b/arch/x86/include/asm/kprobes.h
index 073eb7ad2f56..143bc9abe99c 100644
--- a/arch/x86/include/asm/kprobes.h
+++ b/arch/x86/include/asm/kprobes.h
@@ -66,6 +66,8 @@ struct arch_specific_insn {
*/
bool boostable;
bool if_modifier;
+ /* Number of bytes of text poked */
+ int tp_len;
};
struct arch_optimized_insn {
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 8fd39ff74a49..f94c9f371411 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -3,6 +3,7 @@
#include <linux/module.h>
#include <linux/sched.h>
+#include <linux/perf_event.h>
#include <linux/mutex.h>
#include <linux/list.h>
#include <linux/stringify.h>
@@ -1001,6 +1002,7 @@ struct text_poke_loc {
s32 rel32;
u8 opcode;
const u8 text[POKE_MAX_OPCODE_SIZE];
+ u8 old;
};
struct bp_patching_desc {
@@ -1168,8 +1170,10 @@ static void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries
/*
* First step: add a int3 trap to the address that will be patched.
*/
- for (i = 0; i < nr_entries; i++)
+ for (i = 0; i < nr_entries; i++) {
+ tp[i].old = *(u8 *)text_poke_addr(&tp[i]);
text_poke(text_poke_addr(&tp[i]), &int3, INT3_INSN_SIZE);
+ }
text_poke_sync();
@@ -1177,14 +1181,45 @@ static void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries
* Second step: update all but the first byte of the patched range.
*/
for (do_sync = 0, i = 0; i < nr_entries; i++) {
+ u8 old[POKE_MAX_OPCODE_SIZE] = { tp[i].old, };
int len = text_opcode_size(tp[i].opcode);
if (len - INT3_INSN_SIZE > 0) {
+ memcpy(old + INT3_INSN_SIZE,
+ text_poke_addr(&tp[i]) + INT3_INSN_SIZE,
+ len - INT3_INSN_SIZE);
text_poke(text_poke_addr(&tp[i]) + INT3_INSN_SIZE,
(const char *)tp[i].text + INT3_INSN_SIZE,
len - INT3_INSN_SIZE);
do_sync++;
}
+
+ /*
+ * Emit a perf event to record the text poke, primarily to
+ * support Intel PT decoding which must walk the executable code
+ * to reconstruct the trace. The flow up to here is:
+ * - write INT3 byte
+ * - IPI-SYNC
+ * - write instruction tail
+ * At this point the actual control flow will be through the
+ * INT3 and handler and not hit the old or new instruction.
+ * Intel PT outputs FUP/TIP packets for the INT3, so the flow
+ * can still be decoded. Subsequently:
+ * - emit RECORD_TEXT_POKE with the new instruction
+ * - IPI-SYNC
+ * - write first byte
+ * - IPI-SYNC
+ * So before the text poke event timestamp, the decoder will see
+ * either the old instruction flow or FUP/TIP of INT3. After the
+ * text poke event timestamp, the decoder will see either the
+ * new instruction flow or FUP/TIP of INT3. Thus decoders can
+ * use the timestamp as the point at which to modify the
+ * executable code.
+ * The old instruction is recorded so that the event can be
+ * processed forwards or backwards.
+ */
+ perf_event_text_poke(text_poke_addr(&tp[i]), old, len,
+ tp[i].text, len);
}
if (do_sync) {
diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c
index ada39ddbc922..fdadc37d72af 100644
--- a/arch/x86/kernel/kprobes/core.c
+++ b/arch/x86/kernel/kprobes/core.c
@@ -33,6 +33,7 @@
#include <linux/hardirq.h>
#include <linux/preempt.h>
#include <linux/sched/debug.h>
+#include <linux/perf_event.h>
#include <linux/extable.h>
#include <linux/kdebug.h>
#include <linux/kallsyms.h>
@@ -472,6 +473,9 @@ static int arch_copy_kprobe(struct kprobe *p)
/* Also, displacement change doesn't affect the first byte */
p->opcode = buf[0];
+ p->ainsn.tp_len = len;
+ perf_event_text_poke(p->ainsn.insn, NULL, 0, buf, len);
+
/* OK, write back the instruction(s) into ROX insn buffer */
text_poke(p->ainsn.insn, buf, len);
@@ -503,12 +507,18 @@ int arch_prepare_kprobe(struct kprobe *p)
void arch_arm_kprobe(struct kprobe *p)
{
- text_poke(p->addr, ((unsigned char []){INT3_INSN_OPCODE}), 1);
+ u8 int3 = INT3_INSN_OPCODE;
+
+ text_poke(p->addr, &int3, 1);
text_poke_sync();
+ perf_event_text_poke(p->addr, &p->opcode, 1, &int3, 1);
}
void arch_disarm_kprobe(struct kprobe *p)
{
+ u8 int3 = INT3_INSN_OPCODE;
+
+ perf_event_text_poke(p->addr, &int3, 1, &p->opcode, 1);
text_poke(p->addr, &p->opcode, 1);
text_poke_sync();
}
@@ -516,6 +526,9 @@ void arch_disarm_kprobe(struct kprobe *p)
void arch_remove_kprobe(struct kprobe *p)
{
if (p->ainsn.insn) {
+ /* Record the perf event before freeing the slot */
+ perf_event_text_poke(p->ainsn.insn, p->ainsn.insn,
+ p->ainsn.tp_len, NULL, 0);
free_insn_slot(p->ainsn.insn, p->ainsn.boostable);
p->ainsn.insn = NULL;
}
diff --git a/arch/x86/kernel/kprobes/opt.c b/arch/x86/kernel/kprobes/opt.c
index 7af4c61dde52..40f380461e6d 100644
--- a/arch/x86/kernel/kprobes/opt.c
+++ b/arch/x86/kernel/kprobes/opt.c
@@ -6,6 +6,7 @@
* Copyright (C) Hitachi Ltd., 2012
*/
#include <linux/kprobes.h>
+#include <linux/perf_event.h>
#include <linux/ptrace.h>
#include <linux/string.h>
#include <linux/slab.h>
@@ -352,8 +353,15 @@ int arch_within_optimized_kprobe(struct optimized_kprobe *op,
static
void __arch_remove_optimized_kprobe(struct optimized_kprobe *op, int dirty)
{
- if (op->optinsn.insn) {
- free_optinsn_slot(op->optinsn.insn, dirty);
+ u8 *slot = op->optinsn.insn;
+ if (slot) {
+ int len = TMPL_END_IDX + op->optinsn.size + JMP32_INSN_SIZE;
+
+ /* Record the perf event before freeing the slot */
+ if (dirty)
+ perf_event_text_poke(slot, slot, len, NULL, 0);
+
+ free_optinsn_slot(slot, dirty);
op->optinsn.insn = NULL;
op->optinsn.size = 0;
}
@@ -424,8 +432,15 @@ int arch_prepare_optimized_kprobe(struct optimized_kprobe *op,
(u8 *)op->kp.addr + op->optinsn.size);
len += JMP32_INSN_SIZE;
+ /*
+ * Note len = TMPL_END_IDX + op->optinsn.size + JMP32_INSN_SIZE is also
+ * used in __arch_remove_optimized_kprobe().
+ */
+
/* We have to use text_poke() for instruction buffer because it is RO */
+ perf_event_text_poke(slot, NULL, 0, buf, len);
text_poke(slot, buf, len);
+
ret = 0;
out:
kfree(buf);
@@ -477,10 +492,23 @@ void arch_optimize_kprobes(struct list_head *oplist)
*/
void arch_unoptimize_kprobe(struct optimized_kprobe *op)
{
- arch_arm_kprobe(&op->kp);
- text_poke(op->kp.addr + INT3_INSN_SIZE,
- op->optinsn.copied_insn, DISP32_SIZE);
+ u8 new[JMP32_INSN_SIZE] = { INT3_INSN_OPCODE, };
+ u8 old[JMP32_INSN_SIZE];
+ u8 *addr = op->kp.addr;
+
+ memcpy(old, op->kp.addr, JMP32_INSN_SIZE);
+ memcpy(new + INT3_INSN_SIZE,
+ op->optinsn.copied_insn,
+ JMP32_INSN_SIZE - INT3_INSN_SIZE);
+
+ text_poke(addr, new, INT3_INSN_SIZE);
text_poke_sync();
+ text_poke(addr + INT3_INSN_SIZE,
+ new + INT3_INSN_SIZE,
+ JMP32_INSN_SIZE - INT3_INSN_SIZE);
+ text_poke_sync();
+
+ perf_event_text_poke(op->kp.addr, old, JMP32_INSN_SIZE, new, JMP32_INSN_SIZE);
}
/*
diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index e339dac91ee6..ce2c06f72e86 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -58,9 +58,6 @@ struct ftrace_direct_func;
const char *
ftrace_mod_address_lookup(unsigned long addr, unsigned long *size,
unsigned long *off, char **modname, char *sym);
-int ftrace_mod_get_kallsym(unsigned int symnum, unsigned long *value,
- char *type, char *name,
- char *module_name, int *exported);
#else
static inline const char *
ftrace_mod_address_lookup(unsigned long addr, unsigned long *size,
@@ -68,6 +65,13 @@ ftrace_mod_address_lookup(unsigned long addr, unsigned long *size,
{
return NULL;
}
+#endif
+
+#if defined(CONFIG_FUNCTION_TRACER) && defined(CONFIG_DYNAMIC_FTRACE)
+int ftrace_mod_get_kallsym(unsigned int symnum, unsigned long *value,
+ char *type, char *name,
+ char *module_name, int *exported);
+#else
static inline int ftrace_mod_get_kallsym(unsigned int symnum, unsigned long *value,
char *type, char *name,
char *module_name, int *exported)
@@ -76,7 +80,6 @@ static inline int ftrace_mod_get_kallsym(unsigned int symnum, unsigned long *val
}
#endif
-
#ifdef CONFIG_FUNCTION_TRACER
extern int ftrace_enabled;
@@ -207,6 +210,7 @@ struct ftrace_ops {
struct ftrace_ops_hash old_hash;
unsigned long trampoline;
unsigned long trampoline_size;
+ struct list_head list;
#endif
};
diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h
index 6adf90f248d7..45b8cdc9fad7 100644
--- a/include/linux/kprobes.h
+++ b/include/linux/kprobes.h
@@ -242,6 +242,7 @@ struct kprobe_insn_cache {
struct mutex mutex;
void *(*alloc)(void); /* allocate insn page */
void (*free)(void *); /* free insn page */
+ const char *sym; /* symbol for insn pages */
struct list_head pages; /* list of kprobe_insn_page */
size_t insn_size; /* size of instruction slot */
int nr_garbage;
@@ -272,6 +273,10 @@ static inline bool is_kprobe_##__name##_slot(unsigned long addr) \
{ \
return __is_insn_slot_addr(&kprobe_##__name##_slots, addr); \
}
+#define KPROBE_INSN_PAGE_SYM "kprobe_insn_page"
+#define KPROBE_OPTINSN_PAGE_SYM "kprobe_optinsn_page"
+int kprobe_cache_get_kallsym(struct kprobe_insn_cache *c, unsigned int *symnum,
+ unsigned long *value, char *type, char *sym);
#else /* __ARCH_WANT_KPROBES_INSN_SLOT */
#define DEFINE_INSN_CACHE_OPS(__name) \
static inline bool is_kprobe_##__name##_slot(unsigned long addr) \
@@ -377,6 +382,11 @@ void dump_kprobe(struct kprobe *kp);
void *alloc_insn_page(void);
void free_insn_page(void *page);
+int kprobe_get_kallsym(unsigned int symnum, unsigned long *value, char *type,
+ char *sym);
+
+int arch_kprobe_get_kallsym(unsigned int *symnum, unsigned long *value,
+ char *type, char *sym);
#else /* !CONFIG_KPROBES: */
static inline int kprobes_built_in(void)
@@ -439,6 +449,11 @@ static inline bool within_kprobe_blacklist(unsigned long addr)
{
return true;
}
+static inline int kprobe_get_kallsym(unsigned int symnum, unsigned long *value,
+ char *type, char *sym)
+{
+ return -ERANGE;
+}
#endif /* CONFIG_KPROBES */
static inline int disable_kretprobe(struct kretprobe *rp)
{
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index b4bb32082342..46fe5cfb5163 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -1232,6 +1232,9 @@ extern void perf_event_exec(void);
extern void perf_event_comm(struct task_struct *tsk, bool exec);
extern void perf_event_namespaces(struct task_struct *tsk);
extern void perf_event_fork(struct task_struct *tsk);
+extern void perf_event_text_poke(const void *addr,
+ const void *old_bytes, size_t old_len,
+ const void *new_bytes, size_t new_len);
/* Callchains */
DECLARE_PER_CPU(struct perf_callchain_entry, perf_callchain_entry);
@@ -1479,6 +1482,11 @@ static inline void perf_event_exec(void) { }
static inline void perf_event_comm(struct task_struct *tsk, bool exec) { }
static inline void perf_event_namespaces(struct task_struct *tsk) { }
static inline void perf_event_fork(struct task_struct *tsk) { }
+static inline void perf_event_text_poke(const void *addr,
+ const void *old_bytes,
+ size_t old_len,
+ const void *new_bytes,
+ size_t new_len) { }
static inline void perf_event_init(void) { }
static inline int perf_swevent_get_recursion_context(void) { return -1; }
static inline void perf_swevent_put_recursion_context(int rctx) { }
diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
index 7b2d6fc9e6ed..52ca2093831c 100644
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -383,7 +383,8 @@ struct perf_event_attr {
bpf_event : 1, /* include bpf events */
aux_output : 1, /* generate AUX records instead of events */
cgroup : 1, /* include cgroup events */
- __reserved_1 : 31;
+ text_poke : 1, /* include text poke events */
+ __reserved_1 : 30;
union {
__u32 wakeup_events; /* wakeup every n events */
@@ -1024,12 +1025,35 @@ enum perf_event_type {
*/
PERF_RECORD_CGROUP = 19,
+ /*
+ * Records changes to kernel text i.e. self-modified code. 'old_len' is
+ * the number of old bytes, 'new_len' is the number of new bytes. Either
+ * 'old_len' or 'new_len' may be zero to indicate, for example, the
+ * addition or removal of a trampoline. 'bytes' contains the old bytes
+ * followed immediately by the new bytes.
+ *
+ * struct {
+ * struct perf_event_header header;
+ * u64 addr;
+ * u16 old_len;
+ * u16 new_len;
+ * u8 bytes[];
+ * struct sample_id sample_id;
+ * };
+ */
+ PERF_RECORD_TEXT_POKE = 20,
+
PERF_RECORD_MAX, /* non-ABI */
};
enum perf_record_ksymbol_type {
PERF_RECORD_KSYMBOL_TYPE_UNKNOWN = 0,
PERF_RECORD_KSYMBOL_TYPE_BPF = 1,
+ /*
+ * Out of line code such as kprobe-replaced instructions or optimized
+ * kprobes or ftrace trampolines.
+ */
+ PERF_RECORD_KSYMBOL_TYPE_OOL = 2,
PERF_RECORD_KSYMBOL_TYPE_MAX /* non-ABI */
};
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 856d98c36f56..9b8f92500833 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -394,6 +394,7 @@ static atomic_t nr_switch_events __read_mostly;
static atomic_t nr_ksymbol_events __read_mostly;
static atomic_t nr_bpf_events __read_mostly;
static atomic_t nr_cgroup_events __read_mostly;
+static atomic_t nr_text_poke_events __read_mostly;
static LIST_HEAD(pmus);
static DEFINE_MUTEX(pmus_lock);
@@ -4575,7 +4576,7 @@ static bool is_sb_event(struct perf_event *event)
if (attr->mmap || attr->mmap_data || attr->mmap2 ||
attr->comm || attr->comm_exec ||
attr->task || attr->ksymbol ||
- attr->context_switch ||
+ attr->context_switch || attr->text_poke ||
attr->bpf_event)
return true;
return false;
@@ -4651,6 +4652,8 @@ static void unaccount_event(struct perf_event *event)
atomic_dec(&nr_ksymbol_events);
if (event->attr.bpf_event)
atomic_dec(&nr_bpf_events);
+ if (event->attr.text_poke)
+ atomic_dec(&nr_text_poke_events);
if (dec) {
if (!atomic_add_unless(&perf_sched_count, -1, 1))
@@ -8628,6 +8631,89 @@ void perf_event_bpf_event(struct bpf_prog *prog,
perf_iterate_sb(perf_event_bpf_output, &bpf_event, NULL);
}
+struct perf_text_poke_event {
+ const void *old_bytes;
+ const void *new_bytes;
+ size_t pad;
+ u16 old_len;
+ u16 new_len;
+
+ struct {
+ struct perf_event_header header;
+
+ u64 addr;
+ } event_id;
+};
+
+static int perf_event_text_poke_match(struct perf_event *event)
+{
+ return event->attr.text_poke;
+}
+
+static void perf_event_text_poke_output(struct perf_event *event, void *data)
+{
+ struct perf_text_poke_event *text_poke_event = data;
+ struct perf_output_handle handle;
+ struct perf_sample_data sample;
+ u64 padding = 0;
+ int ret;
+
+ if (!perf_event_text_poke_match(event))
+ return;
+
+ perf_event_header__init_id(&text_poke_event->event_id.header, &sample, event);
+
+ ret = perf_output_begin(&handle, event, text_poke_event->event_id.header.size);
+ if (ret)
+ return;
+
+ perf_output_put(&handle, text_poke_event->event_id);
+ perf_output_put(&handle, text_poke_event->old_len);
+ perf_output_put(&handle, text_poke_event->new_len);
+
+ __output_copy(&handle, text_poke_event->old_bytes, text_poke_event->old_len);
+ __output_copy(&handle, text_poke_event->new_bytes, text_poke_event->new_len);
+
+ if (text_poke_event->pad)
+ __output_copy(&handle, &padding, text_poke_event->pad);
+
+ perf_event__output_id_sample(event, &handle, &sample);
+
+ perf_output_end(&handle);
+}
+
+void perf_event_text_poke(const void *addr, const void *old_bytes,
+ size_t old_len, const void *new_bytes, size_t new_len)
+{
+ struct perf_text_poke_event text_poke_event;
+ size_t tot, pad;
+
+ if (!atomic_read(&nr_text_poke_events))
+ return;
+
+ tot = sizeof(text_poke_event.old_len) + old_len;
+ tot += sizeof(text_poke_event.new_len) + new_len;
+ pad = ALIGN(tot, sizeof(u64)) - tot;
+
+ text_poke_event = (struct perf_text_poke_event){
+ .old_bytes = old_bytes,
+ .new_bytes = new_bytes,
+ .pad = pad,
+ .old_len = old_len,
+ .new_len = new_len,
+ .event_id = {
+ .header = {
+ .type = PERF_RECORD_TEXT_POKE,
+ .misc = PERF_RECORD_MISC_KERNEL,
+ .size = sizeof(text_poke_event.event_id) + tot + pad,
+ },
+ .addr = (unsigned long)addr,
+ },
+ };
+
+ perf_iterate_sb(perf_event_text_poke_output, &text_poke_event, NULL);
+}
+
void perf_event_itrace_started(struct perf_event *event)
{
event->attach_state |= PERF_ATTACH_ITRACE;
@@ -10945,6 +11031,8 @@ static void account_event(struct perf_event *event)
atomic_inc(&nr_ksymbol_events);
if (event->attr.bpf_event)
atomic_inc(&nr_bpf_events);
+ if (event->attr.text_poke)
+ atomic_inc(&nr_text_poke_events);
if (inc) {
/*
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index 16c8c605f4b0..834bfdc43235 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -24,6 +24,7 @@
#include <linux/slab.h>
#include <linux/filter.h>
#include <linux/ftrace.h>
+#include <linux/kprobes.h>
#include <linux/compiler.h>
/*
@@ -437,6 +438,7 @@ struct kallsym_iter {
loff_t pos_arch_end;
loff_t pos_mod_end;
loff_t pos_ftrace_mod_end;
+ loff_t pos_bpf_end;
unsigned long value;
unsigned int nameoff; /* If iterating in core kernel symbols. */
char type;
@@ -480,6 +482,11 @@ static int get_ksymbol_mod(struct kallsym_iter *iter)
return 1;
}
+/*
+ * ftrace_mod_get_kallsym() may also get symbols for pages allocated for ftrace
+ * purposes. In that case "__builtin__ftrace" is used as a module name, even
+ * though "__builtin__ftrace" is not a module.
+ */
static int get_ksymbol_ftrace_mod(struct kallsym_iter *iter)
{
int ret = ftrace_mod_get_kallsym(iter->pos - iter->pos_mod_end,
@@ -496,11 +503,33 @@ static int get_ksymbol_ftrace_mod(struct kallsym_iter *iter)
static int get_ksymbol_bpf(struct kallsym_iter *iter)
{
+ int ret;
+
strlcpy(iter->module_name, "bpf", MODULE_NAME_LEN);
iter->exported = 0;
- return bpf_get_kallsym(iter->pos - iter->pos_ftrace_mod_end,
- &iter->value, &iter->type,
- iter->name) < 0 ? 0 : 1;
+ ret = bpf_get_kallsym(iter->pos - iter->pos_ftrace_mod_end,
+ &iter->value, &iter->type,
+ iter->name);
+ if (ret < 0) {
+ iter->pos_bpf_end = iter->pos;
+ return 0;
+ }
+
+ return 1;
+}
+
+/*
+ * This uses "__builtin__kprobes" as a module name for symbols for pages
+ * allocated for kprobes' purposes, even though "__builtin__kprobes" is not a
+ * module.
+ */
+static int get_ksymbol_kprobe(struct kallsym_iter *iter)
+{
+ strlcpy(iter->module_name, "__builtin__kprobes", MODULE_NAME_LEN);
+ iter->exported = 0;
+ return kprobe_get_kallsym(iter->pos - iter->pos_bpf_end,
+ &iter->value, &iter->type,
+ iter->name) < 0 ? 0 : 1;
}
/* Returns space to next name. */
@@ -527,6 +556,7 @@ static void reset_iter(struct kallsym_iter *iter, loff_t new_pos)
iter->pos_arch_end = 0;
iter->pos_mod_end = 0;
iter->pos_ftrace_mod_end = 0;
+ iter->pos_bpf_end = 0;
}
}
@@ -551,7 +581,11 @@ static int update_iter_mod(struct kallsym_iter *iter, loff_t pos)
get_ksymbol_ftrace_mod(iter))
return 1;
- return get_ksymbol_bpf(iter);
+ if ((!iter->pos_bpf_end || iter->pos_bpf_end > pos) &&
+ get_ksymbol_bpf(iter))
+ return 1;
+
+ return get_ksymbol_kprobe(iter);
}
/* Returns false if pos at or past end of file. */
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 4a904cc56d68..0924397d9d59 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -35,6 +35,7 @@
#include <linux/ftrace.h>
#include <linux/cpu.h>
#include <linux/jump_label.h>
+#include <linux/perf_event.h>
#include <asm/sections.h>
#include <asm/cacheflush.h>
@@ -123,6 +124,7 @@ struct kprobe_insn_cache kprobe_insn_slots = {
.mutex = __MUTEX_INITIALIZER(kprobe_insn_slots.mutex),
.alloc = alloc_insn_page,
.free = free_insn_page,
+ .sym = KPROBE_INSN_PAGE_SYM,
.pages = LIST_HEAD_INIT(kprobe_insn_slots.pages),
.insn_size = MAX_INSN_SIZE,
.nr_garbage = 0,
@@ -188,6 +190,10 @@ kprobe_opcode_t *__get_insn_slot(struct kprobe_insn_cache *c)
kip->cache = c;
list_add_rcu(&kip->list, &c->pages);
slot = kip->insns;
+
+ /* Record the perf ksymbol register event after adding the page */
+ perf_event_ksymbol(PERF_RECORD_KSYMBOL_TYPE_OOL, (unsigned long)kip->insns,
+ PAGE_SIZE, false, c->sym);
out:
mutex_unlock(&c->mutex);
return slot;
@@ -206,6 +212,13 @@ static int collect_one_slot(struct kprobe_insn_page *kip, int idx)
* next time somebody inserts a probe.
*/
if (!list_is_singular(&kip->list)) {
+ /*
+ * Record perf ksymbol unregister event before removing
+ * the page.
+ */
+ perf_event_ksymbol(PERF_RECORD_KSYMBOL_TYPE_OOL,
+ (unsigned long)kip->insns, PAGE_SIZE, true,
+ kip->cache->sym);
list_del_rcu(&kip->list);
synchronize_rcu();
kip->cache->free(kip->insns);
@@ -295,12 +308,34 @@ bool __is_insn_slot_addr(struct kprobe_insn_cache *c, unsigned long addr)
return ret;
}
+int kprobe_cache_get_kallsym(struct kprobe_insn_cache *c, unsigned int *symnum,
+ unsigned long *value, char *type, char *sym)
+{
+ struct kprobe_insn_page *kip;
+ int ret = -ERANGE;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(kip, &c->pages, list) {
+ if ((*symnum)--)
+ continue;
+ strlcpy(sym, c->sym, KSYM_NAME_LEN);
+ *type = 't';
+ *value = (unsigned long)kip->insns;
+ ret = 0;
+ break;
+ }
+ rcu_read_unlock();
+
+ return ret;
+}
+
#ifdef CONFIG_OPTPROBES
/* For optimized_kprobe buffer */
struct kprobe_insn_cache kprobe_optinsn_slots = {
.mutex = __MUTEX_INITIALIZER(kprobe_optinsn_slots.mutex),
.alloc = alloc_insn_page,
.free = free_insn_page,
+ .sym = KPROBE_OPTINSN_PAGE_SYM,
.pages = LIST_HEAD_INIT(kprobe_optinsn_slots.pages),
/* .insn_size is initialized later */
.nr_garbage = 0,
@@ -2232,6 +2267,28 @@ static void kprobe_remove_ksym_blacklist(unsigned long entry)
kprobe_remove_area_blacklist(entry, entry + 1);
}
+int __weak arch_kprobe_get_kallsym(unsigned int *symnum, unsigned long *value,
+ char *type, char *sym)
+{
+ return -ERANGE;
+}
+
+int kprobe_get_kallsym(unsigned int symnum, unsigned long *value, char *type,
+ char *sym)
+{
+#ifdef __ARCH_WANT_KPROBES_INSN_SLOT
+ if (!kprobe_cache_get_kallsym(&kprobe_insn_slots, &symnum, value, type, sym))
+ return 0;
+#ifdef CONFIG_OPTPROBES
+ if (!kprobe_cache_get_kallsym(&kprobe_optinsn_slots, &symnum, value, type, sym))
+ return 0;
+#endif
+#endif
+ if (!arch_kprobe_get_kallsym(&symnum, value, type, sym))
+ return 0;
+ return -ERANGE;
+}
+
int __init __weak arch_populate_kprobe_blacklist(void)
{
return 0;
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 1903b80db6eb..72064541bef2 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -2764,6 +2764,50 @@ void __weak arch_ftrace_trampoline_free(struct ftrace_ops *ops)
{
}
+/* List of trace_ops that have allocated trampolines */
+static LIST_HEAD(ftrace_ops_trampoline_list);
+
+static void ftrace_add_trampoline_to_kallsyms(struct ftrace_ops *ops)
+{
+ lockdep_assert_held(&ftrace_lock);
+ list_add_rcu(&ops->list, &ftrace_ops_trampoline_list);
+}
+
+static void ftrace_remove_trampoline_from_kallsyms(struct ftrace_ops *ops)
+{
+ lockdep_assert_held(&ftrace_lock);
+ list_del_rcu(&ops->list);
+}
+
+/*
+ * "__builtin__ftrace" is used as a module name in /proc/kallsyms for symbols
+ * for pages allocated for ftrace purposes, even though "__builtin__ftrace" is
+ * not a module.
+ */
+#define FTRACE_TRAMPOLINE_MOD "__builtin__ftrace"
+#define FTRACE_TRAMPOLINE_SYM "ftrace_trampoline"
+
+static void ftrace_trampoline_free(struct ftrace_ops *ops)
+{
+ if (ops && (ops->flags & FTRACE_OPS_FL_ALLOC_TRAMP) &&
+ ops->trampoline) {
+ /*
+ * Record the text poke event before the ksymbol unregister
+ * event.
+ */
+ perf_event_text_poke((void *)ops->trampoline,
+ (void *)ops->trampoline,
+ ops->trampoline_size, NULL, 0);
+ perf_event_ksymbol(PERF_RECORD_KSYMBOL_TYPE_OOL,
+ ops->trampoline, ops->trampoline_size,
+ true, FTRACE_TRAMPOLINE_SYM);
+ /* Remove from kallsyms after the perf events */
+ ftrace_remove_trampoline_from_kallsyms(ops);
+ }
+
+ arch_ftrace_trampoline_free(ops);
+}
+
static void ftrace_startup_enable(int command)
{
if (saved_ftrace_func != ftrace_trace_function) {
@@ -2934,7 +2978,7 @@ int ftrace_shutdown(struct ftrace_ops *ops, int command)
synchronize_rcu_tasks();
free_ops:
- arch_ftrace_trampoline_free(ops);
+ ftrace_trampoline_free(ops);
}
return 0;
@@ -6178,6 +6222,27 @@ struct ftrace_mod_map {
unsigned int num_funcs;
};
+static int ftrace_get_trampoline_kallsym(unsigned int symnum,
+ unsigned long *value, char *type,
+ char *name, char *module_name,
+ int *exported)
+{
+ struct ftrace_ops *op;
+
+ list_for_each_entry_rcu(op, &ftrace_ops_trampoline_list, list) {
+ if (!op->trampoline || symnum--)
+ continue;
+ *value = op->trampoline;
+ *type = 't';
+ strlcpy(name, FTRACE_TRAMPOLINE_SYM, KSYM_NAME_LEN);
+ strlcpy(module_name, FTRACE_TRAMPOLINE_MOD, MODULE_NAME_LEN);
+ *exported = 0;
+ return 0;
+ }
+
+ return -ERANGE;
+}
+
#ifdef CONFIG_MODULES
#define next_to_ftrace_page(p) container_of(p, struct ftrace_page, next)
@@ -6514,6 +6579,7 @@ int ftrace_mod_get_kallsym(unsigned int symnum, unsigned long *value,
{
struct ftrace_mod_map *mod_map;
struct ftrace_mod_func *mod_func;
+ int ret;
preempt_disable();
list_for_each_entry_rcu(mod_map, &ftrace_mod_maps, list) {
@@ -6540,8 +6606,10 @@ int ftrace_mod_get_kallsym(unsigned int symnum, unsigned long *value,
WARN_ON(1);
break;
}
+ ret = ftrace_get_trampoline_kallsym(symnum, value, type, name,
+ module_name, exported);
preempt_enable();
- return -ERANGE;
+ return ret;
}
#else
@@ -6553,6 +6621,18 @@ allocate_ftrace_mod_map(struct module *mod,
{
return NULL;
}
+int ftrace_mod_get_kallsym(unsigned int symnum, unsigned long *value,
+ char *type, char *name, char *module_name,
+ int *exported)
+{
+ int ret;
+
+ preempt_disable();
+ ret = ftrace_get_trampoline_kallsym(symnum, value, type, name,
+ module_name, exported);
+ preempt_enable();
+ return ret;
+}
#endif /* CONFIG_MODULES */
struct ftrace_init_func {
@@ -6733,7 +6813,24 @@ void __weak arch_ftrace_update_trampoline(struct ftrace_ops *ops)
static void ftrace_update_trampoline(struct ftrace_ops *ops)
{
+ unsigned long trampoline = ops->trampoline;
+
arch_ftrace_update_trampoline(ops);
+ if (ops->trampoline && ops->trampoline != trampoline &&
+ (ops->flags & FTRACE_OPS_FL_ALLOC_TRAMP)) {
+ /* Add to kallsyms before the perf events */
+ ftrace_add_trampoline_to_kallsyms(ops);
+ perf_event_ksymbol(PERF_RECORD_KSYMBOL_TYPE_OOL,
+ ops->trampoline, ops->trampoline_size, false,
+ FTRACE_TRAMPOLINE_SYM);
+ /*
+ * Record the perf text poke event after the ksymbol register
+ * event.
+ */
+ perf_event_text_poke((void *)ops->trampoline, NULL, 0,
+ (void *)ops->trampoline,
+ ops->trampoline_size);
+ }
}
void ftrace_init_trace_array(struct trace_array *tr)