diff options
-rw-r--r-- | Documentation/ABI/testing/sysfs-devices-mapping | 33 | ||||
-rw-r--r-- | arch/x86/events/intel/uncore.c | 26 | ||||
-rw-r--r-- | arch/x86/events/intel/uncore.h | 37 | ||||
-rw-r--r-- | arch/x86/events/intel/uncore_snb.c | 80 | ||||
-rw-r--r-- | arch/x86/events/intel/uncore_snbep.c | 208 | ||||
-rw-r--r-- | arch/x86/include/asm/kprobes.h | 2 | ||||
-rw-r--r-- | arch/x86/kernel/alternative.c | 37 | ||||
-rw-r--r-- | arch/x86/kernel/kprobes/core.c | 15 | ||||
-rw-r--r-- | arch/x86/kernel/kprobes/opt.c | 38 | ||||
-rw-r--r-- | include/linux/ftrace.h | 12 | ||||
-rw-r--r-- | include/linux/kprobes.h | 15 | ||||
-rw-r--r-- | include/linux/perf_event.h | 8 | ||||
-rw-r--r-- | include/uapi/linux/perf_event.h | 26 | ||||
-rw-r--r-- | kernel/events/core.c | 90 | ||||
-rw-r--r-- | kernel/kallsyms.c | 42 | ||||
-rw-r--r-- | kernel/kprobes.c | 57 | ||||
-rw-r--r-- | kernel/trace/ftrace.c | 101 |
17 files changed, 798 insertions, 29 deletions
diff --git a/Documentation/ABI/testing/sysfs-devices-mapping b/Documentation/ABI/testing/sysfs-devices-mapping new file mode 100644 index 000000000000..490ccfd67f12 --- /dev/null +++ b/Documentation/ABI/testing/sysfs-devices-mapping @@ -0,0 +1,33 @@ +What: /sys/devices/uncore_iio_x/dieX +Date: February 2020 +Contact: Roman Sudarikov <roman.sudarikov@linux.intel.com> +Description: + Each IIO stack (PCIe root port) has its own IIO PMON block, so + each dieX file (where X is die number) holds "Segment:Root Bus" + for PCIe root port, which can be monitored by that IIO PMON + block. + For example, on 4-die Xeon platform with up to 6 IIO stacks per + die and, therefore, 6 IIO PMON blocks per die, the mapping of + IIO PMON block 0 exposes as the following: + + $ ls /sys/devices/uncore_iio_0/die* + -r--r--r-- /sys/devices/uncore_iio_0/die0 + -r--r--r-- /sys/devices/uncore_iio_0/die1 + -r--r--r-- /sys/devices/uncore_iio_0/die2 + -r--r--r-- /sys/devices/uncore_iio_0/die3 + + $ tail /sys/devices/uncore_iio_0/die* + ==> /sys/devices/uncore_iio_0/die0 <== + 0000:00 + ==> /sys/devices/uncore_iio_0/die1 <== + 0000:40 + ==> /sys/devices/uncore_iio_0/die2 <== + 0000:80 + ==> /sys/devices/uncore_iio_0/die3 <== + 0000:c0 + + Which means: + IIO PMU 0 on die 0 belongs to PCI RP on bus 0x00, domain 0x0000 + IIO PMU 0 on die 1 belongs to PCI RP on bus 0x40, domain 0x0000 + IIO PMU 0 on die 2 belongs to PCI RP on bus 0x80, domain 0x0000 + IIO PMU 0 on die 3 belongs to PCI RP on bus 0xc0, domain 0x0000 diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c index cf76d6631afa..d5c6d3b340c5 100644 --- a/arch/x86/events/intel/uncore.c +++ b/arch/x86/events/intel/uncore.c @@ -16,7 +16,7 @@ struct pci_driver *uncore_pci_driver; DEFINE_RAW_SPINLOCK(pci2phy_map_lock); struct list_head pci2phy_map_head = LIST_HEAD_INIT(pci2phy_map_head); struct pci_extra_dev *uncore_extra_pci_dev; -static int max_dies; +int __uncore_max_dies; /* mask of cpus that collect uncore events */ static cpumask_t uncore_cpu_mask; @@ -108,7 +108,7 @@ struct intel_uncore_box *uncore_pmu_to_box(struct intel_uncore_pmu *pmu, int cpu * The unsigned check also catches the '-1' return value for non * existent mappings in the topology map. */ - return dieid < max_dies ? pmu->boxes[dieid] : NULL; + return dieid < uncore_max_dies() ? pmu->boxes[dieid] : NULL; } u64 uncore_msr_read_counter(struct intel_uncore_box *box, struct perf_event *event) @@ -132,6 +132,9 @@ u64 uncore_mmio_read_counter(struct intel_uncore_box *box, if (!box->io_addr) return 0; + if (!uncore_mmio_is_valid_offset(box, event->hw.event_base)) + return 0; + return readq(box->io_addr + event->hw.event_base); } @@ -843,10 +846,12 @@ static int uncore_pmu_register(struct intel_uncore_pmu *pmu) .read = uncore_pmu_event_read, .module = THIS_MODULE, .capabilities = PERF_PMU_CAP_NO_EXCLUDE, + .attr_update = pmu->type->attr_update, }; } else { pmu->pmu = *pmu->type->pmu; pmu->pmu.attr_groups = pmu->type->attr_groups; + pmu->pmu.attr_update = pmu->type->attr_update; } if (pmu->type->num_boxes == 1) { @@ -877,7 +882,7 @@ static void uncore_free_boxes(struct intel_uncore_pmu *pmu) { int die; - for (die = 0; die < max_dies; die++) + for (die = 0; die < uncore_max_dies(); die++) kfree(pmu->boxes[die]); kfree(pmu->boxes); } @@ -887,6 +892,9 @@ static void uncore_type_exit(struct intel_uncore_type *type) struct intel_uncore_pmu *pmu = type->pmus; int i; + if (type->cleanup_mapping) + type->cleanup_mapping(type); + if (pmu) { for (i = 0; i < type->num_boxes; i++, pmu++) { uncore_pmu_unregister(pmu); @@ -915,7 +923,7 @@ static int __init uncore_type_init(struct intel_uncore_type *type, bool setid) if (!pmus) return -ENOMEM; - size = max_dies * sizeof(struct intel_uncore_box *); + size = uncore_max_dies() * sizeof(struct intel_uncore_box *); for (i = 0; i < type->num_boxes; i++) { pmus[i].func_id = setid ? i : -1; @@ -954,6 +962,9 @@ static int __init uncore_type_init(struct intel_uncore_type *type, bool setid) type->pmu_group = &uncore_pmu_attr_group; + if (type->set_mapping) + type->set_mapping(type); + return 0; err: @@ -1112,7 +1123,7 @@ static int __init uncore_pci_init(void) size_t size; int ret; - size = max_dies * sizeof(struct pci_extra_dev); + size = uncore_max_dies() * sizeof(struct pci_extra_dev); uncore_extra_pci_dev = kzalloc(size, GFP_KERNEL); if (!uncore_extra_pci_dev) { ret = -ENOMEM; @@ -1514,6 +1525,8 @@ static const struct x86_cpu_id intel_uncore_match[] __initconst = { X86_MATCH_INTEL_FAM6_MODEL(SKYLAKE_X, &skx_uncore_init), X86_MATCH_INTEL_FAM6_MODEL(KABYLAKE_L, &skl_uncore_init), X86_MATCH_INTEL_FAM6_MODEL(KABYLAKE, &skl_uncore_init), + X86_MATCH_INTEL_FAM6_MODEL(COMETLAKE_L, &skl_uncore_init), + X86_MATCH_INTEL_FAM6_MODEL(COMETLAKE, &skl_uncore_init), X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_L, &icl_uncore_init), X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_NNPI, &icl_uncore_init), X86_MATCH_INTEL_FAM6_MODEL(ICELAKE, &icl_uncore_init), @@ -1539,7 +1552,8 @@ static int __init intel_uncore_init(void) if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) return -ENODEV; - max_dies = topology_max_packages() * topology_max_die_per_package(); + __uncore_max_dies = + topology_max_packages() * topology_max_die_per_package(); uncore_init = (struct intel_uncore_init_fun *)id->driver_data; if (uncore_init->pci_init) { diff --git a/arch/x86/events/intel/uncore.h b/arch/x86/events/intel/uncore.h index b469ddd45515..105fdc69825e 100644 --- a/arch/x86/events/intel/uncore.h +++ b/arch/x86/events/intel/uncore.h @@ -61,6 +61,7 @@ struct intel_uncore_type { unsigned msr_offset; unsigned mmio_offset; }; + unsigned mmio_map_size; unsigned num_shared_regs:8; unsigned single_fixed:1; unsigned pair_ctr_ctl:1; @@ -72,7 +73,19 @@ struct intel_uncore_type { struct uncore_event_desc *event_descs; struct freerunning_counters *freerunning; const struct attribute_group *attr_groups[4]; + const struct attribute_group **attr_update; struct pmu *pmu; /* for custom pmu ops */ + /* + * Uncore PMU would store relevant platform topology configuration here + * to identify which platform component each PMON block of that type is + * supposed to monitor. + */ + u64 *topology; + /* + * Optional callbacks for managing mapping of Uncore units to PMONs + */ + int (*set_mapping)(struct intel_uncore_type *type); + void (*cleanup_mapping)(struct intel_uncore_type *type); }; #define pmu_group attr_groups[0] @@ -169,6 +182,18 @@ int uncore_pcibus_to_physid(struct pci_bus *bus); ssize_t uncore_event_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf); +static inline struct intel_uncore_pmu *dev_to_uncore_pmu(struct device *dev) +{ + return container_of(dev_get_drvdata(dev), struct intel_uncore_pmu, pmu); +} + +#define to_device_attribute(n) container_of(n, struct device_attribute, attr) +#define to_dev_ext_attribute(n) container_of(n, struct dev_ext_attribute, attr) +#define attr_to_ext_attr(n) to_dev_ext_attribute(to_device_attribute(n)) + +extern int __uncore_max_dies; +#define uncore_max_dies() (__uncore_max_dies) + #define INTEL_UNCORE_EVENT_DESC(_name, _config) \ { \ .attr = __ATTR(_name, 0444, uncore_event_show, NULL), \ @@ -196,6 +221,18 @@ static inline bool uncore_pmc_freerunning(int idx) return idx == UNCORE_PMC_IDX_FREERUNNING; } +static inline bool uncore_mmio_is_valid_offset(struct intel_uncore_box *box, + unsigned long offset) +{ + if (offset < box->pmu->type->mmio_map_size) + return true; + + pr_warn_once("perf uncore: Invalid offset 0x%lx exceeds mapped area of %s.\n", + offset, box->pmu->type->name); + + return false; +} + static inline unsigned int uncore_mmio_box_ctl(struct intel_uncore_box *box) { diff --git a/arch/x86/events/intel/uncore_snb.c b/arch/x86/events/intel/uncore_snb.c index 3de1065eefc4..cb94ba86efd2 100644 --- a/arch/x86/events/intel/uncore_snb.c +++ b/arch/x86/events/intel/uncore_snb.c @@ -42,6 +42,17 @@ #define PCI_DEVICE_ID_INTEL_WHL_UQ_IMC 0x3ed0 #define PCI_DEVICE_ID_INTEL_WHL_4_UQ_IMC 0x3e34 #define PCI_DEVICE_ID_INTEL_WHL_UD_IMC 0x3e35 +#define PCI_DEVICE_ID_INTEL_CML_H1_IMC 0x9b44 +#define PCI_DEVICE_ID_INTEL_CML_H2_IMC 0x9b54 +#define PCI_DEVICE_ID_INTEL_CML_H3_IMC 0x9b64 +#define PCI_DEVICE_ID_INTEL_CML_U1_IMC 0x9b51 +#define PCI_DEVICE_ID_INTEL_CML_U2_IMC 0x9b61 +#define PCI_DEVICE_ID_INTEL_CML_U3_IMC 0x9b71 +#define PCI_DEVICE_ID_INTEL_CML_S1_IMC 0x9b33 +#define PCI_DEVICE_ID_INTEL_CML_S2_IMC 0x9b43 +#define PCI_DEVICE_ID_INTEL_CML_S3_IMC 0x9b53 +#define PCI_DEVICE_ID_INTEL_CML_S4_IMC 0x9b63 +#define PCI_DEVICE_ID_INTEL_CML_S5_IMC 0x9b73 #define PCI_DEVICE_ID_INTEL_ICL_U_IMC 0x8a02 #define PCI_DEVICE_ID_INTEL_ICL_U2_IMC 0x8a12 #define PCI_DEVICE_ID_INTEL_TGL_U1_IMC 0x9a02 @@ -415,6 +426,7 @@ static const struct attribute_group snb_uncore_imc_format_group = { static void snb_uncore_imc_init_box(struct intel_uncore_box *box) { + struct intel_uncore_type *type = box->pmu->type; struct pci_dev *pdev = box->pci_dev; int where = SNB_UNCORE_PCI_IMC_BAR_OFFSET; resource_size_t addr; @@ -430,7 +442,10 @@ static void snb_uncore_imc_init_box(struct intel_uncore_box *box) addr &= ~(PAGE_SIZE - 1); - box->io_addr = ioremap(addr, SNB_UNCORE_PCI_IMC_MAP_SIZE); + box->io_addr = ioremap(addr, type->mmio_map_size); + if (!box->io_addr) + pr_warn("perf uncore: Failed to ioremap for %s.\n", type->name); + box->hrtimer_duration = UNCORE_SNB_IMC_HRTIMER_INTERVAL; } @@ -586,6 +601,7 @@ static struct intel_uncore_type snb_uncore_imc = { .num_counters = 2, .num_boxes = 1, .num_freerunning_types = SNB_PCI_UNCORE_IMC_FREERUNNING_TYPE_MAX, + .mmio_map_size = SNB_UNCORE_PCI_IMC_MAP_SIZE, .freerunning = snb_uncore_imc_freerunning, .event_descs = snb_uncore_imc_events, .format_group = &snb_uncore_imc_format_group, @@ -771,6 +787,50 @@ static const struct pci_device_id skl_uncore_pci_ids[] = { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_WHL_UD_IMC), .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), }, + { /* IMC */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CML_H1_IMC), + .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), + }, + { /* IMC */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CML_H2_IMC), + .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), + }, + { /* IMC */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CML_H3_IMC), + .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), + }, + { /* IMC */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CML_U1_IMC), + .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), + }, + { /* IMC */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CML_U2_IMC), + .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), + }, + { /* IMC */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CML_U3_IMC), + .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), + }, + { /* IMC */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CML_S1_IMC), + .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), + }, + { /* IMC */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CML_S2_IMC), + .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), + }, + { /* IMC */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CML_S3_IMC), + .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), + }, + { /* IMC */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CML_S4_IMC), + .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), + }, + { /* IMC */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CML_S5_IMC), + .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), + }, { /* end: all zeroes */ }, }; @@ -863,6 +923,17 @@ static const struct imc_uncore_pci_dev desktop_imc_pci_ids[] = { IMC_DEV(WHL_UQ_IMC, &skl_uncore_pci_driver), /* 8th Gen Core U Mobile Quad Core */ IMC_DEV(WHL_4_UQ_IMC, &skl_uncore_pci_driver), /* 8th Gen Core U Mobile Quad Core */ IMC_DEV(WHL_UD_IMC, &skl_uncore_pci_driver), /* 8th Gen Core U Mobile Dual Core */ + IMC_DEV(CML_H1_IMC, &skl_uncore_pci_driver), + IMC_DEV(CML_H2_IMC, &skl_uncore_pci_driver), + IMC_DEV(CML_H3_IMC, &skl_uncore_pci_driver), + IMC_DEV(CML_U1_IMC, &skl_uncore_pci_driver), + IMC_DEV(CML_U2_IMC, &skl_uncore_pci_driver), + IMC_DEV(CML_U3_IMC, &skl_uncore_pci_driver), + IMC_DEV(CML_S1_IMC, &skl_uncore_pci_driver), + IMC_DEV(CML_S2_IMC, &skl_uncore_pci_driver), + IMC_DEV(CML_S3_IMC, &skl_uncore_pci_driver), + IMC_DEV(CML_S4_IMC, &skl_uncore_pci_driver), + IMC_DEV(CML_S5_IMC, &skl_uncore_pci_driver), IMC_DEV(ICL_U_IMC, &icl_uncore_pci_driver), /* 10th Gen Core Mobile */ IMC_DEV(ICL_U2_IMC, &icl_uncore_pci_driver), /* 10th Gen Core Mobile */ { /* end marker */ } @@ -1085,11 +1156,13 @@ static struct pci_dev *tgl_uncore_get_mc_dev(void) } #define TGL_UNCORE_MMIO_IMC_MEM_OFFSET 0x10000 +#define TGL_UNCORE_PCI_IMC_MAP_SIZE 0xe000 static void tgl_uncore_imc_freerunning_init_box(struct intel_uncore_box *box) { struct pci_dev *pdev = tgl_uncore_get_mc_dev(); struct intel_uncore_pmu *pmu = box->pmu; + struct intel_uncore_type *type = pmu->type; resource_size_t addr; u32 mch_bar; @@ -1112,7 +1185,9 @@ static void tgl_uncore_imc_freerunning_init_box(struct intel_uncore_box *box) addr |= ((resource_size_t)mch_bar << 32); #endif - box->io_addr = ioremap(addr, SNB_UNCORE_PCI_IMC_MAP_SIZE); + box->io_addr = ioremap(addr, type->mmio_map_size); + if (!box->io_addr) + pr_warn("perf uncore: Failed to ioremap for %s.\n", type->name); } static struct intel_uncore_ops tgl_uncore_imc_freerunning_ops = { @@ -1138,6 +1213,7 @@ static struct intel_uncore_type tgl_uncore_imc_free_running = { .num_counters = 3, .num_boxes = 2, .num_freerunning_types = TGL_MMIO_UNCORE_IMC_FREERUNNING_TYPE_MAX, + .mmio_map_size = TGL_UNCORE_PCI_IMC_MAP_SIZE, .freerunning = tgl_uncore_imc_freerunning, .ops = &tgl_uncore_imc_freerunning_ops, .event_descs = tgl_uncore_imc_events, diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c index 07652fa20ebb..62e88ad919ff 100644 --- a/arch/x86/events/intel/uncore_snbep.c +++ b/arch/x86/events/intel/uncore_snbep.c @@ -273,6 +273,30 @@ #define SKX_CPUNODEID 0xc0 #define SKX_GIDNIDMAP 0xd4 +/* + * The CPU_BUS_NUMBER MSR returns the values of the respective CPUBUSNO CSR + * that BIOS programmed. MSR has package scope. + * | Bit | Default | Description + * | [63] | 00h | VALID - When set, indicates the CPU bus + * numbers have been initialized. (RO) + * |[62:48]| --- | Reserved + * |[47:40]| 00h | BUS_NUM_5 — Return the bus number BIOS assigned + * CPUBUSNO(5). (RO) + * |[39:32]| 00h | BUS_NUM_4 — Return the bus number BIOS assigned + * CPUBUSNO(4). (RO) + * |[31:24]| 00h | BUS_NUM_3 — Return the bus number BIOS assigned + * CPUBUSNO(3). (RO) + * |[23:16]| 00h | BUS_NUM_2 — Return the bus number BIOS assigned + * CPUBUSNO(2). (RO) + * |[15:8] | 00h | BUS_NUM_1 — Return the bus number BIOS assigned + * CPUBUSNO(1). (RO) + * | [7:0] | 00h | BUS_NUM_0 — Return the bus number BIOS assigned + * CPUBUSNO(0). (RO) + */ +#define SKX_MSR_CPU_BUS_NUMBER 0x300 +#define SKX_MSR_CPU_BUS_VALID_BIT (1ULL << 63) +#define BUS_NUM_STRIDE 8 + /* SKX CHA */ #define SKX_CHA_MSR_PMON_BOX_FILTER_TID (0x1ffULL << 0) #define SKX_CHA_MSR_PMON_BOX_FILTER_LINK (0xfULL << 9) @@ -3612,6 +3636,170 @@ static struct intel_uncore_ops skx_uncore_iio_ops = { .read_counter = uncore_msr_read_counter, }; +static inline u8 skx_iio_stack(struct intel_uncore_pmu *pmu, int die) +{ + return pmu->type->topology[die] >> (pmu->pmu_idx * BUS_NUM_STRIDE); +} + +static umode_t +skx_iio_mapping_visible(struct kobject *kobj, struct attribute *attr, int die) +{ + struct intel_uncore_pmu *pmu = dev_to_uncore_pmu(kobj_to_dev(kobj)); + + /* Root bus 0x00 is valid only for die 0 AND pmu_idx = 0. */ + return (!skx_iio_stack(pmu, die) && pmu->pmu_idx) ? 0 : attr->mode; +} + +static ssize_t skx_iio_mapping_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct pci_bus *bus = pci_find_next_bus(NULL); + struct intel_uncore_pmu *uncore_pmu = dev_to_uncore_pmu(dev); + struct dev_ext_attribute *ea = to_dev_ext_attribute(attr); + long die = (long)ea->var; + + /* + * Current implementation is for single segment configuration hence it's + * safe to take the segment value from the first available root bus. + */ + return sprintf(buf, "%04x:%02x\n", pci_domain_nr(bus), + skx_iio_stack(uncore_pmu, die)); +} + +static int skx_msr_cpu_bus_read(int cpu, u64 *topology) +{ + u64 msr_value; + + if (rdmsrl_on_cpu(cpu, SKX_MSR_CPU_BUS_NUMBER, &msr_value) || + !(msr_value & SKX_MSR_CPU_BUS_VALID_BIT)) + return -ENXIO; + + *topology = msr_value; + + return 0; +} + +static int die_to_cpu(int die) +{ + int res = 0, cpu, current_die; + /* + * Using cpus_read_lock() to ensure cpu is not going down between + * looking at cpu_online_mask. + */ + cpus_read_lock(); + for_each_online_cpu(cpu) { + current_die = topology_logical_die_id(cpu); + if (current_die == die) { + res = cpu; + break; + } + } + cpus_read_unlock(); + return res; +} + +static int skx_iio_get_topology(struct intel_uncore_type *type) +{ + int i, ret; + struct pci_bus *bus = NULL; + + /* + * Verified single-segment environments only; disabled for multiple + * segment topologies for now except VMD domains. + * VMD domains start at 0x10000 to not clash with ACPI _SEG domains. + */ + while ((bus = pci_find_next_bus(bus)) + && (!pci_domain_nr(bus) || pci_domain_nr(bus) > 0xffff)) + ; + if (bus) + return -EPERM; + + type->topology = kcalloc(uncore_max_dies(), sizeof(u64), GFP_KERNEL); + if (!type->topology) + return -ENOMEM; + + for (i = 0; i < uncore_max_dies(); i++) { + ret = skx_msr_cpu_bus_read(die_to_cpu(i), &type->topology[i]); + if (ret) { + kfree(type->topology); + type->topology = NULL; + return ret; + } + } + + return 0; +} + +static struct attribute_group skx_iio_mapping_group = { + .is_visible = skx_iio_mapping_visible, +}; + +static const struct attribute_group *skx_iio_attr_update[] = { + &skx_iio_mapping_group, + NULL, +}; + +static int skx_iio_set_mapping(struct intel_uncore_type *type) +{ + char buf[64]; + int ret; + long die = -1; + struct attribute **attrs = NULL; + struct dev_ext_attribute *eas = NULL; + + ret = skx_iio_get_topology(type); + if (ret) + return ret; + + /* One more for NULL. */ + attrs = kcalloc((uncore_max_dies() + 1), sizeof(*attrs), GFP_KERNEL); + if (!attrs) + goto err; + + eas = kcalloc(uncore_max_dies(), sizeof(*eas), GFP_KERNEL); + if (!eas) + goto err; + + for (die = 0; die < uncore_max_dies(); die++) { + sprintf(buf, "die%ld", die); + sysfs_attr_init(&eas[die].attr.attr); + eas[die].attr.attr.name = kstrdup(buf, GFP_KERNEL); + if (!eas[die].attr.attr.name) + goto err; + eas[die].attr.attr.mode = 0444; + eas[die].attr.show = skx_iio_mapping_show; + eas[die].attr.store = NULL; + eas[die].var = (void *)die; + attrs[die] = &eas[die].attr.attr; + } + skx_iio_mapping_group.attrs = attrs; + + return 0; +err: + for (; die >= 0; die--) + kfree(eas[die].attr.attr.name); + kfree(eas); + kfree(attrs); + kfree(type->topology); + type->attr_update = NULL; + return -ENOMEM; +} + +static void skx_iio_cleanup_mapping(struct intel_uncore_type *type) +{ + struct attribute **attr = skx_iio_mapping_group.attrs; + + if (!attr) + return; + + for (; *attr; attr++) + kfree((*attr)->name); + kfree(attr_to_ext_attr(*skx_iio_mapping_group.attrs)); + kfree(skx_iio_mapping_group.attrs); + skx_iio_mapping_group.attrs = NULL; + kfree(type->topology); +} + static struct intel_uncore_type skx_uncore_iio = { .name = "iio", .num_counters = 4, @@ -3626,6 +3814,9 @@ static struct intel_uncore_type skx_uncore_iio = { .constraints = skx_uncore_iio_constraints, .ops = &skx_uncore_iio_ops, .format_group = &skx_uncore_iio_format_group, + .attr_update = skx_iio_attr_update, + .set_mapping = skx_iio_set_mapping, + .cleanup_mapping = skx_iio_cleanup_mapping, }; enum perf_uncore_iio_freerunning_type_id { @@ -4421,6 +4612,7 @@ static void __snr_uncore_mmio_init_box(struct intel_uncore_box *box, unsigned int box_ctl, int mem_offset) { struct pci_dev *pdev = snr_uncore_get_mc_dev(box->dieid); + struct intel_uncore_type *type = box->pmu->type; resource_size_t addr; u32 pci_dword; @@ -4435,9 +4627,11 @@ static void __snr_uncore_mmio_init_box(struct intel_uncore_box *box, addr += box_ctl; - box->io_addr = ioremap(addr, SNR_IMC_MMIO_SIZE); - if (!box->io_addr) + box->io_addr = ioremap(addr, type->mmio_map_size); + if (!box->io_addr) { + pr_warn("perf uncore: Failed to ioremap for %s.\n", type->name); return; + } writel(IVBEP_PMON_BOX_CTL_INT, box->io_addr); } @@ -4480,6 +4674,9 @@ static void snr_uncore_mmio_enable_event(struct intel_uncore_box *box, if (!box->io_addr) return; + if (!uncore_mmio_is_valid_offset(box, hwc->config_base)) + return; + writel(hwc->config | SNBEP_PMON_CTL_EN, box->io_addr + hwc->config_base); } @@ -4492,6 +4689,9 @@ static void snr_uncore_mmio_disable_event(struct intel_uncore_box *box, if (!box->io_addr) return; + if (!uncore_mmio_is_valid_offset(box, hwc->config_base)) + return; + writel(hwc->config, box->io_addr + hwc->config_base); } @@ -4530,6 +4730,7 @@ static struct intel_uncore_type snr_uncore_imc = { .event_mask = SNBEP_PMON_RAW_EVENT_MASK, .box_ctl = SNR_IMC_MMIO_PMON_BOX_CTL, .mmio_offset = SNR_IMC_MMIO_OFFSET, + .mmio_map_size = SNR_IMC_MMIO_SIZE, .ops = &snr_uncore_mmio_ops, .format_group = &skx_uncore_format_group, }; @@ -4570,6 +4771,7 @@ static struct intel_uncore_type snr_uncore_imc_free_running = { .num_counters = 3, .num_boxes = 1, .num_freerunning_types = SNR_IMC_FREERUNNING_TYPE_MAX, + .mmio_map_size = SNR_IMC_MMIO_SIZE, .freerunning = snr_imc_freerunning, .ops = &snr_uncore_imc_freerunning_ops, .event_descs = snr_uncore_imc_freerunning_events, @@ -4987,6 +5189,7 @@ static struct intel_uncore_type icx_uncore_imc = { .event_mask = SNBEP_PMON_RAW_EVENT_MASK, .box_ctl = SNR_IMC_MMIO_PMON_BOX_CTL, .mmio_offset = SNR_IMC_MMIO_OFFSET, + .mmio_map_size = SNR_IMC_MMIO_SIZE, .ops = &icx_uncore_mmio_ops, .format_group = &skx_uncore_format_group, }; @@ -5044,6 +5247,7 @@ static struct intel_uncore_type icx_uncore_imc_free_running = { .num_counters = 5, .num_boxes = 4, .num_freerunning_types = ICX_IMC_FREERUNNING_TYPE_MAX, + .mmio_map_size = SNR_IMC_MMIO_SIZE, .freerunning = icx_imc_freerunning, .ops = &icx_uncore_imc_freerunning_ops, .event_descs = icx_uncore_imc_freerunning_events, diff --git a/arch/x86/include/asm/kprobes.h b/arch/x86/include/asm/kprobes.h index 073eb7ad2f56..143bc9abe99c 100644 --- a/arch/x86/include/asm/kprobes.h +++ b/arch/x86/include/asm/kprobes.h @@ -66,6 +66,8 @@ struct arch_specific_insn { */ bool boostable; bool if_modifier; + /* Number of bytes of text poked */ + int tp_len; }; struct arch_optimized_insn { diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 8fd39ff74a49..f94c9f371411 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -3,6 +3,7 @@ #include <linux/module.h> #include <linux/sched.h> +#include <linux/perf_event.h> #include <linux/mutex.h> #include <linux/list.h> #include <linux/stringify.h> @@ -1001,6 +1002,7 @@ struct text_poke_loc { s32 rel32; u8 opcode; const u8 text[POKE_MAX_OPCODE_SIZE]; + u8 old; }; struct bp_patching_desc { @@ -1168,8 +1170,10 @@ static void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries /* * First step: add a int3 trap to the address that will be patched. */ - for (i = 0; i < nr_entries; i++) + for (i = 0; i < nr_entries; i++) { + tp[i].old = *(u8 *)text_poke_addr(&tp[i]); text_poke(text_poke_addr(&tp[i]), &int3, INT3_INSN_SIZE); + } text_poke_sync(); @@ -1177,14 +1181,45 @@ static void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries * Second step: update all but the first byte of the patched range. */ for (do_sync = 0, i = 0; i < nr_entries; i++) { + u8 old[POKE_MAX_OPCODE_SIZE] = { tp[i].old, }; int len = text_opcode_size(tp[i].opcode); if (len - INT3_INSN_SIZE > 0) { + memcpy(old + INT3_INSN_SIZE, + text_poke_addr(&tp[i]) + INT3_INSN_SIZE, + len - INT3_INSN_SIZE); text_poke(text_poke_addr(&tp[i]) + INT3_INSN_SIZE, (const char *)tp[i].text + INT3_INSN_SIZE, len - INT3_INSN_SIZE); do_sync++; } + + /* + * Emit a perf event to record the text poke, primarily to + * support Intel PT decoding which must walk the executable code + * to reconstruct the trace. The flow up to here is: + * - write INT3 byte + * - IPI-SYNC + * - write instruction tail + * At this point the actual control flow will be through the + * INT3 and handler and not hit the old or new instruction. + * Intel PT outputs FUP/TIP packets for the INT3, so the flow + * can still be decoded. Subsequently: + * - emit RECORD_TEXT_POKE with the new instruction + * - IPI-SYNC + * - write first byte + * - IPI-SYNC + * So before the text poke event timestamp, the decoder will see + * either the old instruction flow or FUP/TIP of INT3. After the + * text poke event timestamp, the decoder will see either the + * new instruction flow or FUP/TIP of INT3. Thus decoders can + * use the timestamp as the point at which to modify the + * executable code. + * The old instruction is recorded so that the event can be + * processed forwards or backwards. + */ + perf_event_text_poke(text_poke_addr(&tp[i]), old, len, + tp[i].text, len); } if (do_sync) { diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c index ada39ddbc922..fdadc37d72af 100644 --- a/arch/x86/kernel/kprobes/core.c +++ b/arch/x86/kernel/kprobes/core.c @@ -33,6 +33,7 @@ #include <linux/hardirq.h> #include <linux/preempt.h> #include <linux/sched/debug.h> +#include <linux/perf_event.h> #include <linux/extable.h> #include <linux/kdebug.h> #include <linux/kallsyms.h> @@ -472,6 +473,9 @@ static int arch_copy_kprobe(struct kprobe *p) /* Also, displacement change doesn't affect the first byte */ p->opcode = buf[0]; + p->ainsn.tp_len = len; + perf_event_text_poke(p->ainsn.insn, NULL, 0, buf, len); + /* OK, write back the instruction(s) into ROX insn buffer */ text_poke(p->ainsn.insn, buf, len); @@ -503,12 +507,18 @@ int arch_prepare_kprobe(struct kprobe *p) void arch_arm_kprobe(struct kprobe *p) { - text_poke(p->addr, ((unsigned char []){INT3_INSN_OPCODE}), 1); + u8 int3 = INT3_INSN_OPCODE; + + text_poke(p->addr, &int3, 1); text_poke_sync(); + perf_event_text_poke(p->addr, &p->opcode, 1, &int3, 1); } void arch_disarm_kprobe(struct kprobe *p) { + u8 int3 = INT3_INSN_OPCODE; + + perf_event_text_poke(p->addr, &int3, 1, &p->opcode, 1); text_poke(p->addr, &p->opcode, 1); text_poke_sync(); } @@ -516,6 +526,9 @@ void arch_disarm_kprobe(struct kprobe *p) void arch_remove_kprobe(struct kprobe *p) { if (p->ainsn.insn) { + /* Record the perf event before freeing the slot */ + perf_event_text_poke(p->ainsn.insn, p->ainsn.insn, + p->ainsn.tp_len, NULL, 0); free_insn_slot(p->ainsn.insn, p->ainsn.boostable); p->ainsn.insn = NULL; } diff --git a/arch/x86/kernel/kprobes/opt.c b/arch/x86/kernel/kprobes/opt.c index 7af4c61dde52..40f380461e6d 100644 --- a/arch/x86/kernel/kprobes/opt.c +++ b/arch/x86/kernel/kprobes/opt.c @@ -6,6 +6,7 @@ * Copyright (C) Hitachi Ltd., 2012 */ #include <linux/kprobes.h> +#include <linux/perf_event.h> #include <linux/ptrace.h> #include <linux/string.h> #include <linux/slab.h> @@ -352,8 +353,15 @@ int arch_within_optimized_kprobe(struct optimized_kprobe *op, static void __arch_remove_optimized_kprobe(struct optimized_kprobe *op, int dirty) { - if (op->optinsn.insn) { - free_optinsn_slot(op->optinsn.insn, dirty); + u8 *slot = op->optinsn.insn; + if (slot) { + int len = TMPL_END_IDX + op->optinsn.size + JMP32_INSN_SIZE; + + /* Record the perf event before freeing the slot */ + if (dirty) + perf_event_text_poke(slot, slot, len, NULL, 0); + + free_optinsn_slot(slot, dirty); op->optinsn.insn = NULL; op->optinsn.size = 0; } @@ -424,8 +432,15 @@ int arch_prepare_optimized_kprobe(struct optimized_kprobe *op, (u8 *)op->kp.addr + op->optinsn.size); len += JMP32_INSN_SIZE; + /* + * Note len = TMPL_END_IDX + op->optinsn.size + JMP32_INSN_SIZE is also + * used in __arch_remove_optimized_kprobe(). + */ + /* We have to use text_poke() for instruction buffer because it is RO */ + perf_event_text_poke(slot, NULL, 0, buf, len); text_poke(slot, buf, len); + ret = 0; out: kfree(buf); @@ -477,10 +492,23 @@ void arch_optimize_kprobes(struct list_head *oplist) */ void arch_unoptimize_kprobe(struct optimized_kprobe *op) { - arch_arm_kprobe(&op->kp); - text_poke(op->kp.addr + INT3_INSN_SIZE, - op->optinsn.copied_insn, DISP32_SIZE); + u8 new[JMP32_INSN_SIZE] = { INT3_INSN_OPCODE, }; + u8 old[JMP32_INSN_SIZE]; + u8 *addr = op->kp.addr; + + memcpy(old, op->kp.addr, JMP32_INSN_SIZE); + memcpy(new + INT3_INSN_SIZE, + op->optinsn.copied_insn, + JMP32_INSN_SIZE - INT3_INSN_SIZE); + + text_poke(addr, new, INT3_INSN_SIZE); text_poke_sync(); + text_poke(addr + INT3_INSN_SIZE, + new + INT3_INSN_SIZE, + JMP32_INSN_SIZE - INT3_INSN_SIZE); + text_poke_sync(); + + perf_event_text_poke(op->kp.addr, old, JMP32_INSN_SIZE, new, JMP32_INSN_SIZE); } /* diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index e339dac91ee6..ce2c06f72e86 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -58,9 +58,6 @@ struct ftrace_direct_func; const char * ftrace_mod_address_lookup(unsigned long addr, unsigned long *size, unsigned long *off, char **modname, char *sym); -int ftrace_mod_get_kallsym(unsigned int symnum, unsigned long *value, - char *type, char *name, - char *module_name, int *exported); #else static inline const char * ftrace_mod_address_lookup(unsigned long addr, unsigned long *size, @@ -68,6 +65,13 @@ ftrace_mod_address_lookup(unsigned long addr, unsigned long *size, { return NULL; } +#endif + +#if defined(CONFIG_FUNCTION_TRACER) && defined(CONFIG_DYNAMIC_FTRACE) +int ftrace_mod_get_kallsym(unsigned int symnum, unsigned long *value, + char *type, char *name, + char *module_name, int *exported); +#else static inline int ftrace_mod_get_kallsym(unsigned int symnum, unsigned long *value, char *type, char *name, char *module_name, int *exported) @@ -76,7 +80,6 @@ static inline int ftrace_mod_get_kallsym(unsigned int symnum, unsigned long *val } #endif - #ifdef CONFIG_FUNCTION_TRACER extern int ftrace_enabled; @@ -207,6 +210,7 @@ struct ftrace_ops { struct ftrace_ops_hash old_hash; unsigned long trampoline; unsigned long trampoline_size; + struct list_head list; #endif }; diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h index 6adf90f248d7..45b8cdc9fad7 100644 --- a/include/linux/kprobes.h +++ b/include/linux/kprobes.h @@ -242,6 +242,7 @@ struct kprobe_insn_cache { struct mutex mutex; void *(*alloc)(void); /* allocate insn page */ void (*free)(void *); /* free insn page */ + const char *sym; /* symbol for insn pages */ struct list_head pages; /* list of kprobe_insn_page */ size_t insn_size; /* size of instruction slot */ int nr_garbage; @@ -272,6 +273,10 @@ static inline bool is_kprobe_##__name##_slot(unsigned long addr) \ { \ return __is_insn_slot_addr(&kprobe_##__name##_slots, addr); \ } +#define KPROBE_INSN_PAGE_SYM "kprobe_insn_page" +#define KPROBE_OPTINSN_PAGE_SYM "kprobe_optinsn_page" +int kprobe_cache_get_kallsym(struct kprobe_insn_cache *c, unsigned int *symnum, + unsigned long *value, char *type, char *sym); #else /* __ARCH_WANT_KPROBES_INSN_SLOT */ #define DEFINE_INSN_CACHE_OPS(__name) \ static inline bool is_kprobe_##__name##_slot(unsigned long addr) \ @@ -377,6 +382,11 @@ void dump_kprobe(struct kprobe *kp); void *alloc_insn_page(void); void free_insn_page(void *page); +int kprobe_get_kallsym(unsigned int symnum, unsigned long *value, char *type, + char *sym); + +int arch_kprobe_get_kallsym(unsigned int *symnum, unsigned long *value, + char *type, char *sym); #else /* !CONFIG_KPROBES: */ static inline int kprobes_built_in(void) @@ -439,6 +449,11 @@ static inline bool within_kprobe_blacklist(unsigned long addr) { return true; } +static inline int kprobe_get_kallsym(unsigned int symnum, unsigned long *value, + char *type, char *sym) +{ + return -ERANGE; +} #endif /* CONFIG_KPROBES */ static inline int disable_kretprobe(struct kretprobe *rp) { diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index b4bb32082342..46fe5cfb5163 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1232,6 +1232,9 @@ extern void perf_event_exec(void); extern void perf_event_comm(struct task_struct *tsk, bool exec); extern void perf_event_namespaces(struct task_struct *tsk); extern void perf_event_fork(struct task_struct *tsk); +extern void perf_event_text_poke(const void *addr, + const void *old_bytes, size_t old_len, + const void *new_bytes, size_t new_len); /* Callchains */ DECLARE_PER_CPU(struct perf_callchain_entry, perf_callchain_entry); @@ -1479,6 +1482,11 @@ static inline void perf_event_exec(void) { } static inline void perf_event_comm(struct task_struct *tsk, bool exec) { } static inline void perf_event_namespaces(struct task_struct *tsk) { } static inline void perf_event_fork(struct task_struct *tsk) { } +static inline void perf_event_text_poke(const void *addr, + const void *old_bytes, + size_t old_len, + const void *new_bytes, + size_t new_len) { } static inline void perf_event_init(void) { } static inline int perf_swevent_get_recursion_context(void) { return -1; } static inline void perf_swevent_put_recursion_context(int rctx) { } diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index 7b2d6fc9e6ed..52ca2093831c 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -383,7 +383,8 @@ struct perf_event_attr { bpf_event : 1, /* include bpf events */ aux_output : 1, /* generate AUX records instead of events */ cgroup : 1, /* include cgroup events */ - __reserved_1 : 31; + text_poke : 1, /* include text poke events */ + __reserved_1 : 30; union { __u32 wakeup_events; /* wakeup every n events */ @@ -1024,12 +1025,35 @@ enum perf_event_type { */ PERF_RECORD_CGROUP = 19, + /* + * Records changes to kernel text i.e. self-modified code. 'old_len' is + * the number of old bytes, 'new_len' is the number of new bytes. Either + * 'old_len' or 'new_len' may be zero to indicate, for example, the + * addition or removal of a trampoline. 'bytes' contains the old bytes + * followed immediately by the new bytes. + * + * struct { + * struct perf_event_header header; + * u64 addr; + * u16 old_len; + * u16 new_len; + * u8 bytes[]; + * struct sample_id sample_id; + * }; + */ + PERF_RECORD_TEXT_POKE = 20, + PERF_RECORD_MAX, /* non-ABI */ }; enum perf_record_ksymbol_type { PERF_RECORD_KSYMBOL_TYPE_UNKNOWN = 0, PERF_RECORD_KSYMBOL_TYPE_BPF = 1, + /* + * Out of line code such as kprobe-replaced instructions or optimized + * kprobes or ftrace trampolines. + */ + PERF_RECORD_KSYMBOL_TYPE_OOL = 2, PERF_RECORD_KSYMBOL_TYPE_MAX /* non-ABI */ }; diff --git a/kernel/events/core.c b/kernel/events/core.c index 856d98c36f56..9b8f92500833 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -394,6 +394,7 @@ static atomic_t nr_switch_events __read_mostly; static atomic_t nr_ksymbol_events __read_mostly; static atomic_t nr_bpf_events __read_mostly; static atomic_t nr_cgroup_events __read_mostly; +static atomic_t nr_text_poke_events __read_mostly; static LIST_HEAD(pmus); static DEFINE_MUTEX(pmus_lock); @@ -4575,7 +4576,7 @@ static bool is_sb_event(struct perf_event *event) if (attr->mmap || attr->mmap_data || attr->mmap2 || attr->comm || attr->comm_exec || attr->task || attr->ksymbol || - attr->context_switch || + attr->context_switch || attr->text_poke || attr->bpf_event) return true; return false; @@ -4651,6 +4652,8 @@ static void unaccount_event(struct perf_event *event) atomic_dec(&nr_ksymbol_events); if (event->attr.bpf_event) atomic_dec(&nr_bpf_events); + if (event->attr.text_poke) + atomic_dec(&nr_text_poke_events); if (dec) { if (!atomic_add_unless(&perf_sched_count, -1, 1)) @@ -8628,6 +8631,89 @@ void perf_event_bpf_event(struct bpf_prog *prog, perf_iterate_sb(perf_event_bpf_output, &bpf_event, NULL); } +struct perf_text_poke_event { + const void *old_bytes; + const void *new_bytes; + size_t pad; + u16 old_len; + u16 new_len; + + struct { + struct perf_event_header header; + + u64 addr; + } event_id; +}; + +static int perf_event_text_poke_match(struct perf_event *event) +{ + return event->attr.text_poke; +} + +static void perf_event_text_poke_output(struct perf_event *event, void *data) +{ + struct perf_text_poke_event *text_poke_event = data; + struct perf_output_handle handle; + struct perf_sample_data sample; + u64 padding = 0; + int ret; + + if (!perf_event_text_poke_match(event)) + return; + + perf_event_header__init_id(&text_poke_event->event_id.header, &sample, event); + + ret = perf_output_begin(&handle, event, text_poke_event->event_id.header.size); + if (ret) + return; + + perf_output_put(&handle, text_poke_event->event_id); + perf_output_put(&handle, text_poke_event->old_len); + perf_output_put(&handle, text_poke_event->new_len); + + __output_copy(&handle, text_poke_event->old_bytes, text_poke_event->old_len); + __output_copy(&handle, text_poke_event->new_bytes, text_poke_event->new_len); + + if (text_poke_event->pad) + __output_copy(&handle, &padding, text_poke_event->pad); + + perf_event__output_id_sample(event, &handle, &sample); + + perf_output_end(&handle); +} + +void perf_event_text_poke(const void *addr, const void *old_bytes, + size_t old_len, const void *new_bytes, size_t new_len) +{ + struct perf_text_poke_event text_poke_event; + size_t tot, pad; + + if (!atomic_read(&nr_text_poke_events)) + return; + + tot = sizeof(text_poke_event.old_len) + old_len; + tot += sizeof(text_poke_event.new_len) + new_len; + pad = ALIGN(tot, sizeof(u64)) - tot; + + text_poke_event = (struct perf_text_poke_event){ + .old_bytes = old_bytes, + .new_bytes = new_bytes, + .pad = pad, + .old_len = old_len, + .new_len = new_len, + .event_id = { + .header = { + .type = PERF_RECORD_TEXT_POKE, + .misc = PERF_RECORD_MISC_KERNEL, + .size = sizeof(text_poke_event.event_id) + tot + pad, + }, + .addr = (unsigned long)addr, + }, + }; + + perf_iterate_sb(perf_event_text_poke_output, &text_poke_event, NULL); +} + void perf_event_itrace_started(struct perf_event *event) { event->attach_state |= PERF_ATTACH_ITRACE; @@ -10945,6 +11031,8 @@ static void account_event(struct perf_event *event) atomic_inc(&nr_ksymbol_events); if (event->attr.bpf_event) atomic_inc(&nr_bpf_events); + if (event->attr.text_poke) + atomic_inc(&nr_text_poke_events); if (inc) { /* diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index 16c8c605f4b0..834bfdc43235 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -24,6 +24,7 @@ #include <linux/slab.h> #include <linux/filter.h> #include <linux/ftrace.h> +#include <linux/kprobes.h> #include <linux/compiler.h> /* @@ -437,6 +438,7 @@ struct kallsym_iter { loff_t pos_arch_end; loff_t pos_mod_end; loff_t pos_ftrace_mod_end; + loff_t pos_bpf_end; unsigned long value; unsigned int nameoff; /* If iterating in core kernel symbols. */ char type; @@ -480,6 +482,11 @@ static int get_ksymbol_mod(struct kallsym_iter *iter) return 1; } +/* + * ftrace_mod_get_kallsym() may also get symbols for pages allocated for ftrace + * purposes. In that case "__builtin__ftrace" is used as a module name, even + * though "__builtin__ftrace" is not a module. + */ static int get_ksymbol_ftrace_mod(struct kallsym_iter *iter) { int ret = ftrace_mod_get_kallsym(iter->pos - iter->pos_mod_end, @@ -496,11 +503,33 @@ static int get_ksymbol_ftrace_mod(struct kallsym_iter *iter) static int get_ksymbol_bpf(struct kallsym_iter *iter) { + int ret; + strlcpy(iter->module_name, "bpf", MODULE_NAME_LEN); iter->exported = 0; - return bpf_get_kallsym(iter->pos - iter->pos_ftrace_mod_end, - &iter->value, &iter->type, - iter->name) < 0 ? 0 : 1; + ret = bpf_get_kallsym(iter->pos - iter->pos_ftrace_mod_end, + &iter->value, &iter->type, + iter->name); + if (ret < 0) { + iter->pos_bpf_end = iter->pos; + return 0; + } + + return 1; +} + +/* + * This uses "__builtin__kprobes" as a module name for symbols for pages + * allocated for kprobes' purposes, even though "__builtin__kprobes" is not a + * module. + */ +static int get_ksymbol_kprobe(struct kallsym_iter *iter) +{ + strlcpy(iter->module_name, "__builtin__kprobes", MODULE_NAME_LEN); + iter->exported = 0; + return kprobe_get_kallsym(iter->pos - iter->pos_bpf_end, + &iter->value, &iter->type, + iter->name) < 0 ? 0 : 1; } /* Returns space to next name. */ @@ -527,6 +556,7 @@ static void reset_iter(struct kallsym_iter *iter, loff_t new_pos) iter->pos_arch_end = 0; iter->pos_mod_end = 0; iter->pos_ftrace_mod_end = 0; + iter->pos_bpf_end = 0; } } @@ -551,7 +581,11 @@ static int update_iter_mod(struct kallsym_iter *iter, loff_t pos) get_ksymbol_ftrace_mod(iter)) return 1; - return get_ksymbol_bpf(iter); + if ((!iter->pos_bpf_end || iter->pos_bpf_end > pos) && + get_ksymbol_bpf(iter)) + return 1; + + return get_ksymbol_kprobe(iter); } /* Returns false if pos at or past end of file. */ diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 4a904cc56d68..0924397d9d59 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -35,6 +35,7 @@ #include <linux/ftrace.h> #include <linux/cpu.h> #include <linux/jump_label.h> +#include <linux/perf_event.h> #include <asm/sections.h> #include <asm/cacheflush.h> @@ -123,6 +124,7 @@ struct kprobe_insn_cache kprobe_insn_slots = { .mutex = __MUTEX_INITIALIZER(kprobe_insn_slots.mutex), .alloc = alloc_insn_page, .free = free_insn_page, + .sym = KPROBE_INSN_PAGE_SYM, .pages = LIST_HEAD_INIT(kprobe_insn_slots.pages), .insn_size = MAX_INSN_SIZE, .nr_garbage = 0, @@ -188,6 +190,10 @@ kprobe_opcode_t *__get_insn_slot(struct kprobe_insn_cache *c) kip->cache = c; list_add_rcu(&kip->list, &c->pages); slot = kip->insns; + + /* Record the perf ksymbol register event after adding the page */ + perf_event_ksymbol(PERF_RECORD_KSYMBOL_TYPE_OOL, (unsigned long)kip->insns, + PAGE_SIZE, false, c->sym); out: mutex_unlock(&c->mutex); return slot; @@ -206,6 +212,13 @@ static int collect_one_slot(struct kprobe_insn_page *kip, int idx) * next time somebody inserts a probe. */ if (!list_is_singular(&kip->list)) { + /* + * Record perf ksymbol unregister event before removing + * the page. + */ + perf_event_ksymbol(PERF_RECORD_KSYMBOL_TYPE_OOL, + (unsigned long)kip->insns, PAGE_SIZE, true, + kip->cache->sym); list_del_rcu(&kip->list); synchronize_rcu(); kip->cache->free(kip->insns); @@ -295,12 +308,34 @@ bool __is_insn_slot_addr(struct kprobe_insn_cache *c, unsigned long addr) return ret; } +int kprobe_cache_get_kallsym(struct kprobe_insn_cache *c, unsigned int *symnum, + unsigned long *value, char *type, char *sym) +{ + struct kprobe_insn_page *kip; + int ret = -ERANGE; + + rcu_read_lock(); + list_for_each_entry_rcu(kip, &c->pages, list) { + if ((*symnum)--) + continue; + strlcpy(sym, c->sym, KSYM_NAME_LEN); + *type = 't'; + *value = (unsigned long)kip->insns; + ret = 0; + break; + } + rcu_read_unlock(); + + return ret; +} + #ifdef CONFIG_OPTPROBES /* For optimized_kprobe buffer */ struct kprobe_insn_cache kprobe_optinsn_slots = { .mutex = __MUTEX_INITIALIZER(kprobe_optinsn_slots.mutex), .alloc = alloc_insn_page, .free = free_insn_page, + .sym = KPROBE_OPTINSN_PAGE_SYM, .pages = LIST_HEAD_INIT(kprobe_optinsn_slots.pages), /* .insn_size is initialized later */ .nr_garbage = 0, @@ -2232,6 +2267,28 @@ static void kprobe_remove_ksym_blacklist(unsigned long entry) kprobe_remove_area_blacklist(entry, entry + 1); } +int __weak arch_kprobe_get_kallsym(unsigned int *symnum, unsigned long *value, + char *type, char *sym) +{ + return -ERANGE; +} + +int kprobe_get_kallsym(unsigned int symnum, unsigned long *value, char *type, + char *sym) +{ +#ifdef __ARCH_WANT_KPROBES_INSN_SLOT + if (!kprobe_cache_get_kallsym(&kprobe_insn_slots, &symnum, value, type, sym)) + return 0; +#ifdef CONFIG_OPTPROBES + if (!kprobe_cache_get_kallsym(&kprobe_optinsn_slots, &symnum, value, type, sym)) + return 0; +#endif +#endif + if (!arch_kprobe_get_kallsym(&symnum, value, type, sym)) + return 0; + return -ERANGE; +} + int __init __weak arch_populate_kprobe_blacklist(void) { return 0; diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 1903b80db6eb..72064541bef2 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -2764,6 +2764,50 @@ void __weak arch_ftrace_trampoline_free(struct ftrace_ops *ops) { } +/* List of trace_ops that have allocated trampolines */ +static LIST_HEAD(ftrace_ops_trampoline_list); + +static void ftrace_add_trampoline_to_kallsyms(struct ftrace_ops *ops) +{ + lockdep_assert_held(&ftrace_lock); + list_add_rcu(&ops->list, &ftrace_ops_trampoline_list); +} + +static void ftrace_remove_trampoline_from_kallsyms(struct ftrace_ops *ops) +{ + lockdep_assert_held(&ftrace_lock); + list_del_rcu(&ops->list); +} + +/* + * "__builtin__ftrace" is used as a module name in /proc/kallsyms for symbols + * for pages allocated for ftrace purposes, even though "__builtin__ftrace" is + * not a module. + */ +#define FTRACE_TRAMPOLINE_MOD "__builtin__ftrace" +#define FTRACE_TRAMPOLINE_SYM "ftrace_trampoline" + +static void ftrace_trampoline_free(struct ftrace_ops *ops) +{ + if (ops && (ops->flags & FTRACE_OPS_FL_ALLOC_TRAMP) && + ops->trampoline) { + /* + * Record the text poke event before the ksymbol unregister + * event. + */ + perf_event_text_poke((void *)ops->trampoline, + (void *)ops->trampoline, + ops->trampoline_size, NULL, 0); + perf_event_ksymbol(PERF_RECORD_KSYMBOL_TYPE_OOL, + ops->trampoline, ops->trampoline_size, + true, FTRACE_TRAMPOLINE_SYM); + /* Remove from kallsyms after the perf events */ + ftrace_remove_trampoline_from_kallsyms(ops); + } + + arch_ftrace_trampoline_free(ops); +} + static void ftrace_startup_enable(int command) { if (saved_ftrace_func != ftrace_trace_function) { @@ -2934,7 +2978,7 @@ int ftrace_shutdown(struct ftrace_ops *ops, int command) synchronize_rcu_tasks(); free_ops: - arch_ftrace_trampoline_free(ops); + ftrace_trampoline_free(ops); } return 0; @@ -6178,6 +6222,27 @@ struct ftrace_mod_map { unsigned int num_funcs; }; +static int ftrace_get_trampoline_kallsym(unsigned int symnum, + unsigned long *value, char *type, + char *name, char *module_name, + int *exported) +{ + struct ftrace_ops *op; + + list_for_each_entry_rcu(op, &ftrace_ops_trampoline_list, list) { + if (!op->trampoline || symnum--) + continue; + *value = op->trampoline; + *type = 't'; + strlcpy(name, FTRACE_TRAMPOLINE_SYM, KSYM_NAME_LEN); + strlcpy(module_name, FTRACE_TRAMPOLINE_MOD, MODULE_NAME_LEN); + *exported = 0; + return 0; + } + + return -ERANGE; +} + #ifdef CONFIG_MODULES #define next_to_ftrace_page(p) container_of(p, struct ftrace_page, next) @@ -6514,6 +6579,7 @@ int ftrace_mod_get_kallsym(unsigned int symnum, unsigned long *value, { struct ftrace_mod_map *mod_map; struct ftrace_mod_func *mod_func; + int ret; preempt_disable(); list_for_each_entry_rcu(mod_map, &ftrace_mod_maps, list) { @@ -6540,8 +6606,10 @@ int ftrace_mod_get_kallsym(unsigned int symnum, unsigned long *value, WARN_ON(1); break; } + ret = ftrace_get_trampoline_kallsym(symnum, value, type, name, + module_name, exported); preempt_enable(); - return -ERANGE; + return ret; } #else @@ -6553,6 +6621,18 @@ allocate_ftrace_mod_map(struct module *mod, { return NULL; } +int ftrace_mod_get_kallsym(unsigned int symnum, unsigned long *value, + char *type, char *name, char *module_name, + int *exported) +{ + int ret; + + preempt_disable(); + ret = ftrace_get_trampoline_kallsym(symnum, value, type, name, + module_name, exported); + preempt_enable(); + return ret; +} #endif /* CONFIG_MODULES */ struct ftrace_init_func { @@ -6733,7 +6813,24 @@ void __weak arch_ftrace_update_trampoline(struct ftrace_ops *ops) static void ftrace_update_trampoline(struct ftrace_ops *ops) { + unsigned long trampoline = ops->trampoline; + arch_ftrace_update_trampoline(ops); + if (ops->trampoline && ops->trampoline != trampoline && + (ops->flags & FTRACE_OPS_FL_ALLOC_TRAMP)) { + /* Add to kallsyms before the perf events */ + ftrace_add_trampoline_to_kallsyms(ops); + perf_event_ksymbol(PERF_RECORD_KSYMBOL_TYPE_OOL, + ops->trampoline, ops->trampoline_size, false, + FTRACE_TRAMPOLINE_SYM); + /* + * Record the perf text poke event after the ksymbol register + * event. + */ + perf_event_text_poke((void *)ops->trampoline, NULL, 0, + (void *)ops->trampoline, + ops->trampoline_size); + } } void ftrace_init_trace_array(struct trace_array *tr) |