Diffstat (limited to 'drivers/iommu/intel')
-rw-r--r--  drivers/iommu/intel/Kconfig   |  11
-rw-r--r--  drivers/iommu/intel/Makefile  |   1
-rw-r--r--  drivers/iommu/intel/dmar.c    |  33
-rw-r--r--  drivers/iommu/intel/iommu.c   | 122
-rw-r--r--  drivers/iommu/intel/iommu.h   | 115
-rw-r--r--  drivers/iommu/intel/pasid.c   |  20
-rw-r--r--  drivers/iommu/intel/perfmon.c | 877
-rw-r--r--  drivers/iommu/intel/perfmon.h |  64
-rw-r--r--  drivers/iommu/intel/svm.c     |  90
9 files changed, 1244 insertions, 89 deletions
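
Before the patch body, a note on how the new counters are consumed from userspace. Each remapping unit registers its own perf PMU named after the unit (e.g. dmar0), and the format attributes added in perfmon.c place the event select in config:0-27 and the event group index in config:28-31. The following is a minimal, hypothetical consumer sketch, not part of the patch; it assumes the first unit is named dmar0 and that CPU 0 is in the PMU's cpumask, and counts the iommu_clocks event (event_group=0x0, event=0x001):

/*
 * Hypothetical userspace sketch: count iommu_clocks on the dmar0 PMU
 * via perf_event_open(2). The dynamic PMU type id is exported by the
 * perf core under /sys/bus/event_source/devices/<pmu>/type.
 */
#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <sys/syscall.h>
#include <linux/perf_event.h>

int main(void)
{
	struct perf_event_attr attr;
	uint64_t count;
	FILE *f;
	int type, fd;

	f = fopen("/sys/bus/event_source/devices/dmar0/type", "r");
	if (!f || fscanf(f, "%d", &type) != 1)
		return 1;
	fclose(f);

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.type = type;
	/* event_group=0x0 goes in config:28-31, event=0x001 in config:0-27 */
	attr.config = (0x0ULL << 28) | 0x001;

	/* System-wide (uncore-style) PMU: pid = -1, bind to one CPU. */
	fd = syscall(__NR_perf_event_open, &attr, -1, 0, -1, 0);
	if (fd < 0)
		return 1;

	ioctl(fd, PERF_EVENT_IOC_ENABLE, 0);
	sleep(1);
	read(fd, &count, sizeof(count));
	printf("iommu_clocks: %llu\n", (unsigned long long)count);
	close(fd);
	return 0;
}

With the sysfs event aliases added below, the same count should also be reachable through the perf tool, e.g. "perf stat -e dmar0/iommu_clocks/ -a sleep 1", assuming a perf build recent enough to know about dynamic PMUs.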
diff --git a/drivers/iommu/intel/Kconfig b/drivers/iommu/intel/Kconfig index b7dff5092fd2..12e1e90fdae1 100644 --- a/drivers/iommu/intel/Kconfig +++ b/drivers/iommu/intel/Kconfig @@ -96,4 +96,15 @@ config INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON passing intel_iommu=sm_on to the kernel. If not sure, please use the default value. +config INTEL_IOMMU_PERF_EVENTS + def_bool y + bool "Intel IOMMU performance events" + depends on INTEL_IOMMU && PERF_EVENTS + help + Selecting this option will enable the performance monitoring + infrastructure in the Intel IOMMU. It collects information about + key events occurring during operation of the remapping hardware, + to aid performance tuning and debug. These are available on modern + processors which support Intel VT-d 4.0 and later. + endif # INTEL_IOMMU diff --git a/drivers/iommu/intel/Makefile b/drivers/iommu/intel/Makefile index fa0dae16441c..7af3b8a4f2a0 100644 --- a/drivers/iommu/intel/Makefile +++ b/drivers/iommu/intel/Makefile @@ -6,3 +6,4 @@ obj-$(CONFIG_DMAR_PERF) += perf.o obj-$(CONFIG_INTEL_IOMMU_DEBUGFS) += debugfs.o obj-$(CONFIG_INTEL_IOMMU_SVM) += svm.o obj-$(CONFIG_IRQ_REMAP) += irq_remapping.o +obj-$(CONFIG_INTEL_IOMMU_PERF_EVENTS) += perfmon.o diff --git a/drivers/iommu/intel/dmar.c b/drivers/iommu/intel/dmar.c index b00a0ceb2d13..6acfe879589c 100644 --- a/drivers/iommu/intel/dmar.c +++ b/drivers/iommu/intel/dmar.c @@ -34,6 +34,7 @@ #include "../irq_remapping.h" #include "perf.h" #include "trace.h" +#include "perfmon.h" typedef int (*dmar_res_handler_t)(struct acpi_dmar_header *, void *); struct dmar_res_callback { @@ -427,6 +428,8 @@ static int dmar_parse_one_drhd(struct acpi_dmar_header *header, void *arg) memcpy(dmaru->hdr, header, header->length); dmaru->reg_base_addr = drhd->address; dmaru->segment = drhd->segment; + /* The size of the register set is 2 ^ N 4 KB pages. */ + dmaru->reg_size = 1UL << (drhd->size + 12); dmaru->include_all = drhd->flags & 0x1; /* BIT0: INCLUDE_ALL */ dmaru->devices = dmar_alloc_dev_scope((void *)(drhd + 1), ((void *)drhd) + drhd->header.length, @@ -956,17 +959,18 @@ static void unmap_iommu(struct intel_iommu *iommu) /** * map_iommu: map the iommu's registers * @iommu: the iommu to map - * @phys_addr: the physical address of the base resgister + * @drhd: DMA remapping hardware definition structure * * Memory map the iommu's registers. Start w/ a single page, and * possibly expand if that turns out to be insufficent. 
*/ -static int map_iommu(struct intel_iommu *iommu, u64 phys_addr) +static int map_iommu(struct intel_iommu *iommu, struct dmar_drhd_unit *drhd) { + u64 phys_addr = drhd->reg_base_addr; int map_size, err=0; iommu->reg_phys = phys_addr; - iommu->reg_size = VTD_PAGE_SIZE; + iommu->reg_size = drhd->reg_size; if (!request_mem_region(iommu->reg_phys, iommu->reg_size, iommu->name)) { pr_err("Can't reserve memory\n"); @@ -1013,6 +1017,16 @@ static int map_iommu(struct intel_iommu *iommu, u64 phys_addr) goto release; } } + + if (cap_ecmds(iommu->cap)) { + int i; + + for (i = 0; i < DMA_MAX_NUM_ECMDCAP; i++) { + iommu->ecmdcap[i] = dmar_readq(iommu->reg + DMAR_ECCAP_REG + + i * DMA_ECMD_REG_STEP); + } + } + err = 0; goto out; @@ -1050,7 +1064,7 @@ static int alloc_iommu(struct dmar_drhd_unit *drhd) } sprintf(iommu->name, "dmar%d", iommu->seq_id); - err = map_iommu(iommu, drhd->reg_base_addr); + err = map_iommu(iommu, drhd); if (err) { pr_err("Failed to map %s\n", iommu->name); goto error_free_seq_id; @@ -1103,6 +1117,9 @@ static int alloc_iommu(struct dmar_drhd_unit *drhd) if (sts & DMA_GSTS_QIES) iommu->gcmd |= DMA_GCMD_QIE; + if (alloc_iommu_pmu(iommu)) + pr_debug("Cannot alloc PMU for iommu (seq_id = %d)\n", iommu->seq_id); + raw_spin_lock_init(&iommu->register_lock); /* @@ -1127,6 +1144,8 @@ static int alloc_iommu(struct dmar_drhd_unit *drhd) err = iommu_device_register(&iommu->iommu, &intel_iommu_ops, NULL); if (err) goto err_sysfs; + + iommu_pmu_register(iommu); } drhd->iommu = iommu; @@ -1137,6 +1156,7 @@ static int alloc_iommu(struct dmar_drhd_unit *drhd) err_sysfs: iommu_device_sysfs_remove(&iommu->iommu); err_unmap: + free_iommu_pmu(iommu); unmap_iommu(iommu); error_free_seq_id: ida_free(&dmar_seq_ids, iommu->seq_id); @@ -1148,10 +1168,13 @@ error: static void free_iommu(struct intel_iommu *iommu) { if (intel_iommu_enabled && !iommu->drhd->ignored) { + iommu_pmu_unregister(iommu); iommu_device_unregister(&iommu->iommu); iommu_device_sysfs_remove(&iommu->iommu); } + free_iommu_pmu(iommu); + if (iommu->irq) { if (iommu->pr_irq) { free_irq(iommu->pr_irq, iommu); @@ -1859,6 +1882,8 @@ static inline int dmar_msi_reg(struct intel_iommu *iommu, int irq) return DMAR_FECTL_REG; else if (iommu->pr_irq == irq) return DMAR_PECTL_REG; + else if (iommu->perf_irq == irq) + return DMAR_PERFINTRCTL_REG; else BUG(); } diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 59df7e42fd53..86ee06ac058b 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -16,7 +16,6 @@ #include <linux/crash_dump.h> #include <linux/dma-direct.h> #include <linux/dmi.h> -#include <linux/intel-svm.h> #include <linux/memory.h> #include <linux/pci.h> #include <linux/pci-ats.h> @@ -30,6 +29,7 @@ #include "../iommu-sva.h" #include "pasid.h" #include "cap_audit.h" +#include "perfmon.h" #define ROOT_SIZE VTD_PAGE_SIZE #define CONTEXT_SIZE VTD_PAGE_SIZE @@ -362,12 +362,12 @@ static int __init intel_iommu_setup(char *str) } __setup("intel_iommu=", intel_iommu_setup); -void *alloc_pgtable_page(int node) +void *alloc_pgtable_page(int node, gfp_t gfp) { struct page *page; void *vaddr = NULL; - page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0); + page = alloc_pages_node(node, gfp | __GFP_ZERO, 0); if (page) vaddr = page_address(page); return vaddr; @@ -612,7 +612,7 @@ struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus, if (!alloc) return NULL; - context = alloc_pgtable_page(iommu->node); + context = alloc_pgtable_page(iommu->node, GFP_ATOMIC); if (!context) return 
NULL; @@ -908,7 +908,8 @@ pgtable_walk: #endif static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain, - unsigned long pfn, int *target_level) + unsigned long pfn, int *target_level, + gfp_t gfp) { struct dma_pte *parent, *pte; int level = agaw_to_level(domain->agaw); @@ -935,7 +936,7 @@ static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain, if (!dma_pte_present(pte)) { uint64_t pteval; - tmp_page = alloc_pgtable_page(domain->nid); + tmp_page = alloc_pgtable_page(domain->nid, gfp); if (!tmp_page) return NULL; @@ -1186,7 +1187,7 @@ static int iommu_alloc_root_entry(struct intel_iommu *iommu) { struct root_entry *root; - root = (struct root_entry *)alloc_pgtable_page(iommu->node); + root = (struct root_entry *)alloc_pgtable_page(iommu->node, GFP_ATOMIC); if (!root) { pr_err("Allocating root entry for %s failed\n", iommu->name); @@ -2150,7 +2151,8 @@ static void switch_to_super_page(struct dmar_domain *domain, while (start_pfn <= end_pfn) { if (!pte) - pte = pfn_to_dma_pte(domain, start_pfn, &level); + pte = pfn_to_dma_pte(domain, start_pfn, &level, + GFP_ATOMIC); if (dma_pte_present(pte)) { dma_pte_free_pagetable(domain, start_pfn, @@ -2172,7 +2174,8 @@ static void switch_to_super_page(struct dmar_domain *domain, static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn, - unsigned long phys_pfn, unsigned long nr_pages, int prot) + unsigned long phys_pfn, unsigned long nr_pages, int prot, + gfp_t gfp) { struct dma_pte *first_pte = NULL, *pte = NULL; unsigned int largepage_lvl = 0; @@ -2202,7 +2205,8 @@ __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn, largepage_lvl = hardware_largepage_caps(domain, iov_pfn, phys_pfn, nr_pages); - pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl); + pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl, + gfp); if (!pte) return -ENOMEM; first_pte = pte; @@ -2368,7 +2372,7 @@ static int iommu_domain_identity_map(struct dmar_domain *domain, return __domain_mapping(domain, first_vpfn, first_vpfn, last_vpfn - first_vpfn + 1, - DMA_PTE_READ|DMA_PTE_WRITE); + DMA_PTE_READ|DMA_PTE_WRITE, GFP_KERNEL); } static int md_domain_init(struct dmar_domain *domain, int guest_width); @@ -2676,7 +2680,7 @@ static int copy_context_table(struct intel_iommu *iommu, if (!old_ce) goto out; - new_ce = alloc_pgtable_page(iommu->node); + new_ce = alloc_pgtable_page(iommu->node, GFP_KERNEL); if (!new_ce) goto out_unmap; @@ -4005,7 +4009,8 @@ int __init intel_iommu_init(void) * is likely to be much lower than the overhead of synchronizing * the virtual and physical IOMMU page-tables. 
*/ - if (cap_caching_mode(iommu->cap)) { + if (cap_caching_mode(iommu->cap) && + !first_level_by_default(IOMMU_DOMAIN_DMA)) { pr_info_once("IOMMU batching disallowed due to virtualization\n"); iommu_set_dma_strict(); } @@ -4013,6 +4018,8 @@ int __init intel_iommu_init(void) intel_iommu_groups, "%s", iommu->name); iommu_device_register(&iommu->iommu, &intel_iommu_ops, NULL); + + iommu_pmu_register(iommu); } up_read(&dmar_global_lock); @@ -4136,7 +4143,7 @@ static int md_domain_init(struct dmar_domain *domain, int guest_width) domain->max_addr = 0; /* always allocate the top pgd */ - domain->pgd = alloc_pgtable_page(domain->nid); + domain->pgd = alloc_pgtable_page(domain->nid, GFP_ATOMIC); if (!domain->pgd) return -ENOMEM; domain_flush_cache(domain, domain->pgd, PAGE_SIZE); @@ -4298,7 +4305,7 @@ static int intel_iommu_map(struct iommu_domain *domain, the low bits of hpa would take us onto the next page */ size = aligned_nrpages(hpa, size); return __domain_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT, - hpa >> VTD_PAGE_SHIFT, size, prot); + hpa >> VTD_PAGE_SHIFT, size, prot, gfp); } static int intel_iommu_map_pages(struct iommu_domain *domain, @@ -4333,7 +4340,8 @@ static size_t intel_iommu_unmap(struct iommu_domain *domain, /* Cope with horrid API which requires us to unmap more than the size argument if it happens to be a large-page mapping. */ - BUG_ON(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level)); + BUG_ON(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level, + GFP_ATOMIC)); if (size < VTD_PAGE_SIZE << level_to_offset_bits(level)) size = VTD_PAGE_SIZE << level_to_offset_bits(level); @@ -4346,7 +4354,12 @@ static size_t intel_iommu_unmap(struct iommu_domain *domain, if (dmar_domain->max_addr == iova + size) dmar_domain->max_addr = iova; - iommu_iotlb_gather_add_page(domain, gather, iova, size); + /* + * We do not use page-selective IOTLB invalidation in flush queue, + * so there is no need to track page and sync iotlb. + */ + if (!iommu_iotlb_gather_queued(gather)) + iommu_iotlb_gather_add_page(domain, gather, iova, size); return size; } @@ -4392,7 +4405,8 @@ static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain, int level = 0; u64 phys = 0; - pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level); + pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level, + GFP_ATOMIC); if (pte && dma_pte_present(pte)) phys = dma_pte_addr(pte) + (iova & (BIT_MASK(level_to_offset_bits(level) + @@ -4642,8 +4656,12 @@ static int intel_iommu_enable_sva(struct device *dev) return -EINVAL; ret = iopf_queue_add_device(iommu->iopf_queue, dev); - if (!ret) - ret = iommu_register_device_fault_handler(dev, iommu_queue_iopf, dev); + if (ret) + return ret; + + ret = iommu_register_device_fault_handler(dev, iommu_queue_iopf, dev); + if (ret) + iopf_queue_remove_device(iommu->iopf_queue, dev); return ret; } @@ -4655,8 +4673,12 @@ static int intel_iommu_disable_sva(struct device *dev) int ret; ret = iommu_unregister_device_fault_handler(dev); - if (!ret) - ret = iopf_queue_remove_device(iommu->iopf_queue, dev); + if (ret) + return ret; + + ret = iopf_queue_remove_device(iommu->iopf_queue, dev); + if (ret) + iommu_register_device_fault_handler(dev, iommu_queue_iopf, dev); return ret; } @@ -5023,3 +5045,59 @@ void quirk_extra_dev_tlb_flush(struct device_domain_info *info, pasid, qdep, address, mask); } } + +#define ecmd_get_status_code(res) (((res) & 0xff) >> 1) + +/* + * Function to submit a command to the enhanced command interface. 
The + * valid enhanced command descriptions are defined in Table 47 of the + * VT-d spec. The VT-d hardware implementation may support some but not + * all commands, which can be determined by checking the Enhanced + * Command Capability Register. + * + * Return values: + * - 0: Command successful without any error; + * - Negative: software error value; + * - Nonzero positive: failure status code defined in Table 48. + */ +int ecmd_submit_sync(struct intel_iommu *iommu, u8 ecmd, u64 oa, u64 ob) +{ + unsigned long flags; + u64 res; + int ret; + + if (!cap_ecmds(iommu->cap)) + return -ENODEV; + + raw_spin_lock_irqsave(&iommu->register_lock, flags); + + res = dmar_readq(iommu->reg + DMAR_ECRSP_REG); + if (res & DMA_ECMD_ECRSP_IP) { + ret = -EBUSY; + goto err; + } + + /* + * Unconditionally write the operand B, because + * - There is no side effect if an ecmd doesn't require an + * operand B, but we set the register to some value. + * - It's not invoked in any critical path. The extra MMIO + * write doesn't bring any performance concerns. + */ + dmar_writeq(iommu->reg + DMAR_ECEO_REG, ob); + dmar_writeq(iommu->reg + DMAR_ECMD_REG, ecmd | (oa << DMA_ECMD_OA_SHIFT)); + + IOMMU_WAIT_OP(iommu, DMAR_ECRSP_REG, dmar_readq, + !(res & DMA_ECMD_ECRSP_IP), res); + + if (res & DMA_ECMD_ECRSP_IP) { + ret = -ETIMEDOUT; + goto err; + } + + ret = ecmd_get_status_code(res); +err: + raw_spin_unlock_irqrestore(&iommu->register_lock, flags); + + return ret; +} diff --git a/drivers/iommu/intel/iommu.h b/drivers/iommu/intel/iommu.h index 06e61e474856..d6df3b865812 100644 --- a/drivers/iommu/intel/iommu.h +++ b/drivers/iommu/intel/iommu.h @@ -22,6 +22,7 @@ #include <linux/ioasid.h> #include <linux/bitfield.h> #include <linux/xarray.h> +#include <linux/perf_event.h> #include <asm/cacheflush.h> #include <asm/iommu.h> @@ -125,6 +126,17 @@ #define DMAR_MTRR_PHYSMASK8_REG 0x208 #define DMAR_MTRR_PHYSBASE9_REG 0x210 #define DMAR_MTRR_PHYSMASK9_REG 0x218 +#define DMAR_PERFCAP_REG 0x300 +#define DMAR_PERFCFGOFF_REG 0x310 +#define DMAR_PERFOVFOFF_REG 0x318 +#define DMAR_PERFCNTROFF_REG 0x31c +#define DMAR_PERFINTRSTS_REG 0x324 +#define DMAR_PERFINTRCTL_REG 0x328 +#define DMAR_PERFEVNTCAP_REG 0x380 +#define DMAR_ECMD_REG 0x400 +#define DMAR_ECEO_REG 0x408 +#define DMAR_ECRSP_REG 0x410 +#define DMAR_ECCAP_REG 0x430 #define DMAR_VCCAP_REG 0xe30 /* Virtual command capability register */ #define DMAR_VCMD_REG 0xe00 /* Virtual command register */ #define DMAR_VCRSP_REG 0xe10 /* Virtual command response register */ @@ -148,6 +160,7 @@ */ #define cap_esrtps(c) (((c) >> 63) & 1) #define cap_esirtps(c) (((c) >> 62) & 1) +#define cap_ecmds(c) (((c) >> 61) & 1) #define cap_fl5lp_support(c) (((c) >> 60) & 1) #define cap_pi_support(c) (((c) >> 59) & 1) #define cap_fl1gp_support(c) (((c) >> 56) & 1) @@ -179,7 +192,8 @@ * Extended Capability Register */ -#define ecap_rps(e) (((e) >> 49) & 0x1) +#define ecap_pms(e) (((e) >> 51) & 0x1) +#define ecap_rps(e) (((e) >> 49) & 0x1) #define ecap_smpwc(e) (((e) >> 48) & 0x1) #define ecap_flts(e) (((e) >> 47) & 0x1) #define ecap_slts(e) (((e) >> 46) & 0x1) @@ -210,6 +224,22 @@ #define ecap_max_handle_mask(e) (((e) >> 20) & 0xf) #define ecap_sc_support(e) (((e) >> 7) & 0x1) /* Snooping Control */ +/* + * Decoding Perf Capability Register + */ +#define pcap_num_cntr(p) ((p) & 0xffff) +#define pcap_cntr_width(p) (((p) >> 16) & 0x7f) +#define pcap_num_event_group(p) (((p) >> 24) & 0x1f) +#define pcap_filters_mask(p) (((p) >> 32) & 0x1f) +#define pcap_interrupt(p) (((p) >> 50) & 0x1) +/* The counter 
stride is calculated as 2 ^ (x+10) bytes */ +#define pcap_cntr_stride(p) (1ULL << ((((p) >> 52) & 0x7) + 10)) + +/* + * Decoding Perf Event Capability Register + */ +#define pecap_es(p) ((p) & 0xfffffff) + /* Virtual command interface capability */ #define vccap_pasid(v) (((v) & DMA_VCS_PAS)) /* PASID allocation */ @@ -281,6 +311,26 @@ #define DMA_CCMD_SID(s) (((u64)((s) & 0xffff)) << 16) #define DMA_CCMD_DID(d) ((u64)((d) & 0xffff)) +/* ECMD_REG */ +#define DMA_MAX_NUM_ECMD 256 +#define DMA_MAX_NUM_ECMDCAP (DMA_MAX_NUM_ECMD / 64) +#define DMA_ECMD_REG_STEP 8 +#define DMA_ECMD_ENABLE 0xf0 +#define DMA_ECMD_DISABLE 0xf1 +#define DMA_ECMD_FREEZE 0xf4 +#define DMA_ECMD_UNFREEZE 0xf5 +#define DMA_ECMD_OA_SHIFT 16 +#define DMA_ECMD_ECRSP_IP 0x1 +#define DMA_ECMD_ECCAP3 3 +#define DMA_ECMD_ECCAP3_ECNTS BIT_ULL(48) +#define DMA_ECMD_ECCAP3_DCNTS BIT_ULL(49) +#define DMA_ECMD_ECCAP3_FCNTS BIT_ULL(52) +#define DMA_ECMD_ECCAP3_UFCNTS BIT_ULL(53) +#define DMA_ECMD_ECCAP3_ESSENTIAL (DMA_ECMD_ECCAP3_ECNTS | \ + DMA_ECMD_ECCAP3_DCNTS | \ + DMA_ECMD_ECCAP3_FCNTS | \ + DMA_ECMD_ECCAP3_UFCNTS) + /* FECTL_REG */ #define DMA_FECTL_IM (((u32)1) << 31) @@ -309,6 +359,9 @@ #define DMA_VCS_PAS ((u64)1) +/* PERFINTRSTS_REG */ +#define DMA_PERFINTRSTS_PIS ((u32)1) + #define IOMMU_WAIT_OP(iommu, offset, op, cond, sts) \ do { \ cycles_t start_time = get_cycles(); \ @@ -438,6 +491,11 @@ struct q_inval { int free_cnt; }; +/* Page Request Queue depth */ +#define PRQ_ORDER 4 +#define PRQ_RING_MASK ((0x1000 << PRQ_ORDER) - 0x20) +#define PRQ_DEPTH ((0x1000 << PRQ_ORDER) >> 5) + struct dmar_pci_notify_info; #ifdef CONFIG_IRQ_REMAP @@ -554,6 +612,40 @@ struct dmar_domain { iommu core */ }; +/* + * In theory, the VT-d 4.0 spec can support up to 2 ^ 16 counters. + * But in practice, there are only 14 counters for the existing + * platform. Setting the max number of counters to 64 should be good + * enough for a long time. Also, supporting more than 64 counters + * requires more extras, e.g., extra freeze and overflow registers, + * which is not necessary for now. + */ +#define IOMMU_PMU_IDX_MAX 64 + +struct iommu_pmu { + struct intel_iommu *iommu; + u32 num_cntr; /* Number of counters */ + u32 num_eg; /* Number of event group */ + u32 cntr_width; /* Counter width */ + u32 cntr_stride; /* Counter Stride */ + u32 filter; /* Bitmask of filter support */ + void __iomem *base; /* the PerfMon base address */ + void __iomem *cfg_reg; /* counter configuration base address */ + void __iomem *cntr_reg; /* counter 0 address*/ + void __iomem *overflow; /* overflow status register */ + + u64 *evcap; /* Indicates all supported events */ + u32 **cntr_evcap; /* Supported events of each counter. */ + + struct pmu pmu; + DECLARE_BITMAP(used_mask, IOMMU_PMU_IDX_MAX); + struct perf_event *event_list[IOMMU_PMU_IDX_MAX]; + unsigned char irq_name[16]; +}; + +#define IOMMU_IRQ_ID_OFFSET_PRQ (DMAR_UNITS_SUPPORTED) +#define IOMMU_IRQ_ID_OFFSET_PERF (2 * DMAR_UNITS_SUPPORTED) + struct intel_iommu { void __iomem *reg; /* Pointer to hardware regs, virtual addr */ u64 reg_phys; /* physical address of hw register set */ @@ -561,12 +653,13 @@ struct intel_iommu { u64 cap; u64 ecap; u64 vccap; + u64 ecmdcap[DMA_MAX_NUM_ECMDCAP]; u32 gcmd; /* Holds TE, EAFL. 
Don't need SRTP, SFL, WBF */ raw_spinlock_t register_lock; /* protect register handling */ int seq_id; /* sequence id of the iommu */ int agaw; /* agaw of this iommu */ int msagaw; /* max sagaw of this iommu */ - unsigned int irq, pr_irq; + unsigned int irq, pr_irq, perf_irq; u16 segment; /* PCI segment# */ unsigned char name[13]; /* Device Name */ @@ -600,6 +693,8 @@ struct intel_iommu { struct dmar_drhd_unit *drhd; void *perf_statistic; + + struct iommu_pmu *pmu; }; /* PCI domain-device relationship */ @@ -737,7 +832,7 @@ int qi_submit_sync(struct intel_iommu *iommu, struct qi_desc *desc, extern int dmar_ir_support(void); -void *alloc_pgtable_page(int node); +void *alloc_pgtable_page(int node, gfp_t gfp); void free_pgtable_page(void *vaddr); void iommu_flush_write_buffer(struct intel_iommu *iommu); struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn); @@ -756,19 +851,13 @@ struct intel_svm_dev { struct rcu_head rcu; struct device *dev; struct intel_iommu *iommu; - struct iommu_sva sva; - u32 pasid; - int users; u16 did; - u16 dev_iotlb:1; u16 sid, qdep; }; struct intel_svm { struct mmu_notifier notifier; struct mm_struct *mm; - - unsigned int flags; u32 pasid; struct list_head devs; }; @@ -800,6 +889,14 @@ extern const struct iommu_ops intel_iommu_ops; extern int intel_iommu_sm; extern int iommu_calculate_agaw(struct intel_iommu *iommu); extern int iommu_calculate_max_sagaw(struct intel_iommu *iommu); +int ecmd_submit_sync(struct intel_iommu *iommu, u8 ecmd, u64 oa, u64 ob); + +static inline bool ecmd_has_pmu_essential(struct intel_iommu *iommu) +{ + return (iommu->ecmdcap[DMA_ECMD_ECCAP3] & DMA_ECMD_ECCAP3_ESSENTIAL) == + DMA_ECMD_ECCAP3_ESSENTIAL; +} + extern int dmar_disabled; extern int intel_iommu_enabled; #else diff --git a/drivers/iommu/intel/pasid.c b/drivers/iommu/intel/pasid.c index fb3c7020028d..633e0a4a01e7 100644 --- a/drivers/iommu/intel/pasid.c +++ b/drivers/iommu/intel/pasid.c @@ -128,6 +128,9 @@ int intel_pasid_alloc_table(struct device *dev) pasid_table->max_pasid = 1 << (order + PAGE_SHIFT + 3); info->pasid_table = pasid_table; + if (!ecap_coherent(info->iommu->ecap)) + clflush_cache_range(pasid_table->table, size); + return 0; } @@ -200,7 +203,7 @@ static struct pasid_entry *intel_pasid_get_entry(struct device *dev, u32 pasid) retry: entries = get_pasid_table_from_pde(&dir[dir_index]); if (!entries) { - entries = alloc_pgtable_page(info->iommu->node); + entries = alloc_pgtable_page(info->iommu->node, GFP_ATOMIC); if (!entries) return NULL; @@ -215,6 +218,10 @@ retry: free_pgtable_page(entries); goto retry; } + if (!ecap_coherent(info->iommu->ecap)) { + clflush_cache_range(entries, VTD_PAGE_SIZE); + clflush_cache_range(&dir[dir_index].val, sizeof(*dir)); + } } return &entries[index]; @@ -365,6 +372,16 @@ static inline void pasid_set_page_snoop(struct pasid_entry *pe, bool value) } /* + * Setup No Execute Enable bit (Bit 133) of a scalable mode PASID + * entry. It is required when XD bit of the first level page table + * entry is about to be set. + */ +static inline void pasid_set_nxe(struct pasid_entry *pe) +{ + pasid_set_bits(&pe->val[2], 1 << 5, 1 << 5); +} + +/* * Setup the Page Snoop (PGSNP) field (Bit 88) of a scalable mode * PASID entry. 
*/ @@ -557,6 +574,7 @@ int intel_pasid_setup_first_level(struct intel_iommu *iommu, pasid_set_domain_id(pte, did); pasid_set_address_width(pte, iommu->agaw); pasid_set_page_snoop(pte, !!ecap_smpwc(iommu->ecap)); + pasid_set_nxe(pte); /* Setup Present and PASID Granular Transfer Type: */ pasid_set_translation_type(pte, PASID_ENTRY_PGTT_FL_ONLY); diff --git a/drivers/iommu/intel/perfmon.c b/drivers/iommu/intel/perfmon.c new file mode 100644 index 000000000000..e17d9743a0d8 --- /dev/null +++ b/drivers/iommu/intel/perfmon.c @@ -0,0 +1,877 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Support Intel IOMMU PerfMon + * Copyright(c) 2023 Intel Corporation. + */ +#define pr_fmt(fmt) "DMAR: " fmt +#define dev_fmt(fmt) pr_fmt(fmt) + +#include <linux/dmar.h> +#include "iommu.h" +#include "perfmon.h" + +PMU_FORMAT_ATTR(event, "config:0-27"); /* ES: Events Select */ +PMU_FORMAT_ATTR(event_group, "config:28-31"); /* EGI: Event Group Index */ + +static struct attribute *iommu_pmu_format_attrs[] = { + &format_attr_event_group.attr, + &format_attr_event.attr, + NULL +}; + +static struct attribute_group iommu_pmu_format_attr_group = { + .name = "format", + .attrs = iommu_pmu_format_attrs, +}; + +/* The available events are added in attr_update later */ +static struct attribute *attrs_empty[] = { + NULL +}; + +static struct attribute_group iommu_pmu_events_attr_group = { + .name = "events", + .attrs = attrs_empty, +}; + +static cpumask_t iommu_pmu_cpu_mask; + +static ssize_t +cpumask_show(struct device *dev, struct device_attribute *attr, char *buf) +{ + return cpumap_print_to_pagebuf(true, buf, &iommu_pmu_cpu_mask); +} +static DEVICE_ATTR_RO(cpumask); + +static struct attribute *iommu_pmu_cpumask_attrs[] = { + &dev_attr_cpumask.attr, + NULL +}; + +static struct attribute_group iommu_pmu_cpumask_attr_group = { + .attrs = iommu_pmu_cpumask_attrs, +}; + +static const struct attribute_group *iommu_pmu_attr_groups[] = { + &iommu_pmu_format_attr_group, + &iommu_pmu_events_attr_group, + &iommu_pmu_cpumask_attr_group, + NULL +}; + +static inline struct iommu_pmu *dev_to_iommu_pmu(struct device *dev) +{ + /* + * The perf_event creates its own dev for each PMU. + * See pmu_dev_alloc() + */ + return container_of(dev_get_drvdata(dev), struct iommu_pmu, pmu); +} + +#define IOMMU_PMU_ATTR(_name, _format, _filter) \ + PMU_FORMAT_ATTR(_name, _format); \ + \ +static struct attribute *_name##_attr[] = { \ + &format_attr_##_name.attr, \ + NULL \ +}; \ + \ +static umode_t \ +_name##_is_visible(struct kobject *kobj, struct attribute *attr, int i) \ +{ \ + struct device *dev = kobj_to_dev(kobj); \ + struct iommu_pmu *iommu_pmu = dev_to_iommu_pmu(dev); \ + \ + if (!iommu_pmu) \ + return 0; \ + return (iommu_pmu->filter & _filter) ? 
attr->mode : 0; \ +} \ + \ +static struct attribute_group _name = { \ + .name = "format", \ + .attrs = _name##_attr, \ + .is_visible = _name##_is_visible, \ +}; + +IOMMU_PMU_ATTR(filter_requester_id_en, "config1:0", IOMMU_PMU_FILTER_REQUESTER_ID); +IOMMU_PMU_ATTR(filter_domain_en, "config1:1", IOMMU_PMU_FILTER_DOMAIN); +IOMMU_PMU_ATTR(filter_pasid_en, "config1:2", IOMMU_PMU_FILTER_PASID); +IOMMU_PMU_ATTR(filter_ats_en, "config1:3", IOMMU_PMU_FILTER_ATS); +IOMMU_PMU_ATTR(filter_page_table_en, "config1:4", IOMMU_PMU_FILTER_PAGE_TABLE); +IOMMU_PMU_ATTR(filter_requester_id, "config1:16-31", IOMMU_PMU_FILTER_REQUESTER_ID); +IOMMU_PMU_ATTR(filter_domain, "config1:32-47", IOMMU_PMU_FILTER_DOMAIN); +IOMMU_PMU_ATTR(filter_pasid, "config2:0-21", IOMMU_PMU_FILTER_PASID); +IOMMU_PMU_ATTR(filter_ats, "config2:24-28", IOMMU_PMU_FILTER_ATS); +IOMMU_PMU_ATTR(filter_page_table, "config2:32-36", IOMMU_PMU_FILTER_PAGE_TABLE); + +#define iommu_pmu_en_requester_id(e) ((e) & 0x1) +#define iommu_pmu_en_domain(e) (((e) >> 1) & 0x1) +#define iommu_pmu_en_pasid(e) (((e) >> 2) & 0x1) +#define iommu_pmu_en_ats(e) (((e) >> 3) & 0x1) +#define iommu_pmu_en_page_table(e) (((e) >> 4) & 0x1) +#define iommu_pmu_get_requester_id(filter) (((filter) >> 16) & 0xffff) +#define iommu_pmu_get_domain(filter) (((filter) >> 32) & 0xffff) +#define iommu_pmu_get_pasid(filter) ((filter) & 0x3fffff) +#define iommu_pmu_get_ats(filter) (((filter) >> 24) & 0x1f) +#define iommu_pmu_get_page_table(filter) (((filter) >> 32) & 0x1f) + +#define iommu_pmu_set_filter(_name, _config, _filter, _idx, _econfig) \ +{ \ + if ((iommu_pmu->filter & _filter) && iommu_pmu_en_##_name(_econfig)) { \ + dmar_writel(iommu_pmu->cfg_reg + _idx * IOMMU_PMU_CFG_OFFSET + \ + IOMMU_PMU_CFG_SIZE + \ + (ffs(_filter) - 1) * IOMMU_PMU_CFG_FILTERS_OFFSET, \ + iommu_pmu_get_##_name(_config) | IOMMU_PMU_FILTER_EN);\ + } \ +} + +#define iommu_pmu_clear_filter(_filter, _idx) \ +{ \ + if (iommu_pmu->filter & _filter) { \ + dmar_writel(iommu_pmu->cfg_reg + _idx * IOMMU_PMU_CFG_OFFSET + \ + IOMMU_PMU_CFG_SIZE + \ + (ffs(_filter) - 1) * IOMMU_PMU_CFG_FILTERS_OFFSET, \ + 0); \ + } \ +} + +/* + * Define the event attr related functions + * Input: _name: event attr name + * _string: string of the event in sysfs + * _g_idx: event group encoding + * _event: event encoding + */ +#define IOMMU_PMU_EVENT_ATTR(_name, _string, _g_idx, _event) \ + PMU_EVENT_ATTR_STRING(_name, event_attr_##_name, _string) \ + \ +static struct attribute *_name##_attr[] = { \ + &event_attr_##_name.attr.attr, \ + NULL \ +}; \ + \ +static umode_t \ +_name##_is_visible(struct kobject *kobj, struct attribute *attr, int i) \ +{ \ + struct device *dev = kobj_to_dev(kobj); \ + struct iommu_pmu *iommu_pmu = dev_to_iommu_pmu(dev); \ + \ + if (!iommu_pmu) \ + return 0; \ + return (iommu_pmu->evcap[_g_idx] & _event) ? 
attr->mode : 0; \ +} \ + \ +static struct attribute_group _name = { \ + .name = "events", \ + .attrs = _name##_attr, \ + .is_visible = _name##_is_visible, \ +}; + +IOMMU_PMU_EVENT_ATTR(iommu_clocks, "event_group=0x0,event=0x001", 0x0, 0x001) +IOMMU_PMU_EVENT_ATTR(iommu_requests, "event_group=0x0,event=0x002", 0x0, 0x002) +IOMMU_PMU_EVENT_ATTR(pw_occupancy, "event_group=0x0,event=0x004", 0x0, 0x004) +IOMMU_PMU_EVENT_ATTR(ats_blocked, "event_group=0x0,event=0x008", 0x0, 0x008) +IOMMU_PMU_EVENT_ATTR(iommu_mrds, "event_group=0x1,event=0x001", 0x1, 0x001) +IOMMU_PMU_EVENT_ATTR(iommu_mem_blocked, "event_group=0x1,event=0x020", 0x1, 0x020) +IOMMU_PMU_EVENT_ATTR(pg_req_posted, "event_group=0x1,event=0x040", 0x1, 0x040) +IOMMU_PMU_EVENT_ATTR(ctxt_cache_lookup, "event_group=0x2,event=0x001", 0x2, 0x001) +IOMMU_PMU_EVENT_ATTR(ctxt_cache_hit, "event_group=0x2,event=0x002", 0x2, 0x002) +IOMMU_PMU_EVENT_ATTR(pasid_cache_lookup, "event_group=0x2,event=0x004", 0x2, 0x004) +IOMMU_PMU_EVENT_ATTR(pasid_cache_hit, "event_group=0x2,event=0x008", 0x2, 0x008) +IOMMU_PMU_EVENT_ATTR(ss_nonleaf_lookup, "event_group=0x2,event=0x010", 0x2, 0x010) +IOMMU_PMU_EVENT_ATTR(ss_nonleaf_hit, "event_group=0x2,event=0x020", 0x2, 0x020) +IOMMU_PMU_EVENT_ATTR(fs_nonleaf_lookup, "event_group=0x2,event=0x040", 0x2, 0x040) +IOMMU_PMU_EVENT_ATTR(fs_nonleaf_hit, "event_group=0x2,event=0x080", 0x2, 0x080) +IOMMU_PMU_EVENT_ATTR(hpt_nonleaf_lookup, "event_group=0x2,event=0x100", 0x2, 0x100) +IOMMU_PMU_EVENT_ATTR(hpt_nonleaf_hit, "event_group=0x2,event=0x200", 0x2, 0x200) +IOMMU_PMU_EVENT_ATTR(iotlb_lookup, "event_group=0x3,event=0x001", 0x3, 0x001) +IOMMU_PMU_EVENT_ATTR(iotlb_hit, "event_group=0x3,event=0x002", 0x3, 0x002) +IOMMU_PMU_EVENT_ATTR(hpt_leaf_lookup, "event_group=0x3,event=0x004", 0x3, 0x004) +IOMMU_PMU_EVENT_ATTR(hpt_leaf_hit, "event_group=0x3,event=0x008", 0x3, 0x008) +IOMMU_PMU_EVENT_ATTR(int_cache_lookup, "event_group=0x4,event=0x001", 0x4, 0x001) +IOMMU_PMU_EVENT_ATTR(int_cache_hit_nonposted, "event_group=0x4,event=0x002", 0x4, 0x002) +IOMMU_PMU_EVENT_ATTR(int_cache_hit_posted, "event_group=0x4,event=0x004", 0x4, 0x004) + +static const struct attribute_group *iommu_pmu_attr_update[] = { + &filter_requester_id_en, + &filter_domain_en, + &filter_pasid_en, + &filter_ats_en, + &filter_page_table_en, + &filter_requester_id, + &filter_domain, + &filter_pasid, + &filter_ats, + &filter_page_table, + &iommu_clocks, + &iommu_requests, + &pw_occupancy, + &ats_blocked, + &iommu_mrds, + &iommu_mem_blocked, + &pg_req_posted, + &ctxt_cache_lookup, + &ctxt_cache_hit, + &pasid_cache_lookup, + &pasid_cache_hit, + &ss_nonleaf_lookup, + &ss_nonleaf_hit, + &fs_nonleaf_lookup, + &fs_nonleaf_hit, + &hpt_nonleaf_lookup, + &hpt_nonleaf_hit, + &iotlb_lookup, + &iotlb_hit, + &hpt_leaf_lookup, + &hpt_leaf_hit, + &int_cache_lookup, + &int_cache_hit_nonposted, + &int_cache_hit_posted, + NULL +}; + +static inline void __iomem * +iommu_event_base(struct iommu_pmu *iommu_pmu, int idx) +{ + return iommu_pmu->cntr_reg + idx * iommu_pmu->cntr_stride; +} + +static inline void __iomem * +iommu_config_base(struct iommu_pmu *iommu_pmu, int idx) +{ + return iommu_pmu->cfg_reg + idx * IOMMU_PMU_CFG_OFFSET; +} + +static inline struct iommu_pmu *iommu_event_to_pmu(struct perf_event *event) +{ + return container_of(event->pmu, struct iommu_pmu, pmu); +} + +static inline u64 iommu_event_config(struct perf_event *event) +{ + u64 config = event->attr.config; + + return (iommu_event_select(config) << IOMMU_EVENT_CFG_ES_SHIFT) | + (iommu_event_group(config) << 
IOMMU_EVENT_CFG_EGI_SHIFT) | + IOMMU_EVENT_CFG_INT; +} + +static inline bool is_iommu_pmu_event(struct iommu_pmu *iommu_pmu, + struct perf_event *event) +{ + return event->pmu == &iommu_pmu->pmu; +} + +static int iommu_pmu_validate_event(struct perf_event *event) +{ + struct iommu_pmu *iommu_pmu = iommu_event_to_pmu(event); + u32 event_group = iommu_event_group(event->attr.config); + + if (event_group >= iommu_pmu->num_eg) + return -EINVAL; + + return 0; +} + +static int iommu_pmu_validate_group(struct perf_event *event) +{ + struct iommu_pmu *iommu_pmu = iommu_event_to_pmu(event); + struct perf_event *sibling; + int nr = 0; + + /* + * All events in a group must be scheduled simultaneously. + * Check whether there is enough counters for all the events. + */ + for_each_sibling_event(sibling, event->group_leader) { + if (!is_iommu_pmu_event(iommu_pmu, sibling) || + sibling->state <= PERF_EVENT_STATE_OFF) + continue; + + if (++nr > iommu_pmu->num_cntr) + return -EINVAL; + } + + return 0; +} + +static int iommu_pmu_event_init(struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + + if (event->attr.type != event->pmu->type) + return -ENOENT; + + /* sampling not supported */ + if (event->attr.sample_period) + return -EINVAL; + + if (event->cpu < 0) + return -EINVAL; + + if (iommu_pmu_validate_event(event)) + return -EINVAL; + + hwc->config = iommu_event_config(event); + + return iommu_pmu_validate_group(event); +} + +static void iommu_pmu_event_update(struct perf_event *event) +{ + struct iommu_pmu *iommu_pmu = iommu_event_to_pmu(event); + struct hw_perf_event *hwc = &event->hw; + u64 prev_count, new_count, delta; + int shift = 64 - iommu_pmu->cntr_width; + +again: + prev_count = local64_read(&hwc->prev_count); + new_count = dmar_readq(iommu_event_base(iommu_pmu, hwc->idx)); + if (local64_xchg(&hwc->prev_count, new_count) != prev_count) + goto again; + + /* + * The counter width is enumerated. Always shift the counter + * before using it. + */ + delta = (new_count << shift) - (prev_count << shift); + delta >>= shift; + + local64_add(delta, &event->count); +} + +static void iommu_pmu_start(struct perf_event *event, int flags) +{ + struct iommu_pmu *iommu_pmu = iommu_event_to_pmu(event); + struct intel_iommu *iommu = iommu_pmu->iommu; + struct hw_perf_event *hwc = &event->hw; + u64 count; + + if (WARN_ON_ONCE(!(hwc->state & PERF_HES_STOPPED))) + return; + + if (WARN_ON_ONCE(hwc->idx < 0 || hwc->idx >= IOMMU_PMU_IDX_MAX)) + return; + + if (flags & PERF_EF_RELOAD) + WARN_ON_ONCE(!(event->hw.state & PERF_HES_UPTODATE)); + + hwc->state = 0; + + /* Always reprogram the period */ + count = dmar_readq(iommu_event_base(iommu_pmu, hwc->idx)); + local64_set((&hwc->prev_count), count); + + /* + * The error of ecmd will be ignored. + * - The existing perf_event subsystem doesn't handle the error. + * Only IOMMU PMU returns runtime HW error. We don't want to + * change the existing generic interfaces for the specific case. + * - It's a corner case caused by HW, which is very unlikely to + * happen. There is nothing SW can do. + * - The worst case is that the user will get <not count> with + * perf command, which can give the user some hints. 
+ */ + ecmd_submit_sync(iommu, DMA_ECMD_ENABLE, hwc->idx, 0); + + perf_event_update_userpage(event); +} + +static void iommu_pmu_stop(struct perf_event *event, int flags) +{ + struct iommu_pmu *iommu_pmu = iommu_event_to_pmu(event); + struct intel_iommu *iommu = iommu_pmu->iommu; + struct hw_perf_event *hwc = &event->hw; + + if (!(hwc->state & PERF_HES_STOPPED)) { + ecmd_submit_sync(iommu, DMA_ECMD_DISABLE, hwc->idx, 0); + + iommu_pmu_event_update(event); + + hwc->state |= PERF_HES_STOPPED | PERF_HES_UPTODATE; + } +} + +static inline int +iommu_pmu_validate_per_cntr_event(struct iommu_pmu *iommu_pmu, + int idx, struct perf_event *event) +{ + u32 event_group = iommu_event_group(event->attr.config); + u32 select = iommu_event_select(event->attr.config); + + if (!(iommu_pmu->cntr_evcap[idx][event_group] & select)) + return -EINVAL; + + return 0; +} + +static int iommu_pmu_assign_event(struct iommu_pmu *iommu_pmu, + struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + int idx; + + /* + * The counters which support limited events are usually at the end. + * Schedule them first to accommodate more events. + */ + for (idx = iommu_pmu->num_cntr - 1; idx >= 0; idx--) { + if (test_and_set_bit(idx, iommu_pmu->used_mask)) + continue; + /* Check per-counter event capabilities */ + if (!iommu_pmu_validate_per_cntr_event(iommu_pmu, idx, event)) + break; + clear_bit(idx, iommu_pmu->used_mask); + } + if (idx < 0) + return -EINVAL; + + iommu_pmu->event_list[idx] = event; + hwc->idx = idx; + + /* config events */ + dmar_writeq(iommu_config_base(iommu_pmu, idx), hwc->config); + + iommu_pmu_set_filter(requester_id, event->attr.config1, + IOMMU_PMU_FILTER_REQUESTER_ID, idx, + event->attr.config1); + iommu_pmu_set_filter(domain, event->attr.config1, + IOMMU_PMU_FILTER_DOMAIN, idx, + event->attr.config1); + iommu_pmu_set_filter(pasid, event->attr.config1, + IOMMU_PMU_FILTER_PASID, idx, + event->attr.config1); + iommu_pmu_set_filter(ats, event->attr.config2, + IOMMU_PMU_FILTER_ATS, idx, + event->attr.config1); + iommu_pmu_set_filter(page_table, event->attr.config2, + IOMMU_PMU_FILTER_PAGE_TABLE, idx, + event->attr.config1); + + return 0; +} + +static int iommu_pmu_add(struct perf_event *event, int flags) +{ + struct iommu_pmu *iommu_pmu = iommu_event_to_pmu(event); + struct hw_perf_event *hwc = &event->hw; + int ret; + + ret = iommu_pmu_assign_event(iommu_pmu, event); + if (ret < 0) + return ret; + + hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED; + + if (flags & PERF_EF_START) + iommu_pmu_start(event, 0); + + return 0; +} + +static void iommu_pmu_del(struct perf_event *event, int flags) +{ + struct iommu_pmu *iommu_pmu = iommu_event_to_pmu(event); + int idx = event->hw.idx; + + iommu_pmu_stop(event, PERF_EF_UPDATE); + + iommu_pmu_clear_filter(IOMMU_PMU_FILTER_REQUESTER_ID, idx); + iommu_pmu_clear_filter(IOMMU_PMU_FILTER_DOMAIN, idx); + iommu_pmu_clear_filter(IOMMU_PMU_FILTER_PASID, idx); + iommu_pmu_clear_filter(IOMMU_PMU_FILTER_ATS, idx); + iommu_pmu_clear_filter(IOMMU_PMU_FILTER_PAGE_TABLE, idx); + + iommu_pmu->event_list[idx] = NULL; + event->hw.idx = -1; + clear_bit(idx, iommu_pmu->used_mask); + + perf_event_update_userpage(event); +} + +static void iommu_pmu_enable(struct pmu *pmu) +{ + struct iommu_pmu *iommu_pmu = container_of(pmu, struct iommu_pmu, pmu); + struct intel_iommu *iommu = iommu_pmu->iommu; + + ecmd_submit_sync(iommu, DMA_ECMD_UNFREEZE, 0, 0); +} + +static void iommu_pmu_disable(struct pmu *pmu) +{ + struct iommu_pmu *iommu_pmu = container_of(pmu, struct iommu_pmu, pmu); 
+ struct intel_iommu *iommu = iommu_pmu->iommu; + + ecmd_submit_sync(iommu, DMA_ECMD_FREEZE, 0, 0); +} + +static void iommu_pmu_counter_overflow(struct iommu_pmu *iommu_pmu) +{ + struct perf_event *event; + u64 status; + int i; + + /* + * Two counters may be overflowed very close. Always check + * whether there are more to handle. + */ + while ((status = dmar_readq(iommu_pmu->overflow))) { + for_each_set_bit(i, (unsigned long *)&status, iommu_pmu->num_cntr) { + /* + * Find the assigned event of the counter. + * Accumulate the value into the event->count. + */ + event = iommu_pmu->event_list[i]; + if (!event) { + pr_warn_once("Cannot find the assigned event for counter %d\n", i); + continue; + } + iommu_pmu_event_update(event); + } + + dmar_writeq(iommu_pmu->overflow, status); + } +} + +static irqreturn_t iommu_pmu_irq_handler(int irq, void *dev_id) +{ + struct intel_iommu *iommu = dev_id; + + if (!dmar_readl(iommu->reg + DMAR_PERFINTRSTS_REG)) + return IRQ_NONE; + + iommu_pmu_counter_overflow(iommu->pmu); + + /* Clear the status bit */ + dmar_writel(iommu->reg + DMAR_PERFINTRSTS_REG, DMA_PERFINTRSTS_PIS); + + return IRQ_HANDLED; +} + +static int __iommu_pmu_register(struct intel_iommu *iommu) +{ + struct iommu_pmu *iommu_pmu = iommu->pmu; + + iommu_pmu->pmu.name = iommu->name; + iommu_pmu->pmu.task_ctx_nr = perf_invalid_context; + iommu_pmu->pmu.event_init = iommu_pmu_event_init; + iommu_pmu->pmu.pmu_enable = iommu_pmu_enable; + iommu_pmu->pmu.pmu_disable = iommu_pmu_disable; + iommu_pmu->pmu.add = iommu_pmu_add; + iommu_pmu->pmu.del = iommu_pmu_del; + iommu_pmu->pmu.start = iommu_pmu_start; + iommu_pmu->pmu.stop = iommu_pmu_stop; + iommu_pmu->pmu.read = iommu_pmu_event_update; + iommu_pmu->pmu.attr_groups = iommu_pmu_attr_groups; + iommu_pmu->pmu.attr_update = iommu_pmu_attr_update; + iommu_pmu->pmu.capabilities = PERF_PMU_CAP_NO_EXCLUDE; + iommu_pmu->pmu.module = THIS_MODULE; + + return perf_pmu_register(&iommu_pmu->pmu, iommu_pmu->pmu.name, -1); +} + +static inline void __iomem * +get_perf_reg_address(struct intel_iommu *iommu, u32 offset) +{ + u32 off = dmar_readl(iommu->reg + offset); + + return iommu->reg + off; +} + +int alloc_iommu_pmu(struct intel_iommu *iommu) +{ + struct iommu_pmu *iommu_pmu; + int i, j, ret; + u64 perfcap; + u32 cap; + + if (!ecap_pms(iommu->ecap)) + return 0; + + /* The IOMMU PMU requires the ECMD support as well */ + if (!cap_ecmds(iommu->cap)) + return -ENODEV; + + perfcap = dmar_readq(iommu->reg + DMAR_PERFCAP_REG); + /* The performance monitoring is not supported. 
*/ + if (!perfcap) + return -ENODEV; + + /* Sanity check for the number of the counters and event groups */ + if (!pcap_num_cntr(perfcap) || !pcap_num_event_group(perfcap)) + return -ENODEV; + + /* The interrupt on overflow is required */ + if (!pcap_interrupt(perfcap)) + return -ENODEV; + + /* Check required Enhanced Command Capability */ + if (!ecmd_has_pmu_essential(iommu)) + return -ENODEV; + + iommu_pmu = kzalloc(sizeof(*iommu_pmu), GFP_KERNEL); + if (!iommu_pmu) + return -ENOMEM; + + iommu_pmu->num_cntr = pcap_num_cntr(perfcap); + if (iommu_pmu->num_cntr > IOMMU_PMU_IDX_MAX) { + pr_warn_once("The number of IOMMU counters %d > max(%d), clipping!", + iommu_pmu->num_cntr, IOMMU_PMU_IDX_MAX); + iommu_pmu->num_cntr = IOMMU_PMU_IDX_MAX; + } + + iommu_pmu->cntr_width = pcap_cntr_width(perfcap); + iommu_pmu->filter = pcap_filters_mask(perfcap); + iommu_pmu->cntr_stride = pcap_cntr_stride(perfcap); + iommu_pmu->num_eg = pcap_num_event_group(perfcap); + + iommu_pmu->evcap = kcalloc(iommu_pmu->num_eg, sizeof(u64), GFP_KERNEL); + if (!iommu_pmu->evcap) { + ret = -ENOMEM; + goto free_pmu; + } + + /* Parse event group capabilities */ + for (i = 0; i < iommu_pmu->num_eg; i++) { + u64 pcap; + + pcap = dmar_readq(iommu->reg + DMAR_PERFEVNTCAP_REG + + i * IOMMU_PMU_CAP_REGS_STEP); + iommu_pmu->evcap[i] = pecap_es(pcap); + } + + iommu_pmu->cntr_evcap = kcalloc(iommu_pmu->num_cntr, sizeof(u32 *), GFP_KERNEL); + if (!iommu_pmu->cntr_evcap) { + ret = -ENOMEM; + goto free_pmu_evcap; + } + for (i = 0; i < iommu_pmu->num_cntr; i++) { + iommu_pmu->cntr_evcap[i] = kcalloc(iommu_pmu->num_eg, sizeof(u32), GFP_KERNEL); + if (!iommu_pmu->cntr_evcap[i]) { + ret = -ENOMEM; + goto free_pmu_cntr_evcap; + } + /* + * Set to the global capabilities, will adjust according + * to per-counter capabilities later. + */ + for (j = 0; j < iommu_pmu->num_eg; j++) + iommu_pmu->cntr_evcap[i][j] = (u32)iommu_pmu->evcap[j]; + } + + iommu_pmu->cfg_reg = get_perf_reg_address(iommu, DMAR_PERFCFGOFF_REG); + iommu_pmu->cntr_reg = get_perf_reg_address(iommu, DMAR_PERFCNTROFF_REG); + iommu_pmu->overflow = get_perf_reg_address(iommu, DMAR_PERFOVFOFF_REG); + + /* + * Check per-counter capabilities. All counters should have the + * same capabilities on Interrupt on Overflow Support and Counter + * Width. + */ + for (i = 0; i < iommu_pmu->num_cntr; i++) { + cap = dmar_readl(iommu_pmu->cfg_reg + + i * IOMMU_PMU_CFG_OFFSET + + IOMMU_PMU_CFG_CNTRCAP_OFFSET); + if (!iommu_cntrcap_pcc(cap)) + continue; + + /* + * It's possible that some counters have a different + * capability because of e.g., HW bug. Check the corner + * case here and simply drop those counters. + */ + if ((iommu_cntrcap_cw(cap) != iommu_pmu->cntr_width) || + !iommu_cntrcap_ios(cap)) { + iommu_pmu->num_cntr = i; + pr_warn("PMU counter capability inconsistent, counter number reduced to %d\n", + iommu_pmu->num_cntr); + } + + /* Clear the pre-defined events group */ + for (j = 0; j < iommu_pmu->num_eg; j++) + iommu_pmu->cntr_evcap[i][j] = 0; + + /* Override with per-counter event capabilities */ + for (j = 0; j < iommu_cntrcap_egcnt(cap); j++) { + cap = dmar_readl(iommu_pmu->cfg_reg + i * IOMMU_PMU_CFG_OFFSET + + IOMMU_PMU_CFG_CNTREVCAP_OFFSET + + (j * IOMMU_PMU_OFF_REGS_STEP)); + iommu_pmu->cntr_evcap[i][iommu_event_group(cap)] = iommu_event_select(cap); + /* + * Some events may only be supported by a specific counter. + * Track them in the evcap as well. 
+ */ + iommu_pmu->evcap[iommu_event_group(cap)] |= iommu_event_select(cap); + } + } + + iommu_pmu->iommu = iommu; + iommu->pmu = iommu_pmu; + + return 0; + +free_pmu_cntr_evcap: + for (i = 0; i < iommu_pmu->num_cntr; i++) + kfree(iommu_pmu->cntr_evcap[i]); + kfree(iommu_pmu->cntr_evcap); +free_pmu_evcap: + kfree(iommu_pmu->evcap); +free_pmu: + kfree(iommu_pmu); + + return ret; +} + +void free_iommu_pmu(struct intel_iommu *iommu) +{ + struct iommu_pmu *iommu_pmu = iommu->pmu; + + if (!iommu_pmu) + return; + + if (iommu_pmu->evcap) { + int i; + + for (i = 0; i < iommu_pmu->num_cntr; i++) + kfree(iommu_pmu->cntr_evcap[i]); + kfree(iommu_pmu->cntr_evcap); + } + kfree(iommu_pmu->evcap); + kfree(iommu_pmu); + iommu->pmu = NULL; +} + +static int iommu_pmu_set_interrupt(struct intel_iommu *iommu) +{ + struct iommu_pmu *iommu_pmu = iommu->pmu; + int irq, ret; + + irq = dmar_alloc_hwirq(IOMMU_IRQ_ID_OFFSET_PERF + iommu->seq_id, iommu->node, iommu); + if (irq <= 0) + return -EINVAL; + + snprintf(iommu_pmu->irq_name, sizeof(iommu_pmu->irq_name), "dmar%d-perf", iommu->seq_id); + + iommu->perf_irq = irq; + ret = request_threaded_irq(irq, NULL, iommu_pmu_irq_handler, + IRQF_ONESHOT, iommu_pmu->irq_name, iommu); + if (ret) { + dmar_free_hwirq(irq); + iommu->perf_irq = 0; + return ret; + } + return 0; +} + +static void iommu_pmu_unset_interrupt(struct intel_iommu *iommu) +{ + if (!iommu->perf_irq) + return; + + free_irq(iommu->perf_irq, iommu); + dmar_free_hwirq(iommu->perf_irq); + iommu->perf_irq = 0; +} + +static int iommu_pmu_cpu_online(unsigned int cpu) +{ + if (cpumask_empty(&iommu_pmu_cpu_mask)) + cpumask_set_cpu(cpu, &iommu_pmu_cpu_mask); + + return 0; +} + +static int iommu_pmu_cpu_offline(unsigned int cpu) +{ + struct dmar_drhd_unit *drhd; + struct intel_iommu *iommu; + int target; + + if (!cpumask_test_and_clear_cpu(cpu, &iommu_pmu_cpu_mask)) + return 0; + + target = cpumask_any_but(cpu_online_mask, cpu); + + if (target < nr_cpu_ids) + cpumask_set_cpu(target, &iommu_pmu_cpu_mask); + else + target = -1; + + rcu_read_lock(); + + for_each_iommu(iommu, drhd) { + if (!iommu->pmu) + continue; + perf_pmu_migrate_context(&iommu->pmu->pmu, cpu, target); + } + rcu_read_unlock(); + + return 0; +} + +static int nr_iommu_pmu; + +static int iommu_pmu_cpuhp_setup(struct iommu_pmu *iommu_pmu) +{ + int ret; + + if (nr_iommu_pmu++) + return 0; + + ret = cpuhp_setup_state(CPUHP_AP_PERF_X86_IOMMU_PERF_ONLINE, + "driver/iommu/intel/perfmon:online", + iommu_pmu_cpu_online, + iommu_pmu_cpu_offline); + if (ret) + nr_iommu_pmu = 0; + + return ret; +} + +static void iommu_pmu_cpuhp_free(struct iommu_pmu *iommu_pmu) +{ + if (--nr_iommu_pmu) + return; + + cpuhp_remove_state(CPUHP_AP_PERF_X86_IOMMU_PERF_ONLINE); +} + +void iommu_pmu_register(struct intel_iommu *iommu) +{ + struct iommu_pmu *iommu_pmu = iommu->pmu; + + if (!iommu_pmu) + return; + + if (__iommu_pmu_register(iommu)) + goto err; + + if (iommu_pmu_cpuhp_setup(iommu_pmu)) + goto unregister; + + /* Set interrupt for overflow */ + if (iommu_pmu_set_interrupt(iommu)) + goto cpuhp_free; + + return; + +cpuhp_free: + iommu_pmu_cpuhp_free(iommu_pmu); +unregister: + perf_pmu_unregister(&iommu_pmu->pmu); +err: + pr_err("Failed to register PMU for iommu (seq_id = %d)\n", iommu->seq_id); + free_iommu_pmu(iommu); +} + +void iommu_pmu_unregister(struct intel_iommu *iommu) +{ + struct iommu_pmu *iommu_pmu = iommu->pmu; + + if (!iommu_pmu) + return; + + iommu_pmu_unset_interrupt(iommu); + iommu_pmu_cpuhp_free(iommu_pmu); + perf_pmu_unregister(&iommu_pmu->pmu); +} diff --git 
a/drivers/iommu/intel/perfmon.h b/drivers/iommu/intel/perfmon.h new file mode 100644 index 000000000000..58606af9a2b9 --- /dev/null +++ b/drivers/iommu/intel/perfmon.h @@ -0,0 +1,64 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +/* + * PERFCFGOFF_REG, PERFFRZOFF_REG + * PERFOVFOFF_REG, PERFCNTROFF_REG + */ +#define IOMMU_PMU_NUM_OFF_REGS 4 +#define IOMMU_PMU_OFF_REGS_STEP 4 + +#define IOMMU_PMU_FILTER_REQUESTER_ID 0x01 +#define IOMMU_PMU_FILTER_DOMAIN 0x02 +#define IOMMU_PMU_FILTER_PASID 0x04 +#define IOMMU_PMU_FILTER_ATS 0x08 +#define IOMMU_PMU_FILTER_PAGE_TABLE 0x10 + +#define IOMMU_PMU_FILTER_EN BIT(31) + +#define IOMMU_PMU_CFG_OFFSET 0x100 +#define IOMMU_PMU_CFG_CNTRCAP_OFFSET 0x80 +#define IOMMU_PMU_CFG_CNTREVCAP_OFFSET 0x84 +#define IOMMU_PMU_CFG_SIZE 0x8 +#define IOMMU_PMU_CFG_FILTERS_OFFSET 0x4 + +#define IOMMU_PMU_CAP_REGS_STEP 8 + +#define iommu_cntrcap_pcc(p) ((p) & 0x1) +#define iommu_cntrcap_cw(p) (((p) >> 8) & 0xff) +#define iommu_cntrcap_ios(p) (((p) >> 16) & 0x1) +#define iommu_cntrcap_egcnt(p) (((p) >> 28) & 0xf) + +#define IOMMU_EVENT_CFG_EGI_SHIFT 8 +#define IOMMU_EVENT_CFG_ES_SHIFT 32 +#define IOMMU_EVENT_CFG_INT BIT_ULL(1) + +#define iommu_event_select(p) ((p) & 0xfffffff) +#define iommu_event_group(p) (((p) >> 28) & 0xf) + +#ifdef CONFIG_INTEL_IOMMU_PERF_EVENTS +int alloc_iommu_pmu(struct intel_iommu *iommu); +void free_iommu_pmu(struct intel_iommu *iommu); +void iommu_pmu_register(struct intel_iommu *iommu); +void iommu_pmu_unregister(struct intel_iommu *iommu); +#else +static inline int +alloc_iommu_pmu(struct intel_iommu *iommu) +{ + return 0; +} + +static inline void +free_iommu_pmu(struct intel_iommu *iommu) +{ +} + +static inline void +iommu_pmu_register(struct intel_iommu *iommu) +{ +} + +static inline void +iommu_pmu_unregister(struct intel_iommu *iommu) +{ +} +#endif /* CONFIG_INTEL_IOMMU_PERF_EVENTS */ diff --git a/drivers/iommu/intel/svm.c b/drivers/iommu/intel/svm.c index c76b66263467..7367f56c3bad 100644 --- a/drivers/iommu/intel/svm.c +++ b/drivers/iommu/intel/svm.c @@ -9,7 +9,6 @@ #include <linux/sched.h> #include <linux/sched/mm.h> #include <linux/slab.h> -#include <linux/intel-svm.h> #include <linux/rculist.h> #include <linux/pci.h> #include <linux/pci-ats.h> @@ -79,7 +78,7 @@ int intel_svm_enable_prq(struct intel_iommu *iommu) } iommu->prq = page_address(pages); - irq = dmar_alloc_hwirq(DMAR_UNITS_SUPPORTED + iommu->seq_id, iommu->node, iommu); + irq = dmar_alloc_hwirq(IOMMU_IRQ_ID_OFFSET_PRQ + iommu->seq_id, iommu->node, iommu); if (irq <= 0) { pr_err("IOMMU: %s: Failed to create IRQ vector for page request queue\n", iommu->name); @@ -299,9 +298,8 @@ out: return 0; } -static struct iommu_sva *intel_svm_bind_mm(struct intel_iommu *iommu, - struct device *dev, - struct mm_struct *mm) +static int intel_svm_bind_mm(struct intel_iommu *iommu, struct device *dev, + struct mm_struct *mm) { struct device_domain_info *info = dev_iommu_priv_get(dev); struct intel_svm_dev *sdev; @@ -313,7 +311,7 @@ static struct iommu_sva *intel_svm_bind_mm(struct intel_iommu *iommu, if (!svm) { svm = kzalloc(sizeof(*svm), GFP_KERNEL); if (!svm) - return ERR_PTR(-ENOMEM); + return -ENOMEM; svm->pasid = mm->pasid; svm->mm = mm; @@ -323,24 +321,17 @@ static struct iommu_sva *intel_svm_bind_mm(struct intel_iommu *iommu, ret = mmu_notifier_register(&svm->notifier, mm); if (ret) { kfree(svm); - return ERR_PTR(ret); + return ret; } ret = pasid_private_add(svm->pasid, svm); if (ret) { mmu_notifier_unregister(&svm->notifier, mm); kfree(svm); - return ERR_PTR(ret); + return ret; } } - 
/* Find the matching device in svm list */ - sdev = svm_lookup_device_by_dev(svm, dev); - if (sdev) { - sdev->users++; - goto success; - } - sdev = kzalloc(sizeof(*sdev), GFP_KERNEL); if (!sdev) { ret = -ENOMEM; @@ -351,12 +342,8 @@ static struct iommu_sva *intel_svm_bind_mm(struct intel_iommu *iommu, sdev->iommu = iommu; sdev->did = FLPT_DEFAULT_DID; sdev->sid = PCI_DEVID(info->bus, info->devfn); - sdev->users = 1; - sdev->pasid = svm->pasid; - sdev->sva.dev = dev; init_rcu_head(&sdev->rcu); if (info->ats_enabled) { - sdev->dev_iotlb = 1; sdev->qdep = info->ats_qdep; if (sdev->qdep >= QI_DEV_EIOTLB_MAX_INVS) sdev->qdep = 0; @@ -370,8 +357,8 @@ static struct iommu_sva *intel_svm_bind_mm(struct intel_iommu *iommu, goto free_sdev; list_add_rcu(&sdev->list, &svm->devs); -success: - return &sdev->sva; + + return 0; free_sdev: kfree(sdev); @@ -382,7 +369,7 @@ free_svm: kfree(svm); } - return ERR_PTR(ret); + return ret; } /* Caller must hold pasid_mutex */ @@ -404,32 +391,32 @@ static int intel_svm_unbind_mm(struct device *dev, u32 pasid) mm = svm->mm; if (sdev) { - sdev->users--; - if (!sdev->users) { - list_del_rcu(&sdev->list); - /* Flush the PASID cache and IOTLB for this device. - * Note that we do depend on the hardware *not* using - * the PASID any more. Just as we depend on other - * devices never using PASIDs that they have no right - * to use. We have a *shared* PASID table, because it's - * large and has to be physically contiguous. So it's - * hard to be as defensive as we might like. */ - intel_pasid_tear_down_entry(iommu, dev, - svm->pasid, false); - intel_svm_drain_prq(dev, svm->pasid); - kfree_rcu(sdev, rcu); - - if (list_empty(&svm->devs)) { - if (svm->notifier.ops) - mmu_notifier_unregister(&svm->notifier, mm); - pasid_private_remove(svm->pasid); - /* We mandate that no page faults may be outstanding - * for the PASID when intel_svm_unbind_mm() is called. - * If that is not obeyed, subtle errors will happen. - * Let's make them less subtle... */ - memset(svm, 0x6b, sizeof(*svm)); - kfree(svm); - } + list_del_rcu(&sdev->list); + /* + * Flush the PASID cache and IOTLB for this device. + * Note that we do depend on the hardware *not* using + * the PASID any more. Just as we depend on other + * devices never using PASIDs that they have no right + * to use. We have a *shared* PASID table, because it's + * large and has to be physically contiguous. So it's + * hard to be as defensive as we might like. + */ + intel_pasid_tear_down_entry(iommu, dev, svm->pasid, false); + intel_svm_drain_prq(dev, svm->pasid); + kfree_rcu(sdev, rcu); + + if (list_empty(&svm->devs)) { + if (svm->notifier.ops) + mmu_notifier_unregister(&svm->notifier, mm); + pasid_private_remove(svm->pasid); + /* + * We mandate that no page faults may be outstanding + * for the PASID when intel_svm_unbind_mm() is called. + * If that is not obeyed, subtle errors will happen. + * Let's make them less subtle... + */ + memset(svm, 0x6b, sizeof(*svm)); + kfree(svm); } } out: @@ -854,13 +841,10 @@ static int intel_svm_set_dev_pasid(struct iommu_domain *domain, struct device_domain_info *info = dev_iommu_priv_get(dev); struct intel_iommu *iommu = info->iommu; struct mm_struct *mm = domain->mm; - struct iommu_sva *sva; - int ret = 0; + int ret; mutex_lock(&pasid_mutex); - sva = intel_svm_bind_mm(iommu, dev, mm); - if (IS_ERR(sva)) - ret = PTR_ERR(sva); + ret = intel_svm_bind_mm(iommu, dev, mm); mutex_unlock(&pasid_mutex); return ret; |
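
One detail of perfmon.c above that is easy to miss is how iommu_pmu_event_update() handles counters narrower than 64 bits: both snapshots are shifted up to the top of a u64 before subtracting, so an unsigned wraparound of the hardware counter still produces the correct delta. A standalone sketch of that arithmetic follows; it is an illustration under the patch's stated semantics, not code lifted from the driver:

#include <stdint.h>

/*
 * prev and cur are raw counter snapshots; cntr_width is the enumerated
 * counter width (pcap_cntr_width()). Shifting both values to the top of
 * the 64-bit word makes the unsigned subtraction wrap exactly when the
 * hardware counter wraps; shifting back down yields the increment.
 */
static uint64_t counter_delta(uint64_t prev, uint64_t cur, unsigned int cntr_width)
{
	unsigned int shift = 64 - cntr_width;

	return ((cur << shift) - (prev << shift)) >> shift;
}

/*
 * Example: a 36-bit counter that wrapped from 0xFFFFFFFFF to 0x2 has
 * advanced by 3 events, and counter_delta(0xFFFFFFFFFULL, 0x2, 36)
 * returns 3.
 */

This mirrors the shift-before-subtract pattern used by other sub-64-bit uncore counters in the kernel, which is why the driver comments "Always shift the counter before using it."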