diff options
Diffstat (limited to 'kernel')
61 files changed, 2016 insertions, 1344 deletions
diff --git a/kernel/.gitignore b/kernel/.gitignore index 78701ea37c97..c6b299a6b786 100644 --- a/kernel/.gitignore +++ b/kernel/.gitignore @@ -1,4 +1,3 @@ # SPDX-License-Identifier: GPL-2.0-only -kheaders.md5 -timeconst.h -hz.bc +/config_data +/kheaders.md5 diff --git a/kernel/Makefile b/kernel/Makefile index e8a6715f38dc..4df609be42d0 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -142,10 +142,15 @@ obj-$(CONFIG_SCF_TORTURE_TEST) += scftorture.o $(obj)/configs.o: $(obj)/config_data.gz -targets += config_data.gz -$(obj)/config_data.gz: $(KCONFIG_CONFIG) FORCE +targets += config_data config_data.gz +$(obj)/config_data.gz: $(obj)/config_data FORCE $(call if_changed,gzip) +filechk_cat = cat $< + +$(obj)/config_data: $(KCONFIG_CONFIG) FORCE + $(call filechk,cat) + $(obj)/kheaders.o: $(obj)/kheaders_data.tar.xz quiet_cmd_genikh = CHK $(obj)/kheaders_data.tar.xz diff --git a/kernel/async.c b/kernel/async.c index 33258e6e20f8..b8d7a663497f 100644 --- a/kernel/async.c +++ b/kernel/async.c @@ -78,6 +78,12 @@ static DECLARE_WAIT_QUEUE_HEAD(async_done); static atomic_t entry_count; +static long long microseconds_since(ktime_t start) +{ + ktime_t now = ktime_get(); + return ktime_to_ns(ktime_sub(now, start)) >> 10; +} + static async_cookie_t lowest_in_progress(struct async_domain *domain) { struct async_entry *first = NULL; @@ -111,24 +117,18 @@ static void async_run_entry_fn(struct work_struct *work) struct async_entry *entry = container_of(work, struct async_entry, work); unsigned long flags; - ktime_t calltime, delta, rettime; + ktime_t calltime; /* 1) run (and print duration) */ - if (initcall_debug && system_state < SYSTEM_RUNNING) { - pr_debug("calling %lli_%pS @ %i\n", - (long long)entry->cookie, - entry->func, task_pid_nr(current)); - calltime = ktime_get(); - } + pr_debug("calling %lli_%pS @ %i\n", (long long)entry->cookie, + entry->func, task_pid_nr(current)); + calltime = ktime_get(); + entry->func(entry->data, entry->cookie); - if (initcall_debug && system_state < SYSTEM_RUNNING) { - rettime = ktime_get(); - delta = ktime_sub(rettime, calltime); - pr_debug("initcall %lli_%pS returned 0 after %lld usecs\n", - (long long)entry->cookie, - entry->func, - (long long)ktime_to_ns(delta) >> 10); - } + + pr_debug("initcall %lli_%pS returned after %lld usecs\n", + (long long)entry->cookie, entry->func, + microseconds_since(calltime)); /* 2) remove self from the pending queues */ spin_lock_irqsave(&async_lock, flags); @@ -246,24 +246,6 @@ void async_synchronize_full(void) EXPORT_SYMBOL_GPL(async_synchronize_full); /** - * async_unregister_domain - ensure no more anonymous waiters on this domain - * @domain: idle domain to flush out of any async_synchronize_full instances - * - * async_synchronize_{cookie|full}_domain() are not flushed since callers - * of these routines should know the lifetime of @domain - * - * Prefer ASYNC_DOMAIN_EXCLUSIVE() declarations over flushing - */ -void async_unregister_domain(struct async_domain *domain) -{ - spin_lock_irq(&async_lock); - WARN_ON(!domain->registered || !list_empty(&domain->pending)); - domain->registered = 0; - spin_unlock_irq(&async_lock); -} -EXPORT_SYMBOL_GPL(async_unregister_domain); - -/** * async_synchronize_full_domain - synchronize all asynchronous function within a certain domain * @domain: the domain to synchronize * @@ -287,23 +269,15 @@ EXPORT_SYMBOL_GPL(async_synchronize_full_domain); */ void async_synchronize_cookie_domain(async_cookie_t cookie, struct async_domain *domain) { - ktime_t starttime, delta, endtime; + ktime_t starttime; - if (initcall_debug && system_state < SYSTEM_RUNNING) { - pr_debug("async_waiting @ %i\n", task_pid_nr(current)); - starttime = ktime_get(); - } + pr_debug("async_waiting @ %i\n", task_pid_nr(current)); + starttime = ktime_get(); wait_event(async_done, lowest_in_progress(domain) >= cookie); - if (initcall_debug && system_state < SYSTEM_RUNNING) { - endtime = ktime_get(); - delta = ktime_sub(endtime, starttime); - - pr_debug("async_continuing @ %i after %lli usec\n", - task_pid_nr(current), - (long long)ktime_to_ns(delta) >> 10); - } + pr_debug("async_continuing @ %i after %lli usec\n", task_pid_nr(current), + microseconds_since(starttime)); } EXPORT_SYMBOL_GPL(async_synchronize_cookie_domain); diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 8fd552c16763..757476c91c98 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -6496,6 +6496,7 @@ static int sanitize_ptr_alu(struct bpf_verifier_env *env, { struct bpf_insn_aux_data *aux = commit_window ? cur_aux(env) : tmp_aux; struct bpf_verifier_state *vstate = env->cur_state; + bool off_is_imm = tnum_is_const(off_reg->var_off); bool off_is_neg = off_reg->smin_value < 0; bool ptr_is_dst_reg = ptr_reg == dst_reg; u8 opcode = BPF_OP(insn->code); @@ -6526,6 +6527,7 @@ static int sanitize_ptr_alu(struct bpf_verifier_env *env, alu_limit = abs(tmp_aux->alu_limit - alu_limit); } else { alu_state = off_is_neg ? BPF_ALU_NEG_VALUE : 0; + alu_state |= off_is_imm ? BPF_ALU_IMMEDIATE : 0; alu_state |= ptr_is_dst_reg ? BPF_ALU_SANITIZE_SRC : BPF_ALU_SANITIZE_DST; } @@ -12371,7 +12373,7 @@ static int do_misc_fixups(struct bpf_verifier_env *env) const u8 code_add = BPF_ALU64 | BPF_ADD | BPF_X; const u8 code_sub = BPF_ALU64 | BPF_SUB | BPF_X; struct bpf_insn *patch = &insn_buf[0]; - bool issrc, isneg; + bool issrc, isneg, isimm; u32 off_reg; aux = &env->insn_aux_data[i + delta]; @@ -12382,28 +12384,29 @@ static int do_misc_fixups(struct bpf_verifier_env *env) isneg = aux->alu_state & BPF_ALU_NEG_VALUE; issrc = (aux->alu_state & BPF_ALU_SANITIZE) == BPF_ALU_SANITIZE_SRC; + isimm = aux->alu_state & BPF_ALU_IMMEDIATE; off_reg = issrc ? insn->src_reg : insn->dst_reg; - if (isneg) - *patch++ = BPF_ALU64_IMM(BPF_MUL, off_reg, -1); - *patch++ = BPF_MOV32_IMM(BPF_REG_AX, aux->alu_limit); - *patch++ = BPF_ALU64_REG(BPF_SUB, BPF_REG_AX, off_reg); - *patch++ = BPF_ALU64_REG(BPF_OR, BPF_REG_AX, off_reg); - *patch++ = BPF_ALU64_IMM(BPF_NEG, BPF_REG_AX, 0); - *patch++ = BPF_ALU64_IMM(BPF_ARSH, BPF_REG_AX, 63); - if (issrc) { - *patch++ = BPF_ALU64_REG(BPF_AND, BPF_REG_AX, - off_reg); - insn->src_reg = BPF_REG_AX; + if (isimm) { + *patch++ = BPF_MOV32_IMM(BPF_REG_AX, aux->alu_limit); } else { - *patch++ = BPF_ALU64_REG(BPF_AND, off_reg, - BPF_REG_AX); + if (isneg) + *patch++ = BPF_ALU64_IMM(BPF_MUL, off_reg, -1); + *patch++ = BPF_MOV32_IMM(BPF_REG_AX, aux->alu_limit); + *patch++ = BPF_ALU64_REG(BPF_SUB, BPF_REG_AX, off_reg); + *patch++ = BPF_ALU64_REG(BPF_OR, BPF_REG_AX, off_reg); + *patch++ = BPF_ALU64_IMM(BPF_NEG, BPF_REG_AX, 0); + *patch++ = BPF_ALU64_IMM(BPF_ARSH, BPF_REG_AX, 63); + *patch++ = BPF_ALU64_REG(BPF_AND, BPF_REG_AX, off_reg); } + if (!issrc) + *patch++ = BPF_MOV64_REG(insn->dst_reg, insn->src_reg); + insn->src_reg = BPF_REG_AX; if (isneg) insn->code = insn->code == code_add ? code_sub : code_add; *patch++ = *insn; - if (issrc && isneg) + if (issrc && isneg && !isimm) *patch++ = BPF_ALU64_IMM(BPF_MUL, off_reg, -1); cnt = patch - insn_buf; diff --git a/kernel/configs/android-base.config b/kernel/configs/android-base.config index d3fd428f4b92..eb701b2ac72f 100644 --- a/kernel/configs/android-base.config +++ b/kernel/configs/android-base.config @@ -1,5 +1,4 @@ # KEEP ALPHABETICALLY SORTED -# CONFIG_DEVKMEM is not set # CONFIG_DEVMEM is not set # CONFIG_FHANDLE is not set # CONFIG_INET_LRO is not set diff --git a/kernel/cred.c b/kernel/cred.c index 421b1149c651..e1d274cd741b 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -33,7 +33,7 @@ do { \ static struct kmem_cache *cred_jar; /* init to 2 - one for init_task, one to ensure it is never freed */ -struct group_info init_groups = { .usage = ATOMIC_INIT(2) }; +static struct group_info init_groups = { .usage = ATOMIC_INIT(2) }; /* * The initial credentials for the initial task diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index 002268262c9a..f737e3347059 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -344,8 +344,8 @@ void dma_direct_sync_sg_for_device(struct device *dev, phys_addr_t paddr = dma_to_phys(dev, sg_dma_address(sg)); if (unlikely(is_swiotlb_buffer(paddr))) - swiotlb_tbl_sync_single(dev, paddr, sg->length, - dir, SYNC_FOR_DEVICE); + swiotlb_sync_single_for_device(dev, paddr, sg->length, + dir); if (!dev_is_dma_coherent(dev)) arch_sync_dma_for_device(paddr, sg->length, @@ -370,8 +370,8 @@ void dma_direct_sync_sg_for_cpu(struct device *dev, arch_sync_dma_for_cpu(paddr, sg->length, dir); if (unlikely(is_swiotlb_buffer(paddr))) - swiotlb_tbl_sync_single(dev, paddr, sg->length, dir, - SYNC_FOR_CPU); + swiotlb_sync_single_for_cpu(dev, paddr, sg->length, + dir); if (dir == DMA_FROM_DEVICE) arch_dma_mark_clean(paddr, sg->length); diff --git a/kernel/dma/direct.h b/kernel/dma/direct.h index b98615578737..50afc05b6f1d 100644 --- a/kernel/dma/direct.h +++ b/kernel/dma/direct.h @@ -57,7 +57,7 @@ static inline void dma_direct_sync_single_for_device(struct device *dev, phys_addr_t paddr = dma_to_phys(dev, addr); if (unlikely(is_swiotlb_buffer(paddr))) - swiotlb_tbl_sync_single(dev, paddr, size, dir, SYNC_FOR_DEVICE); + swiotlb_sync_single_for_device(dev, paddr, size, dir); if (!dev_is_dma_coherent(dev)) arch_sync_dma_for_device(paddr, size, dir); @@ -74,7 +74,7 @@ static inline void dma_direct_sync_single_for_cpu(struct device *dev, } if (unlikely(is_swiotlb_buffer(paddr))) - swiotlb_tbl_sync_single(dev, paddr, size, dir, SYNC_FOR_CPU); + swiotlb_sync_single_for_cpu(dev, paddr, size, dir); if (dir == DMA_FROM_DEVICE) arch_dma_mark_clean(paddr, size); @@ -114,6 +114,6 @@ static inline void dma_direct_unmap_page(struct device *dev, dma_addr_t addr, dma_direct_sync_single_for_cpu(dev, addr, size, dir); if (unlikely(is_swiotlb_buffer(phys))) - swiotlb_tbl_unmap_single(dev, phys, size, size, dir, attrs); + swiotlb_tbl_unmap_single(dev, phys, size, dir, attrs); } #endif /* _KERNEL_DMA_DIRECT_H */ diff --git a/kernel/dma/map_benchmark.c b/kernel/dma/map_benchmark.c index e0e64f8b0739..9b9af1bd6be3 100644 --- a/kernel/dma/map_benchmark.c +++ b/kernel/dma/map_benchmark.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0-only /* - * Copyright (C) 2020 Hisilicon Limited. + * Copyright (C) 2020 HiSilicon Limited. */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt @@ -38,7 +38,8 @@ struct map_benchmark { __u32 dma_bits; /* DMA addressing capability */ __u32 dma_dir; /* DMA data direction */ __u32 dma_trans_ns; /* time for DMA transmission in ns */ - __u8 expansion[80]; /* For future use */ + __u32 granule; /* how many PAGE_SIZE will do map/unmap once a time */ + __u8 expansion[76]; /* For future use */ }; struct map_benchmark_data { @@ -58,9 +59,11 @@ static int map_benchmark_thread(void *data) void *buf; dma_addr_t dma_addr; struct map_benchmark_data *map = data; + int npages = map->bparam.granule; + u64 size = npages * PAGE_SIZE; int ret = 0; - buf = (void *)__get_free_page(GFP_KERNEL); + buf = alloc_pages_exact(size, GFP_KERNEL); if (!buf) return -ENOMEM; @@ -76,10 +79,10 @@ static int map_benchmark_thread(void *data) * 66 means evertything goes well! 66 is lucky. */ if (map->dir != DMA_FROM_DEVICE) - memset(buf, 0x66, PAGE_SIZE); + memset(buf, 0x66, size); map_stime = ktime_get(); - dma_addr = dma_map_single(map->dev, buf, PAGE_SIZE, map->dir); + dma_addr = dma_map_single(map->dev, buf, size, map->dir); if (unlikely(dma_mapping_error(map->dev, dma_addr))) { pr_err("dma_map_single failed on %s\n", dev_name(map->dev)); @@ -93,7 +96,7 @@ static int map_benchmark_thread(void *data) ndelay(map->bparam.dma_trans_ns); unmap_stime = ktime_get(); - dma_unmap_single(map->dev, dma_addr, PAGE_SIZE, map->dir); + dma_unmap_single(map->dev, dma_addr, size, map->dir); unmap_etime = ktime_get(); unmap_delta = ktime_sub(unmap_etime, unmap_stime); @@ -112,7 +115,7 @@ static int map_benchmark_thread(void *data) } out: - free_page((unsigned long)buf); + free_pages_exact(buf, size); return ret; } @@ -203,7 +206,6 @@ static long map_benchmark_ioctl(struct file *file, unsigned int cmd, struct map_benchmark_data *map = file->private_data; void __user *argp = (void __user *)arg; u64 old_dma_mask; - int ret; if (copy_from_user(&map->bparam, argp, sizeof(map->bparam))) @@ -234,6 +236,11 @@ static long map_benchmark_ioctl(struct file *file, unsigned int cmd, return -EINVAL; } + if (map->bparam.granule < 1 || map->bparam.granule > 1024) { + pr_err("invalid granule size\n"); + return -EINVAL; + } + switch (map->bparam.dma_dir) { case DMA_MAP_BIDIRECTIONAL: map->dir = DMA_BIDIRECTIONAL; diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c index b6a633679933..2b06a809d0b9 100644 --- a/kernel/dma/mapping.c +++ b/kernel/dma/mapping.c @@ -477,11 +477,10 @@ void dma_free_attrs(struct device *dev, size_t size, void *cpu_addr, } EXPORT_SYMBOL(dma_free_attrs); -struct page *dma_alloc_pages(struct device *dev, size_t size, +static struct page *__dma_alloc_pages(struct device *dev, size_t size, dma_addr_t *dma_handle, enum dma_data_direction dir, gfp_t gfp) { const struct dma_map_ops *ops = get_dma_ops(dev); - struct page *page; if (WARN_ON_ONCE(!dev->coherent_dma_mask)) return NULL; @@ -490,33 +489,162 @@ struct page *dma_alloc_pages(struct device *dev, size_t size, size = PAGE_ALIGN(size); if (dma_alloc_direct(dev, ops)) - page = dma_direct_alloc_pages(dev, size, dma_handle, dir, gfp); - else if (ops->alloc_pages) - page = ops->alloc_pages(dev, size, dma_handle, dir, gfp); - else + return dma_direct_alloc_pages(dev, size, dma_handle, dir, gfp); + if (!ops->alloc_pages) return NULL; + return ops->alloc_pages(dev, size, dma_handle, dir, gfp); +} - debug_dma_map_page(dev, page, 0, size, dir, *dma_handle); +struct page *dma_alloc_pages(struct device *dev, size_t size, + dma_addr_t *dma_handle, enum dma_data_direction dir, gfp_t gfp) +{ + struct page *page = __dma_alloc_pages(dev, size, dma_handle, dir, gfp); + if (page) + debug_dma_map_page(dev, page, 0, size, dir, *dma_handle); return page; } EXPORT_SYMBOL_GPL(dma_alloc_pages); -void dma_free_pages(struct device *dev, size_t size, struct page *page, +static void __dma_free_pages(struct device *dev, size_t size, struct page *page, dma_addr_t dma_handle, enum dma_data_direction dir) { const struct dma_map_ops *ops = get_dma_ops(dev); size = PAGE_ALIGN(size); - debug_dma_unmap_page(dev, dma_handle, size, dir); - if (dma_alloc_direct(dev, ops)) dma_direct_free_pages(dev, size, page, dma_handle, dir); else if (ops->free_pages) ops->free_pages(dev, size, page, dma_handle, dir); } + +void dma_free_pages(struct device *dev, size_t size, struct page *page, + dma_addr_t dma_handle, enum dma_data_direction dir) +{ + debug_dma_unmap_page(dev, dma_handle, size, dir); + __dma_free_pages(dev, size, page, dma_handle, dir); +} EXPORT_SYMBOL_GPL(dma_free_pages); +int dma_mmap_pages(struct device *dev, struct vm_area_struct *vma, + size_t size, struct page *page) +{ + unsigned long count = PAGE_ALIGN(size) >> PAGE_SHIFT; + + if (vma->vm_pgoff >= count || vma_pages(vma) > count - vma->vm_pgoff) + return -ENXIO; + return remap_pfn_range(vma, vma->vm_start, + page_to_pfn(page) + vma->vm_pgoff, + vma_pages(vma) << PAGE_SHIFT, vma->vm_page_prot); +} +EXPORT_SYMBOL_GPL(dma_mmap_pages); + +static struct sg_table *alloc_single_sgt(struct device *dev, size_t size, + enum dma_data_direction dir, gfp_t gfp) +{ + struct sg_table *sgt; + struct page *page; + + sgt = kmalloc(sizeof(*sgt), gfp); + if (!sgt) + return NULL; + if (sg_alloc_table(sgt, 1, gfp)) + goto out_free_sgt; + page = __dma_alloc_pages(dev, size, &sgt->sgl->dma_address, dir, gfp); + if (!page) + goto out_free_table; + sg_set_page(sgt->sgl, page, PAGE_ALIGN(size), 0); + sg_dma_len(sgt->sgl) = sgt->sgl->length; + return sgt; +out_free_table: + sg_free_table(sgt); +out_free_sgt: + kfree(sgt); + return NULL; +} + +struct sg_table *dma_alloc_noncontiguous(struct device *dev, size_t size, + enum dma_data_direction dir, gfp_t gfp, unsigned long attrs) +{ + const struct dma_map_ops *ops = get_dma_ops(dev); + struct sg_table *sgt; + + if (WARN_ON_ONCE(attrs & ~DMA_ATTR_ALLOC_SINGLE_PAGES)) + return NULL; + + if (ops && ops->alloc_noncontiguous) + sgt = ops->alloc_noncontiguous(dev, size, dir, gfp, attrs); + else + sgt = alloc_single_sgt(dev, size, dir, gfp); + + if (sgt) { + sgt->nents = 1; + debug_dma_map_sg(dev, sgt->sgl, sgt->orig_nents, 1, dir); + } + return sgt; +} +EXPORT_SYMBOL_GPL(dma_alloc_noncontiguous); + +static void free_single_sgt(struct device *dev, size_t size, + struct sg_table *sgt, enum dma_data_direction dir) +{ + __dma_free_pages(dev, size, sg_page(sgt->sgl), sgt->sgl->dma_address, + dir); + sg_free_table(sgt); + kfree(sgt); +} + +void dma_free_noncontiguous(struct device *dev, size_t size, + struct sg_table *sgt, enum dma_data_direction dir) +{ + const struct dma_map_ops *ops = get_dma_ops(dev); + + debug_dma_unmap_sg(dev, sgt->sgl, sgt->orig_nents, dir); + if (ops && ops->free_noncontiguous) + ops->free_noncontiguous(dev, size, sgt, dir); + else + free_single_sgt(dev, size, sgt, dir); +} +EXPORT_SYMBOL_GPL(dma_free_noncontiguous); + +void *dma_vmap_noncontiguous(struct device *dev, size_t size, + struct sg_table *sgt) +{ + const struct dma_map_ops *ops = get_dma_ops(dev); + unsigned long count = PAGE_ALIGN(size) >> PAGE_SHIFT; + + if (ops && ops->alloc_noncontiguous) + return vmap(sgt_handle(sgt)->pages, count, VM_MAP, PAGE_KERNEL); + return page_address(sg_page(sgt->sgl)); +} +EXPORT_SYMBOL_GPL(dma_vmap_noncontiguous); + +void dma_vunmap_noncontiguous(struct device *dev, void *vaddr) +{ + const struct dma_map_ops *ops = get_dma_ops(dev); + + if (ops && ops->alloc_noncontiguous) + vunmap(vaddr); +} +EXPORT_SYMBOL_GPL(dma_vunmap_noncontiguous); + +int dma_mmap_noncontiguous(struct device *dev, struct vm_area_struct *vma, + size_t size, struct sg_table *sgt) +{ + const struct dma_map_ops *ops = get_dma_ops(dev); + + if (ops && ops->alloc_noncontiguous) { + unsigned long count = PAGE_ALIGN(size) >> PAGE_SHIFT; + + if (vma->vm_pgoff >= count || + vma_pages(vma) > count - vma->vm_pgoff) + return -ENXIO; + return vm_map_pages(vma, sgt_handle(sgt)->pages, count); + } + return dma_mmap_pages(dev, vma, size, sg_page(sgt->sgl)); +} +EXPORT_SYMBOL_GPL(dma_mmap_noncontiguous); + int dma_supported(struct device *dev, u64 mask) { const struct dma_map_ops *ops = get_dma_ops(dev); diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index c10e855a03bc..8ca7d505d61c 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -59,32 +59,11 @@ */ #define IO_TLB_MIN_SLABS ((1<<20) >> IO_TLB_SHIFT) -enum swiotlb_force swiotlb_force; - -/* - * Used to do a quick range check in swiotlb_tbl_unmap_single and - * swiotlb_tbl_sync_single_*, to see if the memory was in fact allocated by this - * API. - */ -phys_addr_t io_tlb_start, io_tlb_end; - -/* - * The number of IO TLB blocks (in groups of 64) between io_tlb_start and - * io_tlb_end. This is command line adjustable via setup_io_tlb_npages. - */ -static unsigned long io_tlb_nslabs; +#define INVALID_PHYS_ADDR (~(phys_addr_t)0) -/* - * The number of used IO TLB block - */ -static unsigned long io_tlb_used; +enum swiotlb_force swiotlb_force; -/* - * This is a free list describing the number of free entries available from - * each index - */ -static unsigned int *io_tlb_list; -static unsigned int io_tlb_index; +struct io_tlb_mem *io_tlb_default_mem; /* * Max segment that we can provide which (if pages are contingous) will @@ -92,57 +71,30 @@ static unsigned int io_tlb_index; */ static unsigned int max_segment; -/* - * We need to save away the original address corresponding to a mapped entry - * for the sync operations. - */ -#define INVALID_PHYS_ADDR (~(phys_addr_t)0) -static phys_addr_t *io_tlb_orig_addr; - -/* - * The mapped buffer's size should be validated during a sync operation. - */ -static size_t *io_tlb_orig_size; - -/* - * Protect the above data structures in the map and unmap calls - */ -static DEFINE_SPINLOCK(io_tlb_lock); - -static int late_alloc; +static unsigned long default_nslabs = IO_TLB_DEFAULT_SIZE >> IO_TLB_SHIFT; static int __init setup_io_tlb_npages(char *str) { if (isdigit(*str)) { - io_tlb_nslabs = simple_strtoul(str, &str, 0); /* avoid tail segment of size < IO_TLB_SEGSIZE */ - io_tlb_nslabs = ALIGN(io_tlb_nslabs, IO_TLB_SEGSIZE); + default_nslabs = + ALIGN(simple_strtoul(str, &str, 0), IO_TLB_SEGSIZE); } if (*str == ',') ++str; - if (!strcmp(str, "force")) { + if (!strcmp(str, "force")) swiotlb_force = SWIOTLB_FORCE; - } else if (!strcmp(str, "noforce")) { + else if (!strcmp(str, "noforce")) swiotlb_force = SWIOTLB_NO_FORCE; - io_tlb_nslabs = 1; - } return 0; } early_param("swiotlb", setup_io_tlb_npages); -static bool no_iotlb_memory; - -unsigned long swiotlb_nr_tbl(void) -{ - return unlikely(no_iotlb_memory) ? 0 : io_tlb_nslabs; -} -EXPORT_SYMBOL_GPL(swiotlb_nr_tbl); - unsigned int swiotlb_max_segment(void) { - return unlikely(no_iotlb_memory) ? 0 : max_segment; + return io_tlb_default_mem ? max_segment : 0; } EXPORT_SYMBOL_GPL(swiotlb_max_segment); @@ -156,42 +108,34 @@ void swiotlb_set_max_segment(unsigned int val) unsigned long swiotlb_size_or_default(void) { - unsigned long size; - - size = io_tlb_nslabs << IO_TLB_SHIFT; - - return size ? size : (IO_TLB_DEFAULT_SIZE); + return default_nslabs << IO_TLB_SHIFT; } -void __init swiotlb_adjust_size(unsigned long new_size) +void __init swiotlb_adjust_size(unsigned long size) { - unsigned long size; - /* * If swiotlb parameter has not been specified, give a chance to * architectures such as those supporting memory encryption to * adjust/expand SWIOTLB size for their use. */ - if (!io_tlb_nslabs) { - size = ALIGN(new_size, IO_TLB_SIZE); - io_tlb_nslabs = size >> IO_TLB_SHIFT; - io_tlb_nslabs = ALIGN(io_tlb_nslabs, IO_TLB_SEGSIZE); - - pr_info("SWIOTLB bounce buffer size adjusted to %luMB", size >> 20); - } + if (default_nslabs != IO_TLB_DEFAULT_SIZE >> IO_TLB_SHIFT) + return; + size = ALIGN(size, IO_TLB_SIZE); + default_nslabs = ALIGN(size >> IO_TLB_SHIFT, IO_TLB_SEGSIZE); + pr_info("SWIOTLB bounce buffer size adjusted to %luMB", size >> 20); } void swiotlb_print_info(void) { - unsigned long bytes = io_tlb_nslabs << IO_TLB_SHIFT; + struct io_tlb_mem *mem = io_tlb_default_mem; - if (no_iotlb_memory) { + if (!mem) { pr_warn("No low mem\n"); return; } - pr_info("mapped [mem %pa-%pa] (%luMB)\n", &io_tlb_start, &io_tlb_end, - bytes >> 20); + pr_info("mapped [mem %pa-%pa] (%luMB)\n", &mem->start, &mem->end, + (mem->nslabs << IO_TLB_SHIFT) >> 20); } static inline unsigned long io_tlb_offset(unsigned long val) @@ -212,64 +156,51 @@ static inline unsigned long nr_slots(u64 val) */ void __init swiotlb_update_mem_attributes(void) { + struct io_tlb_mem *mem = io_tlb_default_mem; void *vaddr; unsigned long bytes; - if (no_iotlb_memory || late_alloc) + if (!mem || mem->late_alloc) return; - - vaddr = phys_to_virt(io_tlb_start); - bytes = PAGE_ALIGN(io_tlb_nslabs << IO_TLB_SHIFT); + vaddr = phys_to_virt(mem->start); + bytes = PAGE_ALIGN(mem->nslabs << IO_TLB_SHIFT); set_memory_decrypted((unsigned long)vaddr, bytes >> PAGE_SHIFT); memset(vaddr, 0, bytes); } int __init swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, int verbose) { - unsigned long i, bytes; + unsigned long bytes = nslabs << IO_TLB_SHIFT, i; + struct io_tlb_mem *mem; size_t alloc_size; - bytes = nslabs << IO_TLB_SHIFT; - - io_tlb_nslabs = nslabs; - io_tlb_start = __pa(tlb); - io_tlb_end = io_tlb_start + bytes; - - /* - * Allocate and initialize the free list array. This array is used - * to find contiguous free memory regions of size up to IO_TLB_SEGSIZE - * between io_tlb_start and io_tlb_end. - */ - alloc_size = PAGE_ALIGN(io_tlb_nslabs * sizeof(int)); - io_tlb_list = memblock_alloc(alloc_size, PAGE_SIZE); - if (!io_tlb_list) - panic("%s: Failed to allocate %zu bytes align=0x%lx\n", - __func__, alloc_size, PAGE_SIZE); + if (swiotlb_force == SWIOTLB_NO_FORCE) + return 0; - alloc_size = PAGE_ALIGN(io_tlb_nslabs * sizeof(phys_addr_t)); - io_tlb_orig_addr = memblock_alloc(alloc_size, PAGE_SIZE); - if (!io_tlb_orig_addr) - panic("%s: Failed to allocate %zu bytes align=0x%lx\n", - __func__, alloc_size, PAGE_SIZE); + /* protect against double initialization */ + if (WARN_ON_ONCE(io_tlb_default_mem)) + return -ENOMEM; - alloc_size = PAGE_ALIGN(io_tlb_nslabs * sizeof(size_t)); - io_tlb_orig_size = memblock_alloc(alloc_size, PAGE_SIZE); - if (!io_tlb_orig_size) + alloc_size = PAGE_ALIGN(struct_size(mem, slots, nslabs)); + mem = memblock_alloc(alloc_size, PAGE_SIZE); + if (!mem) panic("%s: Failed to allocate %zu bytes align=0x%lx\n", __func__, alloc_size, PAGE_SIZE); - - for (i = 0; i < io_tlb_nslabs; i++) { - io_tlb_list[i] = IO_TLB_SEGSIZE - io_tlb_offset(i); - io_tlb_orig_addr[i] = INVALID_PHYS_ADDR; - io_tlb_orig_size[i] = 0; + mem->nslabs = nslabs; + mem->start = __pa(tlb); + mem->end = mem->start + bytes; + mem->index = 0; + spin_lock_init(&mem->lock); + for (i = 0; i < mem->nslabs; i++) { + mem->slots[i].list = IO_TLB_SEGSIZE - io_tlb_offset(i); + mem->slots[i].orig_addr = INVALID_PHYS_ADDR; + mem->slots[i].alloc_size = 0; } - io_tlb_index = 0; - no_iotlb_memory = false; + io_tlb_default_mem = mem; if (verbose) swiotlb_print_info(); - - swiotlb_set_max_segment(io_tlb_nslabs << IO_TLB_SHIFT); + swiotlb_set_max_segment(mem->nslabs << IO_TLB_SHIFT); return 0; } @@ -280,29 +211,24 @@ int __init swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, int verbose) void __init swiotlb_init(int verbose) { - size_t default_size = IO_TLB_DEFAULT_SIZE; - unsigned char *vstart; - unsigned long bytes; - - if (!io_tlb_nslabs) { - io_tlb_nslabs = (default_size >> IO_TLB_SHIFT); - io_tlb_nslabs = ALIGN(io_tlb_nslabs, IO_TLB_SEGSIZE); - } - - bytes = io_tlb_nslabs << IO_TLB_SHIFT; + size_t bytes = PAGE_ALIGN(default_nslabs << IO_TLB_SHIFT); + void *tlb; - /* Get IO TLB memory from the low pages */ - vstart = memblock_alloc_low(PAGE_ALIGN(bytes), PAGE_SIZE); - if (vstart && !swiotlb_init_with_tbl(vstart, io_tlb_nslabs, verbose)) + if (swiotlb_force == SWIOTLB_NO_FORCE) return; - if (io_tlb_start) { - memblock_free_early(io_tlb_start, - PAGE_ALIGN(io_tlb_nslabs << IO_TLB_SHIFT)); - io_tlb_start = 0; - } + /* Get IO TLB memory from the low pages */ + tlb = memblock_alloc_low(bytes, PAGE_SIZE); + if (!tlb) + goto fail; + if (swiotlb_init_with_tbl(tlb, default_nslabs, verbose)) + goto fail_free_mem; + return; + +fail_free_mem: + memblock_free_early(__pa(tlb), bytes); +fail: pr_warn("Cannot allocate buffer"); - no_iotlb_memory = true; } /* @@ -313,22 +239,22 @@ swiotlb_init(int verbose) int swiotlb_late_init_with_default_size(size_t default_size) { - unsigned long bytes, req_nslabs = io_tlb_nslabs; + unsigned long nslabs = + ALIGN(default_size >> IO_TLB_SHIFT, IO_TLB_SEGSIZE); + unsigned long bytes; unsigned char *vstart = NULL; unsigned int order; int rc = 0; - if (!io_tlb_nslabs) { - io_tlb_nslabs = (default_size >> IO_TLB_SHIFT); - io_tlb_nslabs = ALIGN(io_tlb_nslabs, IO_TLB_SEGSIZE); - } + if (swiotlb_force == SWIOTLB_NO_FORCE) + return 0; /* * Get IO TLB memory from the low pages */ - order = get_order(io_tlb_nslabs << IO_TLB_SHIFT); - io_tlb_nslabs = SLABS_PER_PAGE << order; - bytes = io_tlb_nslabs << IO_TLB_SHIFT; + order = get_order(nslabs << IO_TLB_SHIFT); + nslabs = SLABS_PER_PAGE << order; + bytes = nslabs << IO_TLB_SHIFT; while ((SLABS_PER_PAGE << order) > IO_TLB_MIN_SLABS) { vstart = (void *)__get_free_pages(GFP_DMA | __GFP_NOWARN, @@ -338,134 +264,99 @@ swiotlb_late_init_with_default_size(size_t default_size) order--; } - if (!vstart) { - io_tlb_nslabs = req_nslabs; + if (!vstart) return -ENOMEM; - } + if (order != get_order(bytes)) { pr_warn("only able to allocate %ld MB\n", (PAGE_SIZE << order) >> 20); - io_tlb_nslabs = SLABS_PER_PAGE << order; + nslabs = SLABS_PER_PAGE << order; } - rc = swiotlb_late_init_with_tbl(vstart, io_tlb_nslabs); + rc = swiotlb_late_init_with_tbl(vstart, nslabs); if (rc) free_pages((unsigned long)vstart, order); return rc; } -static void swiotlb_cleanup(void) -{ - io_tlb_end = 0; - io_tlb_start = 0; - io_tlb_nslabs = 0; - max_segment = 0; -} - int swiotlb_late_init_with_tbl(char *tlb, unsigned long nslabs) { - unsigned long i, bytes; + unsigned long bytes = nslabs << IO_TLB_SHIFT, i; + struct io_tlb_mem *mem; - bytes = nslabs << IO_TLB_SHIFT; + if (swiotlb_force == SWIOTLB_NO_FORCE) + return 0; - io_tlb_nslabs = nslabs; - io_tlb_start = virt_to_phys(tlb); - io_tlb_end = io_tlb_start + bytes; + /* protect against double initialization */ + if (WARN_ON_ONCE(io_tlb_default_mem)) + return -ENOMEM; - set_memory_decrypted((unsigned long)tlb, bytes >> PAGE_SHIFT); - memset(tlb, 0, bytes); + mem = (void *)__get_free_pages(GFP_KERNEL, + get_order(struct_size(mem, slots, nslabs))); + if (!mem) + return -ENOMEM; - /* - * Allocate and initialize the free list array. This array is used - * to find contiguous free memory regions of size up to IO_TLB_SEGSIZE - * between io_tlb_start and io_tlb_end. - */ - io_tlb_list = (unsigned int *)__get_free_pages(GFP_KERNEL, - get_order(io_tlb_nslabs * sizeof(int))); - if (!io_tlb_list) - goto cleanup3; - - io_tlb_orig_addr = (phys_addr_t *) - __get_free_pages(GFP_KERNEL, - get_order(io_tlb_nslabs * - sizeof(phys_addr_t))); - if (!io_tlb_orig_addr) - goto cleanup4; - - io_tlb_orig_size = (size_t *) - __get_free_pages(GFP_KERNEL, - get_order(io_tlb_nslabs * - sizeof(size_t))); - if (!io_tlb_orig_size) - goto cleanup5; - - - for (i = 0; i < io_tlb_nslabs; i++) { - io_tlb_list[i] = IO_TLB_SEGSIZE - io_tlb_offset(i); - io_tlb_orig_addr[i] = INVALID_PHYS_ADDR; - io_tlb_orig_size[i] = 0; + mem->nslabs = nslabs; + mem->start = virt_to_phys(tlb); + mem->end = mem->start + bytes; + mem->index = 0; + mem->late_alloc = 1; + spin_lock_init(&mem->lock); + for (i = 0; i < mem->nslabs; i++) { + mem->slots[i].list = IO_TLB_SEGSIZE - io_tlb_offset(i); + mem->slots[i].orig_addr = INVALID_PHYS_ADDR; + mem->slots[i].alloc_size = 0; } - io_tlb_index = 0; - no_iotlb_memory = false; - - swiotlb_print_info(); - late_alloc = 1; - - swiotlb_set_max_segment(io_tlb_nslabs << IO_TLB_SHIFT); + set_memory_decrypted((unsigned long)tlb, bytes >> PAGE_SHIFT); + memset(tlb, 0, bytes); + io_tlb_default_mem = mem; + swiotlb_print_info(); + swiotlb_set_max_segment(mem->nslabs << IO_TLB_SHIFT); return 0; - -cleanup5: - free_pages((unsigned long)io_tlb_orig_addr, get_order(io_tlb_nslabs * - sizeof(phys_addr_t))); - -cleanup4: - free_pages((unsigned long)io_tlb_list, get_order(io_tlb_nslabs * - sizeof(int))); - io_tlb_list = NULL; -cleanup3: - swiotlb_cleanup(); - return -ENOMEM; } void __init swiotlb_exit(void) { - if (!io_tlb_orig_addr) + struct io_tlb_mem *mem = io_tlb_default_mem; + size_t size; + + if (!mem) return; - if (late_alloc) { - free_pages((unsigned long)io_tlb_orig_size, - get_order(io_tlb_nslabs * sizeof(size_t))); - free_pages((unsigned long)io_tlb_orig_addr, - get_order(io_tlb_nslabs * sizeof(phys_addr_t))); - free_pages((unsigned long)io_tlb_list, get_order(io_tlb_nslabs * - sizeof(int))); - free_pages((unsigned long)phys_to_virt(io_tlb_start), - get_order(io_tlb_nslabs << IO_TLB_SHIFT)); - } else { - memblock_free_late(__pa(io_tlb_orig_addr), - PAGE_ALIGN(io_tlb_nslabs * sizeof(phys_addr_t))); - memblock_free_late(__pa(io_tlb_orig_size), - PAGE_ALIGN(io_tlb_nslabs * sizeof(size_t))); - memblock_free_late(__pa(io_tlb_list), - PAGE_ALIGN(io_tlb_nslabs * sizeof(int))); - memblock_free_late(io_tlb_start, - PAGE_ALIGN(io_tlb_nslabs << IO_TLB_SHIFT)); - } - swiotlb_cleanup(); + size = struct_size(mem, slots, mem->nslabs); + if (mem->late_alloc) + free_pages((unsigned long)mem, get_order(size)); + else + memblock_free_late(__pa(mem), PAGE_ALIGN(size)); + io_tlb_default_mem = NULL; } /* * Bounce: copy the swiotlb buffer from or back to the original dma location */ -static void swiotlb_bounce(phys_addr_t orig_addr, phys_addr_t tlb_addr, - size_t size, enum dma_data_direction dir) +static void swiotlb_bounce(struct device *dev, phys_addr_t tlb_addr, size_t size, + enum dma_data_direction dir) { + struct io_tlb_mem *mem = io_tlb_default_mem; + int index = (tlb_addr - mem->start) >> IO_TLB_SHIFT; + phys_addr_t orig_addr = mem->slots[index].orig_addr; + size_t alloc_size = mem->slots[index].alloc_size; unsigned long pfn = PFN_DOWN(orig_addr); unsigned char *vaddr = phys_to_virt(tlb_addr); + if (orig_addr == INVALID_PHYS_ADDR) + return; + + if (size > alloc_size) { + dev_WARN_ONCE(dev, 1, + "Buffer overflow detected. Allocation size: %zu. Mapping size: %zu.\n", + alloc_size, size); + size = alloc_size; + } + if (PageHighMem(pfn_to_page(pfn))) { /* The buffer does not have a mapping. Map it in and copy */ unsigned int offset = orig_addr & ~PAGE_MASK; @@ -517,9 +408,9 @@ static inline unsigned long get_max_slots(unsigned long boundary_mask) return nr_slots(boundary_mask + 1); } -static unsigned int wrap_index(unsigned int index) +static unsigned int wrap_index(struct io_tlb_mem *mem, unsigned int index) { - if (index >= io_tlb_nslabs) + if (index >= mem->nslabs) return 0; return index; } @@ -531,9 +422,10 @@ static unsigned int wrap_index(unsigned int index) static int find_slots(struct device *dev, phys_addr_t orig_addr, size_t alloc_size) { + struct io_tlb_mem *mem = io_tlb_default_mem; unsigned long boundary_mask = dma_get_seg_boundary(dev); dma_addr_t tbl_dma_addr = - phys_to_dma_unencrypted(dev, io_tlb_start) & boundary_mask; + phys_to_dma_unencrypted(dev, mem->start) & boundary_mask; unsigned long max_slots = get_max_slots(boundary_mask); unsigned int iotlb_align_mask = dma_get_min_align_mask(dev) & ~(IO_TLB_SIZE - 1); @@ -552,15 +444,15 @@ static int find_slots(struct device *dev, phys_addr_t orig_addr, if (alloc_size >= PAGE_SIZE) stride = max(stride, stride << (PAGE_SHIFT - IO_TLB_SHIFT)); - spin_lock_irqsave(&io_tlb_lock, flags); - if (unlikely(nslots > io_tlb_nslabs - io_tlb_used)) + spin_lock_irqsave(&mem->lock, flags); + if (unlikely(nslots > mem->nslabs - mem->used)) goto not_found; - index = wrap = wrap_index(ALIGN(io_tlb_index, stride)); + index = wrap = wrap_index(mem, ALIGN(mem->index, stride)); do { if ((slot_addr(tbl_dma_addr, index) & iotlb_align_mask) != (orig_addr & iotlb_align_mask)) { - index = wrap_index(index + 1); + index = wrap_index(mem, index + 1); continue; } @@ -572,34 +464,34 @@ static int find_slots(struct device *dev, phys_addr_t orig_addr, if (!iommu_is_span_boundary(index, nslots, nr_slots(tbl_dma_addr), max_slots)) { - if (io_tlb_list[index] >= nslots) + if (mem->slots[index].list >= nslots) goto found; } - index = wrap_index(index + stride); + index = wrap_index(mem, index + stride); } while (index != wrap); not_found: - spin_unlock_irqrestore(&io_tlb_lock, flags); + spin_unlock_irqrestore(&mem->lock, flags); return -1; found: for (i = index; i < index + nslots; i++) - io_tlb_list[i] = 0; + mem->slots[i].list = 0; for (i = index - 1; io_tlb_offset(i) != IO_TLB_SEGSIZE - 1 && - io_tlb_list[i]; i--) - io_tlb_list[i] = ++count; + mem->slots[i].list; i--) + mem->slots[i].list = ++count; /* * Update the indices to avoid searching in the next round. */ - if (index + nslots < io_tlb_nslabs) - io_tlb_index = index + nslots; + if (index + nslots < mem->nslabs) + mem->index = index + nslots; else - io_tlb_index = 0; - io_tlb_used += nslots; + mem->index = 0; + mem->used += nslots; - spin_unlock_irqrestore(&io_tlb_lock, flags); + spin_unlock_irqrestore(&mem->lock, flags); return index; } @@ -607,11 +499,13 @@ phys_addr_t swiotlb_tbl_map_single(struct device *dev, phys_addr_t orig_addr, size_t mapping_size, size_t alloc_size, enum dma_data_direction dir, unsigned long attrs) { + struct io_tlb_mem *mem = io_tlb_default_mem; unsigned int offset = swiotlb_align_offset(dev, orig_addr); - unsigned int index, i; + unsigned int i; + int index; phys_addr_t tlb_addr; - if (no_iotlb_memory) + if (!mem) panic("Can not allocate SWIOTLB buffer earlier and can't now provide you with the DMA bounce buffer"); if (mem_encrypt_active()) @@ -628,7 +522,7 @@ phys_addr_t swiotlb_tbl_map_single(struct device *dev, phys_addr_t orig_addr, if (!(attrs & DMA_ATTR_NO_WARN)) dev_warn_ratelimited(dev, "swiotlb buffer is full (sz: %zd bytes), total %lu (slots), used %lu (slots)\n", - alloc_size, io_tlb_nslabs, io_tlb_used); + alloc_size, mem->nslabs, mem->used); return (phys_addr_t)DMA_MAPPING_ERROR; } @@ -638,49 +532,37 @@ phys_addr_t swiotlb_tbl_map_single(struct device *dev, phys_addr_t orig_addr, * needed. */ for (i = 0; i < nr_slots(alloc_size + offset); i++) { - io_tlb_orig_addr[index + i] = slot_addr(orig_addr, i); - io_tlb_orig_size[index+i] = alloc_size - (i << IO_TLB_SHIFT); + mem->slots[index + i].orig_addr = slot_addr(orig_addr, i); + mem->slots[index + i].alloc_size = + alloc_size - (i << IO_TLB_SHIFT); } - tlb_addr = slot_addr(io_tlb_start, index) + offset; + tlb_addr = slot_addr(mem->start, index) + offset; if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC) && (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL)) - swiotlb_bounce(orig_addr, tlb_addr, mapping_size, DMA_TO_DEVICE); + swiotlb_bounce(dev, tlb_addr, mapping_size, DMA_TO_DEVICE); return tlb_addr; } -static void validate_sync_size_and_truncate(struct device *hwdev, size_t orig_size, size_t *size) -{ - if (*size > orig_size) { - /* Warn and truncate mapping_size */ - dev_WARN_ONCE(hwdev, 1, - "Attempt for buffer overflow. Original size: %zu. Mapping size: %zu.\n", - orig_size, *size); - *size = orig_size; - } -} - /* * tlb_addr is the physical address of the bounce buffer to unmap. */ void swiotlb_tbl_unmap_single(struct device *hwdev, phys_addr_t tlb_addr, - size_t mapping_size, size_t alloc_size, - enum dma_data_direction dir, unsigned long attrs) + size_t mapping_size, enum dma_data_direction dir, + unsigned long attrs) { + struct io_tlb_mem *mem = io_tlb_default_mem; unsigned long flags; unsigned int offset = swiotlb_align_offset(hwdev, tlb_addr); - int i, count, nslots = nr_slots(alloc_size + offset); - int index = (tlb_addr - offset - io_tlb_start) >> IO_TLB_SHIFT; - phys_addr_t orig_addr = io_tlb_orig_addr[index]; - - validate_sync_size_and_truncate(hwdev, io_tlb_orig_size[index], &mapping_size); + int index = (tlb_addr - offset - mem->start) >> IO_TLB_SHIFT; + int nslots = nr_slots(mem->slots[index].alloc_size + offset); + int count, i; /* * First, sync the memory before unmapping the entry */ - if (orig_addr != INVALID_PHYS_ADDR && - !(attrs & DMA_ATTR_SKIP_CPU_SYNC) && - ((dir == DMA_FROM_DEVICE) || (dir == DMA_BIDIRECTIONAL))) - swiotlb_bounce(orig_addr, tlb_addr, mapping_size, DMA_FROM_DEVICE); + if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC) && + (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)) + swiotlb_bounce(hwdev, tlb_addr, mapping_size, DMA_FROM_DEVICE); /* * Return the buffer to the free list by setting the corresponding @@ -688,9 +570,9 @@ void swiotlb_tbl_unmap_single(struct device *hwdev, phys_addr_t tlb_addr, * While returning the entries to the free list, we merge the entries * with slots below and above the pool being returned. */ - spin_lock_irqsave(&io_tlb_lock, flags); + spin_lock_irqsave(&mem->lock, flags); if (index + nslots < ALIGN(index + 1, IO_TLB_SEGSIZE)) - count = io_tlb_list[index + nslots]; + count = mem->slots[index + nslots].list; else count = 0; @@ -699,9 +581,9 @@ void swiotlb_tbl_unmap_single(struct device *hwdev, phys_addr_t tlb_addr, * superceeding slots */ for (i = index + nslots - 1; i >= index; i--) { - io_tlb_list[i] = ++count; - io_tlb_orig_addr[i] = INVALID_PHYS_ADDR; - io_tlb_orig_size[i] = 0; + mem->slots[i].list = ++count; + mem->slots[i].orig_addr = INVALID_PHYS_ADDR; + mem->slots[i].alloc_size = 0; } /* @@ -709,44 +591,29 @@ void swiotlb_tbl_unmap_single(struct device *hwdev, phys_addr_t tlb_addr, * available (non zero) */ for (i = index - 1; - io_tlb_offset(i) != IO_TLB_SEGSIZE - 1 && io_tlb_list[i]; + io_tlb_offset(i) != IO_TLB_SEGSIZE - 1 && mem->slots[i].list; i--) - io_tlb_list[i] = ++count; - io_tlb_used -= nslots; - spin_unlock_irqrestore(&io_tlb_lock, flags); + mem->slots[i].list = ++count; + mem->used -= nslots; + spin_unlock_irqrestore(&mem->lock, flags); } -void swiotlb_tbl_sync_single(struct device *hwdev, phys_addr_t tlb_addr, - size_t size, enum dma_data_direction dir, - enum dma_sync_target target) +void swiotlb_sync_single_for_device(struct device *dev, phys_addr_t tlb_addr, + size_t size, enum dma_data_direction dir) { - int index = (tlb_addr - io_tlb_start) >> IO_TLB_SHIFT; - size_t orig_size = io_tlb_orig_size[index]; - phys_addr_t orig_addr = io_tlb_orig_addr[index]; - - if (orig_addr == INVALID_PHYS_ADDR) - return; + if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL) + swiotlb_bounce(dev, tlb_addr, size, DMA_TO_DEVICE); + else + BUG_ON(dir != DMA_FROM_DEVICE); +} - validate_sync_size_and_truncate(hwdev, orig_size, &size); - - switch (target) { - case SYNC_FOR_CPU: - if (likely(dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)) - swiotlb_bounce(orig_addr, tlb_addr, - size, DMA_FROM_DEVICE); - else - BUG_ON(dir != DMA_TO_DEVICE); - break; - case SYNC_FOR_DEVICE: - if (likely(dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL)) - swiotlb_bounce(orig_addr, tlb_addr, - size, DMA_TO_DEVICE); - else - BUG_ON(dir != DMA_FROM_DEVICE); - break; - default: - BUG(); - } +void swiotlb_sync_single_for_cpu(struct device *dev, phys_addr_t tlb_addr, + size_t size, enum dma_data_direction dir) +{ + if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL) + swiotlb_bounce(dev, tlb_addr, size, DMA_FROM_DEVICE); + else + BUG_ON(dir != DMA_TO_DEVICE); } /* @@ -770,7 +637,7 @@ dma_addr_t swiotlb_map(struct device *dev, phys_addr_t paddr, size_t size, /* Ensure that the address returned is DMA'ble */ dma_addr = phys_to_dma_unencrypted(dev, swiotlb_addr); if (unlikely(!dma_capable(dev, dma_addr, size, true))) { - swiotlb_tbl_unmap_single(dev, swiotlb_addr, size, size, dir, + swiotlb_tbl_unmap_single(dev, swiotlb_addr, size, dir, attrs | DMA_ATTR_SKIP_CPU_SYNC); dev_WARN_ONCE(dev, 1, "swiotlb addr %pad+%zu overflow (mask %llx, bus limit %llx).\n", @@ -790,22 +657,21 @@ size_t swiotlb_max_mapping_size(struct device *dev) bool is_swiotlb_active(void) { - /* - * When SWIOTLB is initialized, even if io_tlb_start points to physical - * address zero, io_tlb_end surely doesn't. - */ - return io_tlb_end != 0; + return io_tlb_default_mem != NULL; } +EXPORT_SYMBOL_GPL(is_swiotlb_active); #ifdef CONFIG_DEBUG_FS static int __init swiotlb_create_debugfs(void) { - struct dentry *root; + struct io_tlb_mem *mem = io_tlb_default_mem; - root = debugfs_create_dir("swiotlb", NULL); - debugfs_create_ulong("io_tlb_nslabs", 0400, root, &io_tlb_nslabs); - debugfs_create_ulong("io_tlb_used", 0400, root, &io_tlb_used); + if (!mem) + return 0; + mem->debugfs = debugfs_create_dir("swiotlb", NULL); + debugfs_create_ulong("io_tlb_nslabs", 0400, mem->debugfs, &mem->nslabs); + debugfs_create_ulong("io_tlb_used", 0400, mem->debugfs, &mem->used); return 0; } diff --git a/kernel/exit.c b/kernel/exit.c index 0596526ed9ea..fd1c04193e18 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1440,9 +1440,48 @@ void __wake_up_parent(struct task_struct *p, struct task_struct *parent) TASK_INTERRUPTIBLE, p); } +static bool is_effectively_child(struct wait_opts *wo, bool ptrace, + struct task_struct *target) +{ + struct task_struct *parent = + !ptrace ? target->real_parent : target->parent; + + return current == parent || (!(wo->wo_flags & __WNOTHREAD) && + same_thread_group(current, parent)); +} + +/* + * Optimization for waiting on PIDTYPE_PID. No need to iterate through child + * and tracee lists to find the target task. + */ +static int do_wait_pid(struct wait_opts *wo) +{ + bool ptrace; + struct task_struct *target; + int retval; + + ptrace = false; + target = pid_task(wo->wo_pid, PIDTYPE_TGID); + if (target && is_effectively_child(wo, ptrace, target)) { + retval = wait_consider_task(wo, ptrace, target); + if (retval) + return retval; + } + + ptrace = true; + target = pid_task(wo->wo_pid, PIDTYPE_PID); + if (target && target->ptrace && + is_effectively_child(wo, ptrace, target)) { + retval = wait_consider_task(wo, ptrace, target); + if (retval) + return retval; + } + + return 0; +} + static long do_wait(struct wait_opts *wo) { - struct task_struct *tsk; int retval; trace_sched_process_wait(wo->wo_pid); @@ -1464,19 +1503,27 @@ repeat: set_current_state(TASK_INTERRUPTIBLE); read_lock(&tasklist_lock); - tsk = current; - do { - retval = do_wait_thread(wo, tsk); - if (retval) - goto end; - retval = ptrace_do_wait(wo, tsk); + if (wo->wo_type == PIDTYPE_PID) { + retval = do_wait_pid(wo); if (retval) goto end; + } else { + struct task_struct *tsk = current; + + do { + retval = do_wait_thread(wo, tsk); + if (retval) + goto end; - if (wo->wo_flags & __WNOTHREAD) - break; - } while_each_thread(current, tsk); + retval = ptrace_do_wait(wo, tsk); + if (retval) + goto end; + + if (wo->wo_flags & __WNOTHREAD) + break; + } while_each_thread(current, tsk); + } read_unlock(&tasklist_lock); notask: diff --git a/kernel/fork.c b/kernel/fork.c index 771e0ea90499..dc06afd725cb 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1145,7 +1145,7 @@ void mmput_async(struct mm_struct *mm) * invocations: in mmput() nobody alive left, in execve task is single * threaded. sys_prctl(PR_SET_MM_MAP/EXE_FILE) also needs to set the * mm->exe_file, but does so without using set_mm_exe_file() in order - * to do avoid the need for any locks. + * to avoid the need for any locks. */ void set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file) { @@ -1396,7 +1396,6 @@ fail_nomem: static int copy_mm(unsigned long clone_flags, struct task_struct *tsk) { struct mm_struct *mm, *oldmm; - int retval; tsk->min_flt = tsk->maj_flt = 0; tsk->nvcsw = tsk->nivcsw = 0; @@ -1423,21 +1422,15 @@ static int copy_mm(unsigned long clone_flags, struct task_struct *tsk) if (clone_flags & CLONE_VM) { mmget(oldmm); mm = oldmm; - goto good_mm; + } else { + mm = dup_mm(tsk, current->mm); + if (!mm) + return -ENOMEM; } - retval = -ENOMEM; - mm = dup_mm(tsk, current->mm); - if (!mm) - goto fail_nomem; - -good_mm: tsk->mm = mm; tsk->active_mm = mm; return 0; - -fail_nomem: - return retval; } static int copy_fs(unsigned long clone_flags, struct task_struct *tsk) @@ -1743,7 +1736,7 @@ static int pidfd_release(struct inode *inode, struct file *file) * /proc/<pid>/status where Pid and NSpid are always shown relative to * the pid namespace of the procfs instance. The difference becomes * obvious when sending around a pidfd between pid namespaces from a - * different branch of the tree, i.e. where no ancestoral relation is + * different branch of the tree, i.e. where no ancestral relation is * present between the pid namespaces: * - create two new pid namespaces ns1 and ns2 in the initial pid * namespace (also take care to create new mount namespaces in the @@ -2735,8 +2728,8 @@ static bool clone3_args_valid(struct kernel_clone_args *kargs) return false; /* - * - make the CLONE_DETACHED bit reuseable for clone3 - * - make the CSIGNAL bits reuseable for clone3 + * - make the CLONE_DETACHED bit reusable for clone3 + * - make the CSIGNAL bits reusable for clone3 */ if (kargs->flags & (CLONE_DETACHED | CSIGNAL)) return false; diff --git a/kernel/futex.c b/kernel/futex.c index c98b825da9cf..4938a00bc785 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -3710,8 +3710,7 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, if (op & FUTEX_CLOCK_REALTIME) { flags |= FLAGS_CLOCKRT; - if (cmd != FUTEX_WAIT && cmd != FUTEX_WAIT_BITSET && \ - cmd != FUTEX_WAIT_REQUEUE_PI) + if (cmd != FUTEX_WAIT_BITSET && cmd != FUTEX_WAIT_REQUEUE_PI) return -ENOSYS; } @@ -3758,42 +3757,52 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, return -ENOSYS; } +static __always_inline bool futex_cmd_has_timeout(u32 cmd) +{ + switch (cmd) { + case FUTEX_WAIT: + case FUTEX_LOCK_PI: + case FUTEX_WAIT_BITSET: + case FUTEX_WAIT_REQUEUE_PI: + return true; + } + return false; +} + +static __always_inline int +futex_init_timeout(u32 cmd, u32 op, struct timespec64 *ts, ktime_t *t) +{ + if (!timespec64_valid(ts)) + return -EINVAL; + + *t = timespec64_to_ktime(*ts); + if (cmd == FUTEX_WAIT) + *t = ktime_add_safe(ktime_get(), *t); + else if (cmd != FUTEX_LOCK_PI && !(op & FUTEX_CLOCK_REALTIME)) + *t = timens_ktime_to_host(CLOCK_MONOTONIC, *t); + return 0; +} SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val, const struct __kernel_timespec __user *, utime, u32 __user *, uaddr2, u32, val3) { - struct timespec64 ts; + int ret, cmd = op & FUTEX_CMD_MASK; ktime_t t, *tp = NULL; - u32 val2 = 0; - int cmd = op & FUTEX_CMD_MASK; + struct timespec64 ts; - if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI || - cmd == FUTEX_WAIT_BITSET || - cmd == FUTEX_WAIT_REQUEUE_PI)) { + if (utime && futex_cmd_has_timeout(cmd)) { if (unlikely(should_fail_futex(!(op & FUTEX_PRIVATE_FLAG)))) return -EFAULT; if (get_timespec64(&ts, utime)) return -EFAULT; - if (!timespec64_valid(&ts)) - return -EINVAL; - - t = timespec64_to_ktime(ts); - if (cmd == FUTEX_WAIT) - t = ktime_add_safe(ktime_get(), t); - else if (!(op & FUTEX_CLOCK_REALTIME)) - t = timens_ktime_to_host(CLOCK_MONOTONIC, t); + ret = futex_init_timeout(cmd, op, &ts, &t); + if (ret) + return ret; tp = &t; } - /* - * requeue parameter in 'utime' if cmd == FUTEX_*_REQUEUE_*. - * number of waiters to wake in 'utime' if cmd == FUTEX_WAKE_OP. - */ - if (cmd == FUTEX_REQUEUE || cmd == FUTEX_CMP_REQUEUE || - cmd == FUTEX_CMP_REQUEUE_PI || cmd == FUTEX_WAKE_OP) - val2 = (u32) (unsigned long) utime; - return do_futex(uaddr, op, val, tp, uaddr2, val2, val3); + return do_futex(uaddr, op, val, tp, uaddr2, (unsigned long)utime, val3); } #ifdef CONFIG_COMPAT @@ -3959,31 +3968,20 @@ SYSCALL_DEFINE6(futex_time32, u32 __user *, uaddr, int, op, u32, val, const struct old_timespec32 __user *, utime, u32 __user *, uaddr2, u32, val3) { - struct timespec64 ts; + int ret, cmd = op & FUTEX_CMD_MASK; ktime_t t, *tp = NULL; - int val2 = 0; - int cmd = op & FUTEX_CMD_MASK; + struct timespec64 ts; - if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI || - cmd == FUTEX_WAIT_BITSET || - cmd == FUTEX_WAIT_REQUEUE_PI)) { + if (utime && futex_cmd_has_timeout(cmd)) { if (get_old_timespec32(&ts, utime)) return -EFAULT; - if (!timespec64_valid(&ts)) - return -EINVAL; - - t = timespec64_to_ktime(ts); - if (cmd == FUTEX_WAIT) - t = ktime_add_safe(ktime_get(), t); - else if (!(op & FUTEX_CLOCK_REALTIME)) - t = timens_ktime_to_host(CLOCK_MONOTONIC, t); + ret = futex_init_timeout(cmd, op, &ts, &t); + if (ret) + return ret; tp = &t; } - if (cmd == FUTEX_REQUEUE || cmd == FUTEX_CMP_REQUEUE || - cmd == FUTEX_CMP_REQUEUE_PI || cmd == FUTEX_WAKE_OP) - val2 = (int) (unsigned long) utime; - return do_futex(uaddr, op, val, tp, uaddr2, val2, val3); + return do_futex(uaddr, op, val, tp, uaddr2, (unsigned long)utime, val3); } #endif /* CONFIG_COMPAT_32BIT_TIME */ diff --git a/kernel/gcov/Kconfig b/kernel/gcov/Kconfig index f62de2dea8a3..58f87a3092f3 100644 --- a/kernel/gcov/Kconfig +++ b/kernel/gcov/Kconfig @@ -4,6 +4,7 @@ menu "GCOV-based kernel profiling" config GCOV_KERNEL bool "Enable gcov-based kernel profiling" depends on DEBUG_FS + depends on !CC_IS_CLANG || CLANG_VERSION >= 110000 select CONSTRUCTORS default n help diff --git a/kernel/gcov/base.c b/kernel/gcov/base.c index 0ffe9f194080..073a3738c5e6 100644 --- a/kernel/gcov/base.c +++ b/kernel/gcov/base.c @@ -49,6 +49,55 @@ void gcov_enable_events(void) mutex_unlock(&gcov_lock); } +/** + * store_gcov_u32 - store 32 bit number in gcov format to buffer + * @buffer: target buffer or NULL + * @off: offset into the buffer + * @v: value to be stored + * + * Number format defined by gcc: numbers are recorded in the 32 bit + * unsigned binary form of the endianness of the machine generating the + * file. Returns the number of bytes stored. If @buffer is %NULL, doesn't + * store anything. + */ +size_t store_gcov_u32(void *buffer, size_t off, u32 v) +{ + u32 *data; + + if (buffer) { + data = buffer + off; + *data = v; + } + + return sizeof(*data); +} + +/** + * store_gcov_u64 - store 64 bit number in gcov format to buffer + * @buffer: target buffer or NULL + * @off: offset into the buffer + * @v: value to be stored + * + * Number format defined by gcc: numbers are recorded in the 32 bit + * unsigned binary form of the endianness of the machine generating the + * file. 64 bit numbers are stored as two 32 bit numbers, the low part + * first. Returns the number of bytes stored. If @buffer is %NULL, doesn't store + * anything. + */ +size_t store_gcov_u64(void *buffer, size_t off, u64 v) +{ + u32 *data; + + if (buffer) { + data = buffer + off; + + data[0] = (v & 0xffffffffUL); + data[1] = (v >> 32); + } + + return sizeof(*data) * 2; +} + #ifdef CONFIG_MODULES /* Update list and generate events when modules are unloaded. */ static int gcov_module_notifier(struct notifier_block *nb, unsigned long event, diff --git a/kernel/gcov/clang.c b/kernel/gcov/clang.c index c466c7fbdece..cbb0bed958ab 100644 --- a/kernel/gcov/clang.c +++ b/kernel/gcov/clang.c @@ -48,9 +48,8 @@ #include <linux/list.h> #include <linux/printk.h> #include <linux/ratelimit.h> -#include <linux/seq_file.h> #include <linux/slab.h> -#include <linux/vmalloc.h> +#include <linux/mm.h> #include "gcov.h" typedef void (*llvm_gcov_callback)(void); @@ -70,16 +69,10 @@ struct gcov_fn_info { u32 ident; u32 checksum; -#if CONFIG_CLANG_VERSION < 110000 - u8 use_extra_checksum; -#endif u32 cfg_checksum; u32 num_counters; u64 *counters; -#if CONFIG_CLANG_VERSION < 110000 - const char *function_name; -#endif }; static struct gcov_info *current_info; @@ -109,16 +102,6 @@ void llvm_gcov_init(llvm_gcov_callback writeout, llvm_gcov_callback flush) } EXPORT_SYMBOL(llvm_gcov_init); -#if CONFIG_CLANG_VERSION < 110000 -void llvm_gcda_start_file(const char *orig_filename, const char version[4], - u32 checksum) -{ - current_info->filename = orig_filename; - memcpy(¤t_info->version, version, sizeof(current_info->version)); - current_info->checksum = checksum; -} -EXPORT_SYMBOL(llvm_gcda_start_file); -#else void llvm_gcda_start_file(const char *orig_filename, u32 version, u32 checksum) { current_info->filename = orig_filename; @@ -126,28 +109,7 @@ void llvm_gcda_start_file(const char *orig_filename, u32 version, u32 checksum) current_info->checksum = checksum; } EXPORT_SYMBOL(llvm_gcda_start_file); -#endif - -#if CONFIG_CLANG_VERSION < 110000 -void llvm_gcda_emit_function(u32 ident, const char *function_name, - u32 func_checksum, u8 use_extra_checksum, u32 cfg_checksum) -{ - struct gcov_fn_info *info = kzalloc(sizeof(*info), GFP_KERNEL); - - if (!info) - return; - - INIT_LIST_HEAD(&info->head); - info->ident = ident; - info->checksum = func_checksum; - info->use_extra_checksum = use_extra_checksum; - info->cfg_checksum = cfg_checksum; - if (function_name) - info->function_name = kstrdup(function_name, GFP_KERNEL); - list_add_tail(&info->head, ¤t_info->functions); -} -#else void llvm_gcda_emit_function(u32 ident, u32 func_checksum, u32 cfg_checksum) { struct gcov_fn_info *info = kzalloc(sizeof(*info), GFP_KERNEL); @@ -161,7 +123,6 @@ void llvm_gcda_emit_function(u32 ident, u32 func_checksum, u32 cfg_checksum) info->cfg_checksum = cfg_checksum; list_add_tail(&info->head, ¤t_info->functions); } -#endif EXPORT_SYMBOL(llvm_gcda_emit_function); void llvm_gcda_emit_arcs(u32 num_counters, u64 *counters) @@ -292,16 +253,8 @@ int gcov_info_is_compatible(struct gcov_info *info1, struct gcov_info *info2) !list_is_last(&fn_ptr2->head, &info2->functions)) { if (fn_ptr1->checksum != fn_ptr2->checksum) return false; -#if CONFIG_CLANG_VERSION < 110000 - if (fn_ptr1->use_extra_checksum != fn_ptr2->use_extra_checksum) - return false; - if (fn_ptr1->use_extra_checksum && - fn_ptr1->cfg_checksum != fn_ptr2->cfg_checksum) - return false; -#else if (fn_ptr1->cfg_checksum != fn_ptr2->cfg_checksum) return false; -#endif fn_ptr1 = list_next_entry(fn_ptr1, head); fn_ptr2 = list_next_entry(fn_ptr2, head); } @@ -330,35 +283,6 @@ void gcov_info_add(struct gcov_info *dst, struct gcov_info *src) } } -#if CONFIG_CLANG_VERSION < 110000 -static struct gcov_fn_info *gcov_fn_info_dup(struct gcov_fn_info *fn) -{ - size_t cv_size; /* counter values size */ - struct gcov_fn_info *fn_dup = kmemdup(fn, sizeof(*fn), - GFP_KERNEL); - if (!fn_dup) - return NULL; - INIT_LIST_HEAD(&fn_dup->head); - - fn_dup->function_name = kstrdup(fn->function_name, GFP_KERNEL); - if (!fn_dup->function_name) - goto err_name; - - cv_size = fn->num_counters * sizeof(fn->counters[0]); - fn_dup->counters = vmalloc(cv_size); - if (!fn_dup->counters) - goto err_counters; - memcpy(fn_dup->counters, fn->counters, cv_size); - - return fn_dup; - -err_counters: - kfree(fn_dup->function_name); -err_name: - kfree(fn_dup); - return NULL; -} -#else static struct gcov_fn_info *gcov_fn_info_dup(struct gcov_fn_info *fn) { size_t cv_size; /* counter values size */ @@ -369,7 +293,7 @@ static struct gcov_fn_info *gcov_fn_info_dup(struct gcov_fn_info *fn) INIT_LIST_HEAD(&fn_dup->head); cv_size = fn->num_counters * sizeof(fn->counters[0]); - fn_dup->counters = vmalloc(cv_size); + fn_dup->counters = kvmalloc(cv_size, GFP_KERNEL); if (!fn_dup->counters) { kfree(fn_dup); return NULL; @@ -379,7 +303,6 @@ static struct gcov_fn_info *gcov_fn_info_dup(struct gcov_fn_info *fn) return fn_dup; } -#endif /** * gcov_info_dup - duplicate profiling data set @@ -420,99 +343,18 @@ err: * gcov_info_free - release memory for profiling data set duplicate * @info: profiling data set duplicate to free */ -#if CONFIG_CLANG_VERSION < 110000 -void gcov_info_free(struct gcov_info *info) -{ - struct gcov_fn_info *fn, *tmp; - - list_for_each_entry_safe(fn, tmp, &info->functions, head) { - kfree(fn->function_name); - vfree(fn->counters); - list_del(&fn->head); - kfree(fn); - } - kfree(info->filename); - kfree(info); -} -#else void gcov_info_free(struct gcov_info *info) { struct gcov_fn_info *fn, *tmp; list_for_each_entry_safe(fn, tmp, &info->functions, head) { - vfree(fn->counters); + kvfree(fn->counters); list_del(&fn->head); kfree(fn); } kfree(info->filename); kfree(info); } -#endif - -#define ITER_STRIDE PAGE_SIZE - -/** - * struct gcov_iterator - specifies current file position in logical records - * @info: associated profiling data - * @buffer: buffer containing file data - * @size: size of buffer - * @pos: current position in file - */ -struct gcov_iterator { - struct gcov_info *info; - void *buffer; - size_t size; - loff_t pos; -}; - -/** - * store_gcov_u32 - store 32 bit number in gcov format to buffer - * @buffer: target buffer or NULL - * @off: offset into the buffer - * @v: value to be stored - * - * Number format defined by gcc: numbers are recorded in the 32 bit - * unsigned binary form of the endianness of the machine generating the - * file. Returns the number of bytes stored. If @buffer is %NULL, doesn't - * store anything. - */ -static size_t store_gcov_u32(void *buffer, size_t off, u32 v) -{ - u32 *data; - - if (buffer) { - data = buffer + off; - *data = v; - } - - return sizeof(*data); -} - -/** - * store_gcov_u64 - store 64 bit number in gcov format to buffer - * @buffer: target buffer or NULL - * @off: offset into the buffer - * @v: value to be stored - * - * Number format defined by gcc: numbers are recorded in the 32 bit - * unsigned binary form of the endianness of the machine generating the - * file. 64 bit numbers are stored as two 32 bit numbers, the low part - * first. Returns the number of bytes stored. If @buffer is %NULL, doesn't store - * anything. - */ -static size_t store_gcov_u64(void *buffer, size_t off, u64 v) -{ - u32 *data; - - if (buffer) { - data = buffer + off; - - data[0] = (v & 0xffffffffUL); - data[1] = (v >> 32); - } - - return sizeof(*data) * 2; -} /** * convert_to_gcda - convert profiling data set to gcda file format @@ -521,7 +363,7 @@ static size_t store_gcov_u64(void *buffer, size_t off, u64 v) * * Returns the number of bytes that were/would have been stored into the buffer. */ -static size_t convert_to_gcda(char *buffer, struct gcov_info *info) +size_t convert_to_gcda(char *buffer, struct gcov_info *info) { struct gcov_fn_info *fi_ptr; size_t pos = 0; @@ -535,21 +377,10 @@ static size_t convert_to_gcda(char *buffer, struct gcov_info *info) u32 i; pos += store_gcov_u32(buffer, pos, GCOV_TAG_FUNCTION); -#if CONFIG_CLANG_VERSION < 110000 - pos += store_gcov_u32(buffer, pos, - fi_ptr->use_extra_checksum ? 3 : 2); -#else pos += store_gcov_u32(buffer, pos, 3); -#endif pos += store_gcov_u32(buffer, pos, fi_ptr->ident); pos += store_gcov_u32(buffer, pos, fi_ptr->checksum); -#if CONFIG_CLANG_VERSION < 110000 - if (fi_ptr->use_extra_checksum) - pos += store_gcov_u32(buffer, pos, fi_ptr->cfg_checksum); -#else pos += store_gcov_u32(buffer, pos, fi_ptr->cfg_checksum); -#endif - pos += store_gcov_u32(buffer, pos, GCOV_TAG_COUNTER_BASE); pos += store_gcov_u32(buffer, pos, fi_ptr->num_counters * 2); for (i = 0; i < fi_ptr->num_counters; i++) @@ -558,102 +389,3 @@ static size_t convert_to_gcda(char *buffer, struct gcov_info *info) return pos; } - -/** - * gcov_iter_new - allocate and initialize profiling data iterator - * @info: profiling data set to be iterated - * - * Return file iterator on success, %NULL otherwise. - */ -struct gcov_iterator *gcov_iter_new(struct gcov_info *info) -{ - struct gcov_iterator *iter; - - iter = kzalloc(sizeof(struct gcov_iterator), GFP_KERNEL); - if (!iter) - goto err_free; - - iter->info = info; - /* Dry-run to get the actual buffer size. */ - iter->size = convert_to_gcda(NULL, info); - iter->buffer = vmalloc(iter->size); - if (!iter->buffer) - goto err_free; - - convert_to_gcda(iter->buffer, info); - - return iter; - -err_free: - kfree(iter); - return NULL; -} - - -/** - * gcov_iter_get_info - return profiling data set for given file iterator - * @iter: file iterator - */ -void gcov_iter_free(struct gcov_iterator *iter) -{ - vfree(iter->buffer); - kfree(iter); -} - -/** - * gcov_iter_get_info - return profiling data set for given file iterator - * @iter: file iterator - */ -struct gcov_info *gcov_iter_get_info(struct gcov_iterator *iter) -{ - return iter->info; -} - -/** - * gcov_iter_start - reset file iterator to starting position - * @iter: file iterator - */ -void gcov_iter_start(struct gcov_iterator *iter) -{ - iter->pos = 0; -} - -/** - * gcov_iter_next - advance file iterator to next logical record - * @iter: file iterator - * - * Return zero if new position is valid, non-zero if iterator has reached end. - */ -int gcov_iter_next(struct gcov_iterator *iter) -{ - if (iter->pos < iter->size) - iter->pos += ITER_STRIDE; - - if (iter->pos >= iter->size) - return -EINVAL; - - return 0; -} - -/** - * gcov_iter_write - write data for current pos to seq_file - * @iter: file iterator - * @seq: seq_file handle - * - * Return zero on success, non-zero otherwise. - */ -int gcov_iter_write(struct gcov_iterator *iter, struct seq_file *seq) -{ - size_t len; - - if (iter->pos >= iter->size) - return -EINVAL; - - len = ITER_STRIDE; - if (iter->pos + len > iter->size) - len = iter->size - iter->pos; - - seq_write(seq, iter->buffer + iter->pos, len); - - return 0; -} diff --git a/kernel/gcov/fs.c b/kernel/gcov/fs.c index 82babf5aa077..5c3086cad8f9 100644 --- a/kernel/gcov/fs.c +++ b/kernel/gcov/fs.c @@ -26,6 +26,7 @@ #include <linux/slab.h> #include <linux/mutex.h> #include <linux/seq_file.h> +#include <linux/mm.h> #include "gcov.h" /** @@ -85,6 +86,115 @@ static int __init gcov_persist_setup(char *str) } __setup("gcov_persist=", gcov_persist_setup); +#define ITER_STRIDE PAGE_SIZE + +/** + * struct gcov_iterator - specifies current file position in logical records + * @info: associated profiling data + * @buffer: buffer containing file data + * @size: size of buffer + * @pos: current position in file + */ +struct gcov_iterator { + struct gcov_info *info; + size_t size; + loff_t pos; + char buffer[]; +}; + +/** + * gcov_iter_new - allocate and initialize profiling data iterator + * @info: profiling data set to be iterated + * + * Return file iterator on success, %NULL otherwise. + */ +static struct gcov_iterator *gcov_iter_new(struct gcov_info *info) +{ + struct gcov_iterator *iter; + size_t size; + + /* Dry-run to get the actual buffer size. */ + size = convert_to_gcda(NULL, info); + + iter = kvmalloc(struct_size(iter, buffer, size), GFP_KERNEL); + if (!iter) + return NULL; + + iter->info = info; + iter->size = size; + convert_to_gcda(iter->buffer, info); + + return iter; +} + + +/** + * gcov_iter_free - free iterator data + * @iter: file iterator + */ +static void gcov_iter_free(struct gcov_iterator *iter) +{ + kvfree(iter); +} + +/** + * gcov_iter_get_info - return profiling data set for given file iterator + * @iter: file iterator + */ +static struct gcov_info *gcov_iter_get_info(struct gcov_iterator *iter) +{ + return iter->info; +} + +/** + * gcov_iter_start - reset file iterator to starting position + * @iter: file iterator + */ +static void gcov_iter_start(struct gcov_iterator *iter) +{ + iter->pos = 0; +} + +/** + * gcov_iter_next - advance file iterator to next logical record + * @iter: file iterator + * + * Return zero if new position is valid, non-zero if iterator has reached end. + */ +static int gcov_iter_next(struct gcov_iterator *iter) +{ + if (iter->pos < iter->size) + iter->pos += ITER_STRIDE; + + if (iter->pos >= iter->size) + return -EINVAL; + + return 0; +} + +/** + * gcov_iter_write - write data for current pos to seq_file + * @iter: file iterator + * @seq: seq_file handle + * + * Return zero on success, non-zero otherwise. + */ +static int gcov_iter_write(struct gcov_iterator *iter, struct seq_file *seq) +{ + size_t len; + + if (iter->pos >= iter->size) + return -EINVAL; + + len = ITER_STRIDE; + if (iter->pos + len > iter->size) + len = iter->size - iter->pos; + + seq_write(seq, iter->buffer + iter->pos, len); + + return 0; +} + /* * seq_file.start() implementation for gcov data files. Note that the * gcov_iterator interface is designed to be more restrictive than seq_file diff --git a/kernel/gcov/gcc_4_7.c b/kernel/gcov/gcc_4_7.c index c53408a00d0b..460c12b7dfea 100644 --- a/kernel/gcov/gcc_4_7.c +++ b/kernel/gcov/gcc_4_7.c @@ -15,8 +15,7 @@ #include <linux/errno.h> #include <linux/slab.h> #include <linux/string.h> -#include <linux/seq_file.h> -#include <linux/vmalloc.h> +#include <linux/mm.h> #include "gcov.h" #if (__GNUC__ >= 10) @@ -310,7 +309,7 @@ struct gcov_info *gcov_info_dup(struct gcov_info *info) cv_size = sizeof(gcov_type) * sci_ptr->num; - dci_ptr->values = vmalloc(cv_size); + dci_ptr->values = kvmalloc(cv_size, GFP_KERNEL); if (!dci_ptr->values) goto err_free; @@ -352,7 +351,7 @@ void gcov_info_free(struct gcov_info *info) ci_ptr = info->functions[fi_idx]->ctrs; for (ct_idx = 0; ct_idx < active; ct_idx++, ci_ptr++) - vfree(ci_ptr->values); + kvfree(ci_ptr->values); kfree(info->functions[fi_idx]); } @@ -363,71 +362,6 @@ free_info: kfree(info); } -#define ITER_STRIDE PAGE_SIZE - -/** - * struct gcov_iterator - specifies current file position in logical records - * @info: associated profiling data - * @buffer: buffer containing file data - * @size: size of buffer - * @pos: current position in file - */ -struct gcov_iterator { - struct gcov_info *info; - void *buffer; - size_t size; - loff_t pos; -}; - -/** - * store_gcov_u32 - store 32 bit number in gcov format to buffer - * @buffer: target buffer or NULL - * @off: offset into the buffer - * @v: value to be stored - * - * Number format defined by gcc: numbers are recorded in the 32 bit - * unsigned binary form of the endianness of the machine generating the - * file. Returns the number of bytes stored. If @buffer is %NULL, doesn't - * store anything. - */ -static size_t store_gcov_u32(void *buffer, size_t off, u32 v) -{ - u32 *data; - - if (buffer) { - data = buffer + off; - *data = v; - } - - return sizeof(*data); -} - -/** - * store_gcov_u64 - store 64 bit number in gcov format to buffer - * @buffer: target buffer or NULL - * @off: offset into the buffer - * @v: value to be stored - * - * Number format defined by gcc: numbers are recorded in the 32 bit - * unsigned binary form of the endianness of the machine generating the - * file. 64 bit numbers are stored as two 32 bit numbers, the low part - * first. Returns the number of bytes stored. If @buffer is %NULL, doesn't store - * anything. - */ -static size_t store_gcov_u64(void *buffer, size_t off, u64 v) -{ - u32 *data; - - if (buffer) { - data = buffer + off; - - data[0] = (v & 0xffffffffUL); - data[1] = (v >> 32); - } - - return sizeof(*data) * 2; -} - /** * convert_to_gcda - convert profiling data set to gcda file format * @buffer: the buffer to store file data or %NULL if no data should be stored @@ -435,7 +369,7 @@ static size_t store_gcov_u64(void *buffer, size_t off, u64 v) * * Returns the number of bytes that were/would have been stored into the buffer. */ -static size_t convert_to_gcda(char *buffer, struct gcov_info *info) +size_t convert_to_gcda(char *buffer, struct gcov_info *info) { struct gcov_fn_info *fi_ptr; struct gcov_ctr_info *ci_ptr; @@ -481,102 +415,3 @@ static size_t convert_to_gcda(char *buffer, struct gcov_info *info) return pos; } - -/** - * gcov_iter_new - allocate and initialize profiling data iterator - * @info: profiling data set to be iterated - * - * Return file iterator on success, %NULL otherwise. - */ -struct gcov_iterator *gcov_iter_new(struct gcov_info *info) -{ - struct gcov_iterator *iter; - - iter = kzalloc(sizeof(struct gcov_iterator), GFP_KERNEL); - if (!iter) - goto err_free; - - iter->info = info; - /* Dry-run to get the actual buffer size. */ - iter->size = convert_to_gcda(NULL, info); - iter->buffer = vmalloc(iter->size); - if (!iter->buffer) - goto err_free; - - convert_to_gcda(iter->buffer, info); - - return iter; - -err_free: - kfree(iter); - return NULL; -} - - -/** - * gcov_iter_get_info - return profiling data set for given file iterator - * @iter: file iterator - */ -void gcov_iter_free(struct gcov_iterator *iter) -{ - vfree(iter->buffer); - kfree(iter); -} - -/** - * gcov_iter_get_info - return profiling data set for given file iterator - * @iter: file iterator - */ -struct gcov_info *gcov_iter_get_info(struct gcov_iterator *iter) -{ - return iter->info; -} - -/** - * gcov_iter_start - reset file iterator to starting position - * @iter: file iterator - */ -void gcov_iter_start(struct gcov_iterator *iter) -{ - iter->pos = 0; -} - -/** - * gcov_iter_next - advance file iterator to next logical record - * @iter: file iterator - * - * Return zero if new position is valid, non-zero if iterator has reached end. - */ -int gcov_iter_next(struct gcov_iterator *iter) -{ - if (iter->pos < iter->size) - iter->pos += ITER_STRIDE; - - if (iter->pos >= iter->size) - return -EINVAL; - - return 0; -} - -/** - * gcov_iter_write - write data for current pos to seq_file - * @iter: file iterator - * @seq: seq_file handle - * - * Return zero on success, non-zero otherwise. - */ -int gcov_iter_write(struct gcov_iterator *iter, struct seq_file *seq) -{ - size_t len; - - if (iter->pos >= iter->size) - return -EINVAL; - - len = ITER_STRIDE; - if (iter->pos + len > iter->size) - len = iter->size - iter->pos; - - seq_write(seq, iter->buffer + iter->pos, len); - - return 0; -} diff --git a/kernel/gcov/gcov.h b/kernel/gcov/gcov.h index 6ab2c1808c9d..912b8ea01d33 100644 --- a/kernel/gcov/gcov.h +++ b/kernel/gcov/gcov.h @@ -48,6 +48,7 @@ struct gcov_info *gcov_info_next(struct gcov_info *info); void gcov_info_link(struct gcov_info *info); void gcov_info_unlink(struct gcov_info *prev, struct gcov_info *info); bool gcov_info_within_module(struct gcov_info *info, struct module *mod); +size_t convert_to_gcda(char *buffer, struct gcov_info *info); /* Base interface. */ enum gcov_action { @@ -58,16 +59,9 @@ enum gcov_action { void gcov_event(enum gcov_action action, struct gcov_info *info); void gcov_enable_events(void); -/* Iterator control. */ -struct seq_file; -struct gcov_iterator; - -struct gcov_iterator *gcov_iter_new(struct gcov_info *info); -void gcov_iter_free(struct gcov_iterator *iter); -void gcov_iter_start(struct gcov_iterator *iter); -int gcov_iter_next(struct gcov_iterator *iter); -int gcov_iter_write(struct gcov_iterator *iter, struct seq_file *seq); -struct gcov_info *gcov_iter_get_info(struct gcov_iterator *iter); +/* writing helpers */ +size_t store_gcov_u32(void *buffer, size_t off, u32 v); +size_t store_gcov_u64(void *buffer, size_t off, u64 v); /* gcov_info control. */ void gcov_info_reset(struct gcov_info *info); diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index f42ef868efd3..6284443b87ec 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -295,8 +295,8 @@ void irq_domain_update_bus_token(struct irq_domain *domain, EXPORT_SYMBOL_GPL(irq_domain_update_bus_token); /** - * irq_domain_add_simple() - Register an irq_domain and optionally map a range of irqs - * @of_node: pointer to interrupt controller's device tree node. + * irq_domain_create_simple() - Register an irq_domain and optionally map a range of irqs + * @fwnode: firmware node for the interrupt controller * @size: total number of irqs in mapping * @first_irq: first number of irq block assigned to the domain, * pass zero to assign irqs on-the-fly. If first_irq is non-zero, then @@ -312,15 +312,15 @@ EXPORT_SYMBOL_GPL(irq_domain_update_bus_token); * irqs get mapped dynamically on the fly. However, if the controller requires * static virq assignments (non-DT boot) then it will set that up correctly. */ -struct irq_domain *irq_domain_add_simple(struct device_node *of_node, - unsigned int size, - unsigned int first_irq, - const struct irq_domain_ops *ops, - void *host_data) +struct irq_domain *irq_domain_create_simple(struct fwnode_handle *fwnode, + unsigned int size, + unsigned int first_irq, + const struct irq_domain_ops *ops, + void *host_data) { struct irq_domain *domain; - domain = __irq_domain_add(of_node_to_fwnode(of_node), size, size, 0, ops, host_data); + domain = __irq_domain_add(fwnode, size, size, 0, ops, host_data); if (!domain) return NULL; @@ -328,7 +328,7 @@ struct irq_domain *irq_domain_add_simple(struct device_node *of_node, if (IS_ENABLED(CONFIG_SPARSE_IRQ)) { /* attempt to allocated irq_descs */ int rc = irq_alloc_descs(first_irq, first_irq, size, - of_node_to_nid(of_node)); + of_node_to_nid(to_of_node(fwnode))); if (rc < 0) pr_info("Cannot allocate irq_descs @ IRQ%d, assuming pre-allocated\n", first_irq); @@ -338,7 +338,7 @@ struct irq_domain *irq_domain_add_simple(struct device_node *of_node, return domain; } -EXPORT_SYMBOL_GPL(irq_domain_add_simple); +EXPORT_SYMBOL_GPL(irq_domain_create_simple); /** * irq_domain_add_legacy() - Allocate and register a legacy revmap irq_domain. diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index a0b6780740c8..f099baee3578 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -37,6 +37,7 @@ #include <linux/compiler.h> #include <linux/hugetlb.h> #include <linux/objtool.h> +#include <linux/kmsg_dump.h> #include <asm/page.h> #include <asm/sections.h> @@ -1165,7 +1166,7 @@ int kernel_kexec(void) #endif { kexec_in_progress = true; - kernel_restart_prepare(NULL); + kernel_restart_prepare("kexec reboot"); migrate_to_reboot_cpu(); /* @@ -1179,6 +1180,7 @@ int kernel_kexec(void) machine_shutdown(); } + kmsg_dump(KMSG_DUMP_SHUTDOWN); machine_kexec(kexec_image); #ifdef CONFIG_KEXEC_JUMP diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index 5c3447cf7ad5..33400ff051a8 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -740,8 +740,10 @@ static int kexec_calculate_store_digests(struct kimage *image) sha_region_sz = KEXEC_SEGMENT_MAX * sizeof(struct kexec_sha_region); sha_regions = vzalloc(sha_region_sz); - if (!sha_regions) + if (!sha_regions) { + ret = -ENOMEM; goto out_free_desc; + } desc->tfm = tfm; diff --git a/kernel/kmod.c b/kernel/kmod.c index 3cd075ce2a1e..b717134ebe17 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -58,7 +58,7 @@ static DECLARE_WAIT_QUEUE_HEAD(kmod_wq); /* modprobe_path is set via /proc/sys. */ -char modprobe_path[KMOD_PATH_LEN] = "/sbin/modprobe"; +char modprobe_path[KMOD_PATH_LEN] = CONFIG_MODPROBE_PATH; static void free_modprobe_argv(struct subprocess_info *info) { diff --git a/kernel/locking/qrwlock.c b/kernel/locking/qrwlock.c index b94f3831e963..ec36b73f4733 100644 --- a/kernel/locking/qrwlock.c +++ b/kernel/locking/qrwlock.c @@ -66,12 +66,12 @@ void queued_write_lock_slowpath(struct qrwlock *lock) arch_spin_lock(&lock->wait_lock); /* Try to acquire the lock directly if no reader is present */ - if (!atomic_read(&lock->cnts) && - (atomic_cmpxchg_acquire(&lock->cnts, 0, _QW_LOCKED) == 0)) + if (!(cnts = atomic_read(&lock->cnts)) && + atomic_try_cmpxchg_acquire(&lock->cnts, &cnts, _QW_LOCKED)) goto unlock; /* Set the waiting flag to notify readers that a writer is pending */ - atomic_add(_QW_WAITING, &lock->cnts); + atomic_or(_QW_WAITING, &lock->cnts); /* When no more readers or writers, set the locked flag */ do { diff --git a/kernel/resource.c b/kernel/resource.c index 627e61b0c124..028a5ab18818 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -64,12 +64,8 @@ static DEFINE_RWLOCK(resource_lock); static struct resource *bootmem_resource_free; static DEFINE_SPINLOCK(bootmem_resource_lock); -static struct resource *next_resource(struct resource *p, bool sibling_only) +static struct resource *next_resource(struct resource *p) { - /* Caller wants to traverse through siblings only */ - if (sibling_only) - return p->sibling; - if (p->child) return p->child; while (!p->sibling && p->parent) @@ -81,7 +77,7 @@ static void *r_next(struct seq_file *m, void *v, loff_t *pos) { struct resource *p = v; (*pos)++; - return (void *)next_resource(p, false); + return (void *)next_resource(p); } #ifdef CONFIG_PROC_FS @@ -330,14 +326,10 @@ EXPORT_SYMBOL(release_resource); * of the resource that's within [@start..@end]; if none is found, returns * -ENODEV. Returns -EINVAL for invalid parameters. * - * This function walks the whole tree and not just first level children - * unless @first_lvl is true. - * * @start: start address of the resource searched for * @end: end address of same resource * @flags: flags which the resource must have * @desc: descriptor the resource must have - * @first_lvl: walk only the first level children, if set * @res: return ptr, if resource found * * The caller must specify @start, @end, @flags, and @desc @@ -345,9 +337,8 @@ EXPORT_SYMBOL(release_resource); */ static int find_next_iomem_res(resource_size_t start, resource_size_t end, unsigned long flags, unsigned long desc, - bool first_lvl, struct resource *res) + struct resource *res) { - bool siblings_only = true; struct resource *p; if (!res) @@ -358,7 +349,7 @@ static int find_next_iomem_res(resource_size_t start, resource_size_t end, read_lock(&resource_lock); - for (p = iomem_resource.child; p; p = next_resource(p, siblings_only)) { + for (p = iomem_resource.child; p; p = next_resource(p)) { /* If we passed the resource we are looking for, stop */ if (p->start > end) { p = NULL; @@ -369,13 +360,6 @@ static int find_next_iomem_res(resource_size_t start, resource_size_t end, if (p->end < start) continue; - /* - * Now that we found a range that matches what we look for, - * check the flags and the descriptor. If we were not asked to - * use only the first level, start looking at children as well. - */ - siblings_only = first_lvl; - if ((p->flags & flags) != flags) continue; if ((desc != IORES_DESC_NONE) && (desc != p->desc)) @@ -402,14 +386,14 @@ static int find_next_iomem_res(resource_size_t start, resource_size_t end, static int __walk_iomem_res_desc(resource_size_t start, resource_size_t end, unsigned long flags, unsigned long desc, - bool first_lvl, void *arg, + void *arg, int (*func)(struct resource *, void *)) { struct resource res; int ret = -EINVAL; while (start < end && - !find_next_iomem_res(start, end, flags, desc, first_lvl, &res)) { + !find_next_iomem_res(start, end, flags, desc, &res)) { ret = (*func)(&res, arg); if (ret) break; @@ -431,7 +415,6 @@ static int __walk_iomem_res_desc(resource_size_t start, resource_size_t end, * @arg: function argument for the callback @func * @func: callback function that is called for each qualifying resource area * - * This walks through whole tree and not just first level children. * All the memory ranges which overlap start,end and also match flags and * desc are valid candidates. * @@ -441,7 +424,7 @@ static int __walk_iomem_res_desc(resource_size_t start, resource_size_t end, int walk_iomem_res_desc(unsigned long desc, unsigned long flags, u64 start, u64 end, void *arg, int (*func)(struct resource *, void *)) { - return __walk_iomem_res_desc(start, end, flags, desc, false, arg, func); + return __walk_iomem_res_desc(start, end, flags, desc, arg, func); } EXPORT_SYMBOL_GPL(walk_iomem_res_desc); @@ -457,8 +440,8 @@ int walk_system_ram_res(u64 start, u64 end, void *arg, { unsigned long flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY; - return __walk_iomem_res_desc(start, end, flags, IORES_DESC_NONE, true, - arg, func); + return __walk_iomem_res_desc(start, end, flags, IORES_DESC_NONE, arg, + func); } /* @@ -470,17 +453,14 @@ int walk_mem_res(u64 start, u64 end, void *arg, { unsigned long flags = IORESOURCE_MEM | IORESOURCE_BUSY; - return __walk_iomem_res_desc(start, end, flags, IORES_DESC_NONE, true, - arg, func); + return __walk_iomem_res_desc(start, end, flags, IORES_DESC_NONE, arg, + func); } /* * This function calls the @func callback against all memory ranges of type * System RAM which are marked as IORESOURCE_SYSTEM_RAM and IORESOUCE_BUSY. * It is to be used only for System RAM. - * - * This will find System RAM ranges that are children of top-level resources - * in addition to top-level System RAM resources. */ int walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages, void *arg, int (*func)(unsigned long, unsigned long, void *)) @@ -495,8 +475,7 @@ int walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages, end = ((u64)(start_pfn + nr_pages) << PAGE_SHIFT) - 1; flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY; while (start < end && - !find_next_iomem_res(start, end, flags, IORES_DESC_NONE, - false, &res)) { + !find_next_iomem_res(start, end, flags, IORES_DESC_NONE, &res)) { pfn = PFN_UP(res.start); end_pfn = PFN_DOWN(res.end + 1); if (end_pfn > pfn) @@ -523,6 +502,34 @@ int __weak page_is_ram(unsigned long pfn) } EXPORT_SYMBOL_GPL(page_is_ram); +static int __region_intersects(resource_size_t start, size_t size, + unsigned long flags, unsigned long desc) +{ + struct resource res; + int type = 0; int other = 0; + struct resource *p; + + res.start = start; + res.end = start + size - 1; + + for (p = iomem_resource.child; p ; p = p->sibling) { + bool is_type = (((p->flags & flags) == flags) && + ((desc == IORES_DESC_NONE) || + (desc == p->desc))); + + if (resource_overlaps(p, &res)) + is_type ? type++ : other++; + } + + if (type == 0) + return REGION_DISJOINT; + + if (other == 0) + return REGION_INTERSECTS; + + return REGION_MIXED; +} + /** * region_intersects() - determine intersection of region with known resources * @start: region start address @@ -546,31 +553,13 @@ EXPORT_SYMBOL_GPL(page_is_ram); int region_intersects(resource_size_t start, size_t size, unsigned long flags, unsigned long desc) { - struct resource res; - int type = 0; int other = 0; - struct resource *p; - - res.start = start; - res.end = start + size - 1; + int ret; read_lock(&resource_lock); - for (p = iomem_resource.child; p ; p = p->sibling) { - bool is_type = (((p->flags & flags) == flags) && - ((desc == IORES_DESC_NONE) || - (desc == p->desc))); - - if (resource_overlaps(p, &res)) - is_type ? type++ : other++; - } + ret = __region_intersects(start, size, flags, desc); read_unlock(&resource_lock); - if (type == 0) - return REGION_DISJOINT; - - if (other == 0) - return REGION_INTERSECTS; - - return REGION_MIXED; + return ret; } EXPORT_SYMBOL_GPL(region_intersects); @@ -1171,31 +1160,16 @@ struct address_space *iomem_get_mapping(void) return smp_load_acquire(&iomem_inode)->i_mapping; } -/** - * __request_region - create a new busy resource region - * @parent: parent resource descriptor - * @start: resource start address - * @n: resource region size - * @name: reserving caller's ID string - * @flags: IO resource flags - */ -struct resource * __request_region(struct resource *parent, +static int __request_region_locked(struct resource *res, struct resource *parent, resource_size_t start, resource_size_t n, const char *name, int flags) { DECLARE_WAITQUEUE(wait, current); - struct resource *res = alloc_resource(GFP_KERNEL); - struct resource *orig_parent = parent; - - if (!res) - return NULL; res->name = name; res->start = start; res->end = start + n - 1; - write_lock(&resource_lock); - for (;;) { struct resource *conflict; @@ -1231,13 +1205,40 @@ struct resource * __request_region(struct resource *parent, continue; } /* Uhhuh, that didn't work out.. */ - free_resource(res); - res = NULL; - break; + return -EBUSY; } + + return 0; +} + +/** + * __request_region - create a new busy resource region + * @parent: parent resource descriptor + * @start: resource start address + * @n: resource region size + * @name: reserving caller's ID string + * @flags: IO resource flags + */ +struct resource *__request_region(struct resource *parent, + resource_size_t start, resource_size_t n, + const char *name, int flags) +{ + struct resource *res = alloc_resource(GFP_KERNEL); + int ret; + + if (!res) + return NULL; + + write_lock(&resource_lock); + ret = __request_region_locked(res, parent, start, n, name, flags); write_unlock(&resource_lock); - if (res && orig_parent == &iomem_resource) + if (ret) { + free_resource(res); + return NULL; + } + + if (parent == &iomem_resource) revoke_iomem(res); return res; @@ -1779,25 +1780,56 @@ static struct resource *__request_free_mem_region(struct device *dev, { resource_size_t end, addr; struct resource *res; + struct region_devres *dr = NULL; size = ALIGN(size, 1UL << PA_SECTION_SHIFT); end = min_t(unsigned long, base->end, (1UL << MAX_PHYSMEM_BITS) - 1); addr = end - size + 1UL; + res = alloc_resource(GFP_KERNEL); + if (!res) + return ERR_PTR(-ENOMEM); + + if (dev) { + dr = devres_alloc(devm_region_release, + sizeof(struct region_devres), GFP_KERNEL); + if (!dr) { + free_resource(res); + return ERR_PTR(-ENOMEM); + } + } + + write_lock(&resource_lock); for (; addr > size && addr >= base->start; addr -= size) { - if (region_intersects(addr, size, 0, IORES_DESC_NONE) != + if (__region_intersects(addr, size, 0, IORES_DESC_NONE) != REGION_DISJOINT) continue; - if (dev) - res = devm_request_mem_region(dev, addr, size, name); - else - res = request_mem_region(addr, size, name); - if (!res) - return ERR_PTR(-ENOMEM); + if (!__request_region_locked(res, &iomem_resource, addr, size, + name, 0)) + break; + + if (dev) { + dr->parent = &iomem_resource; + dr->start = addr; + dr->n = size; + devres_add(dev, dr); + } + res->desc = IORES_DESC_DEVICE_PRIVATE_MEMORY; + write_unlock(&resource_lock); + + /* + * A driver is claiming this region so revoke any mappings. + */ + revoke_iomem(res); return res; } + write_unlock(&resource_lock); + + free_resource(res); + if (dr) + devres_free(dr); return ERR_PTR(-ERANGE); } diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 9143163fa678..5226cc26a095 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -938,7 +938,7 @@ DEFINE_STATIC_KEY_FALSE(sched_uclamp_used); static inline unsigned int uclamp_bucket_id(unsigned int clamp_value) { - return clamp_value / UCLAMP_BUCKET_DELTA; + return min_t(unsigned int, clamp_value / UCLAMP_BUCKET_DELTA, UCLAMP_BUCKETS - 1); } static inline unsigned int uclamp_none(enum uclamp_id clamp_id) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 1d75af1ecfb4..20aa234ffe04 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -10878,16 +10878,22 @@ static void propagate_entity_cfs_rq(struct sched_entity *se) { struct cfs_rq *cfs_rq; + list_add_leaf_cfs_rq(cfs_rq_of(se)); + /* Start to propagate at parent */ se = se->parent; for_each_sched_entity(se) { cfs_rq = cfs_rq_of(se); - if (cfs_rq_throttled(cfs_rq)) - break; + if (!cfs_rq_throttled(cfs_rq)){ + update_load_avg(cfs_rq, se, UPDATE_TG); + list_add_leaf_cfs_rq(cfs_rq); + continue; + } - update_load_avg(cfs_rq, se, UPDATE_TG); + if (list_add_leaf_cfs_rq(cfs_rq)) + break; } } #else diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index db27b69fa92a..cc25a3cff41f 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -972,7 +972,7 @@ void psi_cgroup_free(struct cgroup *cgroup) */ void cgroup_move_task(struct task_struct *task, struct css_set *to) { - unsigned int task_flags = 0; + unsigned int task_flags; struct rq_flags rf; struct rq *rq; @@ -987,15 +987,31 @@ void cgroup_move_task(struct task_struct *task, struct css_set *to) rq = task_rq_lock(task, &rf); - if (task_on_rq_queued(task)) { - task_flags = TSK_RUNNING; - if (task_current(rq, task)) - task_flags |= TSK_ONCPU; - } else if (task->in_iowait) - task_flags = TSK_IOWAIT; - - if (task->in_memstall) - task_flags |= TSK_MEMSTALL; + /* + * We may race with schedule() dropping the rq lock between + * deactivating prev and switching to next. Because the psi + * updates from the deactivation are deferred to the switch + * callback to save cgroup tree updates, the task's scheduling + * state here is not coherent with its psi state: + * + * schedule() cgroup_move_task() + * rq_lock() + * deactivate_task() + * p->on_rq = 0 + * psi_dequeue() // defers TSK_RUNNING & TSK_IOWAIT updates + * pick_next_task() + * rq_unlock() + * rq_lock() + * psi_task_change() // old cgroup + * task->cgroups = to + * psi_task_change() // new cgroup + * rq_unlock() + * rq_lock() + * psi_sched_switch() // does deferred updates in new cgroup + * + * Don't rely on the scheduling state. Use psi_flags instead. + */ + task_flags = task->psi_flags; if (task_flags) psi_task_change(task, task_flags, 0); diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 1e63db4dbd9a..6ecd3f3a52b5 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -119,8 +119,11 @@ struct seccomp_kaddfd { int fd; unsigned int flags; - /* To only be set on reply */ - int ret; + union { + bool setfd; + /* To only be set on reply */ + int ret; + }; struct completion completion; struct list_head list; }; @@ -1069,7 +1072,11 @@ static void seccomp_handle_addfd(struct seccomp_kaddfd *addfd) * that it has been handled. */ list_del_init(&addfd->list); - addfd->ret = receive_fd_replace(addfd->fd, addfd->file, addfd->flags); + if (!addfd->setfd) + addfd->ret = receive_fd(addfd->file, addfd->flags); + else + addfd->ret = receive_fd_replace(addfd->fd, addfd->file, + addfd->flags); complete(&addfd->completion); } @@ -1583,8 +1590,8 @@ static long seccomp_notify_addfd(struct seccomp_filter *filter, return -EBADF; kaddfd.flags = addfd.newfd_flags; - kaddfd.fd = (addfd.flags & SECCOMP_ADDFD_FLAG_SETFD) ? - addfd.newfd : -1; + kaddfd.setfd = addfd.flags & SECCOMP_ADDFD_FLAG_SETFD; + kaddfd.fd = addfd.newfd; init_completion(&kaddfd.completion); ret = mutex_lock_interruptible(&filter->notify_lock); diff --git a/kernel/smp.c b/kernel/smp.c index e21074900006..52bf159ec400 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -211,7 +211,7 @@ static u64 cfd_seq_inc(unsigned int src, unsigned int dst, unsigned int type) } while (0) /* Record current CSD work for current CPU, NULL to erase. */ -static void __csd_lock_record(call_single_data_t *csd) +static void __csd_lock_record(struct __call_single_data *csd) { if (!csd) { smp_mb(); /* NULL cur_csd after unlock. */ @@ -226,13 +226,13 @@ static void __csd_lock_record(call_single_data_t *csd) /* Or before unlock, as the case may be. */ } -static __always_inline void csd_lock_record(call_single_data_t *csd) +static __always_inline void csd_lock_record(struct __call_single_data *csd) { if (static_branch_unlikely(&csdlock_debug_enabled)) __csd_lock_record(csd); } -static int csd_lock_wait_getcpu(call_single_data_t *csd) +static int csd_lock_wait_getcpu(struct __call_single_data *csd) { unsigned int csd_type; @@ -282,7 +282,7 @@ static const char *csd_lock_get_type(unsigned int type) return (type >= ARRAY_SIZE(seq_type)) ? "?" : seq_type[type]; } -static void csd_lock_print_extended(call_single_data_t *csd, int cpu) +static void csd_lock_print_extended(struct __call_single_data *csd, int cpu) { struct cfd_seq_local *seq = &per_cpu(cfd_seq_local, cpu); unsigned int srccpu = csd->node.src; @@ -321,7 +321,7 @@ static void csd_lock_print_extended(call_single_data_t *csd, int cpu) * the CSD_TYPE_SYNC/ASYNC types provide the destination CPU, * so waiting on other types gets much less information. */ -static bool csd_lock_wait_toolong(call_single_data_t *csd, u64 ts0, u64 *ts1, int *bug_id) +static bool csd_lock_wait_toolong(struct __call_single_data *csd, u64 ts0, u64 *ts1, int *bug_id) { int cpu = -1; int cpux; @@ -387,7 +387,7 @@ static bool csd_lock_wait_toolong(call_single_data_t *csd, u64 ts0, u64 *ts1, in * previous function call. For multi-cpu calls its even more interesting * as we'll have to ensure no other cpu is observing our csd. */ -static void __csd_lock_wait(call_single_data_t *csd) +static void __csd_lock_wait(struct __call_single_data *csd) { int bug_id = 0; u64 ts0, ts1; @@ -401,7 +401,7 @@ static void __csd_lock_wait(call_single_data_t *csd) smp_acquire__after_ctrl_dep(); } -static __always_inline void csd_lock_wait(call_single_data_t *csd) +static __always_inline void csd_lock_wait(struct __call_single_data *csd) { if (static_branch_unlikely(&csdlock_debug_enabled)) { __csd_lock_wait(csd); @@ -431,17 +431,17 @@ static void __smp_call_single_queue_debug(int cpu, struct llist_node *node) #else #define cfd_seq_store(var, src, dst, type) -static void csd_lock_record(call_single_data_t *csd) +static void csd_lock_record(struct __call_single_data *csd) { } -static __always_inline void csd_lock_wait(call_single_data_t *csd) +static __always_inline void csd_lock_wait(struct __call_single_data *csd) { smp_cond_load_acquire(&csd->node.u_flags, !(VAL & CSD_FLAG_LOCK)); } #endif -static __always_inline void csd_lock(call_single_data_t *csd) +static __always_inline void csd_lock(struct __call_single_data *csd) { csd_lock_wait(csd); csd->node.u_flags |= CSD_FLAG_LOCK; @@ -454,7 +454,7 @@ static __always_inline void csd_lock(call_single_data_t *csd) smp_wmb(); } -static __always_inline void csd_unlock(call_single_data_t *csd) +static __always_inline void csd_unlock(struct __call_single_data *csd) { WARN_ON(!(csd->node.u_flags & CSD_FLAG_LOCK)); @@ -501,7 +501,7 @@ void __smp_call_single_queue(int cpu, struct llist_node *node) * for execution on the given CPU. data must already have * ->func, ->info, and ->flags set. */ -static int generic_exec_single(int cpu, call_single_data_t *csd) +static int generic_exec_single(int cpu, struct __call_single_data *csd) { if (cpu == smp_processor_id()) { smp_call_func_t func = csd->func; @@ -784,7 +784,7 @@ EXPORT_SYMBOL(smp_call_function_single); * NOTE: Be careful, there is unfortunately no current debugging facility to * validate the correctness of this serialization. */ -int smp_call_function_single_async(int cpu, call_single_data_t *csd) +int smp_call_function_single_async(int cpu, struct __call_single_data *csd) { int err = 0; diff --git a/kernel/sys.c b/kernel/sys.c index 3d62c9599dc0..3a583a29815f 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1590,7 +1590,7 @@ int do_prlimit(struct task_struct *tsk, unsigned int resource, /* * RLIMIT_CPU handling. Arm the posix CPU timer if the limit is not - * infite. In case of RLIM_INFINITY the posix CPU timer code + * infinite. In case of RLIM_INFINITY the posix CPU timer code * ignores the rlimit. */ if (!retval && new_rlim && resource == RLIMIT_CPU && @@ -2029,7 +2029,7 @@ static int prctl_set_mm_map(int opt, const void __user *addr, unsigned long data } /* - * arg_lock protects concurent updates but we still need mmap_lock for + * arg_lock protects concurrent updates but we still need mmap_lock for * read to exclude races with sys_brk. */ mmap_read_lock(mm); @@ -2041,7 +2041,7 @@ static int prctl_set_mm_map(int opt, const void __user *addr, unsigned long data * output in procfs mostly, except * * - @start_brk/@brk which are used in do_brk_flags but kernel lookups - * for VMAs when updating these memvers so anything wrong written + * for VMAs when updating these members so anything wrong written * here cause kernel to swear at userspace program but won't lead * to any problem in kernel itself */ @@ -2143,7 +2143,7 @@ static int prctl_set_mm(int opt, unsigned long addr, error = -EINVAL; /* - * arg_lock protects concurent updates of arg boundaries, we need + * arg_lock protects concurrent updates of arg boundaries, we need * mmap_lock for a) concurrent sys_brk, b) finding VMA for addr * validation. */ @@ -2210,7 +2210,7 @@ static int prctl_set_mm(int opt, unsigned long addr, * If command line arguments and environment * are placed somewhere else on stack, we can * set them up here, ARG_START/END to setup - * command line argumets and ENV_START/END + * command line arguments and ENV_START/END * for environment. */ case PR_SET_MM_START_STACK: @@ -2258,8 +2258,8 @@ static int prctl_get_tid_address(struct task_struct *me, int __user * __user *ti static int propagate_has_child_subreaper(struct task_struct *p, void *data) { /* - * If task has has_child_subreaper - all its decendants - * already have these flag too and new decendants will + * If task has has_child_subreaper - all its descendants + * already have these flag too and new descendants will * inherit it on fork, skip them. * * If we've found child_reaper - skip descendants in diff --git a/kernel/sysctl.c b/kernel/sysctl.c index f91d327273c1..14edf84cc571 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -2830,7 +2830,7 @@ static struct ctl_table vm_table[] = { #ifdef CONFIG_COMPACTION { .procname = "compact_memory", - .data = &sysctl_compact_memory, + .data = NULL, .maxlen = sizeof(int), .mode = 0200, .proc_handler = sysctl_compaction_handler, diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c index 29a6ebeebc9e..b8a0d1d564fb 100644 --- a/kernel/trace/fgraph.c +++ b/kernel/trace/fgraph.c @@ -42,7 +42,7 @@ bool ftrace_graph_is_dead(void) } /** - * ftrace_graph_stop - set to permanently disable function graph tracincg + * ftrace_graph_stop - set to permanently disable function graph tracing * * In case of an error int function graph tracing, this is called * to try to keep function graph tracing from causing any more harm. @@ -117,7 +117,7 @@ int function_graph_enter(unsigned long ret, unsigned long func, /* * Skip graph tracing if the return location is served by direct trampoline, - * since call sequence and return addresses is unpredicatable anymore. + * since call sequence and return addresses are unpredictable anyway. * Ex: BPF trampoline may call original function and may skip frame * depending on type of BPF programs attached. */ diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 3ba52d4e1314..2e8a3fde7104 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1045,7 +1045,7 @@ struct ftrace_ops global_ops = { }; /* - * Used by the stack undwinder to know about dynamic ftrace trampolines. + * Used by the stack unwinder to know about dynamic ftrace trampolines. */ struct ftrace_ops *ftrace_ops_trampoline(unsigned long addr) { @@ -1090,7 +1090,7 @@ struct ftrace_page { struct ftrace_page *next; struct dyn_ftrace *records; int index; - int size; + int order; }; #define ENTRY_SIZE sizeof(struct dyn_ftrace) @@ -3000,7 +3000,7 @@ int ftrace_shutdown(struct ftrace_ops *ops, int command) * When the kernel is preemptive, tasks can be preempted * while on a ftrace trampoline. Just scheduling a task on * a CPU is not good enough to flush them. Calling - * synchornize_rcu_tasks() will wait for those tasks to + * synchronize_rcu_tasks() will wait for those tasks to * execute and either schedule voluntarily or enter user space. */ if (IS_ENABLED(CONFIG_PREEMPTION)) @@ -3156,15 +3156,9 @@ static int ftrace_allocate_records(struct ftrace_page *pg, int count) if (WARN_ON(!count)) return -EINVAL; + /* We want to fill as much as possible, with no empty pages */ pages = DIV_ROUND_UP(count, ENTRIES_PER_PAGE); - order = get_count_order(pages); - - /* - * We want to fill as much as possible. No more than a page - * may be empty. - */ - if (!is_power_of_2(pages)) - order--; + order = fls(pages) - 1; again: pg->records = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, order); @@ -3181,7 +3175,7 @@ static int ftrace_allocate_records(struct ftrace_page *pg, int count) ftrace_number_of_groups++; cnt = (PAGE_SIZE << order) / ENTRY_SIZE; - pg->size = cnt; + pg->order = order; if (cnt > count) cnt = count; @@ -3194,7 +3188,6 @@ ftrace_allocate_pages(unsigned long num_to_init) { struct ftrace_page *start_pg; struct ftrace_page *pg; - int order; int cnt; if (!num_to_init) @@ -3230,13 +3223,13 @@ ftrace_allocate_pages(unsigned long num_to_init) free_pages: pg = start_pg; while (pg) { - order = get_count_order(pg->size / ENTRIES_PER_PAGE); - if (order >= 0) - free_pages((unsigned long)pg->records, order); + if (pg->records) { + free_pages((unsigned long)pg->records, pg->order); + ftrace_number_of_pages -= 1 << pg->order; + } start_pg = pg->next; kfree(pg); pg = start_pg; - ftrace_number_of_pages -= 1 << order; ftrace_number_of_groups--; } pr_info("ftrace: FAILED to allocate memory for functions\n"); @@ -5407,7 +5400,7 @@ EXPORT_SYMBOL_GPL(modify_ftrace_direct); * @reset - non zero to reset all filters before applying this filter. * * Filters denote which functions should be enabled when tracing is enabled - * If @ip is NULL, it failes to update filter. + * If @ip is NULL, it fails to update filter. */ int ftrace_set_filter_ip(struct ftrace_ops *ops, unsigned long ip, int remove, int reset) @@ -5631,7 +5624,10 @@ int ftrace_regex_release(struct inode *inode, struct file *file) parser = &iter->parser; if (trace_parser_loaded(parser)) { - ftrace_match_records(iter->hash, parser->buffer, parser->idx); + int enable = !(iter->flags & FTRACE_ITER_NOTRACE); + + ftrace_process_regex(iter, parser->buffer, + parser->idx, enable); } trace_parser_put(parser); @@ -6221,6 +6217,7 @@ static int ftrace_process_locs(struct module *mod, p = start; pg = start_pg; while (p < end) { + unsigned long end_offset; addr = ftrace_call_adjust(*p++); /* * Some architecture linkers will pad between @@ -6231,7 +6228,8 @@ static int ftrace_process_locs(struct module *mod, if (!addr) continue; - if (pg->index == pg->size) { + end_offset = (pg->index+1) * sizeof(pg->records[0]); + if (end_offset > PAGE_SIZE << pg->order) { /* We should have allocated enough */ if (WARN_ON(!pg->next)) break; @@ -6359,7 +6357,7 @@ clear_mod_from_hash(struct ftrace_page *pg, struct ftrace_hash *hash) } } -/* Clear any records from hashs */ +/* Clear any records from hashes */ static void clear_mod_from_hashes(struct ftrace_page *pg) { struct trace_array *tr; @@ -6400,7 +6398,6 @@ void ftrace_release_mod(struct module *mod) struct ftrace_page **last_pg; struct ftrace_page *tmp_page = NULL; struct ftrace_page *pg; - int order; mutex_lock(&ftrace_lock); @@ -6451,12 +6448,12 @@ void ftrace_release_mod(struct module *mod) /* Needs to be called outside of ftrace_lock */ clear_mod_from_hashes(pg); - order = get_count_order(pg->size / ENTRIES_PER_PAGE); - if (order >= 0) - free_pages((unsigned long)pg->records, order); + if (pg->records) { + free_pages((unsigned long)pg->records, pg->order); + ftrace_number_of_pages -= 1 << pg->order; + } tmp_page = pg->next; kfree(pg); - ftrace_number_of_pages -= 1 << order; ftrace_number_of_groups--; } } @@ -6774,7 +6771,6 @@ void ftrace_free_mem(struct module *mod, void *start_ptr, void *end_ptr) struct ftrace_mod_map *mod_map = NULL; struct ftrace_init_func *func, *func_next; struct list_head clear_hash; - int order; INIT_LIST_HEAD(&clear_hash); @@ -6812,10 +6808,10 @@ void ftrace_free_mem(struct module *mod, void *start_ptr, void *end_ptr) ftrace_update_tot_cnt--; if (!pg->index) { *last_pg = pg->next; - order = get_count_order(pg->size / ENTRIES_PER_PAGE); - if (order >= 0) - free_pages((unsigned long)pg->records, order); - ftrace_number_of_pages -= 1 << order; + if (pg->records) { + free_pages((unsigned long)pg->records, pg->order); + ftrace_number_of_pages -= 1 << pg->order; + } ftrace_number_of_groups--; kfree(pg); pg = container_of(last_pg, struct ftrace_page, next); diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 68744c51517e..2c0ee6484990 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -287,17 +287,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_event_data); #define TS_MASK ((1ULL << TS_SHIFT) - 1) #define TS_DELTA_TEST (~TS_MASK) -/** - * ring_buffer_event_time_stamp - return the event's extended timestamp - * @event: the event to get the timestamp of - * - * Returns the extended timestamp associated with a data event. - * An extended time_stamp is a 64-bit timestamp represented - * internally in a special way that makes the best use of space - * contained within a ring buffer event. This function decodes - * it and maps it to a straight u64 value. - */ -u64 ring_buffer_event_time_stamp(struct ring_buffer_event *event) +static u64 rb_event_time_stamp(struct ring_buffer_event *event) { u64 ts; @@ -487,6 +477,8 @@ struct rb_time_struct { #endif typedef struct rb_time_struct rb_time_t; +#define MAX_NEST 5 + /* * head_page == tail_page && head == tail then buffer is empty. */ @@ -524,6 +516,7 @@ struct ring_buffer_per_cpu { unsigned long read_bytes; rb_time_t write_stamp; rb_time_t before_stamp; + u64 event_stamp[MAX_NEST]; u64 read_stamp; /* ring buffer pages to update, > 0 to add, < 0 to remove */ long nr_pages_to_update; @@ -749,6 +742,99 @@ static bool rb_time_cmpxchg(rb_time_t *t, u64 expect, u64 set) } #endif +/* + * Enable this to make sure that the event passed to + * ring_buffer_event_time_stamp() is not committed and also + * is on the buffer that it passed in. + */ +//#define RB_VERIFY_EVENT +#ifdef RB_VERIFY_EVENT +static struct list_head *rb_list_head(struct list_head *list); +static void verify_event(struct ring_buffer_per_cpu *cpu_buffer, + void *event) +{ + struct buffer_page *page = cpu_buffer->commit_page; + struct buffer_page *tail_page = READ_ONCE(cpu_buffer->tail_page); + struct list_head *next; + long commit, write; + unsigned long addr = (unsigned long)event; + bool done = false; + int stop = 0; + + /* Make sure the event exists and is not committed yet */ + do { + if (page == tail_page || WARN_ON_ONCE(stop++ > 100)) + done = true; + commit = local_read(&page->page->commit); + write = local_read(&page->write); + if (addr >= (unsigned long)&page->page->data[commit] && + addr < (unsigned long)&page->page->data[write]) + return; + + next = rb_list_head(page->list.next); + page = list_entry(next, struct buffer_page, list); + } while (!done); + WARN_ON_ONCE(1); +} +#else +static inline void verify_event(struct ring_buffer_per_cpu *cpu_buffer, + void *event) +{ +} +#endif + + +static inline u64 rb_time_stamp(struct trace_buffer *buffer); + +/** + * ring_buffer_event_time_stamp - return the event's current time stamp + * @buffer: The buffer that the event is on + * @event: the event to get the time stamp of + * + * Note, this must be called after @event is reserved, and before it is + * committed to the ring buffer. And must be called from the same + * context where the event was reserved (normal, softirq, irq, etc). + * + * Returns the time stamp associated with the current event. + * If the event has an extended time stamp, then that is used as + * the time stamp to return. + * In the highly unlikely case that the event was nested more than + * the max nesting, then the write_stamp of the buffer is returned, + * otherwise current time is returned, but that really neither of + * the last two cases should ever happen. + */ +u64 ring_buffer_event_time_stamp(struct trace_buffer *buffer, + struct ring_buffer_event *event) +{ + struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[smp_processor_id()]; + unsigned int nest; + u64 ts; + + /* If the event includes an absolute time, then just use that */ + if (event->type_len == RINGBUF_TYPE_TIME_STAMP) + return rb_event_time_stamp(event); + + nest = local_read(&cpu_buffer->committing); + verify_event(cpu_buffer, event); + if (WARN_ON_ONCE(!nest)) + goto fail; + + /* Read the current saved nesting level time stamp */ + if (likely(--nest < MAX_NEST)) + return cpu_buffer->event_stamp[nest]; + + /* Shouldn't happen, warn if it does */ + WARN_ONCE(1, "nest (%d) greater than max", nest); + + fail: + /* Can only fail on 32 bit */ + if (!rb_time_read(&cpu_buffer->write_stamp, &ts)) + /* Screw it, just read the current time */ + ts = rb_time_stamp(cpu_buffer->buffer); + + return ts; +} + /** * ring_buffer_nr_pages - get the number of buffer pages in the ring buffer * @buffer: The ring_buffer to get the number of pages from @@ -994,7 +1080,7 @@ static inline u64 rb_time_stamp(struct trace_buffer *buffer) return ts << DEBUG_SHIFT; } -u64 ring_buffer_time_stamp(struct trace_buffer *buffer, int cpu) +u64 ring_buffer_time_stamp(struct trace_buffer *buffer) { u64 time; @@ -2710,6 +2796,10 @@ rb_update_event(struct ring_buffer_per_cpu *cpu_buffer, { unsigned length = info->length; u64 delta = info->delta; + unsigned int nest = local_read(&cpu_buffer->committing) - 1; + + if (!WARN_ON_ONCE(nest >= MAX_NEST)) + cpu_buffer->event_stamp[nest] = info->ts; /* * If we need to add a timestamp, then we @@ -2766,7 +2856,7 @@ static u64 rb_time_delta(struct ring_buffer_event *event) return 0; case RINGBUF_TYPE_TIME_EXTEND: - return ring_buffer_event_time_stamp(event); + return rb_event_time_stamp(event); case RINGBUF_TYPE_TIME_STAMP: return 0; @@ -3064,7 +3154,7 @@ rb_wakeups(struct trace_buffer *buffer, struct ring_buffer_per_cpu *cpu_buffer) * is called before preempt_count() is updated, since the check will * be on the NORMAL bit, the TRANSITION bit will then be set. If an * NMI then comes in, it will set the NMI bit, but when the NMI code - * does the trace_recursive_unlock() it will clear the TRANSTION bit + * does the trace_recursive_unlock() it will clear the TRANSITION bit * and leave the NMI bit set. But this is fine, because the interrupt * code that set the TRANSITION bit will then clear the NMI bit when it * calls trace_recursive_unlock(). If another NMI comes in, it will @@ -3212,13 +3302,13 @@ static void dump_buffer_page(struct buffer_data_page *bpage, switch (event->type_len) { case RINGBUF_TYPE_TIME_EXTEND: - delta = ring_buffer_event_time_stamp(event); + delta = rb_event_time_stamp(event); ts += delta; pr_warn(" [%lld] delta:%lld TIME EXTEND\n", ts, delta); break; case RINGBUF_TYPE_TIME_STAMP: - delta = ring_buffer_event_time_stamp(event); + delta = rb_event_time_stamp(event); ts = delta; pr_warn(" [%lld] absolute:%lld TIME STAMP\n", ts, delta); break; @@ -3289,12 +3379,12 @@ static void check_buffer(struct ring_buffer_per_cpu *cpu_buffer, switch (event->type_len) { case RINGBUF_TYPE_TIME_EXTEND: - delta = ring_buffer_event_time_stamp(event); + delta = rb_event_time_stamp(event); ts += delta; break; case RINGBUF_TYPE_TIME_STAMP: - delta = ring_buffer_event_time_stamp(event); + delta = rb_event_time_stamp(event); ts = delta; break; @@ -3451,7 +3541,6 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, info->after, ts)) { /* Nothing came after this event between C and E */ info->delta = ts - info->after; - info->ts = ts; } else { /* * Interrupted between C and E: @@ -3463,6 +3552,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, */ info->delta = 0; } + info->ts = ts; info->add_timestamp &= ~RB_ADD_STAMP_FORCE; } @@ -4256,12 +4346,12 @@ rb_update_read_stamp(struct ring_buffer_per_cpu *cpu_buffer, return; case RINGBUF_TYPE_TIME_EXTEND: - delta = ring_buffer_event_time_stamp(event); + delta = rb_event_time_stamp(event); cpu_buffer->read_stamp += delta; return; case RINGBUF_TYPE_TIME_STAMP: - delta = ring_buffer_event_time_stamp(event); + delta = rb_event_time_stamp(event); cpu_buffer->read_stamp = delta; return; @@ -4286,12 +4376,12 @@ rb_update_iter_read_stamp(struct ring_buffer_iter *iter, return; case RINGBUF_TYPE_TIME_EXTEND: - delta = ring_buffer_event_time_stamp(event); + delta = rb_event_time_stamp(event); iter->read_stamp += delta; return; case RINGBUF_TYPE_TIME_STAMP: - delta = ring_buffer_event_time_stamp(event); + delta = rb_event_time_stamp(event); iter->read_stamp = delta; return; @@ -4544,7 +4634,7 @@ rb_buffer_peek(struct ring_buffer_per_cpu *cpu_buffer, u64 *ts, case RINGBUF_TYPE_TIME_STAMP: if (ts) { - *ts = ring_buffer_event_time_stamp(event); + *ts = rb_event_time_stamp(event); ring_buffer_normalize_time_stamp(cpu_buffer->buffer, cpu_buffer->cpu, ts); } @@ -4635,7 +4725,7 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts) case RINGBUF_TYPE_TIME_STAMP: if (ts) { - *ts = ring_buffer_event_time_stamp(event); + *ts = rb_event_time_stamp(event); ring_buffer_normalize_time_stamp(cpu_buffer->buffer, cpu_buffer->cpu, ts); } @@ -5021,6 +5111,8 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer) rb_time_set(&cpu_buffer->write_stamp, 0); rb_time_set(&cpu_buffer->before_stamp, 0); + memset(cpu_buffer->event_stamp, 0, sizeof(cpu_buffer->event_stamp)); + cpu_buffer->lost_events = 0; cpu_buffer->last_overrun = 0; diff --git a/kernel/trace/synth_event_gen_test.c b/kernel/trace/synth_event_gen_test.c index a4b4bbf8c3bf..0b15e975d2c2 100644 --- a/kernel/trace/synth_event_gen_test.c +++ b/kernel/trace/synth_event_gen_test.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 /* - * Test module for in-kernel sythetic event creation and generation. + * Test module for in-kernel synthetic event creation and generation. * * Copyright (C) 2019 Tom Zanussi <zanussi@kernel.org> */ diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 915fe8790f04..560e4c8d3825 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -514,7 +514,7 @@ void trace_free_pid_list(struct trace_pid_list *pid_list) * @filtered_pids: The list of pids to check * @search_pid: The PID to find in @filtered_pids * - * Returns true if @search_pid is fonud in @filtered_pids, and false otherwis. + * Returns true if @search_pid is found in @filtered_pids, and false otherwise. */ bool trace_find_filtered_pid(struct trace_pid_list *filtered_pids, pid_t search_pid) @@ -545,7 +545,7 @@ trace_ignore_this_task(struct trace_pid_list *filtered_pids, struct task_struct *task) { /* - * If filterd_no_pids is not empty, and the task's pid is listed + * If filtered_no_pids is not empty, and the task's pid is listed * in filtered_no_pids, then return true. * Otherwise, if filtered_pids is empty, that means we can * trace all tasks. If it has content, then only trace pids @@ -612,7 +612,7 @@ void *trace_pid_next(struct trace_pid_list *pid_list, void *v, loff_t *pos) (*pos)++; - /* pid already is +1 of the actual prevous bit */ + /* pid already is +1 of the actual previous bit */ pid = find_next_bit(pid_list->pids, pid_list->pid_max, pid); /* Return pid + 1 to allow zero to be represented */ @@ -771,7 +771,7 @@ static u64 buffer_ftrace_now(struct array_buffer *buf, int cpu) if (!buf->buffer) return trace_clock_local(); - ts = ring_buffer_time_stamp(buf->buffer, cpu); + ts = ring_buffer_time_stamp(buf->buffer); ring_buffer_normalize_time_stamp(buf->buffer, cpu, &ts); return ts; @@ -834,7 +834,7 @@ DEFINE_MUTEX(trace_types_lock); * The content of events may become garbage if we allow other process consumes * these events concurrently: * A) the page of the consumed events may become a normal page - * (not reader page) in ring buffer, and this page will be rewrited + * (not reader page) in ring buffer, and this page will be rewritten * by events producer. * B) The page of the consumed events may become a page for splice_read, * and this page will be returned to system. @@ -1520,7 +1520,7 @@ unsigned long nsecs_to_usecs(unsigned long nsecs) #undef C #define C(a, b) b -/* These must match the bit postions in trace_iterator_flags */ +/* These must match the bit positions in trace_iterator_flags */ static const char *trace_options[] = { TRACE_FLAGS NULL @@ -2390,14 +2390,13 @@ static void tracing_stop_tr(struct trace_array *tr) static int trace_save_cmdline(struct task_struct *tsk) { - unsigned pid, idx; + unsigned tpid, idx; /* treat recording of idle task as a success */ if (!tsk->pid) return 1; - if (unlikely(tsk->pid > PID_MAX_DEFAULT)) - return 0; + tpid = tsk->pid & (PID_MAX_DEFAULT - 1); /* * It's not the end of the world if we don't get @@ -2408,26 +2407,15 @@ static int trace_save_cmdline(struct task_struct *tsk) if (!arch_spin_trylock(&trace_cmdline_lock)) return 0; - idx = savedcmd->map_pid_to_cmdline[tsk->pid]; + idx = savedcmd->map_pid_to_cmdline[tpid]; if (idx == NO_CMDLINE_MAP) { idx = (savedcmd->cmdline_idx + 1) % savedcmd->cmdline_num; - /* - * Check whether the cmdline buffer at idx has a pid - * mapped. We are going to overwrite that entry so we - * need to clear the map_pid_to_cmdline. Otherwise we - * would read the new comm for the old pid. - */ - pid = savedcmd->map_cmdline_to_pid[idx]; - if (pid != NO_CMDLINE_MAP) - savedcmd->map_pid_to_cmdline[pid] = NO_CMDLINE_MAP; - - savedcmd->map_cmdline_to_pid[idx] = tsk->pid; - savedcmd->map_pid_to_cmdline[tsk->pid] = idx; - + savedcmd->map_pid_to_cmdline[tpid] = idx; savedcmd->cmdline_idx = idx; } + savedcmd->map_cmdline_to_pid[idx] = tsk->pid; set_cmdline(idx, tsk->comm); arch_spin_unlock(&trace_cmdline_lock); @@ -2438,6 +2426,7 @@ static int trace_save_cmdline(struct task_struct *tsk) static void __trace_find_cmdline(int pid, char comm[]) { unsigned map; + int tpid; if (!pid) { strcpy(comm, "<idle>"); @@ -2449,16 +2438,16 @@ static void __trace_find_cmdline(int pid, char comm[]) return; } - if (pid > PID_MAX_DEFAULT) { - strcpy(comm, "<...>"); - return; + tpid = pid & (PID_MAX_DEFAULT - 1); + map = savedcmd->map_pid_to_cmdline[tpid]; + if (map != NO_CMDLINE_MAP) { + tpid = savedcmd->map_cmdline_to_pid[map]; + if (tpid == pid) { + strlcpy(comm, get_saved_cmdlines(map), TASK_COMM_LEN); + return; + } } - - map = savedcmd->map_pid_to_cmdline[pid]; - if (map != NO_CMDLINE_MAP) - strlcpy(comm, get_saved_cmdlines(map), TASK_COMM_LEN); - else - strcpy(comm, "<...>"); + strcpy(comm, "<...>"); } void trace_find_cmdline(int pid, char comm[]) @@ -2737,12 +2726,13 @@ trace_event_buffer_lock_reserve(struct trace_buffer **current_rb, unsigned int trace_ctx) { struct ring_buffer_event *entry; + struct trace_array *tr = trace_file->tr; int val; - *current_rb = trace_file->tr->array_buffer.buffer; + *current_rb = tr->array_buffer.buffer; - if (!ring_buffer_time_stamp_abs(*current_rb) && (trace_file->flags & - (EVENT_FILE_FL_SOFT_DISABLED | EVENT_FILE_FL_FILTERED)) && + if (!tr->no_filter_buffering_ref && + (trace_file->flags & (EVENT_FILE_FL_SOFT_DISABLED | EVENT_FILE_FL_FILTERED)) && (entry = this_cpu_read(trace_buffered_event))) { /* Try to use the per cpu buffer first */ val = this_cpu_inc_return(trace_buffered_event_cnt); @@ -3116,6 +3106,40 @@ static void ftrace_trace_userstack(struct trace_array *tr, #endif /* CONFIG_STACKTRACE */ +static inline void +func_repeats_set_delta_ts(struct func_repeats_entry *entry, + unsigned long long delta) +{ + entry->bottom_delta_ts = delta & U32_MAX; + entry->top_delta_ts = (delta >> 32); +} + +void trace_last_func_repeats(struct trace_array *tr, + struct trace_func_repeats *last_info, + unsigned int trace_ctx) +{ + struct trace_buffer *buffer = tr->array_buffer.buffer; + struct func_repeats_entry *entry; + struct ring_buffer_event *event; + u64 delta; + + event = __trace_buffer_lock_reserve(buffer, TRACE_FUNC_REPEATS, + sizeof(*entry), trace_ctx); + if (!event) + return; + + delta = ring_buffer_event_time_stamp(buffer, event) - + last_info->ts_last_call; + + entry = ring_buffer_event_data(event); + entry->ip = last_info->ip; + entry->parent_ip = last_info->parent_ip; + entry->count = last_info->count; + func_repeats_set_delta_ts(entry, delta); + + __buffer_unlock_commit(buffer, event); +} + /* created for use with alloc_percpu */ struct trace_buffer_struct { int nesting; @@ -3368,7 +3392,7 @@ int trace_array_vprintk(struct trace_array *tr, * buffer (use trace_printk() for that), as writing into the top level * buffer should only have events that can be individually disabled. * trace_printk() is only used for debugging a kernel, and should not - * be ever encorporated in normal use. + * be ever incorporated in normal use. * * trace_array_printk() can be used, as it will not add noise to the * top level tracing buffer. @@ -3562,6 +3586,204 @@ static char *trace_iter_expand_format(struct trace_iterator *iter) return tmp; } +/* Returns true if the string is safe to dereference from an event */ +static bool trace_safe_str(struct trace_iterator *iter, const char *str) +{ + unsigned long addr = (unsigned long)str; + struct trace_event *trace_event; + struct trace_event_call *event; + + /* OK if part of the event data */ + if ((addr >= (unsigned long)iter->ent) && + (addr < (unsigned long)iter->ent + iter->ent_size)) + return true; + + /* OK if part of the temp seq buffer */ + if ((addr >= (unsigned long)iter->tmp_seq.buffer) && + (addr < (unsigned long)iter->tmp_seq.buffer + PAGE_SIZE)) + return true; + + /* Core rodata can not be freed */ + if (is_kernel_rodata(addr)) + return true; + + if (trace_is_tracepoint_string(str)) + return true; + + /* + * Now this could be a module event, referencing core module + * data, which is OK. + */ + if (!iter->ent) + return false; + + trace_event = ftrace_find_event(iter->ent->type); + if (!trace_event) + return false; + + event = container_of(trace_event, struct trace_event_call, event); + if (!event->mod) + return false; + + /* Would rather have rodata, but this will suffice */ + if (within_module_core(addr, event->mod)) + return true; + + return false; +} + +static const char *show_buffer(struct trace_seq *s) +{ + struct seq_buf *seq = &s->seq; + + seq_buf_terminate(seq); + + return seq->buffer; +} + +static DEFINE_STATIC_KEY_FALSE(trace_no_verify); + +static int test_can_verify_check(const char *fmt, ...) +{ + char buf[16]; + va_list ap; + int ret; + + /* + * The verifier is dependent on vsnprintf() modifies the va_list + * passed to it, where it is sent as a reference. Some architectures + * (like x86_32) passes it by value, which means that vsnprintf() + * does not modify the va_list passed to it, and the verifier + * would then need to be able to understand all the values that + * vsnprintf can use. If it is passed by value, then the verifier + * is disabled. + */ + va_start(ap, fmt); + vsnprintf(buf, 16, "%d", ap); + ret = va_arg(ap, int); + va_end(ap); + + return ret; +} + +static void test_can_verify(void) +{ + if (!test_can_verify_check("%d %d", 0, 1)) { + pr_info("trace event string verifier disabled\n"); + static_branch_inc(&trace_no_verify); + } +} + +/** + * trace_check_vprintf - Check dereferenced strings while writing to the seq buffer + * @iter: The iterator that holds the seq buffer and the event being printed + * @fmt: The format used to print the event + * @ap: The va_list holding the data to print from @fmt. + * + * This writes the data into the @iter->seq buffer using the data from + * @fmt and @ap. If the format has a %s, then the source of the string + * is examined to make sure it is safe to print, otherwise it will + * warn and print "[UNSAFE MEMORY]" in place of the dereferenced string + * pointer. + */ +void trace_check_vprintf(struct trace_iterator *iter, const char *fmt, + va_list ap) +{ + const char *p = fmt; + const char *str; + int i, j; + + if (WARN_ON_ONCE(!fmt)) + return; + + if (static_branch_unlikely(&trace_no_verify)) + goto print; + + /* Don't bother checking when doing a ftrace_dump() */ + if (iter->fmt == static_fmt_buf) + goto print; + + while (*p) { + j = 0; + + /* We only care about %s and variants */ + for (i = 0; p[i]; i++) { + if (i + 1 >= iter->fmt_size) { + /* + * If we can't expand the copy buffer, + * just print it. + */ + if (!trace_iter_expand_format(iter)) + goto print; + } + + if (p[i] == '\\' && p[i+1]) { + i++; + continue; + } + if (p[i] == '%') { + /* Need to test cases like %08.*s */ + for (j = 1; p[i+j]; j++) { + if (isdigit(p[i+j]) || + p[i+j] == '*' || + p[i+j] == '.') + continue; + break; + } + if (p[i+j] == 's') + break; + } + j = 0; + } + /* If no %s found then just print normally */ + if (!p[i]) + break; + + /* Copy up to the %s, and print that */ + strncpy(iter->fmt, p, i); + iter->fmt[i] = '\0'; + trace_seq_vprintf(&iter->seq, iter->fmt, ap); + + /* The ap now points to the string data of the %s */ + str = va_arg(ap, const char *); + + /* + * If you hit this warning, it is likely that the + * trace event in question used %s on a string that + * was saved at the time of the event, but may not be + * around when the trace is read. Use __string(), + * __assign_str() and __get_str() helpers in the TRACE_EVENT() + * instead. See samples/trace_events/trace-events-sample.h + * for reference. + */ + if (WARN_ONCE(!trace_safe_str(iter, str), + "fmt: '%s' current_buffer: '%s'", + fmt, show_buffer(&iter->seq))) { + int ret; + + /* Try to safely read the string */ + ret = strncpy_from_kernel_nofault(iter->fmt, str, + iter->fmt_size); + if (ret < 0) + trace_seq_printf(&iter->seq, "(0x%px)", str); + else + trace_seq_printf(&iter->seq, "(0x%px:%s)", + str, iter->fmt); + str = "[UNSAFE-MEMORY]"; + strcpy(iter->fmt, "%s"); + } else { + strncpy(iter->fmt, p + i, j + 1); + iter->fmt[j+1] = '\0'; + } + trace_seq_printf(&iter->seq, iter->fmt, str); + + p += i + j + 1; + } + print: + if (*p) + trace_seq_vprintf(&iter->seq, p, ap); +} + const char *trace_event_format(struct trace_iterator *iter, const char *fmt) { const char *p, *new_fmt; @@ -6768,7 +6990,7 @@ tracing_mark_write(struct file *filp, const char __user *ubuf, if (tr->trace_marker_file && !list_empty(&tr->trace_marker_file->triggers)) { /* do not add \n before testing triggers, but add \0 */ entry->buf[cnt] = '\0'; - tt = event_triggers_call(tr->trace_marker_file, entry, event); + tt = event_triggers_call(tr->trace_marker_file, buffer, entry, event); } if (entry->buf[cnt - 1] != '\n') { @@ -6976,31 +7198,34 @@ static int tracing_time_stamp_mode_open(struct inode *inode, struct file *file) return ret; } -int tracing_set_time_stamp_abs(struct trace_array *tr, bool abs) +u64 tracing_event_time_stamp(struct trace_buffer *buffer, struct ring_buffer_event *rbe) +{ + if (rbe == this_cpu_read(trace_buffered_event)) + return ring_buffer_time_stamp(buffer); + + return ring_buffer_event_time_stamp(buffer, rbe); +} + +/* + * Set or disable using the per CPU trace_buffer_event when possible. + */ +int tracing_set_filter_buffering(struct trace_array *tr, bool set) { int ret = 0; mutex_lock(&trace_types_lock); - if (abs && tr->time_stamp_abs_ref++) + if (set && tr->no_filter_buffering_ref++) goto out; - if (!abs) { - if (WARN_ON_ONCE(!tr->time_stamp_abs_ref)) { + if (!set) { + if (WARN_ON_ONCE(!tr->no_filter_buffering_ref)) { ret = -EINVAL; goto out; } - if (--tr->time_stamp_abs_ref) - goto out; + --tr->no_filter_buffering_ref; } - - ring_buffer_set_time_stamp_abs(tr->array_buffer.buffer, abs); - -#ifdef CONFIG_TRACER_MAX_TRACE - if (tr->max_buffer.buffer) - ring_buffer_set_time_stamp_abs(tr->max_buffer.buffer, abs); -#endif out: mutex_unlock(&trace_types_lock); @@ -7336,11 +7561,11 @@ static struct tracing_log_err *get_tracing_log_err(struct trace_array *tr) * @cmd: The tracing command that caused the error * @str: The string to position the caret at within @cmd * - * Finds the position of the first occurence of @str within @cmd. The + * Finds the position of the first occurrence of @str within @cmd. The * return value can be passed to tracing_log_err() for caret placement * within @cmd. * - * Returns the index within @cmd of the first occurence of @str or 0 + * Returns the index within @cmd of the first occurrence of @str or 0 * if @str was not found. */ unsigned int err_pos(char *cmd, const char *str) @@ -7890,7 +8115,7 @@ tracing_stats_read(struct file *filp, char __user *ubuf, trace_seq_printf(s, "oldest event ts: %5llu.%06lu\n", t, usec_rem); - t = ns2usecs(ring_buffer_time_stamp(trace_buf->buffer, cpu)); + t = ns2usecs(ring_buffer_time_stamp(trace_buf->buffer)); usec_rem = do_div(t, USEC_PER_SEC); trace_seq_printf(s, "now ts: %5llu.%06lu\n", t, usec_rem); } else { @@ -7899,7 +8124,7 @@ tracing_stats_read(struct file *filp, char __user *ubuf, ring_buffer_oldest_event_ts(trace_buf->buffer, cpu)); trace_seq_printf(s, "now ts: %llu\n", - ring_buffer_time_stamp(trace_buf->buffer, cpu)); + ring_buffer_time_stamp(trace_buf->buffer)); } cnt = ring_buffer_dropped_events_cpu(trace_buf->buffer, cpu); @@ -8906,6 +9131,7 @@ static int __remove_instance(struct trace_array *tr) ftrace_clear_pids(tr); ftrace_destroy_function_files(tr); tracefs_remove(tr->dir); + free_percpu(tr->last_func_repeats); free_trace_buffers(tr); for (i = 0; i < tr->nr_topts; i++) { @@ -9123,7 +9349,7 @@ int tracing_init_dentry(void) * As there may still be users that expect the tracing * files to exist in debugfs/tracing, we must automount * the tracefs file system there, so older tools still - * work with the newer kerenl. + * work with the newer kernel. */ tr->dir = debugfs_create_automount("tracing", NULL, trace_automount, NULL); @@ -9676,6 +9902,8 @@ __init static int tracer_alloc_buffers(void) register_snapshot_cmd(); + test_can_verify(); + return 0; out_free_savedcmd: diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index a6446c03cfbc..cd80d046c7a5 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -45,6 +45,7 @@ enum trace_type { TRACE_BPUTS, TRACE_HWLAT, TRACE_RAW_DATA, + TRACE_FUNC_REPEATS, __TRACE_LAST_TYPE, }; @@ -262,6 +263,17 @@ struct cond_snapshot { }; /* + * struct trace_func_repeats - used to keep track of the consecutive + * (on the same CPU) calls of a single function. + */ +struct trace_func_repeats { + unsigned long ip; + unsigned long parent_ip; + unsigned long count; + u64 ts_last_call; +}; + +/* * The trace array - an array of per-CPU trace arrays. This is the * highest level data structure that individual tracers deal with. * They have on/off state as well: @@ -352,11 +364,12 @@ struct trace_array { /* function tracing enabled */ int function_enabled; #endif - int time_stamp_abs_ref; + int no_filter_buffering_ref; struct list_head hist_vars; #ifdef CONFIG_TRACER_SNAPSHOT struct cond_snapshot *cond_snapshot; #endif + struct trace_func_repeats __percpu *last_func_repeats; }; enum { @@ -372,7 +385,8 @@ extern int tracing_check_open_get_tr(struct trace_array *tr); extern struct trace_array *trace_array_find(const char *instance); extern struct trace_array *trace_array_find_get(const char *instance); -extern int tracing_set_time_stamp_abs(struct trace_array *tr, bool abs); +extern u64 tracing_event_time_stamp(struct trace_buffer *buffer, struct ring_buffer_event *rbe); +extern int tracing_set_filter_buffering(struct trace_array *tr, bool set); extern int tracing_set_clock(struct trace_array *tr, const char *clockstr); extern bool trace_clock_in_ns(struct trace_array *tr); @@ -441,6 +455,8 @@ extern void __ftrace_bad_type(void); TRACE_GRAPH_ENT); \ IF_ASSIGN(var, ent, struct ftrace_graph_ret_entry, \ TRACE_GRAPH_RET); \ + IF_ASSIGN(var, ent, struct func_repeats_entry, \ + TRACE_FUNC_REPEATS); \ __ftrace_bad_type(); \ } while (0) @@ -581,7 +597,10 @@ struct trace_entry *trace_find_next_entry(struct trace_iterator *iter, void trace_buffer_unlock_commit_nostack(struct trace_buffer *buffer, struct ring_buffer_event *event); +bool trace_is_tracepoint_string(const char *str); const char *trace_event_format(struct trace_iterator *iter, const char *fmt); +void trace_check_vprintf(struct trace_iterator *iter, const char *fmt, + va_list ap); int trace_empty(struct trace_iterator *iter); @@ -676,6 +695,10 @@ static inline void __trace_stack(struct trace_array *tr, unsigned int trace_ctx, } #endif /* CONFIG_STACKTRACE */ +void trace_last_func_repeats(struct trace_array *tr, + struct trace_func_repeats *last_info, + unsigned int trace_ctx); + extern u64 ftrace_now(int cpu); extern void trace_find_cmdline(int pid, char comm[]); @@ -1329,7 +1352,7 @@ __event_trigger_test_discard(struct trace_event_file *file, unsigned long eflags = file->flags; if (eflags & EVENT_FILE_FL_TRIGGER_COND) - *tt = event_triggers_call(file, entry, event); + *tt = event_triggers_call(file, buffer, entry, event); if (test_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT, &file->flags) || (unlikely(file->flags & EVENT_FILE_FL_FILTERED) && @@ -1343,7 +1366,7 @@ __event_trigger_test_discard(struct trace_event_file *file, /** * event_trigger_unlock_commit - handle triggers and finish event commit - * @file: The file pointer assoctiated to the event + * @file: The file pointer associated with the event * @buffer: The ring buffer that the event is being written to * @event: The event meta data in the ring buffer * @entry: The event itself @@ -1370,7 +1393,7 @@ event_trigger_unlock_commit(struct trace_event_file *file, /** * event_trigger_unlock_commit_regs - handle triggers and finish event commit - * @file: The file pointer assoctiated to the event + * @file: The file pointer associated with the event * @buffer: The ring buffer that the event is being written to * @event: The event meta data in the ring buffer * @entry: The event itself @@ -1626,7 +1649,7 @@ extern int register_trigger_hist_enable_disable_cmds(void); */ struct event_trigger_ops { void (*func)(struct event_trigger_data *data, - void *rec, + struct trace_buffer *buffer, void *rec, struct ring_buffer_event *rbe); int (*init)(struct event_trigger_ops *ops, struct event_trigger_data *data); diff --git a/kernel/trace/trace_clock.c b/kernel/trace/trace_clock.c index aaf6793ededa..c1637f90c8a3 100644 --- a/kernel/trace/trace_clock.c +++ b/kernel/trace/trace_clock.c @@ -95,33 +95,49 @@ u64 notrace trace_clock_global(void) { unsigned long flags; int this_cpu; - u64 now; + u64 now, prev_time; raw_local_irq_save(flags); this_cpu = raw_smp_processor_id(); - now = sched_clock_cpu(this_cpu); + /* - * If in an NMI context then dont risk lockups and return the - * cpu_clock() time: + * The global clock "guarantees" that the events are ordered + * between CPUs. But if two events on two different CPUS call + * trace_clock_global at roughly the same time, it really does + * not matter which one gets the earlier time. Just make sure + * that the same CPU will always show a monotonic clock. + * + * Use a read memory barrier to get the latest written + * time that was recorded. */ - if (unlikely(in_nmi())) - goto out; + smp_rmb(); + prev_time = READ_ONCE(trace_clock_struct.prev_time); + now = sched_clock_cpu(this_cpu); - arch_spin_lock(&trace_clock_struct.lock); + /* Make sure that now is always greater than prev_time */ + if ((s64)(now - prev_time) < 0) + now = prev_time + 1; /* - * TODO: if this happens often then maybe we should reset - * my_scd->clock to prev_time+1, to make sure - * we start ticking with the local clock from now on? + * If in an NMI context then dont risk lockups and simply return + * the current time. */ - if ((s64)(now - trace_clock_struct.prev_time) < 0) - now = trace_clock_struct.prev_time + 1; + if (unlikely(in_nmi())) + goto out; - trace_clock_struct.prev_time = now; + /* Tracing can cause strange recursion, always use a try lock */ + if (arch_spin_trylock(&trace_clock_struct.lock)) { + /* Reread prev_time in case it was already updated */ + prev_time = READ_ONCE(trace_clock_struct.prev_time); + if ((s64)(now - prev_time) < 0) + now = prev_time + 1; - arch_spin_unlock(&trace_clock_struct.lock); + trace_clock_struct.prev_time = now; + /* The unlock acts as the wmb for the above rmb */ + arch_spin_unlock(&trace_clock_struct.lock); + } out: raw_local_irq_restore(flags); diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h index 4547ac59da61..251c819cf0c5 100644 --- a/kernel/trace/trace_entries.h +++ b/kernel/trace/trace_entries.h @@ -338,3 +338,25 @@ FTRACE_ENTRY(hwlat, hwlat_entry, __entry->nmi_total_ts, __entry->nmi_count) ); + +#define FUNC_REPEATS_GET_DELTA_TS(entry) \ + (((u64)(entry)->top_delta_ts << 32) | (entry)->bottom_delta_ts) \ + +FTRACE_ENTRY(func_repeats, func_repeats_entry, + + TRACE_FUNC_REPEATS, + + F_STRUCT( + __field( unsigned long, ip ) + __field( unsigned long, parent_ip ) + __field( u16 , count ) + __field( u16 , top_delta_ts ) + __field( u32 , bottom_delta_ts ) + ), + + F_printk(" %ps <-%ps\t(repeats:%u delta: -%llu)", + (void *)__entry->ip, + (void *)__entry->parent_ip, + __entry->count, + FUNC_REPEATS_GET_DELTA_TS(__entry)) +); diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index 288ad2c274fb..03be4435d103 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c @@ -16,7 +16,7 @@ static char __percpu *perf_trace_buf[PERF_NR_CONTEXTS]; /* * Force it to be aligned to unsigned long to avoid misaligned accesses - * suprises + * surprises */ typedef typeof(unsigned long [PERF_MAX_TRACE_SIZE / sizeof(unsigned long)]) perf_trace_t; diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index a3563afd412d..80e96989770e 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -217,6 +217,214 @@ int trace_event_get_offsets(struct trace_event_call *call) return tail->offset + tail->size; } +/* + * Check if the referenced field is an array and return true, + * as arrays are OK to dereference. + */ +static bool test_field(const char *fmt, struct trace_event_call *call) +{ + struct trace_event_fields *field = call->class->fields_array; + const char *array_descriptor; + const char *p = fmt; + int len; + + if (!(len = str_has_prefix(fmt, "REC->"))) + return false; + fmt += len; + for (p = fmt; *p; p++) { + if (!isalnum(*p) && *p != '_') + break; + } + len = p - fmt; + + for (; field->type; field++) { + if (strncmp(field->name, fmt, len) || + field->name[len]) + continue; + array_descriptor = strchr(field->type, '['); + /* This is an array and is OK to dereference. */ + return array_descriptor != NULL; + } + return false; +} + +/* + * Examine the print fmt of the event looking for unsafe dereference + * pointers using %p* that could be recorded in the trace event and + * much later referenced after the pointer was freed. Dereferencing + * pointers are OK, if it is dereferenced into the event itself. + */ +static void test_event_printk(struct trace_event_call *call) +{ + u64 dereference_flags = 0; + bool first = true; + const char *fmt, *c, *r, *a; + int parens = 0; + char in_quote = 0; + int start_arg = 0; + int arg = 0; + int i; + + fmt = call->print_fmt; + + if (!fmt) + return; + + for (i = 0; fmt[i]; i++) { + switch (fmt[i]) { + case '\\': + i++; + if (!fmt[i]) + return; + continue; + case '"': + case '\'': + /* + * The print fmt starts with a string that + * is processed first to find %p* usage, + * then after the first string, the print fmt + * contains arguments that are used to check + * if the dereferenced %p* usage is safe. + */ + if (first) { + if (fmt[i] == '\'') + continue; + if (in_quote) { + arg = 0; + first = false; + /* + * If there was no %p* uses + * the fmt is OK. + */ + if (!dereference_flags) + return; + } + } + if (in_quote) { + if (in_quote == fmt[i]) + in_quote = 0; + } else { + in_quote = fmt[i]; + } + continue; + case '%': + if (!first || !in_quote) + continue; + i++; + if (!fmt[i]) + return; + switch (fmt[i]) { + case '%': + continue; + case 'p': + /* Find dereferencing fields */ + switch (fmt[i + 1]) { + case 'B': case 'R': case 'r': + case 'b': case 'M': case 'm': + case 'I': case 'i': case 'E': + case 'U': case 'V': case 'N': + case 'a': case 'd': case 'D': + case 'g': case 't': case 'C': + case 'O': case 'f': + if (WARN_ONCE(arg == 63, + "Too many args for event: %s", + trace_event_name(call))) + return; + dereference_flags |= 1ULL << arg; + } + break; + default: + { + bool star = false; + int j; + + /* Increment arg if %*s exists. */ + for (j = 0; fmt[i + j]; j++) { + if (isdigit(fmt[i + j]) || + fmt[i + j] == '.') + continue; + if (fmt[i + j] == '*') { + star = true; + continue; + } + if ((fmt[i + j] == 's') && star) + arg++; + break; + } + break; + } /* default */ + + } /* switch */ + arg++; + continue; + case '(': + if (in_quote) + continue; + parens++; + continue; + case ')': + if (in_quote) + continue; + parens--; + if (WARN_ONCE(parens < 0, + "Paren mismatch for event: %s\narg='%s'\n%*s", + trace_event_name(call), + fmt + start_arg, + (i - start_arg) + 5, "^")) + return; + continue; + case ',': + if (in_quote || parens) + continue; + i++; + while (isspace(fmt[i])) + i++; + start_arg = i; + if (!(dereference_flags & (1ULL << arg))) + goto next_arg; + + /* Find the REC-> in the argument */ + c = strchr(fmt + i, ','); + r = strstr(fmt + i, "REC->"); + if (r && (!c || r < c)) { + /* + * Addresses of events on the buffer, + * or an array on the buffer is + * OK to dereference. + * There's ways to fool this, but + * this is to catch common mistakes, + * not malicious code. + */ + a = strchr(fmt + i, '&'); + if ((a && (a < r)) || test_field(r, call)) + dereference_flags &= ~(1ULL << arg); + } + next_arg: + i--; + arg++; + } + } + + /* + * If you triggered the below warning, the trace event reported + * uses an unsafe dereference pointer %p*. As the data stored + * at the trace event time may no longer exist when the trace + * event is printed, dereferencing to the original source is + * unsafe. The source of the dereference must be copied into the + * event itself, and the dereference must access the copy instead. + */ + if (WARN_ON_ONCE(dereference_flags)) { + arg = 1; + while (!(dereference_flags & 1)) { + dereference_flags >>= 1; + arg++; + } + pr_warn("event %s has unsafe dereference of argument %d\n", + trace_event_name(call), arg); + pr_warn("print_fmt: %s\n", fmt); + } +} + int trace_event_raw_init(struct trace_event_call *call) { int id; @@ -225,6 +433,8 @@ int trace_event_raw_init(struct trace_event_call *call) if (!id) return -ENODEV; + test_event_printk(call); + return 0; } EXPORT_SYMBOL_GPL(trace_event_raw_init); @@ -2436,7 +2646,7 @@ void trace_event_eval_update(struct trace_eval_map **map, int len) } /* - * Since calls are grouped by systems, the likelyhood that the + * Since calls are grouped by systems, the likelihood that the * next call in the iteration belongs to the same system as the * previous call is high. As an optimization, we skip searching * for a map[] that matches the call's system if the last call @@ -2496,7 +2706,7 @@ __trace_add_new_event(struct trace_event_call *call, struct trace_array *tr) } /* - * Just create a decriptor for early init. A descriptor is required + * Just create a descriptor for early init. A descriptor is required * for enabling events at boot. We want to enable events before * the filesystem is initialized. */ diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index e91259f6a722..c9124038b140 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -256,7 +256,7 @@ enum { * is "&&" we don't call update_preds(). Instead continue to "c". As the * next token after "c" is not "&&" but the end of input, we first process the * "&&" by calling update_preds() for the "&&" then we process the "||" by - * callin updates_preds() with the values for processing "||". + * calling updates_preds() with the values for processing "||". * * What does that mean? What update_preds() does is to first save the "target" * of the program entry indexed by the current program entry's "target" @@ -296,7 +296,7 @@ enum { * and "FALSE" the program entry after that, we are now done with the first * pass. * - * Making the above "a || b && c" have a progam of: + * Making the above "a || b && c" have a program of: * prog[0] = { "a", 1, 2 } * prog[1] = { "b", 0, 2 } * prog[2] = { "c", 0, 3 } @@ -390,7 +390,7 @@ enum { * F: return FALSE * * As "r = a; if (!r) goto n5;" is obviously the same as - * "if (!a) goto n5;" without doing anything we can interperate the + * "if (!a) goto n5;" without doing anything we can interpret the * program as: * n1: if (!a) goto n5; * n2: if (!b) goto n5; @@ -1693,6 +1693,7 @@ static void create_filter_finish(struct filter_parse_error *pe) /** * create_filter - create a filter for a trace_event_call + * @tr: the trace array associated with these events * @call: trace_event_call to create a filter for * @filter_str: filter string * @set_str: remember @filter_str and enable detailed error in filter @@ -1741,8 +1742,8 @@ int create_event_filter(struct trace_array *tr, } /** - * create_system_filter - create a filter for an event_subsystem - * @system: event_subsystem to create a filter for + * create_system_filter - create a filter for an event subsystem + * @dir: the descriptor for the subsystem directory * @filter_str: filter string * @filterp: out param for created filter (always updated on return) * @@ -1750,7 +1751,6 @@ int create_event_filter(struct trace_array *tr, * and always remembers @filter_str. */ static int create_system_filter(struct trace_subsystem_dir *dir, - struct trace_array *tr, char *filter_str, struct event_filter **filterp) { struct filter_parse_error *pe = NULL; @@ -1758,13 +1758,13 @@ static int create_system_filter(struct trace_subsystem_dir *dir, err = create_filter_start(filter_str, true, &pe, filterp); if (!err) { - err = process_system_preds(dir, tr, pe, filter_str); + err = process_system_preds(dir, dir->tr, pe, filter_str); if (!err) { /* System filters just show a default message */ kfree((*filterp)->filter_string); (*filterp)->filter_string = NULL; } else { - append_filter_err(tr, pe, *filterp); + append_filter_err(dir->tr, pe, *filterp); } } create_filter_finish(pe); @@ -1852,7 +1852,7 @@ int apply_subsystem_event_filter(struct trace_subsystem_dir *dir, goto out_unlock; } - err = create_system_filter(dir, tr, filter_string, &filter); + err = create_system_filter(dir, filter_string, &filter); if (filter) { /* * No event actually uses the system filter diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 39ebe1826fc3..c1abd63f1d6c 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -81,6 +81,7 @@ struct hist_field; typedef u64 (*hist_field_fn_t) (struct hist_field *field, struct tracing_map_elt *elt, + struct trace_buffer *buffer, struct ring_buffer_event *rbe, void *event); @@ -153,6 +154,7 @@ struct hist_field { static u64 hist_field_none(struct hist_field *field, struct tracing_map_elt *elt, + struct trace_buffer *buffer, struct ring_buffer_event *rbe, void *event) { @@ -161,6 +163,7 @@ static u64 hist_field_none(struct hist_field *field, static u64 hist_field_counter(struct hist_field *field, struct tracing_map_elt *elt, + struct trace_buffer *buffer, struct ring_buffer_event *rbe, void *event) { @@ -169,6 +172,7 @@ static u64 hist_field_counter(struct hist_field *field, static u64 hist_field_string(struct hist_field *hist_field, struct tracing_map_elt *elt, + struct trace_buffer *buffer, struct ring_buffer_event *rbe, void *event) { @@ -179,6 +183,7 @@ static u64 hist_field_string(struct hist_field *hist_field, static u64 hist_field_dynstring(struct hist_field *hist_field, struct tracing_map_elt *elt, + struct trace_buffer *buffer, struct ring_buffer_event *rbe, void *event) { @@ -191,6 +196,7 @@ static u64 hist_field_dynstring(struct hist_field *hist_field, static u64 hist_field_pstring(struct hist_field *hist_field, struct tracing_map_elt *elt, + struct trace_buffer *buffer, struct ring_buffer_event *rbe, void *event) { @@ -201,52 +207,56 @@ static u64 hist_field_pstring(struct hist_field *hist_field, static u64 hist_field_log2(struct hist_field *hist_field, struct tracing_map_elt *elt, + struct trace_buffer *buffer, struct ring_buffer_event *rbe, void *event) { struct hist_field *operand = hist_field->operands[0]; - u64 val = operand->fn(operand, elt, rbe, event); + u64 val = operand->fn(operand, elt, buffer, rbe, event); return (u64) ilog2(roundup_pow_of_two(val)); } static u64 hist_field_plus(struct hist_field *hist_field, struct tracing_map_elt *elt, + struct trace_buffer *buffer, struct ring_buffer_event *rbe, void *event) { struct hist_field *operand1 = hist_field->operands[0]; struct hist_field *operand2 = hist_field->operands[1]; - u64 val1 = operand1->fn(operand1, elt, rbe, event); - u64 val2 = operand2->fn(operand2, elt, rbe, event); + u64 val1 = operand1->fn(operand1, elt, buffer, rbe, event); + u64 val2 = operand2->fn(operand2, elt, buffer, rbe, event); return val1 + val2; } static u64 hist_field_minus(struct hist_field *hist_field, struct tracing_map_elt *elt, + struct trace_buffer *buffer, struct ring_buffer_event *rbe, void *event) { struct hist_field *operand1 = hist_field->operands[0]; struct hist_field *operand2 = hist_field->operands[1]; - u64 val1 = operand1->fn(operand1, elt, rbe, event); - u64 val2 = operand2->fn(operand2, elt, rbe, event); + u64 val1 = operand1->fn(operand1, elt, buffer, rbe, event); + u64 val2 = operand2->fn(operand2, elt, buffer, rbe, event); return val1 - val2; } static u64 hist_field_unary_minus(struct hist_field *hist_field, struct tracing_map_elt *elt, + struct trace_buffer *buffer, struct ring_buffer_event *rbe, void *event) { struct hist_field *operand = hist_field->operands[0]; - s64 sval = (s64)operand->fn(operand, elt, rbe, event); + s64 sval = (s64)operand->fn(operand, elt, buffer, rbe, event); u64 val = (u64)-sval; return val; @@ -255,6 +265,7 @@ static u64 hist_field_unary_minus(struct hist_field *hist_field, #define DEFINE_HIST_FIELD_FN(type) \ static u64 hist_field_##type(struct hist_field *hist_field, \ struct tracing_map_elt *elt, \ + struct trace_buffer *buffer, \ struct ring_buffer_event *rbe, \ void *event) \ { \ @@ -380,7 +391,8 @@ struct hist_trigger_data { struct action_data; typedef void (*action_fn_t) (struct hist_trigger_data *hist_data, - struct tracing_map_elt *elt, void *rec, + struct tracing_map_elt *elt, + struct trace_buffer *buffer, void *rec, struct ring_buffer_event *rbe, void *key, struct action_data *data, u64 *var_ref_vals); @@ -608,7 +620,8 @@ static inline void trace_synth(struct synth_event *event, u64 *var_ref_vals, } static void action_trace(struct hist_trigger_data *hist_data, - struct tracing_map_elt *elt, void *rec, + struct tracing_map_elt *elt, + struct trace_buffer *buffer, void *rec, struct ring_buffer_event *rbe, void *key, struct action_data *data, u64 *var_ref_vals) { @@ -624,13 +637,14 @@ struct hist_var_data { static u64 hist_field_timestamp(struct hist_field *hist_field, struct tracing_map_elt *elt, + struct trace_buffer *buffer, struct ring_buffer_event *rbe, void *event) { struct hist_trigger_data *hist_data = hist_field->hist_data; struct trace_array *tr = hist_data->event_file->tr; - u64 ts = ring_buffer_event_time_stamp(rbe); + u64 ts = ring_buffer_event_time_stamp(buffer, rbe); if (hist_data->attrs->ts_in_usecs && trace_clock_in_ns(tr)) ts = ns2usecs(ts); @@ -640,6 +654,7 @@ static u64 hist_field_timestamp(struct hist_field *hist_field, static u64 hist_field_cpu(struct hist_field *hist_field, struct tracing_map_elt *elt, + struct trace_buffer *buffer, struct ring_buffer_event *rbe, void *event) { @@ -1020,6 +1035,7 @@ static struct hist_field *find_event_var(struct hist_trigger_data *hist_data, static u64 hist_field_var_ref(struct hist_field *hist_field, struct tracing_map_elt *elt, + struct trace_buffer *buffer, struct ring_buffer_event *rbe, void *event) { @@ -2561,6 +2577,7 @@ find_target_event_var(struct hist_trigger_data *hist_data, } static inline void __update_field_vars(struct tracing_map_elt *elt, + struct trace_buffer *buffer, struct ring_buffer_event *rbe, void *rec, struct field_var **field_vars, @@ -2576,7 +2593,7 @@ static inline void __update_field_vars(struct tracing_map_elt *elt, struct hist_field *var = field_var->var; struct hist_field *val = field_var->val; - var_val = val->fn(val, elt, rbe, rec); + var_val = val->fn(val, elt, buffer, rbe, rec); var_idx = var->var.idx; if (val->flags & HIST_FIELD_FL_STRING) { @@ -2592,19 +2609,21 @@ static inline void __update_field_vars(struct tracing_map_elt *elt, static void update_field_vars(struct hist_trigger_data *hist_data, struct tracing_map_elt *elt, + struct trace_buffer *buffer, struct ring_buffer_event *rbe, void *rec) { - __update_field_vars(elt, rbe, rec, hist_data->field_vars, + __update_field_vars(elt, buffer, rbe, rec, hist_data->field_vars, hist_data->n_field_vars, 0); } static void save_track_data_vars(struct hist_trigger_data *hist_data, - struct tracing_map_elt *elt, void *rec, + struct tracing_map_elt *elt, + struct trace_buffer *buffer, void *rec, struct ring_buffer_event *rbe, void *key, struct action_data *data, u64 *var_ref_vals) { - __update_field_vars(elt, rbe, rec, hist_data->save_vars, + __update_field_vars(elt, buffer, rbe, rec, hist_data->save_vars, hist_data->n_save_vars, hist_data->n_field_var_str); } @@ -2780,12 +2799,14 @@ static void save_track_val(struct hist_trigger_data *hist_data, } static void save_track_data(struct hist_trigger_data *hist_data, - struct tracing_map_elt *elt, void *rec, + struct tracing_map_elt *elt, + struct trace_buffer *buffer, void *rec, struct ring_buffer_event *rbe, void *key, struct action_data *data, u64 *var_ref_vals) { if (data->track_data.save_data) - data->track_data.save_data(hist_data, elt, rec, rbe, key, data, var_ref_vals); + data->track_data.save_data(hist_data, elt, buffer, rec, rbe, + key, data, var_ref_vals); } static bool check_track_val(struct tracing_map_elt *elt, @@ -2836,7 +2857,8 @@ static bool cond_snapshot_update(struct trace_array *tr, void *cond_data) } static void save_track_data_snapshot(struct hist_trigger_data *hist_data, - struct tracing_map_elt *elt, void *rec, + struct tracing_map_elt *elt, + struct trace_buffer *buffer, void *rec, struct ring_buffer_event *rbe, void *key, struct action_data *data, u64 *var_ref_vals) @@ -2905,7 +2927,8 @@ static bool cond_snapshot_update(struct trace_array *tr, void *cond_data) return false; } static void save_track_data_snapshot(struct hist_trigger_data *hist_data, - struct tracing_map_elt *elt, void *rec, + struct tracing_map_elt *elt, + struct trace_buffer *buffer, void *rec, struct ring_buffer_event *rbe, void *key, struct action_data *data, u64 *var_ref_vals) {} @@ -2947,7 +2970,8 @@ static void track_data_print(struct seq_file *m, } static void ontrack_action(struct hist_trigger_data *hist_data, - struct tracing_map_elt *elt, void *rec, + struct tracing_map_elt *elt, + struct trace_buffer *buffer, void *rec, struct ring_buffer_event *rbe, void *key, struct action_data *data, u64 *var_ref_vals) { @@ -2955,7 +2979,8 @@ static void ontrack_action(struct hist_trigger_data *hist_data, if (check_track_val(elt, data, var_val)) { save_track_val(hist_data, elt, data, var_val); - save_track_data(hist_data, elt, rec, rbe, key, data, var_ref_vals); + save_track_data(hist_data, elt, buffer, rec, rbe, + key, data, var_ref_vals); } } @@ -4400,7 +4425,8 @@ create_hist_data(unsigned int map_bits, } static void hist_trigger_elt_update(struct hist_trigger_data *hist_data, - struct tracing_map_elt *elt, void *rec, + struct tracing_map_elt *elt, + struct trace_buffer *buffer, void *rec, struct ring_buffer_event *rbe, u64 *var_ref_vals) { @@ -4414,7 +4440,7 @@ static void hist_trigger_elt_update(struct hist_trigger_data *hist_data, for_each_hist_val_field(i, hist_data) { hist_field = hist_data->fields[i]; - hist_val = hist_field->fn(hist_field, elt, rbe, rec); + hist_val = hist_field->fn(hist_field, elt, buffer, rbe, rec); if (hist_field->flags & HIST_FIELD_FL_VAR) { var_idx = hist_field->var.idx; @@ -4442,13 +4468,13 @@ static void hist_trigger_elt_update(struct hist_trigger_data *hist_data, for_each_hist_key_field(i, hist_data) { hist_field = hist_data->fields[i]; if (hist_field->flags & HIST_FIELD_FL_VAR) { - hist_val = hist_field->fn(hist_field, elt, rbe, rec); + hist_val = hist_field->fn(hist_field, elt, buffer, rbe, rec); var_idx = hist_field->var.idx; tracing_map_set_var(elt, var_idx, hist_val); } } - update_field_vars(hist_data, elt, rbe, rec); + update_field_vars(hist_data, elt, buffer, rbe, rec); } static inline void add_to_key(char *compound_key, void *key, @@ -4478,7 +4504,8 @@ static inline void add_to_key(char *compound_key, void *key, static void hist_trigger_actions(struct hist_trigger_data *hist_data, - struct tracing_map_elt *elt, void *rec, + struct tracing_map_elt *elt, + struct trace_buffer *buffer, void *rec, struct ring_buffer_event *rbe, void *key, u64 *var_ref_vals) { @@ -4487,11 +4514,12 @@ hist_trigger_actions(struct hist_trigger_data *hist_data, for (i = 0; i < hist_data->n_actions; i++) { data = hist_data->actions[i]; - data->fn(hist_data, elt, rec, rbe, key, data, var_ref_vals); + data->fn(hist_data, elt, buffer, rec, rbe, key, data, var_ref_vals); } } -static void event_hist_trigger(struct event_trigger_data *data, void *rec, +static void event_hist_trigger(struct event_trigger_data *data, + struct trace_buffer *buffer, void *rec, struct ring_buffer_event *rbe) { struct hist_trigger_data *hist_data = data->private_data; @@ -4516,7 +4544,7 @@ static void event_hist_trigger(struct event_trigger_data *data, void *rec, HIST_STACKTRACE_SKIP); key = entries; } else { - field_contents = key_field->fn(key_field, elt, rbe, rec); + field_contents = key_field->fn(key_field, elt, buffer, rbe, rec); if (key_field->flags & HIST_FIELD_FL_STRING) { key = (void *)(unsigned long)field_contents; use_compound_key = true; @@ -4539,10 +4567,10 @@ static void event_hist_trigger(struct event_trigger_data *data, void *rec, if (!elt) return; - hist_trigger_elt_update(hist_data, elt, rec, rbe, var_ref_vals); + hist_trigger_elt_update(hist_data, elt, buffer, rec, rbe, var_ref_vals); if (resolve_var_refs(hist_data, key, var_ref_vals, true)) - hist_trigger_actions(hist_data, elt, rec, rbe, key, var_ref_vals); + hist_trigger_actions(hist_data, elt, buffer, rec, rbe, key, var_ref_vals); } static void hist_trigger_stacktrace_print(struct seq_file *m, @@ -5456,7 +5484,7 @@ static int hist_register_trigger(char *glob, struct event_trigger_ops *ops, goto out; } - tracing_set_time_stamp_abs(file->tr, true); + tracing_set_filter_buffering(file->tr, true); } if (named_data) @@ -5564,7 +5592,7 @@ static void hist_unregister_trigger(char *glob, struct event_trigger_ops *ops, if (hist_data->enable_timestamps) { if (!hist_data->remove || unregistered) - tracing_set_time_stamp_abs(file->tr, false); + tracing_set_filter_buffering(file->tr, false); } } @@ -5611,7 +5639,7 @@ static void hist_unreg_all(struct trace_event_file *file) update_cond_flag(file); if (hist_data->enable_timestamps) - tracing_set_time_stamp_abs(file->tr, false); + tracing_set_filter_buffering(file->tr, false); if (test->ops->free) test->ops->free(test->ops, test); } @@ -5812,7 +5840,8 @@ __init int register_trigger_hist_cmd(void) } static void -hist_enable_trigger(struct event_trigger_data *data, void *rec, +hist_enable_trigger(struct event_trigger_data *data, + struct trace_buffer *buffer, void *rec, struct ring_buffer_event *event) { struct enable_trigger_data *enable_data = data->private_data; @@ -5830,7 +5859,8 @@ hist_enable_trigger(struct event_trigger_data *data, void *rec, } static void -hist_enable_count_trigger(struct event_trigger_data *data, void *rec, +hist_enable_count_trigger(struct event_trigger_data *data, + struct trace_buffer *buffer, void *rec, struct ring_buffer_event *event) { if (!data->count) @@ -5839,7 +5869,7 @@ hist_enable_count_trigger(struct event_trigger_data *data, void *rec, if (data->count != -1) (data->count)--; - hist_enable_trigger(data, rec, event); + hist_enable_trigger(data, buffer, rec, event); } static struct event_trigger_ops hist_enable_trigger_ops = { diff --git a/kernel/trace/trace_events_synth.c b/kernel/trace/trace_events_synth.c index 8d71e6c83f10..2ac75eb6aa86 100644 --- a/kernel/trace/trace_events_synth.c +++ b/kernel/trace/trace_events_synth.c @@ -1385,7 +1385,7 @@ static int destroy_synth_event(struct synth_event *se) /** * synth_event_delete - Delete a synthetic event - * @event_name: The name of the new sythetic event + * @event_name: The name of the new synthetic event * * Delete a synthetic event that was created with synth_event_create(). * diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c index f725802160c0..b8bfa8505b7b 100644 --- a/kernel/trace/trace_events_trigger.c +++ b/kernel/trace/trace_events_trigger.c @@ -53,7 +53,8 @@ void trigger_data_free(struct event_trigger_data *data) * any trigger that should be deferred, ETT_NONE if nothing to defer. */ enum event_trigger_type -event_triggers_call(struct trace_event_file *file, void *rec, +event_triggers_call(struct trace_event_file *file, + struct trace_buffer *buffer, void *rec, struct ring_buffer_event *event) { struct event_trigger_data *data; @@ -67,7 +68,7 @@ event_triggers_call(struct trace_event_file *file, void *rec, if (data->paused) continue; if (!rec) { - data->ops->func(data, rec, event); + data->ops->func(data, buffer, rec, event); continue; } filter = rcu_dereference_sched(data->filter); @@ -77,7 +78,7 @@ event_triggers_call(struct trace_event_file *file, void *rec, tt |= data->cmd_ops->trigger_type; continue; } - data->ops->func(data, rec, event); + data->ops->func(data, buffer, rec, event); } return tt; } @@ -105,7 +106,7 @@ event_triggers_post_call(struct trace_event_file *file, if (data->paused) continue; if (data->cmd_ops->trigger_type & tt) - data->ops->func(data, NULL, NULL); + data->ops->func(data, NULL, NULL, NULL); } } EXPORT_SYMBOL_GPL(event_triggers_post_call); @@ -937,7 +938,8 @@ get_named_trigger_data(struct event_trigger_data *data) } static void -traceon_trigger(struct event_trigger_data *data, void *rec, +traceon_trigger(struct event_trigger_data *data, + struct trace_buffer *buffer, void *rec, struct ring_buffer_event *event) { if (tracing_is_on()) @@ -947,7 +949,8 @@ traceon_trigger(struct event_trigger_data *data, void *rec, } static void -traceon_count_trigger(struct event_trigger_data *data, void *rec, +traceon_count_trigger(struct event_trigger_data *data, + struct trace_buffer *buffer, void *rec, struct ring_buffer_event *event) { if (tracing_is_on()) @@ -963,7 +966,8 @@ traceon_count_trigger(struct event_trigger_data *data, void *rec, } static void -traceoff_trigger(struct event_trigger_data *data, void *rec, +traceoff_trigger(struct event_trigger_data *data, + struct trace_buffer *buffer, void *rec, struct ring_buffer_event *event) { if (!tracing_is_on()) @@ -973,7 +977,8 @@ traceoff_trigger(struct event_trigger_data *data, void *rec, } static void -traceoff_count_trigger(struct event_trigger_data *data, void *rec, +traceoff_count_trigger(struct event_trigger_data *data, + struct trace_buffer *buffer, void *rec, struct ring_buffer_event *event) { if (!tracing_is_on()) @@ -1071,7 +1076,8 @@ static struct event_command trigger_traceoff_cmd = { #ifdef CONFIG_TRACER_SNAPSHOT static void -snapshot_trigger(struct event_trigger_data *data, void *rec, +snapshot_trigger(struct event_trigger_data *data, + struct trace_buffer *buffer, void *rec, struct ring_buffer_event *event) { struct trace_event_file *file = data->private_data; @@ -1083,7 +1089,8 @@ snapshot_trigger(struct event_trigger_data *data, void *rec, } static void -snapshot_count_trigger(struct event_trigger_data *data, void *rec, +snapshot_count_trigger(struct event_trigger_data *data, + struct trace_buffer *buffer, void *rec, struct ring_buffer_event *event) { if (!data->count) @@ -1092,7 +1099,7 @@ snapshot_count_trigger(struct event_trigger_data *data, void *rec, if (data->count != -1) (data->count)--; - snapshot_trigger(data, rec, event); + snapshot_trigger(data, buffer, rec, event); } static int @@ -1176,14 +1183,16 @@ static __init int register_trigger_snapshot_cmd(void) { return 0; } #endif static void -stacktrace_trigger(struct event_trigger_data *data, void *rec, +stacktrace_trigger(struct event_trigger_data *data, + struct trace_buffer *buffer, void *rec, struct ring_buffer_event *event) { trace_dump_stack(STACK_SKIP); } static void -stacktrace_count_trigger(struct event_trigger_data *data, void *rec, +stacktrace_count_trigger(struct event_trigger_data *data, + struct trace_buffer *buffer, void *rec, struct ring_buffer_event *event) { if (!data->count) @@ -1192,7 +1201,7 @@ stacktrace_count_trigger(struct event_trigger_data *data, void *rec, if (data->count != -1) (data->count)--; - stacktrace_trigger(data, rec, event); + stacktrace_trigger(data, buffer, rec, event); } static int @@ -1254,7 +1263,8 @@ static __init void unregister_trigger_traceon_traceoff_cmds(void) } static void -event_enable_trigger(struct event_trigger_data *data, void *rec, +event_enable_trigger(struct event_trigger_data *data, + struct trace_buffer *buffer, void *rec, struct ring_buffer_event *event) { struct enable_trigger_data *enable_data = data->private_data; @@ -1266,7 +1276,8 @@ event_enable_trigger(struct event_trigger_data *data, void *rec, } static void -event_enable_count_trigger(struct event_trigger_data *data, void *rec, +event_enable_count_trigger(struct event_trigger_data *data, + struct trace_buffer *buffer, void *rec, struct ring_buffer_event *event) { struct enable_trigger_data *enable_data = data->private_data; @@ -1281,7 +1292,7 @@ event_enable_count_trigger(struct event_trigger_data *data, void *rec, if (data->count != -1) (data->count)--; - event_enable_trigger(data, rec, event); + event_enable_trigger(data, buffer, rec, event); } int event_enable_trigger_print(struct seq_file *m, diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index f93723ca66bc..1f0e63f5d1f9 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -27,13 +27,28 @@ function_trace_call(unsigned long ip, unsigned long parent_ip, static void function_stack_trace_call(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *op, struct ftrace_regs *fregs); +static void +function_no_repeats_trace_call(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *op, struct ftrace_regs *fregs); +static void +function_stack_no_repeats_trace_call(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *op, + struct ftrace_regs *fregs); static struct tracer_flags func_flags; /* Our option */ enum { - TRACE_FUNC_OPT_STACK = 0x1, + + TRACE_FUNC_NO_OPTS = 0x0, /* No flags set. */ + TRACE_FUNC_OPT_STACK = 0x1, + TRACE_FUNC_OPT_NO_REPEATS = 0x2, + + /* Update this to next highest bit. */ + TRACE_FUNC_OPT_HIGHEST_BIT = 0x4 }; +#define TRACE_FUNC_OPT_MASK (TRACE_FUNC_OPT_HIGHEST_BIT - 1) + int ftrace_allocate_ftrace_ops(struct trace_array *tr) { struct ftrace_ops *ops; @@ -86,6 +101,34 @@ void ftrace_destroy_function_files(struct trace_array *tr) ftrace_free_ftrace_ops(tr); } +static ftrace_func_t select_trace_function(u32 flags_val) +{ + switch (flags_val & TRACE_FUNC_OPT_MASK) { + case TRACE_FUNC_NO_OPTS: + return function_trace_call; + case TRACE_FUNC_OPT_STACK: + return function_stack_trace_call; + case TRACE_FUNC_OPT_NO_REPEATS: + return function_no_repeats_trace_call; + case TRACE_FUNC_OPT_STACK | TRACE_FUNC_OPT_NO_REPEATS: + return function_stack_no_repeats_trace_call; + default: + return NULL; + } +} + +static bool handle_func_repeats(struct trace_array *tr, u32 flags_val) +{ + if (!tr->last_func_repeats && + (flags_val & TRACE_FUNC_OPT_NO_REPEATS)) { + tr->last_func_repeats = alloc_percpu(struct trace_func_repeats); + if (!tr->last_func_repeats) + return false; + } + + return true; +} + static int function_trace_init(struct trace_array *tr) { ftrace_func_t func; @@ -97,12 +140,12 @@ static int function_trace_init(struct trace_array *tr) if (!tr->ops) return -ENOMEM; - /* Currently only the global instance can do stack tracing */ - if (tr->flags & TRACE_ARRAY_FL_GLOBAL && - func_flags.val & TRACE_FUNC_OPT_STACK) - func = function_stack_trace_call; - else - func = function_trace_call; + func = select_trace_function(func_flags.val); + if (!func) + return -EINVAL; + + if (!handle_func_repeats(tr, func_flags.val)) + return -ENOMEM; ftrace_init_array_ops(tr, func); @@ -205,15 +248,137 @@ function_stack_trace_call(unsigned long ip, unsigned long parent_ip, local_irq_restore(flags); } +static inline bool is_repeat_check(struct trace_array *tr, + struct trace_func_repeats *last_info, + unsigned long ip, unsigned long parent_ip) +{ + if (last_info->ip == ip && + last_info->parent_ip == parent_ip && + last_info->count < U16_MAX) { + last_info->ts_last_call = + ring_buffer_time_stamp(tr->array_buffer.buffer); + last_info->count++; + return true; + } + + return false; +} + +static inline void process_repeats(struct trace_array *tr, + unsigned long ip, unsigned long parent_ip, + struct trace_func_repeats *last_info, + unsigned int trace_ctx) +{ + if (last_info->count) { + trace_last_func_repeats(tr, last_info, trace_ctx); + last_info->count = 0; + } + + last_info->ip = ip; + last_info->parent_ip = parent_ip; +} + +static void +function_no_repeats_trace_call(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *op, + struct ftrace_regs *fregs) +{ + struct trace_func_repeats *last_info; + struct trace_array *tr = op->private; + struct trace_array_cpu *data; + unsigned int trace_ctx; + unsigned long flags; + int bit; + int cpu; + + if (unlikely(!tr->function_enabled)) + return; + + bit = ftrace_test_recursion_trylock(ip, parent_ip); + if (bit < 0) + return; + + preempt_disable_notrace(); + + cpu = smp_processor_id(); + data = per_cpu_ptr(tr->array_buffer.data, cpu); + if (atomic_read(&data->disabled)) + goto out; + + /* + * An interrupt may happen at any place here. But as far as I can see, + * the only damage that this can cause is to mess up the repetition + * counter without valuable data being lost. + * TODO: think about a solution that is better than just hoping to be + * lucky. + */ + last_info = per_cpu_ptr(tr->last_func_repeats, cpu); + if (is_repeat_check(tr, last_info, ip, parent_ip)) + goto out; + + local_save_flags(flags); + trace_ctx = tracing_gen_ctx_flags(flags); + process_repeats(tr, ip, parent_ip, last_info, trace_ctx); + + trace_function(tr, ip, parent_ip, trace_ctx); + +out: + ftrace_test_recursion_unlock(bit); + preempt_enable_notrace(); +} + +static void +function_stack_no_repeats_trace_call(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *op, + struct ftrace_regs *fregs) +{ + struct trace_func_repeats *last_info; + struct trace_array *tr = op->private; + struct trace_array_cpu *data; + unsigned long flags; + long disabled; + int cpu; + unsigned int trace_ctx; + + if (unlikely(!tr->function_enabled)) + return; + + /* + * Need to use raw, since this must be called before the + * recursive protection is performed. + */ + local_irq_save(flags); + cpu = raw_smp_processor_id(); + data = per_cpu_ptr(tr->array_buffer.data, cpu); + disabled = atomic_inc_return(&data->disabled); + + if (likely(disabled == 1)) { + last_info = per_cpu_ptr(tr->last_func_repeats, cpu); + if (is_repeat_check(tr, last_info, ip, parent_ip)) + goto out; + + trace_ctx = tracing_gen_ctx_flags(flags); + process_repeats(tr, ip, parent_ip, last_info, trace_ctx); + + trace_function(tr, ip, parent_ip, trace_ctx); + __trace_stack(tr, trace_ctx, STACK_SKIP); + } + + out: + atomic_dec(&data->disabled); + local_irq_restore(flags); +} + static struct tracer_opt func_opts[] = { #ifdef CONFIG_STACKTRACE { TRACER_OPT(func_stack_trace, TRACE_FUNC_OPT_STACK) }, #endif + { TRACER_OPT(func-no-repeats, TRACE_FUNC_OPT_NO_REPEATS) }, { } /* Always set a last empty entry */ }; static struct tracer_flags func_flags = { - .val = 0, /* By default: all flags disabled */ + .val = TRACE_FUNC_NO_OPTS, /* By default: all flags disabled */ .opts = func_opts }; @@ -235,30 +400,32 @@ static struct tracer function_trace; static int func_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set) { - switch (bit) { - case TRACE_FUNC_OPT_STACK: - /* do nothing if already set */ - if (!!set == !!(func_flags.val & TRACE_FUNC_OPT_STACK)) - break; - - /* We can change this flag when not running. */ - if (tr->current_trace != &function_trace) - break; + ftrace_func_t func; + u32 new_flags; - unregister_ftrace_function(tr->ops); + /* Do nothing if already set. */ + if (!!set == !!(func_flags.val & bit)) + return 0; - if (set) { - tr->ops->func = function_stack_trace_call; - register_ftrace_function(tr->ops); - } else { - tr->ops->func = function_trace_call; - register_ftrace_function(tr->ops); - } + /* We can change this flag only when not running. */ + if (tr->current_trace != &function_trace) + return 0; - break; - default: + new_flags = (func_flags.val & ~bit) | (set ? bit : 0); + func = select_trace_function(new_flags); + if (!func) return -EINVAL; - } + + /* Check if there's anything to change. */ + if (tr->ops->func == func) + return 0; + + if (!handle_func_repeats(tr, new_flags)) + return -ENOMEM; + + unregister_ftrace_function(tr->ops); + tr->ops->func = func; + register_ftrace_function(tr->ops); return 0; } diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 0aa6e6faa943..0de6837722da 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -764,7 +764,7 @@ print_graph_prologue(struct trace_iterator *iter, struct trace_seq *s, * - we are inside irq code * - we just entered irq code * - * retunns 0 if + * returns 0 if * - funcgraph-interrupts option is set * - we are not inside irq code */ diff --git a/kernel/trace/trace_hwlat.c b/kernel/trace/trace_hwlat.c index 34dc1a712dcb..632ef88131a9 100644 --- a/kernel/trace/trace_hwlat.c +++ b/kernel/trace/trace_hwlat.c @@ -83,7 +83,7 @@ struct hwlat_sample { u64 nmi_total_ts; /* Total time spent in NMIs */ struct timespec64 timestamp; /* wall time */ int nmi_count; /* # NMIs during this sample */ - int count; /* # of iteratons over threash */ + int count; /* # of iterations over thresh */ }; /* keep the global state somewhere. */ @@ -389,7 +389,7 @@ static int start_kthread(struct trace_array *tr) } /** - * stop_kthread - Inform the hardware latency samping/detector kthread to stop + * stop_kthread - Inform the hardware latency sampling/detector kthread to stop * * This kicks the running hardware latency sampling/detector kernel thread and * tells it to stop sampling now. Use this on unload and at system shutdown. diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 6fe770d86dc3..ea6178cb5e33 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -1748,7 +1748,7 @@ kretprobe_dispatcher(struct kretprobe_instance *ri, struct pt_regs *regs) if (trace_probe_test_flag(&tk->tp, TP_FLAG_PROFILE)) kretprobe_perf_func(tk, ri, regs); #endif - return 0; /* We don't tweek kernel, so just return 0 */ + return 0; /* We don't tweak kernel, so just return 0 */ } NOKPROBE_SYMBOL(kretprobe_dispatcher); diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 61255bad7e01..d0368a569bfa 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -317,7 +317,7 @@ void trace_event_printf(struct trace_iterator *iter, const char *fmt, ...) va_list ap; va_start(ap, fmt); - trace_seq_vprintf(&iter->seq, trace_event_format(iter, fmt), ap); + trace_check_vprintf(iter, trace_event_format(iter, fmt), ap); va_end(ap); } EXPORT_SYMBOL(trace_event_printf); @@ -587,13 +587,26 @@ lat_print_timestamp(struct trace_iterator *iter, u64 next_ts) return !trace_seq_has_overflowed(s); } +static void trace_print_time(struct trace_seq *s, struct trace_iterator *iter, + unsigned long long ts) +{ + unsigned long secs, usec_rem; + unsigned long long t; + + if (iter->iter_flags & TRACE_FILE_TIME_IN_NS) { + t = ns2usecs(ts); + usec_rem = do_div(t, USEC_PER_SEC); + secs = (unsigned long)t; + trace_seq_printf(s, " %5lu.%06lu", secs, usec_rem); + } else + trace_seq_printf(s, " %12llu", ts); +} + int trace_print_context(struct trace_iterator *iter) { struct trace_array *tr = iter->tr; struct trace_seq *s = &iter->seq; struct trace_entry *entry = iter->ent; - unsigned long long t; - unsigned long secs, usec_rem; char comm[TASK_COMM_LEN]; trace_find_cmdline(entry->pid, comm); @@ -614,13 +627,8 @@ int trace_print_context(struct trace_iterator *iter) if (tr->trace_flags & TRACE_ITER_IRQ_INFO) trace_print_lat_fmt(s, entry); - if (iter->iter_flags & TRACE_FILE_TIME_IN_NS) { - t = ns2usecs(iter->ts); - usec_rem = do_div(t, USEC_PER_SEC); - secs = (unsigned long)t; - trace_seq_printf(s, " %5lu.%06lu: ", secs, usec_rem); - } else - trace_seq_printf(s, " %12llu: ", iter->ts); + trace_print_time(s, iter, iter->ts); + trace_seq_puts(s, ": "); return !trace_seq_has_overflowed(s); } @@ -837,6 +845,17 @@ enum print_line_t trace_nop_print(struct trace_iterator *iter, int flags, return trace_handle_return(&iter->seq); } +static void print_fn_trace(struct trace_seq *s, unsigned long ip, + unsigned long parent_ip, int flags) +{ + seq_print_ip_sym(s, ip, flags); + + if ((flags & TRACE_ITER_PRINT_PARENT) && parent_ip) { + trace_seq_puts(s, " <-"); + seq_print_ip_sym(s, parent_ip, flags); + } +} + /* TRACE_FN */ static enum print_line_t trace_fn_trace(struct trace_iterator *iter, int flags, struct trace_event *event) @@ -846,13 +865,7 @@ static enum print_line_t trace_fn_trace(struct trace_iterator *iter, int flags, trace_assign_type(field, iter->ent); - seq_print_ip_sym(s, field->ip, flags); - - if ((flags & TRACE_ITER_PRINT_PARENT) && field->parent_ip) { - trace_seq_puts(s, " <-"); - seq_print_ip_sym(s, field->parent_ip, flags); - } - + print_fn_trace(s, field->ip, field->parent_ip, flags); trace_seq_putc(s, '\n'); return trace_handle_return(s); @@ -1373,6 +1386,51 @@ static struct trace_event trace_raw_data_event = { .funcs = &trace_raw_data_funcs, }; +static enum print_line_t +trace_func_repeats_raw(struct trace_iterator *iter, int flags, + struct trace_event *event) +{ + struct func_repeats_entry *field; + struct trace_seq *s = &iter->seq; + + trace_assign_type(field, iter->ent); + + trace_seq_printf(s, "%lu %lu %u %llu\n", + field->ip, + field->parent_ip, + field->count, + FUNC_REPEATS_GET_DELTA_TS(field)); + + return trace_handle_return(s); +} + +static enum print_line_t +trace_func_repeats_print(struct trace_iterator *iter, int flags, + struct trace_event *event) +{ + struct func_repeats_entry *field; + struct trace_seq *s = &iter->seq; + + trace_assign_type(field, iter->ent); + + print_fn_trace(s, field->ip, field->parent_ip, flags); + trace_seq_printf(s, " (repeats: %u, last_ts:", field->count); + trace_print_time(s, iter, + iter->ts - FUNC_REPEATS_GET_DELTA_TS(field)); + trace_seq_puts(s, ")\n"); + + return trace_handle_return(s); +} + +static struct trace_event_functions trace_func_repeats_funcs = { + .trace = trace_func_repeats_print, + .raw = trace_func_repeats_raw, +}; + +static struct trace_event trace_func_repeats_event = { + .type = TRACE_FUNC_REPEATS, + .funcs = &trace_func_repeats_funcs, +}; static struct trace_event *events[] __initdata = { &trace_fn_event, @@ -1385,6 +1443,7 @@ static struct trace_event *events[] __initdata = { &trace_print_event, &trace_hwlat_event, &trace_raw_data_event, + &trace_func_repeats_event, NULL }; diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c index ff32476df072..4b320fe7df70 100644 --- a/kernel/trace/trace_printk.c +++ b/kernel/trace/trace_printk.c @@ -251,6 +251,17 @@ int __ftrace_vprintk(unsigned long ip, const char *fmt, va_list ap) } EXPORT_SYMBOL_GPL(__ftrace_vprintk); +bool trace_is_tracepoint_string(const char *str) +{ + const char **ptr = __start___tracepoint_str; + + for (ptr = __start___tracepoint_str; ptr < __stop___tracepoint_str; ptr++) { + if (str == *ptr) + return true; + } + return false; +} + static const char **find_next(void *v, loff_t *pos) { const char **fmt = v; diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index ec589a4612df..15413ad7cef2 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -168,7 +168,7 @@ void __trace_probe_log_err(int offset, int err_type) if (!trace_probe_log.argv) return; - /* Recalcurate the length and allocate buffer */ + /* Recalculate the length and allocate buffer */ for (i = 0; i < trace_probe_log.argc; i++) { if (i == trace_probe_log.index) pos = len; @@ -182,7 +182,7 @@ void __trace_probe_log_err(int offset, int err_type) /** * Set the error position is next to the last arg + space. * Note that len includes the terminal null and the cursor - * appaers at pos + 1. + * appears at pos + 1. */ pos = len; offset = 0; @@ -592,7 +592,7 @@ static int traceprobe_parse_probe_arg_body(char *arg, ssize_t *size, } /* - * Since $comm and immediate string can not be dereferred, + * Since $comm and immediate string can not be dereferenced, * we can find those by strcmp. */ if (strcmp(arg, "$comm") == 0 || strncmp(arg, "\\\"", 2) == 0) { diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index 7ce4027089ee..227d518e5ba5 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h @@ -134,7 +134,7 @@ struct fetch_type { size_t size; /* Byte size of type */ int is_signed; /* Signed flag */ print_type_func_t print; /* Print functions */ - const char *fmt; /* Fromat string */ + const char *fmt; /* Format string */ const char *fmttype; /* Name in format file */ }; diff --git a/kernel/trace/trace_probe_tmpl.h b/kernel/trace/trace_probe_tmpl.h index e5282828f4a6..f003c5d02a3a 100644 --- a/kernel/trace/trace_probe_tmpl.h +++ b/kernel/trace/trace_probe_tmpl.h @@ -167,7 +167,7 @@ array: return code->op == FETCH_OP_END ? ret : -EILSEQ; } -/* Sum up total data length for dynamic arraies (strings) */ +/* Sum up total data length for dynamic arrays (strings) */ static nokprobe_inline int __get_data_size(struct trace_probe *tp, struct pt_regs *regs) { diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index 73ef12092250..adf7ef194005 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -878,7 +878,7 @@ trace_selftest_startup_preemptoff(struct tracer *trace, struct trace_array *tr) int ret; /* - * Now that the big kernel lock is no longer preemptable, + * Now that the big kernel lock is no longer preemptible, * and this is called with the BKL held, it will always * fail. If preemption is already disabled, simply * pass the test. When the BKL is removed, or becomes @@ -940,7 +940,7 @@ trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array * int ret; /* - * Now that the big kernel lock is no longer preemptable, + * Now that the big kernel lock is no longer preemptible, * and this is called with the BKL held, it will always * fail. If preemption is already disabled, simply * pass the test. When the BKL is removed, or becomes diff --git a/kernel/trace/trace_seq.c b/kernel/trace/trace_seq.c index 1d84fcc78e3e..9c90b3a7dce2 100644 --- a/kernel/trace/trace_seq.c +++ b/kernel/trace/trace_seq.c @@ -16,7 +16,7 @@ * The buffer size is currently PAGE_SIZE, although it may become dynamic * in the future. * - * A write to the buffer will either succed or fail. That is, unlike + * A write to the buffer will either succeed or fail. That is, unlike * sprintf() there will not be a partial write (well it may write into * the buffer but it wont update the pointers). This allows users to * try to write something into the trace_seq buffer and if it fails @@ -73,7 +73,7 @@ int trace_print_seq(struct seq_file *m, struct trace_seq *s) * @fmt: printf format string * * The tracer may use either sequence operations or its own - * copy to user routines. To simplify formating of a trace + * copy to user routines. To simplify formatting of a trace * trace_seq_printf() is used to store strings into a special * buffer (@s). Then the output may be either used by * the sequencer or pulled into another buffer. @@ -133,7 +133,7 @@ EXPORT_SYMBOL_GPL(trace_seq_bitmask); * @fmt: printf format string * * The tracer may use either sequence operations or its own - * copy to user routines. To simplify formating of a trace + * copy to user routines. To simplify formatting of a trace * trace_seq_printf is used to store strings into a special * buffer (@s). Then the output may be either used by * the sequencer or pulled into another buffer. @@ -226,7 +226,7 @@ EXPORT_SYMBOL_GPL(trace_seq_puts); * @c: simple character to record * * The tracer may use either the sequence operations or its own - * copy to user routines. This function records a simple charater + * copy to user routines. This function records a simple character * into a special buffer (@s) for later retrieval by a sequencer * or other mechanism. */ @@ -348,7 +348,7 @@ int trace_seq_path(struct trace_seq *s, const struct path *path) EXPORT_SYMBOL_GPL(trace_seq_path); /** - * trace_seq_to_user - copy the squence buffer to user space + * trace_seq_to_user - copy the sequence buffer to user space * @s: trace sequence descriptor * @ubuf: The userspace memory location to copy to * @cnt: The amount to copy @@ -363,7 +363,7 @@ EXPORT_SYMBOL_GPL(trace_seq_path); * * On failure it returns -EBUSY if all of the content in the * sequence has been already read, which includes nothing in the - * sequenc (@s->len == @s->readpos). + * sequence (@s->len == @s->readpos). * * Returns -EFAULT if the copy to userspace fails. */ diff --git a/kernel/umh.c b/kernel/umh.c index 3f646613a9d3..36c123360ab8 100644 --- a/kernel/umh.c +++ b/kernel/umh.c @@ -27,6 +27,7 @@ #include <linux/ptrace.h> #include <linux/async.h> #include <linux/uaccess.h> +#include <linux/initrd.h> #include <trace/events/module.h> @@ -107,6 +108,7 @@ static int call_usermodehelper_exec_async(void *data) commit_creds(new); + wait_for_initramfs(); retval = kernel_execve(sub_info->path, (const char *const *)sub_info->argv, (const char *const *)sub_info->envp); @@ -336,8 +338,8 @@ static void helper_unlock(void) * @argv: arg vector for process * @envp: environment for process * @gfp_mask: gfp mask for memory allocation - * @cleanup: a cleanup function * @init: an init function + * @cleanup: a cleanup function * @data: arbitrary context sensitive data * * Returns either %NULL on allocation failure, or a subprocess_info @@ -348,7 +350,7 @@ static void helper_unlock(void) * exec. A non-zero return code causes the process to error out, exit, * and return the failure to the calling process * - * The cleanup function is just before ethe subprocess_info is about to + * The cleanup function is just before the subprocess_info is about to * be freed. This can be used for freeing the argv and envp. The * Function must be runnable in either a process context or the * context in which call_usermodehelper_exec is called. @@ -384,7 +386,7 @@ EXPORT_SYMBOL(call_usermodehelper_setup); /** * call_usermodehelper_exec - start a usermode application - * @sub_info: information about the subprocessa + * @sub_info: information about the subprocess * @wait: wait for the application to finish and return status. * when UMH_NO_WAIT don't wait at all, but you get no useful error back * when the program couldn't be exec'ed. This makes it safe to call diff --git a/kernel/up.c b/kernel/up.c index bf20b4a9af60..a38b8b095251 100644 --- a/kernel/up.c +++ b/kernel/up.c @@ -25,7 +25,7 @@ int smp_call_function_single(int cpu, void (*func) (void *info), void *info, } EXPORT_SYMBOL(smp_call_function_single); -int smp_call_function_single_async(int cpu, call_single_data_t *csd) +int smp_call_function_single_async(int cpu, struct __call_single_data *csd) { unsigned long flags; @@ -38,7 +38,7 @@ EXPORT_SYMBOL(smp_call_function_single_async); /* * Preemption is disabled here to make sure the cond_func is called under the - * same condtions in UP and SMP. + * same conditions in UP and SMP. */ void on_each_cpu_cond_mask(smp_cond_func_t cond_func, smp_call_func_t func, void *info, bool wait, const struct cpumask *mask) diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index 9a4b980d695b..8d62863721b0 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -85,7 +85,7 @@ int create_user_ns(struct cred *new) /* * Verify that we can not violate the policy of which files * may be accessed that is specified by the root directory, - * by verifing that the root directory is at the root of the + * by verifying that the root directory is at the root of the * mount namespace which allows all files to be accessed. */ ret = -EPERM; @@ -1014,7 +1014,7 @@ static ssize_t map_write(struct file *file, const char __user *buf, goto out; ret = -EINVAL; } - /* Be very certaint the new map actually exists */ + /* Be very certain the new map actually exists */ if (new_map.nr_extents == 0) goto out; @@ -1169,7 +1169,7 @@ static bool new_idmap_permitted(const struct file *file, /* Allow the specified ids if we have the appropriate capability * (CAP_SETUID or CAP_SETGID) over the parent user namespace. - * And the opener of the id file also had the approprpiate capability. + * And the opener of the id file also has the appropriate capability. */ if (ns_capable(ns->parent, cap_setid) && file_ns_capable(file, ns->parent, cap_setid)) |