From 36f7355545725c5e9400520ae33e6ee16cf78c0e Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 9 Jul 2020 11:53:06 +0200 Subject: irqdomain/treewide: Keep firmware node unconditionally allocated [ Upstream commit e3beca48a45b5e0e6e6a4e0124276b8248dcc9bb ] Quite some non OF/ACPI users of irqdomains allocate firmware nodes of type IRQCHIP_FWNODE_NAMED or IRQCHIP_FWNODE_NAMED_ID and free them right after creating the irqdomain. The only purpose of these FW nodes is to convey name information. When this was introduced the core code did not store the pointer to the node in the irqdomain. A recent change stored the firmware node pointer in irqdomain for other reasons and missed to notice that the usage sites which do the alloc_fwnode/create_domain/free_fwnode sequence are broken by this. Storing a dangling pointer is dangerous itself, but in case that the domain is destroyed later on this leads to a double free. Remove the freeing of the firmware node after creating the irqdomain from all affected call sites to cure this. Fixes: 711419e504eb ("irqdomain: Add the missing assignment of domain->fwnode for named fwnode") Reported-by: Andy Shevchenko Signed-off-by: Thomas Gleixner Acked-by: Bjorn Helgaas Acked-by: Marc Zyngier Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/873661qakd.fsf@nanos.tec.linutronix.de Signed-off-by: Sasha Levin --- arch/x86/kernel/apic/io_apic.c | 10 +++++----- arch/x86/kernel/apic/msi.c | 18 ++++++++++++------ arch/x86/kernel/apic/vector.c | 1 - 3 files changed, 17 insertions(+), 12 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index f0262cb5657a..16699101fd2f 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -2329,12 +2329,12 @@ static int mp_irqdomain_create(int ioapic) ip->irqdomain = irq_domain_create_linear(fn, hwirqs, cfg->ops, (void *)(long)ioapic); - /* Release fw handle if it was allocated above */ - if (!cfg->dev) - irq_domain_free_fwnode(fn); - - if (!ip->irqdomain) + if (!ip->irqdomain) { + /* Release fw handle if it was allocated above */ + if (!cfg->dev) + irq_domain_free_fwnode(fn); return -ENOMEM; + } ip->irqdomain->parent = parent; diff --git a/arch/x86/kernel/apic/msi.c b/arch/x86/kernel/apic/msi.c index 159bd0cb8548..a20873bbbed6 100644 --- a/arch/x86/kernel/apic/msi.c +++ b/arch/x86/kernel/apic/msi.c @@ -262,12 +262,13 @@ void __init arch_init_msi_domain(struct irq_domain *parent) msi_default_domain = pci_msi_create_irq_domain(fn, &pci_msi_domain_info, parent); - irq_domain_free_fwnode(fn); } - if (!msi_default_domain) + if (!msi_default_domain) { + irq_domain_free_fwnode(fn); pr_warn("failed to initialize irqdomain for MSI/MSI-x.\n"); - else + } else { msi_default_domain->flags |= IRQ_DOMAIN_MSI_NOMASK_QUIRK; + } } #ifdef CONFIG_IRQ_REMAP @@ -300,7 +301,8 @@ struct irq_domain *arch_create_remap_msi_irq_domain(struct irq_domain *parent, if (!fn) return NULL; d = pci_msi_create_irq_domain(fn, &pci_msi_ir_domain_info, parent); - irq_domain_free_fwnode(fn); + if (!d) + irq_domain_free_fwnode(fn); return d; } #endif @@ -363,7 +365,8 @@ static struct irq_domain *dmar_get_irq_domain(void) if (fn) { dmar_domain = msi_create_irq_domain(fn, &dmar_msi_domain_info, x86_vector_domain); - irq_domain_free_fwnode(fn); + if (!dmar_domain) + irq_domain_free_fwnode(fn); } out: mutex_unlock(&dmar_lock); @@ -488,7 +491,10 @@ struct irq_domain *hpet_create_irq_domain(int hpet_id) } d = msi_create_irq_domain(fn, domain_info, parent); - irq_domain_free_fwnode(fn); + if (!d) { + irq_domain_free_fwnode(fn); + kfree(domain_info); + } return d; } diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c index 18c0dca08163..df4d5385e6dd 100644 --- a/arch/x86/kernel/apic/vector.c +++ b/arch/x86/kernel/apic/vector.c @@ -701,7 +701,6 @@ int __init arch_early_irq_init(void) x86_vector_domain = irq_domain_create_tree(fn, &x86_vector_domain_ops, NULL); BUG_ON(x86_vector_domain == NULL); - irq_domain_free_fwnode(fn); irq_set_default_host(x86_vector_domain); arch_init_msi_domain(x86_vector_domain); -- cgit v1.2.3 From 697bd3e4aa4ba9c3ec10a4b43192b58b12e580dc Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Tue, 21 Jul 2020 11:34:48 +0200 Subject: x86, vmlinux.lds: Page-align end of ..page_aligned sections commit de2b41be8fcccb2f5b6c480d35df590476344201 upstream. On x86-32 the idt_table with 256 entries needs only 2048 bytes. It is page-aligned, but the end of the .bss..page_aligned section is not guaranteed to be page-aligned. As a result, objects from other .bss sections may end up on the same 4k page as the idt_table, and will accidentially get mapped read-only during boot, causing unexpected page-faults when the kernel writes to them. This could be worked around by making the objects in the page aligned sections page sized, but that's wrong. Explicit sections which store only page aligned objects have an implicit guarantee that the object is alone in the page in which it is placed. That works for all objects except the last one. That's inconsistent. Enforcing page sized objects for these sections would wreckage memory sanitizers, because the object becomes artificially larger than it should be and out of bound access becomes legit. Align the end of the .bss..page_aligned and .data..page_aligned section on page-size so all objects places in these sections are guaranteed to have their own page. [ tglx: Amended changelog ] Signed-off-by: Joerg Roedel Signed-off-by: Thomas Gleixner Reviewed-by: Kees Cook Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/20200721093448.10417-1-joro@8bytes.org Signed-off-by: Greg Kroah-Hartman --- arch/x86/kernel/vmlinux.lds.S | 1 + include/asm-generic/vmlinux.lds.h | 5 ++++- 2 files changed, 5 insertions(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index bac1a65a9d39..1afe211d7a7c 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -362,6 +362,7 @@ SECTIONS .bss : AT(ADDR(.bss) - LOAD_OFFSET) { __bss_start = .; *(.bss..page_aligned) + . = ALIGN(PAGE_SIZE); *(BSS_MAIN) BSS_DECRYPTED . = ALIGN(PAGE_SIZE); diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index b6d7347ccda7..d7616d08e863 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -306,7 +306,8 @@ #define PAGE_ALIGNED_DATA(page_align) \ . = ALIGN(page_align); \ - *(.data..page_aligned) + *(.data..page_aligned) \ + . = ALIGN(page_align); #define READ_MOSTLY_DATA(align) \ . = ALIGN(align); \ @@ -695,7 +696,9 @@ . = ALIGN(bss_align); \ .bss : AT(ADDR(.bss) - LOAD_OFFSET) { \ BSS_FIRST_SECTIONS \ + . = ALIGN(PAGE_SIZE); \ *(.bss..page_aligned) \ + . = ALIGN(PAGE_SIZE); \ *(.dynbss) \ *(BSS_MAIN) \ *(COMMON) \ -- cgit v1.2.3 From 32344d2993b0228e238dff163d7447dcf250f147 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Fri, 17 Jul 2020 09:04:25 -0500 Subject: x86/unwind/orc: Fix ORC for newly forked tasks [ Upstream commit 372a8eaa05998cd45b3417d0e0ffd3a70978211a ] The ORC unwinder fails to unwind newly forked tasks which haven't yet run on the CPU. It correctly reads the 'ret_from_fork' instruction pointer from the stack, but it incorrectly interprets that value as a call stack address rather than a "signal" one, so the address gets incorrectly decremented in the call to orc_find(), resulting in bad ORC data. Fix it by forcing 'ret_from_fork' frames to be signal frames. Reported-by: Wang ShaoBo Signed-off-by: Josh Poimboeuf Signed-off-by: Thomas Gleixner Tested-by: Wang ShaoBo Link: https://lkml.kernel.org/r/f91a8778dde8aae7f71884b5df2b16d552040441.1594994374.git.jpoimboe@redhat.com Signed-off-by: Sasha Levin --- arch/x86/kernel/unwind_orc.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/unwind_orc.c b/arch/x86/kernel/unwind_orc.c index aa0f39dc8129..187a86e0e753 100644 --- a/arch/x86/kernel/unwind_orc.c +++ b/arch/x86/kernel/unwind_orc.c @@ -431,8 +431,11 @@ bool unwind_next_frame(struct unwind_state *state) /* * Find the orc_entry associated with the text address. * - * Decrement call return addresses by one so they work for sibling - * calls and calls to noreturn functions. + * For a call frame (as opposed to a signal frame), state->ip points to + * the instruction after the call. That instruction's stack layout + * could be different from the call instruction's layout, for example + * if the call was to a noreturn function. So get the ORC data for the + * call instruction itself. */ orc = orc_find(state->signal ? state->ip : state->ip - 1); if (!orc) { @@ -653,6 +656,7 @@ void __unwind_start(struct unwind_state *state, struct task_struct *task, state->sp = task->thread.sp; state->bp = READ_ONCE_NOCHECK(frame->bp); state->ip = READ_ONCE_NOCHECK(frame->ret_addr); + state->signal = (void *)state->ip == ret_from_fork; } if (get_stack_info((unsigned long *)state->sp, state->task, -- cgit v1.2.3 From 5f4e6b874b57a510ad38bd06b7f7ba1d209f57ad Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Fri, 17 Jul 2020 09:04:26 -0500 Subject: x86/stacktrace: Fix reliable check for empty user task stacks [ Upstream commit 039a7a30ec102ec866d382a66f87f6f7654f8140 ] If a user task's stack is empty, or if it only has user regs, ORC reports it as a reliable empty stack. But arch_stack_walk_reliable() incorrectly treats it as unreliable. That happens because the only success path for user tasks is inside the loop, which only iterates on non-empty stacks. Generally, a user task must end in a user regs frame, but an empty stack is an exception to that rule. Thanks to commit 71c95825289f ("x86/unwind/orc: Fix error handling in __unwind_start()"), unwind_start() now sets state->error appropriately. So now for both ORC and FP unwinders, unwind_done() and !unwind_error() always means the end of the stack was successfully reached. So the success path for kthreads is no longer needed -- it can also be used for empty user tasks. Reported-by: Wang ShaoBo Signed-off-by: Josh Poimboeuf Signed-off-by: Thomas Gleixner Tested-by: Wang ShaoBo Link: https://lkml.kernel.org/r/f136a4e5f019219cbc4f4da33b30c2f44fa65b84.1594994374.git.jpoimboe@redhat.com Signed-off-by: Sasha Levin --- arch/x86/kernel/stacktrace.c | 5 ----- 1 file changed, 5 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c index 2d6898c2cb64..6d83b4b857e6 100644 --- a/arch/x86/kernel/stacktrace.c +++ b/arch/x86/kernel/stacktrace.c @@ -58,7 +58,6 @@ int arch_stack_walk_reliable(stack_trace_consume_fn consume_entry, * or a page fault), which can make frame pointers * unreliable. */ - if (IS_ENABLED(CONFIG_FRAME_POINTER)) return -EINVAL; } @@ -81,10 +80,6 @@ int arch_stack_walk_reliable(stack_trace_consume_fn consume_entry, if (unwind_error(&state)) return -EINVAL; - /* Success path for non-user tasks, i.e. kthreads and idle tasks */ - if (!(task->flags & (PF_KTHREAD | PF_IDLE))) - return -EINVAL; - return 0; } -- cgit v1.2.3 From 39568546706f3cb647c99b1f3e271c958aea3414 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 29 Jul 2020 10:53:28 +0200 Subject: x86/i8259: Use printk_deferred() to prevent deadlock commit bdd65589593edd79b6a12ce86b3b7a7c6dae5208 upstream. 0day reported a possible circular locking dependency: Chain exists of: &irq_desc_lock_class --> console_owner --> &port_lock_key Possible unsafe locking scenario: CPU0 CPU1 ---- ---- lock(&port_lock_key); lock(console_owner); lock(&port_lock_key); lock(&irq_desc_lock_class); The reason for this is a printk() in the i8259 interrupt chip driver which is invoked with the irq descriptor lock held, which reverses the lock operations vs. printk() from arbitrary contexts. Switch the printk() to printk_deferred() to avoid that. Reported-by: kernel test robot Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/87365abt2v.fsf@nanos.tec.linutronix.de Signed-off-by: Greg Kroah-Hartman --- arch/x86/kernel/i8259.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/i8259.c b/arch/x86/kernel/i8259.c index 519649ddf100..fe522691ac71 100644 --- a/arch/x86/kernel/i8259.c +++ b/arch/x86/kernel/i8259.c @@ -207,7 +207,7 @@ spurious_8259A_irq: * lets ACK and report it. [once per IRQ] */ if (!(spurious_irq_mask & irqmask)) { - printk(KERN_DEBUG + printk_deferred(KERN_DEBUG "spurious 8259A interrupt: IRQ%d.\n", irq); spurious_irq_mask |= irqmask; } -- cgit v1.2.3