From 339e1e54891c339b30023c9cc8a005cbf65a3c0c Mon Sep 17 00:00:00 2001 From: Shawn Guo Date: Sat, 8 Oct 2016 16:59:38 +0800 Subject: clk: core: add __init decoration for CLK_OF_DECLARE_DRIVER function The new introduced macro CLK_OF_DECLARE_DRIVER is usually used to declare clock driver init functions, which are mostly decorated with __init. Add __init decoration for CLK_OF_DECLARE_DRIVER function to avoid causing section mismatch warnings on client clock drivers. Signed-off-by: Shawn Guo Fixes: c7296c51ce5d ("clk: core: New macro CLK_OF_DECLARE_DRIVER") Signed-off-by: Stephen Boyd --- include/linux/clk-provider.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/clk-provider.h b/include/linux/clk-provider.h index af596381fa0f..a428aec36ace 100644 --- a/include/linux/clk-provider.h +++ b/include/linux/clk-provider.h @@ -785,7 +785,7 @@ extern struct of_device_id __clk_of_table; * routines, one at of_clk_init(), and one at platform device probe */ #define CLK_OF_DECLARE_DRIVER(name, compat, fn) \ - static void name##_of_clk_init_driver(struct device_node *np) \ + static void __init name##_of_clk_init_driver(struct device_node *np) \ { \ of_node_clear_flag(np, OF_POPULATED); \ fn(np); \ -- cgit v1.2.3 From d33fd776f992332be110f6ceca6dad60cb5d513f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 20 Oct 2016 15:51:28 +1100 Subject: iomap: add IOMAP_REPORT This allows the file system to tell a FIEMAP from a read operation, and thus avoids the need to report flags that aren't actually used in the read path. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Reviewed-by: Brian Foster Signed-off-by: Dave Chinner --- fs/iomap.c | 2 +- include/linux/iomap.h | 17 +++++++++++------ 2 files changed, 12 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/fs/iomap.c b/fs/iomap.c index 013d1d36fbbf..a92204012e2d 100644 --- a/fs/iomap.c +++ b/fs/iomap.c @@ -561,7 +561,7 @@ int iomap_fiemap(struct inode *inode, struct fiemap_extent_info *fi, } while (len > 0) { - ret = iomap_apply(inode, start, len, 0, ops, &ctx, + ret = iomap_apply(inode, start, len, IOMAP_REPORT, ops, &ctx, iomap_fiemap_actor); /* inode with no (attribute) mapping will give ENOENT */ if (ret == -ENOENT) diff --git a/include/linux/iomap.h b/include/linux/iomap.h index e63e288dee83..7892f55a1866 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -19,11 +19,15 @@ struct vm_fault; #define IOMAP_UNWRITTEN 0x04 /* blocks allocated @blkno in unwritten state */ /* - * Flags for iomap mappings: + * Flags for all iomap mappings: */ -#define IOMAP_F_MERGED 0x01 /* contains multiple blocks/extents */ -#define IOMAP_F_SHARED 0x02 /* block shared with another file */ -#define IOMAP_F_NEW 0x04 /* blocks have been newly allocated */ +#define IOMAP_F_NEW 0x01 /* blocks have been newly allocated */ + +/* + * Flags that only need to be reported for IOMAP_REPORT requests: + */ +#define IOMAP_F_MERGED 0x10 /* contains multiple blocks/extents */ +#define IOMAP_F_SHARED 0x20 /* block shared with another file */ /* * Magic value for blkno: @@ -42,8 +46,9 @@ struct iomap { /* * Flags for iomap_begin / iomap_end. No flag implies a read. */ -#define IOMAP_WRITE (1 << 0) -#define IOMAP_ZERO (1 << 1) +#define IOMAP_WRITE (1 << 0) /* writing, must allocate blocks */ +#define IOMAP_ZERO (1 << 1) /* zeroing operation, may skip holes */ +#define IOMAP_REPORT (1 << 2) /* report extent status, e.g. FIEMAP */ struct iomap_ops { /* -- cgit v1.2.3 From f1caa61df2a3dc4c58316295c5dc5edba4c68d85 Mon Sep 17 00:00:00 2001 From: Sinan Kaya Date: Mon, 24 Oct 2016 00:31:31 -0400 Subject: ACPI/PCI: pci_link: penalize SCI correctly Ondrej reported that IRQs stopped working in v4.7 on several platforms. A typical scenario, from Ondrej's VT82C694X/694X, is: ACPI: Using PIC for interrupt routing ACPI: PCI Interrupt Link [LNKA] (IRQs 1 3 4 5 6 7 10 *11 12 14 15) ACPI: No IRQ available for PCI Interrupt Link [LNKA] 8139too 0000:00:0f.0: PCI INT A: no GSI We're using PIC routing, so acpi_irq_balance == 0, and LNKA is already active at IRQ 11. In that case, acpi_pci_link_allocate() only tries to use the active IRQ (IRQ 11) which also happens to be the SCI. We should penalize the SCI by PIRQ_PENALTY_PCI_USING, but irq_get_trigger_type(11) returns something other than IRQ_TYPE_LEVEL_LOW, so we penalize it by PIRQ_PENALTY_ISA_ALWAYS instead, which makes acpi_pci_link_allocate() assume the IRQ isn't available and give up. Add acpi_penalize_sci_irq() so platforms can tell us the SCI IRQ, trigger, and polarity directly and we don't have to depend on irq_get_trigger_type(). Fixes: 103544d86976 (ACPI,PCI,IRQ: reduce resource requirements) Link: http://lkml.kernel.org/r/201609251512.05657.linux@rainbow-software.org Reported-by: Ondrej Zary Acked-by: Bjorn Helgaas Signed-off-by: Sinan Kaya Tested-by: Jonathan Liu Signed-off-by: Rafael J. Wysocki --- arch/x86/kernel/acpi/boot.c | 1 + drivers/acpi/pci_link.c | 30 +++++++++++++++--------------- include/linux/acpi.h | 1 + 3 files changed, 17 insertions(+), 15 deletions(-) (limited to 'include') diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 8a5abaa7d453..931ced8ca345 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -454,6 +454,7 @@ static void __init acpi_sci_ioapic_setup(u8 bus_irq, u16 polarity, u16 trigger, polarity = acpi_sci_flags & ACPI_MADT_POLARITY_MASK; mp_override_legacy_irq(bus_irq, polarity, trigger, gsi); + acpi_penalize_sci_irq(bus_irq, trigger, polarity); /* * stash over-ride to indicate we've been here diff --git a/drivers/acpi/pci_link.c b/drivers/acpi/pci_link.c index 6229b022a5d4..74bf96efae95 100644 --- a/drivers/acpi/pci_link.c +++ b/drivers/acpi/pci_link.c @@ -87,6 +87,7 @@ struct acpi_pci_link { static LIST_HEAD(acpi_link_list); static DEFINE_MUTEX(acpi_link_lock); +static int sci_irq = -1, sci_penalty; /* -------------------------------------------------------------------------- PCI Link Device Management @@ -496,25 +497,13 @@ static int acpi_irq_get_penalty(int irq) { int penalty = 0; - /* - * Penalize IRQ used by ACPI SCI. If ACPI SCI pin attributes conflict - * with PCI IRQ attributes, mark ACPI SCI as ISA_ALWAYS so it won't be - * use for PCI IRQs. - */ - if (irq == acpi_gbl_FADT.sci_interrupt) { - u32 type = irq_get_trigger_type(irq) & IRQ_TYPE_SENSE_MASK; - - if (type != IRQ_TYPE_LEVEL_LOW) - penalty += PIRQ_PENALTY_ISA_ALWAYS; - else - penalty += PIRQ_PENALTY_PCI_USING; - } + if (irq == sci_irq) + penalty += sci_penalty; if (irq < ACPI_MAX_ISA_IRQS) return penalty + acpi_isa_irq_penalty[irq]; - penalty += acpi_irq_pci_sharing_penalty(irq); - return penalty; + return penalty + acpi_irq_pci_sharing_penalty(irq); } int __init acpi_irq_penalty_init(void) @@ -881,6 +870,17 @@ bool acpi_isa_irq_available(int irq) acpi_irq_get_penalty(irq) < PIRQ_PENALTY_ISA_ALWAYS); } +void acpi_penalize_sci_irq(int irq, int trigger, int polarity) +{ + sci_irq = irq; + + if (trigger == ACPI_MADT_TRIGGER_LEVEL && + polarity == ACPI_MADT_POLARITY_ACTIVE_LOW) + sci_penalty = PIRQ_PENALTY_PCI_USING; + else + sci_penalty = PIRQ_PENALTY_ISA_ALWAYS; +} + /* * Over-ride default table to reserve additional IRQs for use by ISA * e.g. acpi_irq_isa=5 diff --git a/include/linux/acpi.h b/include/linux/acpi.h index ddbeda6dbdc8..689a8b9b9c8f 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -326,6 +326,7 @@ struct pci_dev; int acpi_pci_irq_enable (struct pci_dev *dev); void acpi_penalize_isa_irq(int irq, int active); bool acpi_isa_irq_available(int irq); +void acpi_penalize_sci_irq(int irq, int trigger, int polarity); void acpi_pci_irq_disable (struct pci_dev *dev); extern int ec_read(u8 addr, u8 *val); -- cgit v1.2.3 From 0d7317598214134d73da59990b846481a9527a00 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Mon, 24 Oct 2016 10:57:25 +0100 Subject: mm: unexport __get_user_pages() This patch unexports the low-level __get_user_pages() function. Recent refactoring of the get_user_pages* functions allow flags to be passed through get_user_pages() which eliminates the need for access to this function from its one user, kvm. We can see that the two calls to get_user_pages() which replace __get_user_pages() in kvm_main.c are equivalent by examining their call stacks: get_user_page_nowait(): get_user_pages(start, 1, flags, page, NULL) __get_user_pages_locked(current, current->mm, start, 1, page, NULL, NULL, false, flags | FOLL_TOUCH) __get_user_pages(current, current->mm, start, 1, flags | FOLL_TOUCH | FOLL_GET, page, NULL, NULL) check_user_page_hwpoison(): get_user_pages(addr, 1, flags, NULL, NULL) __get_user_pages_locked(current, current->mm, addr, 1, NULL, NULL, NULL, false, flags | FOLL_TOUCH) __get_user_pages(current, current->mm, addr, 1, flags | FOLL_TOUCH, NULL, NULL, NULL) Signed-off-by: Lorenzo Stoakes Acked-by: Paolo Bonzini Acked-by: Michal Hocko Signed-off-by: Linus Torvalds --- include/linux/mm.h | 4 ---- mm/gup.c | 3 +-- mm/nommu.c | 2 +- virt/kvm/kvm_main.c | 10 ++++------ 4 files changed, 6 insertions(+), 13 deletions(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index 3a191853faaa..a92c8d73aeaf 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1271,10 +1271,6 @@ extern int access_process_vm(struct task_struct *tsk, unsigned long addr, void * extern int access_remote_vm(struct mm_struct *mm, unsigned long addr, void *buf, int len, unsigned int gup_flags); -long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, - unsigned long start, unsigned long nr_pages, - unsigned int foll_flags, struct page **pages, - struct vm_area_struct **vmas, int *nonblocking); long get_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm, unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages, diff --git a/mm/gup.c b/mm/gup.c index 7aa113c2d373..ec4f82704b6f 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -526,7 +526,7 @@ static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags) * instead of __get_user_pages. __get_user_pages should be used only if * you need some special @gup_flags. */ -long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, +static long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages, struct vm_area_struct **vmas, int *nonblocking) @@ -631,7 +631,6 @@ next_page: } while (nr_pages); return i; } -EXPORT_SYMBOL(__get_user_pages); bool vma_permits_fault(struct vm_area_struct *vma, unsigned int fault_flags) { diff --git a/mm/nommu.c b/mm/nommu.c index db5fd1795298..8b8faaf2a9e9 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -109,7 +109,7 @@ unsigned int kobjsize(const void *objp) return PAGE_SIZE << compound_order(page); } -long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, +static long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, unsigned long start, unsigned long nr_pages, unsigned int foll_flags, struct page **pages, struct vm_area_struct **vmas, int *nonblocking) diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 28510e72618a..2907b7b78654 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -1346,21 +1346,19 @@ unsigned long kvm_vcpu_gfn_to_hva_prot(struct kvm_vcpu *vcpu, gfn_t gfn, bool *w static int get_user_page_nowait(unsigned long start, int write, struct page **page) { - int flags = FOLL_TOUCH | FOLL_NOWAIT | FOLL_HWPOISON | FOLL_GET; + int flags = FOLL_NOWAIT | FOLL_HWPOISON; if (write) flags |= FOLL_WRITE; - return __get_user_pages(current, current->mm, start, 1, flags, page, - NULL, NULL); + return get_user_pages(start, 1, flags, page, NULL); } static inline int check_user_page_hwpoison(unsigned long addr) { - int rc, flags = FOLL_TOUCH | FOLL_HWPOISON | FOLL_WRITE; + int rc, flags = FOLL_HWPOISON | FOLL_WRITE; - rc = __get_user_pages(current, current->mm, addr, 1, - flags, NULL, NULL, NULL); + rc = get_user_pages(addr, 1, flags, NULL, NULL); return rc == -EHWPOISON; } -- cgit v1.2.3 From 8ef4227615e158faa4ee85a1d6466782f7e22f2f Mon Sep 17 00:00:00 2001 From: Dave Airlie Date: Mon, 24 Oct 2016 15:27:59 +1000 Subject: x86/io: add interface to reserve io memtype for a resource range. (v1.1) A recent change to the mm code in: 87744ab3832b mm: fix cache mode tracking in vm_insert_mixed() started enforcing checking the memory type against the registered list for amixed pfn insertion mappings. It happens that the drm drivers for a number of gpus relied on this being broken. Currently the driver only inserted VRAM mappings into the tracking table when they came from the kernel, and userspace mappings never landed in the table. This led to a regression where all the mapping end up as UC instead of WC now. I've considered a number of solutions but since this needs to be fixed in fixes and not next, and some of the solutions were going to introduce overhead that hadn't been there before I didn't consider them viable at this stage. These mainly concerned hooking into the TTM io reserve APIs, but these API have a bunch of fast paths I didn't want to unwind to add this to. The solution I've decided on is to add a new API like the arch_phys_wc APIs (these would have worked but wc_del didn't take a range), and use them from the drivers to add a WC compatible mapping to the table for all VRAM on those GPUs. This means we can then create userspace mapping that won't get degraded to UC. v1.1: use CONFIG_X86_PAT + add some comments in io.h Cc: Toshi Kani Cc: Borislav Petkov Cc: H. Peter Anvin Cc: Andy Lutomirski Cc: Denys Vlasenko Cc: Brian Gerst Cc: x86@kernel.org Cc: mcgrof@suse.com Cc: Dan Williams Acked-by: Ingo Molnar Reviewed-by: Thomas Gleixner Signed-off-by: Dave Airlie --- arch/x86/include/asm/io.h | 6 ++++++ arch/x86/mm/pat.c | 14 ++++++++++++++ include/linux/io.h | 22 ++++++++++++++++++++++ 3 files changed, 42 insertions(+) (limited to 'include') diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h index de25aad07853..d34bd370074b 100644 --- a/arch/x86/include/asm/io.h +++ b/arch/x86/include/asm/io.h @@ -351,4 +351,10 @@ extern void arch_phys_wc_del(int handle); #define arch_phys_wc_add arch_phys_wc_add #endif +#ifdef CONFIG_X86_PAT +extern int arch_io_reserve_memtype_wc(resource_size_t start, resource_size_t size); +extern void arch_io_free_memtype_wc(resource_size_t start, resource_size_t size); +#define arch_io_reserve_memtype_wc arch_io_reserve_memtype_wc +#endif + #endif /* _ASM_X86_IO_H */ diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index 170cc4ff057b..83e701f160a9 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -730,6 +730,20 @@ void io_free_memtype(resource_size_t start, resource_size_t end) free_memtype(start, end); } +int arch_io_reserve_memtype_wc(resource_size_t start, resource_size_t size) +{ + enum page_cache_mode type = _PAGE_CACHE_MODE_WC; + + return io_reserve_memtype(start, start + size, &type); +} +EXPORT_SYMBOL(arch_io_reserve_memtype_wc); + +void arch_io_free_memtype_wc(resource_size_t start, resource_size_t size) +{ + io_free_memtype(start, start + size); +} +EXPORT_SYMBOL(arch_io_free_memtype_wc); + pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn, unsigned long size, pgprot_t vma_prot) { diff --git a/include/linux/io.h b/include/linux/io.h index e2c8419278c1..82ef36eac8a1 100644 --- a/include/linux/io.h +++ b/include/linux/io.h @@ -141,4 +141,26 @@ enum { void *memremap(resource_size_t offset, size_t size, unsigned long flags); void memunmap(void *addr); +/* + * On x86 PAT systems we have memory tracking that keeps track of + * the allowed mappings on memory ranges. This tracking works for + * all the in-kernel mapping APIs (ioremap*), but where the user + * wishes to map a range from a physical device into user memory + * the tracking won't be updated. This API is to be used by + * drivers which remap physical device pages into userspace, + * and wants to make sure they are mapped WC and not UC. + */ +#ifndef arch_io_reserve_memtype_wc +static inline int arch_io_reserve_memtype_wc(resource_size_t base, + resource_size_t size) +{ + return 0; +} + +static inline void arch_io_free_memtype_wc(resource_size_t base, + resource_size_t size) +{ +} +#endif + #endif /* _LINUX_IO_H */ -- cgit v1.2.3 From 9dcb8b685fc30813b35ab4b4bf39244430753190 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Wed, 26 Oct 2016 10:15:30 -0700 Subject: mm: remove per-zone hashtable of bitlock waitqueues The per-zone waitqueues exist because of a scalability issue with the page waitqueues on some NUMA machines, but it turns out that they hurt normal loads, and now with the vmalloced stacks they also end up breaking gfs2 that uses a bit_wait on a stack object: wait_on_bit(&gh->gh_iflags, HIF_WAIT, TASK_UNINTERRUPTIBLE) where 'gh' can be a reference to the local variable 'mount_gh' on the stack of fill_super(). The reason the per-zone hash table breaks for this case is that there is no "zone" for virtual allocations, and trying to look up the physical page to get at it will fail (with a BUG_ON()). It turns out that I actually complained to the mm people about the per-zone hash table for another reason just a month ago: the zone lookup also hurts the regular use of "unlock_page()" a lot, because the zone lookup ends up forcing several unnecessary cache misses and generates horrible code. As part of that earlier discussion, we had a much better solution for the NUMA scalability issue - by just making the page lock have a separate contention bit, the waitqueue doesn't even have to be looked at for the normal case. Peter Zijlstra already has a patch for that, but let's see if anybody even notices. In the meantime, let's fix the actual gfs2 breakage by simplifying the bitlock waitqueues and removing the per-zone issue. Reported-by: Andreas Gruenbacher Tested-by: Bob Peterson Acked-by: Mel Gorman Cc: Peter Zijlstra Cc: Andy Lutomirski Cc: Steven Whitehouse Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 30 +------------ kernel/sched/core.c | 16 +++++++ kernel/sched/wait.c | 10 ----- mm/filemap.c | 4 +- mm/memory_hotplug.c | 28 ------------ mm/page_alloc.c | 115 +------------------------------------------------ 6 files changed, 21 insertions(+), 182 deletions(-) (limited to 'include') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 7f2ae99e5daf..0f088f3a2fed 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -440,33 +440,7 @@ struct zone { seqlock_t span_seqlock; #endif - /* - * wait_table -- the array holding the hash table - * wait_table_hash_nr_entries -- the size of the hash table array - * wait_table_bits -- wait_table_size == (1 << wait_table_bits) - * - * The purpose of all these is to keep track of the people - * waiting for a page to become available and make them - * runnable again when possible. The trouble is that this - * consumes a lot of space, especially when so few things - * wait on pages at a given time. So instead of using - * per-page waitqueues, we use a waitqueue hash table. - * - * The bucket discipline is to sleep on the same queue when - * colliding and wake all in that wait queue when removing. - * When something wakes, it must check to be sure its page is - * truly available, a la thundering herd. The cost of a - * collision is great, but given the expected load of the - * table, they should be so rare as to be outweighed by the - * benefits from the saved space. - * - * __wait_on_page_locked() and unlock_page() in mm/filemap.c, are the - * primary users of these fields, and in mm/page_alloc.c - * free_area_init_core() performs the initialization of them. - */ - wait_queue_head_t *wait_table; - unsigned long wait_table_hash_nr_entries; - unsigned long wait_table_bits; + int initialized; /* Write-intensive fields used from the page allocator */ ZONE_PADDING(_pad1_) @@ -546,7 +520,7 @@ static inline bool zone_spans_pfn(const struct zone *zone, unsigned long pfn) static inline bool zone_is_initialized(struct zone *zone) { - return !!zone->wait_table; + return zone->initialized; } static inline bool zone_is_empty(struct zone *zone) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 94732d1ab00a..42d4027f9e26 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7515,11 +7515,27 @@ static struct kmem_cache *task_group_cache __read_mostly; DECLARE_PER_CPU(cpumask_var_t, load_balance_mask); DECLARE_PER_CPU(cpumask_var_t, select_idle_mask); +#define WAIT_TABLE_BITS 8 +#define WAIT_TABLE_SIZE (1 << WAIT_TABLE_BITS) +static wait_queue_head_t bit_wait_table[WAIT_TABLE_SIZE] __cacheline_aligned; + +wait_queue_head_t *bit_waitqueue(void *word, int bit) +{ + const int shift = BITS_PER_LONG == 32 ? 5 : 6; + unsigned long val = (unsigned long)word << shift | bit; + + return bit_wait_table + hash_long(val, WAIT_TABLE_BITS); +} +EXPORT_SYMBOL(bit_waitqueue); + void __init sched_init(void) { int i, j; unsigned long alloc_size = 0, ptr; + for (i = 0; i < WAIT_TABLE_SIZE; i++) + init_waitqueue_head(bit_wait_table + i); + #ifdef CONFIG_FAIR_GROUP_SCHED alloc_size += 2 * nr_cpu_ids * sizeof(void **); #endif diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c index 4f7053579fe3..9453efe9b25a 100644 --- a/kernel/sched/wait.c +++ b/kernel/sched/wait.c @@ -480,16 +480,6 @@ void wake_up_bit(void *word, int bit) } EXPORT_SYMBOL(wake_up_bit); -wait_queue_head_t *bit_waitqueue(void *word, int bit) -{ - const int shift = BITS_PER_LONG == 32 ? 5 : 6; - const struct zone *zone = page_zone(virt_to_page(word)); - unsigned long val = (unsigned long)word << shift | bit; - - return &zone->wait_table[hash_long(val, zone->wait_table_bits)]; -} -EXPORT_SYMBOL(bit_waitqueue); - /* * Manipulate the atomic_t address to produce a better bit waitqueue table hash * index (we're keying off bit -1, but that would produce a horrible hash diff --git a/mm/filemap.c b/mm/filemap.c index 849f459ad078..c7fe2f16503f 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -790,9 +790,7 @@ EXPORT_SYMBOL(__page_cache_alloc); */ wait_queue_head_t *page_waitqueue(struct page *page) { - const struct zone *zone = page_zone(page); - - return &zone->wait_table[hash_ptr(page, zone->wait_table_bits)]; + return bit_waitqueue(page, 0); } EXPORT_SYMBOL(page_waitqueue); diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 962927309b6e..b18dab401be6 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -268,7 +268,6 @@ void __init register_page_bootmem_info_node(struct pglist_data *pgdat) unsigned long i, pfn, end_pfn, nr_pages; int node = pgdat->node_id; struct page *page; - struct zone *zone; nr_pages = PAGE_ALIGN(sizeof(struct pglist_data)) >> PAGE_SHIFT; page = virt_to_page(pgdat); @@ -276,19 +275,6 @@ void __init register_page_bootmem_info_node(struct pglist_data *pgdat) for (i = 0; i < nr_pages; i++, page++) get_page_bootmem(node, page, NODE_INFO); - zone = &pgdat->node_zones[0]; - for (; zone < pgdat->node_zones + MAX_NR_ZONES - 1; zone++) { - if (zone_is_initialized(zone)) { - nr_pages = zone->wait_table_hash_nr_entries - * sizeof(wait_queue_head_t); - nr_pages = PAGE_ALIGN(nr_pages) >> PAGE_SHIFT; - page = virt_to_page(zone->wait_table); - - for (i = 0; i < nr_pages; i++, page++) - get_page_bootmem(node, page, NODE_INFO); - } - } - pfn = pgdat->node_start_pfn; end_pfn = pgdat_end_pfn(pgdat); @@ -2158,20 +2144,6 @@ void try_offline_node(int nid) */ node_set_offline(nid); unregister_one_node(nid); - - /* free waittable in each zone */ - for (i = 0; i < MAX_NR_ZONES; i++) { - struct zone *zone = pgdat->node_zones + i; - - /* - * wait_table may be allocated from boot memory, - * here only free if it's allocated by vmalloc. - */ - if (is_vmalloc_addr(zone->wait_table)) { - vfree(zone->wait_table); - zone->wait_table = NULL; - } - } } EXPORT_SYMBOL(try_offline_node); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 2b3bf6767d54..de7c6e43b1c9 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4976,72 +4976,6 @@ void __ref build_all_zonelists(pg_data_t *pgdat, struct zone *zone) #endif } -/* - * Helper functions to size the waitqueue hash table. - * Essentially these want to choose hash table sizes sufficiently - * large so that collisions trying to wait on pages are rare. - * But in fact, the number of active page waitqueues on typical - * systems is ridiculously low, less than 200. So this is even - * conservative, even though it seems large. - * - * The constant PAGES_PER_WAITQUEUE specifies the ratio of pages to - * waitqueues, i.e. the size of the waitq table given the number of pages. - */ -#define PAGES_PER_WAITQUEUE 256 - -#ifndef CONFIG_MEMORY_HOTPLUG -static inline unsigned long wait_table_hash_nr_entries(unsigned long pages) -{ - unsigned long size = 1; - - pages /= PAGES_PER_WAITQUEUE; - - while (size < pages) - size <<= 1; - - /* - * Once we have dozens or even hundreds of threads sleeping - * on IO we've got bigger problems than wait queue collision. - * Limit the size of the wait table to a reasonable size. - */ - size = min(size, 4096UL); - - return max(size, 4UL); -} -#else -/* - * A zone's size might be changed by hot-add, so it is not possible to determine - * a suitable size for its wait_table. So we use the maximum size now. - * - * The max wait table size = 4096 x sizeof(wait_queue_head_t). ie: - * - * i386 (preemption config) : 4096 x 16 = 64Kbyte. - * ia64, x86-64 (no preemption): 4096 x 20 = 80Kbyte. - * ia64, x86-64 (preemption) : 4096 x 24 = 96Kbyte. - * - * The maximum entries are prepared when a zone's memory is (512K + 256) pages - * or more by the traditional way. (See above). It equals: - * - * i386, x86-64, powerpc(4K page size) : = ( 2G + 1M)byte. - * ia64(16K page size) : = ( 8G + 4M)byte. - * powerpc (64K page size) : = (32G +16M)byte. - */ -static inline unsigned long wait_table_hash_nr_entries(unsigned long pages) -{ - return 4096UL; -} -#endif - -/* - * This is an integer logarithm so that shifts can be used later - * to extract the more random high bits from the multiplicative - * hash function before the remainder is taken. - */ -static inline unsigned long wait_table_bits(unsigned long size) -{ - return ffz(~size); -} - /* * Initially all pages are reserved - free ones are freed * up by free_all_bootmem() once the early boot process is @@ -5304,49 +5238,6 @@ void __init setup_per_cpu_pageset(void) alloc_percpu(struct per_cpu_nodestat); } -static noinline __ref -int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages) -{ - int i; - size_t alloc_size; - - /* - * The per-page waitqueue mechanism uses hashed waitqueues - * per zone. - */ - zone->wait_table_hash_nr_entries = - wait_table_hash_nr_entries(zone_size_pages); - zone->wait_table_bits = - wait_table_bits(zone->wait_table_hash_nr_entries); - alloc_size = zone->wait_table_hash_nr_entries - * sizeof(wait_queue_head_t); - - if (!slab_is_available()) { - zone->wait_table = (wait_queue_head_t *) - memblock_virt_alloc_node_nopanic( - alloc_size, zone->zone_pgdat->node_id); - } else { - /* - * This case means that a zone whose size was 0 gets new memory - * via memory hot-add. - * But it may be the case that a new node was hot-added. In - * this case vmalloc() will not be able to use this new node's - * memory - this wait_table must be initialized to use this new - * node itself as well. - * To use this new node's memory, further consideration will be - * necessary. - */ - zone->wait_table = vmalloc(alloc_size); - } - if (!zone->wait_table) - return -ENOMEM; - - for (i = 0; i < zone->wait_table_hash_nr_entries; ++i) - init_waitqueue_head(zone->wait_table + i); - - return 0; -} - static __meminit void zone_pcp_init(struct zone *zone) { /* @@ -5367,10 +5258,7 @@ int __meminit init_currently_empty_zone(struct zone *zone, unsigned long size) { struct pglist_data *pgdat = zone->zone_pgdat; - int ret; - ret = zone_wait_table_init(zone, size); - if (ret) - return ret; + pgdat->nr_zones = zone_idx(zone) + 1; zone->zone_start_pfn = zone_start_pfn; @@ -5382,6 +5270,7 @@ int __meminit init_currently_empty_zone(struct zone *zone, zone_start_pfn, (zone_start_pfn + size)); zone_init_free_lists(zone); + zone->initialized = 1; return 0; } -- cgit v1.2.3 From c0a0aba8e478229b2f0956918542152fbad3f794 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Thu, 27 Oct 2016 17:46:38 -0700 Subject: kconfig.h: remove config_enabled() macro The use of config_enabled() is ambiguous. For config options, IS_ENABLED(), IS_REACHABLE(), etc. will make intention clearer. Sometimes config_enabled() has been used for non-config options because it is useful to check whether the given symbol is defined or not. I have been tackling on deprecating config_enabled(), and now is the time to finish this work. Some new users have appeared for v4.9-rc1, but it is trivial to replace them: - arch/x86/mm/kaslr.c replace config_enabled() with IS_ENABLED() because CONFIG_X86_ESPFIX64 and CONFIG_EFI are boolean. - include/asm-generic/export.h replace config_enabled() with __is_defined(). Then, config_enabled() can be removed now. Going forward, please use IS_ENABLED(), IS_REACHABLE(), etc. for config options, and __is_defined() for non-config symbols. Link: http://lkml.kernel.org/r/1476616078-32252-1-git-send-email-yamada.masahiro@socionext.com Signed-off-by: Masahiro Yamada Acked-by: Ingo Molnar Acked-by: Nicolas Pitre Cc: Peter Oberparleiter Cc: Arnd Bergmann Cc: Kees Cook Cc: Michal Marek Cc: "H. Peter Anvin" Cc: Thomas Gleixner Cc: Thomas Garnier Cc: Paul Bolle Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/mm/kaslr.c | 6 +++--- include/asm-generic/export.h | 2 +- include/linux/kconfig.h | 5 ++--- 3 files changed, 6 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/arch/x86/mm/kaslr.c b/arch/x86/mm/kaslr.c index ddd2661c4502..887e57182716 100644 --- a/arch/x86/mm/kaslr.c +++ b/arch/x86/mm/kaslr.c @@ -104,10 +104,10 @@ void __init kernel_randomize_memory(void) * consistent with the vaddr_start/vaddr_end variables. */ BUILD_BUG_ON(vaddr_start >= vaddr_end); - BUILD_BUG_ON(config_enabled(CONFIG_X86_ESPFIX64) && + BUILD_BUG_ON(IS_ENABLED(CONFIG_X86_ESPFIX64) && vaddr_end >= EFI_VA_START); - BUILD_BUG_ON((config_enabled(CONFIG_X86_ESPFIX64) || - config_enabled(CONFIG_EFI)) && + BUILD_BUG_ON((IS_ENABLED(CONFIG_X86_ESPFIX64) || + IS_ENABLED(CONFIG_EFI)) && vaddr_end >= __START_KERNEL_map); BUILD_BUG_ON(vaddr_end > __START_KERNEL_map); diff --git a/include/asm-generic/export.h b/include/asm-generic/export.h index 43199a049da5..63554e9f6e0c 100644 --- a/include/asm-generic/export.h +++ b/include/asm-generic/export.h @@ -70,7 +70,7 @@ KSYM(__kcrctab_\name): #include #define __EXPORT_SYMBOL(sym, val, sec) \ - __cond_export_sym(sym, val, sec, config_enabled(__KSYM_##sym)) + __cond_export_sym(sym, val, sec, __is_defined(__KSYM_##sym)) #define __cond_export_sym(sym, val, sec, conf) \ ___cond_export_sym(sym, val, sec, conf) #define ___cond_export_sym(sym, val, sec, enabled) \ diff --git a/include/linux/kconfig.h b/include/linux/kconfig.h index 15ec117ec537..8f2e059e4d45 100644 --- a/include/linux/kconfig.h +++ b/include/linux/kconfig.h @@ -31,7 +31,6 @@ * When CONFIG_BOOGER is not defined, we generate a (... 1, 0) pair, and when * the last step cherry picks the 2nd arg, we get a zero. */ -#define config_enabled(cfg) ___is_defined(cfg) #define __is_defined(x) ___is_defined(x) #define ___is_defined(val) ____is_defined(__ARG_PLACEHOLDER_##val) #define ____is_defined(arg1_or_junk) __take_second_arg(arg1_or_junk 1, 0) @@ -41,13 +40,13 @@ * otherwise. For boolean options, this is equivalent to * IS_ENABLED(CONFIG_FOO). */ -#define IS_BUILTIN(option) config_enabled(option) +#define IS_BUILTIN(option) __is_defined(option) /* * IS_MODULE(CONFIG_FOO) evaluates to 1 if CONFIG_FOO is set to 'm', 0 * otherwise. */ -#define IS_MODULE(option) config_enabled(option##_MODULE) +#define IS_MODULE(option) __is_defined(option##_MODULE) /* * IS_REACHABLE(CONFIG_FOO) evaluates to 1 if the currently compiled -- cgit v1.2.3 From 5aab90ce1ec449912a2ebc4d45e0c85dac29e9dd Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Wed, 26 Oct 2016 11:48:24 +0200 Subject: perf/powerpc: Don't call perf_event_disable() from atomic context The trinity syscall fuzzer triggered following WARN() on powerpc: WARNING: CPU: 9 PID: 2998 at arch/powerpc/kernel/hw_breakpoint.c:278 ... NIP [c00000000093aedc] .hw_breakpoint_handler+0x28c/0x2b0 LR [c00000000093aed8] .hw_breakpoint_handler+0x288/0x2b0 Call Trace: [c0000002f7933580] [c00000000093aed8] .hw_breakpoint_handler+0x288/0x2b0 (unreliable) [c0000002f7933630] [c0000000000f671c] .notifier_call_chain+0x7c/0xf0 [c0000002f79336d0] [c0000000000f6abc] .__atomic_notifier_call_chain+0xbc/0x1c0 [c0000002f7933780] [c0000000000f6c40] .notify_die+0x70/0xd0 [c0000002f7933820] [c00000000001a74c] .do_break+0x4c/0x100 [c0000002f7933920] [c0000000000089fc] handle_dabr_fault+0x14/0x48 Followed by a lockdep warning: =============================== [ INFO: suspicious RCU usage. ] 4.8.0-rc5+ #7 Tainted: G W ------------------------------- ./include/linux/rcupdate.h:556 Illegal context switch in RCU read-side critical section! other info that might help us debug this: rcu_scheduler_active = 1, debug_locks = 0 2 locks held by ls/2998: #0: (rcu_read_lock){......}, at: [] .__atomic_notifier_call_chain+0x0/0x1c0 #1: (rcu_read_lock){......}, at: [] .hw_breakpoint_handler+0x0/0x2b0 stack backtrace: CPU: 9 PID: 2998 Comm: ls Tainted: G W 4.8.0-rc5+ #7 Call Trace: [c0000002f7933150] [c00000000094b1f8] .dump_stack+0xe0/0x14c (unreliable) [c0000002f79331e0] [c00000000013c468] .lockdep_rcu_suspicious+0x138/0x180 [c0000002f7933270] [c0000000001005d8] .___might_sleep+0x278/0x2e0 [c0000002f7933300] [c000000000935584] .mutex_lock_nested+0x64/0x5a0 [c0000002f7933410] [c00000000023084c] .perf_event_ctx_lock_nested+0x16c/0x380 [c0000002f7933500] [c000000000230a80] .perf_event_disable+0x20/0x60 [c0000002f7933580] [c00000000093aeec] .hw_breakpoint_handler+0x29c/0x2b0 [c0000002f7933630] [c0000000000f671c] .notifier_call_chain+0x7c/0xf0 [c0000002f79336d0] [c0000000000f6abc] .__atomic_notifier_call_chain+0xbc/0x1c0 [c0000002f7933780] [c0000000000f6c40] .notify_die+0x70/0xd0 [c0000002f7933820] [c00000000001a74c] .do_break+0x4c/0x100 [c0000002f7933920] [c0000000000089fc] handle_dabr_fault+0x14/0x48 While it looks like the first WARN() is probably valid, the other one is triggered by disabling event via perf_event_disable() from atomic context. The event is disabled here in case we were not able to emulate the instruction that hit the breakpoint. By disabling the event we unschedule the event and make sure it's not scheduled back. But we can't call perf_event_disable() from atomic context, instead we need to use the event's pending_disable irq_work method to disable it. Reported-by: Jan Stancek Signed-off-by: Jiri Olsa Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Huang Ying Cc: Jiri Olsa Cc: Linus Torvalds Cc: Michael Neuling Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20161026094824.GA21397@krava Signed-off-by: Ingo Molnar --- arch/powerpc/kernel/hw_breakpoint.c | 2 +- include/linux/perf_event.h | 1 + kernel/events/core.c | 10 ++++++++-- 3 files changed, 10 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/arch/powerpc/kernel/hw_breakpoint.c b/arch/powerpc/kernel/hw_breakpoint.c index 9781c69eae57..03d089b3ed72 100644 --- a/arch/powerpc/kernel/hw_breakpoint.c +++ b/arch/powerpc/kernel/hw_breakpoint.c @@ -275,7 +275,7 @@ int hw_breakpoint_handler(struct die_args *args) if (!stepped) { WARN(1, "Unable to handle hardware breakpoint. Breakpoint at " "0x%lx will be disabled.", info->address); - perf_event_disable(bp); + perf_event_disable_inatomic(bp); goto out; } /* diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 060d0ede88df..4741ecdb9817 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1257,6 +1257,7 @@ extern u64 perf_swevent_set_period(struct perf_event *event); extern void perf_event_enable(struct perf_event *event); extern void perf_event_disable(struct perf_event *event); extern void perf_event_disable_local(struct perf_event *event); +extern void perf_event_disable_inatomic(struct perf_event *event); extern void perf_event_task_tick(void); #else /* !CONFIG_PERF_EVENTS: */ static inline void * diff --git a/kernel/events/core.c b/kernel/events/core.c index a5d2e62faf7e..0e292132efac 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -1960,6 +1960,12 @@ void perf_event_disable(struct perf_event *event) } EXPORT_SYMBOL_GPL(perf_event_disable); +void perf_event_disable_inatomic(struct perf_event *event) +{ + event->pending_disable = 1; + irq_work_queue(&event->pending); +} + static void perf_set_shadow_time(struct perf_event *event, struct perf_event_context *ctx, u64 tstamp) @@ -7075,8 +7081,8 @@ static int __perf_event_overflow(struct perf_event *event, if (events && atomic_dec_and_test(&event->event_limit)) { ret = 1; event->pending_kill = POLL_HUP; - event->pending_disable = 1; - irq_work_queue(&event->pending); + + perf_event_disable_inatomic(event); } READ_ONCE(event->overflow_handler)(event, data, regs); -- cgit v1.2.3