diff options
82 files changed, 935 insertions, 769 deletions
@@ -9,6 +9,10 @@ Linus ---------- +M: Matt Mackal +E: mpm@selenic.com +D: SLOB slab allocator + N: Matti Aarnio E: mea@nic.funet.fi D: Alpha systems hacking, IPv6 and other network related stuff diff --git a/Documentation/accounting/getdelays.c b/Documentation/accounting/getdelays.c index c6a06b71594d..f40578026a04 100644 --- a/Documentation/accounting/getdelays.c +++ b/Documentation/accounting/getdelays.c @@ -314,6 +314,7 @@ int main(int argc, char *argv[]) break; case 'm': strncpy(cpumask, optarg, sizeof(cpumask)); + cpumask[sizeof(cpumask) - 1] = '\0'; maskset = 1; printf("cpumask %s maskset %d\n", cpumask, maskset); break; diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 884904975d0b..c1b9aa8c5a52 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -3130,6 +3130,11 @@ bytes respectively. Such letter suffixes can also be entirely omitted. [KNL] Should the soft-lockup detector generate panics. Format: <integer> + softlockup_all_cpu_backtrace= + [KNL] Should the soft-lockup detector generate + backtraces on all cpus. + Format: <integer> + sonypi.*= [HW] Sony Programmable I/O Control Device driver See Documentation/laptops/sonypi.txt diff --git a/Documentation/memory-hotplug.txt b/Documentation/memory-hotplug.txt index f304edb8fbe7..45134dc23854 100644 --- a/Documentation/memory-hotplug.txt +++ b/Documentation/memory-hotplug.txt @@ -209,15 +209,12 @@ If memory device is found, memory hotplug code will be called. 4.2 Notify memory hot-add event by hand ------------ -On powerpc, the firmware does not notify a memory hotplug event to the kernel. -Therefore, "probe" interface is supported to notify the event to the kernel. -This interface depends on CONFIG_ARCH_MEMORY_PROBE. - -CONFIG_ARCH_MEMORY_PROBE is supported on powerpc only. On x86, this config -option is disabled by default since ACPI notifies a memory hotplug event to -the kernel, which performs its hotplug operation as the result. Please -enable this option if you need the "probe" interface for testing purposes -on x86. +On some architectures, the firmware may not notify the kernel of a memory +hotplug event. Therefore, the memory "probe" interface is supported to +explicitly notify the kernel. This interface depends on +CONFIG_ARCH_MEMORY_PROBE and can be configured on powerpc, sh, and x86 +if hotplug is supported, although for x86 this should be handled by ACPI +notification. Probe interface is located at /sys/devices/system/memory/probe diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt index 708bb7f1b7e0..c14374e71775 100644 --- a/Documentation/sysctl/kernel.txt +++ b/Documentation/sysctl/kernel.txt @@ -75,6 +75,7 @@ show up in /proc/sys/kernel: - shmall - shmmax [ sysv ipc ] - shmmni +- softlockup_all_cpu_backtrace - stop-a [ SPARC only ] - sysrq ==> Documentation/sysrq.txt - sysctl_writes_strict @@ -783,6 +784,22 @@ via the /proc/sys interface: ============================================================== +softlockup_all_cpu_backtrace: + +This value controls the soft lockup detector thread's behavior +when a soft lockup condition is detected as to whether or not +to gather further debug information. If enabled, each cpu will +be issued an NMI and instructed to capture stack trace. + +This feature is only applicable for architectures which support +NMI. + +0: do nothing. This is the default behavior. + +1: on detection capture more debug information. + +============================================================== + tainted: Non-zero if the kernel has been tainted. Numeric values, which diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt index bd4b34c03738..4415aa915681 100644 --- a/Documentation/sysctl/vm.txt +++ b/Documentation/sysctl/vm.txt @@ -702,7 +702,8 @@ The batch value of each per cpu pagelist is also updated as a result. It is set to pcp->high/4. The upper limit of batch is (PAGE_SHIFT * 8) The initial value is zero. Kernel does not use this value at boot time to set -the high water marks for each per cpu page list. +the high water marks for each per cpu page list. If the user writes '0' to this +sysctl, it will revert to this default behavior. ============================================================== diff --git a/MAINTAINERS b/MAINTAINERS index 3f2e171047b9..3cc94fff780f 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -8196,13 +8196,15 @@ S: Maintained F: drivers/usb/misc/sisusbvga/ SLAB ALLOCATOR -M: Christoph Lameter <cl@linux-foundation.org> +M: Christoph Lameter <cl@linux.com> M: Pekka Enberg <penberg@kernel.org> -M: Matt Mackall <mpm@selenic.com> +M: David Rientjes <rientjes@google.com> +M: Joonsoo Kim <iamjoonsoo.kim@lge.com> +M: Andrew Morton <akpm@linux-foundation.org> L: linux-mm@kvack.org S: Maintained F: include/linux/sl?b*.h -F: mm/sl?b.c +F: mm/sl?b* SLEEPABLE READ-COPY UPDATE (SRCU) M: Lai Jiangshan <laijs@cn.fujitsu.com> diff --git a/arch/arm/common/scoop.c b/arch/arm/common/scoop.c index 6ef146edd0cd..a20fa80776d3 100644 --- a/arch/arm/common/scoop.c +++ b/arch/arm/common/scoop.c @@ -182,7 +182,6 @@ static int scoop_probe(struct platform_device *pdev) struct scoop_config *inf; struct resource *mem = platform_get_resource(pdev, IORESOURCE_MEM, 0); int ret; - int temp; if (!mem) return -EINVAL; diff --git a/arch/arm/include/asm/thread_info.h b/arch/arm/include/asm/thread_info.h index f989d7c22dc5..e4e4208a9130 100644 --- a/arch/arm/include/asm/thread_info.h +++ b/arch/arm/include/asm/thread_info.h @@ -114,8 +114,14 @@ static inline struct thread_info *current_thread_info(void) ((unsigned long)(task_thread_info(tsk)->cpu_context.pc)) #define thread_saved_sp(tsk) \ ((unsigned long)(task_thread_info(tsk)->cpu_context.sp)) + +#ifndef CONFIG_THUMB2_KERNEL #define thread_saved_fp(tsk) \ ((unsigned long)(task_thread_info(tsk)->cpu_context.fp)) +#else +#define thread_saved_fp(tsk) \ + ((unsigned long)(task_thread_info(tsk)->cpu_context.r7)) +#endif extern void crunch_task_disable(struct thread_info *); extern void crunch_task_copy(struct thread_info *, void *); diff --git a/arch/arm/kernel/perf_event_v7.c b/arch/arm/kernel/perf_event_v7.c index 2037f7205987..1d37568c547a 100644 --- a/arch/arm/kernel/perf_event_v7.c +++ b/arch/arm/kernel/perf_event_v7.c @@ -1924,7 +1924,7 @@ static int krait_pmu_get_event_idx(struct pmu_hw_events *cpuc, struct perf_event *event) { int idx; - int bit; + int bit = -1; unsigned int prefix; unsigned int region; unsigned int code; @@ -1953,7 +1953,7 @@ static int krait_pmu_get_event_idx(struct pmu_hw_events *cpuc, } idx = armv7pmu_get_event_idx(cpuc, event); - if (idx < 0 && krait_event) + if (idx < 0 && bit >= 0) clear_bit(bit, cpuc->used_mask); return idx; diff --git a/arch/arm/mach-imx/Kconfig b/arch/arm/mach-imx/Kconfig index 28fa2fa49e5d..4b5185748f74 100644 --- a/arch/arm/mach-imx/Kconfig +++ b/arch/arm/mach-imx/Kconfig @@ -734,9 +734,9 @@ config SOC_IMX6 select HAVE_IMX_MMDC select HAVE_IMX_SRC select MFD_SYSCON - select PL310_ERRATA_588369 if CACHE_PL310 - select PL310_ERRATA_727915 if CACHE_PL310 - select PL310_ERRATA_769419 if CACHE_PL310 + select PL310_ERRATA_588369 if CACHE_L2X0 + select PL310_ERRATA_727915 if CACHE_L2X0 + select PL310_ERRATA_769419 if CACHE_L2X0 config SOC_IMX6Q bool "i.MX6 Quad/DualLite support" @@ -771,9 +771,9 @@ config SOC_VF610 select ARM_GIC select PINCTRL_VF610 select VF_PIT_TIMER - select PL310_ERRATA_588369 if CACHE_PL310 - select PL310_ERRATA_727915 if CACHE_PL310 - select PL310_ERRATA_769419 if CACHE_PL310 + select PL310_ERRATA_588369 if CACHE_L2X0 + select PL310_ERRATA_727915 if CACHE_L2X0 + select PL310_ERRATA_769419 if CACHE_L2X0 help This enable support for Freescale Vybrid VF610 processor. diff --git a/arch/arm/mach-omap2/Kconfig b/arch/arm/mach-omap2/Kconfig index 062505345c95..1c1ed737f7ab 100644 --- a/arch/arm/mach-omap2/Kconfig +++ b/arch/arm/mach-omap2/Kconfig @@ -34,8 +34,8 @@ config ARCH_OMAP4 select HAVE_ARM_SCU if SMP select HAVE_ARM_TWD if SMP select OMAP_INTERCONNECT - select PL310_ERRATA_588369 - select PL310_ERRATA_727915 + select PL310_ERRATA_588369 if CACHE_L2X0 + select PL310_ERRATA_727915 if CACHE_L2X0 select PM_OPP if PM select PM_RUNTIME if CPU_IDLE select ARM_ERRATA_754322 diff --git a/arch/arm/mach-sti/Kconfig b/arch/arm/mach-sti/Kconfig index 7e33e9d2c42e..878e9ec97d0f 100644 --- a/arch/arm/mach-sti/Kconfig +++ b/arch/arm/mach-sti/Kconfig @@ -11,8 +11,8 @@ menuconfig ARCH_STI select ARM_ERRATA_754322 select ARM_ERRATA_764369 if SMP select ARM_ERRATA_775420 - select PL310_ERRATA_753970 if CACHE_PL310 - select PL310_ERRATA_769419 if CACHE_PL310 + select PL310_ERRATA_753970 if CACHE_L2X0 + select PL310_ERRATA_769419 if CACHE_L2X0 help Include support for STiH41x SOCs like STiH415/416 using the device tree for discovery diff --git a/arch/arm/mach-ux500/Kconfig b/arch/arm/mach-ux500/Kconfig index 5be7c4583a93..699e8601dbf0 100644 --- a/arch/arm/mach-ux500/Kconfig +++ b/arch/arm/mach-ux500/Kconfig @@ -15,7 +15,7 @@ menuconfig ARCH_U8500 select PINCTRL select PINCTRL_ABX500 select PINCTRL_NOMADIK - select PL310_ERRATA_753970 if CACHE_PL310 + select PL310_ERRATA_753970 if CACHE_L2X0 help Support for ST-Ericsson's Ux500 architecture diff --git a/arch/arm/mach-vexpress/Kconfig b/arch/arm/mach-vexpress/Kconfig index 99c1f151c403..d8b9330f896a 100644 --- a/arch/arm/mach-vexpress/Kconfig +++ b/arch/arm/mach-vexpress/Kconfig @@ -43,7 +43,7 @@ config ARCH_VEXPRESS_CORTEX_A5_A9_ERRATA bool "Enable A5 and A9 only errata work-arounds" default y select ARM_ERRATA_720789 - select PL310_ERRATA_753970 if CACHE_PL310 + select PL310_ERRATA_753970 if CACHE_L2X0 help Provides common dependencies for Versatile Express platforms based on Cortex-A5 and Cortex-A9 processors. In order to diff --git a/arch/arm/mm/Kconfig b/arch/arm/mm/Kconfig index eda0dd0ab97b..c348eaee7ee2 100644 --- a/arch/arm/mm/Kconfig +++ b/arch/arm/mm/Kconfig @@ -889,9 +889,10 @@ config CACHE_L2X0 help This option enables the L2x0 PrimeCell. +if CACHE_L2X0 + config CACHE_PL310 bool - depends on CACHE_L2X0 default y if CPU_V7 && !(CPU_V6 || CPU_V6K) help This option enables optimisations for the PL310 cache @@ -899,7 +900,6 @@ config CACHE_PL310 config PL310_ERRATA_588369 bool "PL310 errata: Clean & Invalidate maintenance operations do not invalidate clean lines" - depends on CACHE_L2X0 help The PL310 L2 cache controller implements three types of Clean & Invalidate maintenance operations: by Physical Address @@ -912,7 +912,6 @@ config PL310_ERRATA_588369 config PL310_ERRATA_727915 bool "PL310 errata: Background Clean & Invalidate by Way operation can cause data corruption" - depends on CACHE_L2X0 help PL310 implements the Clean & Invalidate by Way L2 cache maintenance operation (offset 0x7FC). This operation runs in background so that @@ -923,7 +922,6 @@ config PL310_ERRATA_727915 config PL310_ERRATA_753970 bool "PL310 errata: cache sync operation may be faulty" - depends on CACHE_PL310 help This option enables the workaround for the 753970 PL310 (r3p0) erratum. @@ -938,7 +936,6 @@ config PL310_ERRATA_753970 config PL310_ERRATA_769419 bool "PL310 errata: no automatic Store Buffer drain" - depends on CACHE_L2X0 help On revisions of the PL310 prior to r3p2, the Store Buffer does not automatically drain. This can cause normal, non-cacheable @@ -948,6 +945,8 @@ config PL310_ERRATA_769419 on systems with an outer cache, the store buffer is drained explicitly. +endif + config CACHE_TAUROS2 bool "Enable the Tauros2 L2 cache controller" depends on (ARCH_DOVE || ARCH_MMP || CPU_PJ4) diff --git a/arch/arm/mm/proc-arm925.S b/arch/arm/mm/proc-arm925.S index 97448c3acf38..ba0d58e1a2a2 100644 --- a/arch/arm/mm/proc-arm925.S +++ b/arch/arm/mm/proc-arm925.S @@ -502,6 +502,7 @@ __\name\()_proc_info: .long \cpu_val .long \cpu_mask .long PMD_TYPE_SECT | \ + PMD_SECT_CACHEABLE | \ PMD_BIT4 | \ PMD_SECT_AP_WRITE | \ PMD_SECT_AP_READ diff --git a/arch/ia64/include/uapi/asm/fcntl.h b/arch/ia64/include/uapi/asm/fcntl.h index 1dd275dc8f65..7b485876cad4 100644 --- a/arch/ia64/include/uapi/asm/fcntl.h +++ b/arch/ia64/include/uapi/asm/fcntl.h @@ -8,6 +8,7 @@ #define force_o_largefile() \ (personality(current->personality) != PER_LINUX32) +#include <linux/personality.h> #include <asm-generic/fcntl.h> #endif /* _ASM_IA64_FCNTL_H */ diff --git a/arch/powerpc/Kconfig.debug b/arch/powerpc/Kconfig.debug index 790352f93700..35d16bd2760b 100644 --- a/arch/powerpc/Kconfig.debug +++ b/arch/powerpc/Kconfig.debug @@ -303,7 +303,6 @@ config PPC_EARLY_DEBUG_OPAL_VTERMNO This correspond to which /dev/hvcN you want to use for early debug. - On OPAL v1 (takeover) this should always be 0 On OPAL v2, this will be 0 for network console and 1 or 2 for the machine built-in serial ports. diff --git a/arch/powerpc/include/asm/code-patching.h b/arch/powerpc/include/asm/code-patching.h index 37991e154ef8..840a5509b3f1 100644 --- a/arch/powerpc/include/asm/code-patching.h +++ b/arch/powerpc/include/asm/code-patching.h @@ -88,4 +88,15 @@ static inline unsigned long ppc_function_entry(void *func) #endif } +static inline unsigned long ppc_global_function_entry(void *func) +{ +#if defined(CONFIG_PPC64) && defined(_CALL_ELF) && _CALL_ELF == 2 + /* PPC64 ABIv2 the global entry point is at the address */ + return (unsigned long)func; +#else + /* All other cases there is no change vs ppc_function_entry() */ + return ppc_function_entry(func); +#endif +} + #endif /* _ASM_POWERPC_CODE_PATCHING_H */ diff --git a/arch/powerpc/include/asm/opal.h b/arch/powerpc/include/asm/opal.h index 460018889ba9..0da1dbd42e02 100644 --- a/arch/powerpc/include/asm/opal.h +++ b/arch/powerpc/include/asm/opal.h @@ -12,27 +12,7 @@ #ifndef __OPAL_H #define __OPAL_H -/****** Takeover interface ********/ - -/* PAPR H-Call used to querty the HAL existence and/or instanciate - * it from within pHyp (tech preview only). - * - * This is exclusively used in prom_init.c - */ - #ifndef __ASSEMBLY__ - -struct opal_takeover_args { - u64 k_image; /* r4 */ - u64 k_size; /* r5 */ - u64 k_entry; /* r6 */ - u64 k_entry2; /* r7 */ - u64 hal_addr; /* r8 */ - u64 rd_image; /* r9 */ - u64 rd_size; /* r10 */ - u64 rd_loc; /* r11 */ -}; - /* * SG entry * @@ -55,15 +35,6 @@ struct opal_sg_list { /* We calculate number of sg entries based on PAGE_SIZE */ #define SG_ENTRIES_PER_NODE ((PAGE_SIZE - 16) / sizeof(struct opal_sg_entry)) -extern long opal_query_takeover(u64 *hal_size, u64 *hal_align); - -extern long opal_do_takeover(struct opal_takeover_args *args); - -struct rtas_args; -extern int opal_enter_rtas(struct rtas_args *args, - unsigned long data, - unsigned long entry); - #endif /* __ASSEMBLY__ */ /****** OPAL APIs ******/ diff --git a/arch/powerpc/include/asm/swab.h b/arch/powerpc/include/asm/swab.h index b9bd1ca944d0..96f59de61855 100644 --- a/arch/powerpc/include/asm/swab.h +++ b/arch/powerpc/include/asm/swab.h @@ -9,10 +9,6 @@ #include <uapi/asm/swab.h> -#ifdef __GNUC__ -#ifndef __powerpc64__ -#endif /* __powerpc64__ */ - static __inline__ __u16 ld_le16(const volatile __u16 *addr) { __u16 val; @@ -20,19 +16,12 @@ static __inline__ __u16 ld_le16(const volatile __u16 *addr) __asm__ __volatile__ ("lhbrx %0,0,%1" : "=r" (val) : "r" (addr), "m" (*addr)); return val; } -#define __arch_swab16p ld_le16 static __inline__ void st_le16(volatile __u16 *addr, const __u16 val) { __asm__ __volatile__ ("sthbrx %1,0,%2" : "=m" (*addr) : "r" (val), "r" (addr)); } -static inline void __arch_swab16s(__u16 *addr) -{ - st_le16(addr, *addr); -} -#define __arch_swab16s __arch_swab16s - static __inline__ __u32 ld_le32(const volatile __u32 *addr) { __u32 val; @@ -40,42 +29,10 @@ static __inline__ __u32 ld_le32(const volatile __u32 *addr) __asm__ __volatile__ ("lwbrx %0,0,%1" : "=r" (val) : "r" (addr), "m" (*addr)); return val; } -#define __arch_swab32p ld_le32 static __inline__ void st_le32(volatile __u32 *addr, const __u32 val) { __asm__ __volatile__ ("stwbrx %1,0,%2" : "=m" (*addr) : "r" (val), "r" (addr)); } -static inline void __arch_swab32s(__u32 *addr) -{ - st_le32(addr, *addr); -} -#define __arch_swab32s __arch_swab32s - -static inline __attribute_const__ __u16 __arch_swab16(__u16 value) -{ - __u16 result; - - __asm__("rlwimi %0,%1,8,16,23" - : "=r" (result) - : "r" (value), "0" (value >> 8)); - return result; -} -#define __arch_swab16 __arch_swab16 - -static inline __attribute_const__ __u32 __arch_swab32(__u32 value) -{ - __u32 result; - - __asm__("rlwimi %0,%1,24,16,23\n\t" - "rlwimi %0,%1,8,8,15\n\t" - "rlwimi %0,%1,24,0,7" - : "=r" (result) - : "r" (value), "0" (value >> 24)); - return result; -} -#define __arch_swab32 __arch_swab32 - -#endif /* __GNUC__ */ #endif /* _ASM_POWERPC_SWAB_H */ diff --git a/arch/powerpc/kernel/ftrace.c b/arch/powerpc/kernel/ftrace.c index f202d0731b06..d178834fe508 100644 --- a/arch/powerpc/kernel/ftrace.c +++ b/arch/powerpc/kernel/ftrace.c @@ -10,6 +10,8 @@ * */ +#define pr_fmt(fmt) "ftrace-powerpc: " fmt + #include <linux/spinlock.h> #include <linux/hardirq.h> #include <linux/uaccess.h> @@ -105,7 +107,7 @@ __ftrace_make_nop(struct module *mod, struct dyn_ftrace *rec, unsigned long addr) { unsigned int op; - unsigned long ptr; + unsigned long entry, ptr; unsigned long ip = rec->ip; void *tramp; @@ -115,7 +117,7 @@ __ftrace_make_nop(struct module *mod, /* Make sure that that this is still a 24bit jump */ if (!is_bl_op(op)) { - printk(KERN_ERR "Not expected bl: opcode is %x\n", op); + pr_err("Not expected bl: opcode is %x\n", op); return -EINVAL; } @@ -125,21 +127,21 @@ __ftrace_make_nop(struct module *mod, pr_devel("ip:%lx jumps to %p", ip, tramp); if (!is_module_trampoline(tramp)) { - printk(KERN_ERR "Not a trampoline\n"); + pr_err("Not a trampoline\n"); return -EINVAL; } if (module_trampoline_target(mod, tramp, &ptr)) { - printk(KERN_ERR "Failed to get trampoline target\n"); + pr_err("Failed to get trampoline target\n"); return -EFAULT; } pr_devel("trampoline target %lx", ptr); + entry = ppc_global_function_entry((void *)addr); /* This should match what was called */ - if (ptr != ppc_function_entry((void *)addr)) { - printk(KERN_ERR "addr %lx does not match expected %lx\n", - ptr, ppc_function_entry((void *)addr)); + if (ptr != entry) { + pr_err("addr %lx does not match expected %lx\n", ptr, entry); return -EINVAL; } @@ -179,7 +181,7 @@ __ftrace_make_nop(struct module *mod, /* Make sure that that this is still a 24bit jump */ if (!is_bl_op(op)) { - printk(KERN_ERR "Not expected bl: opcode is %x\n", op); + pr_err("Not expected bl: opcode is %x\n", op); return -EINVAL; } @@ -198,7 +200,7 @@ __ftrace_make_nop(struct module *mod, /* Find where the trampoline jumps to */ if (probe_kernel_read(jmp, (void *)tramp, sizeof(jmp))) { - printk(KERN_ERR "Failed to read %lx\n", tramp); + pr_err("Failed to read %lx\n", tramp); return -EFAULT; } @@ -209,7 +211,7 @@ __ftrace_make_nop(struct module *mod, ((jmp[1] & 0xffff0000) != 0x398c0000) || (jmp[2] != 0x7d8903a6) || (jmp[3] != 0x4e800420)) { - printk(KERN_ERR "Not a trampoline\n"); + pr_err("Not a trampoline\n"); return -EINVAL; } @@ -221,8 +223,7 @@ __ftrace_make_nop(struct module *mod, pr_devel(" %lx ", tramp); if (tramp != addr) { - printk(KERN_ERR - "Trampoline location %08lx does not match addr\n", + pr_err("Trampoline location %08lx does not match addr\n", tramp); return -EINVAL; } @@ -263,15 +264,13 @@ int ftrace_make_nop(struct module *mod, */ if (!rec->arch.mod) { if (!mod) { - printk(KERN_ERR "No module loaded addr=%lx\n", - addr); + pr_err("No module loaded addr=%lx\n", addr); return -EFAULT; } rec->arch.mod = mod; } else if (mod) { if (mod != rec->arch.mod) { - printk(KERN_ERR - "Record mod %p not equal to passed in mod %p\n", + pr_err("Record mod %p not equal to passed in mod %p\n", rec->arch.mod, mod); return -EINVAL; } @@ -307,26 +306,25 @@ __ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr) * The load offset is different depending on the ABI. For simplicity * just mask it out when doing the compare. */ - if ((op[0] != 0x48000008) || ((op[1] & 0xffff00000) != 0xe8410000)) { - printk(KERN_ERR "Unexpected call sequence: %x %x\n", - op[0], op[1]); + if ((op[0] != 0x48000008) || ((op[1] & 0xffff0000) != 0xe8410000)) { + pr_err("Unexpected call sequence: %x %x\n", op[0], op[1]); return -EINVAL; } /* If we never set up a trampoline to ftrace_caller, then bail */ if (!rec->arch.mod->arch.tramp) { - printk(KERN_ERR "No ftrace trampoline\n"); + pr_err("No ftrace trampoline\n"); return -EINVAL; } /* Ensure branch is within 24 bits */ - if (create_branch(ip, rec->arch.mod->arch.tramp, BRANCH_SET_LINK)) { - printk(KERN_ERR "Branch out of range"); + if (!create_branch(ip, rec->arch.mod->arch.tramp, BRANCH_SET_LINK)) { + pr_err("Branch out of range\n"); return -EINVAL; } if (patch_branch(ip, rec->arch.mod->arch.tramp, BRANCH_SET_LINK)) { - printk(KERN_ERR "REL24 out of range!\n"); + pr_err("REL24 out of range!\n"); return -EINVAL; } @@ -345,13 +343,13 @@ __ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr) /* It should be pointing to a nop */ if (op != PPC_INST_NOP) { - printk(KERN_ERR "Expected NOP but have %x\n", op); + pr_err("Expected NOP but have %x\n", op); return -EINVAL; } /* If we never set up a trampoline to ftrace_caller, then bail */ if (!rec->arch.mod->arch.tramp) { - printk(KERN_ERR "No ftrace trampoline\n"); + pr_err("No ftrace trampoline\n"); return -EINVAL; } @@ -359,7 +357,7 @@ __ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr) op = create_branch((unsigned int *)ip, rec->arch.mod->arch.tramp, BRANCH_SET_LINK); if (!op) { - printk(KERN_ERR "REL24 out of range!\n"); + pr_err("REL24 out of range!\n"); return -EINVAL; } @@ -397,7 +395,7 @@ int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr) * already have a module defined. */ if (!rec->arch.mod) { - printk(KERN_ERR "No module loaded\n"); + pr_err("No module loaded\n"); return -EINVAL; } diff --git a/arch/powerpc/kernel/iomap.c b/arch/powerpc/kernel/iomap.c index b82227e7e21b..12e48d56f771 100644 --- a/arch/powerpc/kernel/iomap.c +++ b/arch/powerpc/kernel/iomap.c @@ -23,7 +23,7 @@ unsigned int ioread16(void __iomem *addr) } unsigned int ioread16be(void __iomem *addr) { - return in_be16(addr); + return readw_be(addr); } unsigned int ioread32(void __iomem *addr) { @@ -31,7 +31,7 @@ unsigned int ioread32(void __iomem *addr) } unsigned int ioread32be(void __iomem *addr) { - return in_be32(addr); + return readl_be(addr); } EXPORT_SYMBOL(ioread8); EXPORT_SYMBOL(ioread16); @@ -49,7 +49,7 @@ void iowrite16(u16 val, void __iomem *addr) } void iowrite16be(u16 val, void __iomem *addr) { - out_be16(addr, val); + writew_be(val, addr); } void iowrite32(u32 val, void __iomem *addr) { @@ -57,7 +57,7 @@ void iowrite32(u32 val, void __iomem *addr) } void iowrite32be(u32 val, void __iomem *addr) { - out_be32(addr, val); + writel_be(val, addr); } EXPORT_SYMBOL(iowrite8); EXPORT_SYMBOL(iowrite16); @@ -75,15 +75,15 @@ EXPORT_SYMBOL(iowrite32be); */ void ioread8_rep(void __iomem *addr, void *dst, unsigned long count) { - _insb((u8 __iomem *) addr, dst, count); + readsb(addr, dst, count); } void ioread16_rep(void __iomem *addr, void *dst, unsigned long count) { - _insw_ns((u16 __iomem *) addr, dst, count); + readsw(addr, dst, count); } void ioread32_rep(void __iomem *addr, void *dst, unsigned long count) { - _insl_ns((u32 __iomem *) addr, dst, count); + readsl(addr, dst, count); } EXPORT_SYMBOL(ioread8_rep); EXPORT_SYMBOL(ioread16_rep); @@ -91,15 +91,15 @@ EXPORT_SYMBOL(ioread32_rep); void iowrite8_rep(void __iomem *addr, const void *src, unsigned long count) { - _outsb((u8 __iomem *) addr, src, count); + writesb(addr, src, count); } void iowrite16_rep(void __iomem *addr, const void *src, unsigned long count) { - _outsw_ns((u16 __iomem *) addr, src, count); + writesw(addr, src, count); } void iowrite32_rep(void __iomem *addr, const void *src, unsigned long count) { - _outsl_ns((u32 __iomem *) addr, src, count); + writesl(addr, src, count); } EXPORT_SYMBOL(iowrite8_rep); EXPORT_SYMBOL(iowrite16_rep); diff --git a/arch/powerpc/kernel/kprobes.c b/arch/powerpc/kernel/kprobes.c index 90fab64d911d..2f72af82513c 100644 --- a/arch/powerpc/kernel/kprobes.c +++ b/arch/powerpc/kernel/kprobes.c @@ -32,6 +32,7 @@ #include <linux/module.h> #include <linux/kdebug.h> #include <linux/slab.h> +#include <asm/code-patching.h> #include <asm/cacheflush.h> #include <asm/sstep.h> #include <asm/uaccess.h> @@ -491,12 +492,10 @@ int __kprobes kprobe_exceptions_notify(struct notifier_block *self, return ret; } -#ifdef CONFIG_PPC64 unsigned long arch_deref_entry_point(void *entry) { - return ((func_descr_t *)entry)->entry; + return ppc_global_function_entry(entry); } -#endif int __kprobes setjmp_pre_handler(struct kprobe *p, struct pt_regs *regs) { @@ -508,8 +507,12 @@ int __kprobes setjmp_pre_handler(struct kprobe *p, struct pt_regs *regs) /* setup return addr to the jprobe handler routine */ regs->nip = arch_deref_entry_point(jp->entry); #ifdef CONFIG_PPC64 +#if defined(_CALL_ELF) && _CALL_ELF == 2 + regs->gpr[12] = (unsigned long)jp->entry; +#else regs->gpr[2] = (unsigned long)(((func_descr_t *)jp->entry)->toc); #endif +#endif return 1; } diff --git a/arch/powerpc/kernel/module_64.c b/arch/powerpc/kernel/module_64.c index 077d2ce6c5a7..d807ee626af9 100644 --- a/arch/powerpc/kernel/module_64.c +++ b/arch/powerpc/kernel/module_64.c @@ -315,8 +315,17 @@ static void dedotify_versions(struct modversion_info *vers, struct modversion_info *end; for (end = (void *)vers + size; vers < end; vers++) - if (vers->name[0] == '.') + if (vers->name[0] == '.') { memmove(vers->name, vers->name+1, strlen(vers->name)); +#ifdef ARCH_RELOCATES_KCRCTAB + /* The TOC symbol has no CRC computed. To avoid CRC + * check failing, we must force it to the expected + * value (see CRC check in module.c). + */ + if (!strcmp(vers->name, "TOC.")) + vers->crc = -(unsigned long)reloc_start; +#endif + } } /* Undefined symbols which refer to .funcname, hack to funcname (or .TOC.) */ diff --git a/arch/powerpc/kernel/prom.c b/arch/powerpc/kernel/prom.c index 613a860a203c..b694b0730971 100644 --- a/arch/powerpc/kernel/prom.c +++ b/arch/powerpc/kernel/prom.c @@ -662,13 +662,6 @@ void __init early_init_devtree(void *params) of_scan_flat_dt(early_init_dt_scan_fw_dump, NULL); #endif - /* Pre-initialize the cmd_line with the content of boot_commmand_line, - * which will be empty except when the content of the variable has - * been overriden by a bootloading mechanism. This happens typically - * with HAL takeover - */ - strlcpy(cmd_line, boot_command_line, COMMAND_LINE_SIZE); - /* Retrieve various informations from the /chosen node of the * device-tree, including the platform type, initrd location and * size, TCE reserve, and more ... diff --git a/arch/powerpc/kernel/prom_init.c b/arch/powerpc/kernel/prom_init.c index 078145acf7fb..1a85d8f96739 100644 --- a/arch/powerpc/kernel/prom_init.c +++ b/arch/powerpc/kernel/prom_init.c @@ -1268,201 +1268,6 @@ static u64 __initdata prom_opal_base; static u64 __initdata prom_opal_entry; #endif -#ifdef __BIG_ENDIAN__ -/* XXX Don't change this structure without updating opal-takeover.S */ -static struct opal_secondary_data { - s64 ack; /* 0 */ - u64 go; /* 8 */ - struct opal_takeover_args args; /* 16 */ -} opal_secondary_data; - -static u64 __initdata prom_opal_align; -static u64 __initdata prom_opal_size; -static int __initdata prom_rtas_start_cpu; -static u64 __initdata prom_rtas_data; -static u64 __initdata prom_rtas_entry; - -extern char opal_secondary_entry; - -static void __init prom_query_opal(void) -{ - long rc; - - /* We must not query for OPAL presence on a machine that - * supports TNK takeover (970 blades), as this uses the same - * h-call with different arguments and will crash - */ - if (PHANDLE_VALID(call_prom("finddevice", 1, 1, - ADDR("/tnk-memory-map")))) { - prom_printf("TNK takeover detected, skipping OPAL check\n"); - return; - } - - prom_printf("Querying for OPAL presence... "); - - rc = opal_query_takeover(&prom_opal_size, - &prom_opal_align); - prom_debug("(rc = %ld) ", rc); - if (rc != 0) { - prom_printf("not there.\n"); - return; - } - of_platform = PLATFORM_OPAL; - prom_printf(" there !\n"); - prom_debug(" opal_size = 0x%lx\n", prom_opal_size); - prom_debug(" opal_align = 0x%lx\n", prom_opal_align); - if (prom_opal_align < 0x10000) - prom_opal_align = 0x10000; -} - -static int __init prom_rtas_call(int token, int nargs, int nret, - int *outputs, ...) -{ - struct rtas_args rtas_args; - va_list list; - int i; - - rtas_args.token = token; - rtas_args.nargs = nargs; - rtas_args.nret = nret; - rtas_args.rets = (rtas_arg_t *)&(rtas_args.args[nargs]); - va_start(list, outputs); - for (i = 0; i < nargs; ++i) - rtas_args.args[i] = va_arg(list, rtas_arg_t); - va_end(list); - - for (i = 0; i < nret; ++i) - rtas_args.rets[i] = 0; - - opal_enter_rtas(&rtas_args, prom_rtas_data, - prom_rtas_entry); - - if (nret > 1 && outputs != NULL) - for (i = 0; i < nret-1; ++i) - outputs[i] = rtas_args.rets[i+1]; - return (nret > 0)? rtas_args.rets[0]: 0; -} - -static void __init prom_opal_hold_cpus(void) -{ - int i, cnt, cpu, rc; - long j; - phandle node; - char type[64]; - u32 servers[8]; - void *entry = (unsigned long *)&opal_secondary_entry; - struct opal_secondary_data *data = &opal_secondary_data; - - prom_debug("prom_opal_hold_cpus: start...\n"); - prom_debug(" - entry = 0x%x\n", entry); - prom_debug(" - data = 0x%x\n", data); - - data->ack = -1; - data->go = 0; - - /* look for cpus */ - for (node = 0; prom_next_node(&node); ) { - type[0] = 0; - prom_getprop(node, "device_type", type, sizeof(type)); - if (strcmp(type, "cpu") != 0) - continue; - - /* Skip non-configured cpus. */ - if (prom_getprop(node, "status", type, sizeof(type)) > 0) - if (strcmp(type, "okay") != 0) - continue; - - cnt = prom_getprop(node, "ibm,ppc-interrupt-server#s", servers, - sizeof(servers)); - if (cnt == PROM_ERROR) - break; - cnt >>= 2; - for (i = 0; i < cnt; i++) { - cpu = servers[i]; - prom_debug("CPU %d ... ", cpu); - if (cpu == prom.cpu) { - prom_debug("booted !\n"); - continue; - } - prom_debug("starting ... "); - - /* Init the acknowledge var which will be reset by - * the secondary cpu when it awakens from its OF - * spinloop. - */ - data->ack = -1; - rc = prom_rtas_call(prom_rtas_start_cpu, 3, 1, - NULL, cpu, entry, data); - prom_debug("rtas rc=%d ...", rc); - - for (j = 0; j < 100000000 && data->ack == -1; j++) { - HMT_low(); - mb(); - } - HMT_medium(); - if (data->ack != -1) - prom_debug("done, PIR=0x%x\n", data->ack); - else - prom_debug("timeout !\n"); - } - } - prom_debug("prom_opal_hold_cpus: end...\n"); -} - -static void __init prom_opal_takeover(void) -{ - struct opal_secondary_data *data = &opal_secondary_data; - struct opal_takeover_args *args = &data->args; - u64 align = prom_opal_align; - u64 top_addr, opal_addr; - - args->k_image = (u64)_stext; - args->k_size = _end - _stext; - args->k_entry = 0; - args->k_entry2 = 0x60; - - top_addr = _ALIGN_UP(args->k_size, align); - - if (prom_initrd_start != 0) { - args->rd_image = prom_initrd_start; - args->rd_size = prom_initrd_end - args->rd_image; - args->rd_loc = top_addr; - top_addr = _ALIGN_UP(args->rd_loc + args->rd_size, align); - } - - /* Pickup an address for the HAL. We want to go really high - * up to avoid problem with future kexecs. On the other hand - * we don't want to be all over the TCEs on P5IOC2 machines - * which are going to be up there too. We assume the machine - * has plenty of memory, and we ask for the HAL for now to - * be just below the 1G point, or above the initrd - */ - opal_addr = _ALIGN_DOWN(0x40000000 - prom_opal_size, align); - if (opal_addr < top_addr) - opal_addr = top_addr; - args->hal_addr = opal_addr; - - /* Copy the command line to the kernel image */ - strlcpy(boot_command_line, prom_cmd_line, - COMMAND_LINE_SIZE); - - prom_debug(" k_image = 0x%lx\n", args->k_image); - prom_debug(" k_size = 0x%lx\n", args->k_size); - prom_debug(" k_entry = 0x%lx\n", args->k_entry); - prom_debug(" k_entry2 = 0x%lx\n", args->k_entry2); - prom_debug(" hal_addr = 0x%lx\n", args->hal_addr); - prom_debug(" rd_image = 0x%lx\n", args->rd_image); - prom_debug(" rd_size = 0x%lx\n", args->rd_size); - prom_debug(" rd_loc = 0x%lx\n", args->rd_loc); - prom_printf("Performing OPAL takeover,this can take a few minutes..\n"); - prom_close_stdin(); - mb(); - data->go = 1; - for (;;) - opal_do_takeover(args); -} -#endif /* __BIG_ENDIAN__ */ - /* * Allocate room for and instantiate OPAL */ @@ -1597,12 +1402,6 @@ static void __init prom_instantiate_rtas(void) &val, sizeof(val)) != PROM_ERROR) rtas_has_query_cpu_stopped = true; -#if defined(CONFIG_PPC_POWERNV) && defined(__BIG_ENDIAN__) - /* PowerVN takeover hack */ - prom_rtas_data = base; - prom_rtas_entry = entry; - prom_getprop(rtas_node, "start-cpu", &prom_rtas_start_cpu, 4); -#endif prom_debug("rtas base = 0x%x\n", base); prom_debug("rtas entry = 0x%x\n", entry); prom_debug("rtas size = 0x%x\n", (long)size); @@ -3027,16 +2826,6 @@ unsigned long __init prom_init(unsigned long r3, unsigned long r4, prom_instantiate_rtas(); #ifdef CONFIG_PPC_POWERNV -#ifdef __BIG_ENDIAN__ - /* Detect HAL and try instanciating it & doing takeover */ - if (of_platform == PLATFORM_PSERIES_LPAR) { - prom_query_opal(); - if (of_platform == PLATFORM_OPAL) { - prom_opal_hold_cpus(); - prom_opal_takeover(); - } - } else -#endif /* __BIG_ENDIAN__ */ if (of_platform == PLATFORM_OPAL) prom_instantiate_opal(); #endif /* CONFIG_PPC_POWERNV */ diff --git a/arch/powerpc/kernel/prom_init_check.sh b/arch/powerpc/kernel/prom_init_check.sh index 77aa1e95e904..fe8e54b9ef7d 100644 --- a/arch/powerpc/kernel/prom_init_check.sh +++ b/arch/powerpc/kernel/prom_init_check.sh @@ -21,9 +21,7 @@ _end enter_prom memcpy memset reloc_offset __secondary_hold __secondary_hold_acknowledge __secondary_hold_spinloop __start strcmp strcpy strlcpy strlen strncmp strstr logo_linux_clut224 reloc_got2 kernstart_addr memstart_addr linux_banner _stext -opal_query_takeover opal_do_takeover opal_enter_rtas opal_secondary_entry -boot_command_line __prom_init_toc_start __prom_init_toc_end -btext_setup_display TOC." +__prom_init_toc_start __prom_init_toc_end btext_setup_display TOC." NM="$1" OBJ="$2" diff --git a/arch/powerpc/kernel/setup-common.c b/arch/powerpc/kernel/setup-common.c index e239df3768ac..e5b022c55ccd 100644 --- a/arch/powerpc/kernel/setup-common.c +++ b/arch/powerpc/kernel/setup-common.c @@ -469,9 +469,17 @@ void __init smp_setup_cpu_maps(void) } for (j = 0; j < nthreads && cpu < nr_cpu_ids; j++) { + bool avail; + DBG(" thread %d -> cpu %d (hard id %d)\n", j, cpu, be32_to_cpu(intserv[j])); - set_cpu_present(cpu, of_device_is_available(dn)); + + avail = of_device_is_available(dn); + if (!avail) + avail = !of_property_match_string(dn, + "enable-method", "spin-table"); + + set_cpu_present(cpu, avail); set_hard_smp_processor_id(cpu, be32_to_cpu(intserv[j])); set_cpu_possible(cpu, true); cpu++; diff --git a/arch/powerpc/kernel/signal_32.c b/arch/powerpc/kernel/signal_32.c index 4e47db686b5d..1bc5a1755ed4 100644 --- a/arch/powerpc/kernel/signal_32.c +++ b/arch/powerpc/kernel/signal_32.c @@ -54,7 +54,6 @@ #include "signal.h" -#undef DEBUG_SIG #ifdef CONFIG_PPC64 #define sys_rt_sigreturn compat_sys_rt_sigreturn @@ -1063,10 +1062,6 @@ int handle_rt_signal32(unsigned long sig, struct k_sigaction *ka, return 1; badframe: -#ifdef DEBUG_SIG - printk("badframe in handle_rt_signal, regs=%p frame=%p newsp=%lx\n", - regs, frame, newsp); -#endif if (show_unhandled_signals) printk_ratelimited(KERN_INFO "%s[%d]: bad frame in handle_rt_signal32: " @@ -1484,10 +1479,6 @@ int handle_signal32(unsigned long sig, struct k_sigaction *ka, return 1; badframe: -#ifdef DEBUG_SIG - printk("badframe in handle_signal, regs=%p frame=%p newsp=%lx\n", - regs, frame, newsp); -#endif if (show_unhandled_signals) printk_ratelimited(KERN_INFO "%s[%d]: bad frame in handle_signal32: " diff --git a/arch/powerpc/kernel/signal_64.c b/arch/powerpc/kernel/signal_64.c index d501dc4dc3e6..97c1e4b683fc 100644 --- a/arch/powerpc/kernel/signal_64.c +++ b/arch/powerpc/kernel/signal_64.c @@ -38,7 +38,6 @@ #include "signal.h" -#define DEBUG_SIG 0 #define GP_REGS_SIZE min(sizeof(elf_gregset_t), sizeof(struct pt_regs)) #define FP_REGS_SIZE sizeof(elf_fpregset_t) @@ -700,10 +699,6 @@ int sys_rt_sigreturn(unsigned long r3, unsigned long r4, unsigned long r5, return 0; badframe: -#if DEBUG_SIG - printk("badframe in sys_rt_sigreturn, regs=%p uc=%p &uc->uc_mcontext=%p\n", - regs, uc, &uc->uc_mcontext); -#endif if (show_unhandled_signals) printk_ratelimited(regs->msr & MSR_64BIT ? fmt64 : fmt32, current->comm, current->pid, "rt_sigreturn", @@ -809,10 +804,6 @@ int handle_rt_signal64(int signr, struct k_sigaction *ka, siginfo_t *info, return 1; badframe: -#if DEBUG_SIG - printk("badframe in setup_rt_frame, regs=%p frame=%p newsp=%lx\n", - regs, frame, newsp); -#endif if (show_unhandled_signals) printk_ratelimited(regs->msr & MSR_64BIT ? fmt64 : fmt32, current->comm, current->pid, "setup_rt_frame", diff --git a/arch/powerpc/platforms/cell/cbe_thermal.c b/arch/powerpc/platforms/cell/cbe_thermal.c index 94560db788bf..2c15ff094483 100644 --- a/arch/powerpc/platforms/cell/cbe_thermal.c +++ b/arch/powerpc/platforms/cell/cbe_thermal.c @@ -125,7 +125,7 @@ static ssize_t show_throttle(struct cbe_pmd_regs __iomem *pmd_regs, char *buf, i static ssize_t store_throttle(struct cbe_pmd_regs __iomem *pmd_regs, const char *buf, size_t size, int pos) { u64 reg_value; - int temp; + unsigned int temp; u64 new_value; int ret; diff --git a/arch/powerpc/platforms/powernv/Makefile b/arch/powerpc/platforms/powernv/Makefile index d55891f89a2c..4ad227d04c1a 100644 --- a/arch/powerpc/platforms/powernv/Makefile +++ b/arch/powerpc/platforms/powernv/Makefile @@ -1,4 +1,4 @@ -obj-y += setup.o opal-takeover.o opal-wrappers.o opal.o opal-async.o +obj-y += setup.o opal-wrappers.o opal.o opal-async.o obj-y += opal-rtc.o opal-nvram.o opal-lpc.o opal-flash.o obj-y += rng.o opal-elog.o opal-dump.o opal-sysparam.o opal-sensor.o obj-y += opal-msglog.o diff --git a/arch/powerpc/platforms/powernv/opal-takeover.S b/arch/powerpc/platforms/powernv/opal-takeover.S deleted file mode 100644 index 11a3169ee583..000000000000 --- a/arch/powerpc/platforms/powernv/opal-takeover.S +++ /dev/null @@ -1,140 +0,0 @@ -/* - * PowerNV OPAL takeover assembly code, for use by prom_init.c - * - * Copyright 2011 IBM Corp. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - */ - -#include <asm/ppc_asm.h> -#include <asm/hvcall.h> -#include <asm/asm-offsets.h> -#include <asm/opal.h> - -#define H_HAL_TAKEOVER 0x5124 -#define H_HAL_TAKEOVER_QUERY_MAGIC -1 - - .text -_GLOBAL(opal_query_takeover) - mfcr r0 - stw r0,8(r1) - stdu r1,-STACKFRAMESIZE(r1) - std r3,STK_PARAM(R3)(r1) - std r4,STK_PARAM(R4)(r1) - li r3,H_HAL_TAKEOVER - li r4,H_HAL_TAKEOVER_QUERY_MAGIC - HVSC - addi r1,r1,STACKFRAMESIZE - ld r10,STK_PARAM(R3)(r1) - std r4,0(r10) - ld r10,STK_PARAM(R4)(r1) - std r5,0(r10) - lwz r0,8(r1) - mtcrf 0xff,r0 - blr - -_GLOBAL(opal_do_takeover) - mfcr r0 - stw r0,8(r1) - mflr r0 - std r0,16(r1) - bl __opal_do_takeover - ld r0,16(r1) - mtlr r0 - lwz r0,8(r1) - mtcrf 0xff,r0 - blr - -__opal_do_takeover: - ld r4,0(r3) - ld r5,0x8(r3) - ld r6,0x10(r3) - ld r7,0x18(r3) - ld r8,0x20(r3) - ld r9,0x28(r3) - ld r10,0x30(r3) - ld r11,0x38(r3) - li r3,H_HAL_TAKEOVER - HVSC - blr - - .globl opal_secondary_entry -opal_secondary_entry: - mr r31,r3 - mfmsr r11 - li r12,(MSR_SF | MSR_ISF)@highest - sldi r12,r12,48 - or r11,r11,r12 - mtmsrd r11 - isync - mfspr r4,SPRN_PIR - std r4,0(r3) -1: HMT_LOW - ld r4,8(r3) - cmpli cr0,r4,0 - beq 1b - HMT_MEDIUM -1: addi r3,r31,16 - bl __opal_do_takeover - b 1b - -_GLOBAL(opal_enter_rtas) - mflr r0 - std r0,16(r1) - stdu r1,-PROM_FRAME_SIZE(r1) /* Save SP and create stack space */ - - /* Because PROM is running in 32b mode, it clobbers the high order half - * of all registers that it saves. We therefore save those registers - * PROM might touch to the stack. (r0, r3-r13 are caller saved) - */ - SAVE_GPR(2, r1) - SAVE_GPR(13, r1) - SAVE_8GPRS(14, r1) - SAVE_10GPRS(22, r1) - mfcr r10 - mfmsr r11 - std r10,_CCR(r1) - std r11,_MSR(r1) - - /* Get the PROM entrypoint */ - mtlr r5 - - /* Switch MSR to 32 bits mode - */ - li r12,1 - rldicr r12,r12,MSR_SF_LG,(63-MSR_SF_LG) - andc r11,r11,r12 - li r12,1 - rldicr r12,r12,MSR_ISF_LG,(63-MSR_ISF_LG) - andc r11,r11,r12 - mtmsrd r11 - isync - - /* Enter RTAS here... */ - blrl - - /* Just make sure that r1 top 32 bits didn't get - * corrupt by OF - */ - rldicl r1,r1,0,32 - - /* Restore the MSR (back to 64 bits) */ - ld r0,_MSR(r1) - MTMSRD(r0) - isync - - /* Restore other registers */ - REST_GPR(2, r1) - REST_GPR(13, r1) - REST_8GPRS(14, r1) - REST_10GPRS(22, r1) - ld r4,_CCR(r1) - mtcr r4 - - addi r1,r1,PROM_FRAME_SIZE - ld r0,16(r1) - mtlr r0 - blr diff --git a/arch/powerpc/sysdev/dart_iommu.c b/arch/powerpc/sysdev/dart_iommu.c index 62c47bb76517..9e5353ff6d1b 100644 --- a/arch/powerpc/sysdev/dart_iommu.c +++ b/arch/powerpc/sysdev/dart_iommu.c @@ -476,6 +476,11 @@ void __init alloc_dart_table(void) */ dart_tablebase = (unsigned long) __va(memblock_alloc_base(1UL<<24, 1UL<<24, 0x80000000L)); + /* + * The DART space is later unmapped from the kernel linear mapping and + * accessing dart_tablebase during kmemleak scanning will fault. + */ + kmemleak_no_scan((void *)dart_tablebase); printk(KERN_INFO "DART table allocated at: %lx\n", dart_tablebase); } diff --git a/arch/sparc/include/asm/irq_64.h b/arch/sparc/include/asm/irq_64.h index 375cffcf7dbd..91d219381306 100644 --- a/arch/sparc/include/asm/irq_64.h +++ b/arch/sparc/include/asm/irq_64.h @@ -89,7 +89,7 @@ static inline unsigned long get_softint(void) return retval; } -void arch_trigger_all_cpu_backtrace(void); +void arch_trigger_all_cpu_backtrace(bool); #define arch_trigger_all_cpu_backtrace arch_trigger_all_cpu_backtrace extern void *hardirq_stack[NR_CPUS]; diff --git a/arch/sparc/kernel/process_64.c b/arch/sparc/kernel/process_64.c index b2988f25e230..027e09986194 100644 --- a/arch/sparc/kernel/process_64.c +++ b/arch/sparc/kernel/process_64.c @@ -239,7 +239,7 @@ static void __global_reg_poll(struct global_reg_snapshot *gp) } } -void arch_trigger_all_cpu_backtrace(void) +void arch_trigger_all_cpu_backtrace(bool include_self) { struct thread_info *tp = current_thread_info(); struct pt_regs *regs = get_irq_regs(); @@ -251,16 +251,22 @@ void arch_trigger_all_cpu_backtrace(void) spin_lock_irqsave(&global_cpu_snapshot_lock, flags); - memset(global_cpu_snapshot, 0, sizeof(global_cpu_snapshot)); - this_cpu = raw_smp_processor_id(); - __global_reg_self(tp, regs, this_cpu); + memset(global_cpu_snapshot, 0, sizeof(global_cpu_snapshot)); + + if (include_self) + __global_reg_self(tp, regs, this_cpu); smp_fetch_global_regs(); for_each_online_cpu(cpu) { - struct global_reg_snapshot *gp = &global_cpu_snapshot[cpu].reg; + struct global_reg_snapshot *gp; + + if (!include_self && cpu == this_cpu) + continue; + + gp = &global_cpu_snapshot[cpu].reg; __global_reg_poll(gp); @@ -292,7 +298,7 @@ void arch_trigger_all_cpu_backtrace(void) static void sysrq_handle_globreg(int key) { - arch_trigger_all_cpu_backtrace(); + arch_trigger_all_cpu_backtrace(true); } static struct sysrq_key_op sparc_globalreg_op = { diff --git a/arch/x86/include/asm/irq.h b/arch/x86/include/asm/irq.h index cb6cfcd034cf..a80cbb88ea91 100644 --- a/arch/x86/include/asm/irq.h +++ b/arch/x86/include/asm/irq.h @@ -43,7 +43,7 @@ extern int vector_used_by_percpu_irq(unsigned int vector); extern void init_ISA_irqs(void); #ifdef CONFIG_X86_LOCAL_APIC -void arch_trigger_all_cpu_backtrace(void); +void arch_trigger_all_cpu_backtrace(bool); #define arch_trigger_all_cpu_backtrace arch_trigger_all_cpu_backtrace #endif diff --git a/arch/x86/kernel/apic/hw_nmi.c b/arch/x86/kernel/apic/hw_nmi.c index c3fcb5de5083..6a1e71bde323 100644 --- a/arch/x86/kernel/apic/hw_nmi.c +++ b/arch/x86/kernel/apic/hw_nmi.c @@ -33,31 +33,41 @@ static DECLARE_BITMAP(backtrace_mask, NR_CPUS) __read_mostly; /* "in progress" flag of arch_trigger_all_cpu_backtrace */ static unsigned long backtrace_flag; -void arch_trigger_all_cpu_backtrace(void) +void arch_trigger_all_cpu_backtrace(bool include_self) { int i; + int cpu = get_cpu(); - if (test_and_set_bit(0, &backtrace_flag)) + if (test_and_set_bit(0, &backtrace_flag)) { /* * If there is already a trigger_all_cpu_backtrace() in progress * (backtrace_flag == 1), don't output double cpu dump infos. */ + put_cpu(); return; + } cpumask_copy(to_cpumask(backtrace_mask), cpu_online_mask); + if (!include_self) + cpumask_clear_cpu(cpu, to_cpumask(backtrace_mask)); - printk(KERN_INFO "sending NMI to all CPUs:\n"); - apic->send_IPI_all(NMI_VECTOR); + if (!cpumask_empty(to_cpumask(backtrace_mask))) { + pr_info("sending NMI to %s CPUs:\n", + (include_self ? "all" : "other")); + apic->send_IPI_mask(to_cpumask(backtrace_mask), NMI_VECTOR); + } /* Wait for up to 10 seconds for all CPUs to do the backtrace */ for (i = 0; i < 10 * 1000; i++) { if (cpumask_empty(to_cpumask(backtrace_mask))) break; mdelay(1); + touch_softlockup_watchdog(); } clear_bit(0, &backtrace_flag); smp_mb__after_atomic(); + put_cpu(); } static int diff --git a/drivers/base/dma-contiguous.c b/drivers/base/dma-contiguous.c index 83969f8c5727..6467c919c509 100644 --- a/drivers/base/dma-contiguous.c +++ b/drivers/base/dma-contiguous.c @@ -176,14 +176,24 @@ static int __init cma_activate_area(struct cma *cma) base_pfn = pfn; for (j = pageblock_nr_pages; j; --j, pfn++) { WARN_ON_ONCE(!pfn_valid(pfn)); + /* + * alloc_contig_range requires the pfn range + * specified to be in the same zone. Make this + * simple by forcing the entire CMA resv range + * to be in the same zone. + */ if (page_zone(pfn_to_page(pfn)) != zone) - return -EINVAL; + goto err; } init_cma_reserved_pageblock(pfn_to_page(base_pfn)); } while (--i); mutex_init(&cma->lock); return 0; + +err: + kfree(cma->bitmap); + return -EINVAL; } static struct cma cma_areas[MAX_CMA_AREAS]; diff --git a/drivers/macintosh/smu.c b/drivers/macintosh/smu.c index 23b4a3b28dbc..4eab93aa570b 100644 --- a/drivers/macintosh/smu.c +++ b/drivers/macintosh/smu.c @@ -1257,7 +1257,8 @@ static unsigned int smu_fpoll(struct file *file, poll_table *wait) if (pp->busy && pp->cmd.status != 1) mask |= POLLIN; spin_unlock_irqrestore(&pp->lock, flags); - } if (pp->mode == smu_file_events) { + } + if (pp->mode == smu_file_events) { /* Not yet implemented */ } return mask; diff --git a/drivers/memstick/host/rtsx_pci_ms.c b/drivers/memstick/host/rtsx_pci_ms.c index 2a635b6fdaf7..c880ba685754 100644 --- a/drivers/memstick/host/rtsx_pci_ms.c +++ b/drivers/memstick/host/rtsx_pci_ms.c @@ -601,6 +601,7 @@ static int rtsx_pci_ms_drv_remove(struct platform_device *pdev) pcr->slots[RTSX_MS_CARD].card_event = NULL; msh = host->msh; host->eject = true; + cancel_work_sync(&host->handle_req); mutex_lock(&host->host_mutex); if (host->req) { diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c index 971a760af4a1..8dae2f724a35 100644 --- a/drivers/vhost/net.c +++ b/drivers/vhost/net.c @@ -700,14 +700,6 @@ static void handle_rx_net(struct vhost_work *work) handle_rx(net); } -static void vhost_net_free(void *addr) -{ - if (is_vmalloc_addr(addr)) - vfree(addr); - else - kfree(addr); -} - static int vhost_net_open(struct inode *inode, struct file *f) { struct vhost_net *n; @@ -723,7 +715,7 @@ static int vhost_net_open(struct inode *inode, struct file *f) } vqs = kmalloc(VHOST_NET_VQ_MAX * sizeof(*vqs), GFP_KERNEL); if (!vqs) { - vhost_net_free(n); + kvfree(n); return -ENOMEM; } @@ -840,7 +832,7 @@ static int vhost_net_release(struct inode *inode, struct file *f) * since jobs can re-queue themselves. */ vhost_net_flush(n); kfree(n->dev.vqs); - vhost_net_free(n); + kvfree(n); return 0; } diff --git a/drivers/vhost/scsi.c b/drivers/vhost/scsi.c index 4f4ffa4c604e..69906cacd04f 100644 --- a/drivers/vhost/scsi.c +++ b/drivers/vhost/scsi.c @@ -1503,14 +1503,6 @@ static int vhost_scsi_set_features(struct vhost_scsi *vs, u64 features) return 0; } -static void vhost_scsi_free(struct vhost_scsi *vs) -{ - if (is_vmalloc_addr(vs)) - vfree(vs); - else - kfree(vs); -} - static int vhost_scsi_open(struct inode *inode, struct file *f) { struct vhost_scsi *vs; @@ -1550,7 +1542,7 @@ static int vhost_scsi_open(struct inode *inode, struct file *f) return 0; err_vqs: - vhost_scsi_free(vs); + kvfree(vs); err_vs: return r; } @@ -1569,7 +1561,7 @@ static int vhost_scsi_release(struct inode *inode, struct file *f) /* Jobs can re-queue themselves in evt kick handler. Do extra flush. */ vhost_scsi_flush(vs); kfree(vs->dev.vqs); - vhost_scsi_free(vs); + kvfree(vs); return 0; } @@ -1021,6 +1021,7 @@ void aio_complete(struct kiocb *iocb, long res, long res2) /* everything turned out well, dispose of the aiocb. */ kiocb_free(iocb); + put_reqs_available(ctx, 1); /* * We have to order our ring_info tail store above and test @@ -1062,6 +1063,9 @@ static long aio_read_events_ring(struct kioctx *ctx, if (head == tail) goto out; + head %= ctx->nr_events; + tail %= ctx->nr_events; + while (ret < nr) { long avail; struct io_event *ev; @@ -1100,8 +1104,6 @@ static long aio_read_events_ring(struct kioctx *ctx, flush_dcache_page(ctx->ring_pages[0]); pr_debug("%li h%u t%u\n", ret, head, tail); - - put_reqs_available(ctx, ret); out: mutex_unlock(&ctx->ring_lock); diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h index a106b3f2b22a..fae17c640df3 100644 --- a/fs/ocfs2/dlm/dlmcommon.h +++ b/fs/ocfs2/dlm/dlmcommon.h @@ -331,6 +331,7 @@ struct dlm_lock_resource u16 state; char lvb[DLM_LVB_LEN]; unsigned int inflight_locks; + unsigned int inflight_assert_workers; unsigned long refmap[BITS_TO_LONGS(O2NM_MAX_NODES)]; }; @@ -910,6 +911,9 @@ void dlm_lockres_drop_inflight_ref(struct dlm_ctxt *dlm, void dlm_lockres_grab_inflight_ref(struct dlm_ctxt *dlm, struct dlm_lock_resource *res); +void __dlm_lockres_grab_inflight_worker(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res); + void dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock); void dlm_queue_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock); void __dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock); diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c index 3087a21d32f9..82abf0cc9a12 100644 --- a/fs/ocfs2/dlm/dlmmaster.c +++ b/fs/ocfs2/dlm/dlmmaster.c @@ -581,6 +581,7 @@ static void dlm_init_lockres(struct dlm_ctxt *dlm, atomic_set(&res->asts_reserved, 0); res->migration_pending = 0; res->inflight_locks = 0; + res->inflight_assert_workers = 0; res->dlm = dlm; @@ -683,6 +684,43 @@ void dlm_lockres_drop_inflight_ref(struct dlm_ctxt *dlm, wake_up(&res->wq); } +void __dlm_lockres_grab_inflight_worker(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res) +{ + assert_spin_locked(&res->spinlock); + res->inflight_assert_workers++; + mlog(0, "%s:%.*s: inflight assert worker++: now %u\n", + dlm->name, res->lockname.len, res->lockname.name, + res->inflight_assert_workers); +} + +static void dlm_lockres_grab_inflight_worker(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res) +{ + spin_lock(&res->spinlock); + __dlm_lockres_grab_inflight_worker(dlm, res); + spin_unlock(&res->spinlock); +} + +static void __dlm_lockres_drop_inflight_worker(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res) +{ + assert_spin_locked(&res->spinlock); + BUG_ON(res->inflight_assert_workers == 0); + res->inflight_assert_workers--; + mlog(0, "%s:%.*s: inflight assert worker--: now %u\n", + dlm->name, res->lockname.len, res->lockname.name, + res->inflight_assert_workers); +} + +static void dlm_lockres_drop_inflight_worker(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res) +{ + spin_lock(&res->spinlock); + __dlm_lockres_drop_inflight_worker(dlm, res); + spin_unlock(&res->spinlock); +} + /* * lookup a lock resource by name. * may already exist in the hashtable. @@ -1603,7 +1641,8 @@ send_response: mlog(ML_ERROR, "failed to dispatch assert master work\n"); response = DLM_MASTER_RESP_ERROR; dlm_lockres_put(res); - } + } else + dlm_lockres_grab_inflight_worker(dlm, res); } else { if (res) dlm_lockres_put(res); @@ -2118,6 +2157,8 @@ static void dlm_assert_master_worker(struct dlm_work_item *item, void *data) dlm_lockres_release_ast(dlm, res); put: + dlm_lockres_drop_inflight_worker(dlm, res); + dlm_lockres_put(res); mlog(0, "finished with dlm_assert_master_worker\n"); @@ -3088,11 +3129,15 @@ static int dlm_add_migration_mle(struct dlm_ctxt *dlm, /* remove it so that only one mle will be found */ __dlm_unlink_mle(dlm, tmp); __dlm_mle_detach_hb_events(dlm, tmp); - ret = DLM_MIGRATE_RESPONSE_MASTERY_REF; - mlog(0, "%s:%.*s: master=%u, newmaster=%u, " - "telling master to get ref for cleared out mle " - "during migration\n", dlm->name, namelen, name, - master, new_master); + if (tmp->type == DLM_MLE_MASTER) { + ret = DLM_MIGRATE_RESPONSE_MASTERY_REF; + mlog(0, "%s:%.*s: master=%u, newmaster=%u, " + "telling master to get ref " + "for cleared out mle during " + "migration\n", dlm->name, + namelen, name, master, + new_master); + } } spin_unlock(&tmp->spinlock); } diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c index 5de019437ea5..45067faf5695 100644 --- a/fs/ocfs2/dlm/dlmrecovery.c +++ b/fs/ocfs2/dlm/dlmrecovery.c @@ -1708,7 +1708,8 @@ int dlm_master_requery_handler(struct o2net_msg *msg, u32 len, void *data, mlog_errno(-ENOMEM); /* retry!? */ BUG(); - } + } else + __dlm_lockres_grab_inflight_worker(dlm, res); } else /* put.. incase we are not the master */ dlm_lockres_put(res); spin_unlock(&res->spinlock); diff --git a/fs/ocfs2/dlm/dlmthread.c b/fs/ocfs2/dlm/dlmthread.c index 9db869de829d..69aac6f088ad 100644 --- a/fs/ocfs2/dlm/dlmthread.c +++ b/fs/ocfs2/dlm/dlmthread.c @@ -259,12 +259,15 @@ static void dlm_run_purge_list(struct dlm_ctxt *dlm, * refs on it. */ unused = __dlm_lockres_unused(lockres); if (!unused || - (lockres->state & DLM_LOCK_RES_MIGRATING)) { + (lockres->state & DLM_LOCK_RES_MIGRATING) || + (lockres->inflight_assert_workers != 0)) { mlog(0, "%s: res %.*s is in use or being remastered, " - "used %d, state %d\n", dlm->name, - lockres->lockname.len, lockres->lockname.name, - !unused, lockres->state); - list_move_tail(&dlm->purge_list, &lockres->purge); + "used %d, state %d, assert master workers %u\n", + dlm->name, lockres->lockname.len, + lockres->lockname.name, + !unused, lockres->state, + lockres->inflight_assert_workers); + list_move_tail(&lockres->purge, &dlm->purge_list); spin_unlock(&lockres->spinlock); continue; } diff --git a/fs/ocfs2/dlm/dlmunlock.c b/fs/ocfs2/dlm/dlmunlock.c index 5698b52cf5c9..2e3c9dbab68c 100644 --- a/fs/ocfs2/dlm/dlmunlock.c +++ b/fs/ocfs2/dlm/dlmunlock.c @@ -191,7 +191,9 @@ static enum dlm_status dlmunlock_common(struct dlm_ctxt *dlm, DLM_UNLOCK_CLEAR_CONVERT_TYPE); } else if (status == DLM_RECOVERING || status == DLM_MIGRATING || - status == DLM_FORWARD) { + status == DLM_FORWARD || + status == DLM_NOLOCKMGR + ) { /* must clear the actions because this unlock * is about to be retried. cannot free or do * any list manipulation. */ @@ -200,7 +202,8 @@ static enum dlm_status dlmunlock_common(struct dlm_ctxt *dlm, res->lockname.name, status==DLM_RECOVERING?"recovering": (status==DLM_MIGRATING?"migrating": - "forward")); + (status == DLM_FORWARD ? "forward" : + "nolockmanager"))); actions = 0; } if (flags & LKM_CANCEL) @@ -364,7 +367,10 @@ static enum dlm_status dlm_send_remote_unlock_request(struct dlm_ctxt *dlm, * updated state to the recovery master. this thread * just needs to finish out the operation and call * the unlockast. */ - ret = DLM_NORMAL; + if (dlm_is_node_dead(dlm, owner)) + ret = DLM_NORMAL; + else + ret = DLM_NOLOCKMGR; } else { /* something bad. this will BUG in ocfs2 */ ret = dlm_err_to_dlm_status(tmpret); @@ -638,7 +644,9 @@ retry: if (status == DLM_RECOVERING || status == DLM_MIGRATING || - status == DLM_FORWARD) { + status == DLM_FORWARD || + status == DLM_NOLOCKMGR) { + /* We want to go away for a tiny bit to allow recovery * / migration to complete on this resource. I don't * know of any wait queue we could sleep on as this @@ -650,7 +658,7 @@ retry: msleep(50); mlog(0, "retrying unlock due to pending recovery/" - "migration/in-progress\n"); + "migration/in-progress/reconnect\n"); goto retry; } diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index 2060fc398445..8add6f1030d7 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -205,6 +205,21 @@ static struct inode *ocfs2_get_init_inode(struct inode *dir, umode_t mode) return inode; } +static void ocfs2_cleanup_add_entry_failure(struct ocfs2_super *osb, + struct dentry *dentry, struct inode *inode) +{ + struct ocfs2_dentry_lock *dl = dentry->d_fsdata; + + ocfs2_simple_drop_lockres(osb, &dl->dl_lockres); + ocfs2_lock_res_free(&dl->dl_lockres); + BUG_ON(dl->dl_count != 1); + spin_lock(&dentry_attach_lock); + dentry->d_fsdata = NULL; + spin_unlock(&dentry_attach_lock); + kfree(dl); + iput(inode); +} + static int ocfs2_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, @@ -231,6 +246,7 @@ static int ocfs2_mknod(struct inode *dir, sigset_t oldset; int did_block_signals = 0; struct posix_acl *default_acl = NULL, *acl = NULL; + struct ocfs2_dentry_lock *dl = NULL; trace_ocfs2_mknod(dir, dentry, dentry->d_name.len, dentry->d_name.name, (unsigned long long)OCFS2_I(dir)->ip_blkno, @@ -423,6 +439,8 @@ static int ocfs2_mknod(struct inode *dir, goto leave; } + dl = dentry->d_fsdata; + status = ocfs2_add_entry(handle, dentry, inode, OCFS2_I(inode)->ip_blkno, parent_fe_bh, &lookup); @@ -469,6 +487,9 @@ leave: * ocfs2_delete_inode will mutex_lock again. */ if ((status < 0) && inode) { + if (dl) + ocfs2_cleanup_add_entry_failure(osb, dentry, inode); + OCFS2_I(inode)->ip_flags |= OCFS2_INODE_SKIP_ORPHAN_DIR; clear_nlink(inode); iput(inode); @@ -991,6 +1012,65 @@ leave: return status; } +static int ocfs2_check_if_ancestor(struct ocfs2_super *osb, + u64 src_inode_no, u64 dest_inode_no) +{ + int ret = 0, i = 0; + u64 parent_inode_no = 0; + u64 child_inode_no = src_inode_no; + struct inode *child_inode; + +#define MAX_LOOKUP_TIMES 32 + while (1) { + child_inode = ocfs2_iget(osb, child_inode_no, 0, 0); + if (IS_ERR(child_inode)) { + ret = PTR_ERR(child_inode); + break; + } + + ret = ocfs2_inode_lock(child_inode, NULL, 0); + if (ret < 0) { + iput(child_inode); + if (ret != -ENOENT) + mlog_errno(ret); + break; + } + + ret = ocfs2_lookup_ino_from_name(child_inode, "..", 2, + &parent_inode_no); + ocfs2_inode_unlock(child_inode, 0); + iput(child_inode); + if (ret < 0) { + ret = -ENOENT; + break; + } + + if (parent_inode_no == dest_inode_no) { + ret = 1; + break; + } + + if (parent_inode_no == osb->root_inode->i_ino) { + ret = 0; + break; + } + + child_inode_no = parent_inode_no; + + if (++i >= MAX_LOOKUP_TIMES) { + mlog(ML_NOTICE, "max lookup times reached, filesystem " + "may have nested directories, " + "src inode: %llu, dest inode: %llu.\n", + (unsigned long long)src_inode_no, + (unsigned long long)dest_inode_no); + ret = 0; + break; + } + } + + return ret; +} + /* * The only place this should be used is rename! * if they have the same id, then the 1st one is the only one locked. @@ -1002,6 +1082,7 @@ static int ocfs2_double_lock(struct ocfs2_super *osb, struct inode *inode2) { int status; + int inode1_is_ancestor, inode2_is_ancestor; struct ocfs2_inode_info *oi1 = OCFS2_I(inode1); struct ocfs2_inode_info *oi2 = OCFS2_I(inode2); struct buffer_head **tmpbh; @@ -1015,9 +1096,26 @@ static int ocfs2_double_lock(struct ocfs2_super *osb, if (*bh2) *bh2 = NULL; - /* we always want to lock the one with the lower lockid first. */ + /* we always want to lock the one with the lower lockid first. + * and if they are nested, we lock ancestor first */ if (oi1->ip_blkno != oi2->ip_blkno) { - if (oi1->ip_blkno < oi2->ip_blkno) { + inode1_is_ancestor = ocfs2_check_if_ancestor(osb, oi2->ip_blkno, + oi1->ip_blkno); + if (inode1_is_ancestor < 0) { + status = inode1_is_ancestor; + goto bail; + } + + inode2_is_ancestor = ocfs2_check_if_ancestor(osb, oi1->ip_blkno, + oi2->ip_blkno); + if (inode2_is_ancestor < 0) { + status = inode2_is_ancestor; + goto bail; + } + + if ((inode1_is_ancestor == 1) || + (oi1->ip_blkno < oi2->ip_blkno && + inode2_is_ancestor == 0)) { /* switch id1 and id2 around */ tmpbh = bh2; bh2 = bh1; @@ -1098,6 +1196,7 @@ static int ocfs2_rename(struct inode *old_dir, struct ocfs2_dir_lookup_result old_entry_lookup = { NULL, }; struct ocfs2_dir_lookup_result orphan_insert = { NULL, }; struct ocfs2_dir_lookup_result target_insert = { NULL, }; + bool should_add_orphan = false; /* At some point it might be nice to break this function up a * bit. */ @@ -1134,6 +1233,21 @@ static int ocfs2_rename(struct inode *old_dir, goto bail; } rename_lock = 1; + + /* here we cannot guarantee the inodes haven't just been + * changed, so check if they are nested again */ + status = ocfs2_check_if_ancestor(osb, new_dir->i_ino, + old_inode->i_ino); + if (status < 0) { + mlog_errno(status); + goto bail; + } else if (status == 1) { + status = -EPERM; + trace_ocfs2_rename_not_permitted( + (unsigned long long)old_inode->i_ino, + (unsigned long long)new_dir->i_ino); + goto bail; + } } /* if old and new are the same, this'll just do one lock. */ @@ -1304,6 +1418,7 @@ static int ocfs2_rename(struct inode *old_dir, mlog_errno(status); goto bail; } + should_add_orphan = true; } } else { BUG_ON(new_dentry->d_parent->d_inode != new_dir); @@ -1348,17 +1463,6 @@ static int ocfs2_rename(struct inode *old_dir, goto bail; } - if (S_ISDIR(new_inode->i_mode) || - (ocfs2_read_links_count(newfe) == 1)) { - status = ocfs2_orphan_add(osb, handle, new_inode, - newfe_bh, orphan_name, - &orphan_insert, orphan_dir); - if (status < 0) { - mlog_errno(status); - goto bail; - } - } - /* change the dirent to point to the correct inode */ status = ocfs2_update_entry(new_dir, handle, &target_lookup_res, old_inode); @@ -1373,6 +1477,15 @@ static int ocfs2_rename(struct inode *old_dir, else ocfs2_add_links_count(newfe, -1); ocfs2_journal_dirty(handle, newfe_bh); + if (should_add_orphan) { + status = ocfs2_orphan_add(osb, handle, new_inode, + newfe_bh, orphan_name, + &orphan_insert, orphan_dir); + if (status < 0) { + mlog_errno(status); + goto bail; + } + } } else { /* if the name was not found in new_dir, add it now */ status = ocfs2_add_entry(handle, new_dentry, old_inode, @@ -1642,6 +1755,7 @@ static int ocfs2_symlink(struct inode *dir, struct ocfs2_dir_lookup_result lookup = { NULL, }; sigset_t oldset; int did_block_signals = 0; + struct ocfs2_dentry_lock *dl = NULL; trace_ocfs2_symlink_begin(dir, dentry, symname, dentry->d_name.len, dentry->d_name.name); @@ -1830,6 +1944,8 @@ static int ocfs2_symlink(struct inode *dir, goto bail; } + dl = dentry->d_fsdata; + status = ocfs2_add_entry(handle, dentry, inode, le64_to_cpu(fe->i_blkno), parent_fe_bh, &lookup); @@ -1864,6 +1980,9 @@ bail: if (xattr_ac) ocfs2_free_alloc_context(xattr_ac); if ((status < 0) && inode) { + if (dl) + ocfs2_cleanup_add_entry_failure(osb, dentry, inode); + OCFS2_I(inode)->ip_flags |= OCFS2_INODE_SKIP_ORPHAN_DIR; clear_nlink(inode); iput(inode); diff --git a/fs/ocfs2/ocfs2_trace.h b/fs/ocfs2/ocfs2_trace.h index 1b60c62aa9d6..6cb019b7c6a8 100644 --- a/fs/ocfs2/ocfs2_trace.h +++ b/fs/ocfs2/ocfs2_trace.h @@ -2292,6 +2292,8 @@ TRACE_EVENT(ocfs2_rename, __entry->new_len, __get_str(new_name)) ); +DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_rename_not_permitted); + TRACE_EVENT(ocfs2_rename_target_exists, TP_PROTO(int new_len, const char *new_name), TP_ARGS(new_len, new_name), diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index 714e53b9cc66..636aab69ead5 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c @@ -4288,9 +4288,16 @@ static int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir, goto out; } + error = ocfs2_rw_lock(inode, 1); + if (error) { + mlog_errno(error); + goto out; + } + error = ocfs2_inode_lock(inode, &old_bh, 1); if (error) { mlog_errno(error); + ocfs2_rw_unlock(inode, 1); goto out; } @@ -4302,6 +4309,7 @@ static int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir, up_write(&OCFS2_I(inode)->ip_xattr_sem); ocfs2_inode_unlock(inode, 1); + ocfs2_rw_unlock(inode, 1); brelse(old_bh); if (error) { diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index c7a89cea5c5d..ddb662b32447 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -1925,15 +1925,11 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err) ocfs2_shutdown_local_alloc(osb); + ocfs2_truncate_log_shutdown(osb); + /* This will disable recovery and flush any recovery work. */ ocfs2_recovery_exit(osb); - /* - * During dismount, when it recovers another node it will call - * ocfs2_recover_orphans and queue delayed work osb_truncate_log_wq. - */ - ocfs2_truncate_log_shutdown(osb); - ocfs2_journal_shutdown(osb); ocfs2_sync_blockdev(sb); diff --git a/include/linux/nmi.h b/include/linux/nmi.h index 6a45fb583ff1..447775ee2c4b 100644 --- a/include/linux/nmi.h +++ b/include/linux/nmi.h @@ -32,15 +32,24 @@ static inline void touch_nmi_watchdog(void) #ifdef arch_trigger_all_cpu_backtrace static inline bool trigger_all_cpu_backtrace(void) { - arch_trigger_all_cpu_backtrace(); + arch_trigger_all_cpu_backtrace(true); return true; } +static inline bool trigger_allbutself_cpu_backtrace(void) +{ + arch_trigger_all_cpu_backtrace(false); + return true; +} #else static inline bool trigger_all_cpu_backtrace(void) { return false; } +static inline bool trigger_allbutself_cpu_backtrace(void) +{ + return false; +} #endif #ifdef CONFIG_LOCKUP_DETECTOR @@ -48,6 +57,7 @@ int hw_nmi_is_cpu_stuck(struct pt_regs *); u64 hw_nmi_get_sample_period(int watchdog_thresh); extern int watchdog_user_enabled; extern int watchdog_thresh; +extern int sysctl_softlockup_all_cpu_backtrace; struct ctl_table; extern int proc_dowatchdog(struct ctl_table *, int , void __user *, size_t *, loff_t *); diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 3c545b48aeab..8304959ad336 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -360,6 +360,9 @@ static inline void ClearPageCompound(struct page *page) ClearPageHead(page); } #endif + +#define PG_head_mask ((1L << PG_head)) + #else /* * Reduce page flag use as much as possible by overlapping diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h index 0fd06fef9fac..26b4f2e13275 100644 --- a/include/trace/ftrace.h +++ b/include/trace/ftrace.h @@ -44,6 +44,12 @@ #undef __field_ext #define __field_ext(type, item, filter_type) type item; +#undef __field_struct +#define __field_struct(type, item) type item; + +#undef __field_struct_ext +#define __field_struct_ext(type, item, filter_type) type item; + #undef __array #define __array(type, item, len) type item[len]; @@ -122,6 +128,12 @@ #undef __field_ext #define __field_ext(type, item, filter_type) +#undef __field_struct +#define __field_struct(type, item) + +#undef __field_struct_ext +#define __field_struct_ext(type, item, filter_type) + #undef __array #define __array(type, item, len) @@ -315,9 +327,21 @@ static struct trace_event_functions ftrace_event_type_funcs_##call = { \ if (ret) \ return ret; +#undef __field_struct_ext +#define __field_struct_ext(type, item, filter_type) \ + ret = trace_define_field(event_call, #type, #item, \ + offsetof(typeof(field), item), \ + sizeof(field.item), \ + 0, filter_type); \ + if (ret) \ + return ret; + #undef __field #define __field(type, item) __field_ext(type, item, FILTER_OTHER) +#undef __field_struct +#define __field_struct(type, item) __field_struct_ext(type, item, FILTER_OTHER) + #undef __array #define __array(type, item, len) \ do { \ @@ -379,6 +403,12 @@ ftrace_define_fields_##call(struct ftrace_event_call *event_call) \ #undef __field_ext #define __field_ext(type, item, filter_type) +#undef __field_struct +#define __field_struct(type, item) + +#undef __field_struct_ext +#define __field_struct_ext(type, item, filter_type) + #undef __array #define __array(type, item, len) @@ -550,6 +580,9 @@ static inline notrace int ftrace_get_offsets_##call( \ #undef __field #define __field(type, item) +#undef __field_struct +#define __field_struct(type, item) + #undef __array #define __array(type, item, len) diff --git a/include/trace/syscall.h b/include/trace/syscall.h index fed853f3d7aa..9674145e2f6a 100644 --- a/include/trace/syscall.h +++ b/include/trace/syscall.h @@ -4,6 +4,7 @@ #include <linux/tracepoint.h> #include <linux/unistd.h> #include <linux/ftrace_event.h> +#include <linux/thread_info.h> #include <asm/ptrace.h> @@ -32,4 +33,18 @@ struct syscall_metadata { struct ftrace_event_call *exit_event; }; +#if defined(CONFIG_TRACEPOINTS) && defined(CONFIG_HAVE_SYSCALL_TRACEPOINTS) +static inline void syscall_tracepoint_update(struct task_struct *p) +{ + if (test_thread_flag(TIF_SYSCALL_TRACEPOINT)) + set_tsk_thread_flag(p, TIF_SYSCALL_TRACEPOINT); + else + clear_tsk_thread_flag(p, TIF_SYSCALL_TRACEPOINT); +} +#else +static inline void syscall_tracepoint_update(struct task_struct *p) +{ +} +#endif + #endif /* _TRACE_SYSCALL_H */ diff --git a/kernel/fork.c b/kernel/fork.c index d2799d1fc952..6a13c46cd87d 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1487,7 +1487,9 @@ static struct task_struct *copy_process(unsigned long clone_flags, total_forks++; spin_unlock(¤t->sighand->siglock); + syscall_tracepoint_update(p); write_unlock_irq(&tasklist_lock); + proc_fork_connector(p); cgroup_post_fork(p); if (clone_flags & CLONE_THREAD) diff --git a/kernel/kexec.c b/kernel/kexec.c index 6748688813d0..369f41a94124 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -1617,6 +1617,7 @@ static int __init crash_save_vmcoreinfo_init(void) #ifdef CONFIG_MEMORY_FAILURE VMCOREINFO_NUMBER(PG_hwpoison); #endif + VMCOREINFO_NUMBER(PG_head_mask); VMCOREINFO_NUMBER(PAGE_BUDDY_MAPCOUNT_VALUE); arch_crash_save_vmcoreinfo(); diff --git a/kernel/smp.c b/kernel/smp.c index 306f8180b0d5..80c33f8de14f 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -29,6 +29,8 @@ static DEFINE_PER_CPU_SHARED_ALIGNED(struct call_function_data, cfd_data); static DEFINE_PER_CPU_SHARED_ALIGNED(struct llist_head, call_single_queue); +static void flush_smp_call_function_queue(bool warn_cpu_offline); + static int hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu) { @@ -51,12 +53,27 @@ hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu) #ifdef CONFIG_HOTPLUG_CPU case CPU_UP_CANCELED: case CPU_UP_CANCELED_FROZEN: + /* Fall-through to the CPU_DEAD[_FROZEN] case. */ case CPU_DEAD: case CPU_DEAD_FROZEN: free_cpumask_var(cfd->cpumask); free_percpu(cfd->csd); break; + + case CPU_DYING: + case CPU_DYING_FROZEN: + /* + * The IPIs for the smp-call-function callbacks queued by other + * CPUs might arrive late, either due to hardware latencies or + * because this CPU disabled interrupts (inside stop-machine) + * before the IPIs were sent. So flush out any pending callbacks + * explicitly (without waiting for the IPIs to arrive), to + * ensure that the outgoing CPU doesn't go offline with work + * still pending. + */ + flush_smp_call_function_queue(false); + break; #endif }; @@ -177,23 +194,47 @@ static int generic_exec_single(int cpu, struct call_single_data *csd, return 0; } -/* - * Invoked by arch to handle an IPI for call function single. Must be - * called from the arch with interrupts disabled. +/** + * generic_smp_call_function_single_interrupt - Execute SMP IPI callbacks + * + * Invoked by arch to handle an IPI for call function single. + * Must be called with interrupts disabled. */ void generic_smp_call_function_single_interrupt(void) { + flush_smp_call_function_queue(true); +} + +/** + * flush_smp_call_function_queue - Flush pending smp-call-function callbacks + * + * @warn_cpu_offline: If set to 'true', warn if callbacks were queued on an + * offline CPU. Skip this check if set to 'false'. + * + * Flush any pending smp-call-function callbacks queued on this CPU. This is + * invoked by the generic IPI handler, as well as by a CPU about to go offline, + * to ensure that all pending IPI callbacks are run before it goes completely + * offline. + * + * Loop through the call_single_queue and run all the queued callbacks. + * Must be called with interrupts disabled. + */ +static void flush_smp_call_function_queue(bool warn_cpu_offline) +{ + struct llist_head *head; struct llist_node *entry; struct call_single_data *csd, *csd_next; static bool warned; - entry = llist_del_all(&__get_cpu_var(call_single_queue)); + WARN_ON(!irqs_disabled()); + + head = &__get_cpu_var(call_single_queue); + entry = llist_del_all(head); entry = llist_reverse_order(entry); - /* - * Shouldn't receive this interrupt on a cpu that is not yet online. - */ - if (unlikely(!cpu_online(smp_processor_id()) && !warned)) { + /* There shouldn't be any pending callbacks on an offline CPU. */ + if (unlikely(warn_cpu_offline && !cpu_online(smp_processor_id()) && + !warned && !llist_empty(head))) { warned = true; WARN(1, "IPI on offline CPU %d\n", smp_processor_id()); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 7de6555cfea0..75b22e22a72c 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -136,7 +136,6 @@ static unsigned long dirty_bytes_min = 2 * PAGE_SIZE; /* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */ static int maxolduid = 65535; static int minolduid; -static int min_percpu_pagelist_fract = 8; static int ngroups_max = NGROUPS_MAX; static const int cap_last_cap = CAP_LAST_CAP; @@ -861,6 +860,17 @@ static struct ctl_table kern_table[] = { .extra1 = &zero, .extra2 = &one, }, +#ifdef CONFIG_SMP + { + .procname = "softlockup_all_cpu_backtrace", + .data = &sysctl_softlockup_all_cpu_backtrace, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &one, + }, +#endif /* CONFIG_SMP */ { .procname = "nmi_watchdog", .data = &watchdog_user_enabled, @@ -1317,7 +1327,7 @@ static struct ctl_table vm_table[] = { .maxlen = sizeof(percpu_pagelist_fraction), .mode = 0644, .proc_handler = percpu_pagelist_fraction_sysctl_handler, - .extra1 = &min_percpu_pagelist_fract, + .extra1 = &zero, }, #ifdef CONFIG_MMU { diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c index 33cbd8c203f8..3490407dc7b7 100644 --- a/kernel/tracepoint.c +++ b/kernel/tracepoint.c @@ -492,33 +492,29 @@ static int sys_tracepoint_refcount; void syscall_regfunc(void) { - unsigned long flags; - struct task_struct *g, *t; + struct task_struct *p, *t; if (!sys_tracepoint_refcount) { - read_lock_irqsave(&tasklist_lock, flags); - do_each_thread(g, t) { - /* Skip kernel threads. */ - if (t->mm) - set_tsk_thread_flag(t, TIF_SYSCALL_TRACEPOINT); - } while_each_thread(g, t); - read_unlock_irqrestore(&tasklist_lock, flags); + read_lock(&tasklist_lock); + for_each_process_thread(p, t) { + set_tsk_thread_flag(t, TIF_SYSCALL_TRACEPOINT); + } + read_unlock(&tasklist_lock); } sys_tracepoint_refcount++; } void syscall_unregfunc(void) { - unsigned long flags; - struct task_struct *g, *t; + struct task_struct *p, *t; sys_tracepoint_refcount--; if (!sys_tracepoint_refcount) { - read_lock_irqsave(&tasklist_lock, flags); - do_each_thread(g, t) { + read_lock(&tasklist_lock); + for_each_process_thread(p, t) { clear_tsk_thread_flag(t, TIF_SYSCALL_TRACEPOINT); - } while_each_thread(g, t); - read_unlock_irqrestore(&tasklist_lock, flags); + } + read_unlock(&tasklist_lock); } } #endif diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 516203e665fc..c3319bd1b040 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -31,6 +31,12 @@ int watchdog_user_enabled = 1; int __read_mostly watchdog_thresh = 10; +#ifdef CONFIG_SMP +int __read_mostly sysctl_softlockup_all_cpu_backtrace; +#else +#define sysctl_softlockup_all_cpu_backtrace 0 +#endif + static int __read_mostly watchdog_running; static u64 __read_mostly sample_period; @@ -47,6 +53,7 @@ static DEFINE_PER_CPU(bool, watchdog_nmi_touch); static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts_saved); static DEFINE_PER_CPU(struct perf_event *, watchdog_ev); #endif +static unsigned long soft_lockup_nmi_warn; /* boot commands */ /* @@ -95,6 +102,15 @@ static int __init nosoftlockup_setup(char *str) } __setup("nosoftlockup", nosoftlockup_setup); /* */ +#ifdef CONFIG_SMP +static int __init softlockup_all_cpu_backtrace_setup(char *str) +{ + sysctl_softlockup_all_cpu_backtrace = + !!simple_strtol(str, NULL, 0); + return 1; +} +__setup("softlockup_all_cpu_backtrace=", softlockup_all_cpu_backtrace_setup); +#endif /* * Hard-lockup warnings should be triggered after just a few seconds. Soft- @@ -271,6 +287,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) unsigned long touch_ts = __this_cpu_read(watchdog_touch_ts); struct pt_regs *regs = get_irq_regs(); int duration; + int softlockup_all_cpu_backtrace = sysctl_softlockup_all_cpu_backtrace; /* kick the hardlockup detector */ watchdog_interrupt_count(); @@ -317,6 +334,17 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) if (__this_cpu_read(soft_watchdog_warn) == true) return HRTIMER_RESTART; + if (softlockup_all_cpu_backtrace) { + /* Prevent multiple soft-lockup reports if one cpu is already + * engaged in dumping cpu back traces + */ + if (test_and_set_bit(0, &soft_lockup_nmi_warn)) { + /* Someone else will report us. Let's give up */ + __this_cpu_write(soft_watchdog_warn, true); + return HRTIMER_RESTART; + } + } + printk(KERN_EMERG "BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n", smp_processor_id(), duration, current->comm, task_pid_nr(current)); @@ -327,6 +355,17 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) else dump_stack(); + if (softlockup_all_cpu_backtrace) { + /* Avoid generating two back traces for current + * given that one is already made above + */ + trigger_allbutself_cpu_backtrace(); + + clear_bit(0, &soft_lockup_nmi_warn); + /* Barrier to sync with other cpus */ + smp_mb__after_atomic(); + } + if (softlockup_panic) panic("softlockup: hung tasks"); __this_cpu_write(soft_watchdog_warn, true); @@ -527,10 +566,8 @@ static void update_timers_all_cpus(void) int cpu; get_online_cpus(); - preempt_disable(); for_each_online_cpu(cpu) update_timers(cpu); - preempt_enable(); put_online_cpus(); } diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 7cfcc1b8e101..7a638aa3545b 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -930,7 +930,7 @@ config LOCKDEP bool depends on DEBUG_KERNEL && TRACE_IRQFLAGS_SUPPORT && STACKTRACE_SUPPORT && LOCKDEP_SUPPORT select STACKTRACE - select FRAME_POINTER if !MIPS && !PPC && !ARM_UNWIND && !S390 && !MICROBLAZE && !ARC + select FRAME_POINTER if !MIPS && !PPC && !ARM_UNWIND && !S390 && !MICROBLAZE && !ARC && !SCORE select KALLSYMS select KALLSYMS_ALL @@ -1408,7 +1408,7 @@ config FAULT_INJECTION_STACKTRACE_FILTER depends on FAULT_INJECTION_DEBUG_FS && STACKTRACE_SUPPORT depends on !X86_64 select STACKTRACE - select FRAME_POINTER if !MIPS && !PPC && !S390 && !MICROBLAZE && !ARM_UNWIND && !ARC + select FRAME_POINTER if !MIPS && !PPC && !S390 && !MICROBLAZE && !ARM_UNWIND && !ARC && !SCORE help Provide stacktrace filter for fault-injection capabilities diff --git a/lib/lz4/lz4_decompress.c b/lib/lz4/lz4_decompress.c index df6839e3ce08..99a03acb7d47 100644 --- a/lib/lz4/lz4_decompress.c +++ b/lib/lz4/lz4_decompress.c @@ -72,6 +72,8 @@ static int lz4_uncompress(const char *source, char *dest, int osize) len = *ip++; for (; len == 255; length += 255) len = *ip++; + if (unlikely(length > (size_t)(length + len))) + goto _output_error; length += len; } diff --git a/lib/lzo/lzo1x_decompress_safe.c b/lib/lzo/lzo1x_decompress_safe.c index 569985d522d5..8563081e8da3 100644 --- a/lib/lzo/lzo1x_decompress_safe.c +++ b/lib/lzo/lzo1x_decompress_safe.c @@ -19,11 +19,31 @@ #include <linux/lzo.h> #include "lzodefs.h" -#define HAVE_IP(x) ((size_t)(ip_end - ip) >= (size_t)(x)) -#define HAVE_OP(x) ((size_t)(op_end - op) >= (size_t)(x)) -#define NEED_IP(x) if (!HAVE_IP(x)) goto input_overrun -#define NEED_OP(x) if (!HAVE_OP(x)) goto output_overrun -#define TEST_LB(m_pos) if ((m_pos) < out) goto lookbehind_overrun +#define HAVE_IP(t, x) \ + (((size_t)(ip_end - ip) >= (size_t)(t + x)) && \ + (((t + x) >= t) && ((t + x) >= x))) + +#define HAVE_OP(t, x) \ + (((size_t)(op_end - op) >= (size_t)(t + x)) && \ + (((t + x) >= t) && ((t + x) >= x))) + +#define NEED_IP(t, x) \ + do { \ + if (!HAVE_IP(t, x)) \ + goto input_overrun; \ + } while (0) + +#define NEED_OP(t, x) \ + do { \ + if (!HAVE_OP(t, x)) \ + goto output_overrun; \ + } while (0) + +#define TEST_LB(m_pos) \ + do { \ + if ((m_pos) < out) \ + goto lookbehind_overrun; \ + } while (0) int lzo1x_decompress_safe(const unsigned char *in, size_t in_len, unsigned char *out, size_t *out_len) @@ -58,14 +78,14 @@ int lzo1x_decompress_safe(const unsigned char *in, size_t in_len, while (unlikely(*ip == 0)) { t += 255; ip++; - NEED_IP(1); + NEED_IP(1, 0); } t += 15 + *ip++; } t += 3; copy_literal_run: #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) - if (likely(HAVE_IP(t + 15) && HAVE_OP(t + 15))) { + if (likely(HAVE_IP(t, 15) && HAVE_OP(t, 15))) { const unsigned char *ie = ip + t; unsigned char *oe = op + t; do { @@ -81,8 +101,8 @@ copy_literal_run: } else #endif { - NEED_OP(t); - NEED_IP(t + 3); + NEED_OP(t, 0); + NEED_IP(t, 3); do { *op++ = *ip++; } while (--t > 0); @@ -95,7 +115,7 @@ copy_literal_run: m_pos -= t >> 2; m_pos -= *ip++ << 2; TEST_LB(m_pos); - NEED_OP(2); + NEED_OP(2, 0); op[0] = m_pos[0]; op[1] = m_pos[1]; op += 2; @@ -119,10 +139,10 @@ copy_literal_run: while (unlikely(*ip == 0)) { t += 255; ip++; - NEED_IP(1); + NEED_IP(1, 0); } t += 31 + *ip++; - NEED_IP(2); + NEED_IP(2, 0); } m_pos = op - 1; next = get_unaligned_le16(ip); @@ -137,10 +157,10 @@ copy_literal_run: while (unlikely(*ip == 0)) { t += 255; ip++; - NEED_IP(1); + NEED_IP(1, 0); } t += 7 + *ip++; - NEED_IP(2); + NEED_IP(2, 0); } next = get_unaligned_le16(ip); ip += 2; @@ -154,7 +174,7 @@ copy_literal_run: #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) if (op - m_pos >= 8) { unsigned char *oe = op + t; - if (likely(HAVE_OP(t + 15))) { + if (likely(HAVE_OP(t, 15))) { do { COPY8(op, m_pos); op += 8; @@ -164,7 +184,7 @@ copy_literal_run: m_pos += 8; } while (op < oe); op = oe; - if (HAVE_IP(6)) { + if (HAVE_IP(6, 0)) { state = next; COPY4(op, ip); op += next; @@ -172,7 +192,7 @@ copy_literal_run: continue; } } else { - NEED_OP(t); + NEED_OP(t, 0); do { *op++ = *m_pos++; } while (op < oe); @@ -181,7 +201,7 @@ copy_literal_run: #endif { unsigned char *oe = op + t; - NEED_OP(t); + NEED_OP(t, 0); op[0] = m_pos[0]; op[1] = m_pos[1]; op += 2; @@ -194,15 +214,15 @@ match_next: state = next; t = next; #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) - if (likely(HAVE_IP(6) && HAVE_OP(4))) { + if (likely(HAVE_IP(6, 0) && HAVE_OP(4, 0))) { COPY4(op, ip); op += t; ip += t; } else #endif { - NEED_IP(t + 3); - NEED_OP(t); + NEED_IP(t, 3); + NEED_OP(t, 0); while (t > 0) { *op++ = *ip++; t--; diff --git a/mm/huge_memory.c b/mm/huge_memory.c index e60837dc785c..33514d88fef9 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -941,6 +941,37 @@ unlock: spin_unlock(ptl); } +/* + * Save CONFIG_DEBUG_PAGEALLOC from faulting falsely on tail pages + * during copy_user_huge_page()'s copy_page_rep(): in the case when + * the source page gets split and a tail freed before copy completes. + * Called under pmd_lock of checked pmd, so safe from splitting itself. + */ +static void get_user_huge_page(struct page *page) +{ + if (IS_ENABLED(CONFIG_DEBUG_PAGEALLOC)) { + struct page *endpage = page + HPAGE_PMD_NR; + + atomic_add(HPAGE_PMD_NR, &page->_count); + while (++page < endpage) + get_huge_page_tail(page); + } else { + get_page(page); + } +} + +static void put_user_huge_page(struct page *page) +{ + if (IS_ENABLED(CONFIG_DEBUG_PAGEALLOC)) { + struct page *endpage = page + HPAGE_PMD_NR; + + while (page < endpage) + put_page(page++); + } else { + put_page(page); + } +} + static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, @@ -1074,7 +1105,7 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, ret |= VM_FAULT_WRITE; goto out_unlock; } - get_page(page); + get_user_huge_page(page); spin_unlock(ptl); alloc: if (transparent_hugepage_enabled(vma) && @@ -1095,7 +1126,7 @@ alloc: split_huge_page(page); ret |= VM_FAULT_FALLBACK; } - put_page(page); + put_user_huge_page(page); } count_vm_event(THP_FAULT_FALLBACK); goto out; @@ -1105,7 +1136,7 @@ alloc: put_page(new_page); if (page) { split_huge_page(page); - put_page(page); + put_user_huge_page(page); } else split_huge_page_pmd(vma, address, pmd); ret |= VM_FAULT_FALLBACK; @@ -1127,7 +1158,7 @@ alloc: spin_lock(ptl); if (page) - put_page(page); + put_user_huge_page(page); if (unlikely(!pmd_same(*pmd, orig_pmd))) { spin_unlock(ptl); mem_cgroup_uncharge_page(new_page); @@ -2392,8 +2423,6 @@ static void collapse_huge_page(struct mm_struct *mm, pmd = mm_find_pmd(mm, address); if (!pmd) goto out; - if (pmd_trans_huge(*pmd)) - goto out; anon_vma_lock_write(vma->anon_vma); @@ -2492,8 +2521,6 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, pmd = mm_find_pmd(mm, address); if (!pmd) goto out; - if (pmd_trans_huge(*pmd)) - goto out; memset(khugepaged_node_load, 0, sizeof(khugepaged_node_load)); pte = pte_offset_map_lock(mm, pmd, address, &ptl); @@ -2846,12 +2873,22 @@ void split_huge_page_pmd_mm(struct mm_struct *mm, unsigned long address, static void split_huge_page_address(struct mm_struct *mm, unsigned long address) { + pgd_t *pgd; + pud_t *pud; pmd_t *pmd; VM_BUG_ON(!(address & ~HPAGE_PMD_MASK)); - pmd = mm_find_pmd(mm, address); - if (!pmd) + pgd = pgd_offset(mm, address); + if (!pgd_present(*pgd)) + return; + + pud = pud_offset(pgd, address); + if (!pud_present(*pud)) + return; + + pmd = pmd_offset(pud, address); + if (!pmd_present(*pmd)) return; /* * Caller holds the mmap_sem write mode, so a huge pmd cannot diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 226910cb7c9b..2024bbd573d2 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2520,6 +2520,31 @@ static void set_huge_ptep_writable(struct vm_area_struct *vma, update_mmu_cache(vma, address, ptep); } +static int is_hugetlb_entry_migration(pte_t pte) +{ + swp_entry_t swp; + + if (huge_pte_none(pte) || pte_present(pte)) + return 0; + swp = pte_to_swp_entry(pte); + if (non_swap_entry(swp) && is_migration_entry(swp)) + return 1; + else + return 0; +} + +static int is_hugetlb_entry_hwpoisoned(pte_t pte) +{ + swp_entry_t swp; + + if (huge_pte_none(pte) || pte_present(pte)) + return 0; + swp = pte_to_swp_entry(pte); + if (non_swap_entry(swp) && is_hwpoison_entry(swp)) + return 1; + else + return 0; +} int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, struct vm_area_struct *vma) @@ -2559,10 +2584,26 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, dst_ptl = huge_pte_lock(h, dst, dst_pte); src_ptl = huge_pte_lockptr(h, src, src_pte); spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING); - if (!huge_pte_none(huge_ptep_get(src_pte))) { + entry = huge_ptep_get(src_pte); + if (huge_pte_none(entry)) { /* skip none entry */ + ; + } else if (unlikely(is_hugetlb_entry_migration(entry) || + is_hugetlb_entry_hwpoisoned(entry))) { + swp_entry_t swp_entry = pte_to_swp_entry(entry); + + if (is_write_migration_entry(swp_entry) && cow) { + /* + * COW mappings require pages in both + * parent and child to be set to read. + */ + make_migration_entry_read(&swp_entry); + entry = swp_entry_to_pte(swp_entry); + set_huge_pte_at(src, addr, src_pte, entry); + } + set_huge_pte_at(dst, addr, dst_pte, entry); + } else { if (cow) huge_ptep_set_wrprotect(src, addr, src_pte); - entry = huge_ptep_get(src_pte); ptepage = pte_page(entry); get_page(ptepage); page_dup_rmap(ptepage); @@ -2578,32 +2619,6 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, return ret; } -static int is_hugetlb_entry_migration(pte_t pte) -{ - swp_entry_t swp; - - if (huge_pte_none(pte) || pte_present(pte)) - return 0; - swp = pte_to_swp_entry(pte); - if (non_swap_entry(swp) && is_migration_entry(swp)) - return 1; - else - return 0; -} - -static int is_hugetlb_entry_hwpoisoned(pte_t pte) -{ - swp_entry_t swp; - - if (huge_pte_none(pte) || pte_present(pte)) - return 0; - swp = pte_to_swp_entry(pte); - if (non_swap_entry(swp) && is_hwpoison_entry(swp)) - return 1; - else - return 0; -} - void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, unsigned long start, unsigned long end, struct page *ref_page) @@ -945,7 +945,6 @@ static int replace_page(struct vm_area_struct *vma, struct page *page, pmd = mm_find_pmd(mm, addr); if (!pmd) goto out; - BUG_ON(pmd_trans_huge(*pmd)); mmun_start = addr; mmun_end = addr + PAGE_SIZE; diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 284974230459..eb58de19f815 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -656,19 +656,18 @@ static unsigned long change_prot_numa(struct vm_area_struct *vma, * @nodes and @flags,) it's isolated and queued to the pagelist which is * passed via @private.) */ -static struct vm_area_struct * +static int queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end, const nodemask_t *nodes, unsigned long flags, void *private) { - int err; - struct vm_area_struct *first, *vma, *prev; - + int err = 0; + struct vm_area_struct *vma, *prev; - first = find_vma(mm, start); - if (!first) - return ERR_PTR(-EFAULT); + vma = find_vma(mm, start); + if (!vma) + return -EFAULT; prev = NULL; - for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) { + for (; vma && vma->vm_start < end; vma = vma->vm_next) { unsigned long endvma = vma->vm_end; if (endvma > end) @@ -678,9 +677,9 @@ queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end, if (!(flags & MPOL_MF_DISCONTIG_OK)) { if (!vma->vm_next && vma->vm_end < end) - return ERR_PTR(-EFAULT); + return -EFAULT; if (prev && prev->vm_end < vma->vm_start) - return ERR_PTR(-EFAULT); + return -EFAULT; } if (flags & MPOL_MF_LAZY) { @@ -694,15 +693,13 @@ queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end, err = queue_pages_pgd_range(vma, start, endvma, nodes, flags, private); - if (err) { - first = ERR_PTR(err); + if (err) break; - } } next: prev = vma; } - return first; + return err; } /* @@ -1156,16 +1153,17 @@ out: /* * Allocate a new page for page migration based on vma policy. - * Start assuming that page is mapped by vma pointed to by @private. + * Start by assuming the page is mapped by the same vma as contains @start. * Search forward from there, if not. N.B., this assumes that the * list of pages handed to migrate_pages()--which is how we get here-- * is in virtual address order. */ -static struct page *new_vma_page(struct page *page, unsigned long private, int **x) +static struct page *new_page(struct page *page, unsigned long start, int **x) { - struct vm_area_struct *vma = (struct vm_area_struct *)private; + struct vm_area_struct *vma; unsigned long uninitialized_var(address); + vma = find_vma(current->mm, start); while (vma) { address = page_address_in_vma(page, vma); if (address != -EFAULT) @@ -1195,7 +1193,7 @@ int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from, return -ENOSYS; } -static struct page *new_vma_page(struct page *page, unsigned long private, int **x) +static struct page *new_page(struct page *page, unsigned long start, int **x) { return NULL; } @@ -1205,7 +1203,6 @@ static long do_mbind(unsigned long start, unsigned long len, unsigned short mode, unsigned short mode_flags, nodemask_t *nmask, unsigned long flags) { - struct vm_area_struct *vma; struct mm_struct *mm = current->mm; struct mempolicy *new; unsigned long end; @@ -1271,11 +1268,9 @@ static long do_mbind(unsigned long start, unsigned long len, if (err) goto mpol_out; - vma = queue_pages_range(mm, start, end, nmask, + err = queue_pages_range(mm, start, end, nmask, flags | MPOL_MF_INVERT, &pagelist); - - err = PTR_ERR(vma); /* maybe ... */ - if (!IS_ERR(vma)) + if (!err) err = mbind_range(mm, start, end, new); if (!err) { @@ -1283,9 +1278,8 @@ static long do_mbind(unsigned long start, unsigned long len, if (!list_empty(&pagelist)) { WARN_ON_ONCE(flags & MPOL_MF_LAZY); - nr_failed = migrate_pages(&pagelist, new_vma_page, - NULL, (unsigned long)vma, - MIGRATE_SYNC, MR_MEMPOLICY_MBIND); + nr_failed = migrate_pages(&pagelist, new_page, NULL, + start, MIGRATE_SYNC, MR_MEMPOLICY_MBIND); if (nr_failed) putback_movable_pages(&pagelist); } diff --git a/mm/migrate.c b/mm/migrate.c index 63f0cd559999..9e0beaa91845 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -120,8 +120,6 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma, pmd = mm_find_pmd(mm, addr); if (!pmd) goto out; - if (pmd_trans_huge(*pmd)) - goto out; ptep = pte_offset_map(pmd, addr); diff --git a/mm/nommu.c b/mm/nommu.c index b78e3a8f5ee7..4a852f6c5709 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -786,7 +786,7 @@ static void delete_vma_from_mm(struct vm_area_struct *vma) for (i = 0; i < VMACACHE_SIZE; i++) { /* if the vma is cached, invalidate the entire cache */ if (curr->vmacache[i] == vma) { - vmacache_invalidate(curr->mm); + vmacache_invalidate(mm); break; } } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 4f59fa29eda8..20d17f8266fe 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -69,6 +69,7 @@ /* prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields */ static DEFINE_MUTEX(pcp_batch_high_lock); +#define MIN_PERCPU_PAGELIST_FRACTION (8) #ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID DEFINE_PER_CPU(int, numa_node); @@ -4145,7 +4146,7 @@ static void __meminit zone_init_free_lists(struct zone *zone) memmap_init_zone((size), (nid), (zone), (start_pfn), MEMMAP_EARLY) #endif -static int __meminit zone_batchsize(struct zone *zone) +static int zone_batchsize(struct zone *zone) { #ifdef CONFIG_MMU int batch; @@ -4261,8 +4262,8 @@ static void pageset_set_high(struct per_cpu_pageset *p, pageset_update(&p->pcp, high, batch); } -static void __meminit pageset_set_high_and_batch(struct zone *zone, - struct per_cpu_pageset *pcp) +static void pageset_set_high_and_batch(struct zone *zone, + struct per_cpu_pageset *pcp) { if (percpu_pagelist_fraction) pageset_set_high(pcp, @@ -5881,23 +5882,38 @@ int percpu_pagelist_fraction_sysctl_handler(struct ctl_table *table, int write, void __user *buffer, size_t *length, loff_t *ppos) { struct zone *zone; - unsigned int cpu; + int old_percpu_pagelist_fraction; int ret; + mutex_lock(&pcp_batch_high_lock); + old_percpu_pagelist_fraction = percpu_pagelist_fraction; + ret = proc_dointvec_minmax(table, write, buffer, length, ppos); - if (!write || (ret < 0)) - return ret; + if (!write || ret < 0) + goto out; + + /* Sanity checking to avoid pcp imbalance */ + if (percpu_pagelist_fraction && + percpu_pagelist_fraction < MIN_PERCPU_PAGELIST_FRACTION) { + percpu_pagelist_fraction = old_percpu_pagelist_fraction; + ret = -EINVAL; + goto out; + } + + /* No change? */ + if (percpu_pagelist_fraction == old_percpu_pagelist_fraction) + goto out; - mutex_lock(&pcp_batch_high_lock); for_each_populated_zone(zone) { - unsigned long high; - high = zone->managed_pages / percpu_pagelist_fraction; + unsigned int cpu; + for_each_possible_cpu(cpu) - pageset_set_high(per_cpu_ptr(zone->pageset, cpu), - high); + pageset_set_high_and_batch(zone, + per_cpu_ptr(zone->pageset, cpu)); } +out: mutex_unlock(&pcp_batch_high_lock); - return 0; + return ret; } int hashdist = HASHDIST_DEFAULT; diff --git a/mm/rmap.c b/mm/rmap.c index bf05fc872ae8..b7e94ebbd09e 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -569,6 +569,7 @@ pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address) pgd_t *pgd; pud_t *pud; pmd_t *pmd = NULL; + pmd_t pmde; pgd = pgd_offset(mm, address); if (!pgd_present(*pgd)) @@ -579,7 +580,13 @@ pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address) goto out; pmd = pmd_offset(pud, address); - if (!pmd_present(*pmd)) + /* + * Some THP functions use the sequence pmdp_clear_flush(), set_pmd_at() + * without holding anon_vma lock for write. So when looking for a + * genuine pmde (in which to find pte), test present and !THP together. + */ + pmde = ACCESS_ONCE(*pmd); + if (!pmd_present(pmde) || pmd_trans_huge(pmde)) pmd = NULL; out: return pmd; @@ -615,9 +622,6 @@ pte_t *__page_check_address(struct page *page, struct mm_struct *mm, if (!pmd) return NULL; - if (pmd_trans_huge(*pmd)) - return NULL; - pte = pte_offset_map(pmd, address); /* Make a quick check before getting the lock */ if (!sync && !pte_present(*pte)) { diff --git a/mm/shmem.c b/mm/shmem.c index f484c276e994..8f419cff9e34 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -80,11 +80,12 @@ static struct vfsmount *shm_mnt; #define SHORT_SYMLINK_LEN 128 /* - * shmem_fallocate and shmem_writepage communicate via inode->i_private - * (with i_mutex making sure that it has only one user at a time): - * we would prefer not to enlarge the shmem inode just for that. + * shmem_fallocate communicates with shmem_fault or shmem_writepage via + * inode->i_private (with i_mutex making sure that it has only one user at + * a time): we would prefer not to enlarge the shmem inode just for that. */ struct shmem_falloc { + int mode; /* FALLOC_FL mode currently operating */ pgoff_t start; /* start of range currently being fallocated */ pgoff_t next; /* the next page offset to be fallocated */ pgoff_t nr_falloced; /* how many new pages have been fallocated */ @@ -759,6 +760,7 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) spin_lock(&inode->i_lock); shmem_falloc = inode->i_private; if (shmem_falloc && + !shmem_falloc->mode && index >= shmem_falloc->start && index < shmem_falloc->next) shmem_falloc->nr_unswapped++; @@ -1233,6 +1235,44 @@ static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf) int error; int ret = VM_FAULT_LOCKED; + /* + * Trinity finds that probing a hole which tmpfs is punching can + * prevent the hole-punch from ever completing: which in turn + * locks writers out with its hold on i_mutex. So refrain from + * faulting pages into the hole while it's being punched, and + * wait on i_mutex to be released if vmf->flags permits. + */ + if (unlikely(inode->i_private)) { + struct shmem_falloc *shmem_falloc; + + spin_lock(&inode->i_lock); + shmem_falloc = inode->i_private; + if (!shmem_falloc || + shmem_falloc->mode != FALLOC_FL_PUNCH_HOLE || + vmf->pgoff < shmem_falloc->start || + vmf->pgoff >= shmem_falloc->next) + shmem_falloc = NULL; + spin_unlock(&inode->i_lock); + /* + * i_lock has protected us from taking shmem_falloc seriously + * once return from shmem_fallocate() went back up that stack. + * i_lock does not serialize with i_mutex at all, but it does + * not matter if sometimes we wait unnecessarily, or sometimes + * miss out on waiting: we just need to make those cases rare. + */ + if (shmem_falloc) { + if ((vmf->flags & FAULT_FLAG_ALLOW_RETRY) && + !(vmf->flags & FAULT_FLAG_RETRY_NOWAIT)) { + up_read(&vma->vm_mm->mmap_sem); + mutex_lock(&inode->i_mutex); + mutex_unlock(&inode->i_mutex); + return VM_FAULT_RETRY; + } + /* cond_resched? Leave that to GUP or return to user */ + return VM_FAULT_NOPAGE; + } + } + error = shmem_getpage(inode, vmf->pgoff, &vmf->page, SGP_CACHE, &ret); if (error) return ((error == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS); @@ -1724,20 +1764,31 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset, pgoff_t start, index, end; int error; + if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) + return -EOPNOTSUPP; + mutex_lock(&inode->i_mutex); + shmem_falloc.mode = mode & ~FALLOC_FL_KEEP_SIZE; + if (mode & FALLOC_FL_PUNCH_HOLE) { struct address_space *mapping = file->f_mapping; loff_t unmap_start = round_up(offset, PAGE_SIZE); loff_t unmap_end = round_down(offset + len, PAGE_SIZE) - 1; + shmem_falloc.start = unmap_start >> PAGE_SHIFT; + shmem_falloc.next = (unmap_end + 1) >> PAGE_SHIFT; + spin_lock(&inode->i_lock); + inode->i_private = &shmem_falloc; + spin_unlock(&inode->i_lock); + if ((u64)unmap_end > (u64)unmap_start) unmap_mapping_range(mapping, unmap_start, 1 + unmap_end - unmap_start, 0); shmem_truncate_range(inode, offset, offset + len - 1); /* No need to unmap again: hole-punching leaves COWed pages */ error = 0; - goto out; + goto undone; } /* We need to check rlimit even when FALLOC_FL_KEEP_SIZE */ diff --git a/mm/slab.c b/mm/slab.c index 9ca3b87edabc..3070b929a1bf 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -386,6 +386,39 @@ static void **dbg_userword(struct kmem_cache *cachep, void *objp) #endif +#define OBJECT_FREE (0) +#define OBJECT_ACTIVE (1) + +#ifdef CONFIG_DEBUG_SLAB_LEAK + +static void set_obj_status(struct page *page, int idx, int val) +{ + int freelist_size; + char *status; + struct kmem_cache *cachep = page->slab_cache; + + freelist_size = cachep->num * sizeof(freelist_idx_t); + status = (char *)page->freelist + freelist_size; + status[idx] = val; +} + +static inline unsigned int get_obj_status(struct page *page, int idx) +{ + int freelist_size; + char *status; + struct kmem_cache *cachep = page->slab_cache; + + freelist_size = cachep->num * sizeof(freelist_idx_t); + status = (char *)page->freelist + freelist_size; + + return status[idx]; +} + +#else +static inline void set_obj_status(struct page *page, int idx, int val) {} + +#endif + /* * Do not go above this order unless 0 objects fit into the slab or * overridden on the command line. @@ -576,12 +609,30 @@ static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep) return cachep->array[smp_processor_id()]; } +static size_t calculate_freelist_size(int nr_objs, size_t align) +{ + size_t freelist_size; + + freelist_size = nr_objs * sizeof(freelist_idx_t); + if (IS_ENABLED(CONFIG_DEBUG_SLAB_LEAK)) + freelist_size += nr_objs * sizeof(char); + + if (align) + freelist_size = ALIGN(freelist_size, align); + + return freelist_size; +} + static int calculate_nr_objs(size_t slab_size, size_t buffer_size, size_t idx_size, size_t align) { int nr_objs; + size_t remained_size; size_t freelist_size; + int extra_space = 0; + if (IS_ENABLED(CONFIG_DEBUG_SLAB_LEAK)) + extra_space = sizeof(char); /* * Ignore padding for the initial guess. The padding * is at most @align-1 bytes, and @buffer_size is at @@ -590,14 +641,15 @@ static int calculate_nr_objs(size_t slab_size, size_t buffer_size, * into the memory allocation when taking the padding * into account. */ - nr_objs = slab_size / (buffer_size + idx_size); + nr_objs = slab_size / (buffer_size + idx_size + extra_space); /* * This calculated number will be either the right * amount, or one greater than what we want. */ - freelist_size = slab_size - nr_objs * buffer_size; - if (freelist_size < ALIGN(nr_objs * idx_size, align)) + remained_size = slab_size - nr_objs * buffer_size; + freelist_size = calculate_freelist_size(nr_objs, align); + if (remained_size < freelist_size) nr_objs--; return nr_objs; @@ -635,7 +687,7 @@ static void cache_estimate(unsigned long gfporder, size_t buffer_size, } else { nr_objs = calculate_nr_objs(slab_size, buffer_size, sizeof(freelist_idx_t), align); - mgmt_size = ALIGN(nr_objs * sizeof(freelist_idx_t), align); + mgmt_size = calculate_freelist_size(nr_objs, align); } *num = nr_objs; *left_over = slab_size - nr_objs*buffer_size - mgmt_size; @@ -2041,13 +2093,16 @@ static size_t calculate_slab_order(struct kmem_cache *cachep, break; if (flags & CFLGS_OFF_SLAB) { + size_t freelist_size_per_obj = sizeof(freelist_idx_t); /* * Max number of objs-per-slab for caches which * use off-slab slabs. Needed to avoid a possible * looping condition in cache_grow(). */ + if (IS_ENABLED(CONFIG_DEBUG_SLAB_LEAK)) + freelist_size_per_obj += sizeof(char); offslab_limit = size; - offslab_limit /= sizeof(freelist_idx_t); + offslab_limit /= freelist_size_per_obj; if (num > offslab_limit) break; @@ -2294,8 +2349,7 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags) if (!cachep->num) return -E2BIG; - freelist_size = - ALIGN(cachep->num * sizeof(freelist_idx_t), cachep->align); + freelist_size = calculate_freelist_size(cachep->num, cachep->align); /* * If the slab has been placed off-slab, and we have enough space then @@ -2308,7 +2362,7 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags) if (flags & CFLGS_OFF_SLAB) { /* really off slab. No need for manual alignment */ - freelist_size = cachep->num * sizeof(freelist_idx_t); + freelist_size = calculate_freelist_size(cachep->num, 0); #ifdef CONFIG_PAGE_POISONING /* If we're going to use the generic kernel_map_pages() @@ -2612,6 +2666,7 @@ static void cache_init_objs(struct kmem_cache *cachep, if (cachep->ctor) cachep->ctor(objp); #endif + set_obj_status(page, i, OBJECT_FREE); set_free_obj(page, i, i); } } @@ -2820,6 +2875,7 @@ static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp, BUG_ON(objnr >= cachep->num); BUG_ON(objp != index_to_obj(cachep, page, objnr)); + set_obj_status(page, objnr, OBJECT_FREE); if (cachep->flags & SLAB_POISON) { #ifdef CONFIG_DEBUG_PAGEALLOC if ((cachep->size % PAGE_SIZE)==0 && OFF_SLAB(cachep)) { @@ -2953,6 +3009,8 @@ static inline void cache_alloc_debugcheck_before(struct kmem_cache *cachep, static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep, gfp_t flags, void *objp, unsigned long caller) { + struct page *page; + if (!objp) return objp; if (cachep->flags & SLAB_POISON) { @@ -2983,6 +3041,9 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep, *dbg_redzone1(cachep, objp) = RED_ACTIVE; *dbg_redzone2(cachep, objp) = RED_ACTIVE; } + + page = virt_to_head_page(objp); + set_obj_status(page, obj_to_index(cachep, page, objp), OBJECT_ACTIVE); objp += obj_offset(cachep); if (cachep->ctor && cachep->flags & SLAB_POISON) cachep->ctor(objp); @@ -4219,21 +4280,12 @@ static void handle_slab(unsigned long *n, struct kmem_cache *c, struct page *page) { void *p; - int i, j; + int i; if (n[0] == n[1]) return; for (i = 0, p = page->s_mem; i < c->num; i++, p += c->size) { - bool active = true; - - for (j = page->active; j < c->num; j++) { - /* Skip freed item */ - if (get_free_obj(page, j) == i) { - active = false; - break; - } - } - if (!active) + if (get_obj_status(page, i) != OBJECT_ACTIVE) continue; if (!add_caller(n, (unsigned long)*dbg_userword(c, p))) diff --git a/samples/trace_events/trace-events-sample.h b/samples/trace_events/trace-events-sample.h index 6af373236d73..4b0113f73ee9 100644 --- a/samples/trace_events/trace-events-sample.h +++ b/samples/trace_events/trace-events-sample.h @@ -56,7 +56,8 @@ * struct: This defines the way the data will be stored in the ring buffer. * There are currently two types of elements. __field and __array. * a __field is broken up into (type, name). Where type can be any - * type but an array. + * primitive type (integer, long or pointer). __field_struct() can + * be any static complex data value (struct, union, but not an array). * For an array. there are three fields. (type, name, size). The * type of elements in the array, the name of the field and the size * of the array. diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 010b18ef4ea0..182be0f12407 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -3476,12 +3476,17 @@ sub process { } } -# unnecessary return in a void function? (a single leading tab, then return;) - if ($sline =~ /^\+\treturn\s*;\s*$/ && - $prevline =~ /^\+/) { +# unnecessary return in a void function +# at end-of-function, with the previous line a single leading tab, then return; +# and the line before that not a goto label target like "out:" + if ($sline =~ /^[ \+]}\s*$/ && + $prevline =~ /^\+\treturn\s*;\s*$/ && + $linenr >= 3 && + $lines[$linenr - 3] =~ /^[ +]/ && + $lines[$linenr - 3] !~ /^[ +]\s*$Ident\s*:/) { WARN("RETURN_VOID", - "void function return statements are not generally useful\n" . $herecurr); - } + "void function return statements are not generally useful\n" . $hereprev); + } # if statements using unnecessary parentheses - ie: if ((foo == bar)) if ($^V && $^V ge 5.10.0 && diff --git a/tools/testing/selftests/powerpc/tm/Makefile b/tools/testing/selftests/powerpc/tm/Makefile index 51267f4184a6..2cede239a074 100644 --- a/tools/testing/selftests/powerpc/tm/Makefile +++ b/tools/testing/selftests/powerpc/tm/Makefile @@ -2,7 +2,7 @@ PROGS := tm-resched-dscr all: $(PROGS) -$(PROGS): +$(PROGS): ../harness.c run_tests: all @-for PROG in $(PROGS); do \ diff --git a/tools/testing/selftests/powerpc/tm/tm-resched-dscr.c b/tools/testing/selftests/powerpc/tm/tm-resched-dscr.c index ee98e3886af2..42d4c8caad81 100644 --- a/tools/testing/selftests/powerpc/tm/tm-resched-dscr.c +++ b/tools/testing/selftests/powerpc/tm/tm-resched-dscr.c @@ -28,6 +28,8 @@ #include <assert.h> #include <asm/tm.h> +#include "utils.h" + #define TBEGIN ".long 0x7C00051D ;" #define TEND ".long 0x7C00055D ;" #define TCHECK ".long 0x7C00059C ;" @@ -36,7 +38,8 @@ #define SPRN_TEXASR 0x82 #define SPRN_DSCR 0x03 -int main(void) { +int test_body(void) +{ uint64_t rv, dscr1 = 1, dscr2, texasr; printf("Check DSCR TM context switch: "); @@ -81,10 +84,15 @@ int main(void) { } if (dscr2 != dscr1) { printf(" FAIL\n"); - exit(EXIT_FAILURE); + return 1; } else { printf(" OK\n"); - exit(EXIT_SUCCESS); + return 0; } } } + +int main(void) +{ + return test_harness(test_body, "tm_resched_dscr"); +} |