diff options
Diffstat (limited to 'arch')
163 files changed, 6532 insertions, 2214 deletions
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 97a8bc8a095c..7401e921aad0 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -22,6 +22,48 @@ config MMU bool default y +config ARCH_MMAP_RND_BITS_MAX + # On Book3S 64, the default virtual address space for 64-bit processes + # is 2^47 (128TB). As a maximum, allow randomisation to consume up to + # 32T of address space (2^45), which should ensure a reasonable gap + # between bottom-up and top-down allocations for applications that + # consume "normal" amounts of address space. Book3S 64 only supports 64K + # and 4K page sizes. + default 29 if PPC_BOOK3S_64 && PPC_64K_PAGES # 29 = 45 (32T) - 16 (64K) + default 33 if PPC_BOOK3S_64 # 33 = 45 (32T) - 12 (4K) + # + # On all other 64-bit platforms (currently only Book3E), the virtual + # address space is 2^46 (64TB). Allow randomisation to consume up to 16T + # of address space (2^44). Only 4K page sizes are supported. + default 32 if 64BIT # 32 = 44 (16T) - 12 (4K) + # + # For 32-bit, use the compat values, as they're the same. + default ARCH_MMAP_RND_COMPAT_BITS_MAX + +config ARCH_MMAP_RND_BITS_MIN + # Allow randomisation to consume up to 1GB of address space (2^30). + default 14 if 64BIT && PPC_64K_PAGES # 14 = 30 (1GB) - 16 (64K) + default 18 if 64BIT # 18 = 30 (1GB) - 12 (4K) + # + # For 32-bit, use the compat values, as they're the same. + default ARCH_MMAP_RND_COMPAT_BITS_MIN + +config ARCH_MMAP_RND_COMPAT_BITS_MAX + # Total virtual address space for 32-bit processes is 2^31 (2GB). + # Allow randomisation to consume up to 512MB of address space (2^29). + default 11 if PPC_256K_PAGES # 11 = 29 (512MB) - 18 (256K) + default 13 if PPC_64K_PAGES # 13 = 29 (512MB) - 16 (64K) + default 15 if PPC_16K_PAGES # 15 = 29 (512MB) - 14 (16K) + default 17 # 17 = 29 (512MB) - 12 (4K) + +config ARCH_MMAP_RND_COMPAT_BITS_MIN + # Total virtual address space for 32-bit processes is 2^31 (2GB). + # Allow randomisation to consume up to 8MB of address space (2^23). + default 5 if PPC_256K_PAGES # 5 = 23 (8MB) - 18 (256K) + default 7 if PPC_64K_PAGES # 7 = 23 (8MB) - 16 (64K) + default 9 if PPC_16K_PAGES # 9 = 23 (8MB) - 14 (16K) + default 11 # 11 = 23 (8MB) - 12 (4K) + config HAVE_SETUP_PER_CPU_AREA def_bool PPC64 @@ -120,6 +162,8 @@ config PPC select HAVE_ARCH_HARDENED_USERCOPY select HAVE_ARCH_JUMP_LABEL select HAVE_ARCH_KGDB + select HAVE_ARCH_MMAP_RND_BITS + select HAVE_ARCH_MMAP_RND_COMPAT_BITS if COMPAT select HAVE_ARCH_SECCOMP_FILTER select HAVE_ARCH_TRACEHOOK select HAVE_CBPF_JIT if !PPC64 @@ -142,6 +186,7 @@ config PPC select HAVE_IRQ_EXIT_ON_IRQ_STACK select HAVE_KERNEL_GZIP select HAVE_KPROBES + select HAVE_KPROBES_ON_FTRACE select HAVE_KRETPROBES select HAVE_LIVEPATCH if HAVE_DYNAMIC_FTRACE_WITH_REGS select HAVE_MEMBLOCK @@ -586,7 +631,7 @@ config ARCH_SPARSEMEM_ENABLE config ARCH_SPARSEMEM_DEFAULT def_bool y - depends on (SMP && PPC_PSERIES) || PPC_PS3 + depends on PPC_BOOK3S_64 config SYS_SUPPORTS_HUGETLBFS bool @@ -678,6 +723,16 @@ config PPC_256K_PAGES endchoice +config THREAD_SHIFT + int "Thread shift" if EXPERT + range 13 15 + default "15" if PPC_256K_PAGES + default "14" if PPC64 + default "13" + help + Used to define the stack size. The default is almost always what you + want. Only change this if you know what you are doing. + config FORCE_MAX_ZONEORDER int "Maximum zone order" range 8 9 if PPC64 && PPC_64K_PAGES diff --git a/arch/powerpc/configs/powernv_defconfig b/arch/powerpc/configs/powernv_defconfig index ac8b8332ed82..0695ce047d56 100644 --- a/arch/powerpc/configs/powernv_defconfig +++ b/arch/powerpc/configs/powernv_defconfig @@ -33,7 +33,7 @@ CONFIG_BLK_DEV_INITRD=y CONFIG_BPF_SYSCALL=y # CONFIG_COMPAT_BRK is not set CONFIG_PROFILING=y -CONFIG_OPROFILE=y +CONFIG_OPROFILE=m CONFIG_KPROBES=y CONFIG_JUMP_LABEL=y CONFIG_MODULES=y @@ -261,7 +261,7 @@ CONFIG_NILFS2_FS=m CONFIG_AUTOFS4_FS=m CONFIG_FUSE_FS=m CONFIG_OVERLAY_FS=m -CONFIG_ISO9660_FS=m +CONFIG_ISO9660_FS=y CONFIG_UDF_FS=m CONFIG_MSDOS_FS=y CONFIG_VFAT_FS=m @@ -306,7 +306,7 @@ CONFIG_CRYPTO_TEST=m CONFIG_CRYPTO_CCM=m CONFIG_CRYPTO_PCBC=m CONFIG_CRYPTO_HMAC=y -CONFIG_CRYPT_CRC32C_VPMSUM=m +CONFIG_CRYPTO_CRC32C_VPMSUM=m CONFIG_CRYPTO_MD5_PPC=m CONFIG_CRYPTO_MICHAEL_MIC=m CONFIG_CRYPTO_SHA256=y diff --git a/arch/powerpc/configs/ppc64_defconfig b/arch/powerpc/configs/ppc64_defconfig index 4f1288b04303..e353168f98a7 100644 --- a/arch/powerpc/configs/ppc64_defconfig +++ b/arch/powerpc/configs/ppc64_defconfig @@ -19,7 +19,7 @@ CONFIG_BLK_DEV_INITRD=y CONFIG_BPF_SYSCALL=y # CONFIG_COMPAT_BRK is not set CONFIG_PROFILING=y -CONFIG_OPROFILE=y +CONFIG_OPROFILE=m CONFIG_KPROBES=y CONFIG_JUMP_LABEL=y CONFIG_MODULES=y @@ -291,7 +291,7 @@ CONFIG_NILFS2_FS=m CONFIG_AUTOFS4_FS=m CONFIG_FUSE_FS=m CONFIG_OVERLAY_FS=m -CONFIG_ISO9660_FS=m +CONFIG_ISO9660_FS=y CONFIG_UDF_FS=m CONFIG_MSDOS_FS=y CONFIG_VFAT_FS=m @@ -340,7 +340,7 @@ CONFIG_PPC_EARLY_DEBUG=y CONFIG_CRYPTO_TEST=m CONFIG_CRYPTO_PCBC=m CONFIG_CRYPTO_HMAC=y -CONFIG_CRYPT_CRC32C_VPMSUM=m +CONFIG_CRYPTO_CRC32C_VPMSUM=m CONFIG_CRYPTO_MD5_PPC=m CONFIG_CRYPTO_MICHAEL_MIC=m CONFIG_CRYPTO_SHA256=y diff --git a/arch/powerpc/configs/pseries_defconfig b/arch/powerpc/configs/pseries_defconfig index 4ff68b752618..1a61aa20dfba 100644 --- a/arch/powerpc/configs/pseries_defconfig +++ b/arch/powerpc/configs/pseries_defconfig @@ -34,7 +34,7 @@ CONFIG_BLK_DEV_INITRD=y CONFIG_BPF_SYSCALL=y # CONFIG_COMPAT_BRK is not set CONFIG_PROFILING=y -CONFIG_OPROFILE=y +CONFIG_OPROFILE=m CONFIG_KPROBES=y CONFIG_JUMP_LABEL=y CONFIG_MODULES=y @@ -259,7 +259,7 @@ CONFIG_NILFS2_FS=m CONFIG_AUTOFS4_FS=m CONFIG_FUSE_FS=m CONFIG_OVERLAY_FS=m -CONFIG_ISO9660_FS=m +CONFIG_ISO9660_FS=y CONFIG_UDF_FS=m CONFIG_MSDOS_FS=y CONFIG_VFAT_FS=m @@ -303,7 +303,7 @@ CONFIG_XMON=y CONFIG_CRYPTO_TEST=m CONFIG_CRYPTO_PCBC=m CONFIG_CRYPTO_HMAC=y -CONFIG_CRYPT_CRC32C_VPMSUM=m +CONFIG_CRYPTO_CRC32C_VPMSUM=m CONFIG_CRYPTO_MD5_PPC=m CONFIG_CRYPTO_MICHAEL_MIC=m CONFIG_CRYPTO_SHA256=y diff --git a/arch/powerpc/include/asm/asm-prototypes.h b/arch/powerpc/include/asm/asm-prototypes.h index f6c5264287e5..7330150bfe34 100644 --- a/arch/powerpc/include/asm/asm-prototypes.h +++ b/arch/powerpc/include/asm/asm-prototypes.h @@ -17,6 +17,8 @@ #include <asm/checksum.h> #include <linux/uaccess.h> #include <asm/epapr_hcalls.h> +#include <asm/dcr.h> +#include <asm/mmu_context.h> #include <uapi/asm/ucontext.h> @@ -120,6 +122,8 @@ extern s64 __ashrdi3(s64, int); extern int __cmpdi2(s64, s64); extern int __ucmpdi2(u64, u64); +/* tracing */ void _mcount(void); +unsigned long prepare_ftrace_return(unsigned long parent, unsigned long ip); #endif /* _ASM_POWERPC_ASM_PROTOTYPES_H */ diff --git a/arch/powerpc/include/asm/bitops.h b/arch/powerpc/include/asm/bitops.h index bc5fdfd22788..33a24fdd7958 100644 --- a/arch/powerpc/include/asm/bitops.h +++ b/arch/powerpc/include/asm/bitops.h @@ -55,6 +55,14 @@ #define PPC_BITEXTRACT(bits, ppc_bit, dst_bit) \ ((((bits) >> PPC_BITLSHIFT(ppc_bit)) & 1) << (dst_bit)) +#define PPC_BITLSHIFT32(be) (32 - 1 - (be)) +#define PPC_BIT32(bit) (1UL << PPC_BITLSHIFT32(bit)) +#define PPC_BITMASK32(bs, be) ((PPC_BIT32(bs) - PPC_BIT32(be))|PPC_BIT32(bs)) + +#define PPC_BITLSHIFT8(be) (8 - 1 - (be)) +#define PPC_BIT8(bit) (1UL << PPC_BITLSHIFT8(bit)) +#define PPC_BITMASK8(bs, be) ((PPC_BIT8(bs) - PPC_BIT8(be))|PPC_BIT8(bs)) + #include <asm/barrier.h> /* Macro for generating the ***_bits() functions */ diff --git a/arch/powerpc/include/asm/book3s/64/hash-4k.h b/arch/powerpc/include/asm/book3s/64/hash-4k.h index 0c4e470571ca..b4b5e6b671ca 100644 --- a/arch/powerpc/include/asm/book3s/64/hash-4k.h +++ b/arch/powerpc/include/asm/book3s/64/hash-4k.h @@ -8,7 +8,7 @@ #define H_PTE_INDEX_SIZE 9 #define H_PMD_INDEX_SIZE 7 #define H_PUD_INDEX_SIZE 9 -#define H_PGD_INDEX_SIZE 9 +#define H_PGD_INDEX_SIZE 12 #ifndef __ASSEMBLY__ #define H_PTE_TABLE_SIZE (sizeof(pte_t) << H_PTE_INDEX_SIZE) diff --git a/arch/powerpc/include/asm/book3s/64/hash-64k.h b/arch/powerpc/include/asm/book3s/64/hash-64k.h index f3dd21efa2ea..214219dff87c 100644 --- a/arch/powerpc/include/asm/book3s/64/hash-64k.h +++ b/arch/powerpc/include/asm/book3s/64/hash-64k.h @@ -4,10 +4,14 @@ #define H_PTE_INDEX_SIZE 8 #define H_PMD_INDEX_SIZE 5 #define H_PUD_INDEX_SIZE 5 -#define H_PGD_INDEX_SIZE 12 +#define H_PGD_INDEX_SIZE 15 -#define H_PAGE_COMBO 0x00001000 /* this is a combo 4k page */ -#define H_PAGE_4K_PFN 0x00002000 /* PFN is for a single 4k page */ +/* + * 64k aligned address free up few of the lower bits of RPN for us + * We steal that here. For more deatils look at pte_pfn/pfn_pte() + */ +#define H_PAGE_COMBO _RPAGE_RPN0 /* this is a combo 4k page */ +#define H_PAGE_4K_PFN _RPAGE_RPN1 /* PFN is for a single 4k page */ /* * We need to differentiate between explicit huge page and THP huge * page, since THP huge page also need to track real subpage details diff --git a/arch/powerpc/include/asm/book3s/64/hash.h b/arch/powerpc/include/asm/book3s/64/hash.h index f7b721bbf918..4e957b027fe0 100644 --- a/arch/powerpc/include/asm/book3s/64/hash.h +++ b/arch/powerpc/include/asm/book3s/64/hash.h @@ -6,19 +6,13 @@ * Common bits between 4K and 64K pages in a linux-style PTE. * Additional bits may be defined in pgtable-hash64-*.h * - * Note: We only support user read/write permissions. Supervisor always - * have full read/write to pages above PAGE_OFFSET (pages below that - * always use the user access permissions). - * - * We could create separate kernel read-only if we used the 3 PP bits - * combinations that newer processors provide but we currently don't. */ -#define H_PAGE_BUSY 0x00800 /* software: PTE & hash are busy */ #define H_PTE_NONE_MASK _PAGE_HPTEFLAGS -#define H_PAGE_F_GIX_SHIFT 57 -#define H_PAGE_F_GIX (7ul << 57) /* HPTE index within HPTEG */ -#define H_PAGE_F_SECOND (1ul << 60) /* HPTE is in 2ndary HPTEG */ -#define H_PAGE_HASHPTE (1ul << 61) /* PTE has associated HPTE */ +#define H_PAGE_F_GIX_SHIFT 56 +#define H_PAGE_BUSY _RPAGE_RSV1 /* software: PTE & hash are busy */ +#define H_PAGE_F_SECOND _RPAGE_RSV2 /* HPTE is in 2ndary HPTEG */ +#define H_PAGE_F_GIX (_RPAGE_RSV3 | _RPAGE_RSV4 | _RPAGE_RPN44) +#define H_PAGE_HASHPTE _RPAGE_RPN43 /* PTE has associated HPTE */ #ifdef CONFIG_PPC_64K_PAGES #include <asm/book3s/64/hash-64k.h> diff --git a/arch/powerpc/include/asm/book3s/64/hugetlb.h b/arch/powerpc/include/asm/book3s/64/hugetlb.h index c62f14d0bec1..6666cd366596 100644 --- a/arch/powerpc/include/asm/book3s/64/hugetlb.h +++ b/arch/powerpc/include/asm/book3s/64/hugetlb.h @@ -46,7 +46,7 @@ static inline pte_t arch_make_huge_pte(pte_t entry, struct vm_area_struct *vma, */ VM_WARN_ON(page_shift == mmu_psize_defs[MMU_PAGE_1G].shift); if (page_shift == mmu_psize_defs[MMU_PAGE_2M].shift) - return __pte(pte_val(entry) | _PAGE_LARGE); + return __pte(pte_val(entry) | R_PAGE_LARGE); else return entry; } diff --git a/arch/powerpc/include/asm/book3s/64/mmu-hash.h b/arch/powerpc/include/asm/book3s/64/mmu-hash.h index 52d8d1e4b772..6d56974adf28 100644 --- a/arch/powerpc/include/asm/book3s/64/mmu-hash.h +++ b/arch/powerpc/include/asm/book3s/64/mmu-hash.h @@ -39,6 +39,7 @@ /* Bits in the SLB VSID word */ #define SLB_VSID_SHIFT 12 +#define SLB_VSID_SHIFT_256M SLB_VSID_SHIFT #define SLB_VSID_SHIFT_1T 24 #define SLB_VSID_SSIZE_SHIFT 62 #define SLB_VSID_B ASM_CONST(0xc000000000000000) @@ -408,7 +409,7 @@ static inline unsigned long hpt_vpn(unsigned long ea, static inline unsigned long hpt_hash(unsigned long vpn, unsigned int shift, int ssize) { - int mask; + unsigned long mask; unsigned long hash, vsid; /* VPN_SHIFT can be atmost 12 */ @@ -491,13 +492,14 @@ extern void slb_set_size(u16 size); * We first generate a 37-bit "proto-VSID". Proto-VSIDs are generated * from mmu context id and effective segment id of the address. * - * For user processes max context id is limited to ((1ul << 19) - 5) - * for kernel space, we use the top 4 context ids to map address as below + * For user processes max context id is limited to MAX_USER_CONTEXT. + + * For kernel space, we use context ids 1-5 to map address as below: * NOTE: each context only support 64TB now. - * 0x7fffc - [ 0xc000000000000000 - 0xc0003fffffffffff ] - * 0x7fffd - [ 0xd000000000000000 - 0xd0003fffffffffff ] - * 0x7fffe - [ 0xe000000000000000 - 0xe0003fffffffffff ] - * 0x7ffff - [ 0xf000000000000000 - 0xf0003fffffffffff ] + * 0x00001 - [ 0xc000000000000000 - 0xc0003fffffffffff ] + * 0x00002 - [ 0xd000000000000000 - 0xd0003fffffffffff ] + * 0x00003 - [ 0xe000000000000000 - 0xe0003fffffffffff ] + * 0x00004 - [ 0xf000000000000000 - 0xf0003fffffffffff ] * * The proto-VSIDs are then scrambled into real VSIDs with the * multiplicative hash: @@ -511,20 +513,28 @@ extern void slb_set_size(u16 size); * robust scattering in the hash table (at least based on some initial * results). * - * We also consider VSID 0 special. We use VSID 0 for slb entries mapping - * bad address. This enables us to consolidate bad address handling in - * hash_page. + * We use VSID 0 to indicate an invalid VSID. The means we can't use context id + * 0, because a context id of 0 and an EA of 0 gives a proto-VSID of 0, which + * will produce a VSID of 0. * * We also need to avoid the last segment of the last context, because that * would give a protovsid of 0x1fffffffff. That will result in a VSID 0 - * because of the modulo operation in vsid scramble. But the vmemmap - * (which is what uses region 0xf) will never be close to 64TB in size - * (it's 56 bytes per page of system memory). + * because of the modulo operation in vsid scramble. */ +/* + * Max Va bits we support as of now is 68 bits. We want 19 bit + * context ID. + * Restrictions: + * GPU has restrictions of not able to access beyond 128TB + * (47 bit effective address). We also cannot do more than 20bit PID. + * For p4 and p5 which can only do 65 bit VA, we restrict our CONTEXT_BITS + * to 16 bits (ie, we can only have 2^16 pids at the same time). + */ +#define VA_BITS 68 #define CONTEXT_BITS 19 -#define ESID_BITS 18 -#define ESID_BITS_1T 6 +#define ESID_BITS (VA_BITS - (SID_SHIFT + CONTEXT_BITS)) +#define ESID_BITS_1T (VA_BITS - (SID_SHIFT_1T + CONTEXT_BITS)) #define ESID_BITS_MASK ((1 << ESID_BITS) - 1) #define ESID_BITS_1T_MASK ((1 << ESID_BITS_1T) - 1) @@ -532,63 +542,70 @@ extern void slb_set_size(u16 size); /* * 256MB segment * The proto-VSID space has 2^(CONTEX_BITS + ESID_BITS) - 1 segments - * available for user + kernel mapping. The top 4 contexts are used for - * kernel mapping. Each segment contains 2^28 bytes. Each - * context maps 2^46 bytes (64TB) so we can support 2^19-1 contexts - * (19 == 37 + 28 - 46). + * available for user + kernel mapping. VSID 0 is reserved as invalid, contexts + * 1-4 are used for kernel mapping. Each segment contains 2^28 bytes. Each + * context maps 2^49 bytes (512TB). + * + * We also need to avoid the last segment of the last context, because that + * would give a protovsid of 0x1fffffffff. That will result in a VSID 0 + * because of the modulo operation in vsid scramble. + */ +#define MAX_USER_CONTEXT ((ASM_CONST(1) << CONTEXT_BITS) - 2) +#define MIN_USER_CONTEXT (5) + +/* Would be nice to use KERNEL_REGION_ID here */ +#define KERNEL_REGION_CONTEXT_OFFSET (0xc - 1) + +/* + * For platforms that support on 65bit VA we limit the context bits */ -#define MAX_USER_CONTEXT ((ASM_CONST(1) << CONTEXT_BITS) - 5) +#define MAX_USER_CONTEXT_65BIT_VA ((ASM_CONST(1) << (65 - (SID_SHIFT + ESID_BITS))) - 2) /* * This should be computed such that protovosid * vsid_mulitplier - * doesn't overflow 64 bits. It should also be co-prime to vsid_modulus + * doesn't overflow 64 bits. The vsid_mutliplier should also be + * co-prime to vsid_modulus. We also need to make sure that number + * of bits in multiplied result (dividend) is less than twice the number of + * protovsid bits for our modulus optmization to work. + * + * The below table shows the current values used. + * |-------+------------+----------------------+------------+-------------------| + * | | Prime Bits | proto VSID_BITS_65VA | Total Bits | 2* prot VSID_BITS | + * |-------+------------+----------------------+------------+-------------------| + * | 1T | 24 | 25 | 49 | 50 | + * |-------+------------+----------------------+------------+-------------------| + * | 256MB | 24 | 37 | 61 | 74 | + * |-------+------------+----------------------+------------+-------------------| + * + * |-------+------------+----------------------+------------+--------------------| + * | | Prime Bits | proto VSID_BITS_68VA | Total Bits | 2* proto VSID_BITS | + * |-------+------------+----------------------+------------+--------------------| + * | 1T | 24 | 28 | 52 | 56 | + * |-------+------------+----------------------+------------+--------------------| + * | 256MB | 24 | 40 | 64 | 80 | + * |-------+------------+----------------------+------------+--------------------| + * */ #define VSID_MULTIPLIER_256M ASM_CONST(12538073) /* 24-bit prime */ -#define VSID_BITS_256M (CONTEXT_BITS + ESID_BITS) -#define VSID_MODULUS_256M ((1UL<<VSID_BITS_256M)-1) +#define VSID_BITS_256M (VA_BITS - SID_SHIFT) +#define VSID_BITS_65_256M (65 - SID_SHIFT) +/* + * Modular multiplicative inverse of VSID_MULTIPLIER under modulo VSID_MODULUS + */ +#define VSID_MULINV_256M ASM_CONST(665548017062) #define VSID_MULTIPLIER_1T ASM_CONST(12538073) /* 24-bit prime */ -#define VSID_BITS_1T (CONTEXT_BITS + ESID_BITS_1T) -#define VSID_MODULUS_1T ((1UL<<VSID_BITS_1T)-1) - +#define VSID_BITS_1T (VA_BITS - SID_SHIFT_1T) +#define VSID_BITS_65_1T (65 - SID_SHIFT_1T) +#define VSID_MULINV_1T ASM_CONST(209034062) +/* 1TB VSID reserved for VRMA */ +#define VRMA_VSID 0x1ffffffUL #define USER_VSID_RANGE (1UL << (ESID_BITS + SID_SHIFT)) -/* - * This macro generates asm code to compute the VSID scramble - * function. Used in slb_allocate() and do_stab_bolted. The function - * computed is: (protovsid*VSID_MULTIPLIER) % VSID_MODULUS - * - * rt = register containing the proto-VSID and into which the - * VSID will be stored - * rx = scratch register (clobbered) - * - * - rt and rx must be different registers - * - The answer will end up in the low VSID_BITS bits of rt. The higher - * bits may contain other garbage, so you may need to mask the - * result. - */ -#define ASM_VSID_SCRAMBLE(rt, rx, size) \ - lis rx,VSID_MULTIPLIER_##size@h; \ - ori rx,rx,VSID_MULTIPLIER_##size@l; \ - mulld rt,rt,rx; /* rt = rt * MULTIPLIER */ \ - \ - srdi rx,rt,VSID_BITS_##size; \ - clrldi rt,rt,(64-VSID_BITS_##size); \ - add rt,rt,rx; /* add high and low bits */ \ - /* NOTE: explanation based on VSID_BITS_##size = 36 \ - * Now, r3 == VSID (mod 2^36-1), and lies between 0 and \ - * 2^36-1+2^28-1. That in particular means that if r3 >= \ - * 2^36-1, then r3+1 has the 2^36 bit set. So, if r3+1 has \ - * the bit clear, r3 already has the answer we want, if it \ - * doesn't, the answer is the low 36 bits of r3+1. So in all \ - * cases the answer is the low 36 bits of (r3 + ((r3+1) >> 36))*/\ - addi rx,rt,1; \ - srdi rx,rx,VSID_BITS_##size; /* extract 2^VSID_BITS bit */ \ - add rt,rt,rx - /* 4 bits per slice and we have one slice per 1TB */ -#define SLICE_ARRAY_SIZE (H_PGTABLE_RANGE >> 41) +#define SLICE_ARRAY_SIZE (H_PGTABLE_RANGE >> 41) +#define TASK_SLICE_ARRAY_SZ(x) ((x)->context.addr_limit >> 41) #ifndef __ASSEMBLY__ @@ -634,7 +651,7 @@ static inline void subpage_prot_init_new_context(struct mm_struct *mm) { } #define vsid_scramble(protovsid, size) \ ((((protovsid) * VSID_MULTIPLIER_##size) % VSID_MODULUS_##size)) -#else /* 1 */ +/* simplified form avoiding mod operation */ #define vsid_scramble(protovsid, size) \ ({ \ unsigned long x; \ @@ -642,6 +659,21 @@ static inline void subpage_prot_init_new_context(struct mm_struct *mm) { } x = (x >> VSID_BITS_##size) + (x & VSID_MODULUS_##size); \ (x + ((x+1) >> VSID_BITS_##size)) & VSID_MODULUS_##size; \ }) + +#else /* 1 */ +static inline unsigned long vsid_scramble(unsigned long protovsid, + unsigned long vsid_multiplier, int vsid_bits) +{ + unsigned long vsid; + unsigned long vsid_modulus = ((1UL << vsid_bits) - 1); + /* + * We have same multipler for both 256 and 1T segements now + */ + vsid = protovsid * vsid_multiplier; + vsid = (vsid >> vsid_bits) + (vsid & vsid_modulus); + return (vsid + ((vsid + 1) >> vsid_bits)) & vsid_modulus; +} + #endif /* 1 */ /* Returns the segment size indicator for a user address */ @@ -656,36 +688,56 @@ static inline int user_segment_size(unsigned long addr) static inline unsigned long get_vsid(unsigned long context, unsigned long ea, int ssize) { + unsigned long va_bits = VA_BITS; + unsigned long vsid_bits; + unsigned long protovsid; + /* * Bad address. We return VSID 0 for that */ if ((ea & ~REGION_MASK) >= H_PGTABLE_RANGE) return 0; - if (ssize == MMU_SEGSIZE_256M) - return vsid_scramble((context << ESID_BITS) - | ((ea >> SID_SHIFT) & ESID_BITS_MASK), 256M); - return vsid_scramble((context << ESID_BITS_1T) - | ((ea >> SID_SHIFT_1T) & ESID_BITS_1T_MASK), 1T); + if (!mmu_has_feature(MMU_FTR_68_BIT_VA)) + va_bits = 65; + + if (ssize == MMU_SEGSIZE_256M) { + vsid_bits = va_bits - SID_SHIFT; + protovsid = (context << ESID_BITS) | + ((ea >> SID_SHIFT) & ESID_BITS_MASK); + return vsid_scramble(protovsid, VSID_MULTIPLIER_256M, vsid_bits); + } + /* 1T segment */ + vsid_bits = va_bits - SID_SHIFT_1T; + protovsid = (context << ESID_BITS_1T) | + ((ea >> SID_SHIFT_1T) & ESID_BITS_1T_MASK); + return vsid_scramble(protovsid, VSID_MULTIPLIER_1T, vsid_bits); } /* * This is only valid for addresses >= PAGE_OFFSET - * - * For kernel space, we use the top 4 context ids to map address as below - * 0x7fffc - [ 0xc000000000000000 - 0xc0003fffffffffff ] - * 0x7fffd - [ 0xd000000000000000 - 0xd0003fffffffffff ] - * 0x7fffe - [ 0xe000000000000000 - 0xe0003fffffffffff ] - * 0x7ffff - [ 0xf000000000000000 - 0xf0003fffffffffff ] */ static inline unsigned long get_kernel_vsid(unsigned long ea, int ssize) { unsigned long context; + if (!is_kernel_addr(ea)) + return 0; + /* - * kernel take the top 4 context from the available range + * For kernel space, we use context ids 1-4 to map the address space as + * below: + * + * 0x00001 - [ 0xc000000000000000 - 0xc0003fffffffffff ] + * 0x00002 - [ 0xd000000000000000 - 0xd0003fffffffffff ] + * 0x00003 - [ 0xe000000000000000 - 0xe0003fffffffffff ] + * 0x00004 - [ 0xf000000000000000 - 0xf0003fffffffffff ] + * + * So we can compute the context from the region (top nibble) by + * subtracting 11, or 0xc - 1. */ - context = (MAX_USER_CONTEXT) + ((ea >> 60) - 0xc) + 1; + context = (ea >> 60) - KERNEL_REGION_CONTEXT_OFFSET; + return get_vsid(context, ea, ssize); } diff --git a/arch/powerpc/include/asm/book3s/64/mmu.h b/arch/powerpc/include/asm/book3s/64/mmu.h index 805d4105e9bb..77529a3e3811 100644 --- a/arch/powerpc/include/asm/book3s/64/mmu.h +++ b/arch/powerpc/include/asm/book3s/64/mmu.h @@ -65,6 +65,8 @@ extern struct patb_entry *partition_tb; * MAX_USER_CONTEXT * 16 bytes of space. */ #define PRTB_SIZE_SHIFT (CONTEXT_BITS + 4) +#define PRTB_ENTRIES (1ul << CONTEXT_BITS) + /* * Power9 currently only support 64K partition table size. */ @@ -73,13 +75,20 @@ extern struct patb_entry *partition_tb; typedef unsigned long mm_context_id_t; struct spinlock; +/* Maximum possible number of NPUs in a system. */ +#define NV_MAX_NPUS 8 + typedef struct { mm_context_id_t id; u16 user_psize; /* page size index */ + /* NPU NMMU context */ + struct npu_context *npu_context; + #ifdef CONFIG_PPC_MM_SLICES u64 low_slices_psize; /* SLB page size encodings */ unsigned char high_slices_psize[SLICE_ARRAY_SIZE]; + unsigned long addr_limit; #else u16 sllp; /* SLB page size encoding */ #endif diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h index 8f4d41936e5a..85bc9875c3be 100644 --- a/arch/powerpc/include/asm/book3s/64/pgtable.h +++ b/arch/powerpc/include/asm/book3s/64/pgtable.h @@ -13,6 +13,7 @@ #define _PAGE_BIT_SWAP_TYPE 0 #define _PAGE_RO 0 +#define _PAGE_SHARED 0 #define _PAGE_EXEC 0x00001 /* execute permission */ #define _PAGE_WRITE 0x00002 /* write access allowed */ @@ -37,21 +38,47 @@ #define _RPAGE_RSV3 0x0400000000000000UL #define _RPAGE_RSV4 0x0200000000000000UL -#ifdef CONFIG_MEM_SOFT_DIRTY -#define _PAGE_SOFT_DIRTY _RPAGE_SW3 /* software: software dirty tracking */ -#else -#define _PAGE_SOFT_DIRTY 0x00000 -#endif -#define _PAGE_SPECIAL _RPAGE_SW2 /* software: special page */ +#define _PAGE_PTE 0x4000000000000000UL /* distinguishes PTEs from pointers */ +#define _PAGE_PRESENT 0x8000000000000000UL /* pte contains a translation */ /* - * For P9 DD1 only, we need to track whether the pte's huge. + * Top and bottom bits of RPN which can be used by hash + * translation mode, because we expect them to be zero + * otherwise. */ -#define _PAGE_LARGE _RPAGE_RSV1 +#define _RPAGE_RPN0 0x01000 +#define _RPAGE_RPN1 0x02000 +#define _RPAGE_RPN44 0x0100000000000000UL +#define _RPAGE_RPN43 0x0080000000000000UL +#define _RPAGE_RPN42 0x0040000000000000UL +#define _RPAGE_RPN41 0x0020000000000000UL + +/* Max physical address bit as per radix table */ +#define _RPAGE_PA_MAX 57 +/* + * Max physical address bit we will use for now. + * + * This is mostly a hardware limitation and for now Power9 has + * a 51 bit limit. + * + * This is different from the number of physical bit required to address + * the last byte of memory. That is defined by MAX_PHYSMEM_BITS. + * MAX_PHYSMEM_BITS is a linux limitation imposed by the maximum + * number of sections we can support (SECTIONS_SHIFT). + * + * This is different from Radix page table limitation above and + * should always be less than that. The limit is done such that + * we can overload the bits between _RPAGE_PA_MAX and _PAGE_PA_MAX + * for hash linux page table specific bits. + * + * In order to be compatible with future hardware generations we keep + * some offsets and limit this for now to 53 + */ +#define _PAGE_PA_MAX 53 -#define _PAGE_PTE (1ul << 62) /* distinguishes PTEs from pointers */ -#define _PAGE_PRESENT (1ul << 63) /* pte contains a translation */ +#define _PAGE_SOFT_DIRTY _RPAGE_SW3 /* software: software dirty tracking */ +#define _PAGE_SPECIAL _RPAGE_SW2 /* software: special page */ /* * Drivers request for cache inhibited pte mapping using _PAGE_NO_CACHE * Instead of fixing all of them, add an alternate define which @@ -59,10 +86,11 @@ */ #define _PAGE_NO_CACHE _PAGE_TOLERANT /* - * We support 57 bit real address in pte. Clear everything above 57, and - * every thing below PAGE_SHIFT; + * We support _RPAGE_PA_MAX bit real address in pte. On the linux side + * we are limited by _PAGE_PA_MAX. Clear everything above _PAGE_PA_MAX + * and every thing below PAGE_SHIFT; */ -#define PTE_RPN_MASK (((1UL << 57) - 1) & (PAGE_MASK)) +#define PTE_RPN_MASK (((1UL << _PAGE_PA_MAX) - 1) & (PAGE_MASK)) /* * set of bits not changed in pmd_modify. Even though we have hash specific bits * in here, on radix we expect them to be zero. @@ -205,10 +233,6 @@ extern unsigned long __pte_frag_nr; extern unsigned long __pte_frag_size_shift; #define PTE_FRAG_SIZE_SHIFT __pte_frag_size_shift #define PTE_FRAG_SIZE (1UL << PTE_FRAG_SIZE_SHIFT) -/* - * Pgtable size used by swapper, init in asm code - */ -#define MAX_PGD_TABLE_SIZE (sizeof(pgd_t) << RADIX_PGD_INDEX_SIZE) #define PTRS_PER_PTE (1 << PTE_INDEX_SIZE) #define PTRS_PER_PMD (1 << PMD_INDEX_SIZE) diff --git a/arch/powerpc/include/asm/book3s/64/radix.h b/arch/powerpc/include/asm/book3s/64/radix.h index 9e0bb7cd6e22..ac16d1943022 100644 --- a/arch/powerpc/include/asm/book3s/64/radix.h +++ b/arch/powerpc/include/asm/book3s/64/radix.h @@ -11,6 +11,12 @@ #include <asm/book3s/64/radix-4k.h> #endif +/* + * For P9 DD1 only, we need to track whether the pte's huge. + */ +#define R_PAGE_LARGE _RPAGE_RSV1 + + #ifndef __ASSEMBLY__ #include <asm/book3s/64/tlbflush-radix.h> #include <asm/cpu_has_feature.h> @@ -252,7 +258,7 @@ static inline int radix__pmd_trans_huge(pmd_t pmd) static inline pmd_t radix__pmd_mkhuge(pmd_t pmd) { if (cpu_has_feature(CPU_FTR_POWER9_DD1)) - return __pmd(pmd_val(pmd) | _PAGE_PTE | _PAGE_LARGE); + return __pmd(pmd_val(pmd) | _PAGE_PTE | R_PAGE_LARGE); return __pmd(pmd_val(pmd) | _PAGE_PTE); } static inline void radix__pmdp_huge_split_prepare(struct vm_area_struct *vma, diff --git a/arch/powerpc/include/asm/code-patching.h b/arch/powerpc/include/asm/code-patching.h index 8ab937771068..abef812de7f8 100644 --- a/arch/powerpc/include/asm/code-patching.h +++ b/arch/powerpc/include/asm/code-patching.h @@ -12,6 +12,8 @@ #include <asm/types.h> #include <asm/ppc-opcode.h> +#include <linux/string.h> +#include <linux/kallsyms.h> /* Flags for create_branch: * "b" == create_branch(addr, target, 0); @@ -99,6 +101,45 @@ static inline unsigned long ppc_global_function_entry(void *func) #endif } +/* + * Wrapper around kallsyms_lookup() to return function entry address: + * - For ABIv1, we lookup the dot variant. + * - For ABIv2, we return the local entry point. + */ +static inline unsigned long ppc_kallsyms_lookup_name(const char *name) +{ + unsigned long addr; +#ifdef PPC64_ELF_ABI_v1 + /* check for dot variant */ + char dot_name[1 + KSYM_NAME_LEN]; + bool dot_appended = false; + + if (strnlen(name, KSYM_NAME_LEN) >= KSYM_NAME_LEN) + return 0; + + if (name[0] != '.') { + dot_name[0] = '.'; + dot_name[1] = '\0'; + strlcat(dot_name, name, sizeof(dot_name)); + dot_appended = true; + } else { + dot_name[0] = '\0'; + strlcat(dot_name, name, sizeof(dot_name)); + } + addr = kallsyms_lookup_name(dot_name); + if (!addr && dot_appended) + /* Let's try the original non-dot symbol lookup */ + addr = kallsyms_lookup_name(name); +#elif defined(PPC64_ELF_ABI_v2) + addr = kallsyms_lookup_name(name); + if (addr) + addr = ppc_function_entry((void *)addr); +#else + addr = kallsyms_lookup_name(name); +#endif + return addr; +} + #ifdef CONFIG_PPC64 /* * Some instruction encodings commonly used in dynamic ftracing diff --git a/arch/powerpc/include/asm/cpuidle.h b/arch/powerpc/include/asm/cpuidle.h index 155731557c9b..52586f9956bb 100644 --- a/arch/powerpc/include/asm/cpuidle.h +++ b/arch/powerpc/include/asm/cpuidle.h @@ -2,13 +2,39 @@ #define _ASM_POWERPC_CPUIDLE_H #ifdef CONFIG_PPC_POWERNV -/* Used in powernv idle state management */ +/* Thread state used in powernv idle state management */ #define PNV_THREAD_RUNNING 0 #define PNV_THREAD_NAP 1 #define PNV_THREAD_SLEEP 2 #define PNV_THREAD_WINKLE 3 -#define PNV_CORE_IDLE_LOCK_BIT 0x100 -#define PNV_CORE_IDLE_THREAD_BITS 0x0FF + +/* + * Core state used in powernv idle for POWER8. + * + * The lock bit synchronizes updates to the state, as well as parts of the + * sleep/wake code (see kernel/idle_book3s.S). + * + * Bottom 8 bits track the idle state of each thread. Bit is cleared before + * the thread executes an idle instruction (nap/sleep/winkle). + * + * Then there is winkle tracking. A core does not lose complete state + * until every thread is in winkle. So the winkle count field counts the + * number of threads in winkle (small window of false positives is okay + * around the sleep/wake, so long as there are no false negatives). + * + * When the winkle count reaches 8 (the COUNT_ALL_BIT becomes set), then + * the THREAD_WINKLE_BITS are set, which indicate which threads have not + * yet woken from the winkle state. + */ +#define PNV_CORE_IDLE_LOCK_BIT 0x10000000 + +#define PNV_CORE_IDLE_WINKLE_COUNT 0x00010000 +#define PNV_CORE_IDLE_WINKLE_COUNT_ALL_BIT 0x00080000 +#define PNV_CORE_IDLE_WINKLE_COUNT_BITS 0x000F0000 +#define PNV_CORE_IDLE_THREAD_WINKLE_BITS_SHIFT 8 +#define PNV_CORE_IDLE_THREAD_WINKLE_BITS 0x0000FF00 + +#define PNV_CORE_IDLE_THREAD_BITS 0x000000FF /* * ============================ NOTE ================================= @@ -46,6 +72,7 @@ extern u32 pnv_fastsleep_workaround_at_exit[]; extern u64 pnv_first_deep_stop_state; +unsigned long pnv_cpu_offline(unsigned int cpu); int validate_psscr_val_mask(u64 *psscr_val, u64 *psscr_mask, u32 flags); static inline void report_invalid_psscr_val(u64 psscr_val, int err) { diff --git a/arch/powerpc/include/asm/cputable.h b/arch/powerpc/include/asm/cputable.h index ab68d0ee7725..1f6847b107e4 100644 --- a/arch/powerpc/include/asm/cputable.h +++ b/arch/powerpc/include/asm/cputable.h @@ -471,10 +471,11 @@ enum { CPU_FTR_PURR | CPU_FTR_SPURR | CPU_FTR_REAL_LE | \ CPU_FTR_DSCR | CPU_FTR_SAO | \ CPU_FTR_STCX_CHECKS_ADDRESS | CPU_FTR_POPCNTB | CPU_FTR_POPCNTD | \ - CPU_FTR_ICSWX | CPU_FTR_CFAR | CPU_FTR_HVMODE | CPU_FTR_VMX_COPY | \ + CPU_FTR_CFAR | CPU_FTR_HVMODE | CPU_FTR_VMX_COPY | \ CPU_FTR_DBELL | CPU_FTR_HAS_PPR | CPU_FTR_DAWR | \ CPU_FTR_ARCH_207S | CPU_FTR_TM_COMP | CPU_FTR_ARCH_300) -#define CPU_FTRS_POWER9_DD1 (CPU_FTRS_POWER9 | CPU_FTR_POWER9_DD1) +#define CPU_FTRS_POWER9_DD1 ((CPU_FTRS_POWER9 | CPU_FTR_POWER9_DD1) & \ + (~CPU_FTR_SAO)) #define CPU_FTRS_CELL (CPU_FTR_USE_TB | CPU_FTR_LWSYNC | \ CPU_FTR_PPCAS_ARCH_V2 | CPU_FTR_CTRL | \ CPU_FTR_ALTIVEC_COMP | CPU_FTR_MMCRA | CPU_FTR_SMT | \ diff --git a/arch/powerpc/include/asm/dbell.h b/arch/powerpc/include/asm/dbell.h index 378167377065..f70cbfe0ec04 100644 --- a/arch/powerpc/include/asm/dbell.h +++ b/arch/powerpc/include/asm/dbell.h @@ -35,33 +35,53 @@ enum ppc_dbell { #ifdef CONFIG_PPC_BOOK3S #define PPC_DBELL_MSGTYPE PPC_DBELL_SERVER -#define SPRN_DOORBELL_CPUTAG SPRN_TIR -#define PPC_DBELL_TAG_MASK 0x7f static inline void _ppc_msgsnd(u32 msg) { - if (cpu_has_feature(CPU_FTR_HVMODE)) - __asm__ __volatile__ (PPC_MSGSND(%0) : : "r" (msg)); - else - __asm__ __volatile__ (PPC_MSGSNDP(%0) : : "r" (msg)); + __asm__ __volatile__ (ASM_FTR_IFSET(PPC_MSGSND(%1), PPC_MSGSNDP(%1), %0) + : : "i" (CPU_FTR_HVMODE), "r" (msg)); +} + +/* sync before sending message */ +static inline void ppc_msgsnd_sync(void) +{ + __asm__ __volatile__ ("sync" : : : "memory"); +} + +/* sync after taking message interrupt */ +static inline void ppc_msgsync(void) +{ + /* sync is not required when taking messages from the same core */ + __asm__ __volatile__ (ASM_FTR_IFSET(PPC_MSGSYNC " ; lwsync", "", %0) + : : "i" (CPU_FTR_HVMODE|CPU_FTR_ARCH_300)); } #else /* CONFIG_PPC_BOOK3S */ #define PPC_DBELL_MSGTYPE PPC_DBELL -#define SPRN_DOORBELL_CPUTAG SPRN_PIR -#define PPC_DBELL_TAG_MASK 0x3fff static inline void _ppc_msgsnd(u32 msg) { __asm__ __volatile__ (PPC_MSGSND(%0) : : "r" (msg)); } +/* sync before sending message */ +static inline void ppc_msgsnd_sync(void) +{ + __asm__ __volatile__ ("sync" : : : "memory"); +} + +/* sync after taking message interrupt */ +static inline void ppc_msgsync(void) +{ +} + #endif /* CONFIG_PPC_BOOK3S */ -extern void doorbell_cause_ipi(int cpu, unsigned long data); +extern void doorbell_global_ipi(int cpu); +extern void doorbell_core_ipi(int cpu); +extern int doorbell_try_core_ipi(int cpu); extern void doorbell_exception(struct pt_regs *regs); -extern void doorbell_setup_this_cpu(void); static inline void ppc_msgsnd(enum ppc_dbell type, u32 flags, u32 tag) { diff --git a/arch/powerpc/include/asm/debug.h b/arch/powerpc/include/asm/debug.h index 86308f177f2d..5d5af3fddfd8 100644 --- a/arch/powerpc/include/asm/debug.h +++ b/arch/powerpc/include/asm/debug.h @@ -8,8 +8,6 @@ struct pt_regs; -extern struct dentry *powerpc_debugfs_root; - #if defined(CONFIG_DEBUGGER) || defined(CONFIG_KEXEC_CORE) extern int (*__debugger)(struct pt_regs *regs); diff --git a/arch/powerpc/include/asm/debugfs.h b/arch/powerpc/include/asm/debugfs.h new file mode 100644 index 000000000000..4f3b39f3e3d2 --- /dev/null +++ b/arch/powerpc/include/asm/debugfs.h @@ -0,0 +1,17 @@ +#ifndef _ASM_POWERPC_DEBUGFS_H +#define _ASM_POWERPC_DEBUGFS_H + +/* + * Copyright 2017, Michael Ellerman, IBM Corporation. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <linux/debugfs.h> + +extern struct dentry *powerpc_debugfs_root; + +#endif /* _ASM_POWERPC_DEBUGFS_H */ diff --git a/arch/powerpc/include/asm/exception-64s.h b/arch/powerpc/include/asm/exception-64s.h index 14752eee3d0c..89259817f5ef 100644 --- a/arch/powerpc/include/asm/exception-64s.h +++ b/arch/powerpc/include/asm/exception-64s.h @@ -167,17 +167,14 @@ BEGIN_FTR_SECTION_NESTED(943) \ std ra,offset(r13); \ END_FTR_SECTION_NESTED(ftr,ftr,943) -#define EXCEPTION_PROLOG_0_PACA(area) \ +#define EXCEPTION_PROLOG_0(area) \ + GET_PACA(r13); \ std r9,area+EX_R9(r13); /* save r9 */ \ OPT_GET_SPR(r9, SPRN_PPR, CPU_FTR_HAS_PPR); \ HMT_MEDIUM; \ std r10,area+EX_R10(r13); /* save r10 - r12 */ \ OPT_GET_SPR(r10, SPRN_CFAR, CPU_FTR_CFAR) -#define EXCEPTION_PROLOG_0(area) \ - GET_PACA(r13); \ - EXCEPTION_PROLOG_0_PACA(area) - #define __EXCEPTION_PROLOG_1(area, extra, vec) \ OPT_SAVE_REG_TO_PACA(area+EX_PPR, r9, CPU_FTR_HAS_PPR); \ OPT_SAVE_REG_TO_PACA(area+EX_CFAR, r10, CPU_FTR_CFAR); \ @@ -208,12 +205,6 @@ END_FTR_SECTION_NESTED(ftr,ftr,943) EXCEPTION_PROLOG_1(area, extra, vec); \ EXCEPTION_PROLOG_PSERIES_1(label, h); -/* Have the PACA in r13 already */ -#define EXCEPTION_PROLOG_PSERIES_PACA(area, label, h, extra, vec) \ - EXCEPTION_PROLOG_0_PACA(area); \ - EXCEPTION_PROLOG_1(area, extra, vec); \ - EXCEPTION_PROLOG_PSERIES_1(label, h); - #define __KVMTEST(h, n) \ lbz r10,HSTATE_IN_GUEST(r13); \ cmpwi r10,0; \ @@ -256,11 +247,6 @@ END_FTR_SECTION_NESTED(ftr,ftr,943) ld r9,area+EX_R9(r13); \ bctr -#define BRANCH_TO_KVM(reg, label) \ - __LOAD_FAR_HANDLER(reg, label); \ - mtctr reg; \ - bctr - #else #define BRANCH_TO_COMMON(reg, label) \ b label @@ -268,9 +254,6 @@ END_FTR_SECTION_NESTED(ftr,ftr,943) #define BRANCH_LINK_TO_FAR(reg, label) \ bl label -#define BRANCH_TO_KVM(reg, label) \ - b label - #define __BRANCH_TO_KVM_EXIT(area, label) \ ld r9,area+EX_R9(r13); \ b label @@ -522,7 +505,7 @@ END_FTR_SECTION_NESTED(ftr,ftr,943) #define MASKABLE_RELON_EXCEPTION_HV_OOL(vec, label) \ EXCEPTION_PROLOG_1(PACA_EXGEN, SOFTEN_TEST_HV, vec); \ - EXCEPTION_PROLOG_PSERIES_1(label, EXC_HV) + EXCEPTION_RELON_PROLOG_PSERIES_1(label, EXC_HV) /* * Our exception common code can be passed various "additions" diff --git a/arch/powerpc/include/asm/feature-fixups.h b/arch/powerpc/include/asm/feature-fixups.h index ddf54f5bbdd1..2de2319b99e2 100644 --- a/arch/powerpc/include/asm/feature-fixups.h +++ b/arch/powerpc/include/asm/feature-fixups.h @@ -66,6 +66,9 @@ label##5: \ #define END_FTR_SECTION(msk, val) \ END_FTR_SECTION_NESTED(msk, val, 97) +#define END_FTR_SECTION_NESTED_IFSET(msk, label) \ + END_FTR_SECTION_NESTED((msk), (msk), label) + #define END_FTR_SECTION_IFSET(msk) END_FTR_SECTION((msk), (msk)) #define END_FTR_SECTION_IFCLR(msk) END_FTR_SECTION((msk), 0) diff --git a/arch/powerpc/include/asm/head-64.h b/arch/powerpc/include/asm/head-64.h index 5067048daad4..86eb87382031 100644 --- a/arch/powerpc/include/asm/head-64.h +++ b/arch/powerpc/include/asm/head-64.h @@ -213,6 +213,7 @@ name: USE_TEXT_SECTION(); \ .balign IFETCH_ALIGN_BYTES; \ .global name; \ + _ASM_NOKPROBE_SYMBOL(name); \ DEFINE_FIXED_SYMBOL(name); \ name: diff --git a/arch/powerpc/include/asm/hvcall.h b/arch/powerpc/include/asm/hvcall.h index 3cc12a86ef5d..d73755fafbb0 100644 --- a/arch/powerpc/include/asm/hvcall.h +++ b/arch/powerpc/include/asm/hvcall.h @@ -377,16 +377,6 @@ long plpar_hcall_raw(unsigned long opcode, unsigned long *retbuf, ...); long plpar_hcall9(unsigned long opcode, unsigned long *retbuf, ...); long plpar_hcall9_raw(unsigned long opcode, unsigned long *retbuf, ...); -/* For hcall instrumentation. One structure per-hcall, per-CPU */ -struct hcall_stats { - unsigned long num_calls; /* number of calls (on this CPU) */ - unsigned long tb_total; /* total wall time (mftb) of calls. */ - unsigned long purr_total; /* total cpu time (PURR) of calls. */ - unsigned long tb_start; - unsigned long purr_start; -}; -#define HCALL_STAT_ARRAY_SIZE ((MAX_HCALL_OPCODE >> 2) + 1) - struct hvcall_mpp_data { unsigned long entitled_mem; unsigned long mapped_mem; diff --git a/arch/powerpc/include/asm/io.h b/arch/powerpc/include/asm/io.h index 5ed292431b5b..422f99cf9924 100644 --- a/arch/powerpc/include/asm/io.h +++ b/arch/powerpc/include/asm/io.h @@ -25,8 +25,6 @@ extern struct pci_dev *isa_bridge_pcidev; #endif #include <linux/device.h> -#include <linux/io.h> - #include <linux/compiler.h> #include <asm/page.h> #include <asm/byteorder.h> @@ -192,24 +190,8 @@ DEF_MMIO_OUT_D(out_le32, 32, stw); #endif /* __BIG_ENDIAN */ -/* - * Cache inhibitied accessors for use in real mode, you don't want to use these - * unless you know what you're doing. - * - * NB. These use the cpu byte ordering. - */ -DEF_MMIO_OUT_X(out_rm8, 8, stbcix); -DEF_MMIO_OUT_X(out_rm16, 16, sthcix); -DEF_MMIO_OUT_X(out_rm32, 32, stwcix); -DEF_MMIO_IN_X(in_rm8, 8, lbzcix); -DEF_MMIO_IN_X(in_rm16, 16, lhzcix); -DEF_MMIO_IN_X(in_rm32, 32, lwzcix); - #ifdef __powerpc64__ -DEF_MMIO_OUT_X(out_rm64, 64, stdcix); -DEF_MMIO_IN_X(in_rm64, 64, ldcix); - #ifdef __BIG_ENDIAN__ DEF_MMIO_OUT_D(out_be64, 64, std); DEF_MMIO_IN_D(in_be64, 64, ld); @@ -242,35 +224,6 @@ static inline void out_be64(volatile u64 __iomem *addr, u64 val) #endif #endif /* __powerpc64__ */ - -/* - * Simple Cache inhibited accessors - * Unlike the DEF_MMIO_* macros, these don't include any h/w memory - * barriers, callers need to manage memory barriers on their own. - * These can only be used in hypervisor real mode. - */ - -static inline u32 _lwzcix(unsigned long addr) -{ - u32 ret; - - __asm__ __volatile__("lwzcix %0,0, %1" - : "=r" (ret) : "r" (addr) : "memory"); - return ret; -} - -static inline void _stbcix(u64 addr, u8 val) -{ - __asm__ __volatile__("stbcix %0,0,%1" - : : "r" (val), "r" (addr) : "memory"); -} - -static inline void _stwcix(u64 addr, u32 val) -{ - __asm__ __volatile__("stwcix %0,0,%1" - : : "r" (val), "r" (addr) : "memory"); -} - /* * Low level IO stream instructions are defined out of line for now */ @@ -417,15 +370,64 @@ static inline void __raw_writeq(unsigned long v, volatile void __iomem *addr) } /* - * Real mode version of the above. stdcix is only supposed to be used - * in hypervisor real mode as per the architecture spec. + * Real mode versions of the above. Those instructions are only supposed + * to be used in hypervisor real mode as per the architecture spec. */ +static inline void __raw_rm_writeb(u8 val, volatile void __iomem *paddr) +{ + __asm__ __volatile__("stbcix %0,0,%1" + : : "r" (val), "r" (paddr) : "memory"); +} + +static inline void __raw_rm_writew(u16 val, volatile void __iomem *paddr) +{ + __asm__ __volatile__("sthcix %0,0,%1" + : : "r" (val), "r" (paddr) : "memory"); +} + +static inline void __raw_rm_writel(u32 val, volatile void __iomem *paddr) +{ + __asm__ __volatile__("stwcix %0,0,%1" + : : "r" (val), "r" (paddr) : "memory"); +} + static inline void __raw_rm_writeq(u64 val, volatile void __iomem *paddr) { __asm__ __volatile__("stdcix %0,0,%1" : : "r" (val), "r" (paddr) : "memory"); } +static inline u8 __raw_rm_readb(volatile void __iomem *paddr) +{ + u8 ret; + __asm__ __volatile__("lbzcix %0,0, %1" + : "=r" (ret) : "r" (paddr) : "memory"); + return ret; +} + +static inline u16 __raw_rm_readw(volatile void __iomem *paddr) +{ + u16 ret; + __asm__ __volatile__("lhzcix %0,0, %1" + : "=r" (ret) : "r" (paddr) : "memory"); + return ret; +} + +static inline u32 __raw_rm_readl(volatile void __iomem *paddr) +{ + u32 ret; + __asm__ __volatile__("lwzcix %0,0, %1" + : "=r" (ret) : "r" (paddr) : "memory"); + return ret; +} + +static inline u64 __raw_rm_readq(volatile void __iomem *paddr) +{ + u64 ret; + __asm__ __volatile__("ldcix %0,0, %1" + : "=r" (ret) : "r" (paddr) : "memory"); + return ret; +} #endif /* __powerpc64__ */ /* @@ -757,6 +759,8 @@ extern void __iomem *ioremap_prot(phys_addr_t address, unsigned long size, extern void __iomem *ioremap_wc(phys_addr_t address, unsigned long size); #define ioremap_nocache(addr, size) ioremap((addr), (size)) #define ioremap_uc(addr, size) ioremap((addr), (size)) +#define ioremap_cache(addr, size) \ + ioremap_prot((addr), (size), pgprot_val(PAGE_KERNEL)) extern void iounmap(volatile void __iomem *addr); diff --git a/arch/powerpc/include/asm/kprobes.h b/arch/powerpc/include/asm/kprobes.h index 0503c98b2117..a83821f33ea3 100644 --- a/arch/powerpc/include/asm/kprobes.h +++ b/arch/powerpc/include/asm/kprobes.h @@ -61,59 +61,6 @@ extern kprobe_opcode_t optprobe_template_end[]; #define MAX_OPTINSN_SIZE (optprobe_template_end - optprobe_template_entry) #define RELATIVEJUMP_SIZE sizeof(kprobe_opcode_t) /* 4 bytes */ -#ifdef PPC64_ELF_ABI_v2 -/* PPC64 ABIv2 needs local entry point */ -#define kprobe_lookup_name(name, addr) \ -{ \ - addr = (kprobe_opcode_t *)kallsyms_lookup_name(name); \ - if (addr) \ - addr = (kprobe_opcode_t *)ppc_function_entry(addr); \ -} -#elif defined(PPC64_ELF_ABI_v1) -/* - * 64bit powerpc ABIv1 uses function descriptors: - * - Check for the dot variant of the symbol first. - * - If that fails, try looking up the symbol provided. - * - * This ensures we always get to the actual symbol and not the descriptor. - * Also handle <module:symbol> format. - */ -#define kprobe_lookup_name(name, addr) \ -{ \ - char dot_name[MODULE_NAME_LEN + 1 + KSYM_NAME_LEN]; \ - const char *modsym; \ - bool dot_appended = false; \ - if ((modsym = strchr(name, ':')) != NULL) { \ - modsym++; \ - if (*modsym != '\0' && *modsym != '.') { \ - /* Convert to <module:.symbol> */ \ - strncpy(dot_name, name, modsym - name); \ - dot_name[modsym - name] = '.'; \ - dot_name[modsym - name + 1] = '\0'; \ - strncat(dot_name, modsym, \ - sizeof(dot_name) - (modsym - name) - 2);\ - dot_appended = true; \ - } else { \ - dot_name[0] = '\0'; \ - strncat(dot_name, name, sizeof(dot_name) - 1); \ - } \ - } else if (name[0] != '.') { \ - dot_name[0] = '.'; \ - dot_name[1] = '\0'; \ - strncat(dot_name, name, KSYM_NAME_LEN - 2); \ - dot_appended = true; \ - } else { \ - dot_name[0] = '\0'; \ - strncat(dot_name, name, KSYM_NAME_LEN - 1); \ - } \ - addr = (kprobe_opcode_t *)kallsyms_lookup_name(dot_name); \ - if (!addr && dot_appended) { \ - /* Let's try the original non-dot symbol lookup */ \ - addr = (kprobe_opcode_t *)kallsyms_lookup_name(name); \ - } \ -} -#endif - #define flush_insn_slot(p) do { } while (0) #define kretprobe_blacklist_size 0 @@ -156,6 +103,16 @@ extern int kprobe_exceptions_notify(struct notifier_block *self, extern int kprobe_fault_handler(struct pt_regs *regs, int trapnr); extern int kprobe_handler(struct pt_regs *regs); extern int kprobe_post_handler(struct pt_regs *regs); +#ifdef CONFIG_KPROBES_ON_FTRACE +extern int skip_singlestep(struct kprobe *p, struct pt_regs *regs, + struct kprobe_ctlblk *kcb); +#else +static inline int skip_singlestep(struct kprobe *p, struct pt_regs *regs, + struct kprobe_ctlblk *kcb) +{ + return 0; +} +#endif #else static inline int kprobe_handler(struct pt_regs *regs) { return 0; } static inline int kprobe_post_handler(struct pt_regs *regs) { return 0; } diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h b/arch/powerpc/include/asm/kvm_book3s_64.h index d9b48f5bb606..d55c7f881ce7 100644 --- a/arch/powerpc/include/asm/kvm_book3s_64.h +++ b/arch/powerpc/include/asm/kvm_book3s_64.h @@ -49,8 +49,6 @@ static inline bool kvm_is_radix(struct kvm *kvm) #define KVM_DEFAULT_HPT_ORDER 24 /* 16MB HPT by default */ #endif -#define VRMA_VSID 0x1ffffffUL /* 1TB VSID reserved for VRMA */ - /* * We use a lock bit in HPTE dword 0 to synchronize updates and * accesses to each HPTE, and another bit to indicate non-present diff --git a/arch/powerpc/include/asm/kvm_book3s_asm.h b/arch/powerpc/include/asm/kvm_book3s_asm.h index d318d432caa9..0593d9479f74 100644 --- a/arch/powerpc/include/asm/kvm_book3s_asm.h +++ b/arch/powerpc/include/asm/kvm_book3s_asm.h @@ -110,7 +110,7 @@ struct kvmppc_host_state { u8 ptid; struct kvm_vcpu *kvm_vcpu; struct kvmppc_vcore *kvm_vcore; - unsigned long xics_phys; + void __iomem *xics_phys; u32 saved_xirr; u64 dabr; u64 host_mmcr[7]; /* MMCR 0,1,A, SIAR, SDAR, MMCR2, SIER */ diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index dd11c4c8c56a..c3877992eff9 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h @@ -409,7 +409,7 @@ struct openpic; extern void kvm_cma_reserve(void) __init; static inline void kvmppc_set_xics_phys(int cpu, unsigned long addr) { - paca[cpu].kvm_hstate.xics_phys = addr; + paca[cpu].kvm_hstate.xics_phys = (void __iomem *)addr; } static inline u32 kvmppc_get_xics_latch(void) @@ -478,8 +478,6 @@ extern void kvmppc_free_host_rm_ops(void); extern void kvmppc_free_pimap(struct kvm *kvm); extern int kvmppc_xics_rm_complete(struct kvm_vcpu *vcpu, u32 hcall); extern void kvmppc_xics_free_icp(struct kvm_vcpu *vcpu); -extern int kvmppc_xics_create_icp(struct kvm_vcpu *vcpu, unsigned long server); -extern int kvm_vm_ioctl_xics_irq(struct kvm *kvm, struct kvm_irq_level *args); extern int kvmppc_xics_hcall(struct kvm_vcpu *vcpu, u32 cmd); extern u64 kvmppc_xics_get_icp(struct kvm_vcpu *vcpu); extern int kvmppc_xics_set_icp(struct kvm_vcpu *vcpu, u64 icpval); @@ -507,12 +505,6 @@ static inline int kvmppc_xics_rm_complete(struct kvm_vcpu *vcpu, u32 hcall) static inline int kvmppc_xics_enabled(struct kvm_vcpu *vcpu) { return 0; } static inline void kvmppc_xics_free_icp(struct kvm_vcpu *vcpu) { } -static inline int kvmppc_xics_create_icp(struct kvm_vcpu *vcpu, - unsigned long server) - { return -EINVAL; } -static inline int kvm_vm_ioctl_xics_irq(struct kvm *kvm, - struct kvm_irq_level *args) - { return -ENOTTY; } static inline int kvmppc_xics_hcall(struct kvm_vcpu *vcpu, u32 cmd) { return 0; } #endif diff --git a/arch/powerpc/include/asm/mce.h b/arch/powerpc/include/asm/mce.h index ed62efe01e49..81eff8631434 100644 --- a/arch/powerpc/include/asm/mce.h +++ b/arch/powerpc/include/asm/mce.h @@ -24,97 +24,6 @@ #include <linux/bitops.h> -/* - * Machine Check bits on power7 and power8 - */ -#define P7_SRR1_MC_LOADSTORE(srr1) ((srr1) & PPC_BIT(42)) /* P8 too */ - -/* SRR1 bits for machine check (On Power7 and Power8) */ -#define P7_SRR1_MC_IFETCH(srr1) ((srr1) & PPC_BITMASK(43, 45)) /* P8 too */ - -#define P7_SRR1_MC_IFETCH_UE (0x1 << PPC_BITLSHIFT(45)) /* P8 too */ -#define P7_SRR1_MC_IFETCH_SLB_PARITY (0x2 << PPC_BITLSHIFT(45)) /* P8 too */ -#define P7_SRR1_MC_IFETCH_SLB_MULTIHIT (0x3 << PPC_BITLSHIFT(45)) /* P8 too */ -#define P7_SRR1_MC_IFETCH_SLB_BOTH (0x4 << PPC_BITLSHIFT(45)) -#define P7_SRR1_MC_IFETCH_TLB_MULTIHIT (0x5 << PPC_BITLSHIFT(45)) /* P8 too */ -#define P7_SRR1_MC_IFETCH_UE_TLB_RELOAD (0x6 << PPC_BITLSHIFT(45)) /* P8 too */ -#define P7_SRR1_MC_IFETCH_UE_IFU_INTERNAL (0x7 << PPC_BITLSHIFT(45)) - -/* SRR1 bits for machine check (On Power8) */ -#define P8_SRR1_MC_IFETCH_ERAT_MULTIHIT (0x4 << PPC_BITLSHIFT(45)) - -/* DSISR bits for machine check (On Power7 and Power8) */ -#define P7_DSISR_MC_UE (PPC_BIT(48)) /* P8 too */ -#define P7_DSISR_MC_UE_TABLEWALK (PPC_BIT(49)) /* P8 too */ -#define P7_DSISR_MC_ERAT_MULTIHIT (PPC_BIT(52)) /* P8 too */ -#define P7_DSISR_MC_TLB_MULTIHIT_MFTLB (PPC_BIT(53)) /* P8 too */ -#define P7_DSISR_MC_SLB_PARITY_MFSLB (PPC_BIT(55)) /* P8 too */ -#define P7_DSISR_MC_SLB_MULTIHIT (PPC_BIT(56)) /* P8 too */ -#define P7_DSISR_MC_SLB_MULTIHIT_PARITY (PPC_BIT(57)) /* P8 too */ - -/* - * DSISR bits for machine check (Power8) in addition to above. - * Secondary DERAT Multihit - */ -#define P8_DSISR_MC_ERAT_MULTIHIT_SEC (PPC_BIT(54)) - -/* SLB error bits */ -#define P7_DSISR_MC_SLB_ERRORS (P7_DSISR_MC_ERAT_MULTIHIT | \ - P7_DSISR_MC_SLB_PARITY_MFSLB | \ - P7_DSISR_MC_SLB_MULTIHIT | \ - P7_DSISR_MC_SLB_MULTIHIT_PARITY) - -#define P8_DSISR_MC_SLB_ERRORS (P7_DSISR_MC_SLB_ERRORS | \ - P8_DSISR_MC_ERAT_MULTIHIT_SEC) - -/* - * Machine Check bits on power9 - */ -#define P9_SRR1_MC_LOADSTORE(srr1) (((srr1) >> PPC_BITLSHIFT(42)) & 1) - -#define P9_SRR1_MC_IFETCH(srr1) ( \ - PPC_BITEXTRACT(srr1, 45, 0) | \ - PPC_BITEXTRACT(srr1, 44, 1) | \ - PPC_BITEXTRACT(srr1, 43, 2) | \ - PPC_BITEXTRACT(srr1, 36, 3) ) - -/* 0 is reserved */ -#define P9_SRR1_MC_IFETCH_UE 1 -#define P9_SRR1_MC_IFETCH_SLB_PARITY 2 -#define P9_SRR1_MC_IFETCH_SLB_MULTIHIT 3 -#define P9_SRR1_MC_IFETCH_ERAT_MULTIHIT 4 -#define P9_SRR1_MC_IFETCH_TLB_MULTIHIT 5 -#define P9_SRR1_MC_IFETCH_UE_TLB_RELOAD 6 -/* 7 is reserved */ -#define P9_SRR1_MC_IFETCH_LINK_TIMEOUT 8 -#define P9_SRR1_MC_IFETCH_LINK_TABLEWALK_TIMEOUT 9 -/* 10 ? */ -#define P9_SRR1_MC_IFETCH_RA 11 -#define P9_SRR1_MC_IFETCH_RA_TABLEWALK 12 -#define P9_SRR1_MC_IFETCH_RA_ASYNC_STORE 13 -#define P9_SRR1_MC_IFETCH_LINK_ASYNC_STORE_TIMEOUT 14 -#define P9_SRR1_MC_IFETCH_RA_TABLEWALK_FOREIGN 15 - -/* DSISR bits for machine check (On Power9) */ -#define P9_DSISR_MC_UE (PPC_BIT(48)) -#define P9_DSISR_MC_UE_TABLEWALK (PPC_BIT(49)) -#define P9_DSISR_MC_LINK_LOAD_TIMEOUT (PPC_BIT(50)) -#define P9_DSISR_MC_LINK_TABLEWALK_TIMEOUT (PPC_BIT(51)) -#define P9_DSISR_MC_ERAT_MULTIHIT (PPC_BIT(52)) -#define P9_DSISR_MC_TLB_MULTIHIT_MFTLB (PPC_BIT(53)) -#define P9_DSISR_MC_USER_TLBIE (PPC_BIT(54)) -#define P9_DSISR_MC_SLB_PARITY_MFSLB (PPC_BIT(55)) -#define P9_DSISR_MC_SLB_MULTIHIT_MFSLB (PPC_BIT(56)) -#define P9_DSISR_MC_RA_LOAD (PPC_BIT(57)) -#define P9_DSISR_MC_RA_TABLEWALK (PPC_BIT(58)) -#define P9_DSISR_MC_RA_TABLEWALK_FOREIGN (PPC_BIT(59)) -#define P9_DSISR_MC_RA_FOREIGN (PPC_BIT(60)) - -/* SLB error bits */ -#define P9_DSISR_MC_SLB_ERRORS (P9_DSISR_MC_ERAT_MULTIHIT | \ - P9_DSISR_MC_SLB_PARITY_MFSLB | \ - P9_DSISR_MC_SLB_MULTIHIT_MFSLB) - enum MCE_Version { MCE_V1 = 1, }; @@ -298,7 +207,8 @@ extern void save_mce_event(struct pt_regs *regs, long handled, extern int get_mce_event(struct machine_check_event *mce, bool release); extern void release_mce_event(void); extern void machine_check_queue_event(void); -extern void machine_check_print_event_info(struct machine_check_event *evt); +extern void machine_check_print_event_info(struct machine_check_event *evt, + bool user_mode); extern uint64_t get_mce_fault_addr(struct machine_check_event *evt); #endif /* __ASM_PPC64_MCE_H__ */ diff --git a/arch/powerpc/include/asm/mmu-book3e.h b/arch/powerpc/include/asm/mmu-book3e.h index b62a8d43a06c..7ca8d8e80ffa 100644 --- a/arch/powerpc/include/asm/mmu-book3e.h +++ b/arch/powerpc/include/asm/mmu-book3e.h @@ -229,11 +229,6 @@ typedef struct { unsigned int id; unsigned int active; unsigned long vdso_base; -#ifdef CONFIG_PPC_MM_SLICES - u64 low_slices_psize; /* SLB page size encodings */ - u64 high_slices_psize; /* 4 bits per slice for now */ - u16 user_psize; /* page size index */ -#endif #ifdef CONFIG_PPC_64K_PAGES /* for 4K PTE fragment support */ void *pte_frag; diff --git a/arch/powerpc/include/asm/mmu.h b/arch/powerpc/include/asm/mmu.h index 065e762fae85..78260409dc9c 100644 --- a/arch/powerpc/include/asm/mmu.h +++ b/arch/powerpc/include/asm/mmu.h @@ -29,6 +29,10 @@ */ /* + * Support for 68 bit VA space. We added that from ISA 2.05 + */ +#define MMU_FTR_68_BIT_VA ASM_CONST(0x00002000) +/* * Kernel read only support. * We added the ppp value 0b110 in ISA 2.04. */ @@ -109,10 +113,10 @@ #define MMU_FTRS_POWER4 MMU_FTRS_DEFAULT_HPTE_ARCH_V2 #define MMU_FTRS_PPC970 MMU_FTRS_POWER4 | MMU_FTR_TLBIE_CROP_VA #define MMU_FTRS_POWER5 MMU_FTRS_POWER4 | MMU_FTR_LOCKLESS_TLBIE -#define MMU_FTRS_POWER6 MMU_FTRS_POWER4 | MMU_FTR_LOCKLESS_TLBIE | MMU_FTR_KERNEL_RO -#define MMU_FTRS_POWER7 MMU_FTRS_POWER4 | MMU_FTR_LOCKLESS_TLBIE | MMU_FTR_KERNEL_RO -#define MMU_FTRS_POWER8 MMU_FTRS_POWER4 | MMU_FTR_LOCKLESS_TLBIE | MMU_FTR_KERNEL_RO -#define MMU_FTRS_POWER9 MMU_FTRS_POWER4 | MMU_FTR_LOCKLESS_TLBIE | MMU_FTR_KERNEL_RO +#define MMU_FTRS_POWER6 MMU_FTRS_POWER5 | MMU_FTR_KERNEL_RO | MMU_FTR_68_BIT_VA +#define MMU_FTRS_POWER7 MMU_FTRS_POWER6 +#define MMU_FTRS_POWER8 MMU_FTRS_POWER6 +#define MMU_FTRS_POWER9 MMU_FTRS_POWER6 #define MMU_FTRS_CELL MMU_FTRS_DEFAULT_HPTE_ARCH_V2 | \ MMU_FTR_CI_LARGE_PAGE #define MMU_FTRS_PA6T MMU_FTRS_DEFAULT_HPTE_ARCH_V2 | \ @@ -136,7 +140,7 @@ enum { MMU_FTR_NO_SLBIE_B | MMU_FTR_16M_PAGE | MMU_FTR_TLBIEL | MMU_FTR_LOCKLESS_TLBIE | MMU_FTR_CI_LARGE_PAGE | MMU_FTR_1T_SEGMENT | MMU_FTR_TLBIE_CROP_VA | - MMU_FTR_KERNEL_RO | + MMU_FTR_KERNEL_RO | MMU_FTR_68_BIT_VA | #ifdef CONFIG_PPC_RADIX_MMU MMU_FTR_TYPE_RADIX | #endif @@ -290,7 +294,10 @@ static inline bool early_radix_enabled(void) #define MMU_PAGE_16G 14 #define MMU_PAGE_64G 15 -/* N.B. we need to change the type of hpte_page_sizes if this gets to be > 16 */ +/* + * N.B. we need to change the type of hpte_page_sizes if this gets to be > 16 + * Also we need to change he type of mm_context.low/high_slices_psize. + */ #define MMU_PAGE_COUNT 16 #ifdef CONFIG_PPC_BOOK3S_64 diff --git a/arch/powerpc/include/asm/mmu_context.h b/arch/powerpc/include/asm/mmu_context.h index c70c8272523d..da06e0496699 100644 --- a/arch/powerpc/include/asm/mmu_context.h +++ b/arch/powerpc/include/asm/mmu_context.h @@ -55,7 +55,8 @@ static inline void switch_mmu_context(struct mm_struct *prev, return switch_slb(tsk, next); } -extern int __init_new_context(void); +extern int hash__alloc_context_id(void); +extern void hash__reserve_context_id(int id); extern void __destroy_context(int context_id); static inline void mmu_context_init(void) { } #else @@ -74,8 +75,9 @@ extern void drop_cop(unsigned long acop, struct mm_struct *mm); * switch_mm is the entry point called from the architecture independent * code in kernel/sched/core.c */ -static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next, - struct task_struct *tsk) +static inline void switch_mm_irqs_off(struct mm_struct *prev, + struct mm_struct *next, + struct task_struct *tsk) { /* Mark this context has been used on the new CPU */ if (!cpumask_test_cpu(smp_processor_id(), mm_cpumask(next))) @@ -114,6 +116,18 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next, switch_mmu_context(prev, next, tsk); } +static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next, + struct task_struct *tsk) +{ + unsigned long flags; + + local_irq_save(flags); + switch_mm_irqs_off(prev, next, tsk); + local_irq_restore(flags); +} +#define switch_mm_irqs_off switch_mm_irqs_off + + #define deactivate_mm(tsk,mm) do { } while (0) /* diff --git a/arch/powerpc/include/asm/nohash/64/pgtable.h b/arch/powerpc/include/asm/nohash/64/pgtable.h index c7f927e67d14..f0ff384d4ca5 100644 --- a/arch/powerpc/include/asm/nohash/64/pgtable.h +++ b/arch/powerpc/include/asm/nohash/64/pgtable.h @@ -88,11 +88,6 @@ #include <asm/nohash/pte-book3e.h> #include <asm/pte-common.h> -#ifdef CONFIG_PPC_MM_SLICES -#define HAVE_ARCH_UNMAPPED_AREA -#define HAVE_ARCH_UNMAPPED_AREA_TOPDOWN -#endif /* CONFIG_PPC_MM_SLICES */ - #ifndef __ASSEMBLY__ /* pte_clear moved to later in this file */ diff --git a/arch/powerpc/include/asm/opal-api.h b/arch/powerpc/include/asm/opal-api.h index a0aa285869b5..cb3e6242a78c 100644 --- a/arch/powerpc/include/asm/opal-api.h +++ b/arch/powerpc/include/asm/opal-api.h @@ -40,6 +40,8 @@ #define OPAL_I2C_ARBT_LOST -22 #define OPAL_I2C_NACK_RCVD -23 #define OPAL_I2C_STOP_ERR -24 +#define OPAL_XIVE_PROVISIONING -31 +#define OPAL_XIVE_FREE_ACTIVE -32 /* API Tokens (in r0) */ #define OPAL_INVALID_CALL -1 @@ -168,7 +170,27 @@ #define OPAL_INT_SET_MFRR 125 #define OPAL_PCI_TCE_KILL 126 #define OPAL_NMMU_SET_PTCR 127 -#define OPAL_LAST 127 +#define OPAL_XIVE_RESET 128 +#define OPAL_XIVE_GET_IRQ_INFO 129 +#define OPAL_XIVE_GET_IRQ_CONFIG 130 +#define OPAL_XIVE_SET_IRQ_CONFIG 131 +#define OPAL_XIVE_GET_QUEUE_INFO 132 +#define OPAL_XIVE_SET_QUEUE_INFO 133 +#define OPAL_XIVE_DONATE_PAGE 134 +#define OPAL_XIVE_ALLOCATE_VP_BLOCK 135 +#define OPAL_XIVE_FREE_VP_BLOCK 136 +#define OPAL_XIVE_GET_VP_INFO 137 +#define OPAL_XIVE_SET_VP_INFO 138 +#define OPAL_XIVE_ALLOCATE_IRQ 139 +#define OPAL_XIVE_FREE_IRQ 140 +#define OPAL_XIVE_SYNC 141 +#define OPAL_XIVE_DUMP 142 +#define OPAL_XIVE_RESERVED3 143 +#define OPAL_XIVE_RESERVED4 144 +#define OPAL_NPU_INIT_CONTEXT 146 +#define OPAL_NPU_DESTROY_CONTEXT 147 +#define OPAL_NPU_MAP_LPAR 148 +#define OPAL_LAST 148 /* Device tree flags */ @@ -928,6 +950,59 @@ enum { OPAL_PCI_TCE_KILL_ALL, }; +/* The xive operation mode indicates the active "API" and + * corresponds to the "mode" parameter of the opal_xive_reset() + * call + */ +enum { + OPAL_XIVE_MODE_EMU = 0, + OPAL_XIVE_MODE_EXPL = 1, +}; + +/* Flags for OPAL_XIVE_GET_IRQ_INFO */ +enum { + OPAL_XIVE_IRQ_TRIGGER_PAGE = 0x00000001, + OPAL_XIVE_IRQ_STORE_EOI = 0x00000002, + OPAL_XIVE_IRQ_LSI = 0x00000004, + OPAL_XIVE_IRQ_SHIFT_BUG = 0x00000008, + OPAL_XIVE_IRQ_MASK_VIA_FW = 0x00000010, + OPAL_XIVE_IRQ_EOI_VIA_FW = 0x00000020, +}; + +/* Flags for OPAL_XIVE_GET/SET_QUEUE_INFO */ +enum { + OPAL_XIVE_EQ_ENABLED = 0x00000001, + OPAL_XIVE_EQ_ALWAYS_NOTIFY = 0x00000002, + OPAL_XIVE_EQ_ESCALATE = 0x00000004, +}; + +/* Flags for OPAL_XIVE_GET/SET_VP_INFO */ +enum { + OPAL_XIVE_VP_ENABLED = 0x00000001, +}; + +/* "Any chip" replacement for chip ID for allocation functions */ +enum { + OPAL_XIVE_ANY_CHIP = 0xffffffff, +}; + +/* Xive sync options */ +enum { + /* This bits are cumulative, arg is a girq */ + XIVE_SYNC_EAS = 0x00000001, /* Sync irq source */ + XIVE_SYNC_QUEUE = 0x00000002, /* Sync irq target */ +}; + +/* Dump options */ +enum { + XIVE_DUMP_TM_HYP = 0, + XIVE_DUMP_TM_POOL = 1, + XIVE_DUMP_TM_OS = 2, + XIVE_DUMP_TM_USER = 3, + XIVE_DUMP_VP = 4, + XIVE_DUMP_EMU_STATE = 5, +}; + #endif /* __ASSEMBLY__ */ #endif /* __OPAL_API_H */ diff --git a/arch/powerpc/include/asm/opal.h b/arch/powerpc/include/asm/opal.h index 1ff03a6da76e..588fb1c23af9 100644 --- a/arch/powerpc/include/asm/opal.h +++ b/arch/powerpc/include/asm/opal.h @@ -29,6 +29,11 @@ extern struct device_node *opal_node; /* API functions */ int64_t opal_invalid_call(void); +int64_t opal_npu_destroy_context(uint64_t phb_id, uint64_t pid, uint64_t bdf); +int64_t opal_npu_init_context(uint64_t phb_id, int pasid, uint64_t msr, + uint64_t bdf); +int64_t opal_npu_map_lpar(uint64_t phb_id, uint64_t bdf, uint64_t lparid, + uint64_t lpcr); int64_t opal_console_write(int64_t term_number, __be64 *length, const uint8_t *buffer); int64_t opal_console_read(int64_t term_number, __be64 *length, @@ -226,6 +231,42 @@ int64_t opal_pci_tce_kill(uint64_t phb_id, uint32_t kill_type, uint32_t pe_num, uint32_t tce_size, uint64_t dma_addr, uint32_t npages); int64_t opal_nmmu_set_ptcr(uint64_t chip_id, uint64_t ptcr); +int64_t opal_xive_reset(uint64_t version); +int64_t opal_xive_get_irq_info(uint32_t girq, + __be64 *out_flags, + __be64 *out_eoi_page, + __be64 *out_trig_page, + __be32 *out_esb_shift, + __be32 *out_src_chip); +int64_t opal_xive_get_irq_config(uint32_t girq, __be64 *out_vp, + uint8_t *out_prio, __be32 *out_lirq); +int64_t opal_xive_set_irq_config(uint32_t girq, uint64_t vp, uint8_t prio, + uint32_t lirq); +int64_t opal_xive_get_queue_info(uint64_t vp, uint32_t prio, + __be64 *out_qpage, + __be64 *out_qsize, + __be64 *out_qeoi_page, + __be32 *out_escalate_irq, + __be64 *out_qflags); +int64_t opal_xive_set_queue_info(uint64_t vp, uint32_t prio, + uint64_t qpage, + uint64_t qsize, + uint64_t qflags); +int64_t opal_xive_donate_page(uint32_t chip_id, uint64_t addr); +int64_t opal_xive_alloc_vp_block(uint32_t alloc_order); +int64_t opal_xive_free_vp_block(uint64_t vp); +int64_t opal_xive_get_vp_info(uint64_t vp, + __be64 *out_flags, + __be64 *out_cam_value, + __be64 *out_report_cl_pair, + __be32 *out_chip_id); +int64_t opal_xive_set_vp_info(uint64_t vp, + uint64_t flags, + uint64_t report_cl_pair); +int64_t opal_xive_allocate_irq(uint32_t chip_id); +int64_t opal_xive_free_irq(uint32_t girq); +int64_t opal_xive_sync(uint32_t type, uint32_t id); +int64_t opal_xive_dump(uint32_t type, uint32_t id); /* Internal functions */ extern int early_init_dt_scan_opal(unsigned long node, const char *uname, diff --git a/arch/powerpc/include/asm/paca.h b/arch/powerpc/include/asm/paca.h index 708c3e592eeb..140ddb9ae5a8 100644 --- a/arch/powerpc/include/asm/paca.h +++ b/arch/powerpc/include/asm/paca.h @@ -139,6 +139,7 @@ struct paca_struct { #ifdef CONFIG_PPC_MM_SLICES u64 mm_ctx_low_slices_psize; unsigned char mm_ctx_high_slices_psize[SLICE_ARRAY_SIZE]; + unsigned long addr_limit; #else u16 mm_ctx_user_psize; u16 mm_ctx_sllp; @@ -172,6 +173,11 @@ struct paca_struct { u8 thread_mask; /* Mask to denote subcore sibling threads */ u8 subcore_sibling_mask; + /* + * Pointer to an array which contains pointer + * to the sibling threads' paca. + */ + struct paca_struct **thread_sibling_pacas; #endif #ifdef CONFIG_PPC_BOOK3S_64 @@ -206,23 +212,7 @@ struct paca_struct { #endif }; -#ifdef CONFIG_PPC_BOOK3S -static inline void copy_mm_to_paca(mm_context_t *context) -{ - get_paca()->mm_ctx_id = context->id; -#ifdef CONFIG_PPC_MM_SLICES - get_paca()->mm_ctx_low_slices_psize = context->low_slices_psize; - memcpy(&get_paca()->mm_ctx_high_slices_psize, - &context->high_slices_psize, SLICE_ARRAY_SIZE); -#else - get_paca()->mm_ctx_user_psize = context->user_psize; - get_paca()->mm_ctx_sllp = context->sllp; -#endif -} -#else -static inline void copy_mm_to_paca(mm_context_t *context){} -#endif - +extern void copy_mm_to_paca(struct mm_struct *mm); extern struct paca_struct *paca; extern void initialise_paca(struct paca_struct *new_paca, int cpu); extern void setup_paca(struct paca_struct *new_paca); diff --git a/arch/powerpc/include/asm/page_64.h b/arch/powerpc/include/asm/page_64.h index 3e83d2a20b6f..c4d9654bd637 100644 --- a/arch/powerpc/include/asm/page_64.h +++ b/arch/powerpc/include/asm/page_64.h @@ -98,21 +98,7 @@ extern u64 ppc64_pft_size; #define GET_LOW_SLICE_INDEX(addr) ((addr) >> SLICE_LOW_SHIFT) #define GET_HIGH_SLICE_INDEX(addr) ((addr) >> SLICE_HIGH_SHIFT) -/* - * 1 bit per slice and we have one slice per 1TB - * Right now we support only 64TB. - * IF we change this we will have to change the type - * of high_slices - */ -#define SLICE_MASK_SIZE 8 - #ifndef __ASSEMBLY__ - -struct slice_mask { - u16 low_slices; - u64 high_slices; -}; - struct mm_struct; extern unsigned long slice_get_unmapped_area(unsigned long addr, diff --git a/arch/powerpc/include/asm/perf_event_server.h b/arch/powerpc/include/asm/perf_event_server.h index ae0a23091a9b..723bf48e7494 100644 --- a/arch/powerpc/include/asm/perf_event_server.h +++ b/arch/powerpc/include/asm/perf_event_server.h @@ -38,6 +38,9 @@ struct power_pmu { unsigned long *valp); int (*get_alternatives)(u64 event_id, unsigned int flags, u64 alt[]); + void (*get_mem_data_src)(union perf_mem_data_src *dsrc, + u32 flags, struct pt_regs *regs); + void (*get_mem_weight)(u64 *weight); u64 (*bhrb_filter_map)(u64 branch_sample_type); void (*config_bhrb)(u64 pmu_bhrb_filter); void (*disable_pmc)(unsigned int pmc, unsigned long mmcr[]); diff --git a/arch/powerpc/include/asm/powernv.h b/arch/powerpc/include/asm/powernv.h index 0e9c2402dd20..f62797702300 100644 --- a/arch/powerpc/include/asm/powernv.h +++ b/arch/powerpc/include/asm/powernv.h @@ -11,9 +11,31 @@ #define _ASM_POWERNV_H #ifdef CONFIG_PPC_POWERNV +#define NPU2_WRITE 1 extern void powernv_set_nmmu_ptcr(unsigned long ptcr); +extern struct npu_context *pnv_npu2_init_context(struct pci_dev *gpdev, + unsigned long flags, + struct npu_context *(*cb)(struct npu_context *, void *), + void *priv); +extern void pnv_npu2_destroy_context(struct npu_context *context, + struct pci_dev *gpdev); +extern int pnv_npu2_handle_fault(struct npu_context *context, uintptr_t *ea, + unsigned long *flags, unsigned long *status, + int count); #else static inline void powernv_set_nmmu_ptcr(unsigned long ptcr) { } +static inline struct npu_context *pnv_npu2_init_context(struct pci_dev *gpdev, + unsigned long flags, + struct npu_context *(*cb)(struct npu_context *, void *), + void *priv) { return ERR_PTR(-ENODEV); } +static inline void pnv_npu2_destroy_context(struct npu_context *context, + struct pci_dev *gpdev) { } + +static inline int pnv_npu2_handle_fault(struct npu_context *context, + uintptr_t *ea, unsigned long *flags, + unsigned long *status, int count) { + return -ENODEV; +} #endif #endif /* _ASM_POWERNV_H */ diff --git a/arch/powerpc/include/asm/ppc-opcode.h b/arch/powerpc/include/asm/ppc-opcode.h index e7d6d86563ee..142d78d645f4 100644 --- a/arch/powerpc/include/asm/ppc-opcode.h +++ b/arch/powerpc/include/asm/ppc-opcode.h @@ -161,6 +161,7 @@ #define PPC_INST_MFTMR 0x7c0002dc #define PPC_INST_MSGSND 0x7c00019c #define PPC_INST_MSGCLR 0x7c0001dc +#define PPC_INST_MSGSYNC 0x7c0006ec #define PPC_INST_MSGSNDP 0x7c00011c #define PPC_INST_MTTMR 0x7c0003dc #define PPC_INST_NOP 0x60000000 @@ -345,6 +346,7 @@ ___PPC_RB(b) | __PPC_EH(eh)) #define PPC_MSGSND(b) stringify_in_c(.long PPC_INST_MSGSND | \ ___PPC_RB(b)) +#define PPC_MSGSYNC stringify_in_c(.long PPC_INST_MSGSYNC) #define PPC_MSGCLR(b) stringify_in_c(.long PPC_INST_MSGCLR | \ ___PPC_RB(b)) #define PPC_MSGSNDP(b) stringify_in_c(.long PPC_INST_MSGSNDP | \ diff --git a/arch/powerpc/include/asm/processor.h b/arch/powerpc/include/asm/processor.h index e0fecbcea2a2..a4b1d8d6b793 100644 --- a/arch/powerpc/include/asm/processor.h +++ b/arch/powerpc/include/asm/processor.h @@ -102,11 +102,25 @@ void release_thread(struct task_struct *); #endif #ifdef CONFIG_PPC64 -/* 64-bit user address space is 46-bits (64TB user VM) */ -#define TASK_SIZE_USER64 (0x0000400000000000UL) +/* + * 64-bit user address space can have multiple limits + * For now supported values are: + */ +#define TASK_SIZE_64TB (0x0000400000000000UL) +#define TASK_SIZE_128TB (0x0000800000000000UL) +#define TASK_SIZE_512TB (0x0002000000000000UL) + +#ifdef CONFIG_PPC_BOOK3S_64 +/* + * Max value currently used: + */ +#define TASK_SIZE_USER64 TASK_SIZE_512TB +#else +#define TASK_SIZE_USER64 TASK_SIZE_64TB +#endif -/* - * 32-bit user address space is 4GB - 1 page +/* + * 32-bit user address space is 4GB - 1 page * (this 1 page is needed so referencing of 0xFFFFFFFF generates EFAULT */ #define TASK_SIZE_USER32 (0x0000000100000000UL - (1*PAGE_SIZE)) @@ -114,26 +128,37 @@ void release_thread(struct task_struct *); #define TASK_SIZE_OF(tsk) (test_tsk_thread_flag(tsk, TIF_32BIT) ? \ TASK_SIZE_USER32 : TASK_SIZE_USER64) #define TASK_SIZE TASK_SIZE_OF(current) - /* This decides where the kernel will search for a free chunk of vm * space during mmap's. */ #define TASK_UNMAPPED_BASE_USER32 (PAGE_ALIGN(TASK_SIZE_USER32 / 4)) -#define TASK_UNMAPPED_BASE_USER64 (PAGE_ALIGN(TASK_SIZE_USER64 / 4)) +#define TASK_UNMAPPED_BASE_USER64 (PAGE_ALIGN(TASK_SIZE_128TB / 4)) #define TASK_UNMAPPED_BASE ((is_32bit_task()) ? \ TASK_UNMAPPED_BASE_USER32 : TASK_UNMAPPED_BASE_USER64 ) #endif +/* + * Initial task size value for user applications. For book3s 64 we start + * with 128TB and conditionally enable upto 512TB + */ +#ifdef CONFIG_PPC_BOOK3S_64 +#define DEFAULT_MAP_WINDOW ((is_32bit_task()) ? \ + TASK_SIZE_USER32 : TASK_SIZE_128TB) +#else +#define DEFAULT_MAP_WINDOW TASK_SIZE +#endif + #ifdef __powerpc64__ -#define STACK_TOP_USER64 TASK_SIZE_USER64 +/* Limit stack to 128TB */ +#define STACK_TOP_USER64 TASK_SIZE_128TB #define STACK_TOP_USER32 TASK_SIZE_USER32 #define STACK_TOP (is_32bit_task() ? \ STACK_TOP_USER32 : STACK_TOP_USER64) -#define STACK_TOP_MAX STACK_TOP_USER64 +#define STACK_TOP_MAX TASK_SIZE_USER64 #else /* __powerpc64__ */ diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h index fc879fd6bdae..d4f653c9259a 100644 --- a/arch/powerpc/include/asm/reg.h +++ b/arch/powerpc/include/asm/reg.h @@ -310,6 +310,7 @@ #define SPRN_PMCR 0x374 /* Power Management Control Register */ /* HFSCR and FSCR bit numbers are the same */ +#define FSCR_SCV_LG 12 /* Enable System Call Vectored */ #define FSCR_MSGP_LG 10 /* Enable MSGP */ #define FSCR_TAR_LG 8 /* Enable Target Address Register */ #define FSCR_EBB_LG 7 /* Enable Event Based Branching */ @@ -320,6 +321,7 @@ #define FSCR_VECVSX_LG 1 /* Enable VMX/VSX */ #define FSCR_FP_LG 0 /* Enable Floating Point */ #define SPRN_FSCR 0x099 /* Facility Status & Control Register */ +#define FSCR_SCV __MASK(FSCR_SCV_LG) #define FSCR_TAR __MASK(FSCR_TAR_LG) #define FSCR_EBB __MASK(FSCR_EBB_LG) #define FSCR_DSCR __MASK(FSCR_DSCR_LG) @@ -365,6 +367,7 @@ #define LPCR_MER_SH 11 #define LPCR_GTSE ASM_CONST(0x0000000000000400) /* Guest Translation Shootdown Enable */ #define LPCR_TC ASM_CONST(0x0000000000000200) /* Translation control */ +#define LPCR_HEIC ASM_CONST(0x0000000000000010) /* Hypervisor External Interrupt Control */ #define LPCR_LPES 0x0000000c #define LPCR_LPES0 ASM_CONST(0x0000000000000008) /* LPAR Env selector 0 */ #define LPCR_LPES1 ASM_CONST(0x0000000000000004) /* LPAR Env selector 1 */ @@ -656,6 +659,7 @@ #define SRR1_ISI_PROT 0x08000000 /* ISI: Other protection fault */ #define SRR1_WAKEMASK 0x00380000 /* reason for wakeup */ #define SRR1_WAKEMASK_P8 0x003c0000 /* reason for wakeup on POWER8 and 9 */ +#define SRR1_WAKEMCE_RESVD 0x003c0000 /* Unused/reserved value used by MCE wakeup to indicate cause to idle wakeup handler */ #define SRR1_WAKESYSERR 0x00300000 /* System error */ #define SRR1_WAKEEE 0x00200000 /* External interrupt */ #define SRR1_WAKEHVI 0x00240000 /* Hypervisor Virtualization Interrupt (P9) */ diff --git a/arch/powerpc/include/asm/sections.h b/arch/powerpc/include/asm/sections.h index 7dc006b58369..7902d6358854 100644 --- a/arch/powerpc/include/asm/sections.h +++ b/arch/powerpc/include/asm/sections.h @@ -6,6 +6,8 @@ #include <linux/uaccess.h> #include <asm-generic/sections.h> +extern char __head_end[]; + #ifdef __powerpc64__ extern char __start_interrupts[]; diff --git a/arch/powerpc/include/asm/smp.h b/arch/powerpc/include/asm/smp.h index 32db16d2e7ad..751b2bd944fc 100644 --- a/arch/powerpc/include/asm/smp.h +++ b/arch/powerpc/include/asm/smp.h @@ -40,10 +40,11 @@ extern int cpu_to_chip_id(int cpu); struct smp_ops_t { void (*message_pass)(int cpu, int msg); #ifdef CONFIG_PPC_SMP_MUXED_IPI - void (*cause_ipi)(int cpu, unsigned long data); + void (*cause_ipi)(int cpu); #endif void (*probe)(void); int (*kick_cpu)(int nr); + int (*prepare_cpu)(int nr); void (*setup_cpu)(int nr); void (*bringup_done)(void); void (*take_timebase)(void); @@ -61,7 +62,6 @@ extern void smp_generic_take_timebase(void); DECLARE_PER_CPU(unsigned int, cpu_pvr); #ifdef CONFIG_HOTPLUG_CPU -extern void migrate_irqs(void); int generic_cpu_disable(void); void generic_cpu_die(unsigned int cpu); void generic_set_cpu_dead(unsigned int cpu); @@ -125,10 +125,10 @@ extern int smp_request_message_ipi(int virq, int message); extern const char *smp_ipi_name[]; /* for irq controllers with only a single ipi */ -extern void smp_muxed_ipi_set_data(int cpu, unsigned long data); extern void smp_muxed_ipi_message_pass(int cpu, int msg); extern void smp_muxed_ipi_set_message(int cpu, int msg); extern irqreturn_t smp_ipi_demux(void); +extern irqreturn_t smp_ipi_demux_relaxed(void); void smp_init_pSeries(void); void smp_init_cell(void); diff --git a/arch/powerpc/include/asm/syscalls.h b/arch/powerpc/include/asm/syscalls.h index 23be8f1e7e64..16fab6898240 100644 --- a/arch/powerpc/include/asm/syscalls.h +++ b/arch/powerpc/include/asm/syscalls.h @@ -8,10 +8,10 @@ struct rtas_args; -asmlinkage unsigned long sys_mmap(unsigned long addr, size_t len, +asmlinkage long sys_mmap(unsigned long addr, size_t len, unsigned long prot, unsigned long flags, unsigned long fd, off_t offset); -asmlinkage unsigned long sys_mmap2(unsigned long addr, size_t len, +asmlinkage long sys_mmap2(unsigned long addr, size_t len, unsigned long prot, unsigned long flags, unsigned long fd, unsigned long pgoff); asmlinkage long ppc64_personality(unsigned long personality); diff --git a/arch/powerpc/include/asm/thread_info.h b/arch/powerpc/include/asm/thread_info.h index 87e4b2d8dcd4..2e17d668c472 100644 --- a/arch/powerpc/include/asm/thread_info.h +++ b/arch/powerpc/include/asm/thread_info.h @@ -10,15 +10,7 @@ #ifdef __KERNEL__ -/* We have 8k stacks on ppc32 and 16k on ppc64 */ - -#if defined(CONFIG_PPC64) -#define THREAD_SHIFT 14 -#elif defined(CONFIG_PPC_256K_PAGES) -#define THREAD_SHIFT 15 -#else -#define THREAD_SHIFT 13 -#endif +#define THREAD_SHIFT CONFIG_THREAD_SHIFT #define THREAD_SIZE (1 << THREAD_SHIFT) diff --git a/arch/powerpc/include/asm/xics.h b/arch/powerpc/include/asm/xics.h index e0b9e576905a..7ce2c3ac2964 100644 --- a/arch/powerpc/include/asm/xics.h +++ b/arch/powerpc/include/asm/xics.h @@ -57,7 +57,7 @@ struct icp_ops { void (*teardown_cpu)(void); void (*flush_ipi)(void); #ifdef CONFIG_SMP - void (*cause_ipi)(int cpu, unsigned long data); + void (*cause_ipi)(int cpu); irq_handler_t ipi_action; #endif }; diff --git a/arch/powerpc/include/asm/xive-regs.h b/arch/powerpc/include/asm/xive-regs.h new file mode 100644 index 000000000000..1d3f2be5ae39 --- /dev/null +++ b/arch/powerpc/include/asm/xive-regs.h @@ -0,0 +1,97 @@ +/* + * Copyright 2016,2017 IBM Corporation. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ +#ifndef _ASM_POWERPC_XIVE_REGS_H +#define _ASM_POWERPC_XIVE_REGS_H + +/* + * Thread Management (aka "TM") registers + */ + +/* TM register offsets */ +#define TM_QW0_USER 0x000 /* All rings */ +#define TM_QW1_OS 0x010 /* Ring 0..2 */ +#define TM_QW2_HV_POOL 0x020 /* Ring 0..1 */ +#define TM_QW3_HV_PHYS 0x030 /* Ring 0..1 */ + +/* Byte offsets inside a QW QW0 QW1 QW2 QW3 */ +#define TM_NSR 0x0 /* + + - + */ +#define TM_CPPR 0x1 /* - + - + */ +#define TM_IPB 0x2 /* - + + + */ +#define TM_LSMFB 0x3 /* - + + + */ +#define TM_ACK_CNT 0x4 /* - + - - */ +#define TM_INC 0x5 /* - + - + */ +#define TM_AGE 0x6 /* - + - + */ +#define TM_PIPR 0x7 /* - + - + */ + +#define TM_WORD0 0x0 +#define TM_WORD1 0x4 + +/* + * QW word 2 contains the valid bit at the top and other fields + * depending on the QW. + */ +#define TM_WORD2 0x8 +#define TM_QW0W2_VU PPC_BIT32(0) +#define TM_QW0W2_LOGIC_SERV PPC_BITMASK32(1,31) // XX 2,31 ? +#define TM_QW1W2_VO PPC_BIT32(0) +#define TM_QW1W2_OS_CAM PPC_BITMASK32(8,31) +#define TM_QW2W2_VP PPC_BIT32(0) +#define TM_QW2W2_POOL_CAM PPC_BITMASK32(8,31) +#define TM_QW3W2_VT PPC_BIT32(0) +#define TM_QW3W2_LP PPC_BIT32(6) +#define TM_QW3W2_LE PPC_BIT32(7) +#define TM_QW3W2_T PPC_BIT32(31) + +/* + * In addition to normal loads to "peek" and writes (only when invalid) + * using 4 and 8 bytes accesses, the above registers support these + * "special" byte operations: + * + * - Byte load from QW0[NSR] - User level NSR (EBB) + * - Byte store to QW0[NSR] - User level NSR (EBB) + * - Byte load/store to QW1[CPPR] and QW3[CPPR] - CPPR access + * - Byte load from QW3[TM_WORD2] - Read VT||00000||LP||LE on thrd 0 + * otherwise VT||0000000 + * - Byte store to QW3[TM_WORD2] - Set VT bit (and LP/LE if present) + * + * Then we have all these "special" CI ops at these offset that trigger + * all sorts of side effects: + */ +#define TM_SPC_ACK_EBB 0x800 /* Load8 ack EBB to reg*/ +#define TM_SPC_ACK_OS_REG 0x810 /* Load16 ack OS irq to reg */ +#define TM_SPC_PUSH_USR_CTX 0x808 /* Store32 Push/Validate user context */ +#define TM_SPC_PULL_USR_CTX 0x808 /* Load32 Pull/Invalidate user context */ +#define TM_SPC_SET_OS_PENDING 0x812 /* Store8 Set OS irq pending bit */ +#define TM_SPC_PULL_OS_CTX 0x818 /* Load32/Load64 Pull/Invalidate OS context to reg */ +#define TM_SPC_PULL_POOL_CTX 0x828 /* Load32/Load64 Pull/Invalidate Pool context to reg*/ +#define TM_SPC_ACK_HV_REG 0x830 /* Load16 ack HV irq to reg */ +#define TM_SPC_PULL_USR_CTX_OL 0xc08 /* Store8 Pull/Inval usr ctx to odd line */ +#define TM_SPC_ACK_OS_EL 0xc10 /* Store8 ack OS irq to even line */ +#define TM_SPC_ACK_HV_POOL_EL 0xc20 /* Store8 ack HV evt pool to even line */ +#define TM_SPC_ACK_HV_EL 0xc30 /* Store8 ack HV irq to even line */ +/* XXX more... */ + +/* NSR fields for the various QW ack types */ +#define TM_QW0_NSR_EB PPC_BIT8(0) +#define TM_QW1_NSR_EO PPC_BIT8(0) +#define TM_QW3_NSR_HE PPC_BITMASK8(0,1) +#define TM_QW3_NSR_HE_NONE 0 +#define TM_QW3_NSR_HE_POOL 1 +#define TM_QW3_NSR_HE_PHYS 2 +#define TM_QW3_NSR_HE_LSI 3 +#define TM_QW3_NSR_I PPC_BIT8(2) +#define TM_QW3_NSR_GRP_LVL PPC_BIT8(3,7) + +/* Utilities to manipulate these (originaly from OPAL) */ +#define MASK_TO_LSH(m) (__builtin_ffsl(m) - 1) +#define GETFIELD(m, v) (((v) & (m)) >> MASK_TO_LSH(m)) +#define SETFIELD(m, v, val) \ + (((v) & ~(m)) | ((((typeof(v))(val)) << MASK_TO_LSH(m)) & (m))) + +#endif /* _ASM_POWERPC_XIVE_REGS_H */ diff --git a/arch/powerpc/include/asm/xive.h b/arch/powerpc/include/asm/xive.h new file mode 100644 index 000000000000..3cdbeaeac397 --- /dev/null +++ b/arch/powerpc/include/asm/xive.h @@ -0,0 +1,163 @@ +/* + * Copyright 2016,2017 IBM Corporation. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ +#ifndef _ASM_POWERPC_XIVE_H +#define _ASM_POWERPC_XIVE_H + +#define XIVE_INVALID_VP 0xffffffff + +#ifdef CONFIG_PPC_XIVE + +/* + * Thread Interrupt Management Area (TIMA) + * + * This is a global MMIO region divided in 4 pages of varying access + * permissions, providing access to per-cpu interrupt management + * functions. It always identifies the CPU doing the access based + * on the PowerBus initiator ID, thus we always access via the + * same offset regardless of where the code is executing + */ +extern void __iomem *xive_tima; + +/* + * Offset in the TM area of our current execution level (provided by + * the backend) + */ +extern u32 xive_tima_offset; + +/* + * Per-irq data (irq_get_handler_data for normal IRQs), IPIs + * have it stored in the xive_cpu structure. We also cache + * for normal interrupts the current target CPU. + * + * This structure is setup by the backend for each interrupt. + */ +struct xive_irq_data { + u64 flags; + u64 eoi_page; + void __iomem *eoi_mmio; + u64 trig_page; + void __iomem *trig_mmio; + u32 esb_shift; + int src_chip; + + /* Setup/used by frontend */ + int target; + bool saved_p; +}; +#define XIVE_IRQ_FLAG_STORE_EOI 0x01 +#define XIVE_IRQ_FLAG_LSI 0x02 +#define XIVE_IRQ_FLAG_SHIFT_BUG 0x04 +#define XIVE_IRQ_FLAG_MASK_FW 0x08 +#define XIVE_IRQ_FLAG_EOI_FW 0x10 + +#define XIVE_INVALID_CHIP_ID -1 + +/* A queue tracking structure in a CPU */ +struct xive_q { + __be32 *qpage; + u32 msk; + u32 idx; + u32 toggle; + u64 eoi_phys; + u32 esc_irq; + atomic_t count; + atomic_t pending_count; +}; + +/* + * "magic" Event State Buffer (ESB) MMIO offsets. + * + * Each interrupt source has a 2-bit state machine called ESB + * which can be controlled by MMIO. It's made of 2 bits, P and + * Q. P indicates that an interrupt is pending (has been sent + * to a queue and is waiting for an EOI). Q indicates that the + * interrupt has been triggered while pending. + * + * This acts as a coalescing mechanism in order to guarantee + * that a given interrupt only occurs at most once in a queue. + * + * When doing an EOI, the Q bit will indicate if the interrupt + * needs to be re-triggered. + * + * The following offsets into the ESB MMIO allow to read or + * manipulate the PQ bits. They must be used with an 8-bytes + * load instruction. They all return the previous state of the + * interrupt (atomically). + * + * Additionally, some ESB pages support doing an EOI via a + * store at 0 and some ESBs support doing a trigger via a + * separate trigger page. + */ +#define XIVE_ESB_GET 0x800 +#define XIVE_ESB_SET_PQ_00 0xc00 +#define XIVE_ESB_SET_PQ_01 0xd00 +#define XIVE_ESB_SET_PQ_10 0xe00 +#define XIVE_ESB_SET_PQ_11 0xf00 +#define XIVE_ESB_MASK XIVE_ESB_SET_PQ_01 + +#define XIVE_ESB_VAL_P 0x2 +#define XIVE_ESB_VAL_Q 0x1 + +/* Global enable flags for the XIVE support */ +extern bool __xive_enabled; + +static inline bool xive_enabled(void) { return __xive_enabled; } + +extern bool xive_native_init(void); +extern void xive_smp_probe(void); +extern int xive_smp_prepare_cpu(unsigned int cpu); +extern void xive_smp_setup_cpu(void); +extern void xive_smp_disable_cpu(void); +extern void xive_kexec_teardown_cpu(int secondary); +extern void xive_shutdown(void); +extern void xive_flush_interrupt(void); + +/* xmon hook */ +extern void xmon_xive_do_dump(int cpu); + +/* APIs used by KVM */ +extern u32 xive_native_default_eq_shift(void); +extern u32 xive_native_alloc_vp_block(u32 max_vcpus); +extern void xive_native_free_vp_block(u32 vp_base); +extern int xive_native_populate_irq_data(u32 hw_irq, + struct xive_irq_data *data); +extern void xive_cleanup_irq_data(struct xive_irq_data *xd); +extern u32 xive_native_alloc_irq(void); +extern void xive_native_free_irq(u32 irq); +extern int xive_native_configure_irq(u32 hw_irq, u32 target, u8 prio, u32 sw_irq); + +extern int xive_native_configure_queue(u32 vp_id, struct xive_q *q, u8 prio, + __be32 *qpage, u32 order, bool can_escalate); +extern void xive_native_disable_queue(u32 vp_id, struct xive_q *q, u8 prio); + +extern bool __xive_irq_trigger(struct xive_irq_data *xd); +extern bool __xive_irq_retrigger(struct xive_irq_data *xd); +extern void xive_do_source_eoi(u32 hw_irq, struct xive_irq_data *xd); + +extern bool is_xive_irq(struct irq_chip *chip); + +#else + +static inline bool xive_enabled(void) { return false; } + +static inline bool xive_native_init(void) { return false; } +static inline void xive_smp_probe(void) { } +extern inline int xive_smp_prepare_cpu(unsigned int cpu) { return -EINVAL; } +static inline void xive_smp_setup_cpu(void) { } +static inline void xive_smp_disable_cpu(void) { } +static inline void xive_kexec_teardown_cpu(int secondary) { } +static inline void xive_shutdown(void) { } +static inline void xive_flush_interrupt(void) { } + +static inline u32 xive_native_alloc_vp_block(u32 max_vcpus) { return XIVE_INVALID_VP; } +static inline void xive_native_free_vp_block(u32 vp_base) { } + +#endif + +#endif /* _ASM_POWERPC_XIVE_H */ diff --git a/arch/powerpc/include/asm/xmon.h b/arch/powerpc/include/asm/xmon.h index 5eb8e599e5cc..eb42a0c6e1d9 100644 --- a/arch/powerpc/include/asm/xmon.h +++ b/arch/powerpc/include/asm/xmon.h @@ -29,5 +29,7 @@ static inline void xmon_register_spus(struct list_head *list) { }; extern int cpus_are_in_xmon(void); #endif +extern void xmon_printf(const char *format, ...); + #endif /* __KERNEL __ */ #endif /* __ASM_POWERPC_XMON_H */ diff --git a/arch/powerpc/include/uapi/asm/mman.h b/arch/powerpc/include/uapi/asm/mman.h index 03c06ba7464f..ab45cc2f3101 100644 --- a/arch/powerpc/include/uapi/asm/mman.h +++ b/arch/powerpc/include/uapi/asm/mman.h @@ -29,4 +29,20 @@ #define MAP_STACK 0x20000 /* give out an address that is best suited for process/thread stacks */ #define MAP_HUGETLB 0x40000 /* create a huge page mapping */ +/* + * When MAP_HUGETLB is set, bits [26:31] of the flags argument to mmap(2), + * encode the log2 of the huge page size. A value of zero indicates that the + * default huge page size should be used. To use a non-default huge page size, + * one of these defines can be used, or the size can be encoded by hand. Note + * that on most systems only a subset, or possibly none, of these sizes will be + * available. + */ +#define MAP_HUGE_512KB (19 << MAP_HUGE_SHIFT) /* 512KB HugeTLB Page */ +#define MAP_HUGE_1MB (20 << MAP_HUGE_SHIFT) /* 1MB HugeTLB Page */ +#define MAP_HUGE_2MB (21 << MAP_HUGE_SHIFT) /* 2MB HugeTLB Page */ +#define MAP_HUGE_8MB (23 << MAP_HUGE_SHIFT) /* 8MB HugeTLB Page */ +#define MAP_HUGE_16MB (24 << MAP_HUGE_SHIFT) /* 16MB HugeTLB Page */ +#define MAP_HUGE_1GB (30 << MAP_HUGE_SHIFT) /* 1GB HugeTLB Page */ +#define MAP_HUGE_16GB (34 << MAP_HUGE_SHIFT) /* 16GB HugeTLB Page */ + #endif /* _UAPI_ASM_POWERPC_MMAN_H */ diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile index 811f441a125f..b9db46ae545b 100644 --- a/arch/powerpc/kernel/Makefile +++ b/arch/powerpc/kernel/Makefile @@ -25,8 +25,6 @@ CFLAGS_REMOVE_cputable.o = -mno-sched-epilog $(CC_FLAGS_FTRACE) CFLAGS_REMOVE_prom_init.o = -mno-sched-epilog $(CC_FLAGS_FTRACE) CFLAGS_REMOVE_btext.o = -mno-sched-epilog $(CC_FLAGS_FTRACE) CFLAGS_REMOVE_prom.o = -mno-sched-epilog $(CC_FLAGS_FTRACE) -# do not trace tracer code -CFLAGS_REMOVE_ftrace.o = -mno-sched-epilog $(CC_FLAGS_FTRACE) # timers used by tracing CFLAGS_REMOVE_time.o = -mno-sched-epilog $(CC_FLAGS_FTRACE) endif @@ -97,6 +95,7 @@ obj-$(CONFIG_BOOTX_TEXT) += btext.o obj-$(CONFIG_SMP) += smp.o obj-$(CONFIG_KPROBES) += kprobes.o obj-$(CONFIG_OPTPROBES) += optprobes.o optprobes_head.o +obj-$(CONFIG_KPROBES_ON_FTRACE) += kprobes-ftrace.o obj-$(CONFIG_UPROBES) += uprobes.o obj-$(CONFIG_PPC_UDBG_16550) += legacy_serial.o udbg_16550.o obj-$(CONFIG_STACKTRACE) += stacktrace.o @@ -118,10 +117,7 @@ obj64-$(CONFIG_AUDIT) += compat_audit.o obj-$(CONFIG_PPC_IO_WORKAROUNDS) += io-workarounds.o -obj-$(CONFIG_DYNAMIC_FTRACE) += ftrace.o -obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += ftrace.o -obj-$(CONFIG_FTRACE_SYSCALLS) += ftrace.o -obj-$(CONFIG_TRACING) += trace_clock.o +obj-y += trace/ ifneq ($(CONFIG_PPC_INDIRECT_PIO),y) obj-y += iomap.o @@ -142,14 +138,14 @@ obj-$(CONFIG_KVM_GUEST) += kvm.o kvm_emul.o # Disable GCOV & sanitizers in odd or sensitive code GCOV_PROFILE_prom_init.o := n UBSAN_SANITIZE_prom_init.o := n -GCOV_PROFILE_ftrace.o := n -UBSAN_SANITIZE_ftrace.o := n GCOV_PROFILE_machine_kexec_64.o := n UBSAN_SANITIZE_machine_kexec_64.o := n GCOV_PROFILE_machine_kexec_32.o := n UBSAN_SANITIZE_machine_kexec_32.o := n GCOV_PROFILE_kprobes.o := n UBSAN_SANITIZE_kprobes.o := n +GCOV_PROFILE_kprobes-ftrace.o := n +UBSAN_SANITIZE_kprobes-ftrace.o := n UBSAN_SANITIZE_vdso.o := n extra-$(CONFIG_PPC_FPU) += fpu.o diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c index 4367e7df51a1..8e1163426ccb 100644 --- a/arch/powerpc/kernel/asm-offsets.c +++ b/arch/powerpc/kernel/asm-offsets.c @@ -185,6 +185,7 @@ int main(void) #ifdef CONFIG_PPC_MM_SLICES OFFSET(PACALOWSLICESPSIZE, paca_struct, mm_ctx_low_slices_psize); OFFSET(PACAHIGHSLICEPSIZE, paca_struct, mm_ctx_high_slices_psize); + DEFINE(PACA_ADDR_LIMIT, offsetof(struct paca_struct, addr_limit)); DEFINE(MMUPSIZEDEFSIZE, sizeof(struct mmu_psize_def)); #endif /* CONFIG_PPC_MM_SLICES */ #endif @@ -399,8 +400,8 @@ int main(void) DEFINE(BUG_ENTRY_SIZE, sizeof(struct bug_entry)); #endif -#ifdef MAX_PGD_TABLE_SIZE - DEFINE(PGD_TABLE_SIZE, MAX_PGD_TABLE_SIZE); +#ifdef CONFIG_PPC_BOOK3S_64 + DEFINE(PGD_TABLE_SIZE, (sizeof(pgd_t) << max(RADIX_PGD_INDEX_SIZE, H_PGD_INDEX_SIZE))); #else DEFINE(PGD_TABLE_SIZE, PGD_TABLE_SIZE); #endif @@ -727,6 +728,7 @@ int main(void) OFFSET(PACA_THREAD_IDLE_STATE, paca_struct, thread_idle_state); OFFSET(PACA_THREAD_MASK, paca_struct, thread_mask); OFFSET(PACA_SUBCORE_SIBLING_MASK, paca_struct, subcore_sibling_mask); + OFFSET(PACA_SIBLING_PACA_PTRS, paca_struct, thread_sibling_pacas); #endif DEFINE(PPC_DBELL_SERVER, PPC_DBELL_SERVER); diff --git a/arch/powerpc/kernel/cpu_setup_power.S b/arch/powerpc/kernel/cpu_setup_power.S index 7fe8c79e6937..1fce4ddd2e6c 100644 --- a/arch/powerpc/kernel/cpu_setup_power.S +++ b/arch/powerpc/kernel/cpu_setup_power.S @@ -29,6 +29,7 @@ _GLOBAL(__setup_cpu_power7) li r0,0 mtspr SPRN_LPID,r0 mfspr r3,SPRN_LPCR + li r4,(LPCR_LPES1 >> LPCR_LPES_SH) bl __init_LPCR bl __init_tlb_power7 mtlr r11 @@ -42,6 +43,7 @@ _GLOBAL(__restore_cpu_power7) li r0,0 mtspr SPRN_LPID,r0 mfspr r3,SPRN_LPCR + li r4,(LPCR_LPES1 >> LPCR_LPES_SH) bl __init_LPCR bl __init_tlb_power7 mtlr r11 @@ -59,6 +61,7 @@ _GLOBAL(__setup_cpu_power8) mtspr SPRN_LPID,r0 mfspr r3,SPRN_LPCR ori r3, r3, LPCR_PECEDH + li r4,0 /* LPES = 0 */ bl __init_LPCR bl __init_HFSCR bl __init_tlb_power8 @@ -80,6 +83,7 @@ _GLOBAL(__restore_cpu_power8) mtspr SPRN_LPID,r0 mfspr r3,SPRN_LPCR ori r3, r3, LPCR_PECEDH + li r4,0 /* LPES = 0 */ bl __init_LPCR bl __init_HFSCR bl __init_tlb_power8 @@ -99,10 +103,11 @@ _GLOBAL(__setup_cpu_power9) mtspr SPRN_PSSCR,r0 mtspr SPRN_LPID,r0 mfspr r3,SPRN_LPCR - LOAD_REG_IMMEDIATE(r4, LPCR_PECEDH | LPCR_PECE_HVEE | LPCR_HVICE) + LOAD_REG_IMMEDIATE(r4, LPCR_PECEDH | LPCR_PECE_HVEE | LPCR_HVICE | LPCR_HEIC) or r3, r3, r4 LOAD_REG_IMMEDIATE(r4, LPCR_UPRT | LPCR_HR) andc r3, r3, r4 + li r4,0 /* LPES = 0 */ bl __init_LPCR bl __init_HFSCR bl __init_tlb_power9 @@ -122,10 +127,11 @@ _GLOBAL(__restore_cpu_power9) mtspr SPRN_PSSCR,r0 mtspr SPRN_LPID,r0 mfspr r3,SPRN_LPCR - LOAD_REG_IMMEDIATE(r4, LPCR_PECEDH | LPCR_PECE_HVEE | LPCR_HVICE) + LOAD_REG_IMMEDIATE(r4, LPCR_PECEDH | LPCR_PECE_HVEE | LPCR_HVICE | LPCR_HEIC) or r3, r3, r4 LOAD_REG_IMMEDIATE(r4, LPCR_UPRT | LPCR_HR) andc r3, r3, r4 + li r4,0 /* LPES = 0 */ bl __init_LPCR bl __init_HFSCR bl __init_tlb_power9 @@ -146,7 +152,7 @@ __init_hvmode_206: __init_LPCR: /* Setup a sane LPCR: - * Called with initial LPCR in R3 + * Called with initial LPCR in R3 and desired LPES 2-bit value in R4 * * LPES = 0b01 (HSRR0/1 used for 0x500) * PECE = 0b111 @@ -157,8 +163,7 @@ __init_LPCR: * * Other bits untouched for now */ - li r5,1 - rldimi r3,r5, LPCR_LPES_SH, 64-LPCR_LPES_SH-2 + rldimi r3,r4, LPCR_LPES_SH, 64-LPCR_LPES_SH-2 ori r3,r3,(LPCR_PECE0|LPCR_PECE1|LPCR_PECE2) li r5,4 rldimi r3,r5, LPCR_DPFD_SH, 64-LPCR_DPFD_SH-3 diff --git a/arch/powerpc/kernel/dbell.c b/arch/powerpc/kernel/dbell.c index 2128f3a96c32..b6fe883b1016 100644 --- a/arch/powerpc/kernel/dbell.c +++ b/arch/powerpc/kernel/dbell.c @@ -20,18 +20,60 @@ #include <asm/kvm_ppc.h> #ifdef CONFIG_SMP -void doorbell_setup_this_cpu(void) + +/* + * Doorbells must only be used if CPU_FTR_DBELL is available. + * msgsnd is used in HV, and msgsndp is used in !HV. + * + * These should be used by platform code that is aware of restrictions. + * Other arch code should use ->cause_ipi. + * + * doorbell_global_ipi() sends a dbell to any target CPU. + * Must be used only by architectures that address msgsnd target + * by PIR/get_hard_smp_processor_id. + */ +void doorbell_global_ipi(int cpu) { - unsigned long tag = mfspr(SPRN_DOORBELL_CPUTAG) & PPC_DBELL_TAG_MASK; + u32 tag = get_hard_smp_processor_id(cpu); - smp_muxed_ipi_set_data(smp_processor_id(), tag); + kvmppc_set_host_ipi(cpu, 1); + /* Order previous accesses vs. msgsnd, which is treated as a store */ + ppc_msgsnd_sync(); + ppc_msgsnd(PPC_DBELL_MSGTYPE, 0, tag); } -void doorbell_cause_ipi(int cpu, unsigned long data) +/* + * doorbell_core_ipi() sends a dbell to a target CPU in the same core. + * Must be used only by architectures that address msgsnd target + * by TIR/cpu_thread_in_core. + */ +void doorbell_core_ipi(int cpu) { + u32 tag = cpu_thread_in_core(cpu); + + kvmppc_set_host_ipi(cpu, 1); /* Order previous accesses vs. msgsnd, which is treated as a store */ - mb(); - ppc_msgsnd(PPC_DBELL_MSGTYPE, 0, data); + ppc_msgsnd_sync(); + ppc_msgsnd(PPC_DBELL_MSGTYPE, 0, tag); +} + +/* + * Attempt to cause a core doorbell if destination is on the same core. + * Returns 1 on success, 0 on failure. + */ +int doorbell_try_core_ipi(int cpu) +{ + int this_cpu = get_cpu(); + int ret = 0; + + if (cpumask_test_cpu(cpu, cpu_sibling_mask(this_cpu))) { + doorbell_core_ipi(cpu); + ret = 1; + } + + put_cpu(); + + return ret; } void doorbell_exception(struct pt_regs *regs) @@ -40,12 +82,14 @@ void doorbell_exception(struct pt_regs *regs) irq_enter(); + ppc_msgsync(); + may_hard_irq_enable(); kvmppc_set_host_ipi(smp_processor_id(), 0); __this_cpu_inc(irq_stat.doorbell_irqs); - smp_ipi_demux(); + smp_ipi_demux_relaxed(); /* already performed the barrier */ irq_exit(); set_irq_regs(old_regs); diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c index 9de7f79e702b..63992b2d8e15 100644 --- a/arch/powerpc/kernel/eeh.c +++ b/arch/powerpc/kernel/eeh.c @@ -22,7 +22,6 @@ */ #include <linux/delay.h> -#include <linux/debugfs.h> #include <linux/sched.h> #include <linux/init.h> #include <linux/list.h> @@ -37,7 +36,7 @@ #include <linux/of.h> #include <linux/atomic.h> -#include <asm/debug.h> +#include <asm/debugfs.h> #include <asm/eeh.h> #include <asm/eeh_event.h> #include <asm/io.h> diff --git a/arch/powerpc/kernel/entry_32.S b/arch/powerpc/kernel/entry_32.S index a38600949f3a..8587059ad848 100644 --- a/arch/powerpc/kernel/entry_32.S +++ b/arch/powerpc/kernel/entry_32.S @@ -31,7 +31,6 @@ #include <asm/ppc_asm.h> #include <asm/asm-offsets.h> #include <asm/unistd.h> -#include <asm/ftrace.h> #include <asm/ptrace.h> #include <asm/export.h> @@ -1315,109 +1314,3 @@ machine_check_in_rtas: /* XXX load up BATs and panic */ #endif /* CONFIG_PPC_RTAS */ - -#ifdef CONFIG_FUNCTION_TRACER -#ifdef CONFIG_DYNAMIC_FTRACE -_GLOBAL(mcount) -_GLOBAL(_mcount) - /* - * It is required that _mcount on PPC32 must preserve the - * link register. But we have r0 to play with. We use r0 - * to push the return address back to the caller of mcount - * into the ctr register, restore the link register and - * then jump back using the ctr register. - */ - mflr r0 - mtctr r0 - lwz r0, 4(r1) - mtlr r0 - bctr - -_GLOBAL(ftrace_caller) - MCOUNT_SAVE_FRAME - /* r3 ends up with link register */ - subi r3, r3, MCOUNT_INSN_SIZE -.globl ftrace_call -ftrace_call: - bl ftrace_stub - nop -#ifdef CONFIG_FUNCTION_GRAPH_TRACER -.globl ftrace_graph_call -ftrace_graph_call: - b ftrace_graph_stub -_GLOBAL(ftrace_graph_stub) -#endif - MCOUNT_RESTORE_FRAME - /* old link register ends up in ctr reg */ - bctr -#else -_GLOBAL(mcount) -_GLOBAL(_mcount) - - MCOUNT_SAVE_FRAME - - subi r3, r3, MCOUNT_INSN_SIZE - LOAD_REG_ADDR(r5, ftrace_trace_function) - lwz r5,0(r5) - - mtctr r5 - bctrl - nop - -#ifdef CONFIG_FUNCTION_GRAPH_TRACER - b ftrace_graph_caller -#endif - MCOUNT_RESTORE_FRAME - bctr -#endif -EXPORT_SYMBOL(_mcount) - -_GLOBAL(ftrace_stub) - blr - -#ifdef CONFIG_FUNCTION_GRAPH_TRACER -_GLOBAL(ftrace_graph_caller) - /* load r4 with local address */ - lwz r4, 44(r1) - subi r4, r4, MCOUNT_INSN_SIZE - - /* Grab the LR out of the caller stack frame */ - lwz r3,52(r1) - - bl prepare_ftrace_return - nop - - /* - * prepare_ftrace_return gives us the address we divert to. - * Change the LR in the callers stack frame to this. - */ - stw r3,52(r1) - - MCOUNT_RESTORE_FRAME - /* old link register ends up in ctr reg */ - bctr - -_GLOBAL(return_to_handler) - /* need to save return values */ - stwu r1, -32(r1) - stw r3, 20(r1) - stw r4, 16(r1) - stw r31, 12(r1) - mr r31, r1 - - bl ftrace_return_to_handler - nop - - /* return value has real return address */ - mtlr r3 - - lwz r3, 20(r1) - lwz r4, 16(r1) - lwz r31,12(r1) - lwz r1, 0(r1) - - /* Jump back to real return address */ - blr -#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ - -#endif /* CONFIG_FUNCTION_TRACER */ diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S index 6432d4bf08c8..9b541d22595a 100644 --- a/arch/powerpc/kernel/entry_64.S +++ b/arch/powerpc/kernel/entry_64.S @@ -20,7 +20,6 @@ #include <linux/errno.h> #include <linux/err.h> -#include <linux/magic.h> #include <asm/unistd.h> #include <asm/processor.h> #include <asm/page.h> @@ -33,7 +32,6 @@ #include <asm/bug.h> #include <asm/ptrace.h> #include <asm/irqflags.h> -#include <asm/ftrace.h> #include <asm/hw_irq.h> #include <asm/context_tracking.h> #include <asm/tm.h> @@ -1173,381 +1171,3 @@ _GLOBAL(enter_prom) ld r0,16(r1) mtlr r0 blr - -#ifdef CONFIG_FUNCTION_TRACER -#ifdef CONFIG_DYNAMIC_FTRACE -_GLOBAL(mcount) -_GLOBAL(_mcount) -EXPORT_SYMBOL(_mcount) - mflr r12 - mtctr r12 - mtlr r0 - bctr - -#ifndef CC_USING_MPROFILE_KERNEL -_GLOBAL_TOC(ftrace_caller) - /* Taken from output of objdump from lib64/glibc */ - mflr r3 - ld r11, 0(r1) - stdu r1, -112(r1) - std r3, 128(r1) - ld r4, 16(r11) - subi r3, r3, MCOUNT_INSN_SIZE -.globl ftrace_call -ftrace_call: - bl ftrace_stub - nop -#ifdef CONFIG_FUNCTION_GRAPH_TRACER -.globl ftrace_graph_call -ftrace_graph_call: - b ftrace_graph_stub -_GLOBAL(ftrace_graph_stub) -#endif - ld r0, 128(r1) - mtlr r0 - addi r1, r1, 112 - -#else /* CC_USING_MPROFILE_KERNEL */ -/* - * - * ftrace_caller() is the function that replaces _mcount() when ftrace is - * active. - * - * We arrive here after a function A calls function B, and we are the trace - * function for B. When we enter r1 points to A's stack frame, B has not yet - * had a chance to allocate one yet. - * - * Additionally r2 may point either to the TOC for A, or B, depending on - * whether B did a TOC setup sequence before calling us. - * - * On entry the LR points back to the _mcount() call site, and r0 holds the - * saved LR as it was on entry to B, ie. the original return address at the - * call site in A. - * - * Our job is to save the register state into a struct pt_regs (on the stack) - * and then arrange for the ftrace function to be called. - */ -_GLOBAL(ftrace_caller) - /* Save the original return address in A's stack frame */ - std r0,LRSAVE(r1) - - /* Create our stack frame + pt_regs */ - stdu r1,-SWITCH_FRAME_SIZE(r1) - - /* Save all gprs to pt_regs */ - SAVE_8GPRS(0,r1) - SAVE_8GPRS(8,r1) - SAVE_8GPRS(16,r1) - SAVE_8GPRS(24,r1) - - /* Load special regs for save below */ - mfmsr r8 - mfctr r9 - mfxer r10 - mfcr r11 - - /* Get the _mcount() call site out of LR */ - mflr r7 - /* Save it as pt_regs->nip & pt_regs->link */ - std r7, _NIP(r1) - std r7, _LINK(r1) - - /* Save callee's TOC in the ABI compliant location */ - std r2, 24(r1) - ld r2,PACATOC(r13) /* get kernel TOC in r2 */ - - addis r3,r2,function_trace_op@toc@ha - addi r3,r3,function_trace_op@toc@l - ld r5,0(r3) - -#ifdef CONFIG_LIVEPATCH - mr r14,r7 /* remember old NIP */ -#endif - /* Calculate ip from nip-4 into r3 for call below */ - subi r3, r7, MCOUNT_INSN_SIZE - - /* Put the original return address in r4 as parent_ip */ - mr r4, r0 - - /* Save special regs */ - std r8, _MSR(r1) - std r9, _CTR(r1) - std r10, _XER(r1) - std r11, _CCR(r1) - - /* Load &pt_regs in r6 for call below */ - addi r6, r1 ,STACK_FRAME_OVERHEAD - - /* ftrace_call(r3, r4, r5, r6) */ -.globl ftrace_call -ftrace_call: - bl ftrace_stub - nop - - /* Load ctr with the possibly modified NIP */ - ld r3, _NIP(r1) - mtctr r3 -#ifdef CONFIG_LIVEPATCH - cmpd r14,r3 /* has NIP been altered? */ -#endif - - /* Restore gprs */ - REST_8GPRS(0,r1) - REST_8GPRS(8,r1) - REST_8GPRS(16,r1) - REST_8GPRS(24,r1) - - /* Restore callee's TOC */ - ld r2, 24(r1) - - /* Pop our stack frame */ - addi r1, r1, SWITCH_FRAME_SIZE - - /* Restore original LR for return to B */ - ld r0, LRSAVE(r1) - mtlr r0 - -#ifdef CONFIG_LIVEPATCH - /* Based on the cmpd above, if the NIP was altered handle livepatch */ - bne- livepatch_handler -#endif - -#ifdef CONFIG_FUNCTION_GRAPH_TRACER - stdu r1, -112(r1) -.globl ftrace_graph_call -ftrace_graph_call: - b ftrace_graph_stub -_GLOBAL(ftrace_graph_stub) - addi r1, r1, 112 -#endif - - ld r0,LRSAVE(r1) /* restore callee's lr at _mcount site */ - mtlr r0 - bctr /* jump after _mcount site */ -#endif /* CC_USING_MPROFILE_KERNEL */ - -_GLOBAL(ftrace_stub) - blr - -#ifdef CONFIG_LIVEPATCH - /* - * This function runs in the mcount context, between two functions. As - * such it can only clobber registers which are volatile and used in - * function linkage. - * - * We get here when a function A, calls another function B, but B has - * been live patched with a new function C. - * - * On entry: - * - we have no stack frame and can not allocate one - * - LR points back to the original caller (in A) - * - CTR holds the new NIP in C - * - r0 & r12 are free - * - * r0 can't be used as the base register for a DS-form load or store, so - * we temporarily shuffle r1 (stack pointer) into r0 and then put it back. - */ -livepatch_handler: - CURRENT_THREAD_INFO(r12, r1) - - /* Save stack pointer into r0 */ - mr r0, r1 - - /* Allocate 3 x 8 bytes */ - ld r1, TI_livepatch_sp(r12) - addi r1, r1, 24 - std r1, TI_livepatch_sp(r12) - - /* Save toc & real LR on livepatch stack */ - std r2, -24(r1) - mflr r12 - std r12, -16(r1) - - /* Store stack end marker */ - lis r12, STACK_END_MAGIC@h - ori r12, r12, STACK_END_MAGIC@l - std r12, -8(r1) - - /* Restore real stack pointer */ - mr r1, r0 - - /* Put ctr in r12 for global entry and branch there */ - mfctr r12 - bctrl - - /* - * Now we are returning from the patched function to the original - * caller A. We are free to use r0 and r12, and we can use r2 until we - * restore it. - */ - - CURRENT_THREAD_INFO(r12, r1) - - /* Save stack pointer into r0 */ - mr r0, r1 - - ld r1, TI_livepatch_sp(r12) - - /* Check stack marker hasn't been trashed */ - lis r2, STACK_END_MAGIC@h - ori r2, r2, STACK_END_MAGIC@l - ld r12, -8(r1) -1: tdne r12, r2 - EMIT_BUG_ENTRY 1b, __FILE__, __LINE__ - 1, 0 - - /* Restore LR & toc from livepatch stack */ - ld r12, -16(r1) - mtlr r12 - ld r2, -24(r1) - - /* Pop livepatch stack frame */ - CURRENT_THREAD_INFO(r12, r0) - subi r1, r1, 24 - std r1, TI_livepatch_sp(r12) - - /* Restore real stack pointer */ - mr r1, r0 - - /* Return to original caller of live patched function */ - blr -#endif - - -#else -_GLOBAL_TOC(_mcount) -EXPORT_SYMBOL(_mcount) - /* Taken from output of objdump from lib64/glibc */ - mflr r3 - ld r11, 0(r1) - stdu r1, -112(r1) - std r3, 128(r1) - ld r4, 16(r11) - - subi r3, r3, MCOUNT_INSN_SIZE - LOAD_REG_ADDR(r5,ftrace_trace_function) - ld r5,0(r5) - ld r5,0(r5) - mtctr r5 - bctrl - nop - - -#ifdef CONFIG_FUNCTION_GRAPH_TRACER - b ftrace_graph_caller -#endif - ld r0, 128(r1) - mtlr r0 - addi r1, r1, 112 -_GLOBAL(ftrace_stub) - blr - -#endif /* CONFIG_DYNAMIC_FTRACE */ - -#ifdef CONFIG_FUNCTION_GRAPH_TRACER -#ifndef CC_USING_MPROFILE_KERNEL -_GLOBAL(ftrace_graph_caller) - /* load r4 with local address */ - ld r4, 128(r1) - subi r4, r4, MCOUNT_INSN_SIZE - - /* Grab the LR out of the caller stack frame */ - ld r11, 112(r1) - ld r3, 16(r11) - - bl prepare_ftrace_return - nop - - /* - * prepare_ftrace_return gives us the address we divert to. - * Change the LR in the callers stack frame to this. - */ - ld r11, 112(r1) - std r3, 16(r11) - - ld r0, 128(r1) - mtlr r0 - addi r1, r1, 112 - blr - -#else /* CC_USING_MPROFILE_KERNEL */ -_GLOBAL(ftrace_graph_caller) - /* with -mprofile-kernel, parameter regs are still alive at _mcount */ - std r10, 104(r1) - std r9, 96(r1) - std r8, 88(r1) - std r7, 80(r1) - std r6, 72(r1) - std r5, 64(r1) - std r4, 56(r1) - std r3, 48(r1) - - /* Save callee's TOC in the ABI compliant location */ - std r2, 24(r1) - ld r2, PACATOC(r13) /* get kernel TOC in r2 */ - - mfctr r4 /* ftrace_caller has moved local addr here */ - std r4, 40(r1) - mflr r3 /* ftrace_caller has restored LR from stack */ - subi r4, r4, MCOUNT_INSN_SIZE - - bl prepare_ftrace_return - nop - - /* - * prepare_ftrace_return gives us the address we divert to. - * Change the LR to this. - */ - mtlr r3 - - ld r0, 40(r1) - mtctr r0 - ld r10, 104(r1) - ld r9, 96(r1) - ld r8, 88(r1) - ld r7, 80(r1) - ld r6, 72(r1) - ld r5, 64(r1) - ld r4, 56(r1) - ld r3, 48(r1) - - /* Restore callee's TOC */ - ld r2, 24(r1) - - addi r1, r1, 112 - mflr r0 - std r0, LRSAVE(r1) - bctr -#endif /* CC_USING_MPROFILE_KERNEL */ - -_GLOBAL(return_to_handler) - /* need to save return values */ - std r4, -32(r1) - std r3, -24(r1) - /* save TOC */ - std r2, -16(r1) - std r31, -8(r1) - mr r31, r1 - stdu r1, -112(r1) - - /* - * We might be called from a module. - * Switch to our TOC to run inside the core kernel. - */ - ld r2, PACATOC(r13) - - bl ftrace_return_to_handler - nop - - /* return value has real return address */ - mtlr r3 - - ld r1, 0(r1) - ld r4, -32(r1) - ld r3, -24(r1) - ld r2, -16(r1) - ld r31, -8(r1) - - /* Jump back to real return address */ - blr -#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ -#endif /* CONFIG_FUNCTION_TRACER */ diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index 857bf7c5b946..28f8d7bed6b1 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -116,9 +116,7 @@ EXC_VIRT_NONE(0x4000, 0x100) EXC_REAL_BEGIN(system_reset, 0x100, 0x100) SET_SCRATCH0(r13) - GET_PACA(r13) - clrrdi r13,r13,1 /* Last bit of HSPRG0 is set if waking from winkle */ - EXCEPTION_PROLOG_PSERIES_PACA(PACA_EXGEN, system_reset_common, EXC_STD, + EXCEPTION_PROLOG_PSERIES(PACA_EXGEN, system_reset_common, EXC_STD, IDLETEST, 0x100) EXC_REAL_END(system_reset, 0x100, 0x100) @@ -126,31 +124,7 @@ EXC_VIRT_NONE(0x4100, 0x100) #ifdef CONFIG_PPC_P7_NAP EXC_COMMON_BEGIN(system_reset_idle_common) -BEGIN_FTR_SECTION - GET_PACA(r13) /* Restore HSPRG0 to get the winkle bit in r13 */ -END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300) - bl pnv_restore_hyp_resource - - li r0,PNV_THREAD_RUNNING - stb r0,PACA_THREAD_IDLE_STATE(r13) /* Clear thread state */ - -#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE - li r0,KVM_HWTHREAD_IN_KERNEL - stb r0,HSTATE_HWTHREAD_STATE(r13) - /* Order setting hwthread_state vs. testing hwthread_req */ - sync - lbz r0,HSTATE_HWTHREAD_REQ(r13) - cmpwi r0,0 - beq 1f - BRANCH_TO_KVM(r10, kvm_start_guest) -1: -#endif - - /* Return SRR1 from power7_nap() */ - mfspr r3,SPRN_SRR1 - blt cr3,2f - b pnv_wakeup_loss -2: b pnv_wakeup_noloss + b pnv_powersave_wakeup #endif EXC_COMMON(system_reset_common, 0x100, system_reset_exception) @@ -172,14 +146,6 @@ EXC_REAL_BEGIN(machine_check, 0x200, 0x100) * vector */ SET_SCRATCH0(r13) /* save r13 */ - /* - * Running native on arch 2.06 or later, we may wakeup from winkle - * inside machine check. If yes, then last bit of HSPRG0 would be set - * to 1. Hence clear it unconditionally. - */ - GET_PACA(r13) - clrrdi r13,r13,1 - SET_PACA(r13) EXCEPTION_PROLOG_0(PACA_EXMC) BEGIN_FTR_SECTION b machine_check_powernv_early @@ -212,6 +178,12 @@ BEGIN_FTR_SECTION * NOTE: We are here with MSR_ME=0 (off), which means we risk a * checkstop if we get another machine check exception before we do * rfid with MSR_ME=1. + * + * This interrupt can wake directly from idle. If that is the case, + * the machine check is handled then the idle wakeup code is called + * to restore state. In that case, the POWER9 DD1 idle PACA workaround + * is not applied in the early machine check code, which will cause + * bugs. */ mr r11,r1 /* Save r1 */ lhz r10,PACA_IN_MCE(r13) @@ -340,6 +312,37 @@ EXC_COMMON_BEGIN(machine_check_common) /* restore original r1. */ \ ld r1,GPR1(r1) +#ifdef CONFIG_PPC_P7_NAP +/* + * This is an idle wakeup. Low level machine check has already been + * done. Queue the event then call the idle code to do the wake up. + */ +EXC_COMMON_BEGIN(machine_check_idle_common) + bl machine_check_queue_event + + /* + * We have not used any non-volatile GPRs here, and as a rule + * most exception code including machine check does not. + * Therefore PACA_NAPSTATELOST does not need to be set. Idle + * wakeup will restore volatile registers. + * + * Load the original SRR1 into r3 for pnv_powersave_wakeup_mce. + * + * Then decrement MCE nesting after finishing with the stack. + */ + ld r3,_MSR(r1) + + lhz r11,PACA_IN_MCE(r13) + subi r11,r11,1 + sth r11,PACA_IN_MCE(r13) + + /* Turn off the RI bit because SRR1 is used by idle wakeup code. */ + /* Recoverability could be improved by reducing the use of SRR1. */ + li r11,0 + mtmsrd r11,1 + + b pnv_powersave_wakeup_mce +#endif /* * Handle machine check early in real mode. We come here with * ME=1, MMU (IR=0 and DR=0) off and using MC emergency stack. @@ -352,6 +355,7 @@ EXC_COMMON_BEGIN(machine_check_handle_early) bl machine_check_early std r3,RESULT(r1) /* Save result */ ld r12,_MSR(r1) + #ifdef CONFIG_PPC_P7_NAP /* * Check if thread was in power saving mode. We come here when any @@ -362,48 +366,14 @@ EXC_COMMON_BEGIN(machine_check_handle_early) * * Go back to nap/sleep/winkle mode again if (b) is true. */ - rlwinm. r11,r12,47-31,30,31 /* Was it in power saving mode? */ - beq 4f /* No, it wasn;t */ - /* Thread was in power saving mode. Go back to nap again. */ - cmpwi r11,2 - blt 3f - /* Supervisor/Hypervisor state loss */ - li r0,1 - stb r0,PACA_NAPSTATELOST(r13) -3: bl machine_check_queue_event - MACHINE_CHECK_HANDLER_WINDUP - GET_PACA(r13) - ld r1,PACAR1(r13) - /* - * Check what idle state this CPU was in and go back to same mode - * again. - */ - lbz r3,PACA_THREAD_IDLE_STATE(r13) - cmpwi r3,PNV_THREAD_NAP - bgt 10f - IDLE_STATE_ENTER_SEQ_NORET(PPC_NAP) - /* No return */ -10: - cmpwi r3,PNV_THREAD_SLEEP - bgt 2f - IDLE_STATE_ENTER_SEQ_NORET(PPC_SLEEP) - /* No return */ - -2: - /* - * Go back to winkle. Please note that this thread was woken up in - * machine check from winkle and have not restored the per-subcore - * state. Hence before going back to winkle, set last bit of HSPRG0 - * to 1. This will make sure that if this thread gets woken up - * again at reset vector 0x100 then it will get chance to restore - * the subcore state. - */ - ori r13,r13,1 - SET_PACA(r13) - IDLE_STATE_ENTER_SEQ_NORET(PPC_WINKLE) - /* No return */ + BEGIN_FTR_SECTION + rlwinm. r11,r12,47-31,30,31 + beq- 4f + BRANCH_TO_COMMON(r10, machine_check_idle_common) 4: + END_FTR_SECTION_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206) #endif + /* * Check if we are coming from hypervisor userspace. If yes then we * continue in host kernel in V mode to deliver the MC event. diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c index 8ff0dd4e77a7..243dbef7e926 100644 --- a/arch/powerpc/kernel/fadump.c +++ b/arch/powerpc/kernel/fadump.c @@ -30,17 +30,16 @@ #include <linux/string.h> #include <linux/memblock.h> #include <linux/delay.h> -#include <linux/debugfs.h> #include <linux/seq_file.h> #include <linux/crash_dump.h> #include <linux/kobject.h> #include <linux/sysfs.h> +#include <asm/debugfs.h> #include <asm/page.h> #include <asm/prom.h> #include <asm/rtas.h> #include <asm/fadump.h> -#include <asm/debug.h> #include <asm/setup.h> static struct fw_dump fw_dump; @@ -319,15 +318,34 @@ int __init fadump_reserve_mem(void) pr_debug("fadumphdr_addr = %p\n", (void *) fw_dump.fadumphdr_addr); } else { - /* Reserve the memory at the top of memory. */ size = get_fadump_area_size(); - base = memory_boundary - size; - memblock_reserve(base, size); - printk(KERN_INFO "Reserved %ldMB of memory at %ldMB " - "for firmware-assisted dump\n", - (unsigned long)(size >> 20), - (unsigned long)(base >> 20)); + + /* + * Reserve memory at an offset closer to bottom of the RAM to + * minimize the impact of memory hot-remove operation. We can't + * use memblock_find_in_range() here since it doesn't allocate + * from bottom to top. + */ + for (base = fw_dump.boot_memory_size; + base <= (memory_boundary - size); + base += size) { + if (memblock_is_region_memory(base, size) && + !memblock_is_region_reserved(base, size)) + break; + } + if ((base > (memory_boundary - size)) || + memblock_reserve(base, size)) { + pr_err("Failed to reserve memory\n"); + return 0; + } + + pr_info("Reserved %ldMB of memory at %ldMB for firmware-" + "assisted dump (System RAM: %ldMB)\n", + (unsigned long)(size >> 20), + (unsigned long)(base >> 20), + (unsigned long)(memblock_phys_mem_size() >> 20)); } + fw_dump.reserve_dump_area_start = base; fw_dump.reserve_dump_area_size = size; return 1; diff --git a/arch/powerpc/kernel/head_32.S b/arch/powerpc/kernel/head_32.S index 1607be7c0ef2..e22734278458 100644 --- a/arch/powerpc/kernel/head_32.S +++ b/arch/powerpc/kernel/head_32.S @@ -735,11 +735,7 @@ END_MMU_FTR_SECTION_IFSET(MMU_FTR_NEED_DTLB_SW_LRU) EXCEPTION(0x2c00, Trap_2c, unknown_exception, EXC_XFER_EE) EXCEPTION(0x2d00, Trap_2d, unknown_exception, EXC_XFER_EE) EXCEPTION(0x2e00, Trap_2e, unknown_exception, EXC_XFER_EE) - EXCEPTION(0x2f00, MOLTrampoline, unknown_exception, EXC_XFER_EE_LITE) - - .globl mol_trampoline - .set mol_trampoline, i0x2f00 - EXPORT_SYMBOL(mol_trampoline) + EXCEPTION(0x2f00, Trap_2f, unknown_exception, EXC_XFER_EE) . = 0x3000 @@ -1278,16 +1274,6 @@ EXPORT_SYMBOL(empty_zero_page) swapper_pg_dir: .space PGD_TABLE_SIZE - .globl intercept_table -intercept_table: - .long 0, 0, i0x200, i0x300, i0x400, 0, i0x600, i0x700 - .long i0x800, 0, 0, 0, 0, i0xd00, 0, 0 - .long 0, 0, 0, i0x1300, 0, 0, 0, 0 - .long 0, 0, 0, 0, 0, 0, 0, 0 - .long 0, 0, 0, 0, 0, 0, 0, 0 - .long 0, 0, 0, 0, 0, 0, 0, 0 -EXPORT_SYMBOL(intercept_table) - /* Room for two PTE pointers, usually the kernel and current user pointers * to their respective root page table. */ diff --git a/arch/powerpc/kernel/head_64.S b/arch/powerpc/kernel/head_64.S index 1dc5eae2ced3..0ddc602b33a4 100644 --- a/arch/powerpc/kernel/head_64.S +++ b/arch/powerpc/kernel/head_64.S @@ -949,7 +949,8 @@ start_here_multiplatform: LOAD_REG_ADDR(r3,init_thread_union) /* set up a stack pointer */ - addi r1,r3,THREAD_SIZE + LOAD_REG_IMMEDIATE(r1,THREAD_SIZE) + add r1,r3,r1 li r0,0 stdu r0,-STACK_FRAME_OVERHEAD(r1) diff --git a/arch/powerpc/kernel/idle_book3s.S b/arch/powerpc/kernel/idle_book3s.S index 995728736677..0cdee271dd93 100644 --- a/arch/powerpc/kernel/idle_book3s.S +++ b/arch/powerpc/kernel/idle_book3s.S @@ -20,6 +20,7 @@ #include <asm/kvm_book3s_asm.h> #include <asm/opal.h> #include <asm/cpuidle.h> +#include <asm/exception-64s.h> #include <asm/book3s/64/mmu-hash.h> #include <asm/mmu.h> @@ -94,12 +95,12 @@ ALT_FTR_SECTION_END_IFSET(CPU_FTR_ARCH_300) core_idle_lock_held: HMT_LOW 3: lwz r15,0(r14) - andi. r15,r15,PNV_CORE_IDLE_LOCK_BIT + andis. r15,r15,PNV_CORE_IDLE_LOCK_BIT@h bne 3b HMT_MEDIUM lwarx r15,0,r14 - andi. r9,r15,PNV_CORE_IDLE_LOCK_BIT - bne core_idle_lock_held + andis. r9,r15,PNV_CORE_IDLE_LOCK_BIT@h + bne- core_idle_lock_held blr /* @@ -113,7 +114,7 @@ core_idle_lock_held: * * Address to 'rfid' to in r5 */ -_GLOBAL(pnv_powersave_common) +pnv_powersave_common: /* Use r3 to pass state nap/sleep/winkle */ /* NAP is a state loss, we create a regs frame on the * stack, fill it up with the state we care about and @@ -188,8 +189,8 @@ pnv_enter_arch207_idle_mode: /* The following store to HSTATE_HWTHREAD_STATE(r13) */ /* MUST occur in real mode, i.e. with the MMU off, */ /* and the MMU must stay off until we clear this flag */ - /* and test HSTATE_HWTHREAD_REQ(r13) in the system */ - /* reset interrupt vector in exceptions-64s.S. */ + /* and test HSTATE_HWTHREAD_REQ(r13) in */ + /* pnv_powersave_wakeup in this file. */ /* The reason is that another thread can switch the */ /* MMU to a guest context whenever this flag is set */ /* to KVM_HWTHREAD_IN_IDLE, and if the MMU was on, */ @@ -209,15 +210,20 @@ pnv_enter_arch207_idle_mode: /* Sleep or winkle */ lbz r7,PACA_THREAD_MASK(r13) ld r14,PACA_CORE_IDLE_STATE_PTR(r13) + li r5,0 + beq cr3,3f + lis r5,PNV_CORE_IDLE_WINKLE_COUNT@h +3: lwarx_loop1: lwarx r15,0,r14 - andi. r9,r15,PNV_CORE_IDLE_LOCK_BIT - bnel core_idle_lock_held + andis. r9,r15,PNV_CORE_IDLE_LOCK_BIT@h + bnel- core_idle_lock_held + add r15,r15,r5 /* Add if winkle */ andc r15,r15,r7 /* Clear thread bit */ - andi. r15,r15,PNV_CORE_IDLE_THREAD_BITS + andi. r9,r15,PNV_CORE_IDLE_THREAD_BITS /* * If cr0 = 0, then current thread is the last thread of the core entering @@ -240,7 +246,7 @@ common_enter: /* common code for all the threads entering sleep or winkle */ IDLE_STATE_ENTER_SEQ_NORET(PPC_SLEEP) fastsleep_workaround_at_entry: - ori r15,r15,PNV_CORE_IDLE_LOCK_BIT + oris r15,r15,PNV_CORE_IDLE_LOCK_BIT@h stwcx. r15,0,r14 bne- lwarx_loop1 isync @@ -250,10 +256,10 @@ fastsleep_workaround_at_entry: li r4,1 bl opal_config_cpu_idle_state - /* Clear Lock bit */ - li r0,0 + /* Unlock */ + xoris r15,r15,PNV_CORE_IDLE_LOCK_BIT@h lwsync - stw r0,0(r14) + stw r15,0(r14) b common_enter enter_winkle: @@ -301,8 +307,8 @@ power_enter_stop: lwarx_loop_stop: lwarx r15,0,r14 - andi. r9,r15,PNV_CORE_IDLE_LOCK_BIT - bnel core_idle_lock_held + andis. r9,r15,PNV_CORE_IDLE_LOCK_BIT@h + bnel- core_idle_lock_held andc r15,r15,r7 /* Clear thread bit */ stwcx. r15,0,r14 @@ -375,17 +381,113 @@ _GLOBAL(power9_idle_stop) li r4,1 b pnv_powersave_common /* No return */ + /* - * Called from reset vector. Check whether we have woken up with - * hypervisor state loss. If yes, restore hypervisor state and return - * back to reset vector. + * On waking up from stop 0,1,2 with ESL=1 on POWER9 DD1, + * HSPRG0 will be set to the HSPRG0 value of one of the + * threads in this core. Thus the value we have in r13 + * may not be this thread's paca pointer. + * + * Fortunately, the TIR remains invariant. Since this thread's + * paca pointer is recorded in all its sibling's paca, we can + * correctly recover this thread's paca pointer if we + * know the index of this thread in the core. + * + * This index can be obtained from the TIR. * - * r13 - Contents of HSPRG0 + * i.e, thread's position in the core = TIR. + * If this value is i, then this thread's paca is + * paca->thread_sibling_pacas[i]. + */ +power9_dd1_recover_paca: + mfspr r4, SPRN_TIR + /* + * Since each entry in thread_sibling_pacas is 8 bytes + * we need to left-shift by 3 bits. Thus r4 = i * 8 + */ + sldi r4, r4, 3 + /* Get &paca->thread_sibling_pacas[0] in r5 */ + ld r5, PACA_SIBLING_PACA_PTRS(r13) + /* Load paca->thread_sibling_pacas[i] into r13 */ + ldx r13, r4, r5 + SET_PACA(r13) + /* + * Indicate that we have lost NVGPR state + * which needs to be restored from the stack. + */ + li r3, 1 + stb r0,PACA_NAPSTATELOST(r13) + blr + +/* + * Called from machine check handler for powersave wakeups. + * Low level machine check processing has already been done. Now just + * go through the wake up path to get everything in order. + * + * r3 - The original SRR1 value. + * Original SRR[01] have been clobbered. + * MSR_RI is clear. + */ +.global pnv_powersave_wakeup_mce +pnv_powersave_wakeup_mce: + /* Set cr3 for pnv_powersave_wakeup */ + rlwinm r11,r3,47-31,30,31 + cmpwi cr3,r11,2 + + /* + * Now put the original SRR1 with SRR1_WAKEMCE_RESVD as the wake + * reason into SRR1, which allows reuse of the system reset wakeup + * code without being mistaken for another type of wakeup. + */ + oris r3,r3,SRR1_WAKEMCE_RESVD@h + mtspr SPRN_SRR1,r3 + + b pnv_powersave_wakeup + +/* + * Called from reset vector for powersave wakeups. * cr3 - set to gt if waking up with partial/complete hypervisor state loss */ -_GLOBAL(pnv_restore_hyp_resource) +.global pnv_powersave_wakeup +pnv_powersave_wakeup: + ld r2, PACATOC(r13) + BEGIN_FTR_SECTION - ld r2,PACATOC(r13); +BEGIN_FTR_SECTION_NESTED(70) + bl power9_dd1_recover_paca +END_FTR_SECTION_NESTED_IFSET(CPU_FTR_POWER9_DD1, 70) + bl pnv_restore_hyp_resource_arch300 +FTR_SECTION_ELSE + bl pnv_restore_hyp_resource_arch207 +ALT_FTR_SECTION_END_IFSET(CPU_FTR_ARCH_300) + + li r0,PNV_THREAD_RUNNING + stb r0,PACA_THREAD_IDLE_STATE(r13) /* Clear thread state */ + +#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE + li r0,KVM_HWTHREAD_IN_KERNEL + stb r0,HSTATE_HWTHREAD_STATE(r13) + /* Order setting hwthread_state vs. testing hwthread_req */ + sync + lbz r0,HSTATE_HWTHREAD_REQ(r13) + cmpwi r0,0 + beq 1f + b kvm_start_guest +1: +#endif + + /* Return SRR1 from power7_nap() */ + mfspr r3,SPRN_SRR1 + blt cr3,pnv_wakeup_noloss + b pnv_wakeup_loss + +/* + * Check whether we have woken up with hypervisor state loss. + * If yes, restore hypervisor state and return back to link. + * + * cr3 - set to gt if waking up with partial/complete hypervisor state loss + */ +pnv_restore_hyp_resource_arch300: /* * POWER ISA 3. Use PSSCR to determine if we * are waking up from deep idle state @@ -400,31 +502,19 @@ BEGIN_FTR_SECTION */ rldicl r5,r5,4,60 cmpd cr4,r5,r4 - bge cr4,pnv_wakeup_tb_loss - /* - * Waking up without hypervisor state loss. Return to - * reset vector - */ - blr + bge cr4,pnv_wakeup_tb_loss /* returns to caller */ -END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300) + blr /* Waking up without hypervisor state loss. */ +/* Same calling convention as arch300 */ +pnv_restore_hyp_resource_arch207: /* * POWER ISA 2.07 or less. - * Check if last bit of HSPGR0 is set. This indicates whether we are - * waking up from winkle. + * Check if we slept with sleep or winkle. */ - clrldi r5,r13,63 - clrrdi r13,r13,1 - - /* Now that we are sure r13 is corrected, load TOC */ - ld r2,PACATOC(r13); - cmpwi cr4,r5,1 - mtspr SPRN_HSPRG0,r13 - - lbz r0,PACA_THREAD_IDLE_STATE(r13) - cmpwi cr2,r0,PNV_THREAD_NAP - bgt cr2,pnv_wakeup_tb_loss /* Either sleep or Winkle */ + lbz r4,PACA_THREAD_IDLE_STATE(r13) + cmpwi cr2,r4,PNV_THREAD_NAP + bgt cr2,pnv_wakeup_tb_loss /* Either sleep or Winkle */ /* * We fall through here if PACA_THREAD_IDLE_STATE shows we are waking @@ -433,8 +523,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300) */ bgt cr3,. - blr /* Return back to System Reset vector from where - pnv_restore_hyp_resource was invoked */ + blr /* Waking up without hypervisor state loss */ /* * Called if waking up from idle state which can cause either partial or @@ -444,9 +533,14 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300) * * r13 - PACA * cr3 - gt if waking up with partial/complete hypervisor state loss + * + * If ISA300: * cr4 - gt or eq if waking up from complete hypervisor state loss. + * + * If ISA207: + * r4 - PACA_THREAD_IDLE_STATE */ -_GLOBAL(pnv_wakeup_tb_loss) +pnv_wakeup_tb_loss: ld r1,PACAR1(r13) /* * Before entering any idle state, the NVGPRs are saved in the stack @@ -459,18 +553,19 @@ _GLOBAL(pnv_wakeup_tb_loss) * is required to return back to reset vector after hypervisor state * restore is complete. */ + mr r18,r4 mflr r17 mfspr r16,SPRN_SRR1 BEGIN_FTR_SECTION CHECK_HMI_INTERRUPT END_FTR_SECTION_IFSET(CPU_FTR_HVMODE) - lbz r7,PACA_THREAD_MASK(r13) ld r14,PACA_CORE_IDLE_STATE_PTR(r13) -lwarx_loop2: - lwarx r15,0,r14 - andi. r9,r15,PNV_CORE_IDLE_LOCK_BIT + lbz r7,PACA_THREAD_MASK(r13) + /* + * Take the core lock to synchronize against other threads. + * * Lock bit is set in one of the 2 cases- * a. In the sleep/winkle enter path, the last thread is executing * fastsleep workaround code. @@ -478,23 +573,93 @@ lwarx_loop2: * workaround undo code or resyncing timebase or restoring context * In either case loop until the lock bit is cleared. */ - bnel core_idle_lock_held +1: + lwarx r15,0,r14 + andis. r9,r15,PNV_CORE_IDLE_LOCK_BIT@h + bnel- core_idle_lock_held + oris r15,r15,PNV_CORE_IDLE_LOCK_BIT@h + stwcx. r15,0,r14 + bne- 1b + isync - cmpwi cr2,r15,0 + andi. r9,r15,PNV_CORE_IDLE_THREAD_BITS + cmpwi cr2,r9,0 /* * At this stage * cr2 - eq if first thread to wakeup in core * cr3- gt if waking up with partial/complete hypervisor state loss + * ISA300: * cr4 - gt or eq if waking up from complete hypervisor state loss. */ - ori r15,r15,PNV_CORE_IDLE_LOCK_BIT - stwcx. r15,0,r14 - bne- lwarx_loop2 - isync - BEGIN_FTR_SECTION + /* + * Were we in winkle? + * If yes, check if all threads were in winkle, decrement our + * winkle count, set all thread winkle bits if all were in winkle. + * Check if our thread has a winkle bit set, and set cr4 accordingly + * (to match ISA300, above). Pseudo-code for core idle state + * transitions for ISA207 is as follows (everything happens atomically + * due to store conditional and/or lock bit): + * + * nap_idle() { } + * nap_wake() { } + * + * sleep_idle() + * { + * core_idle_state &= ~thread_in_core + * } + * + * sleep_wake() + * { + * bool first_in_core, first_in_subcore; + * + * first_in_core = (core_idle_state & IDLE_THREAD_BITS) == 0; + * first_in_subcore = (core_idle_state & SUBCORE_SIBLING_MASK) == 0; + * + * core_idle_state |= thread_in_core; + * } + * + * winkle_idle() + * { + * core_idle_state &= ~thread_in_core; + * core_idle_state += 1 << WINKLE_COUNT_SHIFT; + * } + * + * winkle_wake() + * { + * bool first_in_core, first_in_subcore, winkle_state_lost; + * + * first_in_core = (core_idle_state & IDLE_THREAD_BITS) == 0; + * first_in_subcore = (core_idle_state & SUBCORE_SIBLING_MASK) == 0; + * + * core_idle_state |= thread_in_core; + * + * if ((core_idle_state & WINKLE_MASK) == (8 << WINKLE_COUNT_SIHFT)) + * core_idle_state |= THREAD_WINKLE_BITS; + * core_idle_state -= 1 << WINKLE_COUNT_SHIFT; + * + * winkle_state_lost = core_idle_state & + * (thread_in_core << WINKLE_THREAD_SHIFT); + * core_idle_state &= ~(thread_in_core << WINKLE_THREAD_SHIFT); + * } + * + */ + cmpwi r18,PNV_THREAD_WINKLE + bne 2f + andis. r9,r15,PNV_CORE_IDLE_WINKLE_COUNT_ALL_BIT@h + subis r15,r15,PNV_CORE_IDLE_WINKLE_COUNT@h + beq 2f + ori r15,r15,PNV_CORE_IDLE_THREAD_WINKLE_BITS /* all were winkle */ +2: + /* Shift thread bit to winkle mask, then test if this thread is set, + * and remove it from the winkle bits */ + slwi r8,r7,8 + and r8,r8,r15 + andc r15,r15,r8 + cmpwi cr4,r8,1 /* cr4 will be gt if our bit is set, lt if not */ + lbz r4,PACA_SUBCORE_SIBLING_MASK(r13) and r4,r4,r15 cmpwi r4,0 /* Check if first in subcore */ @@ -579,7 +744,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300) mtspr SPRN_WORC,r4 clear_lock: - andi. r15,r15,PNV_CORE_IDLE_THREAD_BITS + xoris r15,r15,PNV_CORE_IDLE_LOCK_BIT@h lwsync stw r15,0(r14) @@ -637,8 +802,7 @@ hypervisor_state_restored: mtspr SPRN_SRR1,r16 mtlr r17 - blr /* Return back to System Reset vector from where - pnv_restore_hyp_resource was invoked */ + blr /* return to pnv_powersave_wakeup */ fastsleep_workaround_at_exit: li r3,1 @@ -650,7 +814,8 @@ fastsleep_workaround_at_exit: * R3 here contains the value that will be returned to the caller * of power7_nap. */ -_GLOBAL(pnv_wakeup_loss) +.global pnv_wakeup_loss +pnv_wakeup_loss: ld r1,PACAR1(r13) BEGIN_FTR_SECTION CHECK_HMI_INTERRUPT @@ -670,7 +835,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_HVMODE) * R3 here contains the value that will be returned to the caller * of power7_nap. */ -_GLOBAL(pnv_wakeup_noloss) +pnv_wakeup_noloss: lbz r0,PACA_NAPSTATELOST(r13) cmpwi r0,0 bne pnv_wakeup_loss diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c index a018f5cae899..5c291df30fe3 100644 --- a/arch/powerpc/kernel/irq.c +++ b/arch/powerpc/kernel/irq.c @@ -65,7 +65,6 @@ #include <asm/machdep.h> #include <asm/udbg.h> #include <asm/smp.h> -#include <asm/debug.h> #include <asm/livepatch.h> #include <asm/asm-prototypes.h> @@ -442,46 +441,6 @@ u64 arch_irq_stat_cpu(unsigned int cpu) return sum; } -#ifdef CONFIG_HOTPLUG_CPU -void migrate_irqs(void) -{ - struct irq_desc *desc; - unsigned int irq; - static int warned; - cpumask_var_t mask; - const struct cpumask *map = cpu_online_mask; - - alloc_cpumask_var(&mask, GFP_KERNEL); - - for_each_irq_desc(irq, desc) { - struct irq_data *data; - struct irq_chip *chip; - - data = irq_desc_get_irq_data(desc); - if (irqd_is_per_cpu(data)) - continue; - - chip = irq_data_get_irq_chip(data); - - cpumask_and(mask, irq_data_get_affinity_mask(data), map); - if (cpumask_any(mask) >= nr_cpu_ids) { - pr_warn("Breaking affinity for irq %i\n", irq); - cpumask_copy(mask, map); - } - if (chip->irq_set_affinity) - chip->irq_set_affinity(data, mask, true); - else if (desc->action && !(warned++)) - pr_err("Cannot set affinity for irq %i\n", irq); - } - - free_cpumask_var(mask); - - local_irq_enable(); - mdelay(1); - local_irq_disable(); -} -#endif - static inline void check_stack_overflow(void) { #ifdef CONFIG_DEBUG_STACKOVERFLOW diff --git a/arch/powerpc/kernel/kprobes-ftrace.c b/arch/powerpc/kernel/kprobes-ftrace.c new file mode 100644 index 000000000000..6c089d9757c9 --- /dev/null +++ b/arch/powerpc/kernel/kprobes-ftrace.c @@ -0,0 +1,104 @@ +/* + * Dynamic Ftrace based Kprobes Optimization + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * Copyright (C) Hitachi Ltd., 2012 + * Copyright 2016 Naveen N. Rao <naveen.n.rao@linux.vnet.ibm.com> + * IBM Corporation + */ +#include <linux/kprobes.h> +#include <linux/ptrace.h> +#include <linux/hardirq.h> +#include <linux/preempt.h> +#include <linux/ftrace.h> + +static nokprobe_inline +int __skip_singlestep(struct kprobe *p, struct pt_regs *regs, + struct kprobe_ctlblk *kcb, unsigned long orig_nip) +{ + /* + * Emulate singlestep (and also recover regs->nip) + * as if there is a nop + */ + regs->nip = (unsigned long)p->addr + MCOUNT_INSN_SIZE; + if (unlikely(p->post_handler)) { + kcb->kprobe_status = KPROBE_HIT_SSDONE; + p->post_handler(p, regs, 0); + } + __this_cpu_write(current_kprobe, NULL); + if (orig_nip) + regs->nip = orig_nip; + return 1; +} + +int skip_singlestep(struct kprobe *p, struct pt_regs *regs, + struct kprobe_ctlblk *kcb) +{ + if (kprobe_ftrace(p)) + return __skip_singlestep(p, regs, kcb, 0); + else + return 0; +} +NOKPROBE_SYMBOL(skip_singlestep); + +/* Ftrace callback handler for kprobes */ +void kprobe_ftrace_handler(unsigned long nip, unsigned long parent_nip, + struct ftrace_ops *ops, struct pt_regs *regs) +{ + struct kprobe *p; + struct kprobe_ctlblk *kcb; + unsigned long flags; + + /* Disable irq for emulating a breakpoint and avoiding preempt */ + local_irq_save(flags); + hard_irq_disable(); + + p = get_kprobe((kprobe_opcode_t *)nip); + if (unlikely(!p) || kprobe_disabled(p)) + goto end; + + kcb = get_kprobe_ctlblk(); + if (kprobe_running()) { + kprobes_inc_nmissed_count(p); + } else { + unsigned long orig_nip = regs->nip; + + /* + * On powerpc, NIP is *before* this instruction for the + * pre handler + */ + regs->nip -= MCOUNT_INSN_SIZE; + + __this_cpu_write(current_kprobe, p); + kcb->kprobe_status = KPROBE_HIT_ACTIVE; + if (!p->pre_handler || !p->pre_handler(p, regs)) + __skip_singlestep(p, regs, kcb, orig_nip); + /* + * If pre_handler returns !0, it sets regs->nip and + * resets current kprobe. + */ + } +end: + local_irq_restore(flags); +} +NOKPROBE_SYMBOL(kprobe_ftrace_handler); + +int arch_prepare_kprobe_ftrace(struct kprobe *p) +{ + p->ainsn.insn = NULL; + p->ainsn.boostable = -1; + return 0; +} diff --git a/arch/powerpc/kernel/kprobes.c b/arch/powerpc/kernel/kprobes.c index fce05a38851c..160ae0fa7d0d 100644 --- a/arch/powerpc/kernel/kprobes.c +++ b/arch/powerpc/kernel/kprobes.c @@ -35,6 +35,7 @@ #include <asm/code-patching.h> #include <asm/cacheflush.h> #include <asm/sstep.h> +#include <asm/sections.h> #include <linux/uaccess.h> DEFINE_PER_CPU(struct kprobe *, current_kprobe) = NULL; @@ -42,7 +43,86 @@ DEFINE_PER_CPU(struct kprobe_ctlblk, kprobe_ctlblk); struct kretprobe_blackpoint kretprobe_blacklist[] = {{NULL, NULL}}; -int __kprobes arch_prepare_kprobe(struct kprobe *p) +bool arch_within_kprobe_blacklist(unsigned long addr) +{ + return (addr >= (unsigned long)__kprobes_text_start && + addr < (unsigned long)__kprobes_text_end) || + (addr >= (unsigned long)_stext && + addr < (unsigned long)__head_end); +} + +kprobe_opcode_t *kprobe_lookup_name(const char *name, unsigned int offset) +{ + kprobe_opcode_t *addr; + +#ifdef PPC64_ELF_ABI_v2 + /* PPC64 ABIv2 needs local entry point */ + addr = (kprobe_opcode_t *)kallsyms_lookup_name(name); + if (addr && !offset) { +#ifdef CONFIG_KPROBES_ON_FTRACE + unsigned long faddr; + /* + * Per livepatch.h, ftrace location is always within the first + * 16 bytes of a function on powerpc with -mprofile-kernel. + */ + faddr = ftrace_location_range((unsigned long)addr, + (unsigned long)addr + 16); + if (faddr) + addr = (kprobe_opcode_t *)faddr; + else +#endif + addr = (kprobe_opcode_t *)ppc_function_entry(addr); + } +#elif defined(PPC64_ELF_ABI_v1) + /* + * 64bit powerpc ABIv1 uses function descriptors: + * - Check for the dot variant of the symbol first. + * - If that fails, try looking up the symbol provided. + * + * This ensures we always get to the actual symbol and not + * the descriptor. + * + * Also handle <module:symbol> format. + */ + char dot_name[MODULE_NAME_LEN + 1 + KSYM_NAME_LEN]; + const char *modsym; + bool dot_appended = false; + if ((modsym = strchr(name, ':')) != NULL) { + modsym++; + if (*modsym != '\0' && *modsym != '.') { + /* Convert to <module:.symbol> */ + strncpy(dot_name, name, modsym - name); + dot_name[modsym - name] = '.'; + dot_name[modsym - name + 1] = '\0'; + strncat(dot_name, modsym, + sizeof(dot_name) - (modsym - name) - 2); + dot_appended = true; + } else { + dot_name[0] = '\0'; + strncat(dot_name, name, sizeof(dot_name) - 1); + } + } else if (name[0] != '.') { + dot_name[0] = '.'; + dot_name[1] = '\0'; + strncat(dot_name, name, KSYM_NAME_LEN - 2); + dot_appended = true; + } else { + dot_name[0] = '\0'; + strncat(dot_name, name, KSYM_NAME_LEN - 1); + } + addr = (kprobe_opcode_t *)kallsyms_lookup_name(dot_name); + if (!addr && dot_appended) { + /* Let's try the original non-dot symbol lookup */ + addr = (kprobe_opcode_t *)kallsyms_lookup_name(name); + } +#else + addr = (kprobe_opcode_t *)kallsyms_lookup_name(name); +#endif + + return addr; +} + +int arch_prepare_kprobe(struct kprobe *p) { int ret = 0; kprobe_opcode_t insn = *p->addr; @@ -74,30 +154,34 @@ int __kprobes arch_prepare_kprobe(struct kprobe *p) p->ainsn.boostable = 0; return ret; } +NOKPROBE_SYMBOL(arch_prepare_kprobe); -void __kprobes arch_arm_kprobe(struct kprobe *p) +void arch_arm_kprobe(struct kprobe *p) { *p->addr = BREAKPOINT_INSTRUCTION; flush_icache_range((unsigned long) p->addr, (unsigned long) p->addr + sizeof(kprobe_opcode_t)); } +NOKPROBE_SYMBOL(arch_arm_kprobe); -void __kprobes arch_disarm_kprobe(struct kprobe *p) +void arch_disarm_kprobe(struct kprobe *p) { *p->addr = p->opcode; flush_icache_range((unsigned long) p->addr, (unsigned long) p->addr + sizeof(kprobe_opcode_t)); } +NOKPROBE_SYMBOL(arch_disarm_kprobe); -void __kprobes arch_remove_kprobe(struct kprobe *p) +void arch_remove_kprobe(struct kprobe *p) { if (p->ainsn.insn) { free_insn_slot(p->ainsn.insn, 0); p->ainsn.insn = NULL; } } +NOKPROBE_SYMBOL(arch_remove_kprobe); -static void __kprobes prepare_singlestep(struct kprobe *p, struct pt_regs *regs) +static nokprobe_inline void prepare_singlestep(struct kprobe *p, struct pt_regs *regs) { enable_single_step(regs); @@ -110,37 +194,80 @@ static void __kprobes prepare_singlestep(struct kprobe *p, struct pt_regs *regs) regs->nip = (unsigned long)p->ainsn.insn; } -static void __kprobes save_previous_kprobe(struct kprobe_ctlblk *kcb) +static nokprobe_inline void save_previous_kprobe(struct kprobe_ctlblk *kcb) { kcb->prev_kprobe.kp = kprobe_running(); kcb->prev_kprobe.status = kcb->kprobe_status; kcb->prev_kprobe.saved_msr = kcb->kprobe_saved_msr; } -static void __kprobes restore_previous_kprobe(struct kprobe_ctlblk *kcb) +static nokprobe_inline void restore_previous_kprobe(struct kprobe_ctlblk *kcb) { __this_cpu_write(current_kprobe, kcb->prev_kprobe.kp); kcb->kprobe_status = kcb->prev_kprobe.status; kcb->kprobe_saved_msr = kcb->prev_kprobe.saved_msr; } -static void __kprobes set_current_kprobe(struct kprobe *p, struct pt_regs *regs, +static nokprobe_inline void set_current_kprobe(struct kprobe *p, struct pt_regs *regs, struct kprobe_ctlblk *kcb) { __this_cpu_write(current_kprobe, p); kcb->kprobe_saved_msr = regs->msr; } -void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri, - struct pt_regs *regs) +bool arch_function_offset_within_entry(unsigned long offset) +{ +#ifdef PPC64_ELF_ABI_v2 +#ifdef CONFIG_KPROBES_ON_FTRACE + return offset <= 16; +#else + return offset <= 8; +#endif +#else + return !offset; +#endif +} + +void arch_prepare_kretprobe(struct kretprobe_instance *ri, struct pt_regs *regs) { ri->ret_addr = (kprobe_opcode_t *)regs->link; /* Replace the return addr with trampoline addr */ regs->link = (unsigned long)kretprobe_trampoline; } +NOKPROBE_SYMBOL(arch_prepare_kretprobe); -int __kprobes kprobe_handler(struct pt_regs *regs) +int try_to_emulate(struct kprobe *p, struct pt_regs *regs) +{ + int ret; + unsigned int insn = *p->ainsn.insn; + + /* regs->nip is also adjusted if emulate_step returns 1 */ + ret = emulate_step(regs, insn); + if (ret > 0) { + /* + * Once this instruction has been boosted + * successfully, set the boostable flag + */ + if (unlikely(p->ainsn.boostable == 0)) + p->ainsn.boostable = 1; + } else if (ret < 0) { + /* + * We don't allow kprobes on mtmsr(d)/rfi(d), etc. + * So, we should never get here... but, its still + * good to catch them, just in case... + */ + printk("Can't step on instruction %x\n", insn); + BUG(); + } else if (ret == 0) + /* This instruction can't be boosted */ + p->ainsn.boostable = -1; + + return ret; +} +NOKPROBE_SYMBOL(try_to_emulate); + +int kprobe_handler(struct pt_regs *regs) { struct kprobe *p; int ret = 0; @@ -177,10 +304,17 @@ int __kprobes kprobe_handler(struct pt_regs *regs) */ save_previous_kprobe(kcb); set_current_kprobe(p, regs, kcb); - kcb->kprobe_saved_msr = regs->msr; kprobes_inc_nmissed_count(p); prepare_singlestep(p, regs); kcb->kprobe_status = KPROBE_REENTER; + if (p->ainsn.boostable >= 0) { + ret = try_to_emulate(p, regs); + + if (ret > 0) { + restore_previous_kprobe(kcb); + return 1; + } + } return 1; } else { if (*addr != BREAKPOINT_INSTRUCTION) { @@ -197,7 +331,9 @@ int __kprobes kprobe_handler(struct pt_regs *regs) } p = __this_cpu_read(current_kprobe); if (p->break_handler && p->break_handler(p, regs)) { - goto ss_probe; + if (!skip_singlestep(p, regs, kcb)) + goto ss_probe; + ret = 1; } } goto no_kprobe; @@ -235,18 +371,9 @@ int __kprobes kprobe_handler(struct pt_regs *regs) ss_probe: if (p->ainsn.boostable >= 0) { - unsigned int insn = *p->ainsn.insn; + ret = try_to_emulate(p, regs); - /* regs->nip is also adjusted if emulate_step returns 1 */ - ret = emulate_step(regs, insn); if (ret > 0) { - /* - * Once this instruction has been boosted - * successfully, set the boostable flag - */ - if (unlikely(p->ainsn.boostable == 0)) - p->ainsn.boostable = 1; - if (p->post_handler) p->post_handler(p, regs, 0); @@ -254,17 +381,7 @@ ss_probe: reset_current_kprobe(); preempt_enable_no_resched(); return 1; - } else if (ret < 0) { - /* - * We don't allow kprobes on mtmsr(d)/rfi(d), etc. - * So, we should never get here... but, its still - * good to catch them, just in case... - */ - printk("Can't step on instruction %x\n", insn); - BUG(); - } else if (ret == 0) - /* This instruction can't be boosted */ - p->ainsn.boostable = -1; + } } prepare_singlestep(p, regs); kcb->kprobe_status = KPROBE_HIT_SS; @@ -274,6 +391,7 @@ no_kprobe: preempt_enable_no_resched(); return ret; } +NOKPROBE_SYMBOL(kprobe_handler); /* * Function return probe trampoline: @@ -291,8 +409,7 @@ asm(".global kretprobe_trampoline\n" /* * Called when the probe at kretprobe trampoline is hit */ -static int __kprobes trampoline_probe_handler(struct kprobe *p, - struct pt_regs *regs) +static int trampoline_probe_handler(struct kprobe *p, struct pt_regs *regs) { struct kretprobe_instance *ri = NULL; struct hlist_head *head, empty_rp; @@ -361,6 +478,7 @@ static int __kprobes trampoline_probe_handler(struct kprobe *p, */ return 1; } +NOKPROBE_SYMBOL(trampoline_probe_handler); /* * Called after single-stepping. p->addr is the address of the @@ -370,7 +488,7 @@ static int __kprobes trampoline_probe_handler(struct kprobe *p, * single-stepped a copy of the instruction. The address of this * copy is p->ainsn.insn. */ -int __kprobes kprobe_post_handler(struct pt_regs *regs) +int kprobe_post_handler(struct pt_regs *regs) { struct kprobe *cur = kprobe_running(); struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); @@ -410,8 +528,9 @@ out: return 1; } +NOKPROBE_SYMBOL(kprobe_post_handler); -int __kprobes kprobe_fault_handler(struct pt_regs *regs, int trapnr) +int kprobe_fault_handler(struct pt_regs *regs, int trapnr) { struct kprobe *cur = kprobe_running(); struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); @@ -474,13 +593,15 @@ int __kprobes kprobe_fault_handler(struct pt_regs *regs, int trapnr) } return 0; } +NOKPROBE_SYMBOL(kprobe_fault_handler); unsigned long arch_deref_entry_point(void *entry) { return ppc_global_function_entry(entry); } +NOKPROBE_SYMBOL(arch_deref_entry_point); -int __kprobes setjmp_pre_handler(struct kprobe *p, struct pt_regs *regs) +int setjmp_pre_handler(struct kprobe *p, struct pt_regs *regs) { struct jprobe *jp = container_of(p, struct jprobe, kp); struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); @@ -497,17 +618,20 @@ int __kprobes setjmp_pre_handler(struct kprobe *p, struct pt_regs *regs) return 1; } +NOKPROBE_SYMBOL(setjmp_pre_handler); -void __used __kprobes jprobe_return(void) +void __used jprobe_return(void) { asm volatile("trap" ::: "memory"); } +NOKPROBE_SYMBOL(jprobe_return); -static void __used __kprobes jprobe_return_end(void) +static void __used jprobe_return_end(void) { -}; +} +NOKPROBE_SYMBOL(jprobe_return_end); -int __kprobes longjmp_break_handler(struct kprobe *p, struct pt_regs *regs) +int longjmp_break_handler(struct kprobe *p, struct pt_regs *regs) { struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); @@ -520,6 +644,7 @@ int __kprobes longjmp_break_handler(struct kprobe *p, struct pt_regs *regs) preempt_enable_no_resched(); return 1; } +NOKPROBE_SYMBOL(longjmp_break_handler); static struct kprobe trampoline_p = { .addr = (kprobe_opcode_t *) &kretprobe_trampoline, @@ -531,10 +656,11 @@ int __init arch_init_kprobes(void) return register_kprobe(&trampoline_p); } -int __kprobes arch_trampoline_kprobe(struct kprobe *p) +int arch_trampoline_kprobe(struct kprobe *p) { if (p->addr == (kprobe_opcode_t *)&kretprobe_trampoline) return 1; return 0; } +NOKPROBE_SYMBOL(arch_trampoline_kprobe); diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c index a1475e6aef3a..16eb0b508761 100644 --- a/arch/powerpc/kernel/mce.c +++ b/arch/powerpc/kernel/mce.c @@ -228,12 +228,13 @@ static void machine_check_process_queued_event(struct irq_work *work) while (__this_cpu_read(mce_queue_count) > 0) { index = __this_cpu_read(mce_queue_count) - 1; machine_check_print_event_info( - this_cpu_ptr(&mce_event_queue[index])); + this_cpu_ptr(&mce_event_queue[index]), false); __this_cpu_dec(mce_queue_count); } } -void machine_check_print_event_info(struct machine_check_event *evt) +void machine_check_print_event_info(struct machine_check_event *evt, + bool user_mode) { const char *level, *sevstr, *subtype; static const char *mc_ue_types[] = { @@ -310,7 +311,16 @@ void machine_check_print_event_info(struct machine_check_event *evt) printk("%s%s Machine check interrupt [%s]\n", level, sevstr, evt->disposition == MCE_DISPOSITION_RECOVERED ? - "Recovered" : "[Not recovered"); + "Recovered" : "Not recovered"); + + if (user_mode) { + printk("%s NIP: [%016llx] PID: %d Comm: %s\n", level, + evt->srr0, current->pid, current->comm); + } else { + printk("%s NIP [%016llx]: %pS\n", level, evt->srr0, + (void *)evt->srr0); + } + printk("%s Initiator: %s\n", level, evt->initiator == MCE_INITIATOR_CPU ? "CPU" : "Unknown"); switch (evt->error_type) { diff --git a/arch/powerpc/kernel/mce_power.c b/arch/powerpc/kernel/mce_power.c index 763d6f58caa8..f913139bb0c2 100644 --- a/arch/powerpc/kernel/mce_power.c +++ b/arch/powerpc/kernel/mce_power.c @@ -72,10 +72,14 @@ void __flush_tlb_power8(unsigned int action) void __flush_tlb_power9(unsigned int action) { + unsigned int num_sets; + if (radix_enabled()) - flush_tlb_206(POWER9_TLB_SETS_RADIX, action); + num_sets = POWER9_TLB_SETS_RADIX; + else + num_sets = POWER9_TLB_SETS_HASH; - flush_tlb_206(POWER9_TLB_SETS_HASH, action); + flush_tlb_206(num_sets, action); } @@ -147,159 +151,365 @@ static int mce_flush(int what) return 0; } -static int mce_handle_flush_derrors(uint64_t dsisr, uint64_t slb, uint64_t tlb, uint64_t erat) -{ - if ((dsisr & slb) && mce_flush(MCE_FLUSH_SLB)) - dsisr &= ~slb; - if ((dsisr & erat) && mce_flush(MCE_FLUSH_ERAT)) - dsisr &= ~erat; - if ((dsisr & tlb) && mce_flush(MCE_FLUSH_TLB)) - dsisr &= ~tlb; - /* Any other errors we don't understand? */ - if (dsisr) - return 0; - return 1; -} - -static long mce_handle_derror(uint64_t dsisr, uint64_t slb_error_bits) +#define SRR1_MC_LOADSTORE(srr1) ((srr1) & PPC_BIT(42)) + +struct mce_ierror_table { + unsigned long srr1_mask; + unsigned long srr1_value; + bool nip_valid; /* nip is a valid indicator of faulting address */ + unsigned int error_type; + unsigned int error_subtype; + unsigned int initiator; + unsigned int severity; +}; + +static const struct mce_ierror_table mce_p7_ierror_table[] = { +{ 0x00000000001c0000, 0x0000000000040000, true, + MCE_ERROR_TYPE_UE, MCE_UE_ERROR_IFETCH, + MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, +{ 0x00000000001c0000, 0x0000000000080000, true, + MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_PARITY, + MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, +{ 0x00000000001c0000, 0x00000000000c0000, true, + MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_MULTIHIT, + MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, +{ 0x00000000001c0000, 0x0000000000100000, true, + MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_INDETERMINATE, /* BOTH */ + MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, +{ 0x00000000001c0000, 0x0000000000140000, true, + MCE_ERROR_TYPE_TLB, MCE_TLB_ERROR_MULTIHIT, + MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, +{ 0x00000000001c0000, 0x0000000000180000, true, + MCE_ERROR_TYPE_UE, MCE_UE_ERROR_PAGE_TABLE_WALK_IFETCH, + MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, +{ 0x00000000001c0000, 0x00000000001c0000, true, + MCE_ERROR_TYPE_UE, MCE_UE_ERROR_IFETCH, + MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, +{ 0, 0, 0, 0, 0, 0 } }; + +static const struct mce_ierror_table mce_p8_ierror_table[] = { +{ 0x00000000081c0000, 0x0000000000040000, true, + MCE_ERROR_TYPE_UE, MCE_UE_ERROR_IFETCH, + MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, +{ 0x00000000081c0000, 0x0000000000080000, true, + MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_PARITY, + MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, +{ 0x00000000081c0000, 0x00000000000c0000, true, + MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_MULTIHIT, + MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, +{ 0x00000000081c0000, 0x0000000000100000, true, + MCE_ERROR_TYPE_ERAT,MCE_ERAT_ERROR_MULTIHIT, + MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, +{ 0x00000000081c0000, 0x0000000000140000, true, + MCE_ERROR_TYPE_TLB, MCE_TLB_ERROR_MULTIHIT, + MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, +{ 0x00000000081c0000, 0x0000000000180000, true, + MCE_ERROR_TYPE_UE, MCE_UE_ERROR_PAGE_TABLE_WALK_IFETCH, + MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, +{ 0x00000000081c0000, 0x00000000001c0000, true, + MCE_ERROR_TYPE_UE, MCE_UE_ERROR_IFETCH, + MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, +{ 0x00000000081c0000, 0x0000000008000000, true, + MCE_ERROR_TYPE_LINK,MCE_LINK_ERROR_IFETCH_TIMEOUT, + MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, +{ 0x00000000081c0000, 0x0000000008040000, true, + MCE_ERROR_TYPE_LINK,MCE_LINK_ERROR_PAGE_TABLE_WALK_IFETCH_TIMEOUT, + MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, +{ 0, 0, 0, 0, 0, 0 } }; + +static const struct mce_ierror_table mce_p9_ierror_table[] = { +{ 0x00000000081c0000, 0x0000000000040000, true, + MCE_ERROR_TYPE_UE, MCE_UE_ERROR_IFETCH, + MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, +{ 0x00000000081c0000, 0x0000000000080000, true, + MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_PARITY, + MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, +{ 0x00000000081c0000, 0x00000000000c0000, true, + MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_MULTIHIT, + MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, +{ 0x00000000081c0000, 0x0000000000100000, true, + MCE_ERROR_TYPE_ERAT,MCE_ERAT_ERROR_MULTIHIT, + MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, +{ 0x00000000081c0000, 0x0000000000140000, true, + MCE_ERROR_TYPE_TLB, MCE_TLB_ERROR_MULTIHIT, + MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, +{ 0x00000000081c0000, 0x0000000000180000, true, + MCE_ERROR_TYPE_UE, MCE_UE_ERROR_PAGE_TABLE_WALK_IFETCH, + MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, +{ 0x00000000081c0000, 0x0000000008000000, true, + MCE_ERROR_TYPE_LINK,MCE_LINK_ERROR_IFETCH_TIMEOUT, + MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, +{ 0x00000000081c0000, 0x0000000008040000, true, + MCE_ERROR_TYPE_LINK,MCE_LINK_ERROR_PAGE_TABLE_WALK_IFETCH_TIMEOUT, + MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, +{ 0x00000000081c0000, 0x00000000080c0000, true, + MCE_ERROR_TYPE_RA, MCE_RA_ERROR_IFETCH, + MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, +{ 0x00000000081c0000, 0x0000000008100000, true, + MCE_ERROR_TYPE_RA, MCE_RA_ERROR_PAGE_TABLE_WALK_IFETCH, + MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, +{ 0x00000000081c0000, 0x0000000008140000, false, + MCE_ERROR_TYPE_RA, MCE_RA_ERROR_STORE, + MCE_INITIATOR_CPU, MCE_SEV_FATAL, }, /* ASYNC is fatal */ +{ 0x00000000081c0000, 0x0000000008180000, false, + MCE_ERROR_TYPE_LINK,MCE_LINK_ERROR_STORE_TIMEOUT, + MCE_INITIATOR_CPU, MCE_SEV_FATAL, }, /* ASYNC is fatal */ +{ 0x00000000081c0000, 0x00000000081c0000, true, + MCE_ERROR_TYPE_RA, MCE_RA_ERROR_PAGE_TABLE_WALK_IFETCH_FOREIGN, + MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, +{ 0, 0, 0, 0, 0, 0 } }; + +struct mce_derror_table { + unsigned long dsisr_value; + bool dar_valid; /* dar is a valid indicator of faulting address */ + unsigned int error_type; + unsigned int error_subtype; + unsigned int initiator; + unsigned int severity; +}; + +static const struct mce_derror_table mce_p7_derror_table[] = { +{ 0x00008000, false, + MCE_ERROR_TYPE_UE, MCE_UE_ERROR_LOAD_STORE, + MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, +{ 0x00004000, true, + MCE_ERROR_TYPE_UE, MCE_UE_ERROR_PAGE_TABLE_WALK_LOAD_STORE, + MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, +{ 0x00000800, true, + MCE_ERROR_TYPE_ERAT, MCE_ERAT_ERROR_MULTIHIT, + MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, +{ 0x00000400, true, + MCE_ERROR_TYPE_TLB, MCE_TLB_ERROR_MULTIHIT, + MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, +{ 0x00000100, true, + MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_PARITY, + MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, +{ 0x00000080, true, + MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_MULTIHIT, + MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, +{ 0x00000040, true, + MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_INDETERMINATE, /* BOTH */ + MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, +{ 0, false, 0, 0, 0, 0 } }; + +static const struct mce_derror_table mce_p8_derror_table[] = { +{ 0x00008000, false, + MCE_ERROR_TYPE_UE, MCE_UE_ERROR_LOAD_STORE, + MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, +{ 0x00004000, true, + MCE_ERROR_TYPE_UE, MCE_UE_ERROR_PAGE_TABLE_WALK_LOAD_STORE, + MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, +{ 0x00002000, true, + MCE_ERROR_TYPE_LINK, MCE_LINK_ERROR_LOAD_TIMEOUT, + MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, +{ 0x00001000, true, + MCE_ERROR_TYPE_LINK, MCE_LINK_ERROR_PAGE_TABLE_WALK_LOAD_STORE_TIMEOUT, + MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, +{ 0x00000800, true, + MCE_ERROR_TYPE_ERAT, MCE_ERAT_ERROR_MULTIHIT, + MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, +{ 0x00000400, true, + MCE_ERROR_TYPE_TLB, MCE_TLB_ERROR_MULTIHIT, + MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, +{ 0x00000200, true, + MCE_ERROR_TYPE_ERAT, MCE_ERAT_ERROR_MULTIHIT, /* SECONDARY ERAT */ + MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, +{ 0x00000100, true, + MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_PARITY, + MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, +{ 0x00000080, true, + MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_MULTIHIT, + MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, +{ 0, false, 0, 0, 0, 0 } }; + +static const struct mce_derror_table mce_p9_derror_table[] = { +{ 0x00008000, false, + MCE_ERROR_TYPE_UE, MCE_UE_ERROR_LOAD_STORE, + MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, +{ 0x00004000, true, + MCE_ERROR_TYPE_UE, MCE_UE_ERROR_PAGE_TABLE_WALK_LOAD_STORE, + MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, +{ 0x00002000, true, + MCE_ERROR_TYPE_LINK, MCE_LINK_ERROR_LOAD_TIMEOUT, + MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, +{ 0x00001000, true, + MCE_ERROR_TYPE_LINK, MCE_LINK_ERROR_PAGE_TABLE_WALK_LOAD_STORE_TIMEOUT, + MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, +{ 0x00000800, true, + MCE_ERROR_TYPE_ERAT, MCE_ERAT_ERROR_MULTIHIT, + MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, +{ 0x00000400, true, + MCE_ERROR_TYPE_TLB, MCE_TLB_ERROR_MULTIHIT, + MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, +{ 0x00000200, false, + MCE_ERROR_TYPE_USER, MCE_USER_ERROR_TLBIE, + MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, +{ 0x00000100, true, + MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_PARITY, + MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, +{ 0x00000080, true, + MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_MULTIHIT, + MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, +{ 0x00000040, true, + MCE_ERROR_TYPE_RA, MCE_RA_ERROR_LOAD, + MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, +{ 0x00000020, false, + MCE_ERROR_TYPE_RA, MCE_RA_ERROR_PAGE_TABLE_WALK_LOAD_STORE, + MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, +{ 0x00000010, false, + MCE_ERROR_TYPE_RA, MCE_RA_ERROR_PAGE_TABLE_WALK_LOAD_STORE_FOREIGN, + MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, +{ 0x00000008, false, + MCE_ERROR_TYPE_RA, MCE_RA_ERROR_LOAD_STORE_FOREIGN, + MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, +{ 0, false, 0, 0, 0, 0 } }; + +static int mce_handle_ierror(struct pt_regs *regs, + const struct mce_ierror_table table[], + struct mce_error_info *mce_err, uint64_t *addr) { - long handled = 1; + uint64_t srr1 = regs->msr; + int handled = 0; + int i; + + *addr = 0; + + for (i = 0; table[i].srr1_mask; i++) { + if ((srr1 & table[i].srr1_mask) != table[i].srr1_value) + continue; + + /* attempt to correct the error */ + switch (table[i].error_type) { + case MCE_ERROR_TYPE_SLB: + handled = mce_flush(MCE_FLUSH_SLB); + break; + case MCE_ERROR_TYPE_ERAT: + handled = mce_flush(MCE_FLUSH_ERAT); + break; + case MCE_ERROR_TYPE_TLB: + handled = mce_flush(MCE_FLUSH_TLB); + break; + } - /* - * flush and reload SLBs for SLB errors and flush TLBs for TLB errors. - * reset the error bits whenever we handle them so that at the end - * we can check whether we handled all of them or not. - * */ -#ifdef CONFIG_PPC_STD_MMU_64 - if (dsisr & slb_error_bits) { - flush_and_reload_slb(); - /* reset error bits */ - dsisr &= ~(slb_error_bits); - } - if (dsisr & P7_DSISR_MC_TLB_MULTIHIT_MFTLB) { - if (cur_cpu_spec && cur_cpu_spec->flush_tlb) - cur_cpu_spec->flush_tlb(TLB_INVAL_SCOPE_GLOBAL); - /* reset error bits */ - dsisr &= ~P7_DSISR_MC_TLB_MULTIHIT_MFTLB; + /* now fill in mce_error_info */ + mce_err->error_type = table[i].error_type; + switch (table[i].error_type) { + case MCE_ERROR_TYPE_UE: + mce_err->u.ue_error_type = table[i].error_subtype; + break; + case MCE_ERROR_TYPE_SLB: + mce_err->u.slb_error_type = table[i].error_subtype; + break; + case MCE_ERROR_TYPE_ERAT: + mce_err->u.erat_error_type = table[i].error_subtype; + break; + case MCE_ERROR_TYPE_TLB: + mce_err->u.tlb_error_type = table[i].error_subtype; + break; + case MCE_ERROR_TYPE_USER: + mce_err->u.user_error_type = table[i].error_subtype; + break; + case MCE_ERROR_TYPE_RA: + mce_err->u.ra_error_type = table[i].error_subtype; + break; + case MCE_ERROR_TYPE_LINK: + mce_err->u.link_error_type = table[i].error_subtype; + break; + } + mce_err->severity = table[i].severity; + mce_err->initiator = table[i].initiator; + if (table[i].nip_valid) + *addr = regs->nip; + return handled; } -#endif - /* Any other errors we don't understand? */ - if (dsisr & 0xffffffffUL) - handled = 0; - return handled; -} + mce_err->error_type = MCE_ERROR_TYPE_UNKNOWN; + mce_err->severity = MCE_SEV_ERROR_SYNC; + mce_err->initiator = MCE_INITIATOR_CPU; -static long mce_handle_derror_p7(uint64_t dsisr) -{ - return mce_handle_derror(dsisr, P7_DSISR_MC_SLB_ERRORS); + return 0; } -static long mce_handle_common_ierror(uint64_t srr1) +static int mce_handle_derror(struct pt_regs *regs, + const struct mce_derror_table table[], + struct mce_error_info *mce_err, uint64_t *addr) { - long handled = 0; - - switch (P7_SRR1_MC_IFETCH(srr1)) { - case 0: - break; -#ifdef CONFIG_PPC_STD_MMU_64 - case P7_SRR1_MC_IFETCH_SLB_PARITY: - case P7_SRR1_MC_IFETCH_SLB_MULTIHIT: - /* flush and reload SLBs for SLB errors. */ - flush_and_reload_slb(); - handled = 1; - break; - case P7_SRR1_MC_IFETCH_TLB_MULTIHIT: - if (cur_cpu_spec && cur_cpu_spec->flush_tlb) { - cur_cpu_spec->flush_tlb(TLB_INVAL_SCOPE_GLOBAL); - handled = 1; + uint64_t dsisr = regs->dsisr; + int handled = 0; + int found = 0; + int i; + + *addr = 0; + + for (i = 0; table[i].dsisr_value; i++) { + if (!(dsisr & table[i].dsisr_value)) + continue; + + /* attempt to correct the error */ + switch (table[i].error_type) { + case MCE_ERROR_TYPE_SLB: + if (mce_flush(MCE_FLUSH_SLB)) + handled = 1; + break; + case MCE_ERROR_TYPE_ERAT: + if (mce_flush(MCE_FLUSH_ERAT)) + handled = 1; + break; + case MCE_ERROR_TYPE_TLB: + if (mce_flush(MCE_FLUSH_TLB)) + handled = 1; + break; } - break; -#endif - default: - break; - } - return handled; -} - -static long mce_handle_ierror_p7(uint64_t srr1) -{ - long handled = 0; - - handled = mce_handle_common_ierror(srr1); + /* + * Attempt to handle multiple conditions, but only return + * one. Ensure uncorrectable errors are first in the table + * to match. + */ + if (found) + continue; + + /* now fill in mce_error_info */ + mce_err->error_type = table[i].error_type; + switch (table[i].error_type) { + case MCE_ERROR_TYPE_UE: + mce_err->u.ue_error_type = table[i].error_subtype; + break; + case MCE_ERROR_TYPE_SLB: + mce_err->u.slb_error_type = table[i].error_subtype; + break; + case MCE_ERROR_TYPE_ERAT: + mce_err->u.erat_error_type = table[i].error_subtype; + break; + case MCE_ERROR_TYPE_TLB: + mce_err->u.tlb_error_type = table[i].error_subtype; + break; + case MCE_ERROR_TYPE_USER: + mce_err->u.user_error_type = table[i].error_subtype; + break; + case MCE_ERROR_TYPE_RA: + mce_err->u.ra_error_type = table[i].error_subtype; + break; + case MCE_ERROR_TYPE_LINK: + mce_err->u.link_error_type = table[i].error_subtype; + break; + } + mce_err->severity = table[i].severity; + mce_err->initiator = table[i].initiator; + if (table[i].dar_valid) + *addr = regs->dar; -#ifdef CONFIG_PPC_STD_MMU_64 - if (P7_SRR1_MC_IFETCH(srr1) == P7_SRR1_MC_IFETCH_SLB_BOTH) { - flush_and_reload_slb(); - handled = 1; + found = 1; } -#endif - return handled; -} -static void mce_get_common_ierror(struct mce_error_info *mce_err, uint64_t srr1) -{ - switch (P7_SRR1_MC_IFETCH(srr1)) { - case P7_SRR1_MC_IFETCH_SLB_PARITY: - mce_err->error_type = MCE_ERROR_TYPE_SLB; - mce_err->u.slb_error_type = MCE_SLB_ERROR_PARITY; - break; - case P7_SRR1_MC_IFETCH_SLB_MULTIHIT: - mce_err->error_type = MCE_ERROR_TYPE_SLB; - mce_err->u.slb_error_type = MCE_SLB_ERROR_MULTIHIT; - break; - case P7_SRR1_MC_IFETCH_TLB_MULTIHIT: - mce_err->error_type = MCE_ERROR_TYPE_TLB; - mce_err->u.tlb_error_type = MCE_TLB_ERROR_MULTIHIT; - break; - case P7_SRR1_MC_IFETCH_UE: - case P7_SRR1_MC_IFETCH_UE_IFU_INTERNAL: - mce_err->error_type = MCE_ERROR_TYPE_UE; - mce_err->u.ue_error_type = MCE_UE_ERROR_IFETCH; - break; - case P7_SRR1_MC_IFETCH_UE_TLB_RELOAD: - mce_err->error_type = MCE_ERROR_TYPE_UE; - mce_err->u.ue_error_type = - MCE_UE_ERROR_PAGE_TABLE_WALK_IFETCH; - break; - } -} + if (found) + return handled; -static void mce_get_ierror_p7(struct mce_error_info *mce_err, uint64_t srr1) -{ - mce_get_common_ierror(mce_err, srr1); - if (P7_SRR1_MC_IFETCH(srr1) == P7_SRR1_MC_IFETCH_SLB_BOTH) { - mce_err->error_type = MCE_ERROR_TYPE_SLB; - mce_err->u.slb_error_type = MCE_SLB_ERROR_INDETERMINATE; - } -} + mce_err->error_type = MCE_ERROR_TYPE_UNKNOWN; + mce_err->severity = MCE_SEV_ERROR_SYNC; + mce_err->initiator = MCE_INITIATOR_CPU; -static void mce_get_derror_p7(struct mce_error_info *mce_err, uint64_t dsisr) -{ - if (dsisr & P7_DSISR_MC_UE) { - mce_err->error_type = MCE_ERROR_TYPE_UE; - mce_err->u.ue_error_type = MCE_UE_ERROR_LOAD_STORE; - } else if (dsisr & P7_DSISR_MC_UE_TABLEWALK) { - mce_err->error_type = MCE_ERROR_TYPE_UE; - mce_err->u.ue_error_type = - MCE_UE_ERROR_PAGE_TABLE_WALK_LOAD_STORE; - } else if (dsisr & P7_DSISR_MC_ERAT_MULTIHIT) { - mce_err->error_type = MCE_ERROR_TYPE_ERAT; - mce_err->u.erat_error_type = MCE_ERAT_ERROR_MULTIHIT; - } else if (dsisr & P7_DSISR_MC_SLB_MULTIHIT) { - mce_err->error_type = MCE_ERROR_TYPE_SLB; - mce_err->u.slb_error_type = MCE_SLB_ERROR_MULTIHIT; - } else if (dsisr & P7_DSISR_MC_SLB_PARITY_MFSLB) { - mce_err->error_type = MCE_ERROR_TYPE_SLB; - mce_err->u.slb_error_type = MCE_SLB_ERROR_PARITY; - } else if (dsisr & P7_DSISR_MC_TLB_MULTIHIT_MFTLB) { - mce_err->error_type = MCE_ERROR_TYPE_TLB; - mce_err->u.tlb_error_type = MCE_TLB_ERROR_MULTIHIT; - } else if (dsisr & P7_DSISR_MC_SLB_MULTIHIT_PARITY) { - mce_err->error_type = MCE_ERROR_TYPE_SLB; - mce_err->u.slb_error_type = MCE_SLB_ERROR_INDETERMINATE; - } + return 0; } static long mce_handle_ue_error(struct pt_regs *regs) @@ -320,292 +530,42 @@ static long mce_handle_ue_error(struct pt_regs *regs) return handled; } -long __machine_check_early_realmode_p7(struct pt_regs *regs) +static long mce_handle_error(struct pt_regs *regs, + const struct mce_derror_table dtable[], + const struct mce_ierror_table itable[]) { - uint64_t srr1, nip, addr; - long handled = 1; - struct mce_error_info mce_error_info = { 0 }; - - mce_error_info.severity = MCE_SEV_ERROR_SYNC; - mce_error_info.initiator = MCE_INITIATOR_CPU; - - srr1 = regs->msr; - nip = regs->nip; + struct mce_error_info mce_err = { 0 }; + uint64_t addr; + uint64_t srr1 = regs->msr; + long handled; - /* - * Handle memory errors depending whether this was a load/store or - * ifetch exception. Also, populate the mce error_type and - * type-specific error_type from either SRR1 or DSISR, depending - * whether this was a load/store or ifetch exception - */ - if (P7_SRR1_MC_LOADSTORE(srr1)) { - handled = mce_handle_derror_p7(regs->dsisr); - mce_get_derror_p7(&mce_error_info, regs->dsisr); - addr = regs->dar; - } else { - handled = mce_handle_ierror_p7(srr1); - mce_get_ierror_p7(&mce_error_info, srr1); - addr = regs->nip; - } + if (SRR1_MC_LOADSTORE(srr1)) + handled = mce_handle_derror(regs, dtable, &mce_err, &addr); + else + handled = mce_handle_ierror(regs, itable, &mce_err, &addr); - /* Handle UE error. */ - if (mce_error_info.error_type == MCE_ERROR_TYPE_UE) + if (!handled && mce_err.error_type == MCE_ERROR_TYPE_UE) handled = mce_handle_ue_error(regs); - save_mce_event(regs, handled, &mce_error_info, nip, addr); - return handled; -} - -static void mce_get_ierror_p8(struct mce_error_info *mce_err, uint64_t srr1) -{ - mce_get_common_ierror(mce_err, srr1); - if (P7_SRR1_MC_IFETCH(srr1) == P8_SRR1_MC_IFETCH_ERAT_MULTIHIT) { - mce_err->error_type = MCE_ERROR_TYPE_ERAT; - mce_err->u.erat_error_type = MCE_ERAT_ERROR_MULTIHIT; - } -} - -static void mce_get_derror_p8(struct mce_error_info *mce_err, uint64_t dsisr) -{ - mce_get_derror_p7(mce_err, dsisr); - if (dsisr & P8_DSISR_MC_ERAT_MULTIHIT_SEC) { - mce_err->error_type = MCE_ERROR_TYPE_ERAT; - mce_err->u.erat_error_type = MCE_ERAT_ERROR_MULTIHIT; - } -} - -static long mce_handle_ierror_p8(uint64_t srr1) -{ - long handled = 0; + save_mce_event(regs, handled, &mce_err, regs->nip, addr); - handled = mce_handle_common_ierror(srr1); - -#ifdef CONFIG_PPC_STD_MMU_64 - if (P7_SRR1_MC_IFETCH(srr1) == P8_SRR1_MC_IFETCH_ERAT_MULTIHIT) { - flush_and_reload_slb(); - handled = 1; - } -#endif return handled; } -static long mce_handle_derror_p8(uint64_t dsisr) -{ - return mce_handle_derror(dsisr, P8_DSISR_MC_SLB_ERRORS); -} - -long __machine_check_early_realmode_p8(struct pt_regs *regs) -{ - uint64_t srr1, nip, addr; - long handled = 1; - struct mce_error_info mce_error_info = { 0 }; - - mce_error_info.severity = MCE_SEV_ERROR_SYNC; - mce_error_info.initiator = MCE_INITIATOR_CPU; - - srr1 = regs->msr; - nip = regs->nip; - - if (P7_SRR1_MC_LOADSTORE(srr1)) { - handled = mce_handle_derror_p8(regs->dsisr); - mce_get_derror_p8(&mce_error_info, regs->dsisr); - addr = regs->dar; - } else { - handled = mce_handle_ierror_p8(srr1); - mce_get_ierror_p8(&mce_error_info, srr1); - addr = regs->nip; - } - - /* Handle UE error. */ - if (mce_error_info.error_type == MCE_ERROR_TYPE_UE) - handled = mce_handle_ue_error(regs); - - save_mce_event(regs, handled, &mce_error_info, nip, addr); - return handled; -} - -static int mce_handle_derror_p9(struct pt_regs *regs) -{ - uint64_t dsisr = regs->dsisr; - - return mce_handle_flush_derrors(dsisr, - P9_DSISR_MC_SLB_PARITY_MFSLB | - P9_DSISR_MC_SLB_MULTIHIT_MFSLB, - - P9_DSISR_MC_TLB_MULTIHIT_MFTLB, - - P9_DSISR_MC_ERAT_MULTIHIT); -} - -static int mce_handle_ierror_p9(struct pt_regs *regs) +long __machine_check_early_realmode_p7(struct pt_regs *regs) { - uint64_t srr1 = regs->msr; + /* P7 DD1 leaves top bits of DSISR undefined */ + regs->dsisr &= 0x0000ffff; - switch (P9_SRR1_MC_IFETCH(srr1)) { - case P9_SRR1_MC_IFETCH_SLB_PARITY: - case P9_SRR1_MC_IFETCH_SLB_MULTIHIT: - return mce_flush(MCE_FLUSH_SLB); - case P9_SRR1_MC_IFETCH_TLB_MULTIHIT: - return mce_flush(MCE_FLUSH_TLB); - case P9_SRR1_MC_IFETCH_ERAT_MULTIHIT: - return mce_flush(MCE_FLUSH_ERAT); - default: - return 0; - } + return mce_handle_error(regs, mce_p7_derror_table, mce_p7_ierror_table); } -static void mce_get_derror_p9(struct pt_regs *regs, - struct mce_error_info *mce_err, uint64_t *addr) -{ - uint64_t dsisr = regs->dsisr; - - mce_err->severity = MCE_SEV_ERROR_SYNC; - mce_err->initiator = MCE_INITIATOR_CPU; - - if (dsisr & P9_DSISR_MC_USER_TLBIE) - *addr = regs->nip; - else - *addr = regs->dar; - - if (dsisr & P9_DSISR_MC_UE) { - mce_err->error_type = MCE_ERROR_TYPE_UE; - mce_err->u.ue_error_type = MCE_UE_ERROR_LOAD_STORE; - } else if (dsisr & P9_DSISR_MC_UE_TABLEWALK) { - mce_err->error_type = MCE_ERROR_TYPE_UE; - mce_err->u.ue_error_type = MCE_UE_ERROR_PAGE_TABLE_WALK_LOAD_STORE; - } else if (dsisr & P9_DSISR_MC_LINK_LOAD_TIMEOUT) { - mce_err->error_type = MCE_ERROR_TYPE_LINK; - mce_err->u.link_error_type = MCE_LINK_ERROR_LOAD_TIMEOUT; - } else if (dsisr & P9_DSISR_MC_LINK_TABLEWALK_TIMEOUT) { - mce_err->error_type = MCE_ERROR_TYPE_LINK; - mce_err->u.link_error_type = MCE_LINK_ERROR_PAGE_TABLE_WALK_LOAD_STORE_TIMEOUT; - } else if (dsisr & P9_DSISR_MC_ERAT_MULTIHIT) { - mce_err->error_type = MCE_ERROR_TYPE_ERAT; - mce_err->u.erat_error_type = MCE_ERAT_ERROR_MULTIHIT; - } else if (dsisr & P9_DSISR_MC_TLB_MULTIHIT_MFTLB) { - mce_err->error_type = MCE_ERROR_TYPE_TLB; - mce_err->u.tlb_error_type = MCE_TLB_ERROR_MULTIHIT; - } else if (dsisr & P9_DSISR_MC_USER_TLBIE) { - mce_err->error_type = MCE_ERROR_TYPE_USER; - mce_err->u.user_error_type = MCE_USER_ERROR_TLBIE; - } else if (dsisr & P9_DSISR_MC_SLB_PARITY_MFSLB) { - mce_err->error_type = MCE_ERROR_TYPE_SLB; - mce_err->u.slb_error_type = MCE_SLB_ERROR_PARITY; - } else if (dsisr & P9_DSISR_MC_SLB_MULTIHIT_MFSLB) { - mce_err->error_type = MCE_ERROR_TYPE_SLB; - mce_err->u.slb_error_type = MCE_SLB_ERROR_MULTIHIT; - } else if (dsisr & P9_DSISR_MC_RA_LOAD) { - mce_err->error_type = MCE_ERROR_TYPE_RA; - mce_err->u.ra_error_type = MCE_RA_ERROR_LOAD; - } else if (dsisr & P9_DSISR_MC_RA_TABLEWALK) { - mce_err->error_type = MCE_ERROR_TYPE_RA; - mce_err->u.ra_error_type = MCE_RA_ERROR_PAGE_TABLE_WALK_LOAD_STORE; - } else if (dsisr & P9_DSISR_MC_RA_TABLEWALK_FOREIGN) { - mce_err->error_type = MCE_ERROR_TYPE_RA; - mce_err->u.ra_error_type = MCE_RA_ERROR_PAGE_TABLE_WALK_LOAD_STORE_FOREIGN; - } else if (dsisr & P9_DSISR_MC_RA_FOREIGN) { - mce_err->error_type = MCE_ERROR_TYPE_RA; - mce_err->u.ra_error_type = MCE_RA_ERROR_LOAD_STORE_FOREIGN; - } -} - -static void mce_get_ierror_p9(struct pt_regs *regs, - struct mce_error_info *mce_err, uint64_t *addr) +long __machine_check_early_realmode_p8(struct pt_regs *regs) { - uint64_t srr1 = regs->msr; - - switch (P9_SRR1_MC_IFETCH(srr1)) { - case P9_SRR1_MC_IFETCH_RA_ASYNC_STORE: - case P9_SRR1_MC_IFETCH_LINK_ASYNC_STORE_TIMEOUT: - mce_err->severity = MCE_SEV_FATAL; - break; - default: - mce_err->severity = MCE_SEV_ERROR_SYNC; - break; - } - - mce_err->initiator = MCE_INITIATOR_CPU; - - *addr = regs->nip; - - switch (P9_SRR1_MC_IFETCH(srr1)) { - case P9_SRR1_MC_IFETCH_UE: - mce_err->error_type = MCE_ERROR_TYPE_UE; - mce_err->u.ue_error_type = MCE_UE_ERROR_IFETCH; - break; - case P9_SRR1_MC_IFETCH_SLB_PARITY: - mce_err->error_type = MCE_ERROR_TYPE_SLB; - mce_err->u.slb_error_type = MCE_SLB_ERROR_PARITY; - break; - case P9_SRR1_MC_IFETCH_SLB_MULTIHIT: - mce_err->error_type = MCE_ERROR_TYPE_SLB; - mce_err->u.slb_error_type = MCE_SLB_ERROR_MULTIHIT; - break; - case P9_SRR1_MC_IFETCH_ERAT_MULTIHIT: - mce_err->error_type = MCE_ERROR_TYPE_ERAT; - mce_err->u.erat_error_type = MCE_ERAT_ERROR_MULTIHIT; - break; - case P9_SRR1_MC_IFETCH_TLB_MULTIHIT: - mce_err->error_type = MCE_ERROR_TYPE_TLB; - mce_err->u.tlb_error_type = MCE_TLB_ERROR_MULTIHIT; - break; - case P9_SRR1_MC_IFETCH_UE_TLB_RELOAD: - mce_err->error_type = MCE_ERROR_TYPE_UE; - mce_err->u.ue_error_type = MCE_UE_ERROR_PAGE_TABLE_WALK_IFETCH; - break; - case P9_SRR1_MC_IFETCH_LINK_TIMEOUT: - mce_err->error_type = MCE_ERROR_TYPE_LINK; - mce_err->u.link_error_type = MCE_LINK_ERROR_IFETCH_TIMEOUT; - break; - case P9_SRR1_MC_IFETCH_LINK_TABLEWALK_TIMEOUT: - mce_err->error_type = MCE_ERROR_TYPE_LINK; - mce_err->u.link_error_type = MCE_LINK_ERROR_PAGE_TABLE_WALK_IFETCH_TIMEOUT; - break; - case P9_SRR1_MC_IFETCH_RA: - mce_err->error_type = MCE_ERROR_TYPE_RA; - mce_err->u.ra_error_type = MCE_RA_ERROR_IFETCH; - break; - case P9_SRR1_MC_IFETCH_RA_TABLEWALK: - mce_err->error_type = MCE_ERROR_TYPE_RA; - mce_err->u.ra_error_type = MCE_RA_ERROR_PAGE_TABLE_WALK_IFETCH; - break; - case P9_SRR1_MC_IFETCH_RA_ASYNC_STORE: - mce_err->error_type = MCE_ERROR_TYPE_RA; - mce_err->u.ra_error_type = MCE_RA_ERROR_STORE; - break; - case P9_SRR1_MC_IFETCH_LINK_ASYNC_STORE_TIMEOUT: - mce_err->error_type = MCE_ERROR_TYPE_LINK; - mce_err->u.link_error_type = MCE_LINK_ERROR_STORE_TIMEOUT; - break; - case P9_SRR1_MC_IFETCH_RA_TABLEWALK_FOREIGN: - mce_err->error_type = MCE_ERROR_TYPE_RA; - mce_err->u.ra_error_type = MCE_RA_ERROR_PAGE_TABLE_WALK_IFETCH_FOREIGN; - break; - default: - break; - } + return mce_handle_error(regs, mce_p8_derror_table, mce_p8_ierror_table); } long __machine_check_early_realmode_p9(struct pt_regs *regs) { - uint64_t nip, addr; - long handled; - struct mce_error_info mce_error_info = { 0 }; - - nip = regs->nip; - - if (P9_SRR1_MC_LOADSTORE(regs->msr)) { - handled = mce_handle_derror_p9(regs); - mce_get_derror_p9(regs, &mce_error_info, &addr); - } else { - handled = mce_handle_ierror_p9(regs); - mce_get_ierror_p9(regs, &mce_error_info, &addr); - } - - /* Handle UE error. */ - if (mce_error_info.error_type == MCE_ERROR_TYPE_UE) - handled = mce_handle_ue_error(regs); - - save_mce_event(regs, handled, &mce_error_info, nip, addr); - return handled; + return mce_handle_error(regs, mce_p9_derror_table, mce_p9_ierror_table); } diff --git a/arch/powerpc/kernel/optprobes.c b/arch/powerpc/kernel/optprobes.c index 2282bf4e63cd..ec60ed0d4aad 100644 --- a/arch/powerpc/kernel/optprobes.c +++ b/arch/powerpc/kernel/optprobes.c @@ -243,10 +243,10 @@ int arch_prepare_optimized_kprobe(struct optimized_kprobe *op, struct kprobe *p) /* * 2. branch to optimized_callback() and emulate_step() */ - kprobe_lookup_name("optimized_callback", op_callback_addr); - kprobe_lookup_name("emulate_step", emulate_step_addr); + op_callback_addr = (kprobe_opcode_t *)ppc_kallsyms_lookup_name("optimized_callback"); + emulate_step_addr = (kprobe_opcode_t *)ppc_kallsyms_lookup_name("emulate_step"); if (!op_callback_addr || !emulate_step_addr) { - WARN(1, "kprobe_lookup_name() failed\n"); + WARN(1, "Unable to lookup optimized_callback()/emulate_step()\n"); goto error; } diff --git a/arch/powerpc/kernel/paca.c b/arch/powerpc/kernel/paca.c index dfc479df9634..8d63627e067f 100644 --- a/arch/powerpc/kernel/paca.c +++ b/arch/powerpc/kernel/paca.c @@ -245,3 +245,24 @@ void __init free_unused_pacas(void) free_lppacas(); } + +void copy_mm_to_paca(struct mm_struct *mm) +{ +#ifdef CONFIG_PPC_BOOK3S + mm_context_t *context = &mm->context; + + get_paca()->mm_ctx_id = context->id; +#ifdef CONFIG_PPC_MM_SLICES + VM_BUG_ON(!mm->context.addr_limit); + get_paca()->addr_limit = mm->context.addr_limit; + get_paca()->mm_ctx_low_slices_psize = context->low_slices_psize; + memcpy(&get_paca()->mm_ctx_high_slices_psize, + &context->high_slices_psize, TASK_SLICE_ARRAY_SZ(mm)); +#else /* CONFIG_PPC_MM_SLICES */ + get_paca()->mm_ctx_user_psize = context->user_psize; + get_paca()->mm_ctx_sllp = context->sllp; +#endif +#else /* CONFIG_PPC_BOOK3S */ + return; +#endif +} diff --git a/arch/powerpc/kernel/prom.c b/arch/powerpc/kernel/prom.c index f5d399e46193..d2f0afeae5a0 100644 --- a/arch/powerpc/kernel/prom.c +++ b/arch/powerpc/kernel/prom.c @@ -55,7 +55,6 @@ #include <asm/kexec.h> #include <asm/opal.h> #include <asm/fadump.h> -#include <asm/debug.h> #include <asm/epapr_hcalls.h> #include <asm/firmware.h> diff --git a/arch/powerpc/kernel/prom_init.c b/arch/powerpc/kernel/prom_init.c index 1c1b44ec7642..dd8a04f3053a 100644 --- a/arch/powerpc/kernel/prom_init.c +++ b/arch/powerpc/kernel/prom_init.c @@ -815,7 +815,7 @@ struct ibm_arch_vec __cacheline_aligned ibm_architecture_vec = { .virt_base = cpu_to_be32(0xffffffff), .virt_size = cpu_to_be32(0xffffffff), .load_base = cpu_to_be32(0xffffffff), - .min_rma = cpu_to_be32(256), /* 256MB min RMA */ + .min_rma = cpu_to_be32(512), /* 512MB min RMA */ .min_load = cpu_to_be32(0xffffffff), /* full client load */ .min_rma_percent = 0, /* min RMA percentage of total RAM */ .max_pft_size = 48, /* max log_2(hash table size) */ diff --git a/arch/powerpc/kernel/setup-common.c b/arch/powerpc/kernel/setup-common.c index 4697da895133..5c10b5925ac2 100644 --- a/arch/powerpc/kernel/setup-common.c +++ b/arch/powerpc/kernel/setup-common.c @@ -31,11 +31,11 @@ #include <linux/unistd.h> #include <linux/serial.h> #include <linux/serial_8250.h> -#include <linux/debugfs.h> #include <linux/percpu.h> #include <linux/memblock.h> #include <linux/of_platform.h> #include <linux/hugetlb.h> +#include <asm/debugfs.h> #include <asm/io.h> #include <asm/paca.h> #include <asm/prom.h> @@ -920,6 +920,15 @@ void __init setup_arch(char **cmdline_p) init_mm.end_code = (unsigned long) _etext; init_mm.end_data = (unsigned long) _edata; init_mm.brk = klimit; + +#ifdef CONFIG_PPC_MM_SLICES +#ifdef CONFIG_PPC64 + init_mm.context.addr_limit = TASK_SIZE_128TB; +#else +#error "context.addr_limit not initialized." +#endif +#endif + #ifdef CONFIG_PPC_64K_PAGES init_mm.context.pte_frag = NULL; #endif diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c index 9cfaa8b69b5f..729e990a019d 100644 --- a/arch/powerpc/kernel/setup_64.c +++ b/arch/powerpc/kernel/setup_64.c @@ -230,8 +230,8 @@ static void cpu_ready_for_interrupts(void) * If we are not in hypervisor mode the job is done once for * the whole partition in configure_exceptions(). */ - if (early_cpu_has_feature(CPU_FTR_HVMODE) && - early_cpu_has_feature(CPU_FTR_ARCH_207S)) { + if (cpu_has_feature(CPU_FTR_HVMODE) && + cpu_has_feature(CPU_FTR_ARCH_207S)) { unsigned long lpcr = mfspr(SPRN_LPCR); mtspr(SPRN_LPCR, lpcr | LPCR_AIL_3); } diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c index 46f89e66a273..2eca1e491e2b 100644 --- a/arch/powerpc/kernel/smp.c +++ b/arch/powerpc/kernel/smp.c @@ -39,6 +39,7 @@ #include <asm/irq.h> #include <asm/hw_irq.h> #include <asm/kvm_ppc.h> +#include <asm/dbell.h> #include <asm/page.h> #include <asm/pgtable.h> #include <asm/prom.h> @@ -211,17 +212,9 @@ int smp_request_message_ipi(int virq, int msg) #ifdef CONFIG_PPC_SMP_MUXED_IPI struct cpu_messages { long messages; /* current messages */ - unsigned long data; /* data for cause ipi */ }; static DEFINE_PER_CPU_SHARED_ALIGNED(struct cpu_messages, ipi_message); -void smp_muxed_ipi_set_data(int cpu, unsigned long data) -{ - struct cpu_messages *info = &per_cpu(ipi_message, cpu); - - info->data = data; -} - void smp_muxed_ipi_set_message(int cpu, int msg) { struct cpu_messages *info = &per_cpu(ipi_message, cpu); @@ -236,14 +229,13 @@ void smp_muxed_ipi_set_message(int cpu, int msg) void smp_muxed_ipi_message_pass(int cpu, int msg) { - struct cpu_messages *info = &per_cpu(ipi_message, cpu); - smp_muxed_ipi_set_message(cpu, msg); + /* * cause_ipi functions are required to include a full barrier * before doing whatever causes the IPI. */ - smp_ops->cause_ipi(cpu, info->data); + smp_ops->cause_ipi(cpu); } #ifdef __BIG_ENDIAN__ @@ -254,11 +246,18 @@ void smp_muxed_ipi_message_pass(int cpu, int msg) irqreturn_t smp_ipi_demux(void) { - struct cpu_messages *info = this_cpu_ptr(&ipi_message); - unsigned long all; - mb(); /* order any irq clear */ + return smp_ipi_demux_relaxed(); +} + +/* sync-free variant. Callers should ensure synchronization */ +irqreturn_t smp_ipi_demux_relaxed(void) +{ + struct cpu_messages *info; + unsigned long all; + + info = this_cpu_ptr(&ipi_message); do { all = xchg(&info->messages, 0); #if defined(CONFIG_KVM_XICS) && defined(CONFIG_KVM_BOOK3S_HV_POSSIBLE) @@ -439,7 +438,14 @@ int generic_cpu_disable(void) #ifdef CONFIG_PPC64 vdso_data->processorCount--; #endif - migrate_irqs(); + /* Update affinity of all IRQs previously aimed at this CPU */ + irq_migrate_all_off_this_cpu(); + + /* Give the CPU time to drain in-flight ones */ + local_irq_enable(); + mdelay(1); + local_irq_disable(); + return 0; } @@ -521,6 +527,16 @@ int __cpu_up(unsigned int cpu, struct task_struct *tidle) cpu_idle_thread_init(cpu, tidle); + /* + * The platform might need to allocate resources prior to bringing + * up the CPU + */ + if (smp_ops->prepare_cpu) { + rc = smp_ops->prepare_cpu(cpu); + if (rc) + return rc; + } + /* Make sure callin-map entry is 0 (can be leftover a CPU * hotplug */ diff --git a/arch/powerpc/kernel/stacktrace.c b/arch/powerpc/kernel/stacktrace.c index 66711958493c..d534ed901538 100644 --- a/arch/powerpc/kernel/stacktrace.c +++ b/arch/powerpc/kernel/stacktrace.c @@ -59,7 +59,14 @@ EXPORT_SYMBOL_GPL(save_stack_trace); void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace) { - save_context_stack(trace, tsk->thread.ksp, tsk, 0); + unsigned long sp; + + if (tsk == current) + sp = current_stack_pointer(); + else + sp = tsk->thread.ksp; + + save_context_stack(trace, sp, tsk, 0); } EXPORT_SYMBOL_GPL(save_stack_trace_tsk); diff --git a/arch/powerpc/kernel/swsusp.c b/arch/powerpc/kernel/swsusp.c index 6ae9bd5086a4..0050b2d2ff7a 100644 --- a/arch/powerpc/kernel/swsusp.c +++ b/arch/powerpc/kernel/swsusp.c @@ -10,6 +10,7 @@ */ #include <linux/sched.h> +#include <linux/suspend.h> #include <asm/current.h> #include <asm/mmu_context.h> #include <asm/switch_to.h> diff --git a/arch/powerpc/kernel/syscalls.c b/arch/powerpc/kernel/syscalls.c index de04c9fbb5cd..a877bf8269fe 100644 --- a/arch/powerpc/kernel/syscalls.c +++ b/arch/powerpc/kernel/syscalls.c @@ -42,11 +42,11 @@ #include <asm/unistd.h> #include <asm/asm-prototypes.h> -static inline unsigned long do_mmap2(unsigned long addr, size_t len, +static inline long do_mmap2(unsigned long addr, size_t len, unsigned long prot, unsigned long flags, unsigned long fd, unsigned long off, int shift) { - unsigned long ret = -EINVAL; + long ret = -EINVAL; if (!arch_validate_prot(prot)) goto out; @@ -62,16 +62,16 @@ out: return ret; } -unsigned long sys_mmap2(unsigned long addr, size_t len, - unsigned long prot, unsigned long flags, - unsigned long fd, unsigned long pgoff) +SYSCALL_DEFINE6(mmap2, unsigned long, addr, size_t, len, + unsigned long, prot, unsigned long, flags, + unsigned long, fd, unsigned long, pgoff) { return do_mmap2(addr, len, prot, flags, fd, pgoff, PAGE_SHIFT-12); } -unsigned long sys_mmap(unsigned long addr, size_t len, - unsigned long prot, unsigned long flags, - unsigned long fd, off_t offset) +SYSCALL_DEFINE6(mmap, unsigned long, addr, size_t, len, + unsigned long, prot, unsigned long, flags, + unsigned long, fd, off_t, offset) { return do_mmap2(addr, len, prot, flags, fd, offset, PAGE_SHIFT); } diff --git a/arch/powerpc/kernel/sysfs.c b/arch/powerpc/kernel/sysfs.c index c1fb255a60d6..949957b97ead 100644 --- a/arch/powerpc/kernel/sysfs.c +++ b/arch/powerpc/kernel/sysfs.c @@ -710,6 +710,10 @@ static int register_cpu_online(unsigned int cpu) struct device_attribute *attrs, *pmc_attrs; int i, nattrs; + /* For cpus present at boot a reference was already grabbed in register_cpu() */ + if (!s->of_node) + s->of_node = of_get_cpu_node(cpu, NULL); + #ifdef CONFIG_PPC64 if (cpu_has_feature(CPU_FTR_SMT)) device_create_file(s, &dev_attr_smt_snooze_delay); @@ -864,6 +868,8 @@ static int unregister_cpu_online(unsigned int cpu) } #endif cacheinfo_cpu_offline(cpu); + of_node_put(s->of_node); + s->of_node = NULL; #endif /* CONFIG_HOTPLUG_CPU */ return 0; } diff --git a/arch/powerpc/kernel/trace/Makefile b/arch/powerpc/kernel/trace/Makefile new file mode 100644 index 000000000000..729dffc5f7bc --- /dev/null +++ b/arch/powerpc/kernel/trace/Makefile @@ -0,0 +1,29 @@ +# +# Makefile for the powerpc trace subsystem +# + +subdir-ccflags-$(CONFIG_PPC_WERROR) := -Werror + +ifdef CONFIG_FUNCTION_TRACER +# do not trace tracer code +CFLAGS_REMOVE_ftrace.o = -mno-sched-epilog $(CC_FLAGS_FTRACE) +endif + +obj32-$(CONFIG_FUNCTION_TRACER) += ftrace_32.o +obj64-$(CONFIG_FUNCTION_TRACER) += ftrace_64.o +ifdef CONFIG_MPROFILE_KERNEL +obj64-$(CONFIG_FUNCTION_TRACER) += ftrace_64_mprofile.o +else +obj64-$(CONFIG_FUNCTION_TRACER) += ftrace_64_pg.o +endif +obj-$(CONFIG_DYNAMIC_FTRACE) += ftrace.o +obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += ftrace.o +obj-$(CONFIG_FTRACE_SYSCALLS) += ftrace.o +obj-$(CONFIG_TRACING) += trace_clock.o + +obj-$(CONFIG_PPC64) += $(obj64-y) +obj-$(CONFIG_PPC32) += $(obj32-y) + +# Disable GCOV & sanitizers in odd or sensitive code +GCOV_PROFILE_ftrace.o := n +UBSAN_SANITIZE_ftrace.o := n diff --git a/arch/powerpc/kernel/ftrace.c b/arch/powerpc/kernel/trace/ftrace.c index 5c9f50c1aa99..32509de6ce4c 100644 --- a/arch/powerpc/kernel/ftrace.c +++ b/arch/powerpc/kernel/trace/ftrace.c @@ -21,6 +21,7 @@ #include <linux/init.h> #include <linux/list.h> +#include <asm/asm-prototypes.h> #include <asm/cacheflush.h> #include <asm/code-patching.h> #include <asm/ftrace.h> diff --git a/arch/powerpc/kernel/trace/ftrace_32.S b/arch/powerpc/kernel/trace/ftrace_32.S new file mode 100644 index 000000000000..afef2c076282 --- /dev/null +++ b/arch/powerpc/kernel/trace/ftrace_32.S @@ -0,0 +1,118 @@ +/* + * Split from entry_32.S + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <linux/magic.h> +#include <asm/reg.h> +#include <asm/ppc_asm.h> +#include <asm/asm-offsets.h> +#include <asm/ftrace.h> +#include <asm/export.h> + +#ifdef CONFIG_DYNAMIC_FTRACE +_GLOBAL(mcount) +_GLOBAL(_mcount) + /* + * It is required that _mcount on PPC32 must preserve the + * link register. But we have r0 to play with. We use r0 + * to push the return address back to the caller of mcount + * into the ctr register, restore the link register and + * then jump back using the ctr register. + */ + mflr r0 + mtctr r0 + lwz r0, 4(r1) + mtlr r0 + bctr + +_GLOBAL(ftrace_caller) + MCOUNT_SAVE_FRAME + /* r3 ends up with link register */ + subi r3, r3, MCOUNT_INSN_SIZE +.globl ftrace_call +ftrace_call: + bl ftrace_stub + nop +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +.globl ftrace_graph_call +ftrace_graph_call: + b ftrace_graph_stub +_GLOBAL(ftrace_graph_stub) +#endif + MCOUNT_RESTORE_FRAME + /* old link register ends up in ctr reg */ + bctr +#else +_GLOBAL(mcount) +_GLOBAL(_mcount) + + MCOUNT_SAVE_FRAME + + subi r3, r3, MCOUNT_INSN_SIZE + LOAD_REG_ADDR(r5, ftrace_trace_function) + lwz r5,0(r5) + + mtctr r5 + bctrl + nop + +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + b ftrace_graph_caller +#endif + MCOUNT_RESTORE_FRAME + bctr +#endif +EXPORT_SYMBOL(_mcount) + +_GLOBAL(ftrace_stub) + blr + +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +_GLOBAL(ftrace_graph_caller) + /* load r4 with local address */ + lwz r4, 44(r1) + subi r4, r4, MCOUNT_INSN_SIZE + + /* Grab the LR out of the caller stack frame */ + lwz r3,52(r1) + + bl prepare_ftrace_return + nop + + /* + * prepare_ftrace_return gives us the address we divert to. + * Change the LR in the callers stack frame to this. + */ + stw r3,52(r1) + + MCOUNT_RESTORE_FRAME + /* old link register ends up in ctr reg */ + bctr + +_GLOBAL(return_to_handler) + /* need to save return values */ + stwu r1, -32(r1) + stw r3, 20(r1) + stw r4, 16(r1) + stw r31, 12(r1) + mr r31, r1 + + bl ftrace_return_to_handler + nop + + /* return value has real return address */ + mtlr r3 + + lwz r3, 20(r1) + lwz r4, 16(r1) + lwz r31,12(r1) + lwz r1, 0(r1) + + /* Jump back to real return address */ + blr +#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ diff --git a/arch/powerpc/kernel/trace/ftrace_64.S b/arch/powerpc/kernel/trace/ftrace_64.S new file mode 100644 index 000000000000..e5ccea19821e --- /dev/null +++ b/arch/powerpc/kernel/trace/ftrace_64.S @@ -0,0 +1,85 @@ +/* + * Split from entry_64.S + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <linux/magic.h> +#include <asm/ppc_asm.h> +#include <asm/asm-offsets.h> +#include <asm/ftrace.h> +#include <asm/ppc-opcode.h> +#include <asm/export.h> + +#ifdef CONFIG_DYNAMIC_FTRACE +_GLOBAL(mcount) +_GLOBAL(_mcount) +EXPORT_SYMBOL(_mcount) + mflr r12 + mtctr r12 + mtlr r0 + bctr + +#else /* CONFIG_DYNAMIC_FTRACE */ +_GLOBAL_TOC(_mcount) +EXPORT_SYMBOL(_mcount) + /* Taken from output of objdump from lib64/glibc */ + mflr r3 + ld r11, 0(r1) + stdu r1, -112(r1) + std r3, 128(r1) + ld r4, 16(r11) + + subi r3, r3, MCOUNT_INSN_SIZE + LOAD_REG_ADDR(r5,ftrace_trace_function) + ld r5,0(r5) + ld r5,0(r5) + mtctr r5 + bctrl + nop + +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + b ftrace_graph_caller +#endif + ld r0, 128(r1) + mtlr r0 + addi r1, r1, 112 +_GLOBAL(ftrace_stub) + blr +#endif /* CONFIG_DYNAMIC_FTRACE */ + +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +_GLOBAL(return_to_handler) + /* need to save return values */ + std r4, -32(r1) + std r3, -24(r1) + /* save TOC */ + std r2, -16(r1) + std r31, -8(r1) + mr r31, r1 + stdu r1, -112(r1) + + /* + * We might be called from a module. + * Switch to our TOC to run inside the core kernel. + */ + ld r2, PACATOC(r13) + + bl ftrace_return_to_handler + nop + + /* return value has real return address */ + mtlr r3 + + ld r1, 0(r1) + ld r4, -32(r1) + ld r3, -24(r1) + ld r2, -16(r1) + ld r31, -8(r1) + + /* Jump back to real return address */ + blr +#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ diff --git a/arch/powerpc/kernel/trace/ftrace_64_mprofile.S b/arch/powerpc/kernel/trace/ftrace_64_mprofile.S new file mode 100644 index 000000000000..7c933a99f5d5 --- /dev/null +++ b/arch/powerpc/kernel/trace/ftrace_64_mprofile.S @@ -0,0 +1,272 @@ +/* + * Split from ftrace_64.S + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <linux/magic.h> +#include <asm/ppc_asm.h> +#include <asm/asm-offsets.h> +#include <asm/ftrace.h> +#include <asm/ppc-opcode.h> +#include <asm/export.h> +#include <asm/thread_info.h> +#include <asm/bug.h> +#include <asm/ptrace.h> + +#ifdef CONFIG_DYNAMIC_FTRACE +/* + * + * ftrace_caller() is the function that replaces _mcount() when ftrace is + * active. + * + * We arrive here after a function A calls function B, and we are the trace + * function for B. When we enter r1 points to A's stack frame, B has not yet + * had a chance to allocate one yet. + * + * Additionally r2 may point either to the TOC for A, or B, depending on + * whether B did a TOC setup sequence before calling us. + * + * On entry the LR points back to the _mcount() call site, and r0 holds the + * saved LR as it was on entry to B, ie. the original return address at the + * call site in A. + * + * Our job is to save the register state into a struct pt_regs (on the stack) + * and then arrange for the ftrace function to be called. + */ +_GLOBAL(ftrace_caller) + /* Save the original return address in A's stack frame */ + std r0,LRSAVE(r1) + + /* Create our stack frame + pt_regs */ + stdu r1,-SWITCH_FRAME_SIZE(r1) + + /* Save all gprs to pt_regs */ + SAVE_8GPRS(0,r1) + SAVE_8GPRS(8,r1) + SAVE_8GPRS(16,r1) + SAVE_8GPRS(24,r1) + + /* Load special regs for save below */ + mfmsr r8 + mfctr r9 + mfxer r10 + mfcr r11 + + /* Get the _mcount() call site out of LR */ + mflr r7 + /* Save it as pt_regs->nip */ + std r7, _NIP(r1) + /* Save the read LR in pt_regs->link */ + std r0, _LINK(r1) + + /* Save callee's TOC in the ABI compliant location */ + std r2, 24(r1) + ld r2,PACATOC(r13) /* get kernel TOC in r2 */ + + addis r3,r2,function_trace_op@toc@ha + addi r3,r3,function_trace_op@toc@l + ld r5,0(r3) + +#ifdef CONFIG_LIVEPATCH + mr r14,r7 /* remember old NIP */ +#endif + /* Calculate ip from nip-4 into r3 for call below */ + subi r3, r7, MCOUNT_INSN_SIZE + + /* Put the original return address in r4 as parent_ip */ + mr r4, r0 + + /* Save special regs */ + std r8, _MSR(r1) + std r9, _CTR(r1) + std r10, _XER(r1) + std r11, _CCR(r1) + + /* Load &pt_regs in r6 for call below */ + addi r6, r1 ,STACK_FRAME_OVERHEAD + + /* ftrace_call(r3, r4, r5, r6) */ +.globl ftrace_call +ftrace_call: + bl ftrace_stub + nop + + /* Load ctr with the possibly modified NIP */ + ld r3, _NIP(r1) + mtctr r3 +#ifdef CONFIG_LIVEPATCH + cmpd r14,r3 /* has NIP been altered? */ +#endif + + /* Restore gprs */ + REST_8GPRS(0,r1) + REST_8GPRS(8,r1) + REST_8GPRS(16,r1) + REST_8GPRS(24,r1) + + /* Restore possibly modified LR */ + ld r0, _LINK(r1) + mtlr r0 + + /* Restore callee's TOC */ + ld r2, 24(r1) + + /* Pop our stack frame */ + addi r1, r1, SWITCH_FRAME_SIZE + +#ifdef CONFIG_LIVEPATCH + /* Based on the cmpd above, if the NIP was altered handle livepatch */ + bne- livepatch_handler +#endif + +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +.globl ftrace_graph_call +ftrace_graph_call: + b ftrace_graph_stub +_GLOBAL(ftrace_graph_stub) +#endif + + bctr /* jump after _mcount site */ + +_GLOBAL(ftrace_stub) + blr + +#ifdef CONFIG_LIVEPATCH + /* + * This function runs in the mcount context, between two functions. As + * such it can only clobber registers which are volatile and used in + * function linkage. + * + * We get here when a function A, calls another function B, but B has + * been live patched with a new function C. + * + * On entry: + * - we have no stack frame and can not allocate one + * - LR points back to the original caller (in A) + * - CTR holds the new NIP in C + * - r0 & r12 are free + * + * r0 can't be used as the base register for a DS-form load or store, so + * we temporarily shuffle r1 (stack pointer) into r0 and then put it back. + */ +livepatch_handler: + CURRENT_THREAD_INFO(r12, r1) + + /* Save stack pointer into r0 */ + mr r0, r1 + + /* Allocate 3 x 8 bytes */ + ld r1, TI_livepatch_sp(r12) + addi r1, r1, 24 + std r1, TI_livepatch_sp(r12) + + /* Save toc & real LR on livepatch stack */ + std r2, -24(r1) + mflr r12 + std r12, -16(r1) + + /* Store stack end marker */ + lis r12, STACK_END_MAGIC@h + ori r12, r12, STACK_END_MAGIC@l + std r12, -8(r1) + + /* Restore real stack pointer */ + mr r1, r0 + + /* Put ctr in r12 for global entry and branch there */ + mfctr r12 + bctrl + + /* + * Now we are returning from the patched function to the original + * caller A. We are free to use r0 and r12, and we can use r2 until we + * restore it. + */ + + CURRENT_THREAD_INFO(r12, r1) + + /* Save stack pointer into r0 */ + mr r0, r1 + + ld r1, TI_livepatch_sp(r12) + + /* Check stack marker hasn't been trashed */ + lis r2, STACK_END_MAGIC@h + ori r2, r2, STACK_END_MAGIC@l + ld r12, -8(r1) +1: tdne r12, r2 + EMIT_BUG_ENTRY 1b, __FILE__, __LINE__ - 1, 0 + + /* Restore LR & toc from livepatch stack */ + ld r12, -16(r1) + mtlr r12 + ld r2, -24(r1) + + /* Pop livepatch stack frame */ + CURRENT_THREAD_INFO(r12, r0) + subi r1, r1, 24 + std r1, TI_livepatch_sp(r12) + + /* Restore real stack pointer */ + mr r1, r0 + + /* Return to original caller of live patched function */ + blr +#endif /* CONFIG_LIVEPATCH */ + +#endif /* CONFIG_DYNAMIC_FTRACE */ + +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +_GLOBAL(ftrace_graph_caller) + stdu r1, -112(r1) + /* with -mprofile-kernel, parameter regs are still alive at _mcount */ + std r10, 104(r1) + std r9, 96(r1) + std r8, 88(r1) + std r7, 80(r1) + std r6, 72(r1) + std r5, 64(r1) + std r4, 56(r1) + std r3, 48(r1) + + /* Save callee's TOC in the ABI compliant location */ + std r2, 24(r1) + ld r2, PACATOC(r13) /* get kernel TOC in r2 */ + + mfctr r4 /* ftrace_caller has moved local addr here */ + std r4, 40(r1) + mflr r3 /* ftrace_caller has restored LR from stack */ + subi r4, r4, MCOUNT_INSN_SIZE + + bl prepare_ftrace_return + nop + + /* + * prepare_ftrace_return gives us the address we divert to. + * Change the LR to this. + */ + mtlr r3 + + ld r0, 40(r1) + mtctr r0 + ld r10, 104(r1) + ld r9, 96(r1) + ld r8, 88(r1) + ld r7, 80(r1) + ld r6, 72(r1) + ld r5, 64(r1) + ld r4, 56(r1) + ld r3, 48(r1) + + /* Restore callee's TOC */ + ld r2, 24(r1) + + addi r1, r1, 112 + mflr r0 + std r0, LRSAVE(r1) + bctr +#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ diff --git a/arch/powerpc/kernel/trace/ftrace_64_pg.S b/arch/powerpc/kernel/trace/ftrace_64_pg.S new file mode 100644 index 000000000000..f095358da96e --- /dev/null +++ b/arch/powerpc/kernel/trace/ftrace_64_pg.S @@ -0,0 +1,68 @@ +/* + * Split from ftrace_64.S + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <linux/magic.h> +#include <asm/ppc_asm.h> +#include <asm/asm-offsets.h> +#include <asm/ftrace.h> +#include <asm/ppc-opcode.h> +#include <asm/export.h> + +#ifdef CONFIG_DYNAMIC_FTRACE +_GLOBAL_TOC(ftrace_caller) + /* Taken from output of objdump from lib64/glibc */ + mflr r3 + ld r11, 0(r1) + stdu r1, -112(r1) + std r3, 128(r1) + ld r4, 16(r11) + subi r3, r3, MCOUNT_INSN_SIZE +.globl ftrace_call +ftrace_call: + bl ftrace_stub + nop +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +.globl ftrace_graph_call +ftrace_graph_call: + b ftrace_graph_stub +_GLOBAL(ftrace_graph_stub) +#endif + ld r0, 128(r1) + mtlr r0 + addi r1, r1, 112 + +_GLOBAL(ftrace_stub) + blr +#endif /* CONFIG_DYNAMIC_FTRACE */ + +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +_GLOBAL(ftrace_graph_caller) + /* load r4 with local address */ + ld r4, 128(r1) + subi r4, r4, MCOUNT_INSN_SIZE + + /* Grab the LR out of the caller stack frame */ + ld r11, 112(r1) + ld r3, 16(r11) + + bl prepare_ftrace_return + nop + + /* + * prepare_ftrace_return gives us the address we divert to. + * Change the LR in the callers stack frame to this. + */ + ld r11, 112(r1) + std r3, 16(r11) + + ld r0, 128(r1) + mtlr r0 + addi r1, r1, 112 + blr +#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ diff --git a/arch/powerpc/kernel/trace_clock.c b/arch/powerpc/kernel/trace/trace_clock.c index 49170690946d..49170690946d 100644 --- a/arch/powerpc/kernel/trace_clock.c +++ b/arch/powerpc/kernel/trace/trace_clock.c diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c index ff365f9de27a..76f6045b021b 100644 --- a/arch/powerpc/kernel/traps.c +++ b/arch/powerpc/kernel/traps.c @@ -35,13 +35,13 @@ #include <linux/backlight.h> #include <linux/bug.h> #include <linux/kdebug.h> -#include <linux/debugfs.h> #include <linux/ratelimit.h> #include <linux/context_tracking.h> #include <asm/emulated_ops.h> #include <asm/pgtable.h> #include <linux/uaccess.h> +#include <asm/debugfs.h> #include <asm/io.h> #include <asm/machdep.h> #include <asm/rtas.h> @@ -1440,6 +1440,8 @@ void facility_unavailable_exception(struct pt_regs *regs) [FSCR_TM_LG] = "TM", [FSCR_EBB_LG] = "EBB", [FSCR_TAR_LG] = "TAR", + [FSCR_MSGP_LG] = "MSGP", + [FSCR_SCV_LG] = "SCV", }; char *facility = "unknown"; u64 value; diff --git a/arch/powerpc/kernel/vmlinux.lds.S b/arch/powerpc/kernel/vmlinux.lds.S index 7394b770ae1f..f6eee507e0b9 100644 --- a/arch/powerpc/kernel/vmlinux.lds.S +++ b/arch/powerpc/kernel/vmlinux.lds.S @@ -77,6 +77,8 @@ SECTIONS #endif } :kernel + __head_end = .; + /* * If the build dies here, it's likely code in head_64.S is referencing * labels it can't reach, and the linker inserting stubs without the diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c index b6b5c185bd92..aedacefd961d 100644 --- a/arch/powerpc/kvm/book3s.c +++ b/arch/powerpc/kvm/book3s.c @@ -20,6 +20,10 @@ #include <linux/slab.h> #include <linux/module.h> #include <linux/miscdevice.h> +#include <linux/gfp.h> +#include <linux/sched.h> +#include <linux/vmalloc.h> +#include <linux/highmem.h> #include <asm/reg.h> #include <asm/cputable.h> @@ -31,10 +35,6 @@ #include <asm/kvm_book3s.h> #include <asm/mmu_context.h> #include <asm/page.h> -#include <linux/gfp.h> -#include <linux/sched.h> -#include <linux/vmalloc.h> -#include <linux/highmem.h> #include "book3s.h" #include "trace.h" diff --git a/arch/powerpc/kvm/book3s_64_mmu_host.c b/arch/powerpc/kvm/book3s_64_mmu_host.c index a587e8f4fd26..74b0153780e3 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_host.c +++ b/arch/powerpc/kvm/book3s_64_mmu_host.c @@ -229,6 +229,7 @@ void kvmppc_mmu_unmap_page(struct kvm_vcpu *vcpu, struct kvmppc_pte *pte) static struct kvmppc_sid_map *create_sid_map(struct kvm_vcpu *vcpu, u64 gvsid) { + unsigned long vsid_bits = VSID_BITS_65_256M; struct kvmppc_sid_map *map; struct kvmppc_vcpu_book3s *vcpu_book3s = to_book3s(vcpu); u16 sid_map_mask; @@ -257,7 +258,12 @@ static struct kvmppc_sid_map *create_sid_map(struct kvm_vcpu *vcpu, u64 gvsid) kvmppc_mmu_pte_flush(vcpu, 0, 0); kvmppc_mmu_flush_segments(vcpu); } - map->host_vsid = vsid_scramble(vcpu_book3s->proto_vsid_next++, 256M); + + if (mmu_has_feature(MMU_FTR_68_BIT_VA)) + vsid_bits = VSID_BITS_256M; + + map->host_vsid = vsid_scramble(vcpu_book3s->proto_vsid_next++, + VSID_MULTIPLIER_256M, vsid_bits); map->guest_vsid = gvsid; map->valid = true; @@ -390,7 +396,7 @@ int kvmppc_mmu_init(struct kvm_vcpu *vcpu) struct kvmppc_vcpu_book3s *vcpu3s = to_book3s(vcpu); int err; - err = __init_new_context(); + err = hash__alloc_context_id(); if (err < 0) return -1; vcpu3s->context_id[0] = err; diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 1ec86d9e2a82..fadb75abfe37 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -35,6 +35,15 @@ #include <linux/srcu.h> #include <linux/miscdevice.h> #include <linux/debugfs.h> +#include <linux/gfp.h> +#include <linux/vmalloc.h> +#include <linux/highmem.h> +#include <linux/hugetlb.h> +#include <linux/kvm_irqfd.h> +#include <linux/irqbypass.h> +#include <linux/module.h> +#include <linux/compiler.h> +#include <linux/of.h> #include <asm/reg.h> #include <asm/cputable.h> @@ -58,15 +67,6 @@ #include <asm/mmu.h> #include <asm/opal.h> #include <asm/xics.h> -#include <linux/gfp.h> -#include <linux/vmalloc.h> -#include <linux/highmem.h> -#include <linux/hugetlb.h> -#include <linux/kvm_irqfd.h> -#include <linux/irqbypass.h> -#include <linux/module.h> -#include <linux/compiler.h> -#include <linux/of.h> #include "book3s.h" diff --git a/arch/powerpc/kvm/book3s_hv_builtin.c b/arch/powerpc/kvm/book3s_hv_builtin.c index 4d6c64b3041c..a752e29977e0 100644 --- a/arch/powerpc/kvm/book3s_hv_builtin.c +++ b/arch/powerpc/kvm/book3s_hv_builtin.c @@ -23,6 +23,7 @@ #include <asm/kvm_book3s.h> #include <asm/archrandom.h> #include <asm/xics.h> +#include <asm/xive.h> #include <asm/dbell.h> #include <asm/cputhreads.h> #include <asm/io.h> @@ -193,12 +194,6 @@ long kvmppc_h_random(struct kvm_vcpu *vcpu) return H_HARDWARE; } -static inline void rm_writeb(unsigned long paddr, u8 val) -{ - __asm__ __volatile__("stbcix %0,0,%1" - : : "r" (val), "r" (paddr) : "memory"); -} - /* * Send an interrupt or message to another CPU. * The caller needs to include any barrier needed to order writes @@ -206,7 +201,7 @@ static inline void rm_writeb(unsigned long paddr, u8 val) */ void kvmhv_rm_send_ipi(int cpu) { - unsigned long xics_phys; + void __iomem *xics_phys; unsigned long msg = PPC_DBELL_TYPE(PPC_DBELL_SERVER); /* On POWER9 we can use msgsnd for any destination cpu. */ @@ -224,10 +219,14 @@ void kvmhv_rm_send_ipi(int cpu) return; } + /* We should never reach this */ + if (WARN_ON_ONCE(xive_enabled())) + return; + /* Else poke the target with an IPI */ xics_phys = paca[cpu].kvm_hstate.xics_phys; if (xics_phys) - rm_writeb(xics_phys + XICS_MFRR, IPI_PRIORITY); + __raw_rm_writeb(IPI_PRIORITY, xics_phys + XICS_MFRR); else opal_int_set_mfrr(get_hard_smp_processor_id(cpu), IPI_PRIORITY); } @@ -386,6 +385,9 @@ long kvmppc_read_intr(void) long rc; bool again; + if (xive_enabled()) + return 1; + do { again = false; rc = kvmppc_read_one_intr(&again); @@ -397,7 +399,7 @@ long kvmppc_read_intr(void) static long kvmppc_read_one_intr(bool *again) { - unsigned long xics_phys; + void __iomem *xics_phys; u32 h_xirr; __be32 xirr; u32 xisr; @@ -415,7 +417,7 @@ static long kvmppc_read_one_intr(bool *again) if (!xics_phys) rc = opal_int_get_xirr(&xirr, false); else - xirr = _lwzcix(xics_phys + XICS_XIRR); + xirr = __raw_rm_readl(xics_phys + XICS_XIRR); if (rc < 0) return 1; @@ -445,8 +447,8 @@ static long kvmppc_read_one_intr(bool *again) if (xisr == XICS_IPI) { rc = 0; if (xics_phys) { - _stbcix(xics_phys + XICS_MFRR, 0xff); - _stwcix(xics_phys + XICS_XIRR, xirr); + __raw_rm_writeb(0xff, xics_phys + XICS_MFRR); + __raw_rm_writel(xirr, xics_phys + XICS_XIRR); } else { opal_int_set_mfrr(hard_smp_processor_id(), 0xff); rc = opal_int_eoi(h_xirr); @@ -471,7 +473,8 @@ static long kvmppc_read_one_intr(bool *again) * we need to resend that IPI, bummer */ if (xics_phys) - _stbcix(xics_phys + XICS_MFRR, IPI_PRIORITY); + __raw_rm_writeb(IPI_PRIORITY, + xics_phys + XICS_MFRR); else opal_int_set_mfrr(hard_smp_processor_id(), IPI_PRIORITY); diff --git a/arch/powerpc/kvm/book3s_hv_rm_xics.c b/arch/powerpc/kvm/book3s_hv_rm_xics.c index e78542d99cd6..ffde4507ddfd 100644 --- a/arch/powerpc/kvm/book3s_hv_rm_xics.c +++ b/arch/powerpc/kvm/book3s_hv_rm_xics.c @@ -16,7 +16,6 @@ #include <asm/kvm_ppc.h> #include <asm/hvcall.h> #include <asm/xics.h> -#include <asm/debug.h> #include <asm/synch.h> #include <asm/cputhreads.h> #include <asm/pgtable.h> @@ -766,7 +765,7 @@ unsigned long eoi_rc; static void icp_eoi(struct irq_chip *c, u32 hwirq, __be32 xirr, bool *again) { - unsigned long xics_phys; + void __iomem *xics_phys; int64_t rc; rc = pnv_opal_pci_msi_eoi(c, hwirq); @@ -779,7 +778,7 @@ static void icp_eoi(struct irq_chip *c, u32 hwirq, __be32 xirr, bool *again) /* EOI it */ xics_phys = local_paca->kvm_hstate.xics_phys; if (xics_phys) { - _stwcix(xics_phys + XICS_XIRR, xirr); + __raw_rm_writel(xirr, xics_phys + XICS_XIRR); } else { rc = opal_int_eoi(be32_to_cpu(xirr)); *again = rc > 0; diff --git a/arch/powerpc/kvm/book3s_xics.c b/arch/powerpc/kvm/book3s_xics.c index e48803e2918d..459b72cb617a 100644 --- a/arch/powerpc/kvm/book3s_xics.c +++ b/arch/powerpc/kvm/book3s_xics.c @@ -19,10 +19,9 @@ #include <asm/kvm_ppc.h> #include <asm/hvcall.h> #include <asm/xics.h> -#include <asm/debug.h> +#include <asm/debugfs.h> #include <asm/time.h> -#include <linux/debugfs.h> #include <linux/seq_file.h> #include "book3s_xics.h" @@ -1084,7 +1083,7 @@ static struct kvmppc_ics *kvmppc_xics_create_ics(struct kvm *kvm, return xics->ics[icsid]; } -int kvmppc_xics_create_icp(struct kvm_vcpu *vcpu, unsigned long server_num) +static int kvmppc_xics_create_icp(struct kvm_vcpu *vcpu, unsigned long server_num) { struct kvmppc_icp *icp; diff --git a/arch/powerpc/lib/code-patching.c b/arch/powerpc/lib/code-patching.c index 0d3002b7e2b4..500b0f6a0b64 100644 --- a/arch/powerpc/lib/code-patching.c +++ b/arch/powerpc/lib/code-patching.c @@ -8,6 +8,7 @@ */ #include <linux/kernel.h> +#include <linux/kprobes.h> #include <linux/vmalloc.h> #include <linux/init.h> #include <linux/mm.h> @@ -59,7 +60,7 @@ bool is_offset_in_branch_range(long offset) * Helper to check if a given instruction is a conditional branch * Derived from the conditional checks in analyse_instr() */ -bool __kprobes is_conditional_branch(unsigned int instr) +bool is_conditional_branch(unsigned int instr) { unsigned int opcode = instr >> 26; @@ -75,6 +76,7 @@ bool __kprobes is_conditional_branch(unsigned int instr) } return false; } +NOKPROBE_SYMBOL(is_conditional_branch); unsigned int create_branch(const unsigned int *addr, unsigned long target, int flags) diff --git a/arch/powerpc/lib/sstep.c b/arch/powerpc/lib/sstep.c index 9c542ec70c5b..33117f8a0882 100644 --- a/arch/powerpc/lib/sstep.c +++ b/arch/powerpc/lib/sstep.c @@ -49,7 +49,8 @@ extern int do_stxvd2x(int rn, unsigned long ea); /* * Emulate the truncation of 64 bit values in 32-bit mode. */ -static unsigned long truncate_if_32bit(unsigned long msr, unsigned long val) +static nokprobe_inline unsigned long truncate_if_32bit(unsigned long msr, + unsigned long val) { #ifdef __powerpc64__ if ((msr & MSR_64BIT) == 0) @@ -61,7 +62,7 @@ static unsigned long truncate_if_32bit(unsigned long msr, unsigned long val) /* * Determine whether a conditional branch instruction would branch. */ -static int __kprobes branch_taken(unsigned int instr, struct pt_regs *regs) +static nokprobe_inline int branch_taken(unsigned int instr, struct pt_regs *regs) { unsigned int bo = (instr >> 21) & 0x1f; unsigned int bi; @@ -81,8 +82,7 @@ static int __kprobes branch_taken(unsigned int instr, struct pt_regs *regs) return 1; } - -static long __kprobes address_ok(struct pt_regs *regs, unsigned long ea, int nb) +static nokprobe_inline long address_ok(struct pt_regs *regs, unsigned long ea, int nb) { if (!user_mode(regs)) return 1; @@ -92,7 +92,7 @@ static long __kprobes address_ok(struct pt_regs *regs, unsigned long ea, int nb) /* * Calculate effective address for a D-form instruction */ -static unsigned long __kprobes dform_ea(unsigned int instr, struct pt_regs *regs) +static nokprobe_inline unsigned long dform_ea(unsigned int instr, struct pt_regs *regs) { int ra; unsigned long ea; @@ -109,7 +109,7 @@ static unsigned long __kprobes dform_ea(unsigned int instr, struct pt_regs *regs /* * Calculate effective address for a DS-form instruction */ -static unsigned long __kprobes dsform_ea(unsigned int instr, struct pt_regs *regs) +static nokprobe_inline unsigned long dsform_ea(unsigned int instr, struct pt_regs *regs) { int ra; unsigned long ea; @@ -126,8 +126,8 @@ static unsigned long __kprobes dsform_ea(unsigned int instr, struct pt_regs *reg /* * Calculate effective address for an X-form instruction */ -static unsigned long __kprobes xform_ea(unsigned int instr, - struct pt_regs *regs) +static nokprobe_inline unsigned long xform_ea(unsigned int instr, + struct pt_regs *regs) { int ra, rb; unsigned long ea; @@ -145,33 +145,33 @@ static unsigned long __kprobes xform_ea(unsigned int instr, * Return the largest power of 2, not greater than sizeof(unsigned long), * such that x is a multiple of it. */ -static inline unsigned long max_align(unsigned long x) +static nokprobe_inline unsigned long max_align(unsigned long x) { x |= sizeof(unsigned long); return x & -x; /* isolates rightmost bit */ } -static inline unsigned long byterev_2(unsigned long x) +static nokprobe_inline unsigned long byterev_2(unsigned long x) { return ((x >> 8) & 0xff) | ((x & 0xff) << 8); } -static inline unsigned long byterev_4(unsigned long x) +static nokprobe_inline unsigned long byterev_4(unsigned long x) { return ((x >> 24) & 0xff) | ((x >> 8) & 0xff00) | ((x & 0xff00) << 8) | ((x & 0xff) << 24); } #ifdef __powerpc64__ -static inline unsigned long byterev_8(unsigned long x) +static nokprobe_inline unsigned long byterev_8(unsigned long x) { return (byterev_4(x) << 32) | byterev_4(x >> 32); } #endif -static int __kprobes read_mem_aligned(unsigned long *dest, unsigned long ea, - int nb) +static nokprobe_inline int read_mem_aligned(unsigned long *dest, + unsigned long ea, int nb) { int err = 0; unsigned long x = 0; @@ -197,8 +197,8 @@ static int __kprobes read_mem_aligned(unsigned long *dest, unsigned long ea, return err; } -static int __kprobes read_mem_unaligned(unsigned long *dest, unsigned long ea, - int nb, struct pt_regs *regs) +static nokprobe_inline int read_mem_unaligned(unsigned long *dest, + unsigned long ea, int nb, struct pt_regs *regs) { int err; unsigned long x, b, c; @@ -248,7 +248,7 @@ static int __kprobes read_mem_unaligned(unsigned long *dest, unsigned long ea, * Read memory at address ea for nb bytes, return 0 for success * or -EFAULT if an error occurred. */ -static int __kprobes read_mem(unsigned long *dest, unsigned long ea, int nb, +static int read_mem(unsigned long *dest, unsigned long ea, int nb, struct pt_regs *regs) { if (!address_ok(regs, ea, nb)) @@ -257,9 +257,10 @@ static int __kprobes read_mem(unsigned long *dest, unsigned long ea, int nb, return read_mem_aligned(dest, ea, nb); return read_mem_unaligned(dest, ea, nb, regs); } +NOKPROBE_SYMBOL(read_mem); -static int __kprobes write_mem_aligned(unsigned long val, unsigned long ea, - int nb) +static nokprobe_inline int write_mem_aligned(unsigned long val, + unsigned long ea, int nb) { int err = 0; @@ -282,8 +283,8 @@ static int __kprobes write_mem_aligned(unsigned long val, unsigned long ea, return err; } -static int __kprobes write_mem_unaligned(unsigned long val, unsigned long ea, - int nb, struct pt_regs *regs) +static nokprobe_inline int write_mem_unaligned(unsigned long val, + unsigned long ea, int nb, struct pt_regs *regs) { int err; unsigned long c; @@ -325,7 +326,7 @@ static int __kprobes write_mem_unaligned(unsigned long val, unsigned long ea, * Write memory at address ea for nb bytes, return 0 for success * or -EFAULT if an error occurred. */ -static int __kprobes write_mem(unsigned long val, unsigned long ea, int nb, +static int write_mem(unsigned long val, unsigned long ea, int nb, struct pt_regs *regs) { if (!address_ok(regs, ea, nb)) @@ -334,13 +335,14 @@ static int __kprobes write_mem(unsigned long val, unsigned long ea, int nb, return write_mem_aligned(val, ea, nb); return write_mem_unaligned(val, ea, nb, regs); } +NOKPROBE_SYMBOL(write_mem); #ifdef CONFIG_PPC_FPU /* * Check the address and alignment, and call func to do the actual * load or store. */ -static int __kprobes do_fp_load(int rn, int (*func)(int, unsigned long), +static int do_fp_load(int rn, int (*func)(int, unsigned long), unsigned long ea, int nb, struct pt_regs *regs) { @@ -380,8 +382,9 @@ static int __kprobes do_fp_load(int rn, int (*func)(int, unsigned long), return err; return (*func)(rn, ptr); } +NOKPROBE_SYMBOL(do_fp_load); -static int __kprobes do_fp_store(int rn, int (*func)(int, unsigned long), +static int do_fp_store(int rn, int (*func)(int, unsigned long), unsigned long ea, int nb, struct pt_regs *regs) { @@ -425,11 +428,12 @@ static int __kprobes do_fp_store(int rn, int (*func)(int, unsigned long), } return err; } +NOKPROBE_SYMBOL(do_fp_store); #endif #ifdef CONFIG_ALTIVEC /* For Altivec/VMX, no need to worry about alignment */ -static int __kprobes do_vec_load(int rn, int (*func)(int, unsigned long), +static nokprobe_inline int do_vec_load(int rn, int (*func)(int, unsigned long), unsigned long ea, struct pt_regs *regs) { if (!address_ok(regs, ea & ~0xfUL, 16)) @@ -437,7 +441,7 @@ static int __kprobes do_vec_load(int rn, int (*func)(int, unsigned long), return (*func)(rn, ea); } -static int __kprobes do_vec_store(int rn, int (*func)(int, unsigned long), +static nokprobe_inline int do_vec_store(int rn, int (*func)(int, unsigned long), unsigned long ea, struct pt_regs *regs) { if (!address_ok(regs, ea & ~0xfUL, 16)) @@ -447,7 +451,7 @@ static int __kprobes do_vec_store(int rn, int (*func)(int, unsigned long), #endif /* CONFIG_ALTIVEC */ #ifdef CONFIG_VSX -static int __kprobes do_vsx_load(int rn, int (*func)(int, unsigned long), +static nokprobe_inline int do_vsx_load(int rn, int (*func)(int, unsigned long), unsigned long ea, struct pt_regs *regs) { int err; @@ -465,7 +469,7 @@ static int __kprobes do_vsx_load(int rn, int (*func)(int, unsigned long), return err; } -static int __kprobes do_vsx_store(int rn, int (*func)(int, unsigned long), +static nokprobe_inline int do_vsx_store(int rn, int (*func)(int, unsigned long), unsigned long ea, struct pt_regs *regs) { int err; @@ -522,7 +526,7 @@ static int __kprobes do_vsx_store(int rn, int (*func)(int, unsigned long), : "=r" (err) \ : "r" (addr), "i" (-EFAULT), "0" (err)) -static void __kprobes set_cr0(struct pt_regs *regs, int rd) +static nokprobe_inline void set_cr0(struct pt_regs *regs, int rd) { long val = regs->gpr[rd]; @@ -539,7 +543,7 @@ static void __kprobes set_cr0(struct pt_regs *regs, int rd) regs->ccr |= 0x20000000; } -static void __kprobes add_with_carry(struct pt_regs *regs, int rd, +static nokprobe_inline void add_with_carry(struct pt_regs *regs, int rd, unsigned long val1, unsigned long val2, unsigned long carry_in) { @@ -560,7 +564,7 @@ static void __kprobes add_with_carry(struct pt_regs *regs, int rd, regs->xer &= ~XER_CA; } -static void __kprobes do_cmp_signed(struct pt_regs *regs, long v1, long v2, +static nokprobe_inline void do_cmp_signed(struct pt_regs *regs, long v1, long v2, int crfld) { unsigned int crval, shift; @@ -576,7 +580,7 @@ static void __kprobes do_cmp_signed(struct pt_regs *regs, long v1, long v2, regs->ccr = (regs->ccr & ~(0xf << shift)) | (crval << shift); } -static void __kprobes do_cmp_unsigned(struct pt_regs *regs, unsigned long v1, +static nokprobe_inline void do_cmp_unsigned(struct pt_regs *regs, unsigned long v1, unsigned long v2, int crfld) { unsigned int crval, shift; @@ -592,7 +596,7 @@ static void __kprobes do_cmp_unsigned(struct pt_regs *regs, unsigned long v1, regs->ccr = (regs->ccr & ~(0xf << shift)) | (crval << shift); } -static int __kprobes trap_compare(long v1, long v2) +static nokprobe_inline int trap_compare(long v1, long v2) { int ret = 0; @@ -631,7 +635,7 @@ static int __kprobes trap_compare(long v1, long v2) * Returns 1 if the instruction has been executed, or 0 if not. * Sets *op to indicate what the instruction does. */ -int __kprobes analyse_instr(struct instruction_op *op, struct pt_regs *regs, +int analyse_instr(struct instruction_op *op, struct pt_regs *regs, unsigned int instr) { unsigned int opcode, ra, rb, rd, spr, u; @@ -1692,6 +1696,7 @@ int __kprobes analyse_instr(struct instruction_op *op, struct pt_regs *regs, #endif } EXPORT_SYMBOL_GPL(analyse_instr); +NOKPROBE_SYMBOL(analyse_instr); /* * For PPC32 we always use stwu with r1 to change the stack pointer. @@ -1701,7 +1706,7 @@ EXPORT_SYMBOL_GPL(analyse_instr); * don't emulate the real store operation. We will do real store * operation safely in exception return code by checking this flag. */ -static __kprobes int handle_stack_update(unsigned long ea, struct pt_regs *regs) +static nokprobe_inline int handle_stack_update(unsigned long ea, struct pt_regs *regs) { #ifdef CONFIG_PPC32 /* @@ -1721,7 +1726,7 @@ static __kprobes int handle_stack_update(unsigned long ea, struct pt_regs *regs) return 0; } -static __kprobes void do_signext(unsigned long *valp, int size) +static nokprobe_inline void do_signext(unsigned long *valp, int size) { switch (size) { case 2: @@ -1733,7 +1738,7 @@ static __kprobes void do_signext(unsigned long *valp, int size) } } -static __kprobes void do_byterev(unsigned long *valp, int size) +static nokprobe_inline void do_byterev(unsigned long *valp, int size) { switch (size) { case 2: @@ -1757,7 +1762,7 @@ static __kprobes void do_byterev(unsigned long *valp, int size) * or -1 if the instruction is one that should not be stepped, * such as an rfid, or a mtmsrd that would clear MSR_RI. */ -int __kprobes emulate_step(struct pt_regs *regs, unsigned int instr) +int emulate_step(struct pt_regs *regs, unsigned int instr) { struct instruction_op op; int r, err, size; @@ -1988,3 +1993,4 @@ int __kprobes emulate_step(struct pt_regs *regs, unsigned int instr) regs->nip = truncate_if_32bit(regs->msr, regs->nip + 4); return 1; } +NOKPROBE_SYMBOL(emulate_step); diff --git a/arch/powerpc/mm/dump_hashpagetable.c b/arch/powerpc/mm/dump_hashpagetable.c index d979709a0239..c6b900f54c07 100644 --- a/arch/powerpc/mm/dump_hashpagetable.c +++ b/arch/powerpc/mm/dump_hashpagetable.c @@ -468,7 +468,7 @@ static void walk_linearmapping(struct pg_state *st) unsigned long psize = 1 << mmu_psize_defs[mmu_linear_psize].shift; for (addr = PAGE_OFFSET; addr < PAGE_OFFSET + - memblock_phys_mem_size(); addr += psize) + memblock_end_of_DRAM(); addr += psize) hpte_find(st, addr, mmu_linear_psize); } diff --git a/arch/powerpc/mm/dump_linuxpagetables.c b/arch/powerpc/mm/dump_linuxpagetables.c index 49abaf4dc8e3..d659345a98d6 100644 --- a/arch/powerpc/mm/dump_linuxpagetables.c +++ b/arch/powerpc/mm/dump_linuxpagetables.c @@ -26,6 +26,10 @@ #include <asm/page.h> #include <asm/pgalloc.h> +#ifdef CONFIG_PPC32 +#define KERN_VIRT_START 0 +#endif + /* * To visualise what is happening, * @@ -56,6 +60,8 @@ struct pg_state { struct seq_file *seq; const struct addr_marker *marker; unsigned long start_address; + unsigned long start_pa; + unsigned long last_pa; unsigned int level; u64 current_flags; }; @@ -69,6 +75,7 @@ static struct addr_marker address_markers[] = { { 0, "Start of kernel VM" }, { 0, "vmalloc() Area" }, { 0, "vmalloc() End" }, +#ifdef CONFIG_PPC64 { 0, "isa I/O start" }, { 0, "isa I/O end" }, { 0, "phb I/O start" }, @@ -76,6 +83,20 @@ static struct addr_marker address_markers[] = { { 0, "I/O remap start" }, { 0, "I/O remap end" }, { 0, "vmemmap start" }, +#else + { 0, "Early I/O remap start" }, + { 0, "Early I/O remap end" }, +#ifdef CONFIG_NOT_COHERENT_CACHE + { 0, "Consistent mem start" }, + { 0, "Consistent mem end" }, +#endif +#ifdef CONFIG_HIGHMEM + { 0, "Highmem PTEs start" }, + { 0, "Highmem PTEs end" }, +#endif + { 0, "Fixmap start" }, + { 0, "Fixmap end" }, +#endif { -1, NULL }, }; @@ -100,8 +121,13 @@ static const struct flag_info flag_array[] = { .set = "user", .clear = " ", }, { +#if _PAGE_RO == 0 .mask = _PAGE_RW, .val = _PAGE_RW, +#else + .mask = _PAGE_RO, + .val = 0, +#endif .set = "rw", .clear = "ro", }, { @@ -154,11 +180,24 @@ static const struct flag_info flag_array[] = { .clear = " ", }, { #endif +#ifndef CONFIG_PPC_BOOK3S_64 .mask = _PAGE_NO_CACHE, .val = _PAGE_NO_CACHE, .set = "no cache", .clear = " ", }, { +#else + .mask = _PAGE_NON_IDEMPOTENT, + .val = _PAGE_NON_IDEMPOTENT, + .set = "non-idempotent", + .clear = " ", + }, { + .mask = _PAGE_TOLERANT, + .val = _PAGE_TOLERANT, + .set = "tolerant", + .clear = " ", + }, { +#endif #ifdef CONFIG_PPC_BOOK3S_64 .mask = H_PAGE_BUSY, .val = H_PAGE_BUSY, @@ -188,6 +227,10 @@ static const struct flag_info flag_array[] = { .mask = _PAGE_SPECIAL, .val = _PAGE_SPECIAL, .set = "special", + }, { + .mask = _PAGE_SHARED, + .val = _PAGE_SHARED, + .set = "shared", } }; @@ -252,7 +295,14 @@ static void dump_addr(struct pg_state *st, unsigned long addr) const char *unit = units; unsigned long delta; - seq_printf(st->seq, "0x%016lx-0x%016lx ", st->start_address, addr-1); +#ifdef CONFIG_PPC64 + seq_printf(st->seq, "0x%016lx-0x%016lx ", st->start_address, addr-1); + seq_printf(st->seq, "0x%016lx ", st->start_pa); +#else + seq_printf(st->seq, "0x%08lx-0x%08lx ", st->start_address, addr - 1); + seq_printf(st->seq, "0x%08lx ", st->start_pa); +#endif + delta = (addr - st->start_address) >> 10; /* Work out what appropriate unit to use */ while (!(delta & 1023) && unit[1]) { @@ -267,11 +317,15 @@ static void note_page(struct pg_state *st, unsigned long addr, unsigned int level, u64 val) { u64 flag = val & pg_level[level].mask; + u64 pa = val & PTE_RPN_MASK; + /* At first no level is set */ if (!st->level) { st->level = level; st->current_flags = flag; st->start_address = addr; + st->start_pa = pa; + st->last_pa = pa; seq_printf(st->seq, "---[ %s ]---\n", st->marker->name); /* * Dump the section of virtual memory when: @@ -279,9 +333,11 @@ static void note_page(struct pg_state *st, unsigned long addr, * - we change levels in the tree. * - the address is in a different section of memory and is thus * used for a different purpose, regardless of the flags. + * - the pa of this page is not adjacent to the last inspected page */ } else if (flag != st->current_flags || level != st->level || - addr >= st->marker[1].start_address) { + addr >= st->marker[1].start_address || + pa != st->last_pa + PAGE_SIZE) { /* Check the PTE flags */ if (st->current_flags) { @@ -305,8 +361,12 @@ static void note_page(struct pg_state *st, unsigned long addr, seq_printf(st->seq, "---[ %s ]---\n", st->marker->name); } st->start_address = addr; + st->start_pa = pa; + st->last_pa = pa; st->current_flags = flag; st->level = level; + } else { + st->last_pa = pa; } } @@ -377,20 +437,38 @@ static void walk_pagetables(struct pg_state *st) static void populate_markers(void) { - address_markers[0].start_address = PAGE_OFFSET; - address_markers[1].start_address = VMALLOC_START; - address_markers[2].start_address = VMALLOC_END; - address_markers[3].start_address = ISA_IO_BASE; - address_markers[4].start_address = ISA_IO_END; - address_markers[5].start_address = PHB_IO_BASE; - address_markers[6].start_address = PHB_IO_END; - address_markers[7].start_address = IOREMAP_BASE; - address_markers[8].start_address = IOREMAP_END; + int i = 0; + + address_markers[i++].start_address = PAGE_OFFSET; + address_markers[i++].start_address = VMALLOC_START; + address_markers[i++].start_address = VMALLOC_END; +#ifdef CONFIG_PPC64 + address_markers[i++].start_address = ISA_IO_BASE; + address_markers[i++].start_address = ISA_IO_END; + address_markers[i++].start_address = PHB_IO_BASE; + address_markers[i++].start_address = PHB_IO_END; + address_markers[i++].start_address = IOREMAP_BASE; + address_markers[i++].start_address = IOREMAP_END; #ifdef CONFIG_PPC_STD_MMU_64 - address_markers[9].start_address = H_VMEMMAP_BASE; + address_markers[i++].start_address = H_VMEMMAP_BASE; #else - address_markers[9].start_address = VMEMMAP_BASE; + address_markers[i++].start_address = VMEMMAP_BASE; +#endif +#else /* !CONFIG_PPC64 */ + address_markers[i++].start_address = ioremap_bot; + address_markers[i++].start_address = IOREMAP_TOP; +#ifdef CONFIG_NOT_COHERENT_CACHE + address_markers[i++].start_address = IOREMAP_TOP; + address_markers[i++].start_address = IOREMAP_TOP + + CONFIG_CONSISTENT_SIZE; +#endif +#ifdef CONFIG_HIGHMEM + address_markers[i++].start_address = PKMAP_BASE; + address_markers[i++].start_address = PKMAP_ADDR(LAST_PKMAP); #endif + address_markers[i++].start_address = FIXADDR_START; + address_markers[i++].start_address = FIXADDR_TOP; +#endif /* CONFIG_PPC64 */ } static int ptdump_show(struct seq_file *m, void *v) @@ -435,7 +513,7 @@ static int ptdump_init(void) populate_markers(); build_pgtable_complete_mask(); - debugfs_file = debugfs_create_file("kernel_pagetables", 0400, NULL, + debugfs_file = debugfs_create_file("kernel_page_tables", 0400, NULL, NULL, &ptdump_fops); return debugfs_file ? 0 : -ENOMEM; } diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c index 51def8a515be..3a7d580fdc59 100644 --- a/arch/powerpc/mm/fault.c +++ b/arch/powerpc/mm/fault.c @@ -120,8 +120,6 @@ static int do_sigbus(struct pt_regs *regs, unsigned long address, siginfo_t info; unsigned int lsb = 0; - up_read(¤t->mm->mmap_sem); - if (!user_mode(regs)) return MM_FAULT_ERR(SIGBUS); @@ -154,13 +152,6 @@ static int mm_fault_error(struct pt_regs *regs, unsigned long addr, int fault) * continue the pagefault. */ if (fatal_signal_pending(current)) { - /* - * If we have retry set, the mmap semaphore will have - * alrady been released in __lock_page_or_retry(). Else - * we release it now. - */ - if (!(fault & VM_FAULT_RETRY)) - up_read(¤t->mm->mmap_sem); /* Coming from kernel, we need to deal with uaccess fixups */ if (user_mode(regs)) return MM_FAULT_RETURN; @@ -173,8 +164,6 @@ static int mm_fault_error(struct pt_regs *regs, unsigned long addr, int fault) /* Out of memory */ if (fault & VM_FAULT_OOM) { - up_read(¤t->mm->mmap_sem); - /* * We ran out of memory, or some other thing happened to us that * made us unable to handle the page fault gracefully. @@ -298,7 +287,7 @@ int do_page_fault(struct pt_regs *regs, unsigned long address, * can result in fault, which will cause a deadlock when called with * mmap_sem held */ - if (user_mode(regs)) + if (!is_exec && user_mode(regs)) store_update_sp = store_updates_sp(regs); if (user_mode(regs)) @@ -458,9 +447,30 @@ good_area: * the fault. */ fault = handle_mm_fault(vma, address, flags); + + /* + * Handle the retry right now, the mmap_sem has been released in that + * case. + */ + if (unlikely(fault & VM_FAULT_RETRY)) { + /* We retry only once */ + if (flags & FAULT_FLAG_ALLOW_RETRY) { + /* + * Clear FAULT_FLAG_ALLOW_RETRY to avoid any risk + * of starvation. + */ + flags &= ~FAULT_FLAG_ALLOW_RETRY; + flags |= FAULT_FLAG_TRIED; + if (!fatal_signal_pending(current)) + goto retry; + } + /* We will enter mm_fault_error() below */ + } else + up_read(¤t->mm->mmap_sem); + if (unlikely(fault & (VM_FAULT_RETRY|VM_FAULT_ERROR))) { if (fault & VM_FAULT_SIGSEGV) - goto bad_area; + goto bad_area_nosemaphore; rc = mm_fault_error(regs, address, fault); if (rc >= MM_FAULT_RETURN) goto bail; @@ -469,41 +479,29 @@ good_area: } /* - * Major/minor page fault accounting is only done on the - * initial attempt. If we go through a retry, it is extremely - * likely that the page will be found in page cache at that point. + * Major/minor page fault accounting. */ - if (flags & FAULT_FLAG_ALLOW_RETRY) { - if (fault & VM_FAULT_MAJOR) { - current->maj_flt++; - perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, - regs, address); + if (fault & VM_FAULT_MAJOR) { + current->maj_flt++; + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, + regs, address); #ifdef CONFIG_PPC_SMLPAR - if (firmware_has_feature(FW_FEATURE_CMO)) { - u32 page_ins; - - preempt_disable(); - page_ins = be32_to_cpu(get_lppaca()->page_ins); - page_ins += 1 << PAGE_FACTOR; - get_lppaca()->page_ins = cpu_to_be32(page_ins); - preempt_enable(); - } -#endif /* CONFIG_PPC_SMLPAR */ - } else { - current->min_flt++; - perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, - regs, address); - } - if (fault & VM_FAULT_RETRY) { - /* Clear FAULT_FLAG_ALLOW_RETRY to avoid any risk - * of starvation. */ - flags &= ~FAULT_FLAG_ALLOW_RETRY; - flags |= FAULT_FLAG_TRIED; - goto retry; + if (firmware_has_feature(FW_FEATURE_CMO)) { + u32 page_ins; + + preempt_disable(); + page_ins = be32_to_cpu(get_lppaca()->page_ins); + page_ins += 1 << PAGE_FACTOR; + get_lppaca()->page_ins = cpu_to_be32(page_ins); + preempt_enable(); } +#endif /* CONFIG_PPC_SMLPAR */ + } else { + current->min_flt++; + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, + regs, address); } - up_read(&mm->mmap_sem); goto bail; bad_area: diff --git a/arch/powerpc/mm/hash_low_32.S b/arch/powerpc/mm/hash_low_32.S index 09cc50c8dace..6f962e5cb5e1 100644 --- a/arch/powerpc/mm/hash_low_32.S +++ b/arch/powerpc/mm/hash_low_32.S @@ -31,10 +31,8 @@ #ifdef CONFIG_SMP .section .bss .align 2 - .globl mmu_hash_lock mmu_hash_lock: .space 4 -EXPORT_SYMBOL(mmu_hash_lock) #endif /* CONFIG_SMP */ /* diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c index c554768b1fa2..f2095ce9d4b0 100644 --- a/arch/powerpc/mm/hash_utils_64.c +++ b/arch/powerpc/mm/hash_utils_64.c @@ -35,9 +35,8 @@ #include <linux/memblock.h> #include <linux/context_tracking.h> #include <linux/libfdt.h> -#include <linux/debugfs.h> -#include <asm/debug.h> +#include <asm/debugfs.h> #include <asm/processor.h> #include <asm/pgtable.h> #include <asm/mmu.h> @@ -927,11 +926,6 @@ static void __init htab_initialize(void) } #endif /* CONFIG_DEBUG_PAGEALLOC */ - /* On U3 based machines, we need to reserve the DART area and - * _NOT_ map it to avoid cache paradoxes as it's remapped non - * cacheable later on - */ - /* create bolted the linear mapping in the hash table */ for_each_memblock(memory, reg) { base = (unsigned long)__va(reg->base); @@ -981,6 +975,19 @@ void __init hash__early_init_devtree(void) void __init hash__early_init_mmu(void) { + /* + * We have code in __hash_page_64K() and elsewhere, which assumes it can + * do the following: + * new_pte |= (slot << H_PAGE_F_GIX_SHIFT) & (H_PAGE_F_SECOND | H_PAGE_F_GIX); + * + * Where the slot number is between 0-15, and values of 8-15 indicate + * the secondary bucket. For that code to work H_PAGE_F_SECOND and + * H_PAGE_F_GIX must occupy four contiguous bits in the PTE, and + * H_PAGE_F_SECOND must be placed above H_PAGE_F_GIX. Assert that here + * with a BUILD_BUG_ON(). + */ + BUILD_BUG_ON(H_PAGE_F_SECOND != (1ul << (H_PAGE_F_GIX_SHIFT + 3))); + htab_init_page_sizes(); /* @@ -1120,7 +1127,7 @@ void demote_segment_4k(struct mm_struct *mm, unsigned long addr) copro_flush_all_slbs(mm); if ((get_paca_psize(addr) != MMU_PAGE_4K) && (current->mm == mm)) { - copy_mm_to_paca(&mm->context); + copy_mm_to_paca(mm); slb_flush_and_rebolt(); } } @@ -1192,7 +1199,7 @@ static void check_paca_psize(unsigned long ea, struct mm_struct *mm, { if (user_region) { if (psize != get_paca_psize(ea)) { - copy_mm_to_paca(&mm->context); + copy_mm_to_paca(mm); slb_flush_and_rebolt(); } } else if (get_paca()->vmalloc_sllp != @@ -1855,5 +1862,4 @@ static int __init hash64_debugfs(void) return 0; } machine_device_initcall(pseries, hash64_debugfs); - #endif /* CONFIG_DEBUG_FS */ diff --git a/arch/powerpc/mm/hugetlbpage-book3e.c b/arch/powerpc/mm/hugetlbpage-book3e.c index 83a8be791e06..bfe4e8526b2d 100644 --- a/arch/powerpc/mm/hugetlbpage-book3e.c +++ b/arch/powerpc/mm/hugetlbpage-book3e.c @@ -148,16 +148,9 @@ void book3e_hugetlb_preload(struct vm_area_struct *vma, unsigned long ea, mm = vma->vm_mm; -#ifdef CONFIG_PPC_MM_SLICES - psize = get_slice_psize(mm, ea); - tsize = mmu_get_tsize(psize); - shift = mmu_psize_defs[psize].shift; -#else psize = vma_mmu_pagesize(vma); shift = __ilog2(psize); tsize = shift - 10; -#endif - /* * We can't be interrupted while we're setting up the MAS * regusters or after we've confirmed that no tlb exists. diff --git a/arch/powerpc/mm/hugetlbpage-radix.c b/arch/powerpc/mm/hugetlbpage-radix.c index 35254a678456..6575b9aabef4 100644 --- a/arch/powerpc/mm/hugetlbpage-radix.c +++ b/arch/powerpc/mm/hugetlbpage-radix.c @@ -50,9 +50,12 @@ radix__hugetlb_get_unmapped_area(struct file *file, unsigned long addr, struct hstate *h = hstate_file(file); struct vm_unmapped_area_info info; + if (unlikely(addr > mm->context.addr_limit && addr < TASK_SIZE)) + mm->context.addr_limit = TASK_SIZE; + if (len & ~huge_page_mask(h)) return -EINVAL; - if (len > TASK_SIZE) + if (len > mm->task_size) return -ENOMEM; if (flags & MAP_FIXED) { @@ -64,7 +67,7 @@ radix__hugetlb_get_unmapped_area(struct file *file, unsigned long addr, if (addr) { addr = ALIGN(addr, huge_page_size(h)); vma = find_vma(mm, addr); - if (TASK_SIZE - len >= addr && + if (mm->task_size - len >= addr && (!vma || addr + len <= vma->vm_start)) return addr; } @@ -78,5 +81,9 @@ radix__hugetlb_get_unmapped_area(struct file *file, unsigned long addr, info.high_limit = current->mm->mmap_base; info.align_mask = PAGE_MASK & ~huge_page_mask(h); info.align_offset = 0; + + if (addr > DEFAULT_MAP_WINDOW) + info.high_limit += mm->context.addr_limit - DEFAULT_MAP_WINDOW; + return vm_unmapped_area(&info); } diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c index 8c3389cbcd12..a4f33de4008e 100644 --- a/arch/powerpc/mm/hugetlbpage.c +++ b/arch/powerpc/mm/hugetlbpage.c @@ -753,6 +753,24 @@ static int __init add_huge_page_size(unsigned long long size) if ((mmu_psize = shift_to_mmu_psize(shift)) < 0) return -EINVAL; +#ifdef CONFIG_PPC_BOOK3S_64 + /* + * We need to make sure that for different page sizes reported by + * firmware we only add hugetlb support for page sizes that can be + * supported by linux page table layout. + * For now we have + * Radix: 2M + * Hash: 16M and 16G + */ + if (radix_enabled()) { + if (mmu_psize != MMU_PAGE_2M) + return -EINVAL; + } else { + if (mmu_psize != MMU_PAGE_16M && mmu_psize != MMU_PAGE_16G) + return -EINVAL; + } +#endif + BUG_ON(mmu_psize_defs[mmu_psize].shift != shift); /* Return if huge page size has already been setup */ diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c index 9be992083d2a..8f6f2a173e47 100644 --- a/arch/powerpc/mm/init_64.c +++ b/arch/powerpc/mm/init_64.c @@ -71,10 +71,6 @@ #if H_PGTABLE_RANGE > USER_VSID_RANGE #warning Limited user VSID range means pagetable space is wasted #endif - -#if (TASK_SIZE_USER64 < H_PGTABLE_RANGE) && (TASK_SIZE_USER64 < USER_VSID_RANGE) -#warning TASK_SIZE is smaller than it needs to be. -#endif #endif /* CONFIG_PPC_STD_MMU_64 */ phys_addr_t memstart_addr = ~0; diff --git a/arch/powerpc/mm/mmap.c b/arch/powerpc/mm/mmap.c index a5d9ef59debe..9dbd2a733d6b 100644 --- a/arch/powerpc/mm/mmap.c +++ b/arch/powerpc/mm/mmap.c @@ -59,13 +59,14 @@ static inline int mmap_is_legacy(void) unsigned long arch_mmap_rnd(void) { - unsigned long rnd; + unsigned long shift, rnd; - /* 8MB for 32bit, 1GB for 64bit */ + shift = mmap_rnd_bits; +#ifdef CONFIG_COMPAT if (is_32bit_task()) - rnd = get_random_long() % (1<<(23-PAGE_SHIFT)); - else - rnd = get_random_long() % (1UL<<(30-PAGE_SHIFT)); + shift = mmap_rnd_compat_bits; +#endif + rnd = get_random_long() % (1ul << shift); return rnd << PAGE_SHIFT; } @@ -79,7 +80,7 @@ static inline unsigned long mmap_base(unsigned long rnd) else if (gap > MAX_GAP) gap = MAX_GAP; - return PAGE_ALIGN(TASK_SIZE - gap - rnd); + return PAGE_ALIGN(DEFAULT_MAP_WINDOW - gap - rnd); } #ifdef CONFIG_PPC_RADIX_MMU @@ -97,7 +98,11 @@ radix__arch_get_unmapped_area(struct file *filp, unsigned long addr, struct vm_area_struct *vma; struct vm_unmapped_area_info info; - if (len > TASK_SIZE - mmap_min_addr) + if (unlikely(addr > mm->context.addr_limit && + mm->context.addr_limit != TASK_SIZE)) + mm->context.addr_limit = TASK_SIZE; + + if (len > mm->task_size - mmap_min_addr) return -ENOMEM; if (flags & MAP_FIXED) @@ -106,7 +111,7 @@ radix__arch_get_unmapped_area(struct file *filp, unsigned long addr, if (addr) { addr = PAGE_ALIGN(addr); vma = find_vma(mm, addr); - if (TASK_SIZE - len >= addr && addr >= mmap_min_addr && + if (mm->task_size - len >= addr && addr >= mmap_min_addr && (!vma || addr + len <= vma->vm_start)) return addr; } @@ -114,8 +119,13 @@ radix__arch_get_unmapped_area(struct file *filp, unsigned long addr, info.flags = 0; info.length = len; info.low_limit = mm->mmap_base; - info.high_limit = TASK_SIZE; info.align_mask = 0; + + if (unlikely(addr > DEFAULT_MAP_WINDOW)) + info.high_limit = mm->context.addr_limit; + else + info.high_limit = DEFAULT_MAP_WINDOW; + return vm_unmapped_area(&info); } @@ -131,8 +141,12 @@ radix__arch_get_unmapped_area_topdown(struct file *filp, unsigned long addr = addr0; struct vm_unmapped_area_info info; + if (unlikely(addr > mm->context.addr_limit && + mm->context.addr_limit != TASK_SIZE)) + mm->context.addr_limit = TASK_SIZE; + /* requested length too big for entire address space */ - if (len > TASK_SIZE - mmap_min_addr) + if (len > mm->task_size - mmap_min_addr) return -ENOMEM; if (flags & MAP_FIXED) @@ -142,7 +156,7 @@ radix__arch_get_unmapped_area_topdown(struct file *filp, if (addr) { addr = PAGE_ALIGN(addr); vma = find_vma(mm, addr); - if (TASK_SIZE - len >= addr && addr >= mmap_min_addr && + if (mm->task_size - len >= addr && addr >= mmap_min_addr && (!vma || addr + len <= vma->vm_start)) return addr; } @@ -152,7 +166,14 @@ radix__arch_get_unmapped_area_topdown(struct file *filp, info.low_limit = max(PAGE_SIZE, mmap_min_addr); info.high_limit = mm->mmap_base; info.align_mask = 0; + + if (addr > DEFAULT_MAP_WINDOW) + info.high_limit += mm->context.addr_limit - DEFAULT_MAP_WINDOW; + addr = vm_unmapped_area(&info); + if (!(addr & ~PAGE_MASK)) + return addr; + VM_BUG_ON(addr != -ENOMEM); /* * A failed mmap() very likely causes application failure, @@ -160,15 +181,7 @@ radix__arch_get_unmapped_area_topdown(struct file *filp, * can happen with large stack limits and large mmap() * allocations. */ - if (addr & ~PAGE_MASK) { - VM_BUG_ON(addr != -ENOMEM); - info.flags = 0; - info.low_limit = TASK_UNMAPPED_BASE; - info.high_limit = TASK_SIZE; - addr = vm_unmapped_area(&info); - } - - return addr; + return radix__arch_get_unmapped_area(filp, addr0, len, pgoff, flags); } static void radix__arch_pick_mmap_layout(struct mm_struct *mm, diff --git a/arch/powerpc/mm/mmu_context_book3s64.c b/arch/powerpc/mm/mmu_context_book3s64.c index 73bf6e14c3aa..c6dca2ae78ef 100644 --- a/arch/powerpc/mm/mmu_context_book3s64.c +++ b/arch/powerpc/mm/mmu_context_book3s64.c @@ -30,17 +30,16 @@ static DEFINE_SPINLOCK(mmu_context_lock); static DEFINE_IDA(mmu_context_ida); -int __init_new_context(void) +static int alloc_context_id(int min_id, int max_id) { - int index; - int err; + int index, err; again: if (!ida_pre_get(&mmu_context_ida, GFP_KERNEL)) return -ENOMEM; spin_lock(&mmu_context_lock); - err = ida_get_new_above(&mmu_context_ida, 1, &index); + err = ida_get_new_above(&mmu_context_ida, min_id, &index); spin_unlock(&mmu_context_lock); if (err == -EAGAIN) @@ -48,7 +47,7 @@ again: else if (err) return err; - if (index > MAX_USER_CONTEXT) { + if (index > max_id) { spin_lock(&mmu_context_lock); ida_remove(&mmu_context_ida, index); spin_unlock(&mmu_context_lock); @@ -57,48 +56,105 @@ again: return index; } -EXPORT_SYMBOL_GPL(__init_new_context); -static int radix__init_new_context(struct mm_struct *mm, int index) + +void hash__reserve_context_id(int id) +{ + int rc, result = 0; + + do { + if (!ida_pre_get(&mmu_context_ida, GFP_KERNEL)) + break; + + spin_lock(&mmu_context_lock); + rc = ida_get_new_above(&mmu_context_ida, id, &result); + spin_unlock(&mmu_context_lock); + } while (rc == -EAGAIN); + + WARN(result != id, "mmu: Failed to reserve context id %d (rc %d)\n", id, result); +} + +int hash__alloc_context_id(void) +{ + unsigned long max; + + if (mmu_has_feature(MMU_FTR_68_BIT_VA)) + max = MAX_USER_CONTEXT; + else + max = MAX_USER_CONTEXT_65BIT_VA; + + return alloc_context_id(MIN_USER_CONTEXT, max); +} +EXPORT_SYMBOL_GPL(hash__alloc_context_id); + +static int hash__init_new_context(struct mm_struct *mm) +{ + int index; + + index = hash__alloc_context_id(); + if (index < 0) + return index; + + /* + * We do switch_slb() early in fork, even before we setup the + * mm->context.addr_limit. Default to max task size so that we copy the + * default values to paca which will help us to handle slb miss early. + */ + mm->context.addr_limit = TASK_SIZE_128TB; + + /* + * The old code would re-promote on fork, we don't do that when using + * slices as it could cause problem promoting slices that have been + * forced down to 4K. + * + * For book3s we have MMU_NO_CONTEXT set to be ~0. Hence check + * explicitly against context.id == 0. This ensures that we properly + * initialize context slice details for newly allocated mm's (which will + * have id == 0) and don't alter context slice inherited via fork (which + * will have id != 0). + * + * We should not be calling init_new_context() on init_mm. Hence a + * check against 0 is OK. + */ + if (mm->context.id == 0) + slice_set_user_psize(mm, mmu_virtual_psize); + + subpage_prot_init_new_context(mm); + + return index; +} + +static int radix__init_new_context(struct mm_struct *mm) { unsigned long rts_field; + int index; + + index = alloc_context_id(1, PRTB_ENTRIES - 1); + if (index < 0) + return index; /* * set the process table entry, */ rts_field = radix__get_tree_size(); process_tb[index].prtb0 = cpu_to_be64(rts_field | __pa(mm->pgd) | RADIX_PGD_INDEX_SIZE); - return 0; + + mm->context.npu_context = NULL; + + return index; } int init_new_context(struct task_struct *tsk, struct mm_struct *mm) { int index; - index = __init_new_context(); + if (radix_enabled()) + index = radix__init_new_context(mm); + else + index = hash__init_new_context(mm); + if (index < 0) return index; - if (radix_enabled()) { - radix__init_new_context(mm, index); - } else { - - /* The old code would re-promote on fork, we don't do that - * when using slices as it could cause problem promoting slices - * that have been forced down to 4K - * - * For book3s we have MMU_NO_CONTEXT set to be ~0. Hence check - * explicitly against context.id == 0. This ensures that we - * properly initialize context slice details for newly allocated - * mm's (which will have id == 0) and don't alter context slice - * inherited via fork (which will have id != 0). - * - * We should not be calling init_new_context() on init_mm. Hence a - * check against 0 is ok. - */ - if (mm->context.id == 0) - slice_set_user_psize(mm, mmu_virtual_psize); - subpage_prot_init_new_context(mm); - } mm->context.id = index; #ifdef CONFIG_PPC_ICSWX mm->context.cop_lockp = kmalloc(sizeof(spinlock_t), GFP_KERNEL); diff --git a/arch/powerpc/mm/mmu_context_iommu.c b/arch/powerpc/mm/mmu_context_iommu.c index fc67bd766eaf..e0a2d8e806ed 100644 --- a/arch/powerpc/mm/mmu_context_iommu.c +++ b/arch/powerpc/mm/mmu_context_iommu.c @@ -81,7 +81,7 @@ struct page *new_iommu_non_cma_page(struct page *page, unsigned long private, gfp_t gfp_mask = GFP_USER; struct page *new_page; - if (PageHuge(page) || PageTransHuge(page) || PageCompound(page)) + if (PageCompound(page)) return NULL; if (PageHighMem(page)) @@ -100,7 +100,7 @@ static int mm_iommu_move_page_from_cma(struct page *page) LIST_HEAD(cma_migrate_pages); /* Ignore huge pages for now */ - if (PageHuge(page) || PageTransHuge(page) || PageCompound(page)) + if (PageCompound(page)) return -EBUSY; lru_add_drain(); diff --git a/arch/powerpc/mm/mmu_context_nohash.c b/arch/powerpc/mm/mmu_context_nohash.c index c491f2c8f2b9..4554d6527682 100644 --- a/arch/powerpc/mm/mmu_context_nohash.c +++ b/arch/powerpc/mm/mmu_context_nohash.c @@ -333,11 +333,6 @@ int init_new_context(struct task_struct *t, struct mm_struct *mm) mm->context.id = MMU_NO_CONTEXT; mm->context.active = 0; - -#ifdef CONFIG_PPC_MM_SLICES - slice_set_user_psize(mm, mmu_virtual_psize); -#endif - return 0; } diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c index 9befaee237d6..371792e4418f 100644 --- a/arch/powerpc/mm/numa.c +++ b/arch/powerpc/mm/numa.c @@ -875,13 +875,6 @@ static void __init setup_node_data(int nid, u64 start_pfn, u64 end_pfn) void *nd; int tnid; - if (spanned_pages) - pr_info("Initmem setup node %d [mem %#010Lx-%#010Lx]\n", - nid, start_pfn << PAGE_SHIFT, - (end_pfn << PAGE_SHIFT) - 1); - else - pr_info("Initmem setup node %d\n", nid); - nd_pa = memblock_alloc_try_nid(nd_size, SMP_CACHE_BYTES, nid); nd = __va(nd_pa); diff --git a/arch/powerpc/mm/slb.c b/arch/powerpc/mm/slb.c index 5e01b2ece1d0..654a0d7ba0e7 100644 --- a/arch/powerpc/mm/slb.c +++ b/arch/powerpc/mm/slb.c @@ -131,7 +131,7 @@ static void __slb_flush_and_rebolt(void) "slbmte %2,%3\n" "isync" :: "r"(mk_vsid_data(VMALLOC_START, mmu_kernel_ssize, vflags)), - "r"(mk_esid_data(VMALLOC_START, mmu_kernel_ssize, 1)), + "r"(mk_esid_data(VMALLOC_START, mmu_kernel_ssize, VMALLOC_INDEX)), "r"(ksp_vsid_data), "r"(ksp_esid_data) : "memory"); @@ -229,7 +229,7 @@ void switch_slb(struct task_struct *tsk, struct mm_struct *mm) asm volatile("slbie %0" : : "r" (slbie_data)); get_paca()->slb_cache_ptr = 0; - copy_mm_to_paca(&mm->context); + copy_mm_to_paca(mm); /* * preload some userspace segments into the SLB. diff --git a/arch/powerpc/mm/slb_low.S b/arch/powerpc/mm/slb_low.S index a85e06ea6c20..1519617aab36 100644 --- a/arch/powerpc/mm/slb_low.S +++ b/arch/powerpc/mm/slb_low.S @@ -23,6 +23,48 @@ #include <asm/pgtable.h> #include <asm/firmware.h> +/* + * This macro generates asm code to compute the VSID scramble + * function. Used in slb_allocate() and do_stab_bolted. The function + * computed is: (protovsid*VSID_MULTIPLIER) % VSID_MODULUS + * + * rt = register containing the proto-VSID and into which the + * VSID will be stored + * rx = scratch register (clobbered) + * rf = flags + * + * - rt and rx must be different registers + * - The answer will end up in the low VSID_BITS bits of rt. The higher + * bits may contain other garbage, so you may need to mask the + * result. + */ +#define ASM_VSID_SCRAMBLE(rt, rx, rf, size) \ + lis rx,VSID_MULTIPLIER_##size@h; \ + ori rx,rx,VSID_MULTIPLIER_##size@l; \ + mulld rt,rt,rx; /* rt = rt * MULTIPLIER */ \ +/* \ + * powermac get slb fault before feature fixup, so make 65 bit part \ + * the default part of feature fixup \ + */ \ +BEGIN_MMU_FTR_SECTION \ + srdi rx,rt,VSID_BITS_65_##size; \ + clrldi rt,rt,(64-VSID_BITS_65_##size); \ + add rt,rt,rx; \ + addi rx,rt,1; \ + srdi rx,rx,VSID_BITS_65_##size; \ + add rt,rt,rx; \ + rldimi rf,rt,SLB_VSID_SHIFT_##size,(64 - (SLB_VSID_SHIFT_##size + VSID_BITS_65_##size)); \ +MMU_FTR_SECTION_ELSE \ + srdi rx,rt,VSID_BITS_##size; \ + clrldi rt,rt,(64-VSID_BITS_##size); \ + add rt,rt,rx; /* add high and low bits */ \ + addi rx,rt,1; \ + srdi rx,rx,VSID_BITS_##size; /* extract 2^VSID_BITS bit */ \ + add rt,rt,rx; \ + rldimi rf,rt,SLB_VSID_SHIFT_##size,(64 - (SLB_VSID_SHIFT_##size + VSID_BITS_##size)); \ +ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_68_BIT_VA) + + /* void slb_allocate_realmode(unsigned long ea); * * Create an SLB entry for the given EA (user or kernel). @@ -45,13 +87,6 @@ _GLOBAL(slb_allocate_realmode) /* r3 = address, r10 = esid, cr7 = <> PAGE_OFFSET */ blt cr7,0f /* user or kernel? */ - /* kernel address: proto-VSID = ESID */ - /* WARNING - MAGIC: we don't use the VSID 0xfffffffff, but - * this code will generate the protoVSID 0xfffffffff for the - * top segment. That's ok, the scramble below will translate - * it to VSID 0, which is reserved as a bad VSID - one which - * will never have any pages in it. */ - /* Check if hitting the linear mapping or some other kernel space */ bne cr7,1f @@ -63,12 +98,10 @@ _GLOBAL(slb_allocate_realmode) slb_miss_kernel_load_linear: li r11,0 /* - * context = (MAX_USER_CONTEXT) + ((ea >> 60) - 0xc) + 1 + * context = (ea >> 60) - (0xc - 1) * r9 = region id. */ - addis r9,r9,(MAX_USER_CONTEXT - 0xc + 1)@ha - addi r9,r9,(MAX_USER_CONTEXT - 0xc + 1)@l - + subi r9,r9,KERNEL_REGION_CONTEXT_OFFSET BEGIN_FTR_SECTION b .Lslb_finish_load @@ -77,9 +110,9 @@ END_MMU_FTR_SECTION_IFCLR(MMU_FTR_1T_SEGMENT) 1: #ifdef CONFIG_SPARSEMEM_VMEMMAP - /* Check virtual memmap region. To be patches at kernel boot */ cmpldi cr0,r9,0xf bne 1f +/* Check virtual memmap region. To be patched at kernel boot */ .globl slb_miss_kernel_load_vmemmap slb_miss_kernel_load_vmemmap: li r11,0 @@ -102,11 +135,10 @@ slb_miss_kernel_load_io: li r11,0 6: /* - * context = (MAX_USER_CONTEXT) + ((ea >> 60) - 0xc) + 1 + * context = (ea >> 60) - (0xc - 1) * r9 = region id. */ - addis r9,r9,(MAX_USER_CONTEXT - 0xc + 1)@ha - addi r9,r9,(MAX_USER_CONTEXT - 0xc + 1)@l + subi r9,r9,KERNEL_REGION_CONTEXT_OFFSET BEGIN_FTR_SECTION b .Lslb_finish_load @@ -117,7 +149,13 @@ END_MMU_FTR_SECTION_IFCLR(MMU_FTR_1T_SEGMENT) * For userspace addresses, make sure this is region 0. */ cmpdi r9, 0 - bne 8f + bne- 8f + /* + * user space make sure we are within the allowed limit + */ + ld r11,PACA_ADDR_LIMIT(r13) + cmpld r3,r11 + bge- 8f /* when using slices, we extract the psize off the slice bitmaps * and then we need to get the sllp encoding off the mmu_psize_defs @@ -189,13 +227,7 @@ END_MMU_FTR_SECTION_IFSET(MMU_FTR_1T_SEGMENT) */ .Lslb_finish_load: rldimi r10,r9,ESID_BITS,0 - ASM_VSID_SCRAMBLE(r10,r9,256M) - /* - * bits above VSID_BITS_256M need to be ignored from r10 - * also combine VSID and flags - */ - rldimi r11,r10,SLB_VSID_SHIFT,(64 - (SLB_VSID_SHIFT + VSID_BITS_256M)) - + ASM_VSID_SCRAMBLE(r10,r9,r11,256M) /* r3 = EA, r11 = VSID data */ /* * Find a slot, round robin. Previously we tried to find a @@ -259,12 +291,12 @@ slb_compare_rr_to_size: .Lslb_finish_load_1T: srdi r10,r10,(SID_SHIFT_1T - SID_SHIFT) /* get 1T ESID */ rldimi r10,r9,ESID_BITS_1T,0 - ASM_VSID_SCRAMBLE(r10,r9,1T) + ASM_VSID_SCRAMBLE(r10,r9,r11,1T) /* * bits above VSID_BITS_1T need to be ignored from r10 * also combine VSID and flags */ - rldimi r11,r10,SLB_VSID_SHIFT_1T,(64 - (SLB_VSID_SHIFT_1T + VSID_BITS_1T)) + li r10,MMU_SEGSIZE_1T rldimi r11,r10,SLB_VSID_SSIZE_SHIFT,0 /* insert segment size */ diff --git a/arch/powerpc/mm/slice.c b/arch/powerpc/mm/slice.c index 2b27458902ee..966b9fccfa66 100644 --- a/arch/powerpc/mm/slice.c +++ b/arch/powerpc/mm/slice.c @@ -36,38 +36,29 @@ #include <asm/copro.h> #include <asm/hugetlb.h> -/* some sanity checks */ -#if (H_PGTABLE_RANGE >> 43) > SLICE_MASK_SIZE -#error H_PGTABLE_RANGE exceeds slice_mask high_slices size -#endif - static DEFINE_SPINLOCK(slice_convert_lock); - +/* + * One bit per slice. We have lower slices which cover 256MB segments + * upto 4G range. That gets us 16 low slices. For the rest we track slices + * in 1TB size. + */ +struct slice_mask { + u64 low_slices; + DECLARE_BITMAP(high_slices, SLICE_NUM_HIGH); +}; #ifdef DEBUG int _slice_debug = 1; static void slice_print_mask(const char *label, struct slice_mask mask) { - char *p, buf[16 + 3 + 64 + 1]; - int i; - if (!_slice_debug) return; - p = buf; - for (i = 0; i < SLICE_NUM_LOW; i++) - *(p++) = (mask.low_slices & (1 << i)) ? '1' : '0'; - *(p++) = ' '; - *(p++) = '-'; - *(p++) = ' '; - for (i = 0; i < SLICE_NUM_HIGH; i++) - *(p++) = (mask.high_slices & (1ul << i)) ? '1' : '0'; - *(p++) = 0; - - printk(KERN_DEBUG "%s:%s\n", label, buf); + pr_devel("%s low_slice: %*pbl\n", label, (int)SLICE_NUM_LOW, &mask.low_slices); + pr_devel("%s high_slice: %*pbl\n", label, (int)SLICE_NUM_HIGH, mask.high_slices); } -#define slice_dbg(fmt...) do { if (_slice_debug) pr_debug(fmt); } while(0) +#define slice_dbg(fmt...) do { if (_slice_debug) pr_devel(fmt); } while (0) #else @@ -76,25 +67,28 @@ static void slice_print_mask(const char *label, struct slice_mask mask) {} #endif -static struct slice_mask slice_range_to_mask(unsigned long start, - unsigned long len) +static void slice_range_to_mask(unsigned long start, unsigned long len, + struct slice_mask *ret) { unsigned long end = start + len - 1; - struct slice_mask ret = { 0, 0 }; + + ret->low_slices = 0; + bitmap_zero(ret->high_slices, SLICE_NUM_HIGH); if (start < SLICE_LOW_TOP) { - unsigned long mend = min(end, SLICE_LOW_TOP); - unsigned long mstart = min(start, SLICE_LOW_TOP); + unsigned long mend = min(end, (SLICE_LOW_TOP - 1)); - ret.low_slices = (1u << (GET_LOW_SLICE_INDEX(mend) + 1)) - - (1u << GET_LOW_SLICE_INDEX(mstart)); + ret->low_slices = (1u << (GET_LOW_SLICE_INDEX(mend) + 1)) + - (1u << GET_LOW_SLICE_INDEX(start)); } - if ((start + len) > SLICE_LOW_TOP) - ret.high_slices = (1ul << (GET_HIGH_SLICE_INDEX(end) + 1)) - - (1ul << GET_HIGH_SLICE_INDEX(start)); + if ((start + len) > SLICE_LOW_TOP) { + unsigned long start_index = GET_HIGH_SLICE_INDEX(start); + unsigned long align_end = ALIGN(end, (1UL << SLICE_HIGH_SHIFT)); + unsigned long count = GET_HIGH_SLICE_INDEX(align_end) - start_index; - return ret; + bitmap_set(ret->high_slices, start_index, count); + } } static int slice_area_is_free(struct mm_struct *mm, unsigned long addr, @@ -128,53 +122,60 @@ static int slice_high_has_vma(struct mm_struct *mm, unsigned long slice) return !slice_area_is_free(mm, start, end - start); } -static struct slice_mask slice_mask_for_free(struct mm_struct *mm) +static void slice_mask_for_free(struct mm_struct *mm, struct slice_mask *ret) { - struct slice_mask ret = { 0, 0 }; unsigned long i; + ret->low_slices = 0; + bitmap_zero(ret->high_slices, SLICE_NUM_HIGH); + for (i = 0; i < SLICE_NUM_LOW; i++) if (!slice_low_has_vma(mm, i)) - ret.low_slices |= 1u << i; + ret->low_slices |= 1u << i; if (mm->task_size <= SLICE_LOW_TOP) - return ret; + return; - for (i = 0; i < SLICE_NUM_HIGH; i++) + for (i = 0; i < GET_HIGH_SLICE_INDEX(mm->context.addr_limit); i++) if (!slice_high_has_vma(mm, i)) - ret.high_slices |= 1ul << i; - - return ret; + __set_bit(i, ret->high_slices); } -static struct slice_mask slice_mask_for_size(struct mm_struct *mm, int psize) +static void slice_mask_for_size(struct mm_struct *mm, int psize, struct slice_mask *ret) { unsigned char *hpsizes; int index, mask_index; - struct slice_mask ret = { 0, 0 }; unsigned long i; u64 lpsizes; + ret->low_slices = 0; + bitmap_zero(ret->high_slices, SLICE_NUM_HIGH); + lpsizes = mm->context.low_slices_psize; for (i = 0; i < SLICE_NUM_LOW; i++) if (((lpsizes >> (i * 4)) & 0xf) == psize) - ret.low_slices |= 1u << i; + ret->low_slices |= 1u << i; hpsizes = mm->context.high_slices_psize; - for (i = 0; i < SLICE_NUM_HIGH; i++) { + for (i = 0; i < GET_HIGH_SLICE_INDEX(mm->context.addr_limit); i++) { mask_index = i & 0x1; index = i >> 1; if (((hpsizes[index] >> (mask_index * 4)) & 0xf) == psize) - ret.high_slices |= 1ul << i; + __set_bit(i, ret->high_slices); } - - return ret; } -static int slice_check_fit(struct slice_mask mask, struct slice_mask available) +static int slice_check_fit(struct mm_struct *mm, + struct slice_mask mask, struct slice_mask available) { + DECLARE_BITMAP(result, SLICE_NUM_HIGH); + unsigned long slice_count = GET_HIGH_SLICE_INDEX(mm->context.addr_limit); + + bitmap_and(result, mask.high_slices, + available.high_slices, slice_count); + return (mask.low_slices & available.low_slices) == mask.low_slices && - (mask.high_slices & available.high_slices) == mask.high_slices; + bitmap_equal(result, mask.high_slices, slice_count); } static void slice_flush_segments(void *parm) @@ -185,7 +186,7 @@ static void slice_flush_segments(void *parm) if (mm != current->active_mm) return; - copy_mm_to_paca(¤t->active_mm->context); + copy_mm_to_paca(current->active_mm); local_irq_save(flags); slb_flush_and_rebolt(); @@ -218,18 +219,18 @@ static void slice_convert(struct mm_struct *mm, struct slice_mask mask, int psiz mm->context.low_slices_psize = lpsizes; hpsizes = mm->context.high_slices_psize; - for (i = 0; i < SLICE_NUM_HIGH; i++) { + for (i = 0; i < GET_HIGH_SLICE_INDEX(mm->context.addr_limit); i++) { mask_index = i & 0x1; index = i >> 1; - if (mask.high_slices & (1ul << i)) + if (test_bit(i, mask.high_slices)) hpsizes[index] = (hpsizes[index] & ~(0xf << (mask_index * 4))) | (((unsigned long)psize) << (mask_index * 4)); } slice_dbg(" lsps=%lx, hsps=%lx\n", - mm->context.low_slices_psize, - mm->context.high_slices_psize); + (unsigned long)mm->context.low_slices_psize, + (unsigned long)mm->context.high_slices_psize); spin_unlock_irqrestore(&slice_convert_lock, flags); @@ -257,14 +258,14 @@ static bool slice_scan_available(unsigned long addr, slice = GET_HIGH_SLICE_INDEX(addr); *boundary_addr = (slice + end) ? ((slice + end) << SLICE_HIGH_SHIFT) : SLICE_LOW_TOP; - return !!(available.high_slices & (1ul << slice)); + return !!test_bit(slice, available.high_slices); } } static unsigned long slice_find_area_bottomup(struct mm_struct *mm, unsigned long len, struct slice_mask available, - int psize) + int psize, unsigned long high_limit) { int pshift = max_t(int, mmu_psize_defs[psize].shift, PAGE_SHIFT); unsigned long addr, found, next_end; @@ -276,7 +277,10 @@ static unsigned long slice_find_area_bottomup(struct mm_struct *mm, info.align_offset = 0; addr = TASK_UNMAPPED_BASE; - while (addr < TASK_SIZE) { + /* + * Check till the allow max value for this mmap request + */ + while (addr < high_limit) { info.low_limit = addr; if (!slice_scan_available(addr, available, 1, &addr)) continue; @@ -288,8 +292,8 @@ static unsigned long slice_find_area_bottomup(struct mm_struct *mm, * Check if we need to reduce the range, or if we can * extend it to cover the next available slice. */ - if (addr >= TASK_SIZE) - addr = TASK_SIZE; + if (addr >= high_limit) + addr = high_limit; else if (slice_scan_available(addr, available, 1, &next_end)) { addr = next_end; goto next_slice; @@ -307,7 +311,7 @@ static unsigned long slice_find_area_bottomup(struct mm_struct *mm, static unsigned long slice_find_area_topdown(struct mm_struct *mm, unsigned long len, struct slice_mask available, - int psize) + int psize, unsigned long high_limit) { int pshift = max_t(int, mmu_psize_defs[psize].shift, PAGE_SHIFT); unsigned long addr, found, prev; @@ -319,6 +323,15 @@ static unsigned long slice_find_area_topdown(struct mm_struct *mm, info.align_offset = 0; addr = mm->mmap_base; + /* + * If we are trying to allocate above DEFAULT_MAP_WINDOW + * Add the different to the mmap_base. + * Only for that request for which high_limit is above + * DEFAULT_MAP_WINDOW we should apply this. + */ + if (high_limit > DEFAULT_MAP_WINDOW) + addr += mm->context.addr_limit - DEFAULT_MAP_WINDOW; + while (addr > PAGE_SIZE) { info.high_limit = addr; if (!slice_scan_available(addr - 1, available, 0, &addr)) @@ -350,29 +363,38 @@ static unsigned long slice_find_area_topdown(struct mm_struct *mm, * can happen with large stack limits and large mmap() * allocations. */ - return slice_find_area_bottomup(mm, len, available, psize); + return slice_find_area_bottomup(mm, len, available, psize, high_limit); } static unsigned long slice_find_area(struct mm_struct *mm, unsigned long len, struct slice_mask mask, int psize, - int topdown) + int topdown, unsigned long high_limit) { if (topdown) - return slice_find_area_topdown(mm, len, mask, psize); + return slice_find_area_topdown(mm, len, mask, psize, high_limit); else - return slice_find_area_bottomup(mm, len, mask, psize); + return slice_find_area_bottomup(mm, len, mask, psize, high_limit); } -#define or_mask(dst, src) do { \ - (dst).low_slices |= (src).low_slices; \ - (dst).high_slices |= (src).high_slices; \ -} while (0) +static inline void slice_or_mask(struct slice_mask *dst, struct slice_mask *src) +{ + DECLARE_BITMAP(result, SLICE_NUM_HIGH); + + dst->low_slices |= src->low_slices; + bitmap_or(result, dst->high_slices, src->high_slices, SLICE_NUM_HIGH); + bitmap_copy(dst->high_slices, result, SLICE_NUM_HIGH); +} -#define andnot_mask(dst, src) do { \ - (dst).low_slices &= ~(src).low_slices; \ - (dst).high_slices &= ~(src).high_slices; \ -} while (0) +static inline void slice_andnot_mask(struct slice_mask *dst, struct slice_mask *src) +{ + DECLARE_BITMAP(result, SLICE_NUM_HIGH); + + dst->low_slices &= ~src->low_slices; + + bitmap_andnot(result, dst->high_slices, src->high_slices, SLICE_NUM_HIGH); + bitmap_copy(dst->high_slices, result, SLICE_NUM_HIGH); +} #ifdef CONFIG_PPC_64K_PAGES #define MMU_PAGE_BASE MMU_PAGE_64K @@ -384,14 +406,43 @@ unsigned long slice_get_unmapped_area(unsigned long addr, unsigned long len, unsigned long flags, unsigned int psize, int topdown) { - struct slice_mask mask = {0, 0}; + struct slice_mask mask; struct slice_mask good_mask; - struct slice_mask potential_mask = {0,0} /* silence stupid warning */; - struct slice_mask compat_mask = {0, 0}; + struct slice_mask potential_mask; + struct slice_mask compat_mask; int fixed = (flags & MAP_FIXED); int pshift = max_t(int, mmu_psize_defs[psize].shift, PAGE_SHIFT); struct mm_struct *mm = current->mm; unsigned long newaddr; + unsigned long high_limit; + + /* + * Check if we need to expland slice area. + */ + if (unlikely(addr > mm->context.addr_limit && + mm->context.addr_limit != TASK_SIZE)) { + mm->context.addr_limit = TASK_SIZE; + on_each_cpu(slice_flush_segments, mm, 1); + } + /* + * This mmap request can allocate upt to 512TB + */ + if (addr > DEFAULT_MAP_WINDOW) + high_limit = mm->context.addr_limit; + else + high_limit = DEFAULT_MAP_WINDOW; + /* + * init different masks + */ + mask.low_slices = 0; + bitmap_zero(mask.high_slices, SLICE_NUM_HIGH); + + /* silence stupid warning */; + potential_mask.low_slices = 0; + bitmap_zero(potential_mask.high_slices, SLICE_NUM_HIGH); + + compat_mask.low_slices = 0; + bitmap_zero(compat_mask.high_slices, SLICE_NUM_HIGH); /* Sanity checks */ BUG_ON(mm->task_size == 0); @@ -423,7 +474,7 @@ unsigned long slice_get_unmapped_area(unsigned long addr, unsigned long len, /* First make up a "good" mask of slices that have the right size * already */ - good_mask = slice_mask_for_size(mm, psize); + slice_mask_for_size(mm, psize, &good_mask); slice_print_mask(" good_mask", good_mask); /* @@ -448,22 +499,22 @@ unsigned long slice_get_unmapped_area(unsigned long addr, unsigned long len, #ifdef CONFIG_PPC_64K_PAGES /* If we support combo pages, we can allow 64k pages in 4k slices */ if (psize == MMU_PAGE_64K) { - compat_mask = slice_mask_for_size(mm, MMU_PAGE_4K); + slice_mask_for_size(mm, MMU_PAGE_4K, &compat_mask); if (fixed) - or_mask(good_mask, compat_mask); + slice_or_mask(&good_mask, &compat_mask); } #endif /* First check hint if it's valid or if we have MAP_FIXED */ if (addr != 0 || fixed) { /* Build a mask for the requested range */ - mask = slice_range_to_mask(addr, len); + slice_range_to_mask(addr, len, &mask); slice_print_mask(" mask", mask); /* Check if we fit in the good mask. If we do, we just return, * nothing else to do */ - if (slice_check_fit(mask, good_mask)) { + if (slice_check_fit(mm, mask, good_mask)) { slice_dbg(" fits good !\n"); return addr; } @@ -471,7 +522,8 @@ unsigned long slice_get_unmapped_area(unsigned long addr, unsigned long len, /* Now let's see if we can find something in the existing * slices for that size */ - newaddr = slice_find_area(mm, len, good_mask, psize, topdown); + newaddr = slice_find_area(mm, len, good_mask, + psize, topdown, high_limit); if (newaddr != -ENOMEM) { /* Found within the good mask, we don't have to setup, * we thus return directly @@ -484,11 +536,11 @@ unsigned long slice_get_unmapped_area(unsigned long addr, unsigned long len, /* We don't fit in the good mask, check what other slices are * empty and thus can be converted */ - potential_mask = slice_mask_for_free(mm); - or_mask(potential_mask, good_mask); + slice_mask_for_free(mm, &potential_mask); + slice_or_mask(&potential_mask, &good_mask); slice_print_mask(" potential", potential_mask); - if ((addr != 0 || fixed) && slice_check_fit(mask, potential_mask)) { + if ((addr != 0 || fixed) && slice_check_fit(mm, mask, potential_mask)) { slice_dbg(" fits potential !\n"); goto convert; } @@ -503,7 +555,8 @@ unsigned long slice_get_unmapped_area(unsigned long addr, unsigned long len, * anywhere in the good area. */ if (addr) { - addr = slice_find_area(mm, len, good_mask, psize, topdown); + addr = slice_find_area(mm, len, good_mask, + psize, topdown, high_limit); if (addr != -ENOMEM) { slice_dbg(" found area at 0x%lx\n", addr); return addr; @@ -513,28 +566,29 @@ unsigned long slice_get_unmapped_area(unsigned long addr, unsigned long len, /* Now let's see if we can find something in the existing slices * for that size plus free slices */ - addr = slice_find_area(mm, len, potential_mask, psize, topdown); + addr = slice_find_area(mm, len, potential_mask, + psize, topdown, high_limit); #ifdef CONFIG_PPC_64K_PAGES if (addr == -ENOMEM && psize == MMU_PAGE_64K) { /* retry the search with 4k-page slices included */ - or_mask(potential_mask, compat_mask); - addr = slice_find_area(mm, len, potential_mask, psize, - topdown); + slice_or_mask(&potential_mask, &compat_mask); + addr = slice_find_area(mm, len, potential_mask, + psize, topdown, high_limit); } #endif if (addr == -ENOMEM) return -ENOMEM; - mask = slice_range_to_mask(addr, len); + slice_range_to_mask(addr, len, &mask); slice_dbg(" found potential area at 0x%lx\n", addr); slice_print_mask(" mask", mask); convert: - andnot_mask(mask, good_mask); - andnot_mask(mask, compat_mask); - if (mask.low_slices || mask.high_slices) { + slice_andnot_mask(&mask, &good_mask); + slice_andnot_mask(&mask, &compat_mask); + if (mask.low_slices || !bitmap_empty(mask.high_slices, SLICE_NUM_HIGH)) { slice_convert(mm, mask, psize); if (psize > MMU_PAGE_BASE) on_each_cpu(slice_flush_segments, mm, 1); @@ -649,8 +703,8 @@ void slice_set_user_psize(struct mm_struct *mm, unsigned int psize) slice_dbg(" lsps=%lx, hsps=%lx\n", - mm->context.low_slices_psize, - mm->context.high_slices_psize); + (unsigned long)mm->context.low_slices_psize, + (unsigned long)mm->context.high_slices_psize); bail: spin_unlock_irqrestore(&slice_convert_lock, flags); @@ -659,9 +713,11 @@ void slice_set_user_psize(struct mm_struct *mm, unsigned int psize) void slice_set_range_psize(struct mm_struct *mm, unsigned long start, unsigned long len, unsigned int psize) { - struct slice_mask mask = slice_range_to_mask(start, len); + struct slice_mask mask; VM_BUG_ON(radix_enabled()); + + slice_range_to_mask(start, len, &mask); slice_convert(mm, mask, psize); } @@ -694,14 +750,14 @@ int is_hugepage_only_range(struct mm_struct *mm, unsigned long addr, if (radix_enabled()) return 0; - mask = slice_range_to_mask(addr, len); - available = slice_mask_for_size(mm, psize); + slice_range_to_mask(addr, len, &mask); + slice_mask_for_size(mm, psize, &available); #ifdef CONFIG_PPC_64K_PAGES /* We need to account for 4k slices too */ if (psize == MMU_PAGE_64K) { struct slice_mask compat_mask; - compat_mask = slice_mask_for_size(mm, MMU_PAGE_4K); - or_mask(available, compat_mask); + slice_mask_for_size(mm, MMU_PAGE_4K, &compat_mask); + slice_or_mask(&available, &compat_mask); } #endif @@ -711,6 +767,6 @@ int is_hugepage_only_range(struct mm_struct *mm, unsigned long addr, slice_print_mask(" mask", mask); slice_print_mask(" available", available); #endif - return !slice_check_fit(mask, available); + return !slice_check_fit(mm, mask, available); } #endif diff --git a/arch/powerpc/mm/subpage-prot.c b/arch/powerpc/mm/subpage-prot.c index 94210940112f..e94fbd4c8845 100644 --- a/arch/powerpc/mm/subpage-prot.c +++ b/arch/powerpc/mm/subpage-prot.c @@ -197,7 +197,8 @@ long sys_subpage_prot(unsigned long addr, unsigned long len, u32 __user *map) /* Check parameters */ if ((addr & ~PAGE_MASK) || (len & ~PAGE_MASK) || - addr >= TASK_SIZE || len >= TASK_SIZE || addr + len > TASK_SIZE) + addr >= mm->task_size || len >= mm->task_size || + addr + len > mm->task_size) return -EINVAL; if (is_hugepage_only_range(mm, addr, len)) diff --git a/arch/powerpc/mm/tlb-radix.c b/arch/powerpc/mm/tlb-radix.c index 952713d6cf04..5e17c4e873a5 100644 --- a/arch/powerpc/mm/tlb-radix.c +++ b/arch/powerpc/mm/tlb-radix.c @@ -34,10 +34,8 @@ static inline void __tlbiel_pid(unsigned long pid, int set, prs = 1; /* process scoped */ r = 1; /* raidx format */ - asm volatile("ptesync": : :"memory"); asm volatile(PPC_TLBIEL(%0, %4, %3, %2, %1) : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory"); - asm volatile("ptesync": : :"memory"); } /* @@ -47,9 +45,33 @@ static inline void _tlbiel_pid(unsigned long pid, unsigned long ric) { int set; - for (set = 0; set < POWER9_TLB_SETS_RADIX ; set++) { + asm volatile("ptesync": : :"memory"); + + /* + * Flush the first set of the TLB, and if we're doing a RIC_FLUSH_ALL, + * also flush the entire Page Walk Cache. + */ + __tlbiel_pid(pid, 0, ric); + + if (ric == RIC_FLUSH_ALL) + /* For the remaining sets, just flush the TLB */ + ric = RIC_FLUSH_TLB; + + for (set = 1; set < POWER9_TLB_SETS_RADIX ; set++) __tlbiel_pid(pid, set, ric); - } + + asm volatile("ptesync": : :"memory"); + asm volatile(PPC_INVALIDATE_ERAT "; isync" : : :"memory"); +} + +static inline void tlbiel_pwc(unsigned long pid) +{ + asm volatile("ptesync": : :"memory"); + + /* For PWC flush, we don't look at set number */ + __tlbiel_pid(pid, 0, RIC_FLUSH_PWC); + + asm volatile("ptesync": : :"memory"); asm volatile(PPC_INVALIDATE_ERAT "; isync" : : :"memory"); } @@ -129,12 +151,18 @@ void radix__local_flush_tlb_pwc(struct mmu_gather *tlb, unsigned long addr) { unsigned long pid; struct mm_struct *mm = tlb->mm; + /* + * If we are doing a full mm flush, we will do a tlb flush + * with RIC_FLUSH_ALL later. + */ + if (tlb->fullmm) + return; preempt_disable(); pid = mm->context.id; if (pid != MMU_NO_CONTEXT) - _tlbiel_pid(pid, RIC_FLUSH_PWC); + tlbiel_pwc(pid); preempt_enable(); } @@ -195,6 +223,12 @@ void radix__flush_tlb_pwc(struct mmu_gather *tlb, unsigned long addr) unsigned long pid; struct mm_struct *mm = tlb->mm; + /* + * If we are doing a full mm flush, we will do a tlb flush + * with RIC_FLUSH_ALL later. + */ + if (tlb->fullmm) + return; preempt_disable(); pid = mm->context.id; @@ -210,7 +244,7 @@ void radix__flush_tlb_pwc(struct mmu_gather *tlb, unsigned long addr) if (lock_tlbie) raw_spin_unlock(&native_tlbie_lock); } else - _tlbiel_pid(pid, RIC_FLUSH_PWC); + tlbiel_pwc(pid); no_context: preempt_enable(); } @@ -437,7 +471,7 @@ void radix__flush_tlb_pte_p9_dd1(unsigned long old_pte, struct mm_struct *mm, return; } - if (old_pte & _PAGE_LARGE) + if (old_pte & R_PAGE_LARGE) radix__flush_tlb_page_psize(mm, address, MMU_PAGE_2M); else radix__flush_tlb_page_psize(mm, address, mmu_virtual_psize); diff --git a/arch/powerpc/mm/tlb_nohash.c b/arch/powerpc/mm/tlb_nohash.c index ba28fcb98597..bfc4a0869609 100644 --- a/arch/powerpc/mm/tlb_nohash.c +++ b/arch/powerpc/mm/tlb_nohash.c @@ -770,7 +770,7 @@ void setup_initial_memory_limit(phys_addr_t first_memblock_base, * avoid going over total available memory just in case... */ #ifdef CONFIG_PPC_FSL_BOOK3E - if (mmu_has_feature(MMU_FTR_TYPE_FSL_E)) { + if (early_mmu_has_feature(MMU_FTR_TYPE_FSL_E)) { unsigned long linear_sz; unsigned int num_cams; diff --git a/arch/powerpc/perf/core-book3s.c b/arch/powerpc/perf/core-book3s.c index 2ff13249f87a..6c2d4168daec 100644 --- a/arch/powerpc/perf/core-book3s.c +++ b/arch/powerpc/perf/core-book3s.c @@ -2049,6 +2049,14 @@ static void record_and_restart(struct perf_event *event, unsigned long val, data.br_stack = &cpuhw->bhrb_stack; } + if (event->attr.sample_type & PERF_SAMPLE_DATA_SRC && + ppmu->get_mem_data_src) + ppmu->get_mem_data_src(&data.data_src, ppmu->flags, regs); + + if (event->attr.sample_type & PERF_SAMPLE_WEIGHT && + ppmu->get_mem_weight) + ppmu->get_mem_weight(&data.weight); + if (perf_event_overflow(event, &data, regs)) power_pmu_stop(event, 0); } diff --git a/arch/powerpc/perf/isa207-common.c b/arch/powerpc/perf/isa207-common.c index cd951fd231c4..8125160be7bc 100644 --- a/arch/powerpc/perf/isa207-common.c +++ b/arch/powerpc/perf/isa207-common.c @@ -148,6 +148,88 @@ static bool is_thresh_cmp_valid(u64 event) return true; } +static inline u64 isa207_find_source(u64 idx, u32 sub_idx) +{ + u64 ret = PERF_MEM_NA; + + switch(idx) { + case 0: + /* Nothing to do */ + break; + case 1: + ret = PH(LVL, L1); + break; + case 2: + ret = PH(LVL, L2); + break; + case 3: + ret = PH(LVL, L3); + break; + case 4: + if (sub_idx <= 1) + ret = PH(LVL, LOC_RAM); + else if (sub_idx > 1 && sub_idx <= 2) + ret = PH(LVL, REM_RAM1); + else + ret = PH(LVL, REM_RAM2); + ret |= P(SNOOP, HIT); + break; + case 5: + ret = PH(LVL, REM_CCE1); + if ((sub_idx == 0) || (sub_idx == 2) || (sub_idx == 4)) + ret |= P(SNOOP, HIT); + else if ((sub_idx == 1) || (sub_idx == 3) || (sub_idx == 5)) + ret |= P(SNOOP, HITM); + break; + case 6: + ret = PH(LVL, REM_CCE2); + if ((sub_idx == 0) || (sub_idx == 2)) + ret |= P(SNOOP, HIT); + else if ((sub_idx == 1) || (sub_idx == 3)) + ret |= P(SNOOP, HITM); + break; + case 7: + ret = PM(LVL, L1); + break; + } + + return ret; +} + +void isa207_get_mem_data_src(union perf_mem_data_src *dsrc, u32 flags, + struct pt_regs *regs) +{ + u64 idx; + u32 sub_idx; + u64 sier; + u64 val; + + /* Skip if no SIER support */ + if (!(flags & PPMU_HAS_SIER)) { + dsrc->val = 0; + return; + } + + sier = mfspr(SPRN_SIER); + val = (sier & ISA207_SIER_TYPE_MASK) >> ISA207_SIER_TYPE_SHIFT; + if (val == 1 || val == 2) { + idx = (sier & ISA207_SIER_LDST_MASK) >> ISA207_SIER_LDST_SHIFT; + sub_idx = (sier & ISA207_SIER_DATA_SRC_MASK) >> ISA207_SIER_DATA_SRC_SHIFT; + + dsrc->val = isa207_find_source(idx, sub_idx); + dsrc->val |= (val == 1) ? P(OP, LOAD) : P(OP, STORE); + } +} + +void isa207_get_mem_weight(u64 *weight) +{ + u64 mmcra = mfspr(SPRN_MMCRA); + u64 exp = MMCRA_THR_CTR_EXP(mmcra); + u64 mantissa = MMCRA_THR_CTR_MANT(mmcra); + + *weight = mantissa << (2 * exp); +} + int isa207_get_constraint(u64 event, unsigned long *maskp, unsigned long *valp) { unsigned int unit, pmc, cache, ebb; diff --git a/arch/powerpc/perf/isa207-common.h b/arch/powerpc/perf/isa207-common.h index 899210f14ee4..8acbe6e802c7 100644 --- a/arch/powerpc/perf/isa207-common.h +++ b/arch/powerpc/perf/isa207-common.h @@ -248,6 +248,15 @@ #define MMCRA_SDAR_MODE_TLB (1ull << MMCRA_SDAR_MODE_SHIFT) #define MMCRA_SDAR_MODE_NO_UPDATES ~(0x3ull << MMCRA_SDAR_MODE_SHIFT) #define MMCRA_IFM_SHIFT 30 +#define MMCRA_THR_CTR_MANT_SHIFT 19 +#define MMCRA_THR_CTR_MANT_MASK 0x7Ful +#define MMCRA_THR_CTR_MANT(v) (((v) >> MMCRA_THR_CTR_MANT_SHIFT) &\ + MMCRA_THR_CTR_MANT_MASK) + +#define MMCRA_THR_CTR_EXP_SHIFT 27 +#define MMCRA_THR_CTR_EXP_MASK 0x7ul +#define MMCRA_THR_CTR_EXP(v) (((v) >> MMCRA_THR_CTR_EXP_SHIFT) &\ + MMCRA_THR_CTR_EXP_MASK) /* MMCR1 Threshold Compare bit constant for power9 */ #define p9_MMCRA_THR_CMP_SHIFT 45 @@ -260,6 +269,19 @@ #define MAX_ALT 2 #define MAX_PMU_COUNTERS 6 +#define ISA207_SIER_TYPE_SHIFT 15 +#define ISA207_SIER_TYPE_MASK (0x7ull << ISA207_SIER_TYPE_SHIFT) + +#define ISA207_SIER_LDST_SHIFT 1 +#define ISA207_SIER_LDST_MASK (0x7ull << ISA207_SIER_LDST_SHIFT) + +#define ISA207_SIER_DATA_SRC_SHIFT 53 +#define ISA207_SIER_DATA_SRC_MASK (0x7ull << ISA207_SIER_DATA_SRC_SHIFT) + +#define P(a, b) PERF_MEM_S(a, b) +#define PH(a, b) (P(LVL, HIT) | P(a, b)) +#define PM(a, b) (P(LVL, MISS) | P(a, b)) + int isa207_get_constraint(u64 event, unsigned long *maskp, unsigned long *valp); int isa207_compute_mmcr(u64 event[], int n_ev, unsigned int hwc[], unsigned long mmcr[], @@ -267,6 +289,8 @@ int isa207_compute_mmcr(u64 event[], int n_ev, void isa207_disable_pmc(unsigned int pmc, unsigned long mmcr[]); int isa207_get_alternatives(u64 event, u64 alt[], const unsigned int ev_alt[][MAX_ALT], int size); - +void isa207_get_mem_data_src(union perf_mem_data_src *dsrc, u32 flags, + struct pt_regs *regs); +void isa207_get_mem_weight(u64 *weight); #endif diff --git a/arch/powerpc/perf/power8-events-list.h b/arch/powerpc/perf/power8-events-list.h index 3a2e6e8ebb92..0f1d184627cc 100644 --- a/arch/powerpc/perf/power8-events-list.h +++ b/arch/powerpc/perf/power8-events-list.h @@ -89,3 +89,9 @@ EVENT(PM_MRK_FILT_MATCH, 0x2013c) EVENT(PM_MRK_FILT_MATCH_ALT, 0x3012e) /* Alternate event code for PM_LD_MISS_L1 */ EVENT(PM_LD_MISS_L1_ALT, 0x400f0) +/* + * Memory Access Event -- mem_access + * Primary PMU event used here is PM_MRK_INST_CMPL, along with + * Random Load/Store Facility Sampling (RIS) in Random sampling mode (MMCRA[SM]). + */ +EVENT(MEM_ACCESS, 0x10401e0) diff --git a/arch/powerpc/perf/power8-pmu.c b/arch/powerpc/perf/power8-pmu.c index ce15b19a7962..5463516e369b 100644 --- a/arch/powerpc/perf/power8-pmu.c +++ b/arch/powerpc/perf/power8-pmu.c @@ -90,6 +90,7 @@ GENERIC_EVENT_ATTR(branch-instructions, PM_BRU_FIN); GENERIC_EVENT_ATTR(branch-misses, PM_BR_MPRED_CMPL); GENERIC_EVENT_ATTR(cache-references, PM_LD_REF_L1); GENERIC_EVENT_ATTR(cache-misses, PM_LD_MISS_L1); +GENERIC_EVENT_ATTR(mem_access, MEM_ACCESS); CACHE_EVENT_ATTR(L1-dcache-load-misses, PM_LD_MISS_L1); CACHE_EVENT_ATTR(L1-dcache-loads, PM_LD_REF_L1); @@ -120,6 +121,7 @@ static struct attribute *power8_events_attr[] = { GENERIC_EVENT_PTR(PM_BR_MPRED_CMPL), GENERIC_EVENT_PTR(PM_LD_REF_L1), GENERIC_EVENT_PTR(PM_LD_MISS_L1), + GENERIC_EVENT_PTR(MEM_ACCESS), CACHE_EVENT_PTR(PM_LD_MISS_L1), CACHE_EVENT_PTR(PM_LD_REF_L1), @@ -325,6 +327,8 @@ static struct power_pmu power8_pmu = { .bhrb_filter_map = power8_bhrb_filter_map, .get_constraint = isa207_get_constraint, .get_alternatives = power8_get_alternatives, + .get_mem_data_src = isa207_get_mem_data_src, + .get_mem_weight = isa207_get_mem_weight, .disable_pmc = isa207_disable_pmc, .flags = PPMU_HAS_SIER | PPMU_ARCH_207S, .n_generic = ARRAY_SIZE(power8_generic_events), diff --git a/arch/powerpc/perf/power9-pmu.c b/arch/powerpc/perf/power9-pmu.c index 7f6582708e06..018f8e90ac35 100644 --- a/arch/powerpc/perf/power9-pmu.c +++ b/arch/powerpc/perf/power9-pmu.c @@ -427,6 +427,8 @@ static struct power_pmu power9_pmu = { .bhrb_filter_map = power9_bhrb_filter_map, .get_constraint = isa207_get_constraint, .get_alternatives = power9_get_alternatives, + .get_mem_data_src = isa207_get_mem_data_src, + .get_mem_weight = isa207_get_mem_weight, .disable_pmc = isa207_disable_pmc, .flags = PPMU_HAS_SIER | PPMU_ARCH_207S, .n_generic = ARRAY_SIZE(power9_generic_events), diff --git a/arch/powerpc/platforms/44x/sam440ep.c b/arch/powerpc/platforms/44x/sam440ep.c index 688ffeab0699..55fed5e4de14 100644 --- a/arch/powerpc/platforms/44x/sam440ep.c +++ b/arch/powerpc/platforms/44x/sam440ep.c @@ -70,7 +70,7 @@ static struct i2c_board_info sam440ep_rtc_info = { .irq = -1, }; -static int sam440ep_setup_rtc(void) +static int __init sam440ep_setup_rtc(void) { return i2c_register_board_info(0, &sam440ep_rtc_info, 1); } diff --git a/arch/powerpc/platforms/85xx/smp.c b/arch/powerpc/platforms/85xx/smp.c index 078097a0b09d..0975066f76e8 100644 --- a/arch/powerpc/platforms/85xx/smp.c +++ b/arch/powerpc/platforms/85xx/smp.c @@ -461,16 +461,9 @@ static void mpc85xx_smp_machine_kexec(struct kimage *image) } #endif /* CONFIG_KEXEC_CORE */ -static void smp_85xx_basic_setup(int cpu_nr) -{ - if (cpu_has_feature(CPU_FTR_DBELL)) - doorbell_setup_this_cpu(); -} - static void smp_85xx_setup_cpu(int cpu_nr) { mpic_setup_this_cpu(); - smp_85xx_basic_setup(cpu_nr); } void __init mpc85xx_smp_init(void) @@ -484,7 +477,7 @@ void __init mpc85xx_smp_init(void) smp_85xx_ops.setup_cpu = smp_85xx_setup_cpu; smp_85xx_ops.message_pass = smp_mpic_message_pass; } else - smp_85xx_ops.setup_cpu = smp_85xx_basic_setup; + smp_85xx_ops.setup_cpu = NULL; if (cpu_has_feature(CPU_FTR_DBELL)) { /* @@ -492,7 +485,7 @@ void __init mpc85xx_smp_init(void) * smp_muxed_ipi_message_pass */ smp_85xx_ops.message_pass = NULL; - smp_85xx_ops.cause_ipi = doorbell_cause_ipi; + smp_85xx_ops.cause_ipi = doorbell_global_ipi; smp_85xx_ops.probe = NULL; } diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype index 99b0ae8acb78..684e886eaae4 100644 --- a/arch/powerpc/platforms/Kconfig.cputype +++ b/arch/powerpc/platforms/Kconfig.cputype @@ -279,7 +279,8 @@ config PPC_ICSWX This option enables kernel support for the PowerPC Initiate Coprocessor Store Word (icswx) coprocessor instruction on POWER7 - or newer processors. + and POWER8 processors. POWER9 uses new copy/paste instructions + to invoke the coprocessor. This option is only useful if you have a processor that supports the icswx coprocessor instruction. It does not have any effect @@ -359,7 +360,7 @@ config PPC_BOOK3E_MMU config PPC_MM_SLICES bool - default y if (!PPC_FSL_BOOK3E && PPC64 && HUGETLB_PAGE) || (PPC_STD_MMU_64 && PPC_64K_PAGES) + default y if PPC_STD_MMU_64 default n config PPC_HAVE_PMU_SUPPORT @@ -371,9 +372,16 @@ config PPC_PERF_CTRS help This enables the powerpc-specific perf_event back-end. +config FORCE_SMP + # Allow platforms to force SMP=y by selecting this + bool + default n + select SMP + config SMP depends on PPC_BOOK3S || PPC_BOOK3E || FSL_BOOKE || PPC_47x - bool "Symmetric multi-processing support" + select GENERIC_IRQ_MIGRATION + bool "Symmetric multi-processing support" if !FORCE_SMP ---help--- This enables support for systems with more than one CPU. If you have a system with only one CPU, say N. If you have a system with more diff --git a/arch/powerpc/platforms/cell/axon_msi.c b/arch/powerpc/platforms/cell/axon_msi.c index 8b55c5f19d4c..8d3ae2cc52bf 100644 --- a/arch/powerpc/platforms/cell/axon_msi.c +++ b/arch/powerpc/platforms/cell/axon_msi.c @@ -15,9 +15,9 @@ #include <linux/msi.h> #include <linux/export.h> #include <linux/of_platform.h> -#include <linux/debugfs.h> #include <linux/slab.h> +#include <asm/debugfs.h> #include <asm/dcr.h> #include <asm/machdep.h> #include <asm/prom.h> diff --git a/arch/powerpc/platforms/powermac/smp.c b/arch/powerpc/platforms/powermac/smp.c index 746ca7321b03..a3207cea360e 100644 --- a/arch/powerpc/platforms/powermac/smp.c +++ b/arch/powerpc/platforms/powermac/smp.c @@ -172,7 +172,7 @@ static irqreturn_t psurge_ipi_intr(int irq, void *d) return IRQ_HANDLED; } -static void smp_psurge_cause_ipi(int cpu, unsigned long data) +static void smp_psurge_cause_ipi(int cpu) { psurge_set_ipi(cpu); } diff --git a/arch/powerpc/platforms/powernv/Kconfig b/arch/powerpc/platforms/powernv/Kconfig index 3a07e4dcf97c..6a6f4ef46b9e 100644 --- a/arch/powerpc/platforms/powernv/Kconfig +++ b/arch/powerpc/platforms/powernv/Kconfig @@ -4,6 +4,7 @@ config PPC_POWERNV select PPC_NATIVE select PPC_XICS select PPC_ICP_NATIVE + select PPC_XIVE_NATIVE select PPC_P7_NAP select PCI select PCI_MSI @@ -19,6 +20,8 @@ config PPC_POWERNV select CPU_FREQ_GOV_ONDEMAND select CPU_FREQ_GOV_CONSERVATIVE select PPC_DOORBELL + select MMU_NOTIFIER + select FORCE_SMP default y config OPAL_PRD diff --git a/arch/powerpc/platforms/powernv/idle.c b/arch/powerpc/platforms/powernv/idle.c index 4ee837e6391a..445f30a2c5ef 100644 --- a/arch/powerpc/platforms/powernv/idle.c +++ b/arch/powerpc/platforms/powernv/idle.c @@ -53,19 +53,6 @@ static int pnv_save_sprs_for_deep_states(void) uint64_t pir = get_hard_smp_processor_id(cpu); uint64_t hsprg0_val = (uint64_t)&paca[cpu]; - if (!cpu_has_feature(CPU_FTR_ARCH_300)) { - /* - * HSPRG0 is used to store the cpu's pointer to paca. - * Hence last 3 bits are guaranteed to be 0. Program - * slw to restore HSPRG0 with 63rd bit set, so that - * when a thread wakes up at 0x100 we can use this bit - * to distinguish between fastsleep and deep winkle. - * This is not necessary with stop/psscr since PLS - * field of psscr indicates which state we are waking - * up from. - */ - hsprg0_val |= 1; - } rc = opal_slw_set_reg(pir, SPRN_HSPRG0, hsprg0_val); if (rc != 0) return rc; @@ -122,9 +109,12 @@ static void pnv_alloc_idle_core_states(void) for (i = 0; i < nr_cores; i++) { int first_cpu = i * threads_per_core; int node = cpu_to_node(first_cpu); + size_t paca_ptr_array_size; core_idle_state = kmalloc_node(sizeof(u32), GFP_KERNEL, node); *core_idle_state = PNV_CORE_IDLE_THREAD_BITS; + paca_ptr_array_size = (threads_per_core * + sizeof(struct paca_struct *)); for (j = 0; j < threads_per_core; j++) { int cpu = first_cpu + j; @@ -132,6 +122,11 @@ static void pnv_alloc_idle_core_states(void) paca[cpu].core_idle_state_ptr = core_idle_state; paca[cpu].thread_idle_state = PNV_THREAD_RUNNING; paca[cpu].thread_mask = 1 << j; + if (!cpu_has_feature(CPU_FTR_POWER9_DD1)) + continue; + paca[cpu].thread_sibling_pacas = + kmalloc_node(paca_ptr_array_size, + GFP_KERNEL, node); } } @@ -147,7 +142,6 @@ u32 pnv_get_supported_cpuidle_states(void) } EXPORT_SYMBOL_GPL(pnv_get_supported_cpuidle_states); - static void pnv_fastsleep_workaround_apply(void *info) { @@ -241,8 +235,9 @@ static DEVICE_ATTR(fastsleep_workaround_applyonce, 0600, * The default stop state that will be used by ppc_md.power_save * function on platforms that support stop instruction. */ -u64 pnv_default_stop_val; -u64 pnv_default_stop_mask; +static u64 pnv_default_stop_val; +static u64 pnv_default_stop_mask; +static bool default_stop_found; /* * Used for ppc_md.power_save which needs a function with no parameters @@ -262,8 +257,42 @@ u64 pnv_first_deep_stop_state = MAX_STOP_STATE; * psscr value and mask of the deepest stop idle state. * Used when a cpu is offlined. */ -u64 pnv_deepest_stop_psscr_val; -u64 pnv_deepest_stop_psscr_mask; +static u64 pnv_deepest_stop_psscr_val; +static u64 pnv_deepest_stop_psscr_mask; +static bool deepest_stop_found; + +/* + * pnv_cpu_offline: A function that puts the CPU into the deepest + * available platform idle state on a CPU-Offline. + */ +unsigned long pnv_cpu_offline(unsigned int cpu) +{ + unsigned long srr1; + + u32 idle_states = pnv_get_supported_cpuidle_states(); + + if (cpu_has_feature(CPU_FTR_ARCH_300) && deepest_stop_found) { + srr1 = power9_idle_stop(pnv_deepest_stop_psscr_val, + pnv_deepest_stop_psscr_mask); + } else if (idle_states & OPAL_PM_WINKLE_ENABLED) { + srr1 = power7_winkle(); + } else if ((idle_states & OPAL_PM_SLEEP_ENABLED) || + (idle_states & OPAL_PM_SLEEP_ENABLED_ER1)) { + srr1 = power7_sleep(); + } else if (idle_states & OPAL_PM_NAP_ENABLED) { + srr1 = power7_nap(1); + } else { + /* This is the fallback method. We emulate snooze */ + while (!generic_check_cpu_restart(cpu)) { + HMT_low(); + HMT_very_low(); + } + srr1 = 0; + HMT_medium(); + } + + return srr1; +} /* * Power ISA 3.0 idle initialization. @@ -352,7 +381,6 @@ static int __init pnv_power9_idle_init(struct device_node *np, u32 *flags, u32 *residency_ns = NULL; u64 max_residency_ns = 0; int rc = 0, i; - bool default_stop_found = false, deepest_stop_found = false; psscr_val = kcalloc(dt_idle_states, sizeof(*psscr_val), GFP_KERNEL); psscr_mask = kcalloc(dt_idle_states, sizeof(*psscr_mask), GFP_KERNEL); @@ -432,21 +460,24 @@ static int __init pnv_power9_idle_init(struct device_node *np, u32 *flags, } } - if (!default_stop_found) { - pnv_default_stop_val = PSSCR_HV_DEFAULT_VAL; - pnv_default_stop_mask = PSSCR_HV_DEFAULT_MASK; - pr_warn("Setting default stop psscr val=0x%016llx,mask=0x%016llx\n", + if (unlikely(!default_stop_found)) { + pr_warn("cpuidle-powernv: No suitable default stop state found. Disabling platform idle.\n"); + } else { + ppc_md.power_save = power9_idle; + pr_info("cpuidle-powernv: Default stop: psscr = 0x%016llx,mask=0x%016llx\n", pnv_default_stop_val, pnv_default_stop_mask); } - if (!deepest_stop_found) { - pnv_deepest_stop_psscr_val = PSSCR_HV_DEFAULT_VAL; - pnv_deepest_stop_psscr_mask = PSSCR_HV_DEFAULT_MASK; - pr_warn("Setting default stop psscr val=0x%016llx,mask=0x%016llx\n", + if (unlikely(!deepest_stop_found)) { + pr_warn("cpuidle-powernv: No suitable stop state for CPU-Hotplug. Offlined CPUs will busy wait"); + } else { + pr_info("cpuidle-powernv: Deepest stop: psscr = 0x%016llx,mask=0x%016llx\n", pnv_deepest_stop_psscr_val, pnv_deepest_stop_psscr_mask); } + pr_info("cpuidle-powernv: Requested Level (RL) value of first deep stop = 0x%llx\n", + pnv_first_deep_stop_state); out: kfree(psscr_val); kfree(psscr_mask); @@ -524,10 +555,30 @@ static int __init pnv_init_idle_states(void) pnv_alloc_idle_core_states(); + /* + * For each CPU, record its PACA address in each of it's + * sibling thread's PACA at the slot corresponding to this + * CPU's index in the core. + */ + if (cpu_has_feature(CPU_FTR_POWER9_DD1)) { + int cpu; + + pr_info("powernv: idle: Saving PACA pointers of all CPUs in their thread sibling PACA\n"); + for_each_possible_cpu(cpu) { + int base_cpu = cpu_first_thread_sibling(cpu); + int idx = cpu_thread_in_core(cpu); + int i; + + for (i = 0; i < threads_per_core; i++) { + int j = base_cpu + i; + + paca[j].thread_sibling_pacas[idx] = &paca[cpu]; + } + } + } + if (supported_cpuidle_states & OPAL_PM_NAP_ENABLED) ppc_md.power_save = power7_idle; - else if (supported_cpuidle_states & OPAL_PM_STOP_INST_FAST) - ppc_md.power_save = power9_idle; out: return 0; diff --git a/arch/powerpc/platforms/powernv/npu-dma.c b/arch/powerpc/platforms/powernv/npu-dma.c index 1c383f38031d..4c88c3e6ec9e 100644 --- a/arch/powerpc/platforms/powernv/npu-dma.c +++ b/arch/powerpc/platforms/powernv/npu-dma.c @@ -9,11 +9,20 @@ * License as published by the Free Software Foundation. */ +#include <linux/slab.h> +#include <linux/mmu_notifier.h> +#include <linux/mmu_context.h> +#include <linux/of.h> #include <linux/export.h> #include <linux/pci.h> #include <linux/memblock.h> #include <linux/iommu.h> +#include <asm/tlb.h> +#include <asm/powernv.h> +#include <asm/reg.h> +#include <asm/opal.h> +#include <asm/io.h> #include <asm/iommu.h> #include <asm/pnv-pci.h> #include <asm/msi_bitmap.h> @@ -22,6 +31,8 @@ #include "powernv.h" #include "pci.h" +#define npu_to_phb(x) container_of(x, struct pnv_phb, npu) + /* * Other types of TCE cache invalidation are not functional in the * hardware. @@ -37,6 +48,12 @@ struct pci_dev *pnv_pci_get_gpu_dev(struct pci_dev *npdev) struct device_node *dn; struct pci_dev *gpdev; + if (WARN_ON(!npdev)) + return NULL; + + if (WARN_ON(!npdev->dev.of_node)) + return NULL; + /* Get assoicated PCI device */ dn = of_parse_phandle(npdev->dev.of_node, "ibm,gpu", 0); if (!dn) @@ -55,6 +72,12 @@ struct pci_dev *pnv_pci_get_npu_dev(struct pci_dev *gpdev, int index) struct device_node *dn; struct pci_dev *npdev; + if (WARN_ON(!gpdev)) + return NULL; + + if (WARN_ON(!gpdev->dev.of_node)) + return NULL; + /* Get assoicated PCI device */ dn = of_parse_phandle(gpdev->dev.of_node, "ibm,npu", index); if (!dn) @@ -359,3 +382,442 @@ struct pnv_ioda_pe *pnv_pci_npu_setup_iommu(struct pnv_ioda_pe *npe) return gpe; } + +/* Maximum number of nvlinks per npu */ +#define NV_MAX_LINKS 6 + +/* Maximum index of npu2 hosts in the system. Always < NV_MAX_NPUS */ +static int max_npu2_index; + +struct npu_context { + struct mm_struct *mm; + struct pci_dev *npdev[NV_MAX_NPUS][NV_MAX_LINKS]; + struct mmu_notifier mn; + struct kref kref; + + /* Callback to stop translation requests on a given GPU */ + struct npu_context *(*release_cb)(struct npu_context *, void *); + + /* + * Private pointer passed to the above callback for usage by + * device drivers. + */ + void *priv; +}; + +/* + * Find a free MMIO ATSD register and mark it in use. Return -ENOSPC + * if none are available. + */ +static int get_mmio_atsd_reg(struct npu *npu) +{ + int i; + + for (i = 0; i < npu->mmio_atsd_count; i++) { + if (!test_and_set_bit(i, &npu->mmio_atsd_usage)) + return i; + } + + return -ENOSPC; +} + +static void put_mmio_atsd_reg(struct npu *npu, int reg) +{ + clear_bit(reg, &npu->mmio_atsd_usage); +} + +/* MMIO ATSD register offsets */ +#define XTS_ATSD_AVA 1 +#define XTS_ATSD_STAT 2 + +static int mmio_launch_invalidate(struct npu *npu, unsigned long launch, + unsigned long va) +{ + int mmio_atsd_reg; + + do { + mmio_atsd_reg = get_mmio_atsd_reg(npu); + cpu_relax(); + } while (mmio_atsd_reg < 0); + + __raw_writeq(cpu_to_be64(va), + npu->mmio_atsd_regs[mmio_atsd_reg] + XTS_ATSD_AVA); + eieio(); + __raw_writeq(cpu_to_be64(launch), npu->mmio_atsd_regs[mmio_atsd_reg]); + + return mmio_atsd_reg; +} + +static int mmio_invalidate_pid(struct npu *npu, unsigned long pid) +{ + unsigned long launch; + + /* IS set to invalidate matching PID */ + launch = PPC_BIT(12); + + /* PRS set to process-scoped */ + launch |= PPC_BIT(13); + + /* AP */ + launch |= (u64) mmu_get_ap(mmu_virtual_psize) << PPC_BITLSHIFT(17); + + /* PID */ + launch |= pid << PPC_BITLSHIFT(38); + + /* Invalidating the entire process doesn't use a va */ + return mmio_launch_invalidate(npu, launch, 0); +} + +static int mmio_invalidate_va(struct npu *npu, unsigned long va, + unsigned long pid) +{ + unsigned long launch; + + /* IS set to invalidate target VA */ + launch = 0; + + /* PRS set to process scoped */ + launch |= PPC_BIT(13); + + /* AP */ + launch |= (u64) mmu_get_ap(mmu_virtual_psize) << PPC_BITLSHIFT(17); + + /* PID */ + launch |= pid << PPC_BITLSHIFT(38); + + return mmio_launch_invalidate(npu, launch, va); +} + +#define mn_to_npu_context(x) container_of(x, struct npu_context, mn) + +/* + * Invalidate either a single address or an entire PID depending on + * the value of va. + */ +static void mmio_invalidate(struct npu_context *npu_context, int va, + unsigned long address) +{ + int i, j, reg; + struct npu *npu; + struct pnv_phb *nphb; + struct pci_dev *npdev; + struct { + struct npu *npu; + int reg; + } mmio_atsd_reg[NV_MAX_NPUS]; + unsigned long pid = npu_context->mm->context.id; + + /* + * Loop over all the NPUs this process is active on and launch + * an invalidate. + */ + for (i = 0; i <= max_npu2_index; i++) { + mmio_atsd_reg[i].reg = -1; + for (j = 0; j < NV_MAX_LINKS; j++) { + npdev = npu_context->npdev[i][j]; + if (!npdev) + continue; + + nphb = pci_bus_to_host(npdev->bus)->private_data; + npu = &nphb->npu; + mmio_atsd_reg[i].npu = npu; + + if (va) + mmio_atsd_reg[i].reg = + mmio_invalidate_va(npu, address, pid); + else + mmio_atsd_reg[i].reg = + mmio_invalidate_pid(npu, pid); + + /* + * The NPU hardware forwards the shootdown to all GPUs + * so we only have to launch one shootdown per NPU. + */ + break; + } + } + + /* + * Unfortunately the nest mmu does not support flushing specific + * addresses so we have to flush the whole mm. + */ + flush_tlb_mm(npu_context->mm); + + /* Wait for all invalidations to complete */ + for (i = 0; i <= max_npu2_index; i++) { + if (mmio_atsd_reg[i].reg < 0) + continue; + + /* Wait for completion */ + npu = mmio_atsd_reg[i].npu; + reg = mmio_atsd_reg[i].reg; + while (__raw_readq(npu->mmio_atsd_regs[reg] + XTS_ATSD_STAT)) + cpu_relax(); + put_mmio_atsd_reg(npu, reg); + } +} + +static void pnv_npu2_mn_release(struct mmu_notifier *mn, + struct mm_struct *mm) +{ + struct npu_context *npu_context = mn_to_npu_context(mn); + + /* Call into device driver to stop requests to the NMMU */ + if (npu_context->release_cb) + npu_context->release_cb(npu_context, npu_context->priv); + + /* + * There should be no more translation requests for this PID, but we + * need to ensure any entries for it are removed from the TLB. + */ + mmio_invalidate(npu_context, 0, 0); +} + +static void pnv_npu2_mn_change_pte(struct mmu_notifier *mn, + struct mm_struct *mm, + unsigned long address, + pte_t pte) +{ + struct npu_context *npu_context = mn_to_npu_context(mn); + + mmio_invalidate(npu_context, 1, address); +} + +static void pnv_npu2_mn_invalidate_page(struct mmu_notifier *mn, + struct mm_struct *mm, + unsigned long address) +{ + struct npu_context *npu_context = mn_to_npu_context(mn); + + mmio_invalidate(npu_context, 1, address); +} + +static void pnv_npu2_mn_invalidate_range(struct mmu_notifier *mn, + struct mm_struct *mm, + unsigned long start, unsigned long end) +{ + struct npu_context *npu_context = mn_to_npu_context(mn); + unsigned long address; + + for (address = start; address <= end; address += PAGE_SIZE) + mmio_invalidate(npu_context, 1, address); +} + +static const struct mmu_notifier_ops nv_nmmu_notifier_ops = { + .release = pnv_npu2_mn_release, + .change_pte = pnv_npu2_mn_change_pte, + .invalidate_page = pnv_npu2_mn_invalidate_page, + .invalidate_range = pnv_npu2_mn_invalidate_range, +}; + +/* + * Call into OPAL to setup the nmmu context for the current task in + * the NPU. This must be called to setup the context tables before the + * GPU issues ATRs. pdev should be a pointed to PCIe GPU device. + * + * A release callback should be registered to allow a device driver to + * be notified that it should not launch any new translation requests + * as the final TLB invalidate is about to occur. + * + * Returns an error if there no contexts are currently available or a + * npu_context which should be passed to pnv_npu2_handle_fault(). + * + * mmap_sem must be held in write mode. + */ +struct npu_context *pnv_npu2_init_context(struct pci_dev *gpdev, + unsigned long flags, + struct npu_context *(*cb)(struct npu_context *, void *), + void *priv) +{ + int rc; + u32 nvlink_index; + struct device_node *nvlink_dn; + struct mm_struct *mm = current->mm; + struct pnv_phb *nphb; + struct npu *npu; + struct npu_context *npu_context; + + /* + * At present we don't support GPUs connected to multiple NPUs and I'm + * not sure the hardware does either. + */ + struct pci_dev *npdev = pnv_pci_get_npu_dev(gpdev, 0); + + if (!firmware_has_feature(FW_FEATURE_OPAL)) + return ERR_PTR(-ENODEV); + + if (!npdev) + /* No nvlink associated with this GPU device */ + return ERR_PTR(-ENODEV); + + if (!mm) { + /* kernel thread contexts are not supported */ + return ERR_PTR(-EINVAL); + } + + nphb = pci_bus_to_host(npdev->bus)->private_data; + npu = &nphb->npu; + + /* + * Setup the NPU context table for a particular GPU. These need to be + * per-GPU as we need the tables to filter ATSDs when there are no + * active contexts on a particular GPU. + */ + rc = opal_npu_init_context(nphb->opal_id, mm->context.id, flags, + PCI_DEVID(gpdev->bus->number, gpdev->devfn)); + if (rc < 0) + return ERR_PTR(-ENOSPC); + + /* + * We store the npu pci device so we can more easily get at the + * associated npus. + */ + npu_context = mm->context.npu_context; + if (!npu_context) { + npu_context = kzalloc(sizeof(struct npu_context), GFP_KERNEL); + if (!npu_context) + return ERR_PTR(-ENOMEM); + + mm->context.npu_context = npu_context; + npu_context->mm = mm; + npu_context->mn.ops = &nv_nmmu_notifier_ops; + __mmu_notifier_register(&npu_context->mn, mm); + kref_init(&npu_context->kref); + } else { + kref_get(&npu_context->kref); + } + + npu_context->release_cb = cb; + npu_context->priv = priv; + nvlink_dn = of_parse_phandle(npdev->dev.of_node, "ibm,nvlink", 0); + if (WARN_ON(of_property_read_u32(nvlink_dn, "ibm,npu-link-index", + &nvlink_index))) + return ERR_PTR(-ENODEV); + npu_context->npdev[npu->index][nvlink_index] = npdev; + + return npu_context; +} +EXPORT_SYMBOL(pnv_npu2_init_context); + +static void pnv_npu2_release_context(struct kref *kref) +{ + struct npu_context *npu_context = + container_of(kref, struct npu_context, kref); + + npu_context->mm->context.npu_context = NULL; + mmu_notifier_unregister(&npu_context->mn, + npu_context->mm); + + kfree(npu_context); +} + +void pnv_npu2_destroy_context(struct npu_context *npu_context, + struct pci_dev *gpdev) +{ + struct pnv_phb *nphb, *phb; + struct npu *npu; + struct pci_dev *npdev = pnv_pci_get_npu_dev(gpdev, 0); + struct device_node *nvlink_dn; + u32 nvlink_index; + + if (WARN_ON(!npdev)) + return; + + if (!firmware_has_feature(FW_FEATURE_OPAL)) + return; + + nphb = pci_bus_to_host(npdev->bus)->private_data; + npu = &nphb->npu; + phb = pci_bus_to_host(gpdev->bus)->private_data; + nvlink_dn = of_parse_phandle(npdev->dev.of_node, "ibm,nvlink", 0); + if (WARN_ON(of_property_read_u32(nvlink_dn, "ibm,npu-link-index", + &nvlink_index))) + return; + npu_context->npdev[npu->index][nvlink_index] = NULL; + opal_npu_destroy_context(phb->opal_id, npu_context->mm->context.id, + PCI_DEVID(gpdev->bus->number, gpdev->devfn)); + kref_put(&npu_context->kref, pnv_npu2_release_context); +} +EXPORT_SYMBOL(pnv_npu2_destroy_context); + +/* + * Assumes mmap_sem is held for the contexts associated mm. + */ +int pnv_npu2_handle_fault(struct npu_context *context, uintptr_t *ea, + unsigned long *flags, unsigned long *status, int count) +{ + u64 rc = 0, result = 0; + int i, is_write; + struct page *page[1]; + + /* mmap_sem should be held so the struct_mm must be present */ + struct mm_struct *mm = context->mm; + + if (!firmware_has_feature(FW_FEATURE_OPAL)) + return -ENODEV; + + WARN_ON(!rwsem_is_locked(&mm->mmap_sem)); + + for (i = 0; i < count; i++) { + is_write = flags[i] & NPU2_WRITE; + rc = get_user_pages_remote(NULL, mm, ea[i], 1, + is_write ? FOLL_WRITE : 0, + page, NULL, NULL); + + /* + * To support virtualised environments we will have to do an + * access to the page to ensure it gets faulted into the + * hypervisor. For the moment virtualisation is not supported in + * other areas so leave the access out. + */ + if (rc != 1) { + status[i] = rc; + result = -EFAULT; + continue; + } + + status[i] = 0; + put_page(page[0]); + } + + return result; +} +EXPORT_SYMBOL(pnv_npu2_handle_fault); + +int pnv_npu2_init(struct pnv_phb *phb) +{ + unsigned int i; + u64 mmio_atsd; + struct device_node *dn; + struct pci_dev *gpdev; + static int npu_index; + uint64_t rc = 0; + + for_each_child_of_node(phb->hose->dn, dn) { + gpdev = pnv_pci_get_gpu_dev(get_pci_dev(dn)); + if (gpdev) { + rc = opal_npu_map_lpar(phb->opal_id, + PCI_DEVID(gpdev->bus->number, gpdev->devfn), + 0, 0); + if (rc) + dev_err(&gpdev->dev, + "Error %lld mapping device to LPAR\n", + rc); + } + } + + for (i = 0; !of_property_read_u64_index(phb->hose->dn, "ibm,mmio-atsd", + i, &mmio_atsd); i++) + phb->npu.mmio_atsd_regs[i] = ioremap(mmio_atsd, 32); + + pr_info("NPU%lld: Found %d MMIO ATSD registers", phb->opal_id, i); + phb->npu.mmio_atsd_count = i; + phb->npu.mmio_atsd_usage = 0; + npu_index++; + if (WARN_ON(npu_index >= NV_MAX_NPUS)) + return -ENOSPC; + max_npu2_index = npu_index; + phb->npu.index = npu_index; + + return 0; +} diff --git a/arch/powerpc/platforms/powernv/opal-lpc.c b/arch/powerpc/platforms/powernv/opal-lpc.c index a91d7876fae2..6c7ad1d8b32e 100644 --- a/arch/powerpc/platforms/powernv/opal-lpc.c +++ b/arch/powerpc/platforms/powernv/opal-lpc.c @@ -12,7 +12,6 @@ #include <linux/kernel.h> #include <linux/of.h> #include <linux/bug.h> -#include <linux/debugfs.h> #include <linux/io.h> #include <linux/slab.h> @@ -21,7 +20,7 @@ #include <asm/opal.h> #include <asm/prom.h> #include <linux/uaccess.h> -#include <asm/debug.h> +#include <asm/debugfs.h> #include <asm/isa-bridge.h> static int opal_lpc_chip_id = -1; diff --git a/arch/powerpc/platforms/powernv/opal-sensor.c b/arch/powerpc/platforms/powernv/opal-sensor.c index 308efd170c27..aa267f120033 100644 --- a/arch/powerpc/platforms/powernv/opal-sensor.c +++ b/arch/powerpc/platforms/powernv/opal-sensor.c @@ -64,6 +64,10 @@ int opal_get_sensor_data(u32 sensor_hndl, u32 *sensor_data) *sensor_data = be32_to_cpu(data); break; + case OPAL_WRONG_STATE: + ret = -EIO; + break; + default: ret = opal_error_code(ret); break; diff --git a/arch/powerpc/platforms/powernv/opal-wrappers.S b/arch/powerpc/platforms/powernv/opal-wrappers.S index da8a0f7a035c..f620572f891f 100644 --- a/arch/powerpc/platforms/powernv/opal-wrappers.S +++ b/arch/powerpc/platforms/powernv/opal-wrappers.S @@ -50,21 +50,13 @@ END_FTR_SECTION(0, 1); \ #define OPAL_BRANCH(LABEL) #endif -/* TODO: - * - * - Trace irqs in/off (needs saving/restoring all args, argh...) - * - Get r11 feed up by Dave so I can have better register usage +/* + * DO_OPAL_CALL assumes: + * r0 = opal call token + * r12 = msr + * LR has been saved */ - -#define OPAL_CALL(name, token) \ - _GLOBAL_TOC(name); \ - mfmsr r12; \ - mflr r0; \ - andi. r11,r12,MSR_IR|MSR_DR; \ - std r0,PPC_LR_STKOFF(r1); \ - li r0,token; \ - beq opal_real_call; \ - OPAL_BRANCH(opal_tracepoint_entry) \ +#define DO_OPAL_CALL() \ mfcr r11; \ stw r11,8(r1); \ li r11,0; \ @@ -83,6 +75,18 @@ END_FTR_SECTION(0, 1); \ mtspr SPRN_HSRR0,r12; \ hrfid +#define OPAL_CALL(name, token) \ + _GLOBAL_TOC(name); \ + mfmsr r12; \ + mflr r0; \ + andi. r11,r12,MSR_IR|MSR_DR; \ + std r0,PPC_LR_STKOFF(r1); \ + li r0,token; \ + beq opal_real_call; \ + OPAL_BRANCH(opal_tracepoint_entry) \ + DO_OPAL_CALL() + + opal_return: /* * Fixup endian on OPAL return... we should be able to simplify @@ -148,26 +152,13 @@ opal_tracepoint_entry: ld r8,STK_REG(R29)(r1) ld r9,STK_REG(R30)(r1) ld r10,STK_REG(R31)(r1) + + /* setup LR so we return via tracepoint_return */ LOAD_REG_ADDR(r11,opal_tracepoint_return) - mfcr r12 std r11,16(r1) - stw r12,8(r1) - li r11,0 + mfmsr r12 - ori r11,r11,MSR_EE - std r12,PACASAVEDMSR(r13) - andc r12,r12,r11 - mtmsrd r12,1 - LOAD_REG_ADDR(r11,opal_return) - mtlr r11 - li r11,MSR_DR|MSR_IR|MSR_LE - andc r12,r12,r11 - mtspr SPRN_HSRR1,r12 - LOAD_REG_ADDR(r11,opal) - ld r12,8(r11) - ld r2,0(r11) - mtspr SPRN_HSRR0,r12 - hrfid + DO_OPAL_CALL() opal_tracepoint_return: std r3,STK_REG(R31)(r1) @@ -301,3 +292,21 @@ OPAL_CALL(opal_int_eoi, OPAL_INT_EOI); OPAL_CALL(opal_int_set_mfrr, OPAL_INT_SET_MFRR); OPAL_CALL(opal_pci_tce_kill, OPAL_PCI_TCE_KILL); OPAL_CALL(opal_nmmu_set_ptcr, OPAL_NMMU_SET_PTCR); +OPAL_CALL(opal_xive_reset, OPAL_XIVE_RESET); +OPAL_CALL(opal_xive_get_irq_info, OPAL_XIVE_GET_IRQ_INFO); +OPAL_CALL(opal_xive_get_irq_config, OPAL_XIVE_GET_IRQ_CONFIG); +OPAL_CALL(opal_xive_set_irq_config, OPAL_XIVE_SET_IRQ_CONFIG); +OPAL_CALL(opal_xive_get_queue_info, OPAL_XIVE_GET_QUEUE_INFO); +OPAL_CALL(opal_xive_set_queue_info, OPAL_XIVE_SET_QUEUE_INFO); +OPAL_CALL(opal_xive_donate_page, OPAL_XIVE_DONATE_PAGE); +OPAL_CALL(opal_xive_alloc_vp_block, OPAL_XIVE_ALLOCATE_VP_BLOCK); +OPAL_CALL(opal_xive_free_vp_block, OPAL_XIVE_FREE_VP_BLOCK); +OPAL_CALL(opal_xive_allocate_irq, OPAL_XIVE_ALLOCATE_IRQ); +OPAL_CALL(opal_xive_free_irq, OPAL_XIVE_FREE_IRQ); +OPAL_CALL(opal_xive_get_vp_info, OPAL_XIVE_GET_VP_INFO); +OPAL_CALL(opal_xive_set_vp_info, OPAL_XIVE_SET_VP_INFO); +OPAL_CALL(opal_xive_sync, OPAL_XIVE_SYNC); +OPAL_CALL(opal_xive_dump, OPAL_XIVE_DUMP); +OPAL_CALL(opal_npu_init_context, OPAL_NPU_INIT_CONTEXT); +OPAL_CALL(opal_npu_destroy_context, OPAL_NPU_DESTROY_CONTEXT); +OPAL_CALL(opal_npu_map_lpar, OPAL_NPU_MAP_LPAR); diff --git a/arch/powerpc/platforms/powernv/opal-xscom.c b/arch/powerpc/platforms/powernv/opal-xscom.c index d0ac535cf5d7..28651fb25417 100644 --- a/arch/powerpc/platforms/powernv/opal-xscom.c +++ b/arch/powerpc/platforms/powernv/opal-xscom.c @@ -73,25 +73,32 @@ static int opal_xscom_err_xlate(int64_t rc) static u64 opal_scom_unmangle(u64 addr) { + u64 tmp; + /* - * XSCOM indirect addresses have the top bit set. Additionally - * the rest of the top 3 nibbles is always 0. + * XSCOM addresses use the top nibble to set indirect mode and + * its form. Bits 4-11 are always 0. * * Because the debugfs interface uses signed offsets and shifts * the address left by 3, we basically cannot use the top 4 bits * of the 64-bit address, and thus cannot use the indirect bit. * - * To deal with that, we support the indirect bit being in bit - * 4 (IBM notation) instead of bit 0 in this API, we do the - * conversion here. To leave room for further xscom address - * expansion, we only clear out the top byte + * To deal with that, we support the indirect bits being in + * bits 4-7 (IBM notation) instead of bit 0-3 in this API, we + * do the conversion here. * - * For in-kernel use, we also support the real indirect bit, so - * we test for any of the top 5 bits + * For in-kernel use, we don't need to do this mangling. In + * kernel won't have bits 4-7 set. * + * So: + * debugfs will always set 0-3 = 0 and clear 4-7 + * kernel will always clear 0-3 = 0 and set 4-7 */ - if (addr & (0x1full << 59)) - addr = (addr & ~(0xffull << 56)) | (1ull << 63); + tmp = addr; + tmp &= 0x0f00000000000000; + addr &= 0xf0ffffffffffffff; + addr |= tmp << 4; + return addr; } diff --git a/arch/powerpc/platforms/powernv/opal.c b/arch/powerpc/platforms/powernv/opal.c index e0f856bfbfe8..7925a9d72cca 100644 --- a/arch/powerpc/platforms/powernv/opal.c +++ b/arch/powerpc/platforms/powernv/opal.c @@ -435,7 +435,7 @@ int opal_machine_check(struct pt_regs *regs) evt.version); return 0; } - machine_check_print_event_info(&evt); + machine_check_print_event_info(&evt, user_mode(regs)); if (opal_recover_mce(regs, &evt)) return 1; @@ -595,6 +595,80 @@ static void opal_export_symmap(void) pr_warn("Error %d creating OPAL symbols file\n", rc); } +static ssize_t export_attr_read(struct file *fp, struct kobject *kobj, + struct bin_attribute *bin_attr, char *buf, + loff_t off, size_t count) +{ + return memory_read_from_buffer(buf, count, &off, bin_attr->private, + bin_attr->size); +} + +/* + * opal_export_attrs: creates a sysfs node for each property listed in + * the device-tree under /ibm,opal/firmware/exports/ + * All new sysfs nodes are created under /opal/exports/. + * This allows for reserved memory regions (e.g. HDAT) to be read. + * The new sysfs nodes are only readable by root. + */ +static void opal_export_attrs(void) +{ + struct bin_attribute *attr; + struct device_node *np; + struct property *prop; + struct kobject *kobj; + u64 vals[2]; + int rc; + + np = of_find_node_by_path("/ibm,opal/firmware/exports"); + if (!np) + return; + + /* Create new 'exports' directory - /sys/firmware/opal/exports */ + kobj = kobject_create_and_add("exports", opal_kobj); + if (!kobj) { + pr_warn("kobject_create_and_add() of exports failed\n"); + return; + } + + for_each_property_of_node(np, prop) { + if (!strcmp(prop->name, "name") || !strcmp(prop->name, "phandle")) + continue; + + if (of_property_read_u64_array(np, prop->name, &vals[0], 2)) + continue; + + attr = kzalloc(sizeof(*attr), GFP_KERNEL); + + if (attr == NULL) { + pr_warn("Failed kmalloc for bin_attribute!"); + continue; + } + + sysfs_bin_attr_init(attr); + attr->attr.name = kstrdup(prop->name, GFP_KERNEL); + attr->attr.mode = 0400; + attr->read = export_attr_read; + attr->private = __va(vals[0]); + attr->size = vals[1]; + + if (attr->attr.name == NULL) { + pr_warn("Failed kstrdup for bin_attribute attr.name"); + kfree(attr); + continue; + } + + rc = sysfs_create_bin_file(kobj, attr); + if (rc) { + pr_warn("Error %d creating OPAL sysfs exports/%s file\n", + rc, prop->name); + kfree(attr->attr.name); + kfree(attr); + } + } + + of_node_put(np); +} + static void __init opal_dump_region_init(void) { void *addr; @@ -733,6 +807,9 @@ static int __init opal_init(void) opal_msglog_sysfs_init(); } + /* Export all properties */ + opal_export_attrs(); + /* Initialize platform devices: IPMI backend, PRD & flash interface */ opal_pdev_init("ibm,opal-ipmi"); opal_pdev_init("ibm,opal-flash"); diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index ee4cdb5b893f..93ce3f9d2758 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -14,7 +14,6 @@ #include <linux/kernel.h> #include <linux/pci.h> #include <linux/crash_dump.h> -#include <linux/debugfs.h> #include <linux/delay.h> #include <linux/string.h> #include <linux/init.h> @@ -38,7 +37,7 @@ #include <asm/iommu.h> #include <asm/tce.h> #include <asm/xics.h> -#include <asm/debug.h> +#include <asm/debugfs.h> #include <asm/firmware.h> #include <asm/pnv-pci.h> #include <asm/mmzone.h> @@ -1262,6 +1261,8 @@ static void pnv_pci_ioda_setup_PEs(void) /* PE#0 is needed for error reporting */ pnv_ioda_reserve_pe(phb, 0); pnv_ioda_setup_npu_PEs(hose->bus); + if (phb->model == PNV_PHB_MODEL_NPU2) + pnv_npu2_init(phb); } } } @@ -2757,9 +2758,7 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb, if (rc) return; - if (pe->flags & PNV_IODA_PE_DEV) - iommu_add_device(&pe->pdev->dev); - else if (pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL)) + if (pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL)) pnv_ioda_setup_bus_dma(pe, pe->pbus, true); } diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c index 204a829ff506..9b2bdcad51ba 100644 --- a/arch/powerpc/platforms/powernv/pci.c +++ b/arch/powerpc/platforms/powernv/pci.c @@ -758,7 +758,7 @@ void pnv_tce_free(struct iommu_table *tbl, long index, long npages) unsigned long pnv_tce_get(struct iommu_table *tbl, long index) { - return *(pnv_tce(tbl, index - tbl->it_offset)); + return be64_to_cpu(*(pnv_tce(tbl, index - tbl->it_offset))); } struct iommu_table *pnv_pci_table_alloc(int nid) diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h index e1d3e5526b54..4eab713136d1 100644 --- a/arch/powerpc/platforms/powernv/pci.h +++ b/arch/powerpc/platforms/powernv/pci.h @@ -7,6 +7,9 @@ struct pci_dn; +/* Maximum possible number of ATSD MMIO registers per NPU */ +#define NV_NMMU_ATSD_REGS 8 + enum pnv_phb_type { PNV_PHB_IODA1 = 0, PNV_PHB_IODA2 = 1, @@ -174,6 +177,16 @@ struct pnv_phb { struct OpalIoP7IOCErrorData hub_diag; } diag; + /* Nvlink2 data */ + struct npu { + int index; + __be64 *mmio_atsd_regs[NV_NMMU_ATSD_REGS]; + unsigned int mmio_atsd_count; + + /* Bitmask for MMIO register usage */ + unsigned long mmio_atsd_usage; + } npu; + #ifdef CONFIG_CXL_BASE struct cxl_afu *cxl_afu; #endif @@ -236,7 +249,7 @@ extern long pnv_npu_set_window(struct pnv_ioda_pe *npe, int num, extern long pnv_npu_unset_window(struct pnv_ioda_pe *npe, int num); extern void pnv_npu_take_ownership(struct pnv_ioda_pe *npe); extern void pnv_npu_release_ownership(struct pnv_ioda_pe *npe); - +extern int pnv_npu2_init(struct pnv_phb *phb); /* cxl functions */ extern bool pnv_cxl_enable_device_hook(struct pci_dev *dev); diff --git a/arch/powerpc/platforms/powernv/powernv.h b/arch/powerpc/platforms/powernv/powernv.h index 613052232475..6dbc0a1da1f6 100644 --- a/arch/powerpc/platforms/powernv/powernv.h +++ b/arch/powerpc/platforms/powernv/powernv.h @@ -18,8 +18,6 @@ static inline void pnv_pci_shutdown(void) { } #endif extern u32 pnv_get_supported_cpuidle_states(void); -extern u64 pnv_deepest_stop_psscr_val; -extern u64 pnv_deepest_stop_psscr_mask; extern void pnv_lpc_init(void); diff --git a/arch/powerpc/platforms/powernv/rng.c b/arch/powerpc/platforms/powernv/rng.c index 5dcbdea1afac..1a9d84371a4d 100644 --- a/arch/powerpc/platforms/powernv/rng.c +++ b/arch/powerpc/platforms/powernv/rng.c @@ -62,7 +62,7 @@ int powernv_get_random_real_mode(unsigned long *v) rng = raw_cpu_read(powernv_rng); - *v = rng_whiten(rng, in_rm64(rng->regs_real)); + *v = rng_whiten(rng, __raw_rm_readq(rng->regs_real)); return 1; } diff --git a/arch/powerpc/platforms/powernv/setup.c b/arch/powerpc/platforms/powernv/setup.c index d50c7d99baaf..2dc7e5fb86c3 100644 --- a/arch/powerpc/platforms/powernv/setup.c +++ b/arch/powerpc/platforms/powernv/setup.c @@ -32,6 +32,7 @@ #include <asm/machdep.h> #include <asm/firmware.h> #include <asm/xics.h> +#include <asm/xive.h> #include <asm/opal.h> #include <asm/kexec.h> #include <asm/smp.h> @@ -76,7 +77,9 @@ static void __init pnv_init(void) static void __init pnv_init_IRQ(void) { - xics_init(); + /* Try using a XIVE if available, otherwise use a XICS */ + if (!xive_native_init()) + xics_init(); WARN_ON(!ppc_md.get_irq); } @@ -95,6 +98,10 @@ static void pnv_show_cpuinfo(struct seq_file *m) else seq_printf(m, "firmware\t: BML\n"); of_node_put(root); + if (radix_enabled()) + seq_printf(m, "MMU\t\t: Radix\n"); + else + seq_printf(m, "MMU\t\t: Hash\n"); } static void pnv_prepare_going_down(void) @@ -218,10 +225,12 @@ static void pnv_kexec_wait_secondaries_down(void) static void pnv_kexec_cpu_down(int crash_shutdown, int secondary) { - xics_kexec_teardown_cpu(secondary); + if (xive_enabled()) + xive_kexec_teardown_cpu(secondary); + else + xics_kexec_teardown_cpu(secondary); /* On OPAL, we return all CPUs to firmware */ - if (!firmware_has_feature(FW_FEATURE_OPAL)) return; @@ -237,6 +246,10 @@ static void pnv_kexec_cpu_down(int crash_shutdown, int secondary) /* Primary waits for the secondaries to have reached OPAL */ pnv_kexec_wait_secondaries_down(); + /* Switch XIVE back to emulation mode */ + if (xive_enabled()) + xive_shutdown(); + /* * We might be running as little-endian - now that interrupts * are disabled, reset the HILE bit to big-endian so we don't diff --git a/arch/powerpc/platforms/powernv/smp.c b/arch/powerpc/platforms/powernv/smp.c index 8b67e1eefb5c..5189445db164 100644 --- a/arch/powerpc/platforms/powernv/smp.c +++ b/arch/powerpc/platforms/powernv/smp.c @@ -29,12 +29,14 @@ #include <asm/vdso_datapage.h> #include <asm/cputhreads.h> #include <asm/xics.h> +#include <asm/xive.h> #include <asm/opal.h> #include <asm/runlatch.h> #include <asm/code-patching.h> #include <asm/dbell.h> #include <asm/kvm_ppc.h> #include <asm/ppc-opcode.h> +#include <asm/cpuidle.h> #include "powernv.h" @@ -47,13 +49,10 @@ static void pnv_smp_setup_cpu(int cpu) { - if (cpu != boot_cpuid) + if (xive_enabled()) + xive_smp_setup_cpu(); + else if (cpu != boot_cpuid) xics_setup_cpu(); - -#ifdef CONFIG_PPC_DOORBELL - if (cpu_has_feature(CPU_FTR_DBELL)) - doorbell_setup_this_cpu(); -#endif } static int pnv_smp_kick_cpu(int nr) @@ -132,7 +131,10 @@ static int pnv_smp_cpu_disable(void) vdso_data->processorCount--; if (cpu == boot_cpuid) boot_cpuid = cpumask_any(cpu_online_mask); - xics_migrate_irqs_away(); + if (xive_enabled()) + xive_smp_disable_cpu(); + else + xics_migrate_irqs_away(); return 0; } @@ -140,7 +142,6 @@ static void pnv_smp_cpu_kill_self(void) { unsigned int cpu; unsigned long srr1, wmask; - u32 idle_states; /* Standard hot unplug procedure */ local_irq_disable(); @@ -155,8 +156,6 @@ static void pnv_smp_cpu_kill_self(void) if (cpu_has_feature(CPU_FTR_ARCH_207S)) wmask = SRR1_WAKEMASK_P8; - idle_states = pnv_get_supported_cpuidle_states(); - /* We don't want to take decrementer interrupts while we are offline, * so clear LPCR:PECE1. We keep PECE2 (and LPCR_PECE_HVEE on P9) * enabled as to let IPIs in. @@ -184,19 +183,7 @@ static void pnv_smp_cpu_kill_self(void) kvmppc_set_host_ipi(cpu, 0); ppc64_runlatch_off(); - - if (cpu_has_feature(CPU_FTR_ARCH_300)) { - srr1 = power9_idle_stop(pnv_deepest_stop_psscr_val, - pnv_deepest_stop_psscr_mask); - } else if (idle_states & OPAL_PM_WINKLE_ENABLED) { - srr1 = power7_winkle(); - } else if ((idle_states & OPAL_PM_SLEEP_ENABLED) || - (idle_states & OPAL_PM_SLEEP_ENABLED_ER1)) { - srr1 = power7_sleep(); - } else { - srr1 = power7_nap(1); - } - + srr1 = pnv_cpu_offline(cpu); ppc64_runlatch_on(); /* @@ -213,9 +200,12 @@ static void pnv_smp_cpu_kill_self(void) if (((srr1 & wmask) == SRR1_WAKEEE) || ((srr1 & wmask) == SRR1_WAKEHVI) || (local_paca->irq_happened & PACA_IRQ_EE)) { - if (cpu_has_feature(CPU_FTR_ARCH_300)) - icp_opal_flush_interrupt(); - else + if (cpu_has_feature(CPU_FTR_ARCH_300)) { + if (xive_enabled()) + xive_flush_interrupt(); + else + icp_opal_flush_interrupt(); + } else icp_native_flush_interrupt(); } else if ((srr1 & wmask) == SRR1_WAKEHDBELL) { unsigned long msg = PPC_DBELL_TYPE(PPC_DBELL_SERVER); @@ -252,10 +242,68 @@ static int pnv_cpu_bootable(unsigned int nr) return smp_generic_cpu_bootable(nr); } +static int pnv_smp_prepare_cpu(int cpu) +{ + if (xive_enabled()) + return xive_smp_prepare_cpu(cpu); + return 0; +} + +/* Cause IPI as setup by the interrupt controller (xics or xive) */ +static void (*ic_cause_ipi)(int cpu); + +static void pnv_cause_ipi(int cpu) +{ + if (doorbell_try_core_ipi(cpu)) + return; + + ic_cause_ipi(cpu); +} + +static void pnv_p9_dd1_cause_ipi(int cpu) +{ + int this_cpu = get_cpu(); + + /* + * POWER9 DD1 has a global addressed msgsnd, but for now we restrict + * IPIs to same core, because it requires additional synchronization + * for inter-core doorbells which we do not implement. + */ + if (cpumask_test_cpu(cpu, cpu_sibling_mask(this_cpu))) + doorbell_global_ipi(cpu); + else + ic_cause_ipi(cpu); + + put_cpu(); +} + +static void __init pnv_smp_probe(void) +{ + if (xive_enabled()) + xive_smp_probe(); + else + xics_smp_probe(); + + if (cpu_has_feature(CPU_FTR_DBELL)) { + ic_cause_ipi = smp_ops->cause_ipi; + WARN_ON(!ic_cause_ipi); + + if (cpu_has_feature(CPU_FTR_ARCH_300)) { + if (cpu_has_feature(CPU_FTR_POWER9_DD1)) + smp_ops->cause_ipi = pnv_p9_dd1_cause_ipi; + else + smp_ops->cause_ipi = doorbell_global_ipi; + } else { + smp_ops->cause_ipi = pnv_cause_ipi; + } + } +} + static struct smp_ops_t pnv_smp_ops = { - .message_pass = smp_muxed_ipi_message_pass, - .cause_ipi = NULL, /* Filled at runtime by xics_smp_probe() */ - .probe = xics_smp_probe, + .message_pass = NULL, /* Use smp_muxed_ipi_message_pass */ + .cause_ipi = NULL, /* Filled at runtime by pnv_smp_probe() */ + .probe = pnv_smp_probe, + .prepare_cpu = pnv_smp_prepare_cpu, .kick_cpu = pnv_smp_kick_cpu, .setup_cpu = pnv_smp_setup_cpu, .cpu_bootable = pnv_cpu_bootable, diff --git a/arch/powerpc/platforms/pseries/Kconfig b/arch/powerpc/platforms/pseries/Kconfig index 30ec04f1c67c..913c54e23eea 100644 --- a/arch/powerpc/platforms/pseries/Kconfig +++ b/arch/powerpc/platforms/pseries/Kconfig @@ -17,9 +17,10 @@ config PPC_PSERIES select PPC_UDBG_16550 select PPC_NATIVE select PPC_DOORBELL - select HOTPLUG_CPU if SMP + select HOTPLUG_CPU select ARCH_RANDOM select PPC_DOORBELL + select FORCE_SMP default y config PPC_SPLPAR diff --git a/arch/powerpc/platforms/pseries/dlpar.c b/arch/powerpc/platforms/pseries/dlpar.c index 193e052fa0dd..bda18d8e1674 100644 --- a/arch/powerpc/platforms/pseries/dlpar.c +++ b/arch/powerpc/platforms/pseries/dlpar.c @@ -288,7 +288,6 @@ int dlpar_detach_node(struct device_node *dn) if (rc) return rc; - of_node_put(dn); /* Must decrement the refcount */ return 0; } diff --git a/arch/powerpc/platforms/pseries/dtl.c b/arch/powerpc/platforms/pseries/dtl.c index 6b04e3f0f982..18014cdeb590 100644 --- a/arch/powerpc/platforms/pseries/dtl.c +++ b/arch/powerpc/platforms/pseries/dtl.c @@ -21,13 +21,12 @@ */ #include <linux/slab.h> -#include <linux/debugfs.h> #include <linux/spinlock.h> #include <asm/smp.h> #include <linux/uaccess.h> #include <asm/firmware.h> #include <asm/lppaca.h> -#include <asm/debug.h> +#include <asm/debugfs.h> #include <asm/plpar_wrappers.h> #include <asm/machdep.h> diff --git a/arch/powerpc/platforms/pseries/hvCall_inst.c b/arch/powerpc/platforms/pseries/hvCall_inst.c index f02ec3ab428c..957ae347b0b3 100644 --- a/arch/powerpc/platforms/pseries/hvCall_inst.c +++ b/arch/powerpc/platforms/pseries/hvCall_inst.c @@ -29,6 +29,16 @@ #include <asm/trace.h> #include <asm/machdep.h> +/* For hcall instrumentation. One structure per-hcall, per-CPU */ +struct hcall_stats { + unsigned long num_calls; /* number of calls (on this CPU) */ + unsigned long tb_total; /* total wall time (mftb) of calls. */ + unsigned long purr_total; /* total cpu time (PURR) of calls. */ + unsigned long tb_start; + unsigned long purr_start; +}; +#define HCALL_STAT_ARRAY_SIZE ((MAX_HCALL_OPCODE >> 2) + 1) + DEFINE_PER_CPU(struct hcall_stats[HCALL_STAT_ARRAY_SIZE], hcall_stats); /* diff --git a/arch/powerpc/platforms/pseries/lpar.c b/arch/powerpc/platforms/pseries/lpar.c index 8b1fe895daa3..6541d0b03e4c 100644 --- a/arch/powerpc/platforms/pseries/lpar.c +++ b/arch/powerpc/platforms/pseries/lpar.c @@ -958,3 +958,64 @@ int h_get_mpp_x(struct hvcall_mpp_x_data *mpp_x_data) return rc; } + +static unsigned long vsid_unscramble(unsigned long vsid, int ssize) +{ + unsigned long protovsid; + unsigned long va_bits = VA_BITS; + unsigned long modinv, vsid_modulus; + unsigned long max_mod_inv, tmp_modinv; + + if (!mmu_has_feature(MMU_FTR_68_BIT_VA)) + va_bits = 65; + + if (ssize == MMU_SEGSIZE_256M) { + modinv = VSID_MULINV_256M; + vsid_modulus = ((1UL << (va_bits - SID_SHIFT)) - 1); + } else { + modinv = VSID_MULINV_1T; + vsid_modulus = ((1UL << (va_bits - SID_SHIFT_1T)) - 1); + } + + /* + * vsid outside our range. + */ + if (vsid >= vsid_modulus) + return 0; + + /* + * If modinv is the modular multiplicate inverse of (x % vsid_modulus) + * and vsid = (protovsid * x) % vsid_modulus, then we say: + * protovsid = (vsid * modinv) % vsid_modulus + */ + + /* Check if (vsid * modinv) overflow (63 bits) */ + max_mod_inv = 0x7fffffffffffffffull / vsid; + if (modinv < max_mod_inv) + return (vsid * modinv) % vsid_modulus; + + tmp_modinv = modinv/max_mod_inv; + modinv %= max_mod_inv; + + protovsid = (((vsid * max_mod_inv) % vsid_modulus) * tmp_modinv) % vsid_modulus; + protovsid = (protovsid + vsid * modinv) % vsid_modulus; + + return protovsid; +} + +static int __init reserve_vrma_context_id(void) +{ + unsigned long protovsid; + + /* + * Reserve context ids which map to reserved virtual addresses. For now + * we only reserve the context id which maps to the VRMA VSID. We ignore + * the addresses in "ibm,adjunct-virtual-addresses" because we don't + * enable adjunct support via the "ibm,client-architecture-support" + * interface. + */ + protovsid = vsid_unscramble(VRMA_VSID, MMU_SEGSIZE_1T); + hash__reserve_context_id(protovsid >> ESID_BITS_1T); + return 0; +} +machine_device_initcall(pseries, reserve_vrma_context_id); diff --git a/arch/powerpc/platforms/pseries/setup.c b/arch/powerpc/platforms/pseries/setup.c index b4d362ed03a1..b5d86426e97b 100644 --- a/arch/powerpc/platforms/pseries/setup.c +++ b/arch/powerpc/platforms/pseries/setup.c @@ -87,6 +87,10 @@ static void pSeries_show_cpuinfo(struct seq_file *m) model = of_get_property(root, "model", NULL); seq_printf(m, "machine\t\t: CHRP %s\n", model); of_node_put(root); + if (radix_enabled()) + seq_printf(m, "MMU\t\t: Radix\n"); + else + seq_printf(m, "MMU\t\t: Hash\n"); } /* Initialize firmware assisted non-maskable interrupts if diff --git a/arch/powerpc/platforms/pseries/smp.c b/arch/powerpc/platforms/pseries/smp.c index f6f83aeccaaa..12ff5bef67fd 100644 --- a/arch/powerpc/platforms/pseries/smp.c +++ b/arch/powerpc/platforms/pseries/smp.c @@ -55,11 +55,6 @@ */ static cpumask_var_t of_spin_mask; -/* - * If we multiplex IPI mechanisms, store the appropriate XICS IPI mechanism here - */ -static void (*xics_cause_ipi)(int cpu, unsigned long data); - /* Query where a cpu is now. Return codes #defined in plpar_wrappers.h */ int smp_query_cpu_stopped(unsigned int pcpu) { @@ -143,8 +138,6 @@ static void smp_setup_cpu(int cpu) { if (cpu != boot_cpuid) xics_setup_cpu(); - if (cpu_has_feature(CPU_FTR_DBELL)) - doorbell_setup_this_cpu(); if (firmware_has_feature(FW_FEATURE_SPLPAR)) vpa_init(cpu); @@ -187,23 +180,23 @@ static int smp_pSeries_kick_cpu(int nr) return 0; } -/* Only used on systems that support multiple IPI mechanisms */ -static void pSeries_cause_ipi_mux(int cpu, unsigned long data) +static void smp_pseries_cause_ipi(int cpu) { - if (cpumask_test_cpu(cpu, cpu_sibling_mask(smp_processor_id()))) - doorbell_cause_ipi(cpu, data); - else - xics_cause_ipi(cpu, data); + /* POWER9 should not use this handler */ + if (doorbell_try_core_ipi(cpu)) + return; + + icp_ops->cause_ipi(cpu); } static __init void pSeries_smp_probe(void) { xics_smp_probe(); - if (cpu_has_feature(CPU_FTR_DBELL)) { - xics_cause_ipi = smp_ops->cause_ipi; - smp_ops->cause_ipi = pSeries_cause_ipi_mux; - } + if (cpu_has_feature(CPU_FTR_DBELL)) + smp_ops->cause_ipi = smp_pseries_cause_ipi; + else + smp_ops->cause_ipi = icp_ops->cause_ipi; } static struct smp_ops_t pseries_smp_ops = { diff --git a/arch/powerpc/sysdev/Kconfig b/arch/powerpc/sysdev/Kconfig index 52dc165c0efb..caf882e749dc 100644 --- a/arch/powerpc/sysdev/Kconfig +++ b/arch/powerpc/sysdev/Kconfig @@ -28,6 +28,7 @@ config PPC_MSI_BITMAP default y if PPC_POWERNV source "arch/powerpc/sysdev/xics/Kconfig" +source "arch/powerpc/sysdev/xive/Kconfig" config PPC_SCOM bool diff --git a/arch/powerpc/sysdev/Makefile b/arch/powerpc/sysdev/Makefile index a254824719f1..c0ae11d4f62f 100644 --- a/arch/powerpc/sysdev/Makefile +++ b/arch/powerpc/sysdev/Makefile @@ -71,5 +71,6 @@ obj-$(CONFIG_PPC_EARLY_DEBUG_MEMCONS) += udbg_memcons.o subdir-ccflags-$(CONFIG_PPC_WERROR) := -Werror obj-$(CONFIG_PPC_XICS) += xics/ +obj-$(CONFIG_PPC_XIVE) += xive/ obj-$(CONFIG_GE_FPGA) += ge/ diff --git a/arch/powerpc/sysdev/scom.c b/arch/powerpc/sysdev/scom.c index d0e9f178a324..76ea32c1b664 100644 --- a/arch/powerpc/sysdev/scom.c +++ b/arch/powerpc/sysdev/scom.c @@ -19,10 +19,9 @@ */ #include <linux/kernel.h> -#include <linux/debugfs.h> #include <linux/slab.h> #include <linux/export.h> -#include <asm/debug.h> +#include <asm/debugfs.h> #include <asm/prom.h> #include <asm/scom.h> #include <linux/uaccess.h> diff --git a/arch/powerpc/sysdev/xics/icp-hv.c b/arch/powerpc/sysdev/xics/icp-hv.c index e7fa26c4ff73..bbc839a98c41 100644 --- a/arch/powerpc/sysdev/xics/icp-hv.c +++ b/arch/powerpc/sysdev/xics/icp-hv.c @@ -138,7 +138,7 @@ static void icp_hv_set_cpu_priority(unsigned char cppr) #ifdef CONFIG_SMP -static void icp_hv_cause_ipi(int cpu, unsigned long data) +static void icp_hv_cause_ipi(int cpu) { icp_hv_set_qirr(cpu, IPI_PRIORITY); } diff --git a/arch/powerpc/sysdev/xics/icp-native.c b/arch/powerpc/sysdev/xics/icp-native.c index 8a6a043e239b..2bfb9968d562 100644 --- a/arch/powerpc/sysdev/xics/icp-native.c +++ b/arch/powerpc/sysdev/xics/icp-native.c @@ -143,19 +143,9 @@ static unsigned int icp_native_get_irq(void) #ifdef CONFIG_SMP -static void icp_native_cause_ipi(int cpu, unsigned long data) +static void icp_native_cause_ipi(int cpu) { kvmppc_set_host_ipi(cpu, 1); -#ifdef CONFIG_PPC_DOORBELL - if (cpu_has_feature(CPU_FTR_DBELL)) { - if (cpumask_test_cpu(cpu, cpu_sibling_mask(get_cpu()))) { - doorbell_cause_ipi(cpu, data); - put_cpu(); - return; - } - put_cpu(); - } -#endif icp_native_set_qirr(cpu, IPI_PRIORITY); } @@ -168,15 +158,15 @@ void icp_native_cause_ipi_rm(int cpu) * Need the physical address of the XICS to be * previously saved in kvm_hstate in the paca. */ - unsigned long xics_phys; + void __iomem *xics_phys; /* * Just like the cause_ipi functions, it is required to - * include a full barrier (out8 includes a sync) before - * causing the IPI. + * include a full barrier before causing the IPI. */ xics_phys = paca[cpu].kvm_hstate.xics_phys; - out_rm8((u8 *)(xics_phys + XICS_MFRR), IPI_PRIORITY); + mb(); + __raw_rm_writeb(IPI_PRIORITY, xics_phys + XICS_MFRR); } #endif diff --git a/arch/powerpc/sysdev/xics/icp-opal.c b/arch/powerpc/sysdev/xics/icp-opal.c index b53f80f0b4d8..c71d2ea42627 100644 --- a/arch/powerpc/sysdev/xics/icp-opal.c +++ b/arch/powerpc/sysdev/xics/icp-opal.c @@ -126,7 +126,7 @@ static void icp_opal_eoi(struct irq_data *d) #ifdef CONFIG_SMP -static void icp_opal_cause_ipi(int cpu, unsigned long data) +static void icp_opal_cause_ipi(int cpu) { int hw_cpu = get_hard_smp_processor_id(cpu); diff --git a/arch/powerpc/sysdev/xics/xics-common.c b/arch/powerpc/sysdev/xics/xics-common.c index 23efe4e42172..ffe138b8b9dc 100644 --- a/arch/powerpc/sysdev/xics/xics-common.c +++ b/arch/powerpc/sysdev/xics/xics-common.c @@ -143,11 +143,11 @@ static void xics_request_ipi(void) void __init xics_smp_probe(void) { - /* Setup cause_ipi callback based on which ICP is used */ - smp_ops->cause_ipi = icp_ops->cause_ipi; - /* Register all the IPIs */ xics_request_ipi(); + + /* Setup cause_ipi callback based on which ICP is used */ + smp_ops->cause_ipi = icp_ops->cause_ipi; } #endif /* CONFIG_SMP */ diff --git a/arch/powerpc/sysdev/xive/Kconfig b/arch/powerpc/sysdev/xive/Kconfig new file mode 100644 index 000000000000..12ccd7373d2f --- /dev/null +++ b/arch/powerpc/sysdev/xive/Kconfig @@ -0,0 +1,11 @@ +config PPC_XIVE + bool + default n + select PPC_SMP_MUXED_IPI + select HARDIRQS_SW_RESEND + +config PPC_XIVE_NATIVE + bool + default n + select PPC_XIVE + depends on PPC_POWERNV diff --git a/arch/powerpc/sysdev/xive/Makefile b/arch/powerpc/sysdev/xive/Makefile new file mode 100644 index 000000000000..3fab303fc169 --- /dev/null +++ b/arch/powerpc/sysdev/xive/Makefile @@ -0,0 +1,4 @@ +subdir-ccflags-$(CONFIG_PPC_WERROR) := -Werror + +obj-y += common.o +obj-$(CONFIG_PPC_XIVE_NATIVE) += native.o diff --git a/arch/powerpc/sysdev/xive/common.c b/arch/powerpc/sysdev/xive/common.c new file mode 100644 index 000000000000..6a98efb14264 --- /dev/null +++ b/arch/powerpc/sysdev/xive/common.c @@ -0,0 +1,1302 @@ +/* + * Copyright 2016,2017 IBM Corporation. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#define pr_fmt(fmt) "xive: " fmt + +#include <linux/types.h> +#include <linux/threads.h> +#include <linux/kernel.h> +#include <linux/irq.h> +#include <linux/debugfs.h> +#include <linux/smp.h> +#include <linux/interrupt.h> +#include <linux/seq_file.h> +#include <linux/init.h> +#include <linux/cpu.h> +#include <linux/of.h> +#include <linux/slab.h> +#include <linux/spinlock.h> +#include <linux/msi.h> + +#include <asm/prom.h> +#include <asm/io.h> +#include <asm/smp.h> +#include <asm/machdep.h> +#include <asm/irq.h> +#include <asm/errno.h> +#include <asm/xive.h> +#include <asm/xive-regs.h> +#include <asm/xmon.h> + +#include "xive-internal.h" + +#undef DEBUG_FLUSH +#undef DEBUG_ALL + +#ifdef DEBUG_ALL +#define DBG_VERBOSE(fmt...) pr_devel(fmt) +#else +#define DBG_VERBOSE(fmt...) do { } while(0) +#endif + +bool __xive_enabled; +bool xive_cmdline_disabled; + +/* We use only one priority for now */ +static u8 xive_irq_priority; + +/* TIMA */ +void __iomem *xive_tima; +u32 xive_tima_offset; + +/* Backend ops */ +static const struct xive_ops *xive_ops; + +/* Our global interrupt domain */ +static struct irq_domain *xive_irq_domain; + +#ifdef CONFIG_SMP +/* The IPIs all use the same logical irq number */ +static u32 xive_ipi_irq; +#endif + +/* Xive state for each CPU */ +static DEFINE_PER_CPU(struct xive_cpu *, xive_cpu); + +/* + * A "disabled" interrupt should never fire, to catch problems + * we set its logical number to this + */ +#define XIVE_BAD_IRQ 0x7fffffff +#define XIVE_MAX_IRQ (XIVE_BAD_IRQ - 1) + +/* An invalid CPU target */ +#define XIVE_INVALID_TARGET (-1) + +/* + * Read the next entry in a queue, return its content if it's valid + * or 0 if there is no new entry. + * + * The queue pointer is moved forward unless "just_peek" is set + */ +static u32 xive_read_eq(struct xive_q *q, bool just_peek) +{ + u32 cur; + + if (!q->qpage) + return 0; + cur = be32_to_cpup(q->qpage + q->idx); + + /* Check valid bit (31) vs current toggle polarity */ + if ((cur >> 31) == q->toggle) + return 0; + + /* If consuming from the queue ... */ + if (!just_peek) { + /* Next entry */ + q->idx = (q->idx + 1) & q->msk; + + /* Wrap around: flip valid toggle */ + if (q->idx == 0) + q->toggle ^= 1; + } + /* Mask out the valid bit (31) */ + return cur & 0x7fffffff; +} + +/* + * Scans all the queue that may have interrupts in them + * (based on "pending_prio") in priority order until an + * interrupt is found or all the queues are empty. + * + * Then updates the CPPR (Current Processor Priority + * Register) based on the most favored interrupt found + * (0xff if none) and return what was found (0 if none). + * + * If just_peek is set, return the most favored pending + * interrupt if any but don't update the queue pointers. + * + * Note: This function can operate generically on any number + * of queues (up to 8). The current implementation of the XIVE + * driver only uses a single queue however. + * + * Note2: This will also "flush" "the pending_count" of a queue + * into the "count" when that queue is observed to be empty. + * This is used to keep track of the amount of interrupts + * targetting a queue. When an interrupt is moved away from + * a queue, we only decrement that queue count once the queue + * has been observed empty to avoid races. + */ +static u32 xive_scan_interrupts(struct xive_cpu *xc, bool just_peek) +{ + u32 irq = 0; + u8 prio; + + /* Find highest pending priority */ + while (xc->pending_prio != 0) { + struct xive_q *q; + + prio = ffs(xc->pending_prio) - 1; + DBG_VERBOSE("scan_irq: trying prio %d\n", prio); + + /* Try to fetch */ + irq = xive_read_eq(&xc->queue[prio], just_peek); + + /* Found something ? That's it */ + if (irq) + break; + + /* Clear pending bits */ + xc->pending_prio &= ~(1 << prio); + + /* + * Check if the queue count needs adjusting due to + * interrupts being moved away. See description of + * xive_dec_target_count() + */ + q = &xc->queue[prio]; + if (atomic_read(&q->pending_count)) { + int p = atomic_xchg(&q->pending_count, 0); + if (p) { + WARN_ON(p > atomic_read(&q->count)); + atomic_sub(p, &q->count); + } + } + } + + /* If nothing was found, set CPPR to 0xff */ + if (irq == 0) + prio = 0xff; + + /* Update HW CPPR to match if necessary */ + if (prio != xc->cppr) { + DBG_VERBOSE("scan_irq: adjusting CPPR to %d\n", prio); + xc->cppr = prio; + out_8(xive_tima + xive_tima_offset + TM_CPPR, prio); + } + + return irq; +} + +/* + * This is used to perform the magic loads from an ESB + * described in xive.h + */ +static u8 xive_poke_esb(struct xive_irq_data *xd, u32 offset) +{ + u64 val; + + /* Handle HW errata */ + if (xd->flags & XIVE_IRQ_FLAG_SHIFT_BUG) + offset |= offset << 4; + + val = in_be64(xd->eoi_mmio + offset); + + return (u8)val; +} + +#ifdef CONFIG_XMON +static void xive_dump_eq(const char *name, struct xive_q *q) +{ + u32 i0, i1, idx; + + if (!q->qpage) + return; + idx = q->idx; + i0 = be32_to_cpup(q->qpage + idx); + idx = (idx + 1) & q->msk; + i1 = be32_to_cpup(q->qpage + idx); + xmon_printf(" %s Q T=%d %08x %08x ...\n", name, + q->toggle, i0, i1); +} + +void xmon_xive_do_dump(int cpu) +{ + struct xive_cpu *xc = per_cpu(xive_cpu, cpu); + + xmon_printf("XIVE state for CPU %d:\n", cpu); + xmon_printf(" pp=%02x cppr=%02x\n", xc->pending_prio, xc->cppr); + xive_dump_eq("IRQ", &xc->queue[xive_irq_priority]); +#ifdef CONFIG_SMP + { + u64 val = xive_poke_esb(&xc->ipi_data, XIVE_ESB_GET); + xmon_printf(" IPI state: %x:%c%c\n", xc->hw_ipi, + val & XIVE_ESB_VAL_P ? 'P' : 'p', + val & XIVE_ESB_VAL_P ? 'Q' : 'q'); + } +#endif +} +#endif /* CONFIG_XMON */ + +static unsigned int xive_get_irq(void) +{ + struct xive_cpu *xc = __this_cpu_read(xive_cpu); + u32 irq; + + /* + * This can be called either as a result of a HW interrupt or + * as a "replay" because EOI decided there was still something + * in one of the queues. + * + * First we perform an ACK cycle in order to update our mask + * of pending priorities. This will also have the effect of + * updating the CPPR to the most favored pending interrupts. + * + * In the future, if we have a way to differenciate a first + * entry (on HW interrupt) from a replay triggered by EOI, + * we could skip this on replays unless we soft-mask tells us + * that a new HW interrupt occurred. + */ + xive_ops->update_pending(xc); + + DBG_VERBOSE("get_irq: pending=%02x\n", xc->pending_prio); + + /* Scan our queue(s) for interrupts */ + irq = xive_scan_interrupts(xc, false); + + DBG_VERBOSE("get_irq: got irq 0x%x, new pending=0x%02x\n", + irq, xc->pending_prio); + + /* Return pending interrupt if any */ + if (irq == XIVE_BAD_IRQ) + return 0; + return irq; +} + +/* + * After EOI'ing an interrupt, we need to re-check the queue + * to see if another interrupt is pending since multiple + * interrupts can coalesce into a single notification to the + * CPU. + * + * If we find that there is indeed more in there, we call + * force_external_irq_replay() to make Linux synthetize an + * external interrupt on the next call to local_irq_restore(). + */ +static void xive_do_queue_eoi(struct xive_cpu *xc) +{ + if (xive_scan_interrupts(xc, true) != 0) { + DBG_VERBOSE("eoi: pending=0x%02x\n", xc->pending_prio); + force_external_irq_replay(); + } +} + +/* + * EOI an interrupt at the source. There are several methods + * to do this depending on the HW version and source type + */ +void xive_do_source_eoi(u32 hw_irq, struct xive_irq_data *xd) +{ + /* If the XIVE supports the new "store EOI facility, use it */ + if (xd->flags & XIVE_IRQ_FLAG_STORE_EOI) + out_be64(xd->eoi_mmio, 0); + else if (hw_irq && xd->flags & XIVE_IRQ_FLAG_EOI_FW) { + /* + * The FW told us to call it. This happens for some + * interrupt sources that need additional HW whacking + * beyond the ESB manipulation. For example LPC interrupts + * on P9 DD1.0 need a latch to be clared in the LPC bridge + * itself. The Firmware will take care of it. + */ + if (WARN_ON_ONCE(!xive_ops->eoi)) + return; + xive_ops->eoi(hw_irq); + } else { + u8 eoi_val; + + /* + * Otherwise for EOI, we use the special MMIO that does + * a clear of both P and Q and returns the old Q, + * except for LSIs where we use the "EOI cycle" special + * load. + * + * This allows us to then do a re-trigger if Q was set + * rather than synthesizing an interrupt in software + * + * For LSIs, using the HW EOI cycle works around a problem + * on P9 DD1 PHBs where the other ESB accesses don't work + * properly. + */ + if (xd->flags & XIVE_IRQ_FLAG_LSI) + in_be64(xd->eoi_mmio); + else { + eoi_val = xive_poke_esb(xd, XIVE_ESB_SET_PQ_00); + DBG_VERBOSE("eoi_val=%x\n", offset, eoi_val); + + /* Re-trigger if needed */ + if ((eoi_val & XIVE_ESB_VAL_Q) && xd->trig_mmio) + out_be64(xd->trig_mmio, 0); + } + } +} + +/* irq_chip eoi callback */ +static void xive_irq_eoi(struct irq_data *d) +{ + struct xive_irq_data *xd = irq_data_get_irq_handler_data(d); + struct xive_cpu *xc = __this_cpu_read(xive_cpu); + + DBG_VERBOSE("eoi_irq: irq=%d [0x%lx] pending=%02x\n", + d->irq, irqd_to_hwirq(d), xc->pending_prio); + + /* EOI the source if it hasn't been disabled */ + if (!irqd_irq_disabled(d)) + xive_do_source_eoi(irqd_to_hwirq(d), xd); + + /* + * Clear saved_p to indicate that it's no longer occupying + * a queue slot on the target queue + */ + xd->saved_p = false; + + /* Check for more work in the queue */ + xive_do_queue_eoi(xc); +} + +/* + * Helper used to mask and unmask an interrupt source. This + * is only called for normal interrupts that do not require + * masking/unmasking via firmware. + */ +static void xive_do_source_set_mask(struct xive_irq_data *xd, + bool mask) +{ + u64 val; + + /* + * If the interrupt had P set, it may be in a queue. + * + * We need to make sure we don't re-enable it until it + * has been fetched from that queue and EOId. We keep + * a copy of that P state and use it to restore the + * ESB accordingly on unmask. + */ + if (mask) { + val = xive_poke_esb(xd, XIVE_ESB_SET_PQ_01); + xd->saved_p = !!(val & XIVE_ESB_VAL_P); + } else if (xd->saved_p) + xive_poke_esb(xd, XIVE_ESB_SET_PQ_10); + else + xive_poke_esb(xd, XIVE_ESB_SET_PQ_00); +} + +/* + * Try to chose "cpu" as a new interrupt target. Increments + * the queue accounting for that target if it's not already + * full. + */ +static bool xive_try_pick_target(int cpu) +{ + struct xive_cpu *xc = per_cpu(xive_cpu, cpu); + struct xive_q *q = &xc->queue[xive_irq_priority]; + int max; + + /* + * Calculate max number of interrupts in that queue. + * + * We leave a gap of 1 just in case... + */ + max = (q->msk + 1) - 1; + return !!atomic_add_unless(&q->count, 1, max); +} + +/* + * Un-account an interrupt for a target CPU. We don't directly + * decrement q->count since the interrupt might still be present + * in the queue. + * + * Instead increment a separate counter "pending_count" which + * will be substracted from "count" later when that CPU observes + * the queue to be empty. + */ +static void xive_dec_target_count(int cpu) +{ + struct xive_cpu *xc = per_cpu(xive_cpu, cpu); + struct xive_q *q = &xc->queue[xive_irq_priority]; + + if (unlikely(WARN_ON(cpu < 0 || !xc))) { + pr_err("%s: cpu=%d xc=%p\n", __func__, cpu, xc); + return; + } + + /* + * We increment the "pending count" which will be used + * to decrement the target queue count whenever it's next + * processed and found empty. This ensure that we don't + * decrement while we still have the interrupt there + * occupying a slot. + */ + atomic_inc(&q->pending_count); +} + +/* Find a tentative CPU target in a CPU mask */ +static int xive_find_target_in_mask(const struct cpumask *mask, + unsigned int fuzz) +{ + int cpu, first, num, i; + + /* Pick up a starting point CPU in the mask based on fuzz */ + num = cpumask_weight(mask); + first = fuzz % num; + + /* Locate it */ + cpu = cpumask_first(mask); + for (i = 0; i < first && cpu < nr_cpu_ids; i++) + cpu = cpumask_next(cpu, mask); + + /* Sanity check */ + if (WARN_ON(cpu >= nr_cpu_ids)) + cpu = cpumask_first(cpu_online_mask); + + /* Remember first one to handle wrap-around */ + first = cpu; + + /* + * Now go through the entire mask until we find a valid + * target. + */ + for (;;) { + /* + * We re-check online as the fallback case passes us + * an untested affinity mask + */ + if (cpu_online(cpu) && xive_try_pick_target(cpu)) + return cpu; + cpu = cpumask_next(cpu, mask); + if (cpu == first) + break; + /* Wrap around */ + if (cpu >= nr_cpu_ids) + cpu = cpumask_first(mask); + } + return -1; +} + +/* + * Pick a target CPU for an interrupt. This is done at + * startup or if the affinity is changed in a way that + * invalidates the current target. + */ +static int xive_pick_irq_target(struct irq_data *d, + const struct cpumask *affinity) +{ + static unsigned int fuzz; + struct xive_irq_data *xd = irq_data_get_irq_handler_data(d); + cpumask_var_t mask; + int cpu = -1; + + /* + * If we have chip IDs, first we try to build a mask of + * CPUs matching the CPU and find a target in there + */ + if (xd->src_chip != XIVE_INVALID_CHIP_ID && + zalloc_cpumask_var(&mask, GFP_ATOMIC)) { + /* Build a mask of matching chip IDs */ + for_each_cpu_and(cpu, affinity, cpu_online_mask) { + struct xive_cpu *xc = per_cpu(xive_cpu, cpu); + if (xc->chip_id == xd->src_chip) + cpumask_set_cpu(cpu, mask); + } + /* Try to find a target */ + if (cpumask_empty(mask)) + cpu = -1; + else + cpu = xive_find_target_in_mask(mask, fuzz++); + free_cpumask_var(mask); + if (cpu >= 0) + return cpu; + fuzz--; + } + + /* No chip IDs, fallback to using the affinity mask */ + return xive_find_target_in_mask(affinity, fuzz++); +} + +static unsigned int xive_irq_startup(struct irq_data *d) +{ + struct xive_irq_data *xd = irq_data_get_irq_handler_data(d); + unsigned int hw_irq = (unsigned int)irqd_to_hwirq(d); + int target, rc; + + pr_devel("xive_irq_startup: irq %d [0x%x] data @%p\n", + d->irq, hw_irq, d); + +#ifdef CONFIG_PCI_MSI + /* + * The generic MSI code returns with the interrupt disabled on the + * card, using the MSI mask bits. Firmware doesn't appear to unmask + * at that level, so we do it here by hand. + */ + if (irq_data_get_msi_desc(d)) + pci_msi_unmask_irq(d); +#endif + + /* Pick a target */ + target = xive_pick_irq_target(d, irq_data_get_affinity_mask(d)); + if (target == XIVE_INVALID_TARGET) { + /* Try again breaking affinity */ + target = xive_pick_irq_target(d, cpu_online_mask); + if (target == XIVE_INVALID_TARGET) + return -ENXIO; + pr_warn("irq %d started with broken affinity\n", d->irq); + } + + /* Sanity check */ + if (WARN_ON(target == XIVE_INVALID_TARGET || + target >= nr_cpu_ids)) + target = smp_processor_id(); + + xd->target = target; + + /* + * Configure the logical number to be the Linux IRQ number + * and set the target queue + */ + rc = xive_ops->configure_irq(hw_irq, + get_hard_smp_processor_id(target), + xive_irq_priority, d->irq); + if (rc) + return rc; + + /* Unmask the ESB */ + xive_do_source_set_mask(xd, false); + + return 0; +} + +static void xive_irq_shutdown(struct irq_data *d) +{ + struct xive_irq_data *xd = irq_data_get_irq_handler_data(d); + unsigned int hw_irq = (unsigned int)irqd_to_hwirq(d); + + pr_devel("xive_irq_shutdown: irq %d [0x%x] data @%p\n", + d->irq, hw_irq, d); + + if (WARN_ON(xd->target == XIVE_INVALID_TARGET)) + return; + + /* Mask the interrupt at the source */ + xive_do_source_set_mask(xd, true); + + /* + * The above may have set saved_p. We clear it otherwise it + * will prevent re-enabling later on. It is ok to forget the + * fact that the interrupt might be in a queue because we are + * accounting that already in xive_dec_target_count() and will + * be re-routing it to a new queue with proper accounting when + * it's started up again + */ + xd->saved_p = false; + + /* + * Mask the interrupt in HW in the IVT/EAS and set the number + * to be the "bad" IRQ number + */ + xive_ops->configure_irq(hw_irq, + get_hard_smp_processor_id(xd->target), + 0xff, XIVE_BAD_IRQ); + + xive_dec_target_count(xd->target); + xd->target = XIVE_INVALID_TARGET; +} + +static void xive_irq_unmask(struct irq_data *d) +{ + struct xive_irq_data *xd = irq_data_get_irq_handler_data(d); + + pr_devel("xive_irq_unmask: irq %d data @%p\n", d->irq, xd); + + /* + * This is a workaround for PCI LSI problems on P9, for + * these, we call FW to set the mask. The problems might + * be fixed by P9 DD2.0, if that is the case, firmware + * will no longer set that flag. + */ + if (xd->flags & XIVE_IRQ_FLAG_MASK_FW) { + unsigned int hw_irq = (unsigned int)irqd_to_hwirq(d); + xive_ops->configure_irq(hw_irq, + get_hard_smp_processor_id(xd->target), + xive_irq_priority, d->irq); + return; + } + + xive_do_source_set_mask(xd, false); +} + +static void xive_irq_mask(struct irq_data *d) +{ + struct xive_irq_data *xd = irq_data_get_irq_handler_data(d); + + pr_devel("xive_irq_mask: irq %d data @%p\n", d->irq, xd); + + /* + * This is a workaround for PCI LSI problems on P9, for + * these, we call OPAL to set the mask. The problems might + * be fixed by P9 DD2.0, if that is the case, firmware + * will no longer set that flag. + */ + if (xd->flags & XIVE_IRQ_FLAG_MASK_FW) { + unsigned int hw_irq = (unsigned int)irqd_to_hwirq(d); + xive_ops->configure_irq(hw_irq, + get_hard_smp_processor_id(xd->target), + 0xff, d->irq); + return; + } + + xive_do_source_set_mask(xd, true); +} + +static int xive_irq_set_affinity(struct irq_data *d, + const struct cpumask *cpumask, + bool force) +{ + struct xive_irq_data *xd = irq_data_get_irq_handler_data(d); + unsigned int hw_irq = (unsigned int)irqd_to_hwirq(d); + u32 target, old_target; + int rc = 0; + + pr_devel("xive_irq_set_affinity: irq %d\n", d->irq); + + /* Is this valid ? */ + if (cpumask_any_and(cpumask, cpu_online_mask) >= nr_cpu_ids) + return -EINVAL; + + /* + * If existing target is already in the new mask, and is + * online then do nothing. + */ + if (xd->target != XIVE_INVALID_TARGET && + cpu_online(xd->target) && + cpumask_test_cpu(xd->target, cpumask)) + return IRQ_SET_MASK_OK; + + /* Pick a new target */ + target = xive_pick_irq_target(d, cpumask); + + /* No target found */ + if (target == XIVE_INVALID_TARGET) + return -ENXIO; + + /* Sanity check */ + if (WARN_ON(target >= nr_cpu_ids)) + target = smp_processor_id(); + + old_target = xd->target; + + rc = xive_ops->configure_irq(hw_irq, + get_hard_smp_processor_id(target), + xive_irq_priority, d->irq); + if (rc < 0) { + pr_err("Error %d reconfiguring irq %d\n", rc, d->irq); + return rc; + } + + pr_devel(" target: 0x%x\n", target); + xd->target = target; + + /* Give up previous target */ + if (old_target != XIVE_INVALID_TARGET) + xive_dec_target_count(old_target); + + return IRQ_SET_MASK_OK; +} + +static int xive_irq_set_type(struct irq_data *d, unsigned int flow_type) +{ + struct xive_irq_data *xd = irq_data_get_irq_handler_data(d); + + /* + * We only support these. This has really no effect other than setting + * the corresponding descriptor bits mind you but those will in turn + * affect the resend function when re-enabling an edge interrupt. + * + * Set set the default to edge as explained in map(). + */ + if (flow_type == IRQ_TYPE_DEFAULT || flow_type == IRQ_TYPE_NONE) + flow_type = IRQ_TYPE_EDGE_RISING; + + if (flow_type != IRQ_TYPE_EDGE_RISING && + flow_type != IRQ_TYPE_LEVEL_LOW) + return -EINVAL; + + irqd_set_trigger_type(d, flow_type); + + /* + * Double check it matches what the FW thinks + * + * NOTE: We don't know yet if the PAPR interface will provide + * the LSI vs MSI information apart from the device-tree so + * this check might have to move into an optional backend call + * that is specific to the native backend + */ + if ((flow_type == IRQ_TYPE_LEVEL_LOW) != + !!(xd->flags & XIVE_IRQ_FLAG_LSI)) { + pr_warn("Interrupt %d (HW 0x%x) type mismatch, Linux says %s, FW says %s\n", + d->irq, (u32)irqd_to_hwirq(d), + (flow_type == IRQ_TYPE_LEVEL_LOW) ? "Level" : "Edge", + (xd->flags & XIVE_IRQ_FLAG_LSI) ? "Level" : "Edge"); + } + + return IRQ_SET_MASK_OK_NOCOPY; +} + +static int xive_irq_retrigger(struct irq_data *d) +{ + struct xive_irq_data *xd = irq_data_get_irq_handler_data(d); + + /* This should be only for MSIs */ + if (WARN_ON(xd->flags & XIVE_IRQ_FLAG_LSI)) + return 0; + + /* + * To perform a retrigger, we first set the PQ bits to + * 11, then perform an EOI. + */ + xive_poke_esb(xd, XIVE_ESB_SET_PQ_11); + + /* + * Note: We pass "0" to the hw_irq argument in order to + * avoid calling into the backend EOI code which we don't + * want to do in the case of a re-trigger. Backends typically + * only do EOI for LSIs anyway. + */ + xive_do_source_eoi(0, xd); + + return 1; +} + +static struct irq_chip xive_irq_chip = { + .name = "XIVE-IRQ", + .irq_startup = xive_irq_startup, + .irq_shutdown = xive_irq_shutdown, + .irq_eoi = xive_irq_eoi, + .irq_mask = xive_irq_mask, + .irq_unmask = xive_irq_unmask, + .irq_set_affinity = xive_irq_set_affinity, + .irq_set_type = xive_irq_set_type, + .irq_retrigger = xive_irq_retrigger, +}; + +bool is_xive_irq(struct irq_chip *chip) +{ + return chip == &xive_irq_chip; +} + +void xive_cleanup_irq_data(struct xive_irq_data *xd) +{ + if (xd->eoi_mmio) { + iounmap(xd->eoi_mmio); + if (xd->eoi_mmio == xd->trig_mmio) + xd->trig_mmio = NULL; + xd->eoi_mmio = NULL; + } + if (xd->trig_mmio) { + iounmap(xd->trig_mmio); + xd->trig_mmio = NULL; + } +} + +static int xive_irq_alloc_data(unsigned int virq, irq_hw_number_t hw) +{ + struct xive_irq_data *xd; + int rc; + + xd = kzalloc(sizeof(struct xive_irq_data), GFP_KERNEL); + if (!xd) + return -ENOMEM; + rc = xive_ops->populate_irq_data(hw, xd); + if (rc) { + kfree(xd); + return rc; + } + xd->target = XIVE_INVALID_TARGET; + irq_set_handler_data(virq, xd); + + return 0; +} + +static void xive_irq_free_data(unsigned int virq) +{ + struct xive_irq_data *xd = irq_get_handler_data(virq); + + if (!xd) + return; + irq_set_handler_data(virq, NULL); + xive_cleanup_irq_data(xd); + kfree(xd); +} + +#ifdef CONFIG_SMP + +static void xive_cause_ipi(int cpu) +{ + struct xive_cpu *xc; + struct xive_irq_data *xd; + + xc = per_cpu(xive_cpu, cpu); + + DBG_VERBOSE("IPI CPU %d -> %d (HW IRQ 0x%x)\n", + smp_processor_id(), cpu, xc->hw_ipi); + + xd = &xc->ipi_data; + if (WARN_ON(!xd->trig_mmio)) + return; + out_be64(xd->trig_mmio, 0); +} + +static irqreturn_t xive_muxed_ipi_action(int irq, void *dev_id) +{ + return smp_ipi_demux(); +} + +static void xive_ipi_eoi(struct irq_data *d) +{ + struct xive_cpu *xc = __this_cpu_read(xive_cpu); + + /* Handle possible race with unplug and drop stale IPIs */ + if (!xc) + return; + xive_do_source_eoi(xc->hw_ipi, &xc->ipi_data); + xive_do_queue_eoi(xc); +} + +static void xive_ipi_do_nothing(struct irq_data *d) +{ + /* + * Nothing to do, we never mask/unmask IPIs, but the callback + * has to exist for the struct irq_chip. + */ +} + +static struct irq_chip xive_ipi_chip = { + .name = "XIVE-IPI", + .irq_eoi = xive_ipi_eoi, + .irq_mask = xive_ipi_do_nothing, + .irq_unmask = xive_ipi_do_nothing, +}; + +static void __init xive_request_ipi(void) +{ + unsigned int virq; + + /* + * Initialization failed, move on, we might manage to + * reach the point where we display our errors before + * the system falls appart + */ + if (!xive_irq_domain) + return; + + /* Initialize it */ + virq = irq_create_mapping(xive_irq_domain, 0); + xive_ipi_irq = virq; + + WARN_ON(request_irq(virq, xive_muxed_ipi_action, + IRQF_PERCPU | IRQF_NO_THREAD, "IPI", NULL)); +} + +static int xive_setup_cpu_ipi(unsigned int cpu) +{ + struct xive_cpu *xc; + int rc; + + pr_debug("Setting up IPI for CPU %d\n", cpu); + + xc = per_cpu(xive_cpu, cpu); + + /* Check if we are already setup */ + if (xc->hw_ipi != 0) + return 0; + + /* Grab an IPI from the backend, this will populate xc->hw_ipi */ + if (xive_ops->get_ipi(cpu, xc)) + return -EIO; + + /* + * Populate the IRQ data in the xive_cpu structure and + * configure the HW / enable the IPIs. + */ + rc = xive_ops->populate_irq_data(xc->hw_ipi, &xc->ipi_data); + if (rc) { + pr_err("Failed to populate IPI data on CPU %d\n", cpu); + return -EIO; + } + rc = xive_ops->configure_irq(xc->hw_ipi, + get_hard_smp_processor_id(cpu), + xive_irq_priority, xive_ipi_irq); + if (rc) { + pr_err("Failed to map IPI CPU %d\n", cpu); + return -EIO; + } + pr_devel("CPU %d HW IPI %x, virq %d, trig_mmio=%p\n", cpu, + xc->hw_ipi, xive_ipi_irq, xc->ipi_data.trig_mmio); + + /* Unmask it */ + xive_do_source_set_mask(&xc->ipi_data, false); + + return 0; +} + +static void xive_cleanup_cpu_ipi(unsigned int cpu, struct xive_cpu *xc) +{ + /* Disable the IPI and free the IRQ data */ + + /* Already cleaned up ? */ + if (xc->hw_ipi == 0) + return; + + /* Mask the IPI */ + xive_do_source_set_mask(&xc->ipi_data, true); + + /* + * Note: We don't call xive_cleanup_irq_data() to free + * the mappings as this is called from an IPI on kexec + * which is not a safe environment to call iounmap() + */ + + /* Deconfigure/mask in the backend */ + xive_ops->configure_irq(xc->hw_ipi, hard_smp_processor_id(), + 0xff, xive_ipi_irq); + + /* Free the IPIs in the backend */ + xive_ops->put_ipi(cpu, xc); +} + +void __init xive_smp_probe(void) +{ + smp_ops->cause_ipi = xive_cause_ipi; + + /* Register the IPI */ + xive_request_ipi(); + + /* Allocate and setup IPI for the boot CPU */ + xive_setup_cpu_ipi(smp_processor_id()); +} + +#endif /* CONFIG_SMP */ + +static int xive_irq_domain_map(struct irq_domain *h, unsigned int virq, + irq_hw_number_t hw) +{ + int rc; + + /* + * Mark interrupts as edge sensitive by default so that resend + * actually works. Will fix that up below if needed. + */ + irq_clear_status_flags(virq, IRQ_LEVEL); + +#ifdef CONFIG_SMP + /* IPIs are special and come up with HW number 0 */ + if (hw == 0) { + /* + * IPIs are marked per-cpu. We use separate HW interrupts under + * the hood but associated with the same "linux" interrupt + */ + irq_set_chip_and_handler(virq, &xive_ipi_chip, + handle_percpu_irq); + return 0; + } +#endif + + rc = xive_irq_alloc_data(virq, hw); + if (rc) + return rc; + + irq_set_chip_and_handler(virq, &xive_irq_chip, handle_fasteoi_irq); + + return 0; +} + +static void xive_irq_domain_unmap(struct irq_domain *d, unsigned int virq) +{ + struct irq_data *data = irq_get_irq_data(virq); + unsigned int hw_irq; + + /* XXX Assign BAD number */ + if (!data) + return; + hw_irq = (unsigned int)irqd_to_hwirq(data); + if (hw_irq) + xive_irq_free_data(virq); +} + +static int xive_irq_domain_xlate(struct irq_domain *h, struct device_node *ct, + const u32 *intspec, unsigned int intsize, + irq_hw_number_t *out_hwirq, unsigned int *out_flags) + +{ + *out_hwirq = intspec[0]; + + /* + * If intsize is at least 2, we look for the type in the second cell, + * we assume the LSB indicates a level interrupt. + */ + if (intsize > 1) { + if (intspec[1] & 1) + *out_flags = IRQ_TYPE_LEVEL_LOW; + else + *out_flags = IRQ_TYPE_EDGE_RISING; + } else + *out_flags = IRQ_TYPE_LEVEL_LOW; + + return 0; +} + +static int xive_irq_domain_match(struct irq_domain *h, struct device_node *node, + enum irq_domain_bus_token bus_token) +{ + return xive_ops->match(node); +} + +static const struct irq_domain_ops xive_irq_domain_ops = { + .match = xive_irq_domain_match, + .map = xive_irq_domain_map, + .unmap = xive_irq_domain_unmap, + .xlate = xive_irq_domain_xlate, +}; + +static void __init xive_init_host(void) +{ + xive_irq_domain = irq_domain_add_nomap(NULL, XIVE_MAX_IRQ, + &xive_irq_domain_ops, NULL); + if (WARN_ON(xive_irq_domain == NULL)) + return; + irq_set_default_host(xive_irq_domain); +} + +static void xive_cleanup_cpu_queues(unsigned int cpu, struct xive_cpu *xc) +{ + if (xc->queue[xive_irq_priority].qpage) + xive_ops->cleanup_queue(cpu, xc, xive_irq_priority); +} + +static int xive_setup_cpu_queues(unsigned int cpu, struct xive_cpu *xc) +{ + int rc = 0; + + /* We setup 1 queues for now with a 64k page */ + if (!xc->queue[xive_irq_priority].qpage) + rc = xive_ops->setup_queue(cpu, xc, xive_irq_priority); + + return rc; +} + +static int xive_prepare_cpu(unsigned int cpu) +{ + struct xive_cpu *xc; + + xc = per_cpu(xive_cpu, cpu); + if (!xc) { + struct device_node *np; + + xc = kzalloc_node(sizeof(struct xive_cpu), + GFP_KERNEL, cpu_to_node(cpu)); + if (!xc) + return -ENOMEM; + np = of_get_cpu_node(cpu, NULL); + if (np) + xc->chip_id = of_get_ibm_chip_id(np); + of_node_put(np); + + per_cpu(xive_cpu, cpu) = xc; + } + + /* Setup EQs if not already */ + return xive_setup_cpu_queues(cpu, xc); +} + +static void xive_setup_cpu(void) +{ + struct xive_cpu *xc = __this_cpu_read(xive_cpu); + + /* Debug: Dump the TM state */ + pr_devel("CPU %d [HW 0x%02x] VT=%02x\n", + smp_processor_id(), hard_smp_processor_id(), + in_8(xive_tima + xive_tima_offset + TM_WORD2)); + + /* The backend might have additional things to do */ + if (xive_ops->setup_cpu) + xive_ops->setup_cpu(smp_processor_id(), xc); + + /* Set CPPR to 0xff to enable flow of interrupts */ + xc->cppr = 0xff; + out_8(xive_tima + xive_tima_offset + TM_CPPR, 0xff); +} + +#ifdef CONFIG_SMP +void xive_smp_setup_cpu(void) +{ + pr_devel("SMP setup CPU %d\n", smp_processor_id()); + + /* This will have already been done on the boot CPU */ + if (smp_processor_id() != boot_cpuid) + xive_setup_cpu(); + +} + +int xive_smp_prepare_cpu(unsigned int cpu) +{ + int rc; + + /* Allocate per-CPU data and queues */ + rc = xive_prepare_cpu(cpu); + if (rc) + return rc; + + /* Allocate and setup IPI for the new CPU */ + return xive_setup_cpu_ipi(cpu); +} + +#ifdef CONFIG_HOTPLUG_CPU +static void xive_flush_cpu_queue(unsigned int cpu, struct xive_cpu *xc) +{ + u32 irq; + + /* We assume local irqs are disabled */ + WARN_ON(!irqs_disabled()); + + /* Check what's already in the CPU queue */ + while ((irq = xive_scan_interrupts(xc, false)) != 0) { + /* + * We need to re-route that interrupt to its new destination. + * First get and lock the descriptor + */ + struct irq_desc *desc = irq_to_desc(irq); + struct irq_data *d = irq_desc_get_irq_data(desc); + struct xive_irq_data *xd; + unsigned int hw_irq = (unsigned int)irqd_to_hwirq(d); + + /* + * Ignore anything that isn't a XIVE irq and ignore + * IPIs, so can just be dropped. + */ + if (d->domain != xive_irq_domain || hw_irq == 0) + continue; + + /* + * The IRQ should have already been re-routed, it's just a + * stale in the old queue, so re-trigger it in order to make + * it reach is new destination. + */ +#ifdef DEBUG_FLUSH + pr_info("CPU %d: Got irq %d while offline, re-sending...\n", + cpu, irq); +#endif + raw_spin_lock(&desc->lock); + xd = irq_desc_get_handler_data(desc); + + /* + * For LSIs, we EOI, this will cause a resend if it's + * still asserted. Otherwise do an MSI retrigger. + */ + if (xd->flags & XIVE_IRQ_FLAG_LSI) + xive_do_source_eoi(irqd_to_hwirq(d), xd); + else + xive_irq_retrigger(d); + + raw_spin_unlock(&desc->lock); + } +} + +void xive_smp_disable_cpu(void) +{ + struct xive_cpu *xc = __this_cpu_read(xive_cpu); + unsigned int cpu = smp_processor_id(); + + /* Migrate interrupts away from the CPU */ + irq_migrate_all_off_this_cpu(); + + /* Set CPPR to 0 to disable flow of interrupts */ + xc->cppr = 0; + out_8(xive_tima + xive_tima_offset + TM_CPPR, 0); + + /* Flush everything still in the queue */ + xive_flush_cpu_queue(cpu, xc); + + /* Re-enable CPPR */ + xc->cppr = 0xff; + out_8(xive_tima + xive_tima_offset + TM_CPPR, 0xff); +} + +void xive_flush_interrupt(void) +{ + struct xive_cpu *xc = __this_cpu_read(xive_cpu); + unsigned int cpu = smp_processor_id(); + + /* Called if an interrupt occurs while the CPU is hot unplugged */ + xive_flush_cpu_queue(cpu, xc); +} + +#endif /* CONFIG_HOTPLUG_CPU */ + +#endif /* CONFIG_SMP */ + +void xive_kexec_teardown_cpu(int secondary) +{ + struct xive_cpu *xc = __this_cpu_read(xive_cpu); + unsigned int cpu = smp_processor_id(); + + /* Set CPPR to 0 to disable flow of interrupts */ + xc->cppr = 0; + out_8(xive_tima + xive_tima_offset + TM_CPPR, 0); + + /* Backend cleanup if any */ + if (xive_ops->teardown_cpu) + xive_ops->teardown_cpu(cpu, xc); + +#ifdef CONFIG_SMP + /* Get rid of IPI */ + xive_cleanup_cpu_ipi(cpu, xc); +#endif + + /* Disable and free the queues */ + xive_cleanup_cpu_queues(cpu, xc); +} + +void xive_shutdown(void) +{ + xive_ops->shutdown(); +} + +bool xive_core_init(const struct xive_ops *ops, void __iomem *area, u32 offset, + u8 max_prio) +{ + xive_tima = area; + xive_tima_offset = offset; + xive_ops = ops; + xive_irq_priority = max_prio; + + ppc_md.get_irq = xive_get_irq; + __xive_enabled = true; + + pr_devel("Initializing host..\n"); + xive_init_host(); + + pr_devel("Initializing boot CPU..\n"); + + /* Allocate per-CPU data and queues */ + xive_prepare_cpu(smp_processor_id()); + + /* Get ready for interrupts */ + xive_setup_cpu(); + + pr_info("Interrupt handling intialized with %s backend\n", + xive_ops->name); + pr_info("Using priority %d for all interrupts\n", max_prio); + + return true; +} + +static int __init xive_off(char *arg) +{ + xive_cmdline_disabled = true; + return 0; +} +__setup("xive=off", xive_off); diff --git a/arch/powerpc/sysdev/xive/native.c b/arch/powerpc/sysdev/xive/native.c new file mode 100644 index 000000000000..1a726229a427 --- /dev/null +++ b/arch/powerpc/sysdev/xive/native.c @@ -0,0 +1,640 @@ +/* + * Copyright 2016,2017 IBM Corporation. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#define pr_fmt(fmt) "xive: " fmt + +#include <linux/types.h> +#include <linux/irq.h> +#include <linux/debugfs.h> +#include <linux/smp.h> +#include <linux/interrupt.h> +#include <linux/seq_file.h> +#include <linux/init.h> +#include <linux/of.h> +#include <linux/slab.h> +#include <linux/spinlock.h> +#include <linux/delay.h> +#include <linux/cpumask.h> +#include <linux/mm.h> + +#include <asm/prom.h> +#include <asm/io.h> +#include <asm/smp.h> +#include <asm/irq.h> +#include <asm/errno.h> +#include <asm/xive.h> +#include <asm/xive-regs.h> +#include <asm/opal.h> + +#include "xive-internal.h" + + +static u32 xive_provision_size; +static u32 *xive_provision_chips; +static u32 xive_provision_chip_count; +static u32 xive_queue_shift; +static u32 xive_pool_vps = XIVE_INVALID_VP; +static struct kmem_cache *xive_provision_cache; + +int xive_native_populate_irq_data(u32 hw_irq, struct xive_irq_data *data) +{ + __be64 flags, eoi_page, trig_page; + __be32 esb_shift, src_chip; + u64 opal_flags; + s64 rc; + + memset(data, 0, sizeof(*data)); + + rc = opal_xive_get_irq_info(hw_irq, &flags, &eoi_page, &trig_page, + &esb_shift, &src_chip); + if (rc) { + pr_err("opal_xive_get_irq_info(0x%x) returned %lld\n", + hw_irq, rc); + return -EINVAL; + } + + opal_flags = be64_to_cpu(flags); + if (opal_flags & OPAL_XIVE_IRQ_STORE_EOI) + data->flags |= XIVE_IRQ_FLAG_STORE_EOI; + if (opal_flags & OPAL_XIVE_IRQ_LSI) + data->flags |= XIVE_IRQ_FLAG_LSI; + if (opal_flags & OPAL_XIVE_IRQ_SHIFT_BUG) + data->flags |= XIVE_IRQ_FLAG_SHIFT_BUG; + if (opal_flags & OPAL_XIVE_IRQ_MASK_VIA_FW) + data->flags |= XIVE_IRQ_FLAG_MASK_FW; + if (opal_flags & OPAL_XIVE_IRQ_EOI_VIA_FW) + data->flags |= XIVE_IRQ_FLAG_EOI_FW; + data->eoi_page = be64_to_cpu(eoi_page); + data->trig_page = be64_to_cpu(trig_page); + data->esb_shift = be32_to_cpu(esb_shift); + data->src_chip = be32_to_cpu(src_chip); + + data->eoi_mmio = ioremap(data->eoi_page, 1u << data->esb_shift); + if (!data->eoi_mmio) { + pr_err("Failed to map EOI page for irq 0x%x\n", hw_irq); + return -ENOMEM; + } + + if (!data->trig_page) + return 0; + if (data->trig_page == data->eoi_page) { + data->trig_mmio = data->eoi_mmio; + return 0; + } + + data->trig_mmio = ioremap(data->trig_page, 1u << data->esb_shift); + if (!data->trig_mmio) { + pr_err("Failed to map trigger page for irq 0x%x\n", hw_irq); + return -ENOMEM; + } + return 0; +} + +int xive_native_configure_irq(u32 hw_irq, u32 target, u8 prio, u32 sw_irq) +{ + s64 rc; + + for (;;) { + rc = opal_xive_set_irq_config(hw_irq, target, prio, sw_irq); + if (rc != OPAL_BUSY) + break; + msleep(1); + } + return rc == 0 ? 0 : -ENXIO; +} + +/* This can be called multiple time to change a queue configuration */ +int xive_native_configure_queue(u32 vp_id, struct xive_q *q, u8 prio, + __be32 *qpage, u32 order, bool can_escalate) +{ + s64 rc = 0; + __be64 qeoi_page_be; + __be32 esc_irq_be; + u64 flags, qpage_phys; + + /* If there's an actual queue page, clean it */ + if (order) { + if (WARN_ON(!qpage)) + return -EINVAL; + qpage_phys = __pa(qpage); + } else + qpage_phys = 0; + + /* Initialize the rest of the fields */ + q->msk = order ? ((1u << (order - 2)) - 1) : 0; + q->idx = 0; + q->toggle = 0; + + rc = opal_xive_get_queue_info(vp_id, prio, NULL, NULL, + &qeoi_page_be, + &esc_irq_be, + NULL); + if (rc) { + pr_err("Error %lld getting queue info prio %d\n", rc, prio); + rc = -EIO; + goto fail; + } + q->eoi_phys = be64_to_cpu(qeoi_page_be); + + /* Default flags */ + flags = OPAL_XIVE_EQ_ALWAYS_NOTIFY | OPAL_XIVE_EQ_ENABLED; + + /* Escalation needed ? */ + if (can_escalate) { + q->esc_irq = be32_to_cpu(esc_irq_be); + flags |= OPAL_XIVE_EQ_ESCALATE; + } + + /* Configure and enable the queue in HW */ + for (;;) { + rc = opal_xive_set_queue_info(vp_id, prio, qpage_phys, order, flags); + if (rc != OPAL_BUSY) + break; + msleep(1); + } + if (rc) { + pr_err("Error %lld setting queue for prio %d\n", rc, prio); + rc = -EIO; + } else { + /* + * KVM code requires all of the above to be visible before + * q->qpage is set due to how it manages IPI EOIs + */ + wmb(); + q->qpage = qpage; + } +fail: + return rc; +} + +static void __xive_native_disable_queue(u32 vp_id, struct xive_q *q, u8 prio) +{ + s64 rc; + + /* Disable the queue in HW */ + for (;;) { + rc = opal_xive_set_queue_info(vp_id, prio, 0, 0, 0); + if (rc != OPAL_BUSY) + break; + msleep(1); + } + if (rc) + pr_err("Error %lld disabling queue for prio %d\n", rc, prio); +} + +void xive_native_disable_queue(u32 vp_id, struct xive_q *q, u8 prio) +{ + __xive_native_disable_queue(vp_id, q, prio); +} + +static int xive_native_setup_queue(unsigned int cpu, struct xive_cpu *xc, u8 prio) +{ + struct xive_q *q = &xc->queue[prio]; + unsigned int alloc_order; + struct page *pages; + __be32 *qpage; + + alloc_order = (xive_queue_shift > PAGE_SHIFT) ? + (xive_queue_shift - PAGE_SHIFT) : 0; + pages = alloc_pages_node(cpu_to_node(cpu), GFP_KERNEL, alloc_order); + if (!pages) + return -ENOMEM; + qpage = (__be32 *)page_address(pages); + memset(qpage, 0, 1 << xive_queue_shift); + return xive_native_configure_queue(get_hard_smp_processor_id(cpu), + q, prio, qpage, xive_queue_shift, false); +} + +static void xive_native_cleanup_queue(unsigned int cpu, struct xive_cpu *xc, u8 prio) +{ + struct xive_q *q = &xc->queue[prio]; + unsigned int alloc_order; + + /* + * We use the variant with no iounmap as this is called on exec + * from an IPI and iounmap isn't safe + */ + __xive_native_disable_queue(get_hard_smp_processor_id(cpu), q, prio); + alloc_order = (xive_queue_shift > PAGE_SHIFT) ? + (xive_queue_shift - PAGE_SHIFT) : 0; + free_pages((unsigned long)q->qpage, alloc_order); + q->qpage = NULL; +} + +static bool xive_native_match(struct device_node *node) +{ + return of_device_is_compatible(node, "ibm,opal-xive-vc"); +} + +#ifdef CONFIG_SMP +static int xive_native_get_ipi(unsigned int cpu, struct xive_cpu *xc) +{ + struct device_node *np; + unsigned int chip_id; + s64 irq; + + /* Find the chip ID */ + np = of_get_cpu_node(cpu, NULL); + if (np) { + if (of_property_read_u32(np, "ibm,chip-id", &chip_id) < 0) + chip_id = 0; + } + + /* Allocate an IPI and populate info about it */ + for (;;) { + irq = opal_xive_allocate_irq(chip_id); + if (irq == OPAL_BUSY) { + msleep(1); + continue; + } + if (irq < 0) { + pr_err("Failed to allocate IPI on CPU %d\n", cpu); + return -ENXIO; + } + xc->hw_ipi = irq; + break; + } + return 0; +} + +u32 xive_native_alloc_irq(void) +{ + s64 rc; + + for (;;) { + rc = opal_xive_allocate_irq(OPAL_XIVE_ANY_CHIP); + if (rc != OPAL_BUSY) + break; + msleep(1); + } + if (rc < 0) + return 0; + return rc; +} + +void xive_native_free_irq(u32 irq) +{ + for (;;) { + s64 rc = opal_xive_free_irq(irq); + if (rc != OPAL_BUSY) + break; + msleep(1); + } +} + +static void xive_native_put_ipi(unsigned int cpu, struct xive_cpu *xc) +{ + s64 rc; + + /* Free the IPI */ + if (!xc->hw_ipi) + return; + for (;;) { + rc = opal_xive_free_irq(xc->hw_ipi); + if (rc == OPAL_BUSY) { + msleep(1); + continue; + } + xc->hw_ipi = 0; + break; + } +} +#endif /* CONFIG_SMP */ + +static void xive_native_shutdown(void) +{ + /* Switch the XIVE to emulation mode */ + opal_xive_reset(OPAL_XIVE_MODE_EMU); +} + +/* + * Perform an "ack" cycle on the current thread, thus + * grabbing the pending active priorities and updating + * the CPPR to the most favored one. + */ +static void xive_native_update_pending(struct xive_cpu *xc) +{ + u8 he, cppr; + u16 ack; + + /* Perform the acknowledge hypervisor to register cycle */ + ack = be16_to_cpu(__raw_readw(xive_tima + TM_SPC_ACK_HV_REG)); + + /* Synchronize subsequent queue accesses */ + mb(); + + /* + * Grab the CPPR and the "HE" field which indicates the source + * of the hypervisor interrupt (if any) + */ + cppr = ack & 0xff; + he = GETFIELD(TM_QW3_NSR_HE, (ack >> 8)); + switch(he) { + case TM_QW3_NSR_HE_NONE: /* Nothing to see here */ + break; + case TM_QW3_NSR_HE_PHYS: /* Physical thread interrupt */ + if (cppr == 0xff) + return; + /* Mark the priority pending */ + xc->pending_prio |= 1 << cppr; + + /* + * A new interrupt should never have a CPPR less favored + * than our current one. + */ + if (cppr >= xc->cppr) + pr_err("CPU %d odd ack CPPR, got %d at %d\n", + smp_processor_id(), cppr, xc->cppr); + + /* Update our idea of what the CPPR is */ + xc->cppr = cppr; + break; + case TM_QW3_NSR_HE_POOL: /* HV Pool interrupt (unused) */ + case TM_QW3_NSR_HE_LSI: /* Legacy FW LSI (unused) */ + pr_err("CPU %d got unexpected interrupt type HE=%d\n", + smp_processor_id(), he); + return; + } +} + +static void xive_native_eoi(u32 hw_irq) +{ + /* + * Not normally used except if specific interrupts need + * a workaround on EOI. + */ + opal_int_eoi(hw_irq); +} + +static void xive_native_setup_cpu(unsigned int cpu, struct xive_cpu *xc) +{ + s64 rc; + u32 vp; + __be64 vp_cam_be; + u64 vp_cam; + + if (xive_pool_vps == XIVE_INVALID_VP) + return; + + /* Enable the pool VP */ + vp = xive_pool_vps + get_hard_smp_processor_id(cpu); + pr_debug("CPU %d setting up pool VP 0x%x\n", cpu, vp); + for (;;) { + rc = opal_xive_set_vp_info(vp, OPAL_XIVE_VP_ENABLED, 0); + if (rc != OPAL_BUSY) + break; + msleep(1); + } + if (rc) { + pr_err("Failed to enable pool VP on CPU %d\n", cpu); + return; + } + + /* Grab it's CAM value */ + rc = opal_xive_get_vp_info(vp, NULL, &vp_cam_be, NULL, NULL); + if (rc) { + pr_err("Failed to get pool VP info CPU %d\n", cpu); + return; + } + vp_cam = be64_to_cpu(vp_cam_be); + + pr_debug("VP CAM = %llx\n", vp_cam); + + /* Push it on the CPU (set LSMFB to 0xff to skip backlog scan) */ + pr_debug("(Old HW value: %08x)\n", + in_be32(xive_tima + TM_QW2_HV_POOL + TM_WORD2)); + out_be32(xive_tima + TM_QW2_HV_POOL + TM_WORD0, 0xff); + out_be32(xive_tima + TM_QW2_HV_POOL + TM_WORD2, + TM_QW2W2_VP | vp_cam); + pr_debug("(New HW value: %08x)\n", + in_be32(xive_tima + TM_QW2_HV_POOL + TM_WORD2)); +} + +static void xive_native_teardown_cpu(unsigned int cpu, struct xive_cpu *xc) +{ + s64 rc; + u32 vp; + + if (xive_pool_vps == XIVE_INVALID_VP) + return; + + /* Pull the pool VP from the CPU */ + in_be64(xive_tima + TM_SPC_PULL_POOL_CTX); + + /* Disable it */ + vp = xive_pool_vps + get_hard_smp_processor_id(cpu); + for (;;) { + rc = opal_xive_set_vp_info(vp, 0, 0); + if (rc != OPAL_BUSY) + break; + msleep(1); + } +} + +static void xive_native_sync_source(u32 hw_irq) +{ + opal_xive_sync(XIVE_SYNC_EAS, hw_irq); +} + +static const struct xive_ops xive_native_ops = { + .populate_irq_data = xive_native_populate_irq_data, + .configure_irq = xive_native_configure_irq, + .setup_queue = xive_native_setup_queue, + .cleanup_queue = xive_native_cleanup_queue, + .match = xive_native_match, + .shutdown = xive_native_shutdown, + .update_pending = xive_native_update_pending, + .eoi = xive_native_eoi, + .setup_cpu = xive_native_setup_cpu, + .teardown_cpu = xive_native_teardown_cpu, + .sync_source = xive_native_sync_source, +#ifdef CONFIG_SMP + .get_ipi = xive_native_get_ipi, + .put_ipi = xive_native_put_ipi, +#endif /* CONFIG_SMP */ + .name = "native", +}; + +static bool xive_parse_provisioning(struct device_node *np) +{ + int rc; + + if (of_property_read_u32(np, "ibm,xive-provision-page-size", + &xive_provision_size) < 0) + return true; + rc = of_property_count_elems_of_size(np, "ibm,xive-provision-chips", 4); + if (rc < 0) { + pr_err("Error %d getting provision chips array\n", rc); + return false; + } + xive_provision_chip_count = rc; + if (rc == 0) + return true; + + xive_provision_chips = kzalloc(4 * xive_provision_chip_count, + GFP_KERNEL); + if (WARN_ON(!xive_provision_chips)) + return false; + + rc = of_property_read_u32_array(np, "ibm,xive-provision-chips", + xive_provision_chips, + xive_provision_chip_count); + if (rc < 0) { + pr_err("Error %d reading provision chips array\n", rc); + return false; + } + + xive_provision_cache = kmem_cache_create("xive-provision", + xive_provision_size, + xive_provision_size, + 0, NULL); + if (!xive_provision_cache) { + pr_err("Failed to allocate provision cache\n"); + return false; + } + return true; +} + +u32 xive_native_default_eq_shift(void) +{ + return xive_queue_shift; +} + +bool xive_native_init(void) +{ + struct device_node *np; + struct resource r; + void __iomem *tima; + struct property *prop; + u8 max_prio = 7; + const __be32 *p; + u32 val; + s64 rc; + + if (xive_cmdline_disabled) + return false; + + pr_devel("xive_native_init()\n"); + np = of_find_compatible_node(NULL, NULL, "ibm,opal-xive-pe"); + if (!np) { + pr_devel("not found !\n"); + return false; + } + pr_devel("Found %s\n", np->full_name); + + /* Resource 1 is HV window */ + if (of_address_to_resource(np, 1, &r)) { + pr_err("Failed to get thread mgmnt area resource\n"); + return false; + } + tima = ioremap(r.start, resource_size(&r)); + if (!tima) { + pr_err("Failed to map thread mgmnt area\n"); + return false; + } + + /* Read number of priorities */ + if (of_property_read_u32(np, "ibm,xive-#priorities", &val) == 0) + max_prio = val - 1; + + /* Iterate the EQ sizes and pick one */ + of_property_for_each_u32(np, "ibm,xive-eq-sizes", prop, p, val) { + xive_queue_shift = val; + if (val == PAGE_SHIFT) + break; + } + + /* Grab size of provisioning pages */ + xive_parse_provisioning(np); + + /* Switch the XIVE to exploitation mode */ + rc = opal_xive_reset(OPAL_XIVE_MODE_EXPL); + if (rc) { + pr_err("Switch to exploitation mode failed with error %lld\n", rc); + return false; + } + + /* Initialize XIVE core with our backend */ + if (!xive_core_init(&xive_native_ops, tima, TM_QW3_HV_PHYS, + max_prio)) { + opal_xive_reset(OPAL_XIVE_MODE_EMU); + return false; + } + pr_info("Using %dkB queues\n", 1 << (xive_queue_shift - 10)); + return true; +} + +static bool xive_native_provision_pages(void) +{ + u32 i; + void *p; + + for (i = 0; i < xive_provision_chip_count; i++) { + u32 chip = xive_provision_chips[i]; + + /* + * XXX TODO: Try to make the allocation local to the node where + * the chip resides. + */ + p = kmem_cache_alloc(xive_provision_cache, GFP_KERNEL); + if (!p) { + pr_err("Failed to allocate provisioning page\n"); + return false; + } + opal_xive_donate_page(chip, __pa(p)); + } + return true; +} + +u32 xive_native_alloc_vp_block(u32 max_vcpus) +{ + s64 rc; + u32 order; + + order = fls(max_vcpus) - 1; + if (max_vcpus > (1 << order)) + order++; + + pr_info("VP block alloc, for max VCPUs %d use order %d\n", + max_vcpus, order); + + for (;;) { + rc = opal_xive_alloc_vp_block(order); + switch (rc) { + case OPAL_BUSY: + msleep(1); + break; + case OPAL_XIVE_PROVISIONING: + if (!xive_native_provision_pages()) + return XIVE_INVALID_VP; + break; + default: + if (rc < 0) { + pr_err("OPAL failed to allocate VCPUs order %d, err %lld\n", + order, rc); + return XIVE_INVALID_VP; + } + return rc; + } + } +} +EXPORT_SYMBOL_GPL(xive_native_alloc_vp_block); + +void xive_native_free_vp_block(u32 vp_base) +{ + s64 rc; + + if (vp_base == XIVE_INVALID_VP) + return; + + rc = opal_xive_free_vp_block(vp_base); + if (rc < 0) + pr_warn("OPAL error %lld freeing VP block\n", rc); +} +EXPORT_SYMBOL_GPL(xive_native_free_vp_block); diff --git a/arch/powerpc/sysdev/xive/xive-internal.h b/arch/powerpc/sysdev/xive/xive-internal.h new file mode 100644 index 000000000000..d07ef2d29caf --- /dev/null +++ b/arch/powerpc/sysdev/xive/xive-internal.h @@ -0,0 +1,62 @@ +/* + * Copyright 2016,2017 IBM Corporation. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ +#ifndef __XIVE_INTERNAL_H +#define __XIVE_INTERNAL_H + +/* Each CPU carry one of these with various per-CPU state */ +struct xive_cpu { +#ifdef CONFIG_SMP + /* HW irq number and data of IPI */ + u32 hw_ipi; + struct xive_irq_data ipi_data; +#endif /* CONFIG_SMP */ + + int chip_id; + + /* Queue datas. Only one is populated */ +#define XIVE_MAX_QUEUES 8 + struct xive_q queue[XIVE_MAX_QUEUES]; + + /* + * Pending mask. Each bit corresponds to a priority that + * potentially has pending interrupts. + */ + u8 pending_prio; + + /* Cache of HW CPPR */ + u8 cppr; +}; + +/* Backend ops */ +struct xive_ops { + int (*populate_irq_data)(u32 hw_irq, struct xive_irq_data *data); + int (*configure_irq)(u32 hw_irq, u32 target, u8 prio, u32 sw_irq); + int (*setup_queue)(unsigned int cpu, struct xive_cpu *xc, u8 prio); + void (*cleanup_queue)(unsigned int cpu, struct xive_cpu *xc, u8 prio); + void (*setup_cpu)(unsigned int cpu, struct xive_cpu *xc); + void (*teardown_cpu)(unsigned int cpu, struct xive_cpu *xc); + bool (*match)(struct device_node *np); + void (*shutdown)(void); + + void (*update_pending)(struct xive_cpu *xc); + void (*eoi)(u32 hw_irq); + void (*sync_source)(u32 hw_irq); +#ifdef CONFIG_SMP + int (*get_ipi)(unsigned int cpu, struct xive_cpu *xc); + void (*put_ipi)(unsigned int cpu, struct xive_cpu *xc); +#endif + const char *name; +}; + +bool xive_core_init(const struct xive_ops *ops, void __iomem *area, u32 offset, + u8 max_prio); + +extern bool xive_cmdline_disabled; + +#endif /* __XIVE_INTERNAL_H */ diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c index 16321ad9e70c..2b21e90fff8d 100644 --- a/arch/powerpc/xmon/xmon.c +++ b/arch/powerpc/xmon/xmon.c @@ -29,7 +29,9 @@ #include <linux/nmi.h> #include <linux/ctype.h> +#include <asm/debugfs.h> #include <asm/ptrace.h> +#include <asm/smp.h> #include <asm/string.h> #include <asm/prom.h> #include <asm/machdep.h> @@ -48,7 +50,7 @@ #include <asm/reg.h> #include <asm/debug.h> #include <asm/hw_breakpoint.h> - +#include <asm/xive.h> #include <asm/opal.h> #include <asm/firmware.h> @@ -76,6 +78,7 @@ static int xmon_gate; #endif /* CONFIG_SMP */ static unsigned long in_xmon __read_mostly = 0; +static int xmon_on = IS_ENABLED(CONFIG_XMON_DEFAULT); static unsigned long adrs; static int size = 1; @@ -184,8 +187,6 @@ static void dump_tlb_44x(void); static void dump_tlb_book3e(void); #endif -static int xmon_no_auto_backtrace; - #ifdef CONFIG_PPC64 #define REG "%.16lx" #else @@ -232,7 +233,13 @@ Commands:\n\ "\ dr dump stream of raw bytes\n\ dt dump the tracing buffers (uses printk)\n\ - e print exception information\n\ +" +#ifdef CONFIG_PPC_POWERNV +" dx# dump xive on CPU #\n\ + dxi# dump xive irq state #\n\ + dxa dump xive on all CPUs\n" +#endif +" e print exception information\n\ f flush cache\n\ la lookup symbol+offset of specified address\n\ ls lookup address of specified symbol\n\ @@ -884,10 +891,7 @@ cmds(struct pt_regs *excp) last_cmd = NULL; xmon_regs = excp; - if (!xmon_no_auto_backtrace) { - xmon_no_auto_backtrace = 1; - xmon_show_stack(excp->gpr[1], excp->link, excp->nip); - } + xmon_show_stack(excp->gpr[1], excp->link, excp->nip); for(;;) { #ifdef CONFIG_SMP @@ -2338,6 +2342,81 @@ static void dump_pacas(void) } #endif +#ifdef CONFIG_PPC_POWERNV +static void dump_one_xive(int cpu) +{ + unsigned int hwid = get_hard_smp_processor_id(cpu); + + opal_xive_dump(XIVE_DUMP_TM_HYP, hwid); + opal_xive_dump(XIVE_DUMP_TM_POOL, hwid); + opal_xive_dump(XIVE_DUMP_TM_OS, hwid); + opal_xive_dump(XIVE_DUMP_TM_USER, hwid); + opal_xive_dump(XIVE_DUMP_VP, hwid); + opal_xive_dump(XIVE_DUMP_EMU_STATE, hwid); + + if (setjmp(bus_error_jmp) != 0) { + catch_memory_errors = 0; + printf("*** Error dumping xive on cpu %d\n", cpu); + return; + } + + catch_memory_errors = 1; + sync(); + xmon_xive_do_dump(cpu); + sync(); + __delay(200); + catch_memory_errors = 0; +} + +static void dump_all_xives(void) +{ + int cpu; + + if (num_possible_cpus() == 0) { + printf("No possible cpus, use 'dx #' to dump individual cpus\n"); + return; + } + + for_each_possible_cpu(cpu) + dump_one_xive(cpu); +} + +static void dump_one_xive_irq(u32 num) +{ + s64 rc; + __be64 vp; + u8 prio; + __be32 lirq; + + rc = opal_xive_get_irq_config(num, &vp, &prio, &lirq); + xmon_printf("IRQ 0x%x config: vp=0x%llx prio=%d lirq=0x%x (rc=%lld)\n", + num, be64_to_cpu(vp), prio, be32_to_cpu(lirq), rc); +} + +static void dump_xives(void) +{ + unsigned long num; + int c; + + c = inchar(); + if (c == 'a') { + dump_all_xives(); + return; + } else if (c == 'i') { + if (scanhex(&num)) + dump_one_xive_irq(num); + return; + } + + termch = c; /* Put c back, it wasn't 'a' */ + + if (scanhex(&num)) + dump_one_xive(num); + else + dump_one_xive(xmon_owner); +} +#endif /* CONFIG_PPC_POWERNV */ + static void dump_by_size(unsigned long addr, long count, int size) { unsigned char temp[16]; @@ -2386,6 +2465,14 @@ dump(void) return; } #endif +#ifdef CONFIG_PPC_POWERNV + if (c == 'x') { + xmon_start_pagination(); + dump_xives(); + xmon_end_pagination(); + return; + } +#endif if (c == '\n') termch = c; @@ -3070,23 +3157,28 @@ void dump_segments(void) for (i = 0; i < mmu_slb_size; i++) { asm volatile("slbmfee %0,%1" : "=r" (esid) : "r" (i)); asm volatile("slbmfev %0,%1" : "=r" (vsid) : "r" (i)); - if (esid || vsid) { - printf("%02d %016lx %016lx", i, esid, vsid); - if (esid & SLB_ESID_V) { - llp = vsid & SLB_VSID_LLP; - if (vsid & SLB_VSID_B_1T) { - printf(" 1T ESID=%9lx VSID=%13lx LLP:%3lx \n", - GET_ESID_1T(esid), - (vsid & ~SLB_VSID_B) >> SLB_VSID_SHIFT_1T, - llp); - } else { - printf(" 256M ESID=%9lx VSID=%13lx LLP:%3lx \n", - GET_ESID(esid), - (vsid & ~SLB_VSID_B) >> SLB_VSID_SHIFT, - llp); - } - } else - printf("\n"); + + if (!esid && !vsid) + continue; + + printf("%02d %016lx %016lx", i, esid, vsid); + + if (!(esid & SLB_ESID_V)) { + printf("\n"); + continue; + } + + llp = vsid & SLB_VSID_LLP; + if (vsid & SLB_VSID_B_1T) { + printf(" 1T ESID=%9lx VSID=%13lx LLP:%3lx \n", + GET_ESID_1T(esid), + (vsid & ~SLB_VSID_B) >> SLB_VSID_SHIFT_1T, + llp); + } else { + printf(" 256M ESID=%9lx VSID=%13lx LLP:%3lx \n", + GET_ESID(esid), + (vsid & ~SLB_VSID_B) >> SLB_VSID_SHIFT, + llp); } } } @@ -3302,6 +3394,8 @@ static void sysrq_handle_xmon(int key) /* ensure xmon is enabled */ xmon_init(1); debugger(get_irq_regs()); + if (!xmon_on) + xmon_init(0); } static struct sysrq_key_op sysrq_xmon_op = { @@ -3315,10 +3409,37 @@ static int __init setup_xmon_sysrq(void) register_sysrq_key('x', &sysrq_xmon_op); return 0; } -__initcall(setup_xmon_sysrq); +device_initcall(setup_xmon_sysrq); #endif /* CONFIG_MAGIC_SYSRQ */ -static int __initdata xmon_early, xmon_off; +#ifdef CONFIG_DEBUG_FS +static int xmon_dbgfs_set(void *data, u64 val) +{ + xmon_on = !!val; + xmon_init(xmon_on); + + return 0; +} + +static int xmon_dbgfs_get(void *data, u64 *val) +{ + *val = xmon_on; + return 0; +} + +DEFINE_SIMPLE_ATTRIBUTE(xmon_dbgfs_ops, xmon_dbgfs_get, + xmon_dbgfs_set, "%llu\n"); + +static int __init setup_xmon_dbgfs(void) +{ + debugfs_create_file("xmon", 0600, powerpc_debugfs_root, NULL, + &xmon_dbgfs_ops); + return 0; +} +device_initcall(setup_xmon_dbgfs); +#endif /* CONFIG_DEBUG_FS */ + +static int xmon_early __initdata; static int __init early_parse_xmon(char *p) { @@ -3326,12 +3447,12 @@ static int __init early_parse_xmon(char *p) /* just "xmon" is equivalent to "xmon=early" */ xmon_init(1); xmon_early = 1; - } else if (strncmp(p, "on", 2) == 0) + xmon_on = 1; + } else if (strncmp(p, "on", 2) == 0) { xmon_init(1); - else if (strncmp(p, "off", 3) == 0) - xmon_off = 1; - else if (strncmp(p, "nobt", 4) == 0) - xmon_no_auto_backtrace = 1; + xmon_on = 1; + } else if (strncmp(p, "off", 3) == 0) + xmon_on = 0; else return 1; @@ -3341,10 +3462,8 @@ early_param("xmon", early_parse_xmon); void __init xmon_setup(void) { -#ifdef CONFIG_XMON_DEFAULT - if (!xmon_off) + if (xmon_on) xmon_init(1); -#endif if (xmon_early) debugger(NULL); } |