diff options
Diffstat (limited to 'arch/s390')
83 files changed, 2822 insertions, 2575 deletions
diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index 8abe77536d9d..2167bce993ff 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -102,13 +102,13 @@ config S390 select ARCH_INLINE_WRITE_UNLOCK_IRQ select ARCH_INLINE_WRITE_UNLOCK_IRQRESTORE select ARCH_KEEP_MEMBLOCK - select ARCH_SAVE_PAGE_KEYS if HIBERNATION select ARCH_STACKWALK select ARCH_SUPPORTS_ATOMIC_RMW select ARCH_SUPPORTS_NUMA_BALANCING select ARCH_USE_BUILTIN_BSWAP select ARCH_USE_CMPXCHG_LOCKREF select ARCH_WANTS_DYNAMIC_TASK_STRUCT + select ARCH_WANT_DEFAULT_BPF_JIT select ARCH_WANT_IPC_PARSE_VERSION select BUILDTIME_TABLE_SORT select CLONE_BACKWARDS2 @@ -195,6 +195,7 @@ config S390 select ARCH_HAS_FORCE_DMA_UNENCRYPTED select SWIOTLB select GENERIC_ALLOCATOR + imply IMA_SECURE_AND_OR_TRUSTED_BOOT config SCHED_OMIT_FRAME_POINTER @@ -450,14 +451,6 @@ config NR_CPUS config HOTPLUG_CPU def_bool y -# Some NUMA nodes have memory ranges that span -# other nodes. Even though a pfn is valid and -# between a node's start and end pfns, it may not -# reside on that node. See memmap_init_zone() -# for details. <- They meant memory holes! -config NODES_SPAN_OTHER_NODES - def_bool NUMA - config NUMA bool "NUMA support" depends on SCHED_TOPOLOGY @@ -467,58 +460,9 @@ config NUMA This option adds NUMA support to the kernel. - An operation mode can be selected by appending - numa=<method> to the kernel command line. - - The default behaviour is identical to appending numa=plain to - the command line. This will create just one node with all - available memory and all CPUs in it. - config NODES_SHIFT - int "Maximum NUMA nodes (as a power of 2)" - range 1 10 - depends on NUMA - default "4" - help - Specify the maximum number of NUMA nodes available on the target - system. Increases memory reserved to accommodate various tables. - -menu "Select NUMA modes" - depends on NUMA - -config NUMA_EMU - bool "NUMA emulation" - default y - help - Numa emulation mode will split the available system memory into - equal chunks which then are distributed over the configured number - of nodes in a round-robin manner. - - The number of fake nodes is limited by the number of available memory - chunks (i.e. memory size / fake size) and the number of supported - nodes in the kernel. - - The CPUs are assigned to the nodes in a way that partially respects - the original machine topology (if supported by the machine). - Fair distribution of the CPUs is not guaranteed. - -config EMU_SIZE - hex "NUMA emulation memory chunk size" - default 0x10000000 - range 0x400000 0x100000000 - depends on NUMA_EMU - help - Select the default size by which the memory is chopped and then - assigned to emulated NUMA nodes. - - This can be overridden by specifying - - emu_size=<n> - - on the kernel command line where also suffixes K, M, G, and T are - supported. - -endmenu + int + default "1" config SCHED_SMT def_bool n @@ -866,15 +810,6 @@ config SECCOMP If unsure, say Y. -menu "Power Management" - -config ARCH_HIBERNATION_POSSIBLE - def_bool y - -source "kernel/power/Kconfig" - -endmenu - config CCW def_bool y @@ -1009,7 +944,6 @@ config S390_GUEST select TTY select VIRTUALIZATION select VIRTIO - select VIRTIO_CONSOLE help Enabling this option adds support for virtio based paravirtual device drivers on s390. diff --git a/arch/s390/Makefile b/arch/s390/Makefile index e0e3a465bbfd..8dfa2cf1f05c 100644 --- a/arch/s390/Makefile +++ b/arch/s390/Makefile @@ -146,7 +146,7 @@ all: bzImage #KBUILD_IMAGE is necessary for packaging targets like rpm-pkg, deb-pkg... KBUILD_IMAGE := $(boot)/bzImage -install: vmlinux +install: $(Q)$(MAKE) $(build)=$(boot) $@ bzImage: vmlinux diff --git a/arch/s390/appldata/appldata_os.c b/arch/s390/appldata/appldata_os.c index 54f375627532..8bf46d705957 100644 --- a/arch/s390/appldata/appldata_os.c +++ b/arch/s390/appldata/appldata_os.c @@ -75,7 +75,7 @@ struct appldata_os_data { (waiting for I/O) */ /* per cpu data */ - struct appldata_os_per_cpu os_cpu[0]; + struct appldata_os_per_cpu os_cpu[]; } __attribute__((packed)); static struct appldata_os_data *appldata_os_data; diff --git a/arch/s390/boot/.gitignore b/arch/s390/boot/.gitignore index 16ff906e4610..b265bfede188 100644 --- a/arch/s390/boot/.gitignore +++ b/arch/s390/boot/.gitignore @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0-only image bzImage section_cmp.* diff --git a/arch/s390/boot/Makefile b/arch/s390/boot/Makefile index e2c47d3a1c89..45b33b83de08 100644 --- a/arch/s390/boot/Makefile +++ b/arch/s390/boot/Makefile @@ -37,7 +37,7 @@ CFLAGS_sclp_early_core.o += -I$(srctree)/drivers/s390/char obj-y := head.o als.o startup.o mem_detect.o ipl_parm.o ipl_report.o obj-y += string.o ebcdic.o sclp_early_core.o mem.o ipl_vmparm.o cmdline.o obj-y += version.o pgm_check_info.o ctype.o text_dma.o -obj-$(CONFIG_PROTECTED_VIRTUALIZATION_GUEST) += uv.o +obj-$(findstring y, $(CONFIG_PROTECTED_VIRTUALIZATION_GUEST) $(CONFIG_PGSTE)) += uv.o obj-$(CONFIG_RELOCATABLE) += machine_kexec_reloc.o obj-$(CONFIG_RANDOMIZE_BASE) += kaslr.o targets := bzImage startup.a section_cmp.boot.data section_cmp.boot.preserved.data $(obj-y) @@ -70,7 +70,7 @@ $(obj)/compressed/vmlinux: $(obj)/startup.a FORCE $(obj)/startup.a: $(OBJECTS) FORCE $(call if_changed,ar) -install: $(CONFIGURE) $(obj)/bzImage +install: sh -x $(srctree)/$(obj)/install.sh $(KERNELRELEASE) $(obj)/bzImage \ System.map "$(INSTALL_PATH)" diff --git a/arch/s390/boot/compressed/.gitignore b/arch/s390/boot/compressed/.gitignore index e72fcd7ecebb..765a08f1bd77 100644 --- a/arch/s390/boot/compressed/.gitignore +++ b/arch/s390/boot/compressed/.gitignore @@ -1,2 +1,3 @@ +# SPDX-License-Identifier: GPL-2.0-only vmlinux vmlinux.lds diff --git a/arch/s390/boot/install.sh b/arch/s390/boot/install.sh index bed227f267ae..515b27a996b3 100644 --- a/arch/s390/boot/install.sh +++ b/arch/s390/boot/install.sh @@ -21,15 +21,10 @@ if [ -x ~/bin/${INSTALLKERNEL} ]; then exec ~/bin/${INSTALLKERNEL} "$@"; fi if [ -x /sbin/${INSTALLKERNEL} ]; then exec /sbin/${INSTALLKERNEL} "$@"; fi -# Default install - same as make zlilo +echo "Warning: '${INSTALLKERNEL}' command not available - additional " \ + "bootloader config required" >&2 +if [ -f $4/vmlinuz-$1 ]; then mv $4/vmlinuz-$1 $4/vmlinuz-$1.old; fi +if [ -f $4/System.map-$1 ]; then mv $4/System.map-$1 $4/System.map-$1.old; fi -if [ -f $4/vmlinuz ]; then - mv $4/vmlinuz $4/vmlinuz.old -fi - -if [ -f $4/System.map ]; then - mv $4/System.map $4/System.old -fi - -cat $2 > $4/vmlinuz -cp $3 $4/System.map +cat $2 > $4/vmlinuz-$1 +cp $3 $4/System.map-$1 diff --git a/arch/s390/boot/kaslr.c b/arch/s390/boot/kaslr.c index 5d12352545c5..5591243d673e 100644 --- a/arch/s390/boot/kaslr.c +++ b/arch/s390/boot/kaslr.c @@ -75,7 +75,7 @@ static unsigned long get_random(unsigned long limit) *(unsigned long *) prng.parm_block ^= seed; for (i = 0; i < 16; i++) { cpacf_kmc(CPACF_KMC_PRNG, prng.parm_block, - (char *) entropy, (char *) entropy, + (u8 *) entropy, (u8 *) entropy, sizeof(entropy)); memcpy(prng.parm_block, entropy, sizeof(entropy)); } diff --git a/arch/s390/boot/uv.c b/arch/s390/boot/uv.c index ed007f4a6444..8fde561f1d07 100644 --- a/arch/s390/boot/uv.c +++ b/arch/s390/boot/uv.c @@ -3,7 +3,13 @@ #include <asm/facility.h> #include <asm/sections.h> +/* will be used in arch/s390/kernel/uv.c */ +#ifdef CONFIG_PROTECTED_VIRTUALIZATION_GUEST int __bootdata_preserved(prot_virt_guest); +#endif +#if IS_ENABLED(CONFIG_KVM) +struct uv_info __bootdata_preserved(uv_info); +#endif void uv_query_info(void) { @@ -15,10 +21,25 @@ void uv_query_info(void) if (!test_facility(158)) return; - if (uv_call(0, (uint64_t)&uvcb)) + /* rc==0x100 means that there is additional data we do not process */ + if (uv_call(0, (uint64_t)&uvcb) && uvcb.header.rc != 0x100) return; + if (IS_ENABLED(CONFIG_KVM)) { + memcpy(uv_info.inst_calls_list, uvcb.inst_calls_list, sizeof(uv_info.inst_calls_list)); + uv_info.uv_base_stor_len = uvcb.uv_base_stor_len; + uv_info.guest_base_stor_len = uvcb.conf_base_phys_stor_len; + uv_info.guest_virt_base_stor_len = uvcb.conf_base_virt_stor_len; + uv_info.guest_virt_var_stor_len = uvcb.conf_virt_var_stor_len; + uv_info.guest_cpu_stor_len = uvcb.cpu_stor_len; + uv_info.max_sec_stor_addr = ALIGN(uvcb.max_guest_stor_addr, PAGE_SIZE); + uv_info.max_num_sec_conf = uvcb.max_num_sec_conf; + uv_info.max_guest_cpus = uvcb.max_guest_cpus; + } + +#ifdef CONFIG_PROTECTED_VIRTUALIZATION_GUEST if (test_bit_inv(BIT_UVC_CMD_SET_SHARED_ACCESS, (unsigned long *)uvcb.inst_calls_list) && test_bit_inv(BIT_UVC_CMD_REMOVE_SHARED_ACCESS, (unsigned long *)uvcb.inst_calls_list)) prot_virt_guest = 1; +#endif } diff --git a/arch/s390/configs/debug_defconfig b/arch/s390/configs/debug_defconfig index 2e60c80395ab..46038bc58c9e 100644 --- a/arch/s390/configs/debug_defconfig +++ b/arch/s390/configs/debug_defconfig @@ -53,6 +53,7 @@ CONFIG_VFIO_AP=m CONFIG_CRASH_DUMP=y CONFIG_HIBERNATION=y CONFIG_PM_DEBUG=y +CONFIG_PROTECTED_VIRTUALIZATION_GUEST=y CONFIG_CMM=m CONFIG_APPLDATA_BASE=y CONFIG_KVM=m @@ -474,7 +475,6 @@ CONFIG_NLMON=m # CONFIG_NET_VENDOR_EMULEX is not set # CONFIG_NET_VENDOR_EZCHIP is not set # CONFIG_NET_VENDOR_GOOGLE is not set -# CONFIG_NET_VENDOR_HP is not set # CONFIG_NET_VENDOR_HUAWEI is not set # CONFIG_NET_VENDOR_INTEL is not set # CONFIG_NET_VENDOR_MARVELL is not set @@ -532,6 +532,7 @@ CONFIG_INPUT_EVDEV=y # CONFIG_SERIO is not set CONFIG_LEGACY_PTY_COUNT=0 CONFIG_NULL_TTY=m +CONFIG_VIRTIO_CONSOLE=y CONFIG_HW_RANDOM_VIRTIO=m CONFIG_RAW_DRIVER=m CONFIG_HANGCHECK_TIMER=m @@ -684,7 +685,6 @@ CONFIG_CRYPTO_ADIANTUM=m CONFIG_CRYPTO_XCBC=m CONFIG_CRYPTO_VMAC=m CONFIG_CRYPTO_CRC32=m -CONFIG_CRYPTO_XXHASH=m CONFIG_CRYPTO_MICHAEL_MIC=m CONFIG_CRYPTO_RMD128=m CONFIG_CRYPTO_RMD160=m @@ -748,7 +748,6 @@ CONFIG_DEBUG_INFO_DWARF4=y CONFIG_GDB_SCRIPTS=y CONFIG_FRAME_WARN=1024 CONFIG_HEADERS_INSTALL=y -CONFIG_HEADERS_CHECK=y CONFIG_DEBUG_SECTION_MISMATCH=y CONFIG_MAGIC_SYSRQ=y CONFIG_DEBUG_PAGEALLOC=y @@ -772,9 +771,9 @@ CONFIG_DEBUG_MEMORY_INIT=y CONFIG_MEMORY_NOTIFIER_ERROR_INJECT=m CONFIG_DEBUG_PER_CPU_MAPS=y CONFIG_DEBUG_SHIRQ=y +CONFIG_PANIC_ON_OOPS=y CONFIG_DETECT_HUNG_TASK=y CONFIG_WQ_WATCHDOG=y -CONFIG_PANIC_ON_OOPS=y CONFIG_DEBUG_TIMEKEEPING=y CONFIG_PROVE_LOCKING=y CONFIG_LOCK_STAT=y @@ -783,9 +782,20 @@ CONFIG_DEBUG_ATOMIC_SLEEP=y CONFIG_DEBUG_LOCKING_API_SELFTESTS=y CONFIG_DEBUG_SG=y CONFIG_DEBUG_NOTIFIERS=y +CONFIG_BUG_ON_DATA_CORRUPTION=y CONFIG_DEBUG_CREDENTIALS=y CONFIG_RCU_TORTURE_TEST=m CONFIG_RCU_CPU_STALL_TIMEOUT=300 +CONFIG_LATENCYTOP=y +CONFIG_FUNCTION_PROFILER=y +CONFIG_STACK_TRACER=y +CONFIG_IRQSOFF_TRACER=y +CONFIG_PREEMPT_TRACER=y +CONFIG_SCHED_TRACER=y +CONFIG_FTRACE_SYSCALLS=y +CONFIG_BLK_DEV_IO_TRACE=y +CONFIG_HIST_TRIGGERS=y +CONFIG_S390_PTDUMP=y CONFIG_NOTIFIER_ERROR_INJECTION=m CONFIG_NETDEV_NOTIFIER_ERROR_INJECT=m CONFIG_FAULT_INJECTION=y @@ -796,15 +806,6 @@ CONFIG_FAIL_IO_TIMEOUT=y CONFIG_FAIL_FUTEX=y CONFIG_FAULT_INJECTION_DEBUG_FS=y CONFIG_FAULT_INJECTION_STACKTRACE_FILTER=y -CONFIG_LATENCYTOP=y -CONFIG_IRQSOFF_TRACER=y -CONFIG_PREEMPT_TRACER=y -CONFIG_SCHED_TRACER=y -CONFIG_FTRACE_SYSCALLS=y -CONFIG_STACK_TRACER=y -CONFIG_BLK_DEV_IO_TRACE=y -CONFIG_FUNCTION_PROFILER=y -CONFIG_HIST_TRIGGERS=y CONFIG_LKDTM=m CONFIG_TEST_LIST_SORT=y CONFIG_TEST_SORT=y @@ -814,5 +815,3 @@ CONFIG_INTERVAL_TREE_TEST=m CONFIG_PERCPU_TEST=m CONFIG_ATOMIC64_SELFTEST=y CONFIG_TEST_BPF=m -CONFIG_BUG_ON_DATA_CORRUPTION=y -CONFIG_S390_PTDUMP=y diff --git a/arch/s390/configs/defconfig b/arch/s390/configs/defconfig index 25f799849582..7cd0648c1f4e 100644 --- a/arch/s390/configs/defconfig +++ b/arch/s390/configs/defconfig @@ -53,6 +53,7 @@ CONFIG_VFIO_AP=m CONFIG_CRASH_DUMP=y CONFIG_HIBERNATION=y CONFIG_PM_DEBUG=y +CONFIG_PROTECTED_VIRTUALIZATION_GUEST=y CONFIG_CMM=m CONFIG_APPLDATA_BASE=y CONFIG_KVM=m @@ -470,7 +471,6 @@ CONFIG_NLMON=m # CONFIG_NET_VENDOR_EMULEX is not set # CONFIG_NET_VENDOR_EZCHIP is not set # CONFIG_NET_VENDOR_GOOGLE is not set -# CONFIG_NET_VENDOR_HP is not set # CONFIG_NET_VENDOR_HUAWEI is not set # CONFIG_NET_VENDOR_INTEL is not set # CONFIG_NET_VENDOR_MARVELL is not set @@ -528,6 +528,7 @@ CONFIG_INPUT_EVDEV=y # CONFIG_SERIO is not set CONFIG_LEGACY_PTY_COUNT=0 CONFIG_NULL_TTY=m +CONFIG_VIRTIO_CONSOLE=y CONFIG_HW_RANDOM_VIRTIO=m CONFIG_RAW_DRIVER=m CONFIG_HANGCHECK_TIMER=m @@ -677,7 +678,6 @@ CONFIG_CRYPTO_ADIANTUM=m CONFIG_CRYPTO_XCBC=m CONFIG_CRYPTO_VMAC=m CONFIG_CRYPTO_CRC32=m -CONFIG_CRYPTO_XXHASH=m CONFIG_CRYPTO_MICHAEL_MIC=m CONFIG_CRYPTO_RMD128=m CONFIG_CRYPTO_RMD160=m @@ -739,18 +739,18 @@ CONFIG_DEBUG_SECTION_MISMATCH=y CONFIG_MAGIC_SYSRQ=y CONFIG_DEBUG_MEMORY_INIT=y CONFIG_PANIC_ON_OOPS=y +CONFIG_BUG_ON_DATA_CORRUPTION=y CONFIG_RCU_TORTURE_TEST=m CONFIG_RCU_CPU_STALL_TIMEOUT=60 CONFIG_LATENCYTOP=y +CONFIG_FUNCTION_PROFILER=y +CONFIG_STACK_TRACER=y CONFIG_SCHED_TRACER=y CONFIG_FTRACE_SYSCALLS=y -CONFIG_STACK_TRACER=y CONFIG_BLK_DEV_IO_TRACE=y -CONFIG_FUNCTION_PROFILER=y CONFIG_HIST_TRIGGERS=y +CONFIG_S390_PTDUMP=y CONFIG_LKDTM=m CONFIG_PERCPU_TEST=m CONFIG_ATOMIC64_SELFTEST=y CONFIG_TEST_BPF=m -CONFIG_BUG_ON_DATA_CORRUPTION=y -CONFIG_S390_PTDUMP=y diff --git a/arch/s390/crypto/aes_s390.c b/arch/s390/crypto/aes_s390.c index 1c23d84a9097..73044634d342 100644 --- a/arch/s390/crypto/aes_s390.c +++ b/arch/s390/crypto/aes_s390.c @@ -342,6 +342,7 @@ static int cbc_aes_crypt(struct skcipher_request *req, unsigned long modifier) memcpy(walk.iv, param.iv, AES_BLOCK_SIZE); ret = skcipher_walk_done(&walk, nbytes - n); } + memzero_explicit(¶m, sizeof(param)); return ret; } @@ -470,6 +471,8 @@ static int xts_aes_crypt(struct skcipher_request *req, unsigned long modifier) walk.dst.virt.addr, walk.src.virt.addr, n); ret = skcipher_walk_done(&walk, nbytes - n); } + memzero_explicit(&pcc_param, sizeof(pcc_param)); + memzero_explicit(&xts_param, sizeof(xts_param)); return ret; } diff --git a/arch/s390/include/asm/Kbuild b/arch/s390/include/asm/Kbuild index 1832ae6442ef..83f6e85de7bc 100644 --- a/arch/s390/include/asm/Kbuild +++ b/arch/s390/include/asm/Kbuild @@ -5,21 +5,6 @@ generated-y += syscall_table.h generated-y += unistd_nr.h generic-y += asm-offsets.h -generic-y += cacheflush.h -generic-y += device.h -generic-y += dma-mapping.h -generic-y += div64.h -generic-y += emergency-restart.h generic-y += export.h -generic-y += fb.h -generic-y += irq_regs.h -generic-y += irq_work.h -generic-y += kmap_types.h -generic-y += local.h generic-y += local64.h generic-y += mcs_spinlock.h -generic-y += mm-arch-hooks.h -generic-y += mmiowb.h -generic-y += trace_clock.h -generic-y += unaligned.h -generic-y += word-at-a-time.h diff --git a/arch/s390/include/asm/futex.h b/arch/s390/include/asm/futex.h index 5e97a4353147..26f9144562c9 100644 --- a/arch/s390/include/asm/futex.h +++ b/arch/s390/include/asm/futex.h @@ -29,7 +29,6 @@ static inline int arch_futex_atomic_op_inuser(int op, int oparg, int *oval, mm_segment_t old_fs; old_fs = enable_sacf_uaccess(); - pagefault_disable(); switch (op) { case FUTEX_OP_SET: __futex_atomic_op("lr %2,%5\n", @@ -54,7 +53,6 @@ static inline int arch_futex_atomic_op_inuser(int op, int oparg, int *oval, default: ret = -ENOSYS; } - pagefault_enable(); disable_sacf_uaccess(old_fs); if (!ret) diff --git a/arch/s390/include/asm/gmap.h b/arch/s390/include/asm/gmap.h index 37f96b6f0e61..a816fb4734b8 100644 --- a/arch/s390/include/asm/gmap.h +++ b/arch/s390/include/asm/gmap.h @@ -9,6 +9,7 @@ #ifndef _ASM_S390_GMAP_H #define _ASM_S390_GMAP_H +#include <linux/radix-tree.h> #include <linux/refcount.h> /* Generic bits for GMAP notification on DAT table entry changes. */ @@ -31,6 +32,7 @@ * @table: pointer to the page directory * @asce: address space control element for gmap page table * @pfault_enabled: defines if pfaults are applicable for the guest + * @guest_handle: protected virtual machine handle for the ultravisor * @host_to_rmap: radix tree with gmap_rmap lists * @children: list of shadow gmap structures * @pt_list: list of all page tables used in the shadow guest address space @@ -54,6 +56,8 @@ struct gmap { unsigned long asce_end; void *private; bool pfault_enabled; + /* only set for protected virtual machines */ + unsigned long guest_handle; /* Additional data for shadow guest address spaces */ struct radix_tree_root host_to_rmap; struct list_head children; @@ -144,4 +148,6 @@ int gmap_mprotect_notify(struct gmap *, unsigned long start, void gmap_sync_dirty_log_pmd(struct gmap *gmap, unsigned long dirty_bitmap[4], unsigned long gaddr, unsigned long vmaddr); +int gmap_mark_unmergeable(void); +void s390_reset_acc(struct mm_struct *mm); #endif /* _ASM_S390_GMAP_H */ diff --git a/arch/s390/include/asm/hw_irq.h b/arch/s390/include/asm/hw_irq.h index adae176757ae..9078b5b6b837 100644 --- a/arch/s390/include/asm/hw_irq.h +++ b/arch/s390/include/asm/hw_irq.h @@ -7,6 +7,5 @@ void __init init_airq_interrupts(void); void __init init_cio_interrupts(void); -void __init init_ext_interrupts(void); #endif diff --git a/arch/s390/include/asm/ipl.h b/arch/s390/include/asm/ipl.h index 084e71b7272a..b63bd66404b8 100644 --- a/arch/s390/include/asm/ipl.h +++ b/arch/s390/include/asm/ipl.h @@ -119,6 +119,7 @@ enum diag308_subcode { DIAG308_LOAD_NORMAL_DUMP = 4, DIAG308_SET = 5, DIAG308_STORE = 6, + DIAG308_LOAD_NORMAL = 7, }; enum diag308_rc { diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h index 1726224e7772..d6bcd34f3ec3 100644 --- a/arch/s390/include/asm/kvm_host.h +++ b/arch/s390/include/asm/kvm_host.h @@ -127,6 +127,12 @@ struct mcck_volatile_info { #define CR14_INITIAL_MASK (CR14_UNUSED_32 | CR14_UNUSED_33 | \ CR14_EXTERNAL_DAMAGE_SUBMASK) +#define SIDAD_SIZE_MASK 0xff +#define sida_origin(sie_block) \ + ((sie_block)->sidad & PAGE_MASK) +#define sida_size(sie_block) \ + ((((sie_block)->sidad & SIDAD_SIZE_MASK) + 1) * PAGE_SIZE) + #define CPUSTAT_STOPPED 0x80000000 #define CPUSTAT_WAIT 0x10000000 #define CPUSTAT_ECALL_PEND 0x08000000 @@ -160,7 +166,13 @@ struct kvm_s390_sie_block { __u8 reserved08[4]; /* 0x0008 */ #define PROG_IN_SIE (1<<0) __u32 prog0c; /* 0x000c */ - __u8 reserved10[16]; /* 0x0010 */ + union { + __u8 reserved10[16]; /* 0x0010 */ + struct { + __u64 pv_handle_cpu; + __u64 pv_handle_config; + }; + }; #define PROG_BLOCK_SIE (1<<0) #define PROG_REQUEST (1<<1) atomic_t prog20; /* 0x0020 */ @@ -209,10 +221,23 @@ struct kvm_s390_sie_block { #define ICPT_PARTEXEC 0x38 #define ICPT_IOINST 0x40 #define ICPT_KSS 0x5c +#define ICPT_MCHKREQ 0x60 +#define ICPT_INT_ENABLE 0x64 +#define ICPT_PV_INSTR 0x68 +#define ICPT_PV_NOTIFY 0x6c +#define ICPT_PV_PREF 0x70 __u8 icptcode; /* 0x0050 */ __u8 icptstatus; /* 0x0051 */ __u16 ihcpu; /* 0x0052 */ - __u8 reserved54[2]; /* 0x0054 */ + __u8 reserved54; /* 0x0054 */ +#define IICTL_CODE_NONE 0x00 +#define IICTL_CODE_MCHK 0x01 +#define IICTL_CODE_EXT 0x02 +#define IICTL_CODE_IO 0x03 +#define IICTL_CODE_RESTART 0x04 +#define IICTL_CODE_SPECIFICATION 0x10 +#define IICTL_CODE_OPERAND 0x11 + __u8 iictl; /* 0x0055 */ __u16 ipa; /* 0x0056 */ __u32 ipb; /* 0x0058 */ __u32 scaoh; /* 0x005c */ @@ -233,7 +258,7 @@ struct kvm_s390_sie_block { #define ECB3_RI 0x01 __u8 ecb3; /* 0x0063 */ __u32 scaol; /* 0x0064 */ - __u8 reserved68; /* 0x0068 */ + __u8 sdf; /* 0x0068 */ __u8 epdx; /* 0x0069 */ __u8 reserved6a[2]; /* 0x006a */ __u32 todpr; /* 0x006c */ @@ -249,31 +274,58 @@ struct kvm_s390_sie_block { #define HPID_KVM 0x4 #define HPID_VSIE 0x5 __u8 hpid; /* 0x00b8 */ - __u8 reservedb9[11]; /* 0x00b9 */ - __u16 extcpuaddr; /* 0x00c4 */ - __u16 eic; /* 0x00c6 */ + __u8 reservedb9[7]; /* 0x00b9 */ + union { + struct { + __u32 eiparams; /* 0x00c0 */ + __u16 extcpuaddr; /* 0x00c4 */ + __u16 eic; /* 0x00c6 */ + }; + __u64 mcic; /* 0x00c0 */ + } __packed; __u32 reservedc8; /* 0x00c8 */ - __u16 pgmilc; /* 0x00cc */ - __u16 iprcc; /* 0x00ce */ - __u32 dxc; /* 0x00d0 */ - __u16 mcn; /* 0x00d4 */ - __u8 perc; /* 0x00d6 */ - __u8 peratmid; /* 0x00d7 */ + union { + struct { + __u16 pgmilc; /* 0x00cc */ + __u16 iprcc; /* 0x00ce */ + }; + __u32 edc; /* 0x00cc */ + } __packed; + union { + struct { + __u32 dxc; /* 0x00d0 */ + __u16 mcn; /* 0x00d4 */ + __u8 perc; /* 0x00d6 */ + __u8 peratmid; /* 0x00d7 */ + }; + __u64 faddr; /* 0x00d0 */ + } __packed; __u64 peraddr; /* 0x00d8 */ __u8 eai; /* 0x00e0 */ __u8 peraid; /* 0x00e1 */ __u8 oai; /* 0x00e2 */ __u8 armid; /* 0x00e3 */ __u8 reservede4[4]; /* 0x00e4 */ - __u64 tecmc; /* 0x00e8 */ - __u8 reservedf0[12]; /* 0x00f0 */ + union { + __u64 tecmc; /* 0x00e8 */ + struct { + __u16 subchannel_id; /* 0x00e8 */ + __u16 subchannel_nr; /* 0x00ea */ + __u32 io_int_parm; /* 0x00ec */ + __u32 io_int_word; /* 0x00f0 */ + }; + } __packed; + __u8 reservedf4[8]; /* 0x00f4 */ #define CRYCB_FORMAT_MASK 0x00000003 #define CRYCB_FORMAT0 0x00000000 #define CRYCB_FORMAT1 0x00000001 #define CRYCB_FORMAT2 0x00000003 __u32 crycbd; /* 0x00fc */ __u64 gcr[16]; /* 0x0100 */ - __u64 gbea; /* 0x0180 */ + union { + __u64 gbea; /* 0x0180 */ + __u64 sidad; + }; __u8 reserved188[8]; /* 0x0188 */ __u64 sdnxo; /* 0x0190 */ __u8 reserved198[8]; /* 0x0198 */ @@ -292,7 +344,7 @@ struct kvm_s390_sie_block { __u64 itdba; /* 0x01e8 */ __u64 riccbd; /* 0x01f0 */ __u64 gvrd; /* 0x01f8 */ -} __attribute__((packed)); +} __packed __aligned(512); struct kvm_s390_itdb { __u8 data[256]; @@ -301,7 +353,9 @@ struct kvm_s390_itdb { struct sie_page { struct kvm_s390_sie_block sie_block; struct mcck_volatile_info mcck_info; /* 0x0200 */ - __u8 reserved218[1000]; /* 0x0218 */ + __u8 reserved218[360]; /* 0x0218 */ + __u64 pv_grregs[16]; /* 0x0380 */ + __u8 reserved400[512]; /* 0x0400 */ struct kvm_s390_itdb itdb; /* 0x0600 */ __u8 reserved700[2304]; /* 0x0700 */ }; @@ -476,6 +530,7 @@ enum irq_types { IRQ_PEND_PFAULT_INIT, IRQ_PEND_EXT_HOST, IRQ_PEND_EXT_SERVICE, + IRQ_PEND_EXT_SERVICE_EV, IRQ_PEND_EXT_TIMING, IRQ_PEND_EXT_CPU_TIMER, IRQ_PEND_EXT_CLOCK_COMP, @@ -520,6 +575,7 @@ enum irq_types { (1UL << IRQ_PEND_EXT_TIMING) | \ (1UL << IRQ_PEND_EXT_HOST) | \ (1UL << IRQ_PEND_EXT_SERVICE) | \ + (1UL << IRQ_PEND_EXT_SERVICE_EV) | \ (1UL << IRQ_PEND_VIRTIO) | \ (1UL << IRQ_PEND_PFAULT_INIT) | \ (1UL << IRQ_PEND_PFAULT_DONE)) @@ -536,6 +592,13 @@ enum irq_types { #define IRQ_PEND_MCHK_MASK ((1UL << IRQ_PEND_MCHK_REP) | \ (1UL << IRQ_PEND_MCHK_EX)) +#define IRQ_PEND_EXT_II_MASK ((1UL << IRQ_PEND_EXT_CPU_TIMER) | \ + (1UL << IRQ_PEND_EXT_CLOCK_COMP) | \ + (1UL << IRQ_PEND_EXT_EMERGENCY) | \ + (1UL << IRQ_PEND_EXT_EXTERNAL) | \ + (1UL << IRQ_PEND_EXT_SERVICE) | \ + (1UL << IRQ_PEND_EXT_SERVICE_EV)) + struct kvm_s390_interrupt_info { struct list_head list; u64 type; @@ -594,6 +657,7 @@ struct kvm_s390_local_interrupt { struct kvm_s390_float_interrupt { unsigned long pending_irqs; + unsigned long masked_irqs; spinlock_t lock; struct list_head lists[FIRQ_LIST_COUNT]; int counters[FIRQ_MAX_COUNT]; @@ -645,6 +709,11 @@ struct kvm_guestdbg_info_arch { unsigned long last_bp; }; +struct kvm_s390_pv_vcpu { + u64 handle; + unsigned long stor_base; +}; + struct kvm_vcpu_arch { struct kvm_s390_sie_block *sie_block; /* if vsie is active, currently executed shadow sie control block */ @@ -673,6 +742,7 @@ struct kvm_vcpu_arch { __u64 cputm_start; bool gs_enabled; bool skey_enabled; + struct kvm_s390_pv_vcpu pv; }; struct kvm_vm_stat { @@ -701,9 +771,6 @@ struct s390_io_adapter { bool masked; bool swap; bool suppressible; - struct rw_semaphore maps_lock; - struct list_head maps; - atomic_t nr_maps; }; #define MAX_S390_IO_ADAPTERS ((MAX_ISC + 1) * 8) @@ -846,6 +913,13 @@ struct kvm_s390_gisa_interrupt { DECLARE_BITMAP(kicked_mask, KVM_MAX_VCPUS); }; +struct kvm_s390_pv { + u64 handle; + u64 guest_len; + unsigned long stor_base; + void *stor_var; +}; + struct kvm_arch{ void *sca; int use_esca; @@ -881,6 +955,7 @@ struct kvm_arch{ DECLARE_BITMAP(cpu_feat, KVM_S390_VM_CPU_FEAT_NR_BITS); DECLARE_BITMAP(idle_mask, KVM_MAX_VCPUS); struct kvm_s390_gisa_interrupt gisa_int; + struct kvm_s390_pv pv; }; #define KVM_HVA_ERR_BAD (-1UL) @@ -921,7 +996,7 @@ static inline void kvm_arch_hardware_disable(void) {} static inline void kvm_arch_sync_events(struct kvm *kvm) {} static inline void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) {} static inline void kvm_arch_free_memslot(struct kvm *kvm, - struct kvm_memory_slot *free, struct kvm_memory_slot *dont) {} + struct kvm_memory_slot *slot) {} static inline void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen) {} static inline void kvm_arch_flush_shadow_all(struct kvm *kvm) {} static inline void kvm_arch_flush_shadow_memslot(struct kvm *kvm, diff --git a/arch/s390/include/asm/lowcore.h b/arch/s390/include/asm/lowcore.h index 237ee0c4169f..612ed3c6d581 100644 --- a/arch/s390/include/asm/lowcore.h +++ b/arch/s390/include/asm/lowcore.h @@ -141,7 +141,9 @@ struct lowcore { /* br %r1 trampoline */ __u16 br_r1_trampoline; /* 0x0400 */ - __u8 pad_0x0402[0x0e00-0x0402]; /* 0x0402 */ + __u32 return_lpswe; /* 0x0402 */ + __u32 return_mcck_lpswe; /* 0x0406 */ + __u8 pad_0x040a[0x0e00-0x040a]; /* 0x040a */ /* * 0xe00 contains the address of the IPL Parameter Information diff --git a/arch/s390/include/asm/mmu.h b/arch/s390/include/asm/mmu.h index bcfb6371086f..e12ff0f29d1a 100644 --- a/arch/s390/include/asm/mmu.h +++ b/arch/s390/include/asm/mmu.h @@ -16,6 +16,8 @@ typedef struct { unsigned long asce; unsigned long asce_limit; unsigned long vdso_base; + /* The mmu context belongs to a secure guest. */ + atomic_t is_protected; /* * The following bitfields need a down_write on the mm * semaphore when they are written to. As they are only @@ -32,8 +34,6 @@ typedef struct { unsigned int uses_cmm:1; /* The gmaps associated with this context are allowed to use huge pages. */ unsigned int allow_gmap_hpage_1m:1; - /* The mmu context is for compat task */ - unsigned int compat_mm:1; } mm_context_t; #define INIT_MM_CONTEXT(name) \ diff --git a/arch/s390/include/asm/mmu_context.h b/arch/s390/include/asm/mmu_context.h index 8d04e6f3f796..c9f3d8a52756 100644 --- a/arch/s390/include/asm/mmu_context.h +++ b/arch/s390/include/asm/mmu_context.h @@ -18,14 +18,16 @@ static inline int init_new_context(struct task_struct *tsk, struct mm_struct *mm) { + unsigned long asce_type, init_entry; + spin_lock_init(&mm->context.lock); INIT_LIST_HEAD(&mm->context.pgtable_list); INIT_LIST_HEAD(&mm->context.gmap_list); cpumask_clear(&mm->context.cpu_attach_mask); atomic_set(&mm->context.flush_count, 0); + atomic_set(&mm->context.is_protected, 0); mm->context.gmap_asce = 0; mm->context.flush_mm = 0; - mm->context.compat_mm = test_thread_flag(TIF_31BIT); #ifdef CONFIG_PGSTE mm->context.alloc_pgste = page_table_allocate_pgste || test_thread_flag(TIF_PGSTE) || @@ -36,33 +38,34 @@ static inline int init_new_context(struct task_struct *tsk, mm->context.allow_gmap_hpage_1m = 0; #endif switch (mm->context.asce_limit) { - case _REGION2_SIZE: + default: /* - * forked 3-level task, fall through to set new asce with new - * mm->pgd + * context created by exec, the value of asce_limit can + * only be zero in this case */ - case 0: - /* context created by exec, set asce limit to 4TB */ - mm->context.asce_limit = STACK_TOP_MAX; - mm->context.asce = __pa(mm->pgd) | _ASCE_TABLE_LENGTH | - _ASCE_USER_BITS | _ASCE_TYPE_REGION3; + VM_BUG_ON(mm->context.asce_limit); + /* continue as 3-level task */ + mm->context.asce_limit = _REGION2_SIZE; + fallthrough; + case _REGION2_SIZE: + /* forked 3-level task */ + init_entry = _REGION3_ENTRY_EMPTY; + asce_type = _ASCE_TYPE_REGION3; break; - case -PAGE_SIZE: - /* forked 5-level task, set new asce with new_mm->pgd */ - mm->context.asce = __pa(mm->pgd) | _ASCE_TABLE_LENGTH | - _ASCE_USER_BITS | _ASCE_TYPE_REGION1; + case TASK_SIZE_MAX: + /* forked 5-level task */ + init_entry = _REGION1_ENTRY_EMPTY; + asce_type = _ASCE_TYPE_REGION1; break; case _REGION1_SIZE: - /* forked 4-level task, set new asce with new mm->pgd */ - mm->context.asce = __pa(mm->pgd) | _ASCE_TABLE_LENGTH | - _ASCE_USER_BITS | _ASCE_TYPE_REGION2; + /* forked 4-level task */ + init_entry = _REGION2_ENTRY_EMPTY; + asce_type = _ASCE_TYPE_REGION2; break; - case _REGION3_SIZE: - /* forked 2-level compat task, set new asce with new mm->pgd */ - mm->context.asce = __pa(mm->pgd) | _ASCE_TABLE_LENGTH | - _ASCE_USER_BITS | _ASCE_TYPE_SEGMENT; } - crst_table_init((unsigned long *) mm->pgd, pgd_entry_type(mm)); + mm->context.asce = __pa(mm->pgd) | _ASCE_TABLE_LENGTH | + _ASCE_USER_BITS | asce_type; + crst_table_init((unsigned long *) mm->pgd, init_entry); return 0; } diff --git a/arch/s390/include/asm/numa.h b/arch/s390/include/asm/numa.h index 35f8cbe7e5bb..23cd5d1b734b 100644 --- a/arch/s390/include/asm/numa.h +++ b/arch/s390/include/asm/numa.h @@ -13,24 +13,13 @@ #ifdef CONFIG_NUMA #include <linux/numa.h> -#include <linux/cpumask.h> void numa_setup(void); -int numa_pfn_to_nid(unsigned long pfn); -int __node_distance(int a, int b); -void numa_update_cpu_topology(void); - -extern cpumask_t node_to_cpumask_map[MAX_NUMNODES]; -extern int numa_debug_enabled; #else static inline void numa_setup(void) { } -static inline void numa_update_cpu_topology(void) { } -static inline int numa_pfn_to_nid(unsigned long pfn) -{ - return 0; -} #endif /* CONFIG_NUMA */ + #endif /* _ASM_S390_NUMA_H */ diff --git a/arch/s390/include/asm/page.h b/arch/s390/include/asm/page.h index 85e944f04c70..f2d4c1bd3429 100644 --- a/arch/s390/include/asm/page.h +++ b/arch/s390/include/asm/page.h @@ -42,7 +42,7 @@ void __storage_key_init_range(unsigned long start, unsigned long end); static inline void storage_key_init_range(unsigned long start, unsigned long end) { - if (PAGE_DEFAULT_KEY) + if (PAGE_DEFAULT_KEY != 0) __storage_key_init_range(start, end); } @@ -153,6 +153,11 @@ static inline int devmem_is_allowed(unsigned long pfn) #define HAVE_ARCH_FREE_PAGE #define HAVE_ARCH_ALLOC_PAGE +#if IS_ENABLED(CONFIG_PGSTE) +int arch_make_page_accessible(struct page *page); +#define HAVE_ARCH_MAKE_PAGE_ACCESSIBLE +#endif + #endif /* !__ASSEMBLY__ */ #define __PAGE_OFFSET 0x0UL @@ -161,20 +166,20 @@ static inline int devmem_is_allowed(unsigned long pfn) #define __pa(x) ((unsigned long)(x)) #define __va(x) ((void *)(unsigned long)(x)) -#define virt_to_pfn(kaddr) (__pa(kaddr) >> PAGE_SHIFT) -#define pfn_to_virt(pfn) __va((pfn) << PAGE_SHIFT) +#define phys_to_pfn(phys) ((phys) >> PAGE_SHIFT) +#define pfn_to_phys(pfn) ((pfn) << PAGE_SHIFT) + +#define phys_to_page(phys) pfn_to_page(phys_to_pfn(phys)) +#define page_to_phys(page) pfn_to_phys(page_to_pfn(page)) + +#define pfn_to_virt(pfn) __va(pfn_to_phys(pfn)) +#define virt_to_pfn(kaddr) (phys_to_pfn(__pa(kaddr))) #define pfn_to_kaddr(pfn) pfn_to_virt(pfn) #define virt_to_page(kaddr) pfn_to_page(virt_to_pfn(kaddr)) #define page_to_virt(page) pfn_to_virt(page_to_pfn(page)) -#define phys_to_pfn(kaddr) ((kaddr) >> PAGE_SHIFT) -#define pfn_to_phys(pfn) ((pfn) << PAGE_SHIFT) - -#define phys_to_page(kaddr) pfn_to_page(phys_to_pfn(kaddr)) -#define page_to_phys(page) (page_to_pfn(page) << PAGE_SHIFT) - -#define virt_addr_valid(kaddr) pfn_valid(__pa(kaddr) >> PAGE_SHIFT) +#define virt_addr_valid(kaddr) pfn_valid(virt_to_pfn(kaddr)) #define VM_DATA_DEFAULT_FLAGS (VM_READ | VM_WRITE | \ VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC) diff --git a/arch/s390/include/asm/pci.h b/arch/s390/include/asm/pci.h index b05187ce5dbd..7485ee561fec 100644 --- a/arch/s390/include/asm/pci.h +++ b/arch/s390/include/asm/pci.h @@ -5,6 +5,7 @@ #include <linux/pci.h> #include <linux/mutex.h> #include <linux/iommu.h> +#include <linux/pci_hotplug.h> #include <asm-generic/pci.h> #include <asm/pci_clp.h> #include <asm/pci_debug.h> @@ -25,6 +26,7 @@ int pci_proc_domain(struct pci_bus *); #define ZPCI_NR_DMA_SPACES 1 #define ZPCI_NR_DEVICES CONFIG_PCI_NR_FUNCTIONS +#define ZPCI_DOMAIN_BITMAP_SIZE (1 << 16) /* PCI Function Controls */ #define ZPCI_FC_FN_ENABLED 0x80 @@ -96,6 +98,7 @@ struct s390_domain; struct zpci_dev { struct pci_bus *bus; struct list_head entry; /* list of all zpci_devices, needed for hotplug, etc. */ + struct hotplug_slot hotplug_slot; enum zpci_state state; u32 fid; /* function ID, used by sclp */ @@ -186,6 +189,9 @@ int clp_enable_fh(struct zpci_dev *, u8); int clp_disable_fh(struct zpci_dev *); int clp_get_state(u32 fid, enum zpci_state *state); +/* UID */ +void update_uid_checking(bool new); + /* IOMMU Interface */ int zpci_init_iommu(struct zpci_dev *zdev); void zpci_destroy_iommu(struct zpci_dev *zdev); diff --git a/arch/s390/include/asm/pgalloc.h b/arch/s390/include/asm/pgalloc.h index 77606c4acd58..74a352f8c0d1 100644 --- a/arch/s390/include/asm/pgalloc.h +++ b/arch/s390/include/asm/pgalloc.h @@ -34,19 +34,21 @@ static inline void crst_table_init(unsigned long *crst, unsigned long entry) memset64((u64 *)crst, entry, _CRST_ENTRIES); } -static inline unsigned long pgd_entry_type(struct mm_struct *mm) +int crst_table_upgrade(struct mm_struct *mm, unsigned long limit); + +static inline unsigned long check_asce_limit(struct mm_struct *mm, unsigned long addr, + unsigned long len) { - if (mm_pmd_folded(mm)) - return _SEGMENT_ENTRY_EMPTY; - if (mm_pud_folded(mm)) - return _REGION3_ENTRY_EMPTY; - if (mm_p4d_folded(mm)) - return _REGION2_ENTRY_EMPTY; - return _REGION1_ENTRY_EMPTY; -} + int rc; -int crst_table_upgrade(struct mm_struct *mm, unsigned long limit); -void crst_table_downgrade(struct mm_struct *); + if (addr + len > mm->context.asce_limit && + addr + len <= TASK_SIZE) { + rc = crst_table_upgrade(mm, addr + len); + if (rc) + return (unsigned long) rc; + } + return addr; +} static inline p4d_t *p4d_alloc_one(struct mm_struct *mm, unsigned long address) { @@ -116,24 +118,11 @@ static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd) static inline pgd_t *pgd_alloc(struct mm_struct *mm) { - unsigned long *table = crst_table_alloc(mm); - - if (!table) - return NULL; - if (mm->context.asce_limit == _REGION3_SIZE) { - /* Forking a compat process with 2 page table levels */ - if (!pgtable_pmd_page_ctor(virt_to_page(table))) { - crst_table_free(mm, table); - return NULL; - } - } - return (pgd_t *) table; + return (pgd_t *) crst_table_alloc(mm); } static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd) { - if (mm->context.asce_limit == _REGION3_SIZE) - pgtable_pmd_page_dtor(virt_to_page(pgd)); crst_table_free(mm, (unsigned long *) pgd); } diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h index 137a3920ca36..6076c8c912d2 100644 --- a/arch/s390/include/asm/pgtable.h +++ b/arch/s390/include/asm/pgtable.h @@ -19,6 +19,7 @@ #include <linux/atomic.h> #include <asm/bug.h> #include <asm/page.h> +#include <asm/uv.h> extern pgd_t swapper_pg_dir[]; extern void paging_init(void); @@ -520,6 +521,15 @@ static inline int mm_has_pgste(struct mm_struct *mm) return 0; } +static inline int mm_is_protected(struct mm_struct *mm) +{ +#ifdef CONFIG_PGSTE + if (unlikely(atomic_read(&mm->context.is_protected))) + return 1; +#endif + return 0; +} + static inline int mm_alloc_pgste(struct mm_struct *mm) { #ifdef CONFIG_PGSTE @@ -752,6 +762,12 @@ static inline int pmd_write(pmd_t pmd) return (pmd_val(pmd) & _SEGMENT_ENTRY_WRITE) != 0; } +#define pud_write pud_write +static inline int pud_write(pud_t pud) +{ + return (pud_val(pud) & _REGION3_ENTRY_WRITE) != 0; +} + static inline int pmd_dirty(pmd_t pmd) { return (pmd_val(pmd) & _SEGMENT_ENTRY_DIRTY) != 0; @@ -1061,7 +1077,12 @@ static inline int ptep_clear_flush_young(struct vm_area_struct *vma, static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { - return ptep_xchg_lazy(mm, addr, ptep, __pte(_PAGE_INVALID)); + pte_t res; + + res = ptep_xchg_lazy(mm, addr, ptep, __pte(_PAGE_INVALID)); + if (mm_is_protected(mm) && pte_present(res)) + uv_convert_from_secure(pte_val(res) & PAGE_MASK); + return res; } #define __HAVE_ARCH_PTEP_MODIFY_PROT_TRANSACTION @@ -1073,7 +1094,12 @@ void ptep_modify_prot_commit(struct vm_area_struct *, unsigned long, static inline pte_t ptep_clear_flush(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) { - return ptep_xchg_direct(vma->vm_mm, addr, ptep, __pte(_PAGE_INVALID)); + pte_t res; + + res = ptep_xchg_direct(vma->vm_mm, addr, ptep, __pte(_PAGE_INVALID)); + if (mm_is_protected(vma->vm_mm) && pte_present(res)) + uv_convert_from_secure(pte_val(res) & PAGE_MASK); + return res; } /* @@ -1088,12 +1114,17 @@ static inline pte_t ptep_get_and_clear_full(struct mm_struct *mm, unsigned long addr, pte_t *ptep, int full) { + pte_t res; + if (full) { - pte_t pte = *ptep; + res = *ptep; *ptep = __pte(_PAGE_INVALID); - return pte; + } else { + res = ptep_xchg_lazy(mm, addr, ptep, __pte(_PAGE_INVALID)); } - return ptep_xchg_lazy(mm, addr, ptep, __pte(_PAGE_INVALID)); + if (mm_is_protected(mm) && pte_present(res)) + uv_convert_from_secure(pte_val(res) & PAGE_MASK); + return res; } #define __HAVE_ARCH_PTEP_SET_WRPROTECT diff --git a/arch/s390/include/asm/processor.h b/arch/s390/include/asm/processor.h index 361ef5eda468..555d148ccf32 100644 --- a/arch/s390/include/asm/processor.h +++ b/arch/s390/include/asm/processor.h @@ -84,7 +84,6 @@ void s390_update_cpu_mhz(void); void cpu_detect_mhz_feature(void); extern const struct seq_operations cpuinfo_op; -extern int sysctl_ieee_emulation_warnings; extern void execve_tail(void); extern void __bpon(void); @@ -93,15 +92,15 @@ extern void __bpon(void); */ #define TASK_SIZE_OF(tsk) (test_tsk_thread_flag(tsk, TIF_31BIT) ? \ - (1UL << 31) : -PAGE_SIZE) + _REGION3_SIZE : TASK_SIZE_MAX) #define TASK_UNMAPPED_BASE (test_thread_flag(TIF_31BIT) ? \ - (1UL << 30) : (1UL << 41)) + (_REGION3_SIZE >> 1) : (_REGION2_SIZE >> 1)) #define TASK_SIZE TASK_SIZE_OF(current) #define TASK_SIZE_MAX (-PAGE_SIZE) #define STACK_TOP (test_thread_flag(TIF_31BIT) ? \ - (1UL << 31) : (1UL << 42)) -#define STACK_TOP_MAX (1UL << 42) + _REGION3_SIZE : _REGION2_SIZE) +#define STACK_TOP_MAX _REGION2_SIZE #define HAVE_ARCH_PICK_MMAP_LAYOUT @@ -162,6 +161,7 @@ typedef struct thread_struct thread_struct; #define INIT_THREAD { \ .ksp = sizeof(init_stack) + (unsigned long) &init_stack, \ .fpu.regs = (void *) init_task.thread.fpu.fprs, \ + .last_break = 1, \ } /* @@ -178,7 +178,6 @@ typedef struct thread_struct thread_struct; regs->psw.mask = PSW_USER_BITS | PSW_MASK_BA; \ regs->psw.addr = new_psw; \ regs->gprs[15] = new_stackp; \ - crst_table_downgrade(current->mm); \ execve_tail(); \ } while (0) diff --git a/arch/s390/include/asm/qdio.h b/arch/s390/include/asm/qdio.h index 71e3f0146cda..e577f8533009 100644 --- a/arch/s390/include/asm/qdio.h +++ b/arch/s390/include/asm/qdio.h @@ -201,7 +201,7 @@ struct slib { * @scount: SBAL count * @sflags: whole SBAL flags * @length: length - * @addr: address + * @addr: absolute data address */ struct qdio_buffer_element { u8 eflags; @@ -211,7 +211,7 @@ struct qdio_buffer_element { u8 scount; u8 sflags; u32 length; - void *addr; + u64 addr; } __attribute__ ((packed, aligned(16))); /** @@ -227,7 +227,7 @@ struct qdio_buffer { * @sbal: absolute SBAL address */ struct sl_element { - unsigned long sbal; + u64 sbal; } __attribute__ ((packed)); /** @@ -338,7 +338,7 @@ typedef void qdio_handler_t(struct ccw_device *, unsigned int, int, * @no_output_qs: number of output queues * @input_handler: handler to be called for input queues * @output_handler: handler to be called for output queues - * @queue_start_poll_array: polling handlers (one per input queue or NULL) + * @irq_poll: Data IRQ polling handler (NULL when not supported) * @scan_threshold: # of in-use buffers that triggers scan on output queue * @int_parm: interruption parameter * @input_sbal_addr_array: address of no_input_qs * 128 pointers @@ -359,8 +359,7 @@ struct qdio_initialize { unsigned int no_output_qs; qdio_handler_t *input_handler; qdio_handler_t *output_handler; - void (**queue_start_poll_array) (struct ccw_device *, int, - unsigned long); + void (*irq_poll)(struct ccw_device *cdev, unsigned long data); unsigned int scan_threshold; unsigned long int_parm; struct qdio_buffer **input_sbal_addr_array; @@ -415,8 +414,8 @@ extern int qdio_activate(struct ccw_device *); extern void qdio_release_aob(struct qaob *); extern int do_QDIO(struct ccw_device *, unsigned int, int, unsigned int, unsigned int); -extern int qdio_start_irq(struct ccw_device *, int); -extern int qdio_stop_irq(struct ccw_device *, int); +extern int qdio_start_irq(struct ccw_device *cdev); +extern int qdio_stop_irq(struct ccw_device *cdev); extern int qdio_get_next_buffers(struct ccw_device *, int, int *, int *); extern int qdio_inspect_queue(struct ccw_device *cdev, unsigned int nr, bool is_input, unsigned int *bufnr, diff --git a/arch/s390/include/asm/setup.h b/arch/s390/include/asm/setup.h index b241ddb67caf..534f212753d6 100644 --- a/arch/s390/include/asm/setup.h +++ b/arch/s390/include/asm/setup.h @@ -8,6 +8,7 @@ #include <linux/bits.h> #include <uapi/asm/setup.h> +#include <linux/build_bug.h> #define EP_OFFSET 0x10008 #define EP_STRING "S390EP" @@ -162,6 +163,12 @@ static inline unsigned long kaslr_offset(void) return __kaslr_offset; } +static inline u32 gen_lpswe(unsigned long addr) +{ + BUILD_BUG_ON(addr > 0xfff); + return 0xb2b20000 | addr; +} + #else /* __ASSEMBLY__ */ #define IPL_DEVICE (IPL_DEVICE_OFFSET) diff --git a/arch/s390/include/asm/smp.h b/arch/s390/include/asm/smp.h index b157a81fb977..231a51e870fe 100644 --- a/arch/s390/include/asm/smp.h +++ b/arch/s390/include/asm/smp.h @@ -34,6 +34,7 @@ extern int smp_vcpu_scheduled(int cpu); extern void smp_yield_cpu(int cpu); extern void smp_cpu_set_polarization(int cpu, int val); extern int smp_cpu_get_polarization(int cpu); +extern int smp_cpu_get_cpu_address(int cpu); extern void smp_fill_possible_mask(void); extern void smp_detect_cpus(void); diff --git a/arch/s390/include/asm/timex.h b/arch/s390/include/asm/timex.h index 670f14a228e5..6bf3a45ccfec 100644 --- a/arch/s390/include/asm/timex.h +++ b/arch/s390/include/asm/timex.h @@ -155,7 +155,7 @@ static inline void get_tod_clock_ext(char *clk) static inline unsigned long long get_tod_clock(void) { - unsigned char clk[STORE_CLOCK_EXT_SIZE]; + char clk[STORE_CLOCK_EXT_SIZE]; get_tod_clock_ext(clk); return *((unsigned long long *)&clk[1]); diff --git a/arch/s390/include/asm/topology.h b/arch/s390/include/asm/topology.h index cca406fdbe51..fbb507504a3b 100644 --- a/arch/s390/include/asm/topology.h +++ b/arch/s390/include/asm/topology.h @@ -16,8 +16,8 @@ struct cpu_topology_s390 { unsigned short socket_id; unsigned short book_id; unsigned short drawer_id; - unsigned short node_id; unsigned short dedicated : 1; + int booted_cores; cpumask_t thread_mask; cpumask_t core_mask; cpumask_t book_mask; @@ -25,7 +25,6 @@ struct cpu_topology_s390 { }; extern struct cpu_topology_s390 cpu_topology[NR_CPUS]; -extern cpumask_t cpus_with_topology; #define topology_physical_package_id(cpu) (cpu_topology[cpu].socket_id) #define topology_thread_id(cpu) (cpu_topology[cpu].thread_id) @@ -37,6 +36,7 @@ extern cpumask_t cpus_with_topology; #define topology_drawer_id(cpu) (cpu_topology[cpu].drawer_id) #define topology_drawer_cpumask(cpu) (&cpu_topology[cpu].drawer_mask) #define topology_cpu_dedicated(cpu) (cpu_topology[cpu].dedicated) +#define topology_booted_cores(cpu) (cpu_topology[cpu].booted_cores) #define mc_capable() 1 @@ -45,6 +45,7 @@ int topology_cpu_init(struct cpu *); int topology_set_cpu_management(int fc); void topology_schedule_update(void); void store_topology(struct sysinfo_15_1_x *info); +void update_cpu_masks(void); void topology_expect_change(void); const struct cpumask *cpu_coregroup_mask(int cpu); @@ -54,6 +55,8 @@ static inline void topology_init_early(void) { } static inline void topology_schedule_update(void) { } static inline int topology_cpu_init(struct cpu *cpu) { return 0; } static inline int topology_cpu_dedicated(int cpu_nr) { return 0; } +static inline int topology_booted_cores(int cpu_nr) { return 1; } +static inline void update_cpu_masks(void) { } static inline void topology_expect_change(void) { } #endif /* CONFIG_SCHED_TOPOLOGY */ @@ -71,19 +74,23 @@ static inline void topology_expect_change(void) { } #define cpu_to_node cpu_to_node static inline int cpu_to_node(int cpu) { - return cpu_topology[cpu].node_id; + return 0; } /* Returns a pointer to the cpumask of CPUs on node 'node'. */ #define cpumask_of_node cpumask_of_node static inline const struct cpumask *cpumask_of_node(int node) { - return &node_to_cpumask_map[node]; + return cpu_possible_mask; } #define pcibus_to_node(bus) __pcibus_to_node(bus) #define node_distance(a, b) __node_distance(a, b) +static inline int __node_distance(int a, int b) +{ + return 0; +} #else /* !CONFIG_NUMA */ diff --git a/arch/s390/include/asm/uv.h b/arch/s390/include/asm/uv.h index 4093a2856929..cff4b4c99b75 100644 --- a/arch/s390/include/asm/uv.h +++ b/arch/s390/include/asm/uv.h @@ -14,23 +14,62 @@ #include <linux/types.h> #include <linux/errno.h> #include <linux/bug.h> +#include <linux/sched.h> #include <asm/page.h> +#include <asm/gmap.h> #define UVC_RC_EXECUTED 0x0001 #define UVC_RC_INV_CMD 0x0002 #define UVC_RC_INV_STATE 0x0003 #define UVC_RC_INV_LEN 0x0005 #define UVC_RC_NO_RESUME 0x0007 +#define UVC_RC_NEED_DESTROY 0x8000 #define UVC_CMD_QUI 0x0001 +#define UVC_CMD_INIT_UV 0x000f +#define UVC_CMD_CREATE_SEC_CONF 0x0100 +#define UVC_CMD_DESTROY_SEC_CONF 0x0101 +#define UVC_CMD_CREATE_SEC_CPU 0x0120 +#define UVC_CMD_DESTROY_SEC_CPU 0x0121 +#define UVC_CMD_CONV_TO_SEC_STOR 0x0200 +#define UVC_CMD_CONV_FROM_SEC_STOR 0x0201 +#define UVC_CMD_SET_SEC_CONF_PARAMS 0x0300 +#define UVC_CMD_UNPACK_IMG 0x0301 +#define UVC_CMD_VERIFY_IMG 0x0302 +#define UVC_CMD_CPU_RESET 0x0310 +#define UVC_CMD_CPU_RESET_INITIAL 0x0311 +#define UVC_CMD_PREPARE_RESET 0x0320 +#define UVC_CMD_CPU_RESET_CLEAR 0x0321 +#define UVC_CMD_CPU_SET_STATE 0x0330 +#define UVC_CMD_SET_UNSHARE_ALL 0x0340 +#define UVC_CMD_PIN_PAGE_SHARED 0x0341 +#define UVC_CMD_UNPIN_PAGE_SHARED 0x0342 #define UVC_CMD_SET_SHARED_ACCESS 0x1000 #define UVC_CMD_REMOVE_SHARED_ACCESS 0x1001 /* Bits in installed uv calls */ enum uv_cmds_inst { BIT_UVC_CMD_QUI = 0, + BIT_UVC_CMD_INIT_UV = 1, + BIT_UVC_CMD_CREATE_SEC_CONF = 2, + BIT_UVC_CMD_DESTROY_SEC_CONF = 3, + BIT_UVC_CMD_CREATE_SEC_CPU = 4, + BIT_UVC_CMD_DESTROY_SEC_CPU = 5, + BIT_UVC_CMD_CONV_TO_SEC_STOR = 6, + BIT_UVC_CMD_CONV_FROM_SEC_STOR = 7, BIT_UVC_CMD_SET_SHARED_ACCESS = 8, BIT_UVC_CMD_REMOVE_SHARED_ACCESS = 9, + BIT_UVC_CMD_SET_SEC_PARMS = 11, + BIT_UVC_CMD_UNPACK_IMG = 13, + BIT_UVC_CMD_VERIFY_IMG = 14, + BIT_UVC_CMD_CPU_RESET = 15, + BIT_UVC_CMD_CPU_RESET_INITIAL = 16, + BIT_UVC_CMD_CPU_SET_STATE = 17, + BIT_UVC_CMD_PREPARE_RESET = 18, + BIT_UVC_CMD_CPU_PERFORM_CLEAR_RESET = 19, + BIT_UVC_CMD_UNSHARE_ALL = 20, + BIT_UVC_CMD_PIN_PAGE_SHARED = 21, + BIT_UVC_CMD_UNPIN_PAGE_SHARED = 22, }; struct uv_cb_header { @@ -40,13 +79,127 @@ struct uv_cb_header { u16 rrc; /* Return Reason Code */ } __packed __aligned(8); +/* Query Ultravisor Information */ struct uv_cb_qui { struct uv_cb_header header; u64 reserved08; u64 inst_calls_list[4]; - u64 reserved30[15]; + u64 reserved30[2]; + u64 uv_base_stor_len; + u64 reserved48; + u64 conf_base_phys_stor_len; + u64 conf_base_virt_stor_len; + u64 conf_virt_var_stor_len; + u64 cpu_stor_len; + u32 reserved70[3]; + u32 max_num_sec_conf; + u64 max_guest_stor_addr; + u8 reserved88[158 - 136]; + u16 max_guest_cpus; + u8 reserveda0[200 - 160]; } __packed __aligned(8); +/* Initialize Ultravisor */ +struct uv_cb_init { + struct uv_cb_header header; + u64 reserved08[2]; + u64 stor_origin; + u64 stor_len; + u64 reserved28[4]; +} __packed __aligned(8); + +/* Create Guest Configuration */ +struct uv_cb_cgc { + struct uv_cb_header header; + u64 reserved08[2]; + u64 guest_handle; + u64 conf_base_stor_origin; + u64 conf_virt_stor_origin; + u64 reserved30; + u64 guest_stor_origin; + u64 guest_stor_len; + u64 guest_sca; + u64 guest_asce; + u64 reserved58[5]; +} __packed __aligned(8); + +/* Create Secure CPU */ +struct uv_cb_csc { + struct uv_cb_header header; + u64 reserved08[2]; + u64 cpu_handle; + u64 guest_handle; + u64 stor_origin; + u8 reserved30[6]; + u16 num; + u64 state_origin; + u64 reserved40[4]; +} __packed __aligned(8); + +/* Convert to Secure */ +struct uv_cb_cts { + struct uv_cb_header header; + u64 reserved08[2]; + u64 guest_handle; + u64 gaddr; +} __packed __aligned(8); + +/* Convert from Secure / Pin Page Shared */ +struct uv_cb_cfs { + struct uv_cb_header header; + u64 reserved08[2]; + u64 paddr; +} __packed __aligned(8); + +/* Set Secure Config Parameter */ +struct uv_cb_ssc { + struct uv_cb_header header; + u64 reserved08[2]; + u64 guest_handle; + u64 sec_header_origin; + u32 sec_header_len; + u32 reserved2c; + u64 reserved30[4]; +} __packed __aligned(8); + +/* Unpack */ +struct uv_cb_unp { + struct uv_cb_header header; + u64 reserved08[2]; + u64 guest_handle; + u64 gaddr; + u64 tweak[2]; + u64 reserved38[3]; +} __packed __aligned(8); + +#define PV_CPU_STATE_OPR 1 +#define PV_CPU_STATE_STP 2 +#define PV_CPU_STATE_CHKSTP 3 +#define PV_CPU_STATE_OPR_LOAD 5 + +struct uv_cb_cpu_set_state { + struct uv_cb_header header; + u64 reserved08[2]; + u64 cpu_handle; + u8 reserved20[7]; + u8 state; + u64 reserved28[5]; +}; + +/* + * A common UV call struct for calls that take no payload + * Examples: + * Destroy cpu/config + * Verify + */ +struct uv_cb_nodata { + struct uv_cb_header header; + u64 reserved08[2]; + u64 handle; + u64 reserved20[4]; +} __packed __aligned(8); + +/* Set Shared Access */ struct uv_cb_share { struct uv_cb_header header; u64 reserved08[3]; @@ -54,21 +207,76 @@ struct uv_cb_share { u64 reserved28; } __packed __aligned(8); -static inline int uv_call(unsigned long r1, unsigned long r2) +static inline int __uv_call(unsigned long r1, unsigned long r2) { int cc; asm volatile( - "0: .insn rrf,0xB9A40000,%[r1],%[r2],0,0\n" - " brc 3,0b\n" - " ipm %[cc]\n" - " srl %[cc],28\n" + " .insn rrf,0xB9A40000,%[r1],%[r2],0,0\n" + " ipm %[cc]\n" + " srl %[cc],28\n" : [cc] "=d" (cc) : [r1] "a" (r1), [r2] "a" (r2) : "memory", "cc"); return cc; } +static inline int uv_call(unsigned long r1, unsigned long r2) +{ + int cc; + + do { + cc = __uv_call(r1, r2); + } while (cc > 1); + return cc; +} + +/* Low level uv_call that avoids stalls for long running busy conditions */ +static inline int uv_call_sched(unsigned long r1, unsigned long r2) +{ + int cc; + + do { + cc = __uv_call(r1, r2); + cond_resched(); + } while (cc > 1); + return cc; +} + +/* + * special variant of uv_call that only transports the cpu or guest + * handle and the command, like destroy or verify. + */ +static inline int uv_cmd_nodata(u64 handle, u16 cmd, u16 *rc, u16 *rrc) +{ + struct uv_cb_nodata uvcb = { + .header.cmd = cmd, + .header.len = sizeof(uvcb), + .handle = handle, + }; + int cc; + + WARN(!handle, "No handle provided to Ultravisor call cmd %x\n", cmd); + cc = uv_call_sched(0, (u64)&uvcb); + *rc = uvcb.header.rc; + *rrc = uvcb.header.rrc; + return cc ? -EINVAL : 0; +} + +struct uv_info { + unsigned long inst_calls_list[4]; + unsigned long uv_base_stor_len; + unsigned long guest_base_stor_len; + unsigned long guest_virt_base_stor_len; + unsigned long guest_virt_var_stor_len; + unsigned long guest_cpu_stor_len; + unsigned long max_sec_stor_addr; + unsigned int max_num_sec_conf; + unsigned short max_guest_cpus; +}; + +extern struct uv_info uv_info; + #ifdef CONFIG_PROTECTED_VIRTUALIZATION_GUEST extern int prot_virt_guest; @@ -121,11 +329,40 @@ static inline int uv_remove_shared(unsigned long addr) return share(addr, UVC_CMD_REMOVE_SHARED_ACCESS); } -void uv_query_info(void); #else #define is_prot_virt_guest() 0 static inline int uv_set_shared(unsigned long addr) { return 0; } static inline int uv_remove_shared(unsigned long addr) { return 0; } +#endif + +#if IS_ENABLED(CONFIG_KVM) +extern int prot_virt_host; + +static inline int is_prot_virt_host(void) +{ + return prot_virt_host; +} + +int gmap_make_secure(struct gmap *gmap, unsigned long gaddr, void *uvcb); +int uv_convert_from_secure(unsigned long paddr); +int gmap_convert_to_secure(struct gmap *gmap, unsigned long gaddr); + +void setup_uv(void); +void adjust_to_uv_max(unsigned long *vmax); +#else +#define is_prot_virt_host() 0 +static inline void setup_uv(void) {} +static inline void adjust_to_uv_max(unsigned long *vmax) {} + +static inline int uv_convert_from_secure(unsigned long paddr) +{ + return 0; +} +#endif + +#if defined(CONFIG_PROTECTED_VIRTUALIZATION_GUEST) || IS_ENABLED(CONFIG_KVM) +void uv_query_info(void); +#else static inline void uv_query_info(void) {} #endif diff --git a/arch/s390/kernel/.gitignore b/arch/s390/kernel/.gitignore index c5f676c3c224..bbb90f92d051 100644 --- a/arch/s390/kernel/.gitignore +++ b/arch/s390/kernel/.gitignore @@ -1 +1,2 @@ +# SPDX-License-Identifier: GPL-2.0-only vmlinux.lds diff --git a/arch/s390/kernel/Makefile b/arch/s390/kernel/Makefile index 2b1203cf7be6..75f26d775027 100644 --- a/arch/s390/kernel/Makefile +++ b/arch/s390/kernel/Makefile @@ -54,7 +54,6 @@ CFLAGS_REMOVE_nospec-branch.o += $(CC_FLAGS_EXPOLINE) obj-$(CONFIG_MODULES) += module.o obj-$(CONFIG_SCHED_TOPOLOGY) += topology.o -obj-$(CONFIG_HIBERNATION) += suspend.o swsusp.o obj-$(CONFIG_AUDIT) += audit.o compat-obj-$(CONFIG_AUDIT) += compat_audit.o obj-$(CONFIG_COMPAT) += compat_linux.o compat_signal.o @@ -70,7 +69,7 @@ obj-$(CONFIG_JUMP_LABEL) += jump_label.o obj-$(CONFIG_KEXEC_FILE) += machine_kexec_file.o kexec_image.o obj-$(CONFIG_KEXEC_FILE) += kexec_elf.o -obj-$(CONFIG_IMA) += ima_arch.o +obj-$(CONFIG_IMA_SECURE_AND_OR_TRUSTED_BOOT) += ima_arch.o obj-$(CONFIG_PERF_EVENTS) += perf_event.o perf_cpum_cf_common.o obj-$(CONFIG_PERF_EVENTS) += perf_cpum_cf.o perf_cpum_sf.o @@ -78,6 +77,7 @@ obj-$(CONFIG_PERF_EVENTS) += perf_cpum_cf_events.o perf_regs.o obj-$(CONFIG_PERF_EVENTS) += perf_cpum_cf_diag.o obj-$(CONFIG_TRACEPOINTS) += trace.o +obj-$(findstring y, $(CONFIG_PROTECTED_VIRTUALIZATION_GUEST) $(CONFIG_PGSTE)) += uv.o # vdso obj-y += vdso64/ diff --git a/arch/s390/kernel/asm-offsets.c b/arch/s390/kernel/asm-offsets.c index ce33406cfe83..e80f0e6f5972 100644 --- a/arch/s390/kernel/asm-offsets.c +++ b/arch/s390/kernel/asm-offsets.c @@ -124,6 +124,8 @@ int main(void) OFFSET(__LC_EXT_DAMAGE_CODE, lowcore, external_damage_code); OFFSET(__LC_MCCK_FAIL_STOR_ADDR, lowcore, failing_storage_address); OFFSET(__LC_LAST_BREAK, lowcore, breaking_event_addr); + OFFSET(__LC_RETURN_LPSWE, lowcore, return_lpswe); + OFFSET(__LC_RETURN_MCCK_LPSWE, lowcore, return_mcck_lpswe); OFFSET(__LC_RST_OLD_PSW, lowcore, restart_old_psw); OFFSET(__LC_EXT_OLD_PSW, lowcore, external_old_psw); OFFSET(__LC_SVC_OLD_PSW, lowcore, svc_old_psw); diff --git a/arch/s390/kernel/diag.c b/arch/s390/kernel/diag.c index e9dac9a24d3f..61f2b0412345 100644 --- a/arch/s390/kernel/diag.c +++ b/arch/s390/kernel/diag.c @@ -84,7 +84,7 @@ static int show_diag_stat(struct seq_file *m, void *v) static void *show_diag_stat_start(struct seq_file *m, loff_t *pos) { - return *pos <= nr_cpu_ids ? (void *)((unsigned long) *pos + 1) : NULL; + return *pos <= NR_DIAG_STAT ? (void *)((unsigned long) *pos + 1) : NULL; } static void *show_diag_stat_next(struct seq_file *m, void *v, loff_t *pos) diff --git a/arch/s390/kernel/entry.S b/arch/s390/kernel/entry.S index 9205add8481d..3ae64914bd14 100644 --- a/arch/s390/kernel/entry.S +++ b/arch/s390/kernel/entry.S @@ -115,26 +115,29 @@ _LPP_OFFSET = __LC_LPP .macro SWITCH_ASYNC savearea,timer tmhh %r8,0x0001 # interrupting from user ? - jnz 1f + jnz 2f lgr %r14,%r9 + cghi %r14,__LC_RETURN_LPSWE + je 0f slg %r14,BASED(.Lcritical_start) clg %r14,BASED(.Lcritical_length) - jhe 0f + jhe 1f +0: lghi %r11,\savearea # inside critical section, do cleanup brasl %r14,cleanup_critical tmhh %r8,0x0001 # retest problem state after cleanup - jnz 1f -0: lg %r14,__LC_ASYNC_STACK # are we already on the target stack? + jnz 2f +1: lg %r14,__LC_ASYNC_STACK # are we already on the target stack? slgr %r14,%r15 srag %r14,%r14,STACK_SHIFT - jnz 2f + jnz 3f CHECK_STACK \savearea aghi %r15,-(STACK_FRAME_OVERHEAD + __PT_SIZE) - j 3f -1: UPDATE_VTIME %r14,%r15,\timer + j 4f +2: UPDATE_VTIME %r14,%r15,\timer BPENTER __TI_flags(%r12),_TIF_ISOLATE_BP -2: lg %r15,__LC_ASYNC_STACK # load async stack -3: la %r11,STACK_FRAME_OVERHEAD(%r15) +3: lg %r15,__LC_ASYNC_STACK # load async stack +4: la %r11,STACK_FRAME_OVERHEAD(%r15) .endm .macro UPDATE_VTIME w1,w2,enter_timer @@ -401,7 +404,7 @@ ENTRY(system_call) stpt __LC_EXIT_TIMER mvc __VDSO_ECTG_BASE(16,%r14),__LC_EXIT_TIMER lmg %r11,%r15,__PT_R11(%r11) - lpswe __LC_RETURN_PSW + b __LC_RETURN_LPSWE(%r0) .Lsysc_done: # @@ -608,43 +611,50 @@ ENTRY(pgm_check_handler) BPOFF stmg %r8,%r15,__LC_SAVE_AREA_SYNC lg %r10,__LC_LAST_BREAK - lg %r12,__LC_CURRENT + srag %r11,%r10,12 + jnz 0f + /* if __LC_LAST_BREAK is < 4096, it contains one of + * the lpswe addresses in lowcore. Set it to 1 (initial state) + * to prevent leaking that address to userspace. + */ + lghi %r10,1 +0: lg %r12,__LC_CURRENT lghi %r11,0 larl %r13,cleanup_critical lmg %r8,%r9,__LC_PGM_OLD_PSW tmhh %r8,0x0001 # test problem state bit - jnz 2f # -> fault in user space + jnz 3f # -> fault in user space #if IS_ENABLED(CONFIG_KVM) # cleanup critical section for program checks in sie64a lgr %r14,%r9 slg %r14,BASED(.Lsie_critical_start) clg %r14,BASED(.Lsie_critical_length) - jhe 0f + jhe 1f lg %r14,__SF_SIE_CONTROL(%r15) # get control block pointer ni __SIE_PROG0C+3(%r14),0xfe # no longer in SIE lctlg %c1,%c1,__LC_USER_ASCE # load primary asce larl %r9,sie_exit # skip forward to sie_exit lghi %r11,_PIF_GUEST_FAULT #endif -0: tmhh %r8,0x4000 # PER bit set in old PSW ? - jnz 1f # -> enabled, can't be a double fault +1: tmhh %r8,0x4000 # PER bit set in old PSW ? + jnz 2f # -> enabled, can't be a double fault tm __LC_PGM_ILC+3,0x80 # check for per exception jnz .Lpgm_svcper # -> single stepped svc -1: CHECK_STACK __LC_SAVE_AREA_SYNC +2: CHECK_STACK __LC_SAVE_AREA_SYNC aghi %r15,-(STACK_FRAME_OVERHEAD + __PT_SIZE) - # CHECK_VMAP_STACK branches to stack_overflow or 4f - CHECK_VMAP_STACK __LC_SAVE_AREA_SYNC,4f -2: UPDATE_VTIME %r14,%r15,__LC_SYNC_ENTER_TIMER + # CHECK_VMAP_STACK branches to stack_overflow or 5f + CHECK_VMAP_STACK __LC_SAVE_AREA_SYNC,5f +3: UPDATE_VTIME %r14,%r15,__LC_SYNC_ENTER_TIMER BPENTER __TI_flags(%r12),_TIF_ISOLATE_BP lg %r15,__LC_KERNEL_STACK lgr %r14,%r12 aghi %r14,__TASK_thread # pointer to thread_struct lghi %r13,__LC_PGM_TDB tm __LC_PGM_ILC+2,0x02 # check for transaction abort - jz 3f + jz 4f mvc __THREAD_trap_tdb(256,%r14),0(%r13) -3: stg %r10,__THREAD_last_break(%r14) -4: lgr %r13,%r11 +4: stg %r10,__THREAD_last_break(%r14) +5: lgr %r13,%r11 la %r11,STACK_FRAME_OVERHEAD(%r15) stmg %r0,%r7,__PT_R0(%r11) # clear user controlled registers to prevent speculative use @@ -663,14 +673,14 @@ ENTRY(pgm_check_handler) stg %r13,__PT_FLAGS(%r11) stg %r10,__PT_ARGS(%r11) tm __LC_PGM_ILC+3,0x80 # check for per exception - jz 5f + jz 6f tmhh %r8,0x0001 # kernel per event ? jz .Lpgm_kprobe oi __PT_FLAGS+7(%r11),_PIF_PER_TRAP mvc __THREAD_per_address(8,%r14),__LC_PER_ADDRESS mvc __THREAD_per_cause(2,%r14),__LC_PER_CODE mvc __THREAD_per_paid(1,%r14),__LC_PER_ACCESS_ID -5: REENABLE_IRQS +6: REENABLE_IRQS xc __SF_BACKCHAIN(8,%r15),__SF_BACKCHAIN(%r15) larl %r1,pgm_check_table llgh %r10,__PT_INT_CODE+2(%r11) @@ -775,7 +785,7 @@ ENTRY(io_int_handler) mvc __VDSO_ECTG_BASE(16,%r14),__LC_EXIT_TIMER .Lio_exit_kernel: lmg %r11,%r15,__PT_R11(%r11) - lpswe __LC_RETURN_PSW + b __LC_RETURN_LPSWE(%r0) .Lio_done: # @@ -1214,7 +1224,7 @@ ENTRY(mcck_int_handler) stpt __LC_EXIT_TIMER mvc __VDSO_ECTG_BASE(16,%r14),__LC_EXIT_TIMER 0: lmg %r11,%r15,__PT_R11(%r11) - lpswe __LC_RETURN_MCCK_PSW + b __LC_RETURN_MCCK_LPSWE .Lmcck_panic: lg %r15,__LC_NODAT_STACK @@ -1271,6 +1281,8 @@ ENDPROC(stack_overflow) #endif ENTRY(cleanup_critical) + cghi %r9,__LC_RETURN_LPSWE + je .Lcleanup_lpswe #if IS_ENABLED(CONFIG_KVM) clg %r9,BASED(.Lcleanup_table_sie) # .Lsie_gmap jl 0f @@ -1424,6 +1436,7 @@ ENDPROC(cleanup_critical) mvc __LC_RETURN_PSW(16),__PT_PSW(%r9) mvc 0(64,%r11),__PT_R8(%r9) lmg %r0,%r7,__PT_R0(%r9) +.Lcleanup_lpswe: 1: lmg %r8,%r9,__LC_RETURN_PSW BR_EX %r14,%r11 .Lcleanup_sysc_restore_insn: diff --git a/arch/s390/kernel/entry.h b/arch/s390/kernel/entry.h index 1d3927e01a5f..faca269d5f27 100644 --- a/arch/s390/kernel/entry.h +++ b/arch/s390/kernel/entry.h @@ -24,6 +24,8 @@ asmlinkage void do_syscall_trace_exit(struct pt_regs *regs); void do_protection_exception(struct pt_regs *regs); void do_dat_exception(struct pt_regs *regs); +void do_secure_storage_access(struct pt_regs *regs); +void do_non_secure_storage_access(struct pt_regs *regs); void addressing_exception(struct pt_regs *regs); void data_exception(struct pt_regs *regs); diff --git a/arch/s390/kernel/ipl.c b/arch/s390/kernel/ipl.c index 6837affc19e8..4a71061974fd 100644 --- a/arch/s390/kernel/ipl.c +++ b/arch/s390/kernel/ipl.c @@ -144,6 +144,9 @@ static struct ipl_parameter_block *dump_block_ccw; static struct sclp_ipl_info sclp_ipl_info; +static bool reipl_fcp_clear; +static bool reipl_ccw_clear; + static inline int __diag308(unsigned long subcode, void *addr) { register unsigned long _addr asm("0") = (unsigned long) addr; @@ -691,6 +694,21 @@ static struct kobj_attribute sys_reipl_fcp_loadparm_attr = __ATTR(loadparm, S_IRUGO | S_IWUSR, reipl_fcp_loadparm_show, reipl_fcp_loadparm_store); +static ssize_t reipl_fcp_clear_show(struct kobject *kobj, + struct kobj_attribute *attr, char *page) +{ + return sprintf(page, "%u\n", reipl_fcp_clear); +} + +static ssize_t reipl_fcp_clear_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t len) +{ + if (strtobool(buf, &reipl_fcp_clear) < 0) + return -EINVAL; + return len; +} + static struct attribute *reipl_fcp_attrs[] = { &sys_reipl_fcp_device_attr.attr, &sys_reipl_fcp_wwpn_attr.attr, @@ -706,6 +724,9 @@ static struct attribute_group reipl_fcp_attr_group = { .bin_attrs = reipl_fcp_bin_attrs, }; +static struct kobj_attribute sys_reipl_fcp_clear_attr = + __ATTR(clear, 0644, reipl_fcp_clear_show, reipl_fcp_clear_store); + /* CCW reipl device attributes */ DEFINE_IPL_CCW_ATTR_RW(reipl_ccw, device, reipl_block_ccw->ccw); @@ -741,16 +762,36 @@ static struct kobj_attribute sys_reipl_ccw_loadparm_attr = __ATTR(loadparm, S_IRUGO | S_IWUSR, reipl_ccw_loadparm_show, reipl_ccw_loadparm_store); +static ssize_t reipl_ccw_clear_show(struct kobject *kobj, + struct kobj_attribute *attr, char *page) +{ + return sprintf(page, "%u\n", reipl_ccw_clear); +} + +static ssize_t reipl_ccw_clear_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t len) +{ + if (strtobool(buf, &reipl_ccw_clear) < 0) + return -EINVAL; + return len; +} + +static struct kobj_attribute sys_reipl_ccw_clear_attr = + __ATTR(clear, 0644, reipl_ccw_clear_show, reipl_ccw_clear_store); + static struct attribute *reipl_ccw_attrs_vm[] = { &sys_reipl_ccw_device_attr.attr, &sys_reipl_ccw_loadparm_attr.attr, &sys_reipl_ccw_vmparm_attr.attr, + &sys_reipl_ccw_clear_attr.attr, NULL, }; static struct attribute *reipl_ccw_attrs_lpar[] = { &sys_reipl_ccw_device_attr.attr, &sys_reipl_ccw_loadparm_attr.attr, + &sys_reipl_ccw_clear_attr.attr, NULL, }; @@ -892,11 +933,17 @@ static void __reipl_run(void *unused) switch (reipl_type) { case IPL_TYPE_CCW: diag308(DIAG308_SET, reipl_block_ccw); - diag308(DIAG308_LOAD_CLEAR, NULL); + if (reipl_ccw_clear) + diag308(DIAG308_LOAD_CLEAR, NULL); + else + diag308(DIAG308_LOAD_NORMAL_DUMP, NULL); break; case IPL_TYPE_FCP: diag308(DIAG308_SET, reipl_block_fcp); - diag308(DIAG308_LOAD_CLEAR, NULL); + if (reipl_fcp_clear) + diag308(DIAG308_LOAD_CLEAR, NULL); + else + diag308(DIAG308_LOAD_NORMAL, NULL); break; case IPL_TYPE_NSS: diag308(DIAG308_SET, reipl_block_nss); @@ -1008,11 +1055,16 @@ static int __init reipl_fcp_init(void) } rc = sysfs_create_group(&reipl_fcp_kset->kobj, &reipl_fcp_attr_group); - if (rc) { - kset_unregister(reipl_fcp_kset); - free_page((unsigned long) reipl_block_fcp); - return rc; - } + if (rc) + goto out1; + + if (test_facility(141)) { + rc = sysfs_create_file(&reipl_fcp_kset->kobj, + &sys_reipl_fcp_clear_attr.attr); + if (rc) + goto out2; + } else + reipl_fcp_clear = true; if (ipl_info.type == IPL_TYPE_FCP) { memcpy(reipl_block_fcp, &ipl_block, sizeof(ipl_block)); @@ -1032,6 +1084,13 @@ static int __init reipl_fcp_init(void) } reipl_capabilities |= IPL_TYPE_FCP; return 0; + +out2: + sysfs_remove_group(&reipl_fcp_kset->kobj, &reipl_fcp_attr_group); +out1: + kset_unregister(reipl_fcp_kset); + free_page((unsigned long) reipl_block_fcp); + return rc; } static int __init reipl_type_init(void) diff --git a/arch/s390/kernel/irq.c b/arch/s390/kernel/irq.c index 8371855042dc..3514420f0259 100644 --- a/arch/s390/kernel/irq.c +++ b/arch/s390/kernel/irq.c @@ -95,14 +95,6 @@ static const struct irq_class irqclass_sub_desc[] = { {.irq = CPU_RST, .name = "RST", .desc = "[CPU] CPU Restart"}, }; -void __init init_IRQ(void) -{ - BUILD_BUG_ON(ARRAY_SIZE(irqclass_sub_desc) != NR_ARCH_IRQS); - init_cio_interrupts(); - init_airq_interrupts(); - init_ext_interrupts(); -} - void do_IRQ(struct pt_regs *regs, int irq) { struct pt_regs *old_regs; @@ -294,12 +286,7 @@ static irqreturn_t do_ext_interrupt(int irq, void *dummy) return IRQ_HANDLED; } -static struct irqaction external_interrupt = { - .name = "EXT", - .handler = do_ext_interrupt, -}; - -void __init init_ext_interrupts(void) +static void __init init_ext_interrupts(void) { int idx; @@ -308,7 +295,16 @@ void __init init_ext_interrupts(void) irq_set_chip_and_handler(EXT_INTERRUPT, &dummy_irq_chip, handle_percpu_irq); - setup_irq(EXT_INTERRUPT, &external_interrupt); + if (request_irq(EXT_INTERRUPT, do_ext_interrupt, 0, "EXT", NULL)) + panic("Failed to register EXT interrupt\n"); +} + +void __init init_IRQ(void) +{ + BUILD_BUG_ON(ARRAY_SIZE(irqclass_sub_desc) != NR_ARCH_IRQS); + init_cio_interrupts(); + init_airq_interrupts(); + init_ext_interrupts(); } static DEFINE_SPINLOCK(irq_subclass_lock); diff --git a/arch/s390/kernel/machine_kexec.c b/arch/s390/kernel/machine_kexec.c index cb8b1cc285c9..3a854cb5a4c6 100644 --- a/arch/s390/kernel/machine_kexec.c +++ b/arch/s390/kernel/machine_kexec.c @@ -14,7 +14,6 @@ #include <linux/reboot.h> #include <linux/ftrace.h> #include <linux/debug_locks.h> -#include <linux/suspend.h> #include <asm/cio.h> #include <asm/setup.h> #include <asm/pgtable.h> @@ -39,36 +38,6 @@ extern const unsigned long long relocate_kernel_len; #ifdef CONFIG_CRASH_DUMP /* - * PM notifier callback for kdump - */ -static int machine_kdump_pm_cb(struct notifier_block *nb, unsigned long action, - void *ptr) -{ - switch (action) { - case PM_SUSPEND_PREPARE: - case PM_HIBERNATION_PREPARE: - if (kexec_crash_image) - arch_kexec_unprotect_crashkres(); - break; - case PM_POST_SUSPEND: - case PM_POST_HIBERNATION: - if (kexec_crash_image) - arch_kexec_protect_crashkres(); - break; - default: - return NOTIFY_DONE; - } - return NOTIFY_OK; -} - -static int __init machine_kdump_pm_init(void) -{ - pm_notifier(machine_kdump_pm_cb, 0); - return 0; -} -arch_initcall(machine_kdump_pm_init); - -/* * Reset the system, copy boot CPU registers to absolute zero, * and jump to the kdump image */ diff --git a/arch/s390/kernel/perf_cpum_cf_events.c b/arch/s390/kernel/perf_cpum_cf_events.c index 8b33e03e47b8..1e3df52b2b65 100644 --- a/arch/s390/kernel/perf_cpum_cf_events.c +++ b/arch/s390/kernel/perf_cpum_cf_events.c @@ -238,6 +238,64 @@ CPUMF_EVENT_ATTR(cf_z14, TX_C_TABORT_SPECIAL, 0x00f5); CPUMF_EVENT_ATTR(cf_z14, MT_DIAG_CYCLES_ONE_THR_ACTIVE, 0x01c0); CPUMF_EVENT_ATTR(cf_z14, MT_DIAG_CYCLES_TWO_THR_ACTIVE, 0x01c1); +CPUMF_EVENT_ATTR(cf_z15, L1D_RO_EXCL_WRITES, 0x0080); +CPUMF_EVENT_ATTR(cf_z15, DTLB2_WRITES, 0x0081); +CPUMF_EVENT_ATTR(cf_z15, DTLB2_MISSES, 0x0082); +CPUMF_EVENT_ATTR(cf_z15, DTLB2_HPAGE_WRITES, 0x0083); +CPUMF_EVENT_ATTR(cf_z15, DTLB2_GPAGE_WRITES, 0x0084); +CPUMF_EVENT_ATTR(cf_z15, L1D_L2D_SOURCED_WRITES, 0x0085); +CPUMF_EVENT_ATTR(cf_z15, ITLB2_WRITES, 0x0086); +CPUMF_EVENT_ATTR(cf_z15, ITLB2_MISSES, 0x0087); +CPUMF_EVENT_ATTR(cf_z15, L1I_L2I_SOURCED_WRITES, 0x0088); +CPUMF_EVENT_ATTR(cf_z15, TLB2_PTE_WRITES, 0x0089); +CPUMF_EVENT_ATTR(cf_z15, TLB2_CRSTE_WRITES, 0x008a); +CPUMF_EVENT_ATTR(cf_z15, TLB2_ENGINES_BUSY, 0x008b); +CPUMF_EVENT_ATTR(cf_z15, TX_C_TEND, 0x008c); +CPUMF_EVENT_ATTR(cf_z15, TX_NC_TEND, 0x008d); +CPUMF_EVENT_ATTR(cf_z15, L1C_TLB2_MISSES, 0x008f); +CPUMF_EVENT_ATTR(cf_z15, L1D_ONCHIP_L3_SOURCED_WRITES, 0x0090); +CPUMF_EVENT_ATTR(cf_z15, L1D_ONCHIP_MEMORY_SOURCED_WRITES, 0x0091); +CPUMF_EVENT_ATTR(cf_z15, L1D_ONCHIP_L3_SOURCED_WRITES_IV, 0x0092); +CPUMF_EVENT_ATTR(cf_z15, L1D_ONCLUSTER_L3_SOURCED_WRITES, 0x0093); +CPUMF_EVENT_ATTR(cf_z15, L1D_ONCLUSTER_MEMORY_SOURCED_WRITES, 0x0094); +CPUMF_EVENT_ATTR(cf_z15, L1D_ONCLUSTER_L3_SOURCED_WRITES_IV, 0x0095); +CPUMF_EVENT_ATTR(cf_z15, L1D_OFFCLUSTER_L3_SOURCED_WRITES, 0x0096); +CPUMF_EVENT_ATTR(cf_z15, L1D_OFFCLUSTER_MEMORY_SOURCED_WRITES, 0x0097); +CPUMF_EVENT_ATTR(cf_z15, L1D_OFFCLUSTER_L3_SOURCED_WRITES_IV, 0x0098); +CPUMF_EVENT_ATTR(cf_z15, L1D_OFFDRAWER_L3_SOURCED_WRITES, 0x0099); +CPUMF_EVENT_ATTR(cf_z15, L1D_OFFDRAWER_MEMORY_SOURCED_WRITES, 0x009a); +CPUMF_EVENT_ATTR(cf_z15, L1D_OFFDRAWER_L3_SOURCED_WRITES_IV, 0x009b); +CPUMF_EVENT_ATTR(cf_z15, L1D_ONDRAWER_L4_SOURCED_WRITES, 0x009c); +CPUMF_EVENT_ATTR(cf_z15, L1D_OFFDRAWER_L4_SOURCED_WRITES, 0x009d); +CPUMF_EVENT_ATTR(cf_z15, L1D_ONCHIP_L3_SOURCED_WRITES_RO, 0x009e); +CPUMF_EVENT_ATTR(cf_z15, L1I_ONCHIP_L3_SOURCED_WRITES, 0x00a2); +CPUMF_EVENT_ATTR(cf_z15, L1I_ONCHIP_MEMORY_SOURCED_WRITES, 0x00a3); +CPUMF_EVENT_ATTR(cf_z15, L1I_ONCHIP_L3_SOURCED_WRITES_IV, 0x00a4); +CPUMF_EVENT_ATTR(cf_z15, L1I_ONCLUSTER_L3_SOURCED_WRITES, 0x00a5); +CPUMF_EVENT_ATTR(cf_z15, L1I_ONCLUSTER_MEMORY_SOURCED_WRITES, 0x00a6); +CPUMF_EVENT_ATTR(cf_z15, L1I_ONCLUSTER_L3_SOURCED_WRITES_IV, 0x00a7); +CPUMF_EVENT_ATTR(cf_z15, L1I_OFFCLUSTER_L3_SOURCED_WRITES, 0x00a8); +CPUMF_EVENT_ATTR(cf_z15, L1I_OFFCLUSTER_MEMORY_SOURCED_WRITES, 0x00a9); +CPUMF_EVENT_ATTR(cf_z15, L1I_OFFCLUSTER_L3_SOURCED_WRITES_IV, 0x00aa); +CPUMF_EVENT_ATTR(cf_z15, L1I_OFFDRAWER_L3_SOURCED_WRITES, 0x00ab); +CPUMF_EVENT_ATTR(cf_z15, L1I_OFFDRAWER_MEMORY_SOURCED_WRITES, 0x00ac); +CPUMF_EVENT_ATTR(cf_z15, L1I_OFFDRAWER_L3_SOURCED_WRITES_IV, 0x00ad); +CPUMF_EVENT_ATTR(cf_z15, L1I_ONDRAWER_L4_SOURCED_WRITES, 0x00ae); +CPUMF_EVENT_ATTR(cf_z15, L1I_OFFDRAWER_L4_SOURCED_WRITES, 0x00af); +CPUMF_EVENT_ATTR(cf_z15, BCD_DFP_EXECUTION_SLOTS, 0x00e0); +CPUMF_EVENT_ATTR(cf_z15, VX_BCD_EXECUTION_SLOTS, 0x00e1); +CPUMF_EVENT_ATTR(cf_z15, DECIMAL_INSTRUCTIONS, 0x00e2); +CPUMF_EVENT_ATTR(cf_z15, LAST_HOST_TRANSLATIONS, 0x00e8); +CPUMF_EVENT_ATTR(cf_z15, TX_NC_TABORT, 0x00f3); +CPUMF_EVENT_ATTR(cf_z15, TX_C_TABORT_NO_SPECIAL, 0x00f4); +CPUMF_EVENT_ATTR(cf_z15, TX_C_TABORT_SPECIAL, 0x00f5); +CPUMF_EVENT_ATTR(cf_z15, DFLT_ACCESS, 0x00f7); +CPUMF_EVENT_ATTR(cf_z15, DFLT_CYCLES, 0x00fc); +CPUMF_EVENT_ATTR(cf_z15, DFLT_CC, 0x00108); +CPUMF_EVENT_ATTR(cf_z15, DFLT_CCERROR, 0x00109); +CPUMF_EVENT_ATTR(cf_z15, MT_DIAG_CYCLES_ONE_THR_ACTIVE, 0x01c0); +CPUMF_EVENT_ATTR(cf_z15, MT_DIAG_CYCLES_TWO_THR_ACTIVE, 0x01c1); + static struct attribute *cpumcf_fvn1_pmu_event_attr[] __initdata = { CPUMF_EVENT_PTR(cf_fvn1, CPU_CYCLES), CPUMF_EVENT_PTR(cf_fvn1, INSTRUCTIONS), @@ -516,6 +574,67 @@ static struct attribute *cpumcf_z14_pmu_event_attr[] __initdata = { NULL, }; +static struct attribute *cpumcf_z15_pmu_event_attr[] __initdata = { + CPUMF_EVENT_PTR(cf_z15, L1D_RO_EXCL_WRITES), + CPUMF_EVENT_PTR(cf_z15, DTLB2_WRITES), + CPUMF_EVENT_PTR(cf_z15, DTLB2_MISSES), + CPUMF_EVENT_PTR(cf_z15, DTLB2_HPAGE_WRITES), + CPUMF_EVENT_PTR(cf_z15, DTLB2_GPAGE_WRITES), + CPUMF_EVENT_PTR(cf_z15, L1D_L2D_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z15, ITLB2_WRITES), + CPUMF_EVENT_PTR(cf_z15, ITLB2_MISSES), + CPUMF_EVENT_PTR(cf_z15, L1I_L2I_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z15, TLB2_PTE_WRITES), + CPUMF_EVENT_PTR(cf_z15, TLB2_CRSTE_WRITES), + CPUMF_EVENT_PTR(cf_z15, TLB2_ENGINES_BUSY), + CPUMF_EVENT_PTR(cf_z15, TX_C_TEND), + CPUMF_EVENT_PTR(cf_z15, TX_NC_TEND), + CPUMF_EVENT_PTR(cf_z15, L1C_TLB2_MISSES), + CPUMF_EVENT_PTR(cf_z15, L1D_ONCHIP_L3_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z15, L1D_ONCHIP_MEMORY_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z15, L1D_ONCHIP_L3_SOURCED_WRITES_IV), + CPUMF_EVENT_PTR(cf_z15, L1D_ONCLUSTER_L3_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z15, L1D_ONCLUSTER_MEMORY_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z15, L1D_ONCLUSTER_L3_SOURCED_WRITES_IV), + CPUMF_EVENT_PTR(cf_z15, L1D_OFFCLUSTER_L3_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z15, L1D_OFFCLUSTER_MEMORY_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z15, L1D_OFFCLUSTER_L3_SOURCED_WRITES_IV), + CPUMF_EVENT_PTR(cf_z15, L1D_OFFDRAWER_L3_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z15, L1D_OFFDRAWER_MEMORY_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z15, L1D_OFFDRAWER_L3_SOURCED_WRITES_IV), + CPUMF_EVENT_PTR(cf_z15, L1D_ONDRAWER_L4_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z15, L1D_OFFDRAWER_L4_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z15, L1D_ONCHIP_L3_SOURCED_WRITES_RO), + CPUMF_EVENT_PTR(cf_z15, L1I_ONCHIP_L3_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z15, L1I_ONCHIP_MEMORY_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z15, L1I_ONCHIP_L3_SOURCED_WRITES_IV), + CPUMF_EVENT_PTR(cf_z15, L1I_ONCLUSTER_L3_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z15, L1I_ONCLUSTER_MEMORY_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z15, L1I_ONCLUSTER_L3_SOURCED_WRITES_IV), + CPUMF_EVENT_PTR(cf_z15, L1I_OFFCLUSTER_L3_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z15, L1I_OFFCLUSTER_MEMORY_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z15, L1I_OFFCLUSTER_L3_SOURCED_WRITES_IV), + CPUMF_EVENT_PTR(cf_z15, L1I_OFFDRAWER_L3_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z15, L1I_OFFDRAWER_MEMORY_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z15, L1I_OFFDRAWER_L3_SOURCED_WRITES_IV), + CPUMF_EVENT_PTR(cf_z15, L1I_ONDRAWER_L4_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z15, L1I_OFFDRAWER_L4_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z15, BCD_DFP_EXECUTION_SLOTS), + CPUMF_EVENT_PTR(cf_z15, VX_BCD_EXECUTION_SLOTS), + CPUMF_EVENT_PTR(cf_z15, DECIMAL_INSTRUCTIONS), + CPUMF_EVENT_PTR(cf_z15, LAST_HOST_TRANSLATIONS), + CPUMF_EVENT_PTR(cf_z15, TX_NC_TABORT), + CPUMF_EVENT_PTR(cf_z15, TX_C_TABORT_NO_SPECIAL), + CPUMF_EVENT_PTR(cf_z15, TX_C_TABORT_SPECIAL), + CPUMF_EVENT_PTR(cf_z15, DFLT_ACCESS), + CPUMF_EVENT_PTR(cf_z15, DFLT_CYCLES), + CPUMF_EVENT_PTR(cf_z15, DFLT_CC), + CPUMF_EVENT_PTR(cf_z15, DFLT_CCERROR), + CPUMF_EVENT_PTR(cf_z15, MT_DIAG_CYCLES_ONE_THR_ACTIVE), + CPUMF_EVENT_PTR(cf_z15, MT_DIAG_CYCLES_TWO_THR_ACTIVE), + NULL, +}; + /* END: CPUM_CF COUNTER DEFINITIONS ===================================== */ static struct attribute_group cpumcf_pmu_events_group = { @@ -624,9 +743,11 @@ __init const struct attribute_group **cpumf_cf_event_group(void) break; case 0x3906: case 0x3907: + model = cpumcf_z14_pmu_event_attr; + break; case 0x8561: case 0x8562: - model = cpumcf_z14_pmu_event_attr; + model = cpumcf_z15_pmu_event_attr; break; default: model = none; diff --git a/arch/s390/kernel/perf_cpum_sf.c b/arch/s390/kernel/perf_cpum_sf.c index b095b1c78987..85a711d783eb 100644 --- a/arch/s390/kernel/perf_cpum_sf.c +++ b/arch/s390/kernel/perf_cpum_sf.c @@ -372,28 +372,33 @@ static void deallocate_buffers(struct cpu_hw_sf *cpuhw) static int allocate_buffers(struct cpu_hw_sf *cpuhw, struct hw_perf_event *hwc) { - unsigned long n_sdb, freq, factor; + unsigned long n_sdb, freq; size_t sample_size; /* Calculate sampling buffers using 4K pages * - * 1. Determine the sample data size which depends on the used - * sampling functions, for example, basic-sampling or - * basic-sampling with diagnostic-sampling. + * 1. The sampling size is 32 bytes for basic sampling. This size + * is the same for all machine types. Diagnostic + * sampling uses auxlilary data buffer setup which provides the + * memory for SDBs using linux common code auxiliary trace + * setup. * - * 2. Use the sampling frequency as input. The sampling buffer is - * designed for almost one second. This can be adjusted through - * the "factor" variable. - * In any case, alloc_sampling_buffer() sets the Alert Request + * 2. Function alloc_sampling_buffer() sets the Alert Request * Control indicator to trigger a measurement-alert to harvest - * sample-data-blocks (sdb). + * sample-data-blocks (SDB). This is done per SDB. This + * measurement alert interrupt fires quick enough to handle + * one SDB, on very high frequency and work loads there might + * be 2 to 3 SBDs available for sample processing. + * Currently there is no need for setup alert request on every + * n-th page. This is counterproductive as one IRQ triggers + * a very high number of samples to be processed at one IRQ. * - * 3. Compute the number of sample-data-blocks and ensure a minimum - * of CPUM_SF_MIN_SDB. Also ensure the upper limit does not - * exceed a "calculated" maximum. The symbolic maximum is - * designed for basic-sampling only and needs to be increased if - * diagnostic-sampling is active. - * See also the remarks for these symbolic constants. + * 3. Use the sampling frequency as input. + * Compute the number of SDBs and ensure a minimum + * of CPUM_SF_MIN_SDB. Depending on frequency add some more + * SDBs to handle a higher sampling rate. + * Use a minimum of CPUM_SF_MIN_SDB and allow for 100 samples + * (one SDB) for every 10000 HZ frequency increment. * * 4. Compute the number of sample-data-block-tables (SDBT) and * ensure a minimum of CPUM_SF_MIN_SDBT (one table can manage up @@ -401,10 +406,7 @@ static int allocate_buffers(struct cpu_hw_sf *cpuhw, struct hw_perf_event *hwc) */ sample_size = sizeof(struct hws_basic_entry); freq = sample_rate_to_freq(&cpuhw->qsi, SAMPL_RATE(hwc)); - factor = 1; - n_sdb = DIV_ROUND_UP(freq, factor * ((PAGE_SIZE-64) / sample_size)); - if (n_sdb < CPUM_SF_MIN_SDB) - n_sdb = CPUM_SF_MIN_SDB; + n_sdb = CPUM_SF_MIN_SDB + DIV_ROUND_UP(freq, 10000); /* If there is already a sampling buffer allocated, it is very likely * that the sampling facility is enabled too. If the event to be @@ -1576,6 +1578,7 @@ static void hw_collect_aux(struct cpu_hw_sf *cpuhw) unsigned long range = 0, size; unsigned long long overflow = 0; struct perf_output_handle *handle = &cpuhw->handle; + unsigned long num_sdb; aux = perf_get_aux(handle); if (WARN_ON_ONCE(!aux)) @@ -1587,13 +1590,14 @@ static void hw_collect_aux(struct cpu_hw_sf *cpuhw) size >> PAGE_SHIFT); perf_aux_output_end(handle, size); + num_sdb = aux->sfb.num_sdb; while (!done) { /* Get an output handle */ aux = perf_aux_output_begin(handle, cpuhw->event); if (handle->size == 0) { pr_err("The AUX buffer with %lu pages for the " "diagnostic-sampling mode is full\n", - aux->sfb.num_sdb); + num_sdb); debug_sprintf_event(sfdbg, 1, "%s: AUX buffer used up\n", __func__); diff --git a/arch/s390/kernel/pgm_check.S b/arch/s390/kernel/pgm_check.S index eee3a482195a..2c27907a5ffc 100644 --- a/arch/s390/kernel/pgm_check.S +++ b/arch/s390/kernel/pgm_check.S @@ -78,8 +78,8 @@ PGM_CHECK(do_dat_exception) /* 39 */ PGM_CHECK(do_dat_exception) /* 3a */ PGM_CHECK(do_dat_exception) /* 3b */ PGM_CHECK_DEFAULT /* 3c */ -PGM_CHECK_DEFAULT /* 3d */ -PGM_CHECK_DEFAULT /* 3e */ +PGM_CHECK(do_secure_storage_access) /* 3d */ +PGM_CHECK(do_non_secure_storage_access) /* 3e */ PGM_CHECK_DEFAULT /* 3f */ PGM_CHECK(monitor_event_exception) /* 40 */ PGM_CHECK_DEFAULT /* 41 */ diff --git a/arch/s390/kernel/process.c b/arch/s390/kernel/process.c index 6ccef5f29761..eb6e23ad15a2 100644 --- a/arch/s390/kernel/process.c +++ b/arch/s390/kernel/process.c @@ -106,6 +106,7 @@ int copy_thread_tls(unsigned long clone_flags, unsigned long new_stackp, p->thread.system_timer = 0; p->thread.hardirq_timer = 0; p->thread.softirq_timer = 0; + p->thread.last_break = 1; frame->sf.back_chain = 0; /* new return point is ret_from_fork */ diff --git a/arch/s390/kernel/processor.c b/arch/s390/kernel/processor.c index 6ebc2117c66c..c92d04f876cb 100644 --- a/arch/s390/kernel/processor.c +++ b/arch/s390/kernel/processor.c @@ -151,10 +151,35 @@ static void show_cpu_summary(struct seq_file *m, void *v) } } +static void show_cpu_topology(struct seq_file *m, unsigned long n) +{ +#ifdef CONFIG_SCHED_TOPOLOGY + seq_printf(m, "physical id : %d\n", topology_physical_package_id(n)); + seq_printf(m, "core id : %d\n", topology_core_id(n)); + seq_printf(m, "book id : %d\n", topology_book_id(n)); + seq_printf(m, "drawer id : %d\n", topology_drawer_id(n)); + seq_printf(m, "dedicated : %d\n", topology_cpu_dedicated(n)); + seq_printf(m, "address : %d\n", smp_cpu_get_cpu_address(n)); + seq_printf(m, "siblings : %d\n", cpumask_weight(topology_core_cpumask(n))); + seq_printf(m, "cpu cores : %d\n", topology_booted_cores(n)); +#endif /* CONFIG_SCHED_TOPOLOGY */ +} + +static void show_cpu_ids(struct seq_file *m, unsigned long n) +{ + struct cpuid *id = &per_cpu(cpu_info.cpu_id, n); + + seq_printf(m, "version : %02X\n", id->version); + seq_printf(m, "identification : %06X\n", id->ident); + seq_printf(m, "machine : %04X\n", id->machine); +} + static void show_cpu_mhz(struct seq_file *m, unsigned long n) { struct cpu_info *c = per_cpu_ptr(&cpu_info, n); + if (!machine_has_cpu_mhz) + return; seq_printf(m, "cpu MHz dynamic : %d\n", c->cpu_mhz_dynamic); seq_printf(m, "cpu MHz static : %d\n", c->cpu_mhz_static); } @@ -165,12 +190,13 @@ static void show_cpu_mhz(struct seq_file *m, unsigned long n) static int show_cpuinfo(struct seq_file *m, void *v) { unsigned long n = (unsigned long) v - 1; + unsigned long first = cpumask_first(cpu_online_mask); - if (!n) + if (n == first) show_cpu_summary(m, v); - if (!machine_has_cpu_mhz) - return 0; seq_printf(m, "\ncpu number : %ld\n", n); + show_cpu_topology(m, n); + show_cpu_ids(m, n); show_cpu_mhz(m, n); return 0; } @@ -179,6 +205,8 @@ static inline void *c_update(loff_t *pos) { if (*pos) *pos = cpumask_next(*pos - 1, cpu_online_mask); + else + *pos = cpumask_first(cpu_online_mask); return *pos < nr_cpu_ids ? (void *)*pos + 1 : NULL; } diff --git a/arch/s390/kernel/setup.c b/arch/s390/kernel/setup.c index b2c2f75860e8..36445dd40fdb 100644 --- a/arch/s390/kernel/setup.c +++ b/arch/s390/kernel/setup.c @@ -73,6 +73,7 @@ #include <asm/nospec-branch.h> #include <asm/mem_detect.h> #include <asm/uv.h> +#include <asm/asm-offsets.h> #include "entry.h" /* @@ -92,10 +93,6 @@ char elf_platform[ELF_PLATFORM_SIZE]; unsigned long int_hwcap = 0; -#ifdef CONFIG_PROTECTED_VIRTUALIZATION_GUEST -int __bootdata_preserved(prot_virt_guest); -#endif - int __bootdata(noexec_disabled); int __bootdata(memory_end_set); unsigned long __bootdata(memory_end); @@ -450,6 +447,8 @@ static void __init setup_lowcore_dat_off(void) lc->spinlock_index = 0; arch_spin_lock_setup(0); lc->br_r1_trampoline = 0x07f1; /* br %r1 */ + lc->return_lpswe = gen_lpswe(__LC_RETURN_PSW); + lc->return_mcck_lpswe = gen_lpswe(__LC_RETURN_MCCK_PSW); set_prefix((u32)(unsigned long) lc); lowcore_ptr[0] = lc; @@ -564,6 +563,9 @@ static void __init setup_memory_end(void) vmax = _REGION1_SIZE; /* 4-level kernel page table */ } + if (is_prot_virt_host()) + adjust_to_uv_max(&vmax); + /* module area is at the end of the kernel address space. */ MODULES_END = vmax; MODULES_VADDR = MODULES_END - MODULES_LEN; @@ -790,6 +792,7 @@ static void __init memblock_add_mem_detect_info(void) memblock_physmem_add(start, end - start); } memblock_set_bottom_up(false); + memblock_set_node(0, ULONG_MAX, &memblock.memory, 0); memblock_dump_all(); } @@ -1138,6 +1141,8 @@ void __init setup_arch(char **cmdline_p) */ memblock_trim_memory(1UL << (MAX_ORDER - 1 + PAGE_SHIFT)); + if (is_prot_virt_host()) + setup_uv(); setup_memory_end(); setup_memory(); dma_contiguous_reserve(memory_end); diff --git a/arch/s390/kernel/signal.c b/arch/s390/kernel/signal.c index e6fca5498e1f..b295090e2ce6 100644 --- a/arch/s390/kernel/signal.c +++ b/arch/s390/kernel/signal.c @@ -487,7 +487,7 @@ void do_signal(struct pt_regs *regs) regs->gprs[2] = -EINTR; break; } - /* fallthrough */ + fallthrough; case -ERESTARTNOINTR: regs->gprs[2] = regs->orig_gpr2; regs->psw.addr = @@ -514,7 +514,7 @@ void do_signal(struct pt_regs *regs) case -ERESTART_RESTARTBLOCK: /* Restart with sys_restart_syscall */ regs->int_code = __NR_restart_syscall; - /* fallthrough */ + fallthrough; case -ERESTARTNOHAND: case -ERESTARTSYS: case -ERESTARTNOINTR: diff --git a/arch/s390/kernel/smp.c b/arch/s390/kernel/smp.c index a08bd2522dd9..7eaabbab2213 100644 --- a/arch/s390/kernel/smp.c +++ b/arch/s390/kernel/smp.c @@ -212,6 +212,8 @@ static int pcpu_alloc_lowcore(struct pcpu *pcpu, int cpu) lc->spinlock_lockval = arch_spin_lockval(cpu); lc->spinlock_index = 0; lc->br_r1_trampoline = 0x07f1; /* br %r1 */ + lc->return_lpswe = gen_lpswe(__LC_RETURN_PSW); + lc->return_mcck_lpswe = gen_lpswe(__LC_RETURN_MCCK_PSW); if (nmi_alloc_per_cpu(lc)) goto out_async; if (vdso_alloc_per_cpu(lc)) @@ -701,6 +703,11 @@ int smp_cpu_get_polarization(int cpu) return pcpu_devices[cpu].polarization; } +int smp_cpu_get_cpu_address(int cpu) +{ + return pcpu_devices[cpu].address; +} + static void __ref smp_get_core_info(struct sclp_core_info *info, int early) { static int use_sigp_detection; @@ -851,12 +858,13 @@ static void smp_init_secondary(void) init_cpu_timer(); vtime_init(); pfault_init(); - notify_cpu_starting(smp_processor_id()); + notify_cpu_starting(cpu); if (topology_cpu_dedicated(cpu)) set_cpu_flag(CIF_DEDICATED_CPU); else clear_cpu_flag(CIF_DEDICATED_CPU); - set_cpu_online(smp_processor_id(), true); + set_cpu_online(cpu, true); + update_cpu_masks(); inc_irq_stat(CPU_RST); local_irq_enable(); cpu_startup_entry(CPUHP_AP_ONLINE_IDLE); @@ -928,6 +936,7 @@ int __cpu_disable(void) /* Handle possible pending IPIs */ smp_handle_ext_call(); set_cpu_online(smp_processor_id(), false); + update_cpu_masks(); /* Disable pseudo page faults on this cpu. */ pfault_fini(); /* Disable interrupt sources via control register. */ diff --git a/arch/s390/kernel/suspend.c b/arch/s390/kernel/suspend.c deleted file mode 100644 index 75b7b307946e..000000000000 --- a/arch/s390/kernel/suspend.c +++ /dev/null @@ -1,240 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Suspend support specific for s390. - * - * Copyright IBM Corp. 2009 - * - * Author(s): Hans-Joachim Picht <hans@linux.vnet.ibm.com> - */ - -#include <linux/pfn.h> -#include <linux/suspend.h> -#include <linux/mm.h> -#include <linux/pci.h> -#include <asm/ctl_reg.h> -#include <asm/ipl.h> -#include <asm/cio.h> -#include <asm/sections.h> -#include "entry.h" - -/* - * The restore of the saved pages in an hibernation image will set - * the change and referenced bits in the storage key for each page. - * Overindication of the referenced bits after an hibernation cycle - * does not cause any harm but the overindication of the change bits - * would cause trouble. - * Use the ARCH_SAVE_PAGE_KEYS hooks to save the storage key of each - * page to the most significant byte of the associated page frame - * number in the hibernation image. - */ - -/* - * Key storage is allocated as a linked list of pages. - * The size of the keys array is (PAGE_SIZE - sizeof(long)) - */ -struct page_key_data { - struct page_key_data *next; - unsigned char data[]; -}; - -#define PAGE_KEY_DATA_SIZE (PAGE_SIZE - sizeof(struct page_key_data *)) - -static struct page_key_data *page_key_data; -static struct page_key_data *page_key_rp, *page_key_wp; -static unsigned long page_key_rx, page_key_wx; -unsigned long suspend_zero_pages; - -/* - * For each page in the hibernation image one additional byte is - * stored in the most significant byte of the page frame number. - * On suspend no additional memory is required but on resume the - * keys need to be memorized until the page data has been restored. - * Only then can the storage keys be set to their old state. - */ -unsigned long page_key_additional_pages(unsigned long pages) -{ - return DIV_ROUND_UP(pages, PAGE_KEY_DATA_SIZE); -} - -/* - * Free page_key_data list of arrays. - */ -void page_key_free(void) -{ - struct page_key_data *pkd; - - while (page_key_data) { - pkd = page_key_data; - page_key_data = pkd->next; - free_page((unsigned long) pkd); - } -} - -/* - * Allocate page_key_data list of arrays with enough room to store - * one byte for each page in the hibernation image. - */ -int page_key_alloc(unsigned long pages) -{ - struct page_key_data *pk; - unsigned long size; - - size = DIV_ROUND_UP(pages, PAGE_KEY_DATA_SIZE); - while (size--) { - pk = (struct page_key_data *) get_zeroed_page(GFP_KERNEL); - if (!pk) { - page_key_free(); - return -ENOMEM; - } - pk->next = page_key_data; - page_key_data = pk; - } - page_key_rp = page_key_wp = page_key_data; - page_key_rx = page_key_wx = 0; - return 0; -} - -/* - * Save the storage key into the upper 8 bits of the page frame number. - */ -void page_key_read(unsigned long *pfn) -{ - struct page *page; - unsigned long addr; - unsigned char key; - - page = pfn_to_page(*pfn); - addr = (unsigned long) page_address(page); - key = (unsigned char) page_get_storage_key(addr) & 0x7f; - if (arch_test_page_nodat(page)) - key |= 0x80; - *(unsigned char *) pfn = key; -} - -/* - * Extract the storage key from the upper 8 bits of the page frame number - * and store it in the page_key_data list of arrays. - */ -void page_key_memorize(unsigned long *pfn) -{ - page_key_wp->data[page_key_wx] = *(unsigned char *) pfn; - *(unsigned char *) pfn = 0; - if (++page_key_wx < PAGE_KEY_DATA_SIZE) - return; - page_key_wp = page_key_wp->next; - page_key_wx = 0; -} - -/* - * Get the next key from the page_key_data list of arrays and set the - * storage key of the page referred by @address. If @address refers to - * a "safe" page the swsusp_arch_resume code will transfer the storage - * key from the buffer page to the original page. - */ -void page_key_write(void *address) -{ - struct page *page; - unsigned char key; - - key = page_key_rp->data[page_key_rx]; - page_set_storage_key((unsigned long) address, key & 0x7f, 0); - page = virt_to_page(address); - if (key & 0x80) - arch_set_page_nodat(page, 0); - else - arch_set_page_dat(page, 0); - if (++page_key_rx >= PAGE_KEY_DATA_SIZE) - return; - page_key_rp = page_key_rp->next; - page_key_rx = 0; -} - -int pfn_is_nosave(unsigned long pfn) -{ - unsigned long nosave_begin_pfn = PFN_DOWN(__pa(&__nosave_begin)); - unsigned long nosave_end_pfn = PFN_DOWN(__pa(&__nosave_end)); - unsigned long end_rodata_pfn = PFN_DOWN(__pa(__end_rodata)) - 1; - unsigned long stext_pfn = PFN_DOWN(__pa(_stext)); - - /* Always save lowcore pages (LC protection might be enabled). */ - if (pfn <= LC_PAGES) - return 0; - if (pfn >= nosave_begin_pfn && pfn < nosave_end_pfn) - return 1; - /* Skip memory holes and read-only pages (DCSS, ...). */ - if (pfn >= stext_pfn && pfn <= end_rodata_pfn) - return 0; - if (tprot(PFN_PHYS(pfn))) - return 1; - return 0; -} - -/* - * PM notifier callback for suspend - */ -static int suspend_pm_cb(struct notifier_block *nb, unsigned long action, - void *ptr) -{ - switch (action) { - case PM_SUSPEND_PREPARE: - case PM_HIBERNATION_PREPARE: - suspend_zero_pages = __get_free_pages(GFP_KERNEL, LC_ORDER); - if (!suspend_zero_pages) - return NOTIFY_BAD; - break; - case PM_POST_SUSPEND: - case PM_POST_HIBERNATION: - free_pages(suspend_zero_pages, LC_ORDER); - break; - default: - return NOTIFY_DONE; - } - return NOTIFY_OK; -} - -static int __init suspend_pm_init(void) -{ - pm_notifier(suspend_pm_cb, 0); - return 0; -} -arch_initcall(suspend_pm_init); - -void save_processor_state(void) -{ - /* swsusp_arch_suspend() actually saves all cpu register contents. - * Machine checks must be disabled since swsusp_arch_suspend() stores - * register contents to their lowcore save areas. That's the same - * place where register contents on machine checks would be saved. - * To avoid register corruption disable machine checks. - * We must also disable machine checks in the new psw mask for - * program checks, since swsusp_arch_suspend() may generate program - * checks. Disabling machine checks for all other new psw masks is - * just paranoia. - */ - local_mcck_disable(); - /* Disable lowcore protection */ - __ctl_clear_bit(0,28); - S390_lowcore.external_new_psw.mask &= ~PSW_MASK_MCHECK; - S390_lowcore.svc_new_psw.mask &= ~PSW_MASK_MCHECK; - S390_lowcore.io_new_psw.mask &= ~PSW_MASK_MCHECK; - S390_lowcore.program_new_psw.mask &= ~PSW_MASK_MCHECK; -} - -void restore_processor_state(void) -{ - S390_lowcore.external_new_psw.mask |= PSW_MASK_MCHECK; - S390_lowcore.svc_new_psw.mask |= PSW_MASK_MCHECK; - S390_lowcore.io_new_psw.mask |= PSW_MASK_MCHECK; - S390_lowcore.program_new_psw.mask |= PSW_MASK_MCHECK; - /* Enable lowcore protection */ - __ctl_set_bit(0,28); - local_mcck_enable(); -} - -/* Called at the end of swsusp_arch_resume */ -void s390_early_resume(void) -{ - lgr_info_log(); - channel_subsystem_reinit(); - zpci_rescan(); -} diff --git a/arch/s390/kernel/swsusp.S b/arch/s390/kernel/swsusp.S deleted file mode 100644 index a7baf0b5f818..000000000000 --- a/arch/s390/kernel/swsusp.S +++ /dev/null @@ -1,276 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * S390 64-bit swsusp implementation - * - * Copyright IBM Corp. 2009 - * - * Author(s): Hans-Joachim Picht <hans@linux.vnet.ibm.com> - * Michael Holzheu <holzheu@linux.vnet.ibm.com> - */ - -#include <linux/linkage.h> -#include <asm/page.h> -#include <asm/ptrace.h> -#include <asm/thread_info.h> -#include <asm/asm-offsets.h> -#include <asm/nospec-insn.h> -#include <asm/sigp.h> - -/* - * Save register context in absolute 0 lowcore and call swsusp_save() to - * create in-memory kernel image. The context is saved in the designated - * "store status" memory locations (see POP). - * We return from this function twice. The first time during the suspend to - * disk process. The second time via the swsusp_arch_resume() function - * (see below) in the resume process. - * This function runs with disabled interrupts. - */ - GEN_BR_THUNK %r14 - - .section .text -ENTRY(swsusp_arch_suspend) - lg %r1,__LC_NODAT_STACK - stmg %r6,%r15,__SF_GPRS(%r1) - aghi %r1,-STACK_FRAME_OVERHEAD - stg %r15,__SF_BACKCHAIN(%r1) - lgr %r15,%r1 - - /* Store FPU registers */ - brasl %r14,save_fpu_regs - - /* Deactivate DAT */ - stnsm __SF_EMPTY(%r15),0xfb - - /* Store prefix register on stack */ - stpx __SF_EMPTY(%r15) - - /* Save prefix register contents for lowcore copy */ - llgf %r10,__SF_EMPTY(%r15) - - /* Get pointer to save area */ - lghi %r1,0x1000 - - /* Save CPU address */ - stap __LC_EXT_CPU_ADDR(%r0) - - /* Store registers */ - mvc 0x318(4,%r1),__SF_EMPTY(%r15) /* move prefix to lowcore */ - stam %a0,%a15,0x340(%r1) /* store access registers */ - stctg %c0,%c15,0x380(%r1) /* store control registers */ - stmg %r0,%r15,0x280(%r1) /* store general registers */ - - stpt 0x328(%r1) /* store timer */ - stck __SF_EMPTY(%r15) /* store clock */ - stckc 0x330(%r1) /* store clock comparator */ - - /* Update cputime accounting before going to sleep */ - lg %r0,__LC_LAST_UPDATE_TIMER - slg %r0,0x328(%r1) - alg %r0,__LC_SYSTEM_TIMER - stg %r0,__LC_SYSTEM_TIMER - mvc __LC_LAST_UPDATE_TIMER(8),0x328(%r1) - lg %r0,__LC_LAST_UPDATE_CLOCK - slg %r0,__SF_EMPTY(%r15) - alg %r0,__LC_STEAL_TIMER - stg %r0,__LC_STEAL_TIMER - mvc __LC_LAST_UPDATE_CLOCK(8),__SF_EMPTY(%r15) - - /* Activate DAT */ - stosm __SF_EMPTY(%r15),0x04 - - /* Set prefix page to zero */ - xc __SF_EMPTY(4,%r15),__SF_EMPTY(%r15) - spx __SF_EMPTY(%r15) - - /* Save absolute zero pages */ - larl %r2,suspend_zero_pages - lg %r2,0(%r2) - lghi %r4,0 - lghi %r3,2*PAGE_SIZE - lghi %r5,2*PAGE_SIZE -1: mvcle %r2,%r4,0 - jo 1b - - /* Copy lowcore to absolute zero lowcore */ - lghi %r2,0 - lgr %r4,%r10 - lghi %r3,2*PAGE_SIZE - lghi %r5,2*PAGE_SIZE -1: mvcle %r2,%r4,0 - jo 1b - - /* Save image */ - brasl %r14,swsusp_save - - /* Restore prefix register and return */ - lghi %r1,0x1000 - spx 0x318(%r1) - lmg %r6,%r15,STACK_FRAME_OVERHEAD + __SF_GPRS(%r15) - lghi %r2,0 - BR_EX %r14 -ENDPROC(swsusp_arch_suspend) - -/* - * Restore saved memory image to correct place and restore register context. - * Then we return to the function that called swsusp_arch_suspend(). - * swsusp_arch_resume() runs with disabled interrupts. - */ -ENTRY(swsusp_arch_resume) - stmg %r6,%r15,__SF_GPRS(%r15) - lgr %r1,%r15 - aghi %r15,-STACK_FRAME_OVERHEAD - stg %r1,__SF_BACKCHAIN(%r15) - - /* Make all free pages stable */ - lghi %r2,1 - brasl %r14,arch_set_page_states - - /* Set prefix page to zero */ - xc __SF_EMPTY(4,%r15),__SF_EMPTY(%r15) - spx __SF_EMPTY(%r15) - - /* Deactivate DAT */ - stnsm __SF_EMPTY(%r15),0xfb - - /* Restore saved image */ - larl %r1,restore_pblist - lg %r1,0(%r1) - ltgr %r1,%r1 - jz 2f -0: - lg %r2,8(%r1) - lg %r4,0(%r1) - iske %r0,%r4 - lghi %r3,PAGE_SIZE - lghi %r5,PAGE_SIZE -1: - mvcle %r2,%r4,0 - jo 1b - lg %r2,8(%r1) - sske %r0,%r2 - lg %r1,16(%r1) - ltgr %r1,%r1 - jnz 0b -2: - ptlb /* flush tlb */ - - /* Reset System */ - larl %r1,.Lnew_pgm_check_psw - epsw %r2,%r3 - stm %r2,%r3,0(%r1) - mvc __LC_PGM_NEW_PSW(16,%r0),0(%r1) - larl %r1,__swsusp_reset_dma - lg %r1,0(%r1) - BASR_EX %r14,%r1 - larl %r1,smp_cpu_mt_shift - icm %r1,15,0(%r1) - jz smt_done - llgfr %r1,%r1 -smt_loop: - sigp %r1,%r0,SIGP_SET_MULTI_THREADING - brc 8,smt_done /* accepted */ - brc 2,smt_loop /* busy, try again */ -smt_done: - larl %r1,.Lnew_pgm_check_psw - lpswe 0(%r1) -pgm_check_entry: - - /* Switch to original suspend CPU */ - larl %r1,.Lresume_cpu /* Resume CPU address: r2 */ - stap 0(%r1) - llgh %r2,0(%r1) - llgh %r1,__LC_EXT_CPU_ADDR(%r0) /* Suspend CPU address: r1 */ - cgr %r1,%r2 - je restore_registers /* r1 = r2 -> nothing to do */ - larl %r4,.Lrestart_suspend_psw /* Set new restart PSW */ - mvc __LC_RST_NEW_PSW(16,%r0),0(%r4) -3: - sigp %r9,%r1,SIGP_INITIAL_CPU_RESET /* sigp initial cpu reset */ - brc 8,4f /* accepted */ - brc 2,3b /* busy, try again */ - - /* Suspend CPU not available -> panic */ - larl %r15,init_thread_union+THREAD_SIZE-STACK_FRAME_OVERHEAD - larl %r2,.Lpanic_string - brasl %r14,sclp_early_printk_force - larl %r3,.Ldisabled_wait_31 - lpsw 0(%r3) -4: - /* Switch to suspend CPU */ - sigp %r9,%r1,SIGP_RESTART /* sigp restart to suspend CPU */ - brc 2,4b /* busy, try again */ -5: - sigp %r9,%r2,SIGP_STOP /* sigp stop to current resume CPU */ - brc 2,5b /* busy, try again */ -6: j 6b - -restart_suspend: - larl %r1,.Lresume_cpu - llgh %r2,0(%r1) -7: - sigp %r9,%r2,SIGP_SENSE /* sigp sense, wait for resume CPU */ - brc 8,7b /* accepted, status 0, still running */ - brc 2,7b /* busy, try again */ - tmll %r9,0x40 /* Test if resume CPU is stopped */ - jz 7b - -restore_registers: - /* Restore registers */ - lghi %r13,0x1000 /* %r1 = pointer to save area */ - - /* Ignore time spent in suspended state. */ - llgf %r1,0x318(%r13) - stck __LC_LAST_UPDATE_CLOCK(%r1) - spt 0x328(%r13) /* reprogram timer */ - //sckc 0x330(%r13) /* set clock comparator */ - - lctlg %c0,%c15,0x380(%r13) /* load control registers */ - lam %a0,%a15,0x340(%r13) /* load access registers */ - - /* Load old stack */ - lg %r15,0x2f8(%r13) - - /* Save prefix register */ - mvc __SF_EMPTY(4,%r15),0x318(%r13) - - /* Restore absolute zero pages */ - lghi %r2,0 - larl %r4,suspend_zero_pages - lg %r4,0(%r4) - lghi %r3,2*PAGE_SIZE - lghi %r5,2*PAGE_SIZE -1: mvcle %r2,%r4,0 - jo 1b - - /* Restore prefix register */ - spx __SF_EMPTY(%r15) - - /* Activate DAT */ - stosm __SF_EMPTY(%r15),0x04 - - /* Make all free pages unstable */ - lghi %r2,0 - brasl %r14,arch_set_page_states - - /* Call arch specific early resume code */ - brasl %r14,s390_early_resume - - /* Return 0 */ - lmg %r6,%r15,STACK_FRAME_OVERHEAD + __SF_GPRS(%r15) - lghi %r2,0 - BR_EX %r14 -ENDPROC(swsusp_arch_resume) - - .section .data..nosave,"aw",@progbits - .align 8 -.Ldisabled_wait_31: - .long 0x000a0000,0x00000000 -.Lpanic_string: - .asciz "Resume not possible because suspend CPU is no longer available\n" - .align 8 -.Lrestart_suspend_psw: - .quad 0x0000000180000000,restart_suspend -.Lnew_pgm_check_psw: - .quad 0,pgm_check_entry -.Lresume_cpu: - .byte 0,0 diff --git a/arch/s390/kernel/topology.c b/arch/s390/kernel/topology.c index 3627953007ed..5f70cefc13e4 100644 --- a/arch/s390/kernel/topology.c +++ b/arch/s390/kernel/topology.c @@ -26,7 +26,6 @@ #include <linux/nodemask.h> #include <linux/node.h> #include <asm/sysinfo.h> -#include <asm/numa.h> #define PTF_HORIZONTAL (0UL) #define PTF_VERTICAL (1UL) @@ -63,8 +62,6 @@ static struct mask_info drawer_info; struct cpu_topology_s390 cpu_topology[NR_CPUS]; EXPORT_SYMBOL_GPL(cpu_topology); -cpumask_t cpus_with_topology; - static cpumask_t cpu_group_map(struct mask_info *info, unsigned int cpu) { cpumask_t mask; @@ -86,11 +83,12 @@ static cpumask_t cpu_group_map(struct mask_info *info, unsigned int cpu) cpumask_copy(&mask, cpu_present_mask); break; default: - /* fallthrough */ + fallthrough; case TOPOLOGY_MODE_SINGLE: cpumask_copy(&mask, cpumask_of(cpu)); break; } + cpumask_and(&mask, &mask, cpu_online_mask); return mask; } @@ -106,6 +104,7 @@ static cpumask_t cpu_thread_map(unsigned int cpu) for (i = 0; i <= smp_cpu_mtid; i++) if (cpu_present(cpu + i)) cpumask_set_cpu(cpu + i, &mask); + cpumask_and(&mask, &mask, cpu_online_mask); return mask; } @@ -138,7 +137,6 @@ static void add_cpus_to_mask(struct topology_core *tl_core, cpumask_set_cpu(lcpu + i, &drawer->mask); cpumask_set_cpu(lcpu + i, &book->mask); cpumask_set_cpu(lcpu + i, &socket->mask); - cpumask_set_cpu(lcpu + i, &cpus_with_topology); smp_cpu_set_polarization(lcpu + i, tl_core->pp); } } @@ -245,10 +243,10 @@ int topology_set_cpu_management(int fc) return rc; } -static void update_cpu_masks(void) +void update_cpu_masks(void) { - struct cpu_topology_s390 *topo; - int cpu, id; + struct cpu_topology_s390 *topo, *topo_package, *topo_sibling; + int cpu, sibling, pkg_first, smt_first, id; for_each_possible_cpu(cpu) { topo = &cpu_topology[cpu]; @@ -256,6 +254,7 @@ static void update_cpu_masks(void) topo->core_mask = cpu_group_map(&socket_info, cpu); topo->book_mask = cpu_group_map(&book_info, cpu); topo->drawer_mask = cpu_group_map(&drawer_info, cpu); + topo->booted_cores = 0; if (topology_mode != TOPOLOGY_MODE_HW) { id = topology_mode == TOPOLOGY_MODE_PACKAGE ? 0 : cpu; topo->thread_id = cpu; @@ -263,11 +262,23 @@ static void update_cpu_masks(void) topo->socket_id = id; topo->book_id = id; topo->drawer_id = id; - if (cpu_present(cpu)) - cpumask_set_cpu(cpu, &cpus_with_topology); } } - numa_update_cpu_topology(); + for_each_online_cpu(cpu) { + topo = &cpu_topology[cpu]; + pkg_first = cpumask_first(&topo->core_mask); + topo_package = &cpu_topology[pkg_first]; + if (cpu == pkg_first) { + for_each_cpu(sibling, &topo->core_mask) { + topo_sibling = &cpu_topology[sibling]; + smt_first = cpumask_first(&topo_sibling->thread_mask); + if (sibling == smt_first) + topo_package->booted_cores++; + } + } else { + topo->booted_cores = topo_package->booted_cores; + } + } } void store_topology(struct sysinfo_15_1_x *info) @@ -289,7 +300,6 @@ static int __arch_update_cpu_topology(void) int rc = 0; mutex_lock(&smp_cpu_state_mutex); - cpumask_clear(&cpus_with_topology); if (MACHINE_HAS_TOPOLOGY) { rc = 1; store_topology(info); diff --git a/arch/s390/kernel/traps.c b/arch/s390/kernel/traps.c index dc75588d7894..ff9cc4c3290e 100644 --- a/arch/s390/kernel/traps.c +++ b/arch/s390/kernel/traps.c @@ -271,7 +271,7 @@ void kernel_stack_overflow(struct pt_regs *regs) } NOKPROBE_SYMBOL(kernel_stack_overflow); -static void test_monitor_call(void) +static void __init test_monitor_call(void) { int val = 1; diff --git a/arch/s390/kernel/uv.c b/arch/s390/kernel/uv.c new file mode 100644 index 000000000000..c86d654351d1 --- /dev/null +++ b/arch/s390/kernel/uv.c @@ -0,0 +1,414 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Common Ultravisor functions and initialization + * + * Copyright IBM Corp. 2019, 2020 + */ +#define KMSG_COMPONENT "prot_virt" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + +#include <linux/kernel.h> +#include <linux/types.h> +#include <linux/sizes.h> +#include <linux/bitmap.h> +#include <linux/memblock.h> +#include <linux/pagemap.h> +#include <linux/swap.h> +#include <asm/facility.h> +#include <asm/sections.h> +#include <asm/uv.h> + +/* the bootdata_preserved fields come from ones in arch/s390/boot/uv.c */ +#ifdef CONFIG_PROTECTED_VIRTUALIZATION_GUEST +int __bootdata_preserved(prot_virt_guest); +#endif + +#if IS_ENABLED(CONFIG_KVM) +int prot_virt_host; +EXPORT_SYMBOL(prot_virt_host); +struct uv_info __bootdata_preserved(uv_info); +EXPORT_SYMBOL(uv_info); + +static int __init prot_virt_setup(char *val) +{ + bool enabled; + int rc; + + rc = kstrtobool(val, &enabled); + if (!rc && enabled) + prot_virt_host = 1; + + if (is_prot_virt_guest() && prot_virt_host) { + prot_virt_host = 0; + pr_warn("Protected virtualization not available in protected guests."); + } + + if (prot_virt_host && !test_facility(158)) { + prot_virt_host = 0; + pr_warn("Protected virtualization not supported by the hardware."); + } + + return rc; +} +early_param("prot_virt", prot_virt_setup); + +static int __init uv_init(unsigned long stor_base, unsigned long stor_len) +{ + struct uv_cb_init uvcb = { + .header.cmd = UVC_CMD_INIT_UV, + .header.len = sizeof(uvcb), + .stor_origin = stor_base, + .stor_len = stor_len, + }; + + if (uv_call(0, (uint64_t)&uvcb)) { + pr_err("Ultravisor init failed with rc: 0x%x rrc: 0%x\n", + uvcb.header.rc, uvcb.header.rrc); + return -1; + } + return 0; +} + +void __init setup_uv(void) +{ + unsigned long uv_stor_base; + + uv_stor_base = (unsigned long)memblock_alloc_try_nid( + uv_info.uv_base_stor_len, SZ_1M, SZ_2G, + MEMBLOCK_ALLOC_ACCESSIBLE, NUMA_NO_NODE); + if (!uv_stor_base) { + pr_warn("Failed to reserve %lu bytes for ultravisor base storage\n", + uv_info.uv_base_stor_len); + goto fail; + } + + if (uv_init(uv_stor_base, uv_info.uv_base_stor_len)) { + memblock_free(uv_stor_base, uv_info.uv_base_stor_len); + goto fail; + } + + pr_info("Reserving %luMB as ultravisor base storage\n", + uv_info.uv_base_stor_len >> 20); + return; +fail: + pr_info("Disabling support for protected virtualization"); + prot_virt_host = 0; +} + +void adjust_to_uv_max(unsigned long *vmax) +{ + *vmax = min_t(unsigned long, *vmax, uv_info.max_sec_stor_addr); +} + +/* + * Requests the Ultravisor to pin the page in the shared state. This will + * cause an intercept when the guest attempts to unshare the pinned page. + */ +static int uv_pin_shared(unsigned long paddr) +{ + struct uv_cb_cfs uvcb = { + .header.cmd = UVC_CMD_PIN_PAGE_SHARED, + .header.len = sizeof(uvcb), + .paddr = paddr, + }; + + if (uv_call(0, (u64)&uvcb)) + return -EINVAL; + return 0; +} + +/* + * Requests the Ultravisor to encrypt a guest page and make it + * accessible to the host for paging (export). + * + * @paddr: Absolute host address of page to be exported + */ +int uv_convert_from_secure(unsigned long paddr) +{ + struct uv_cb_cfs uvcb = { + .header.cmd = UVC_CMD_CONV_FROM_SEC_STOR, + .header.len = sizeof(uvcb), + .paddr = paddr + }; + + if (uv_call(0, (u64)&uvcb)) + return -EINVAL; + return 0; +} + +/* + * Calculate the expected ref_count for a page that would otherwise have no + * further pins. This was cribbed from similar functions in other places in + * the kernel, but with some slight modifications. We know that a secure + * page can not be a huge page for example. + */ +static int expected_page_refs(struct page *page) +{ + int res; + + res = page_mapcount(page); + if (PageSwapCache(page)) { + res++; + } else if (page_mapping(page)) { + res++; + if (page_has_private(page)) + res++; + } + return res; +} + +static int make_secure_pte(pte_t *ptep, unsigned long addr, + struct page *exp_page, struct uv_cb_header *uvcb) +{ + pte_t entry = READ_ONCE(*ptep); + struct page *page; + int expected, rc = 0; + + if (!pte_present(entry)) + return -ENXIO; + if (pte_val(entry) & _PAGE_INVALID) + return -ENXIO; + + page = pte_page(entry); + if (page != exp_page) + return -ENXIO; + if (PageWriteback(page)) + return -EAGAIN; + expected = expected_page_refs(page); + if (!page_ref_freeze(page, expected)) + return -EBUSY; + set_bit(PG_arch_1, &page->flags); + rc = uv_call(0, (u64)uvcb); + page_ref_unfreeze(page, expected); + /* Return -ENXIO if the page was not mapped, -EINVAL otherwise */ + if (rc) + rc = uvcb->rc == 0x10a ? -ENXIO : -EINVAL; + return rc; +} + +/* + * Requests the Ultravisor to make a page accessible to a guest. + * If it's brought in the first time, it will be cleared. If + * it has been exported before, it will be decrypted and integrity + * checked. + */ +int gmap_make_secure(struct gmap *gmap, unsigned long gaddr, void *uvcb) +{ + struct vm_area_struct *vma; + bool local_drain = false; + spinlock_t *ptelock; + unsigned long uaddr; + struct page *page; + pte_t *ptep; + int rc; + +again: + rc = -EFAULT; + down_read(&gmap->mm->mmap_sem); + + uaddr = __gmap_translate(gmap, gaddr); + if (IS_ERR_VALUE(uaddr)) + goto out; + vma = find_vma(gmap->mm, uaddr); + if (!vma) + goto out; + /* + * Secure pages cannot be huge and userspace should not combine both. + * In case userspace does it anyway this will result in an -EFAULT for + * the unpack. The guest is thus never reaching secure mode. If + * userspace is playing dirty tricky with mapping huge pages later + * on this will result in a segmentation fault. + */ + if (is_vm_hugetlb_page(vma)) + goto out; + + rc = -ENXIO; + page = follow_page(vma, uaddr, FOLL_WRITE); + if (IS_ERR_OR_NULL(page)) + goto out; + + lock_page(page); + ptep = get_locked_pte(gmap->mm, uaddr, &ptelock); + rc = make_secure_pte(ptep, uaddr, page, uvcb); + pte_unmap_unlock(ptep, ptelock); + unlock_page(page); +out: + up_read(&gmap->mm->mmap_sem); + + if (rc == -EAGAIN) { + wait_on_page_writeback(page); + } else if (rc == -EBUSY) { + /* + * If we have tried a local drain and the page refcount + * still does not match our expected safe value, try with a + * system wide drain. This is needed if the pagevecs holding + * the page are on a different CPU. + */ + if (local_drain) { + lru_add_drain_all(); + /* We give up here, and let the caller try again */ + return -EAGAIN; + } + /* + * We are here if the page refcount does not match the + * expected safe value. The main culprits are usually + * pagevecs. With lru_add_drain() we drain the pagevecs + * on the local CPU so that hopefully the refcount will + * reach the expected safe value. + */ + lru_add_drain(); + local_drain = true; + /* And now we try again immediately after draining */ + goto again; + } else if (rc == -ENXIO) { + if (gmap_fault(gmap, gaddr, FAULT_FLAG_WRITE)) + return -EFAULT; + return -EAGAIN; + } + return rc; +} +EXPORT_SYMBOL_GPL(gmap_make_secure); + +int gmap_convert_to_secure(struct gmap *gmap, unsigned long gaddr) +{ + struct uv_cb_cts uvcb = { + .header.cmd = UVC_CMD_CONV_TO_SEC_STOR, + .header.len = sizeof(uvcb), + .guest_handle = gmap->guest_handle, + .gaddr = gaddr, + }; + + return gmap_make_secure(gmap, gaddr, &uvcb); +} +EXPORT_SYMBOL_GPL(gmap_convert_to_secure); + +/* + * To be called with the page locked or with an extra reference! This will + * prevent gmap_make_secure from touching the page concurrently. Having 2 + * parallel make_page_accessible is fine, as the UV calls will become a + * no-op if the page is already exported. + */ +int arch_make_page_accessible(struct page *page) +{ + int rc = 0; + + /* Hugepage cannot be protected, so nothing to do */ + if (PageHuge(page)) + return 0; + + /* + * PG_arch_1 is used in 3 places: + * 1. for kernel page tables during early boot + * 2. for storage keys of huge pages and KVM + * 3. As an indication that this page might be secure. This can + * overindicate, e.g. we set the bit before calling + * convert_to_secure. + * As secure pages are never huge, all 3 variants can co-exists. + */ + if (!test_bit(PG_arch_1, &page->flags)) + return 0; + + rc = uv_pin_shared(page_to_phys(page)); + if (!rc) { + clear_bit(PG_arch_1, &page->flags); + return 0; + } + + rc = uv_convert_from_secure(page_to_phys(page)); + if (!rc) { + clear_bit(PG_arch_1, &page->flags); + return 0; + } + + return rc; +} +EXPORT_SYMBOL_GPL(arch_make_page_accessible); + +#endif + +#if defined(CONFIG_PROTECTED_VIRTUALIZATION_GUEST) || IS_ENABLED(CONFIG_KVM) +static ssize_t uv_query_facilities(struct kobject *kobj, + struct kobj_attribute *attr, char *page) +{ + return snprintf(page, PAGE_SIZE, "%lx\n%lx\n%lx\n%lx\n", + uv_info.inst_calls_list[0], + uv_info.inst_calls_list[1], + uv_info.inst_calls_list[2], + uv_info.inst_calls_list[3]); +} + +static struct kobj_attribute uv_query_facilities_attr = + __ATTR(facilities, 0444, uv_query_facilities, NULL); + +static ssize_t uv_query_max_guest_cpus(struct kobject *kobj, + struct kobj_attribute *attr, char *page) +{ + return snprintf(page, PAGE_SIZE, "%d\n", + uv_info.max_guest_cpus); +} + +static struct kobj_attribute uv_query_max_guest_cpus_attr = + __ATTR(max_cpus, 0444, uv_query_max_guest_cpus, NULL); + +static ssize_t uv_query_max_guest_vms(struct kobject *kobj, + struct kobj_attribute *attr, char *page) +{ + return snprintf(page, PAGE_SIZE, "%d\n", + uv_info.max_num_sec_conf); +} + +static struct kobj_attribute uv_query_max_guest_vms_attr = + __ATTR(max_guests, 0444, uv_query_max_guest_vms, NULL); + +static ssize_t uv_query_max_guest_addr(struct kobject *kobj, + struct kobj_attribute *attr, char *page) +{ + return snprintf(page, PAGE_SIZE, "%lx\n", + uv_info.max_sec_stor_addr); +} + +static struct kobj_attribute uv_query_max_guest_addr_attr = + __ATTR(max_address, 0444, uv_query_max_guest_addr, NULL); + +static struct attribute *uv_query_attrs[] = { + &uv_query_facilities_attr.attr, + &uv_query_max_guest_cpus_attr.attr, + &uv_query_max_guest_vms_attr.attr, + &uv_query_max_guest_addr_attr.attr, + NULL, +}; + +static struct attribute_group uv_query_attr_group = { + .attrs = uv_query_attrs, +}; + +static struct kset *uv_query_kset; +static struct kobject *uv_kobj; + +static int __init uv_info_init(void) +{ + int rc = -ENOMEM; + + if (!test_facility(158)) + return 0; + + uv_kobj = kobject_create_and_add("uv", firmware_kobj); + if (!uv_kobj) + return -ENOMEM; + + uv_query_kset = kset_create_and_add("query", NULL, uv_kobj); + if (!uv_query_kset) + goto out_kobj; + + rc = sysfs_create_group(&uv_query_kset->kobj, &uv_query_attr_group); + if (!rc) + return 0; + + kset_unregister(uv_query_kset); +out_kobj: + kobject_del(uv_kobj); + kobject_put(uv_kobj); + return rc; +} +device_initcall(uv_info_init); +#endif diff --git a/arch/s390/kernel/vdso64/.gitignore b/arch/s390/kernel/vdso64/.gitignore index 3fd18cf9fec2..4ec80685fecc 100644 --- a/arch/s390/kernel/vdso64/.gitignore +++ b/arch/s390/kernel/vdso64/.gitignore @@ -1 +1,2 @@ +# SPDX-License-Identifier: GPL-2.0-only vdso64.lds diff --git a/arch/s390/kvm/Makefile b/arch/s390/kvm/Makefile index 05ee90a5ea08..12decca22e7c 100644 --- a/arch/s390/kvm/Makefile +++ b/arch/s390/kvm/Makefile @@ -9,6 +9,6 @@ common-objs = $(KVM)/kvm_main.o $(KVM)/eventfd.o $(KVM)/async_pf.o $(KVM)/irqch ccflags-y := -Ivirt/kvm -Iarch/s390/kvm kvm-objs := $(common-objs) kvm-s390.o intercept.o interrupt.o priv.o sigp.o -kvm-objs += diag.o gaccess.o guestdbg.o vsie.o +kvm-objs += diag.o gaccess.o guestdbg.o vsie.o pv.o obj-$(CONFIG_KVM) += kvm.o diff --git a/arch/s390/kvm/diag.c b/arch/s390/kvm/diag.c index 3fb54ec2cf3e..563429dece03 100644 --- a/arch/s390/kvm/diag.c +++ b/arch/s390/kvm/diag.c @@ -2,7 +2,7 @@ /* * handling diagnose instructions * - * Copyright IBM Corp. 2008, 2011 + * Copyright IBM Corp. 2008, 2020 * * Author(s): Carsten Otte <cotte@de.ibm.com> * Christian Borntraeger <borntraeger@de.ibm.com> @@ -201,6 +201,10 @@ static int __diag_ipl_functions(struct kvm_vcpu *vcpu) return -EOPNOTSUPP; } + /* + * no need to check the return value of vcpu_stop as it can only have + * an error for protvirt, but protvirt means user cpu state + */ if (!kvm_s390_user_cpu_state_ctrl(vcpu->kvm)) kvm_s390_vcpu_stop(vcpu); vcpu->run->s390_reset_flags |= KVM_S390_RESET_SUBSYSTEM; diff --git a/arch/s390/kvm/gaccess.c b/arch/s390/kvm/gaccess.c index 07d30ffcfa41..47a67a958107 100644 --- a/arch/s390/kvm/gaccess.c +++ b/arch/s390/kvm/gaccess.c @@ -505,7 +505,7 @@ static int trans_exc(struct kvm_vcpu *vcpu, int code, unsigned long gva, switch (prot) { case PROT_TYPE_IEP: tec->b61 = 1; - /* FALL THROUGH */ + fallthrough; case PROT_TYPE_LA: tec->b56 = 1; break; @@ -514,12 +514,12 @@ static int trans_exc(struct kvm_vcpu *vcpu, int code, unsigned long gva, break; case PROT_TYPE_ALC: tec->b60 = 1; - /* FALL THROUGH */ + fallthrough; case PROT_TYPE_DAT: tec->b61 = 1; break; } - /* FALL THROUGH */ + fallthrough; case PGM_ASCE_TYPE: case PGM_PAGE_TRANSLATION: case PGM_REGION_FIRST_TRANS: @@ -534,7 +534,7 @@ static int trans_exc(struct kvm_vcpu *vcpu, int code, unsigned long gva, tec->addr = gva >> PAGE_SHIFT; tec->fsi = mode == GACC_STORE ? FSI_STORE : FSI_FETCH; tec->as = psw_bits(vcpu->arch.sie_block->gpsw).as; - /* FALL THROUGH */ + fallthrough; case PGM_ALEN_TRANSLATION: case PGM_ALE_SEQUENCE: case PGM_ASTE_VALIDITY: @@ -677,7 +677,7 @@ static unsigned long guest_translate(struct kvm_vcpu *vcpu, unsigned long gva, dat_protection |= rfte.p; ptr = rfte.rto * PAGE_SIZE + vaddr.rsx * 8; } - /* fallthrough */ + fallthrough; case ASCE_TYPE_REGION2: { union region2_table_entry rste; @@ -695,7 +695,7 @@ static unsigned long guest_translate(struct kvm_vcpu *vcpu, unsigned long gva, dat_protection |= rste.p; ptr = rste.rto * PAGE_SIZE + vaddr.rtx * 8; } - /* fallthrough */ + fallthrough; case ASCE_TYPE_REGION3: { union region3_table_entry rtte; @@ -723,7 +723,7 @@ static unsigned long guest_translate(struct kvm_vcpu *vcpu, unsigned long gva, dat_protection |= rtte.fc0.p; ptr = rtte.fc0.sto * PAGE_SIZE + vaddr.sx * 8; } - /* fallthrough */ + fallthrough; case ASCE_TYPE_SEGMENT: { union segment_table_entry ste; @@ -1050,7 +1050,8 @@ shadow_r2t: rc = gmap_shadow_r2t(sg, saddr, rfte.val, *fake); if (rc) return rc; - } /* fallthrough */ + } + fallthrough; case ASCE_TYPE_REGION2: { union region2_table_entry rste; @@ -1076,7 +1077,8 @@ shadow_r3t: rc = gmap_shadow_r3t(sg, saddr, rste.val, *fake); if (rc) return rc; - } /* fallthrough */ + } + fallthrough; case ASCE_TYPE_REGION3: { union region3_table_entry rtte; @@ -1111,7 +1113,8 @@ shadow_sgt: rc = gmap_shadow_sgt(sg, saddr, rtte.val, *fake); if (rc) return rc; - } /* fallthrough */ + } + fallthrough; case ASCE_TYPE_SEGMENT: { union segment_table_entry ste; diff --git a/arch/s390/kvm/intercept.c b/arch/s390/kvm/intercept.c index a389fa85cca2..e7a7c499a73f 100644 --- a/arch/s390/kvm/intercept.c +++ b/arch/s390/kvm/intercept.c @@ -2,7 +2,7 @@ /* * in-kernel handling for sie intercepts * - * Copyright IBM Corp. 2008, 2014 + * Copyright IBM Corp. 2008, 2020 * * Author(s): Carsten Otte <cotte@de.ibm.com> * Christian Borntraeger <borntraeger@de.ibm.com> @@ -12,10 +12,10 @@ #include <linux/errno.h> #include <linux/pagemap.h> -#include <asm/kvm_host.h> #include <asm/asm-offsets.h> #include <asm/irq.h> #include <asm/sysinfo.h> +#include <asm/uv.h> #include "kvm-s390.h" #include "gaccess.h" @@ -79,6 +79,10 @@ static int handle_stop(struct kvm_vcpu *vcpu) return rc; } + /* + * no need to check the return value of vcpu_stop as it can only have + * an error for protvirt, but protvirt means user cpu state + */ if (!kvm_s390_user_cpu_state_ctrl(vcpu->kvm)) kvm_s390_vcpu_stop(vcpu); return -EOPNOTSUPP; @@ -231,6 +235,13 @@ static int handle_prog(struct kvm_vcpu *vcpu) vcpu->stat.exit_program_interruption++; + /* + * Intercept 8 indicates a loop of specification exceptions + * for protected guests. + */ + if (kvm_s390_pv_cpu_is_protected(vcpu)) + return -EOPNOTSUPP; + if (guestdbg_enabled(vcpu) && per_event(vcpu)) { rc = kvm_s390_handle_per_event(vcpu); if (rc) @@ -384,7 +395,7 @@ int handle_sthyi(struct kvm_vcpu *vcpu) goto out; } - if (addr & ~PAGE_MASK) + if (!kvm_s390_pv_cpu_is_protected(vcpu) && (addr & ~PAGE_MASK)) return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); sctns = (void *)get_zeroed_page(GFP_KERNEL); @@ -395,10 +406,15 @@ int handle_sthyi(struct kvm_vcpu *vcpu) out: if (!cc) { - r = write_guest(vcpu, addr, reg2, sctns, PAGE_SIZE); - if (r) { - free_page((unsigned long)sctns); - return kvm_s390_inject_prog_cond(vcpu, r); + if (kvm_s390_pv_cpu_is_protected(vcpu)) { + memcpy((void *)(sida_origin(vcpu->arch.sie_block)), + sctns, PAGE_SIZE); + } else { + r = write_guest(vcpu, addr, reg2, sctns, PAGE_SIZE); + if (r) { + free_page((unsigned long)sctns); + return kvm_s390_inject_prog_cond(vcpu, r); + } } } @@ -444,6 +460,77 @@ static int handle_operexc(struct kvm_vcpu *vcpu) return kvm_s390_inject_program_int(vcpu, PGM_OPERATION); } +static int handle_pv_spx(struct kvm_vcpu *vcpu) +{ + u32 pref = *(u32 *)vcpu->arch.sie_block->sidad; + + kvm_s390_set_prefix(vcpu, pref); + trace_kvm_s390_handle_prefix(vcpu, 1, pref); + return 0; +} + +static int handle_pv_sclp(struct kvm_vcpu *vcpu) +{ + struct kvm_s390_float_interrupt *fi = &vcpu->kvm->arch.float_int; + + spin_lock(&fi->lock); + /* + * 2 cases: + * a: an sccb answering interrupt was already pending or in flight. + * As the sccb value is not known we can simply set some value to + * trigger delivery of a saved SCCB. UV will then use its saved + * copy of the SCCB value. + * b: an error SCCB interrupt needs to be injected so we also inject + * a fake SCCB address. Firmware will use the proper one. + * This makes sure, that both errors and real sccb returns will only + * be delivered after a notification intercept (instruction has + * finished) but not after others. + */ + fi->srv_signal.ext_params |= 0x43000; + set_bit(IRQ_PEND_EXT_SERVICE, &fi->pending_irqs); + clear_bit(IRQ_PEND_EXT_SERVICE, &fi->masked_irqs); + spin_unlock(&fi->lock); + return 0; +} + +static int handle_pv_uvc(struct kvm_vcpu *vcpu) +{ + struct uv_cb_share *guest_uvcb = (void *)vcpu->arch.sie_block->sidad; + struct uv_cb_cts uvcb = { + .header.cmd = UVC_CMD_UNPIN_PAGE_SHARED, + .header.len = sizeof(uvcb), + .guest_handle = kvm_s390_pv_get_handle(vcpu->kvm), + .gaddr = guest_uvcb->paddr, + }; + int rc; + + if (guest_uvcb->header.cmd != UVC_CMD_REMOVE_SHARED_ACCESS) { + WARN_ONCE(1, "Unexpected notification intercept for UVC 0x%x\n", + guest_uvcb->header.cmd); + return 0; + } + rc = gmap_make_secure(vcpu->arch.gmap, uvcb.gaddr, &uvcb); + /* + * If the unpin did not succeed, the guest will exit again for the UVC + * and we will retry the unpin. + */ + if (rc == -EINVAL) + return 0; + return rc; +} + +static int handle_pv_notification(struct kvm_vcpu *vcpu) +{ + if (vcpu->arch.sie_block->ipa == 0xb210) + return handle_pv_spx(vcpu); + if (vcpu->arch.sie_block->ipa == 0xb220) + return handle_pv_sclp(vcpu); + if (vcpu->arch.sie_block->ipa == 0xb9a4) + return handle_pv_uvc(vcpu); + + return handle_instruction(vcpu); +} + int kvm_handle_sie_intercept(struct kvm_vcpu *vcpu) { int rc, per_rc = 0; @@ -480,6 +567,28 @@ int kvm_handle_sie_intercept(struct kvm_vcpu *vcpu) case ICPT_KSS: rc = kvm_s390_skey_check_enable(vcpu); break; + case ICPT_MCHKREQ: + case ICPT_INT_ENABLE: + /* + * PSW bit 13 or a CR (0, 6, 14) changed and we might + * now be able to deliver interrupts. The pre-run code + * will take care of this. + */ + rc = 0; + break; + case ICPT_PV_INSTR: + rc = handle_instruction(vcpu); + break; + case ICPT_PV_NOTIFY: + rc = handle_pv_notification(vcpu); + break; + case ICPT_PV_PREF: + rc = 0; + gmap_convert_to_secure(vcpu->arch.gmap, + kvm_s390_get_prefix(vcpu)); + gmap_convert_to_secure(vcpu->arch.gmap, + kvm_s390_get_prefix(vcpu) + PAGE_SIZE); + break; default: return -EOPNOTSUPP; } diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c index c06c89d370a7..8191106bf7b9 100644 --- a/arch/s390/kvm/interrupt.c +++ b/arch/s390/kvm/interrupt.c @@ -2,7 +2,7 @@ /* * handling kvm guest interrupts * - * Copyright IBM Corp. 2008, 2015 + * Copyright IBM Corp. 2008, 2020 * * Author(s): Carsten Otte <cotte@de.ibm.com> */ @@ -324,8 +324,11 @@ static inline int gisa_tac_ipm_gisc(struct kvm_s390_gisa *gisa, u32 gisc) static inline unsigned long pending_irqs_no_gisa(struct kvm_vcpu *vcpu) { - return vcpu->kvm->arch.float_int.pending_irqs | - vcpu->arch.local_int.pending_irqs; + unsigned long pending = vcpu->kvm->arch.float_int.pending_irqs | + vcpu->arch.local_int.pending_irqs; + + pending &= ~vcpu->kvm->arch.float_int.masked_irqs; + return pending; } static inline unsigned long pending_irqs(struct kvm_vcpu *vcpu) @@ -383,10 +386,18 @@ static unsigned long deliverable_irqs(struct kvm_vcpu *vcpu) __clear_bit(IRQ_PEND_EXT_CLOCK_COMP, &active_mask); if (!(vcpu->arch.sie_block->gcr[0] & CR0_CPU_TIMER_SUBMASK)) __clear_bit(IRQ_PEND_EXT_CPU_TIMER, &active_mask); - if (!(vcpu->arch.sie_block->gcr[0] & CR0_SERVICE_SIGNAL_SUBMASK)) + if (!(vcpu->arch.sie_block->gcr[0] & CR0_SERVICE_SIGNAL_SUBMASK)) { __clear_bit(IRQ_PEND_EXT_SERVICE, &active_mask); + __clear_bit(IRQ_PEND_EXT_SERVICE_EV, &active_mask); + } if (psw_mchk_disabled(vcpu)) active_mask &= ~IRQ_PEND_MCHK_MASK; + /* PV guest cpus can have a single interruption injected at a time. */ + if (kvm_s390_pv_cpu_is_protected(vcpu) && + vcpu->arch.sie_block->iictl != IICTL_CODE_NONE) + active_mask &= ~(IRQ_PEND_EXT_II_MASK | + IRQ_PEND_IO_MASK | + IRQ_PEND_MCHK_MASK); /* * Check both floating and local interrupt's cr14 because * bit IRQ_PEND_MCHK_REP could be set in both cases. @@ -479,19 +490,23 @@ static void set_intercept_indicators(struct kvm_vcpu *vcpu) static int __must_check __deliver_cpu_timer(struct kvm_vcpu *vcpu) { struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int; - int rc; + int rc = 0; vcpu->stat.deliver_cputm++; trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, KVM_S390_INT_CPU_TIMER, 0, 0); - - rc = put_guest_lc(vcpu, EXT_IRQ_CPU_TIMER, - (u16 *)__LC_EXT_INT_CODE); - rc |= put_guest_lc(vcpu, 0, (u16 *)__LC_EXT_CPU_ADDR); - rc |= write_guest_lc(vcpu, __LC_EXT_OLD_PSW, - &vcpu->arch.sie_block->gpsw, sizeof(psw_t)); - rc |= read_guest_lc(vcpu, __LC_EXT_NEW_PSW, - &vcpu->arch.sie_block->gpsw, sizeof(psw_t)); + if (kvm_s390_pv_cpu_is_protected(vcpu)) { + vcpu->arch.sie_block->iictl = IICTL_CODE_EXT; + vcpu->arch.sie_block->eic = EXT_IRQ_CPU_TIMER; + } else { + rc = put_guest_lc(vcpu, EXT_IRQ_CPU_TIMER, + (u16 *)__LC_EXT_INT_CODE); + rc |= put_guest_lc(vcpu, 0, (u16 *)__LC_EXT_CPU_ADDR); + rc |= write_guest_lc(vcpu, __LC_EXT_OLD_PSW, + &vcpu->arch.sie_block->gpsw, sizeof(psw_t)); + rc |= read_guest_lc(vcpu, __LC_EXT_NEW_PSW, + &vcpu->arch.sie_block->gpsw, sizeof(psw_t)); + } clear_bit(IRQ_PEND_EXT_CPU_TIMER, &li->pending_irqs); return rc ? -EFAULT : 0; } @@ -499,19 +514,23 @@ static int __must_check __deliver_cpu_timer(struct kvm_vcpu *vcpu) static int __must_check __deliver_ckc(struct kvm_vcpu *vcpu) { struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int; - int rc; + int rc = 0; vcpu->stat.deliver_ckc++; trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, KVM_S390_INT_CLOCK_COMP, 0, 0); - - rc = put_guest_lc(vcpu, EXT_IRQ_CLK_COMP, - (u16 __user *)__LC_EXT_INT_CODE); - rc |= put_guest_lc(vcpu, 0, (u16 *)__LC_EXT_CPU_ADDR); - rc |= write_guest_lc(vcpu, __LC_EXT_OLD_PSW, - &vcpu->arch.sie_block->gpsw, sizeof(psw_t)); - rc |= read_guest_lc(vcpu, __LC_EXT_NEW_PSW, - &vcpu->arch.sie_block->gpsw, sizeof(psw_t)); + if (kvm_s390_pv_cpu_is_protected(vcpu)) { + vcpu->arch.sie_block->iictl = IICTL_CODE_EXT; + vcpu->arch.sie_block->eic = EXT_IRQ_CLK_COMP; + } else { + rc = put_guest_lc(vcpu, EXT_IRQ_CLK_COMP, + (u16 __user *)__LC_EXT_INT_CODE); + rc |= put_guest_lc(vcpu, 0, (u16 *)__LC_EXT_CPU_ADDR); + rc |= write_guest_lc(vcpu, __LC_EXT_OLD_PSW, + &vcpu->arch.sie_block->gpsw, sizeof(psw_t)); + rc |= read_guest_lc(vcpu, __LC_EXT_NEW_PSW, + &vcpu->arch.sie_block->gpsw, sizeof(psw_t)); + } clear_bit(IRQ_PEND_EXT_CLOCK_COMP, &li->pending_irqs); return rc ? -EFAULT : 0; } @@ -553,6 +572,20 @@ static int __write_machine_check(struct kvm_vcpu *vcpu, union mci mci; int rc; + /* + * All other possible payload for a machine check (e.g. the register + * contents in the save area) will be handled by the ultravisor, as + * the hypervisor does not not have the needed information for + * protected guests. + */ + if (kvm_s390_pv_cpu_is_protected(vcpu)) { + vcpu->arch.sie_block->iictl = IICTL_CODE_MCHK; + vcpu->arch.sie_block->mcic = mchk->mcic; + vcpu->arch.sie_block->faddr = mchk->failing_storage_address; + vcpu->arch.sie_block->edc = mchk->ext_damage_code; + return 0; + } + mci.val = mchk->mcic; /* take care of lazy register loading */ save_fpu_regs(); @@ -696,17 +729,21 @@ static int __must_check __deliver_machine_check(struct kvm_vcpu *vcpu) static int __must_check __deliver_restart(struct kvm_vcpu *vcpu) { struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int; - int rc; + int rc = 0; VCPU_EVENT(vcpu, 3, "%s", "deliver: cpu restart"); vcpu->stat.deliver_restart_signal++; trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, KVM_S390_RESTART, 0, 0); - rc = write_guest_lc(vcpu, - offsetof(struct lowcore, restart_old_psw), - &vcpu->arch.sie_block->gpsw, sizeof(psw_t)); - rc |= read_guest_lc(vcpu, offsetof(struct lowcore, restart_psw), - &vcpu->arch.sie_block->gpsw, sizeof(psw_t)); + if (kvm_s390_pv_cpu_is_protected(vcpu)) { + vcpu->arch.sie_block->iictl = IICTL_CODE_RESTART; + } else { + rc = write_guest_lc(vcpu, + offsetof(struct lowcore, restart_old_psw), + &vcpu->arch.sie_block->gpsw, sizeof(psw_t)); + rc |= read_guest_lc(vcpu, offsetof(struct lowcore, restart_psw), + &vcpu->arch.sie_block->gpsw, sizeof(psw_t)); + } clear_bit(IRQ_PEND_RESTART, &li->pending_irqs); return rc ? -EFAULT : 0; } @@ -748,6 +785,12 @@ static int __must_check __deliver_emergency_signal(struct kvm_vcpu *vcpu) vcpu->stat.deliver_emergency_signal++; trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, KVM_S390_INT_EMERGENCY, cpu_addr, 0); + if (kvm_s390_pv_cpu_is_protected(vcpu)) { + vcpu->arch.sie_block->iictl = IICTL_CODE_EXT; + vcpu->arch.sie_block->eic = EXT_IRQ_EMERGENCY_SIG; + vcpu->arch.sie_block->extcpuaddr = cpu_addr; + return 0; + } rc = put_guest_lc(vcpu, EXT_IRQ_EMERGENCY_SIG, (u16 *)__LC_EXT_INT_CODE); @@ -776,6 +819,12 @@ static int __must_check __deliver_external_call(struct kvm_vcpu *vcpu) trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, KVM_S390_INT_EXTERNAL_CALL, extcall.code, 0); + if (kvm_s390_pv_cpu_is_protected(vcpu)) { + vcpu->arch.sie_block->iictl = IICTL_CODE_EXT; + vcpu->arch.sie_block->eic = EXT_IRQ_EXTERNAL_CALL; + vcpu->arch.sie_block->extcpuaddr = extcall.code; + return 0; + } rc = put_guest_lc(vcpu, EXT_IRQ_EXTERNAL_CALL, (u16 *)__LC_EXT_INT_CODE); @@ -787,6 +836,21 @@ static int __must_check __deliver_external_call(struct kvm_vcpu *vcpu) return rc ? -EFAULT : 0; } +static int __deliver_prog_pv(struct kvm_vcpu *vcpu, u16 code) +{ + switch (code) { + case PGM_SPECIFICATION: + vcpu->arch.sie_block->iictl = IICTL_CODE_SPECIFICATION; + break; + case PGM_OPERAND: + vcpu->arch.sie_block->iictl = IICTL_CODE_OPERAND; + break; + default: + return -EINVAL; + } + return 0; +} + static int __must_check __deliver_prog(struct kvm_vcpu *vcpu) { struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int; @@ -807,6 +871,10 @@ static int __must_check __deliver_prog(struct kvm_vcpu *vcpu) trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, KVM_S390_PROGRAM_INT, pgm_info.code, 0); + /* PER is handled by the ultravisor */ + if (kvm_s390_pv_cpu_is_protected(vcpu)) + return __deliver_prog_pv(vcpu, pgm_info.code & ~PGM_PER); + switch (pgm_info.code & ~PGM_PER) { case PGM_AFX_TRANSLATION: case PGM_ASX_TRANSLATION: @@ -818,7 +886,7 @@ static int __must_check __deliver_prog(struct kvm_vcpu *vcpu) case PGM_PRIMARY_AUTHORITY: case PGM_SECONDARY_AUTHORITY: nullifying = true; - /* fall through */ + fallthrough; case PGM_SPACE_SWITCH: rc = put_guest_lc(vcpu, pgm_info.trans_exc_code, (u64 *)__LC_TRANS_EXC_CODE); @@ -902,20 +970,49 @@ static int __must_check __deliver_prog(struct kvm_vcpu *vcpu) return rc ? -EFAULT : 0; } +#define SCCB_MASK 0xFFFFFFF8 +#define SCCB_EVENT_PENDING 0x3 + +static int write_sclp(struct kvm_vcpu *vcpu, u32 parm) +{ + int rc; + + if (kvm_s390_pv_cpu_get_handle(vcpu)) { + vcpu->arch.sie_block->iictl = IICTL_CODE_EXT; + vcpu->arch.sie_block->eic = EXT_IRQ_SERVICE_SIG; + vcpu->arch.sie_block->eiparams = parm; + return 0; + } + + rc = put_guest_lc(vcpu, EXT_IRQ_SERVICE_SIG, (u16 *)__LC_EXT_INT_CODE); + rc |= put_guest_lc(vcpu, 0, (u16 *)__LC_EXT_CPU_ADDR); + rc |= write_guest_lc(vcpu, __LC_EXT_OLD_PSW, + &vcpu->arch.sie_block->gpsw, sizeof(psw_t)); + rc |= read_guest_lc(vcpu, __LC_EXT_NEW_PSW, + &vcpu->arch.sie_block->gpsw, sizeof(psw_t)); + rc |= put_guest_lc(vcpu, parm, + (u32 *)__LC_EXT_PARAMS); + + return rc ? -EFAULT : 0; +} + static int __must_check __deliver_service(struct kvm_vcpu *vcpu) { struct kvm_s390_float_interrupt *fi = &vcpu->kvm->arch.float_int; struct kvm_s390_ext_info ext; - int rc = 0; spin_lock(&fi->lock); - if (!(test_bit(IRQ_PEND_EXT_SERVICE, &fi->pending_irqs))) { + if (test_bit(IRQ_PEND_EXT_SERVICE, &fi->masked_irqs) || + !(test_bit(IRQ_PEND_EXT_SERVICE, &fi->pending_irqs))) { spin_unlock(&fi->lock); return 0; } ext = fi->srv_signal; memset(&fi->srv_signal, 0, sizeof(ext)); clear_bit(IRQ_PEND_EXT_SERVICE, &fi->pending_irqs); + clear_bit(IRQ_PEND_EXT_SERVICE_EV, &fi->pending_irqs); + if (kvm_s390_pv_cpu_is_protected(vcpu)) + set_bit(IRQ_PEND_EXT_SERVICE, &fi->masked_irqs); spin_unlock(&fi->lock); VCPU_EVENT(vcpu, 4, "deliver: sclp parameter 0x%x", @@ -924,16 +1021,31 @@ static int __must_check __deliver_service(struct kvm_vcpu *vcpu) trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, KVM_S390_INT_SERVICE, ext.ext_params, 0); - rc = put_guest_lc(vcpu, EXT_IRQ_SERVICE_SIG, (u16 *)__LC_EXT_INT_CODE); - rc |= put_guest_lc(vcpu, 0, (u16 *)__LC_EXT_CPU_ADDR); - rc |= write_guest_lc(vcpu, __LC_EXT_OLD_PSW, - &vcpu->arch.sie_block->gpsw, sizeof(psw_t)); - rc |= read_guest_lc(vcpu, __LC_EXT_NEW_PSW, - &vcpu->arch.sie_block->gpsw, sizeof(psw_t)); - rc |= put_guest_lc(vcpu, ext.ext_params, - (u32 *)__LC_EXT_PARAMS); + return write_sclp(vcpu, ext.ext_params); +} - return rc ? -EFAULT : 0; +static int __must_check __deliver_service_ev(struct kvm_vcpu *vcpu) +{ + struct kvm_s390_float_interrupt *fi = &vcpu->kvm->arch.float_int; + struct kvm_s390_ext_info ext; + + spin_lock(&fi->lock); + if (!(test_bit(IRQ_PEND_EXT_SERVICE_EV, &fi->pending_irqs))) { + spin_unlock(&fi->lock); + return 0; + } + ext = fi->srv_signal; + /* only clear the event bit */ + fi->srv_signal.ext_params &= ~SCCB_EVENT_PENDING; + clear_bit(IRQ_PEND_EXT_SERVICE_EV, &fi->pending_irqs); + spin_unlock(&fi->lock); + + VCPU_EVENT(vcpu, 4, "%s", "deliver: sclp parameter event"); + vcpu->stat.deliver_service_signal++; + trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, KVM_S390_INT_SERVICE, + ext.ext_params, 0); + + return write_sclp(vcpu, SCCB_EVENT_PENDING); } static int __must_check __deliver_pfault_done(struct kvm_vcpu *vcpu) @@ -1028,6 +1140,15 @@ static int __do_deliver_io(struct kvm_vcpu *vcpu, struct kvm_s390_io_info *io) { int rc; + if (kvm_s390_pv_cpu_is_protected(vcpu)) { + vcpu->arch.sie_block->iictl = IICTL_CODE_IO; + vcpu->arch.sie_block->subchannel_id = io->subchannel_id; + vcpu->arch.sie_block->subchannel_nr = io->subchannel_nr; + vcpu->arch.sie_block->io_int_parm = io->io_int_parm; + vcpu->arch.sie_block->io_int_word = io->io_int_word; + return 0; + } + rc = put_guest_lc(vcpu, io->subchannel_id, (u16 *)__LC_SUBCHANNEL_ID); rc |= put_guest_lc(vcpu, io->subchannel_nr, (u16 *)__LC_SUBCHANNEL_NR); rc |= put_guest_lc(vcpu, io->io_int_parm, (u32 *)__LC_IO_INT_PARM); @@ -1329,6 +1450,9 @@ int __must_check kvm_s390_deliver_pending_interrupts(struct kvm_vcpu *vcpu) case IRQ_PEND_EXT_SERVICE: rc = __deliver_service(vcpu); break; + case IRQ_PEND_EXT_SERVICE_EV: + rc = __deliver_service_ev(vcpu); + break; case IRQ_PEND_PFAULT_DONE: rc = __deliver_pfault_done(vcpu); break; @@ -1421,7 +1545,7 @@ static int __inject_extcall(struct kvm_vcpu *vcpu, struct kvm_s390_irq *irq) if (kvm_get_vcpu_by_id(vcpu->kvm, src_id) == NULL) return -EINVAL; - if (sclp.has_sigpif) + if (sclp.has_sigpif && !kvm_s390_pv_cpu_get_handle(vcpu)) return sca_inject_ext_call(vcpu, src_id); if (test_and_set_bit(IRQ_PEND_EXT_EXTERNAL, &li->pending_irqs)) @@ -1681,9 +1805,6 @@ out: return inti; } -#define SCCB_MASK 0xFFFFFFF8 -#define SCCB_EVENT_PENDING 0x3 - static int __inject_service(struct kvm *kvm, struct kvm_s390_interrupt_info *inti) { @@ -1692,6 +1813,11 @@ static int __inject_service(struct kvm *kvm, kvm->stat.inject_service_signal++; spin_lock(&fi->lock); fi->srv_signal.ext_params |= inti->ext.ext_params & SCCB_EVENT_PENDING; + + /* We always allow events, track them separately from the sccb ints */ + if (fi->srv_signal.ext_params & SCCB_EVENT_PENDING) + set_bit(IRQ_PEND_EXT_SERVICE_EV, &fi->pending_irqs); + /* * Early versions of the QEMU s390 bios will inject several * service interrupts after another without handling a @@ -1773,7 +1899,14 @@ static int __inject_io(struct kvm *kvm, struct kvm_s390_interrupt_info *inti) kvm->stat.inject_io++; isc = int_word_to_isc(inti->io.io_int_word); - if (gi->origin && inti->type & KVM_S390_INT_IO_AI_MASK) { + /* + * Do not make use of gisa in protected mode. We do not use the lock + * checking variant as this is just a performance optimization and we + * do not hold the lock here. This is ok as the code will pick + * interrupts from both "lists" for delivery. + */ + if (!kvm_s390_pv_get_handle(kvm) && + gi->origin && inti->type & KVM_S390_INT_IO_AI_MASK) { VM_EVENT(kvm, 4, "%s isc %1u", "inject: I/O (AI/gisa)", isc); gisa_set_ipm_gisc(gi->origin, isc); kfree(inti); @@ -1834,7 +1967,8 @@ static void __floating_irq_kick(struct kvm *kvm, u64 type) break; case KVM_S390_INT_IO_MIN...KVM_S390_INT_IO_MAX: if (!(type & KVM_S390_INT_IO_AI_MASK && - kvm->arch.gisa_int.origin)) + kvm->arch.gisa_int.origin) || + kvm_s390_pv_cpu_get_handle(dst_vcpu)) kvm_s390_set_cpuflags(dst_vcpu, CPUSTAT_IO_INT); break; default: @@ -2080,6 +2214,10 @@ void kvm_s390_clear_float_irqs(struct kvm *kvm) struct kvm_s390_float_interrupt *fi = &kvm->arch.float_int; int i; + mutex_lock(&kvm->lock); + if (!kvm_s390_pv_is_protected(kvm)) + fi->masked_irqs = 0; + mutex_unlock(&kvm->lock); spin_lock(&fi->lock); fi->pending_irqs = 0; memset(&fi->srv_signal, 0, sizeof(fi->srv_signal)); @@ -2146,7 +2284,8 @@ static int get_all_floating_irqs(struct kvm *kvm, u8 __user *usrbuf, u64 len) n++; } } - if (test_bit(IRQ_PEND_EXT_SERVICE, &fi->pending_irqs)) { + if (test_bit(IRQ_PEND_EXT_SERVICE, &fi->pending_irqs) || + test_bit(IRQ_PEND_EXT_SERVICE_EV, &fi->pending_irqs)) { if (n == max_irqs) { /* signal userspace to try again */ ret = -ENOMEM; @@ -2327,9 +2466,6 @@ static int register_io_adapter(struct kvm_device *dev, if (!adapter) return -ENOMEM; - INIT_LIST_HEAD(&adapter->maps); - init_rwsem(&adapter->maps_lock); - atomic_set(&adapter->nr_maps, 0); adapter->id = adapter_info.id; adapter->isc = adapter_info.isc; adapter->maskable = adapter_info.maskable; @@ -2354,87 +2490,12 @@ int kvm_s390_mask_adapter(struct kvm *kvm, unsigned int id, bool masked) return ret; } -static int kvm_s390_adapter_map(struct kvm *kvm, unsigned int id, __u64 addr) -{ - struct s390_io_adapter *adapter = get_io_adapter(kvm, id); - struct s390_map_info *map; - int ret; - - if (!adapter || !addr) - return -EINVAL; - - map = kzalloc(sizeof(*map), GFP_KERNEL); - if (!map) { - ret = -ENOMEM; - goto out; - } - INIT_LIST_HEAD(&map->list); - map->guest_addr = addr; - map->addr = gmap_translate(kvm->arch.gmap, addr); - if (map->addr == -EFAULT) { - ret = -EFAULT; - goto out; - } - ret = get_user_pages_fast(map->addr, 1, FOLL_WRITE, &map->page); - if (ret < 0) - goto out; - BUG_ON(ret != 1); - down_write(&adapter->maps_lock); - if (atomic_inc_return(&adapter->nr_maps) < MAX_S390_ADAPTER_MAPS) { - list_add_tail(&map->list, &adapter->maps); - ret = 0; - } else { - put_page(map->page); - ret = -EINVAL; - } - up_write(&adapter->maps_lock); -out: - if (ret) - kfree(map); - return ret; -} - -static int kvm_s390_adapter_unmap(struct kvm *kvm, unsigned int id, __u64 addr) -{ - struct s390_io_adapter *adapter = get_io_adapter(kvm, id); - struct s390_map_info *map, *tmp; - int found = 0; - - if (!adapter || !addr) - return -EINVAL; - - down_write(&adapter->maps_lock); - list_for_each_entry_safe(map, tmp, &adapter->maps, list) { - if (map->guest_addr == addr) { - found = 1; - atomic_dec(&adapter->nr_maps); - list_del(&map->list); - put_page(map->page); - kfree(map); - break; - } - } - up_write(&adapter->maps_lock); - - return found ? 0 : -EINVAL; -} - void kvm_s390_destroy_adapters(struct kvm *kvm) { int i; - struct s390_map_info *map, *tmp; - for (i = 0; i < MAX_S390_IO_ADAPTERS; i++) { - if (!kvm->arch.adapters[i]) - continue; - list_for_each_entry_safe(map, tmp, - &kvm->arch.adapters[i]->maps, list) { - list_del(&map->list); - put_page(map->page); - kfree(map); - } + for (i = 0; i < MAX_S390_IO_ADAPTERS; i++) kfree(kvm->arch.adapters[i]); - } } static int modify_io_adapter(struct kvm_device *dev, @@ -2456,11 +2517,14 @@ static int modify_io_adapter(struct kvm_device *dev, if (ret > 0) ret = 0; break; + /* + * The following operations are no longer needed and therefore no-ops. + * The gpa to hva translation is done when an IRQ route is set up. The + * set_irq code uses get_user_pages_remote() to do the actual write. + */ case KVM_S390_IO_ADAPTER_MAP: - ret = kvm_s390_adapter_map(dev->kvm, req.id, req.addr); - break; case KVM_S390_IO_ADAPTER_UNMAP: - ret = kvm_s390_adapter_unmap(dev->kvm, req.id, req.addr); + ret = 0; break; default: ret = -EINVAL; @@ -2699,19 +2763,15 @@ static unsigned long get_ind_bit(__u64 addr, unsigned long bit_nr, bool swap) return swap ? (bit ^ (BITS_PER_LONG - 1)) : bit; } -static struct s390_map_info *get_map_info(struct s390_io_adapter *adapter, - u64 addr) +static struct page *get_map_page(struct kvm *kvm, u64 uaddr) { - struct s390_map_info *map; + struct page *page = NULL; - if (!adapter) - return NULL; - - list_for_each_entry(map, &adapter->maps, list) { - if (map->guest_addr == addr) - return map; - } - return NULL; + down_read(&kvm->mm->mmap_sem); + get_user_pages_remote(NULL, kvm->mm, uaddr, 1, FOLL_WRITE, + &page, NULL, NULL); + up_read(&kvm->mm->mmap_sem); + return page; } static int adapter_indicators_set(struct kvm *kvm, @@ -2720,30 +2780,35 @@ static int adapter_indicators_set(struct kvm *kvm, { unsigned long bit; int summary_set, idx; - struct s390_map_info *info; + struct page *ind_page, *summary_page; void *map; - info = get_map_info(adapter, adapter_int->ind_addr); - if (!info) + ind_page = get_map_page(kvm, adapter_int->ind_addr); + if (!ind_page) return -1; - map = page_address(info->page); - bit = get_ind_bit(info->addr, adapter_int->ind_offset, adapter->swap); - set_bit(bit, map); - idx = srcu_read_lock(&kvm->srcu); - mark_page_dirty(kvm, info->guest_addr >> PAGE_SHIFT); - set_page_dirty_lock(info->page); - info = get_map_info(adapter, adapter_int->summary_addr); - if (!info) { - srcu_read_unlock(&kvm->srcu, idx); + summary_page = get_map_page(kvm, adapter_int->summary_addr); + if (!summary_page) { + put_page(ind_page); return -1; } - map = page_address(info->page); - bit = get_ind_bit(info->addr, adapter_int->summary_offset, - adapter->swap); + + idx = srcu_read_lock(&kvm->srcu); + map = page_address(ind_page); + bit = get_ind_bit(adapter_int->ind_addr, + adapter_int->ind_offset, adapter->swap); + set_bit(bit, map); + mark_page_dirty(kvm, adapter_int->ind_addr >> PAGE_SHIFT); + set_page_dirty_lock(ind_page); + map = page_address(summary_page); + bit = get_ind_bit(adapter_int->summary_addr, + adapter_int->summary_offset, adapter->swap); summary_set = test_and_set_bit(bit, map); - mark_page_dirty(kvm, info->guest_addr >> PAGE_SHIFT); - set_page_dirty_lock(info->page); + mark_page_dirty(kvm, adapter_int->summary_addr >> PAGE_SHIFT); + set_page_dirty_lock(summary_page); srcu_read_unlock(&kvm->srcu, idx); + + put_page(ind_page); + put_page(summary_page); return summary_set ? 0 : 1; } @@ -2765,9 +2830,7 @@ static int set_adapter_int(struct kvm_kernel_irq_routing_entry *e, adapter = get_io_adapter(kvm, e->adapter.adapter_id); if (!adapter) return -1; - down_read(&adapter->maps_lock); ret = adapter_indicators_set(kvm, adapter, &e->adapter); - up_read(&adapter->maps_lock); if ((ret > 0) && !adapter->masked) { ret = kvm_s390_inject_airq(kvm, adapter); if (ret == 0) @@ -2818,23 +2881,27 @@ int kvm_set_routing_entry(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *e, const struct kvm_irq_routing_entry *ue) { - int ret; + u64 uaddr; switch (ue->type) { + /* we store the userspace addresses instead of the guest addresses */ case KVM_IRQ_ROUTING_S390_ADAPTER: e->set = set_adapter_int; - e->adapter.summary_addr = ue->u.adapter.summary_addr; - e->adapter.ind_addr = ue->u.adapter.ind_addr; + uaddr = gmap_translate(kvm->arch.gmap, ue->u.adapter.summary_addr); + if (uaddr == -EFAULT) + return -EFAULT; + e->adapter.summary_addr = uaddr; + uaddr = gmap_translate(kvm->arch.gmap, ue->u.adapter.ind_addr); + if (uaddr == -EFAULT) + return -EFAULT; + e->adapter.ind_addr = uaddr; e->adapter.summary_offset = ue->u.adapter.summary_offset; e->adapter.ind_offset = ue->u.adapter.ind_offset; e->adapter.adapter_id = ue->u.adapter.adapter_id; - ret = 0; - break; + return 0; default: - ret = -EINVAL; + return -EINVAL; } - - return ret; } int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e, struct kvm *kvm, diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index d7ff30e45589..19a81024fe16 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -2,7 +2,7 @@ /* * hosting IBM Z kernel virtual machines (s390x) * - * Copyright IBM Corp. 2008, 2018 + * Copyright IBM Corp. 2008, 2020 * * Author(s): Carsten Otte <cotte@de.ibm.com> * Christian Borntraeger <borntraeger@de.ibm.com> @@ -44,6 +44,7 @@ #include <asm/cpacf.h> #include <asm/timex.h> #include <asm/ap.h> +#include <asm/uv.h> #include "kvm-s390.h" #include "gaccess.h" @@ -184,6 +185,11 @@ static u8 halt_poll_max_steal = 10; module_param(halt_poll_max_steal, byte, 0644); MODULE_PARM_DESC(halt_poll_max_steal, "Maximum percentage of steal time to allow polling"); +/* if set to true, the GISA will be initialized and used if available */ +static bool use_gisa = true; +module_param(use_gisa, bool, 0644); +MODULE_PARM_DESC(use_gisa, "Use the GISA if the host supports it."); + /* * For now we handle at most 16 double words as this is what the s390 base * kernel handles and stores in the prefix page. If we ever need to go beyond @@ -220,6 +226,7 @@ static struct kvm_s390_vm_cpu_subfunc kvm_s390_available_subfunc; static struct gmap_notifier gmap_notifier; static struct gmap_notifier vsie_gmap_notifier; debug_info_t *kvm_s390_dbf; +debug_info_t *kvm_s390_dbf_uv; /* Section: not file related */ int kvm_arch_hardware_enable(void) @@ -228,13 +235,15 @@ int kvm_arch_hardware_enable(void) return 0; } -int kvm_arch_check_processor_compat(void) +int kvm_arch_check_processor_compat(void *opaque) { return 0; } +/* forward declarations */ static void kvm_gmap_notifier(struct gmap *gmap, unsigned long start, unsigned long end); +static int sca_switch_to_extended(struct kvm *kvm); static void kvm_clock_sync_scb(struct kvm_s390_sie_block *scb, u64 delta) { @@ -293,7 +302,7 @@ static struct notifier_block kvm_clock_notifier = { .notifier_call = kvm_clock_sync, }; -int kvm_arch_hardware_setup(void) +int kvm_arch_hardware_setup(void *opaque) { gmap_notifier.notifier_call = kvm_gmap_notifier; gmap_register_pte_notifier(&gmap_notifier); @@ -460,7 +469,12 @@ int kvm_arch_init(void *opaque) if (!kvm_s390_dbf) return -ENOMEM; - if (debug_register_view(kvm_s390_dbf, &debug_sprintf_view)) + kvm_s390_dbf_uv = debug_register("kvm-uv", 32, 1, 7 * sizeof(long)); + if (!kvm_s390_dbf_uv) + goto out; + + if (debug_register_view(kvm_s390_dbf, &debug_sprintf_view) || + debug_register_view(kvm_s390_dbf_uv, &debug_sprintf_view)) goto out; kvm_s390_cpu_feat_init(); @@ -487,6 +501,7 @@ void kvm_arch_exit(void) { kvm_s390_gib_destroy(); debug_unregister(kvm_s390_dbf); + debug_unregister(kvm_s390_dbf_uv); } /* Section: device related */ @@ -564,14 +579,16 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_S390_BPB: r = test_facility(82); break; + case KVM_CAP_S390_PROTECTED: + r = is_prot_virt_host(); + break; default: r = 0; } return r; } -static void kvm_s390_sync_dirty_log(struct kvm *kvm, - struct kvm_memory_slot *memslot) +void kvm_arch_sync_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot) { int i; gfn_t cur_gfn, last_gfn; @@ -612,9 +629,8 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, { int r; unsigned long n; - struct kvm_memslots *slots; struct kvm_memory_slot *memslot; - int is_dirty = 0; + int is_dirty; if (kvm_is_ucontrol(kvm)) return -EINVAL; @@ -625,14 +641,7 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, if (log->slot >= KVM_USER_MEM_SLOTS) goto out; - slots = kvm_memslots(kvm); - memslot = id_to_memslot(slots, log->slot); - r = -ENOENT; - if (!memslot->dirty_bitmap) - goto out; - - kvm_s390_sync_dirty_log(kvm, memslot); - r = kvm_get_dirty_log(kvm, log, &is_dirty); + r = kvm_get_dirty_log(kvm, log, &is_dirty, &memslot); if (r) goto out; @@ -1993,6 +2002,9 @@ static int kvm_s390_get_cmma(struct kvm *kvm, struct kvm_s390_cmma_log *args, struct kvm_memslots *slots = kvm_memslots(kvm); struct kvm_memory_slot *ms; + if (unlikely(!slots->used_slots)) + return 0; + cur_gfn = kvm_s390_next_dirty_cmma(slots, args->start_gfn); ms = gfn_to_memslot(kvm, cur_gfn); args->count = 0; @@ -2158,6 +2170,194 @@ out: return r; } +static int kvm_s390_cpus_from_pv(struct kvm *kvm, u16 *rcp, u16 *rrcp) +{ + struct kvm_vcpu *vcpu; + u16 rc, rrc; + int ret = 0; + int i; + + /* + * We ignore failures and try to destroy as many CPUs as possible. + * At the same time we must not free the assigned resources when + * this fails, as the ultravisor has still access to that memory. + * So kvm_s390_pv_destroy_cpu can leave a "wanted" memory leak + * behind. + * We want to return the first failure rc and rrc, though. + */ + kvm_for_each_vcpu(i, vcpu, kvm) { + mutex_lock(&vcpu->mutex); + if (kvm_s390_pv_destroy_cpu(vcpu, &rc, &rrc) && !ret) { + *rcp = rc; + *rrcp = rrc; + ret = -EIO; + } + mutex_unlock(&vcpu->mutex); + } + return ret; +} + +static int kvm_s390_cpus_to_pv(struct kvm *kvm, u16 *rc, u16 *rrc) +{ + int i, r = 0; + u16 dummy; + + struct kvm_vcpu *vcpu; + + kvm_for_each_vcpu(i, vcpu, kvm) { + mutex_lock(&vcpu->mutex); + r = kvm_s390_pv_create_cpu(vcpu, rc, rrc); + mutex_unlock(&vcpu->mutex); + if (r) + break; + } + if (r) + kvm_s390_cpus_from_pv(kvm, &dummy, &dummy); + return r; +} + +static int kvm_s390_handle_pv(struct kvm *kvm, struct kvm_pv_cmd *cmd) +{ + int r = 0; + u16 dummy; + void __user *argp = (void __user *)cmd->data; + + switch (cmd->cmd) { + case KVM_PV_ENABLE: { + r = -EINVAL; + if (kvm_s390_pv_is_protected(kvm)) + break; + + /* + * FMT 4 SIE needs esca. As we never switch back to bsca from + * esca, we need no cleanup in the error cases below + */ + r = sca_switch_to_extended(kvm); + if (r) + break; + + down_write(¤t->mm->mmap_sem); + r = gmap_mark_unmergeable(); + up_write(¤t->mm->mmap_sem); + if (r) + break; + + r = kvm_s390_pv_init_vm(kvm, &cmd->rc, &cmd->rrc); + if (r) + break; + + r = kvm_s390_cpus_to_pv(kvm, &cmd->rc, &cmd->rrc); + if (r) + kvm_s390_pv_deinit_vm(kvm, &dummy, &dummy); + + /* we need to block service interrupts from now on */ + set_bit(IRQ_PEND_EXT_SERVICE, &kvm->arch.float_int.masked_irqs); + break; + } + case KVM_PV_DISABLE: { + r = -EINVAL; + if (!kvm_s390_pv_is_protected(kvm)) + break; + + r = kvm_s390_cpus_from_pv(kvm, &cmd->rc, &cmd->rrc); + /* + * If a CPU could not be destroyed, destroy VM will also fail. + * There is no point in trying to destroy it. Instead return + * the rc and rrc from the first CPU that failed destroying. + */ + if (r) + break; + r = kvm_s390_pv_deinit_vm(kvm, &cmd->rc, &cmd->rrc); + + /* no need to block service interrupts any more */ + clear_bit(IRQ_PEND_EXT_SERVICE, &kvm->arch.float_int.masked_irqs); + break; + } + case KVM_PV_SET_SEC_PARMS: { + struct kvm_s390_pv_sec_parm parms = {}; + void *hdr; + + r = -EINVAL; + if (!kvm_s390_pv_is_protected(kvm)) + break; + + r = -EFAULT; + if (copy_from_user(&parms, argp, sizeof(parms))) + break; + + /* Currently restricted to 8KB */ + r = -EINVAL; + if (parms.length > PAGE_SIZE * 2) + break; + + r = -ENOMEM; + hdr = vmalloc(parms.length); + if (!hdr) + break; + + r = -EFAULT; + if (!copy_from_user(hdr, (void __user *)parms.origin, + parms.length)) + r = kvm_s390_pv_set_sec_parms(kvm, hdr, parms.length, + &cmd->rc, &cmd->rrc); + + vfree(hdr); + break; + } + case KVM_PV_UNPACK: { + struct kvm_s390_pv_unp unp = {}; + + r = -EINVAL; + if (!kvm_s390_pv_is_protected(kvm)) + break; + + r = -EFAULT; + if (copy_from_user(&unp, argp, sizeof(unp))) + break; + + r = kvm_s390_pv_unpack(kvm, unp.addr, unp.size, unp.tweak, + &cmd->rc, &cmd->rrc); + break; + } + case KVM_PV_VERIFY: { + r = -EINVAL; + if (!kvm_s390_pv_is_protected(kvm)) + break; + + r = uv_cmd_nodata(kvm_s390_pv_get_handle(kvm), + UVC_CMD_VERIFY_IMG, &cmd->rc, &cmd->rrc); + KVM_UV_EVENT(kvm, 3, "PROTVIRT VERIFY: rc %x rrc %x", cmd->rc, + cmd->rrc); + break; + } + case KVM_PV_PREP_RESET: { + r = -EINVAL; + if (!kvm_s390_pv_is_protected(kvm)) + break; + + r = uv_cmd_nodata(kvm_s390_pv_get_handle(kvm), + UVC_CMD_PREPARE_RESET, &cmd->rc, &cmd->rrc); + KVM_UV_EVENT(kvm, 3, "PROTVIRT PREP RESET: rc %x rrc %x", + cmd->rc, cmd->rrc); + break; + } + case KVM_PV_UNSHARE_ALL: { + r = -EINVAL; + if (!kvm_s390_pv_is_protected(kvm)) + break; + + r = uv_cmd_nodata(kvm_s390_pv_get_handle(kvm), + UVC_CMD_SET_UNSHARE_ALL, &cmd->rc, &cmd->rrc); + KVM_UV_EVENT(kvm, 3, "PROTVIRT UNSHARE: rc %x rrc %x", + cmd->rc, cmd->rrc); + break; + } + default: + r = -ENOTTY; + } + return r; +} + long kvm_arch_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) { @@ -2255,6 +2455,33 @@ long kvm_arch_vm_ioctl(struct file *filp, mutex_unlock(&kvm->slots_lock); break; } + case KVM_S390_PV_COMMAND: { + struct kvm_pv_cmd args; + + /* protvirt means user sigp */ + kvm->arch.user_cpu_state_ctrl = 1; + r = 0; + if (!is_prot_virt_host()) { + r = -EINVAL; + break; + } + if (copy_from_user(&args, argp, sizeof(args))) { + r = -EFAULT; + break; + } + if (args.flags) { + r = -EINVAL; + break; + } + mutex_lock(&kvm->lock); + r = kvm_s390_handle_pv(kvm, &args); + mutex_unlock(&kvm->lock); + if (copy_to_user(argp, &args, sizeof(args))) { + r = -EFAULT; + break; + } + break; + } default: r = -ENOTTY; } @@ -2504,7 +2731,8 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) kvm->arch.use_skf = sclp.has_skey; spin_lock_init(&kvm->arch.start_stop_lock); kvm_s390_vsie_init(kvm); - kvm_s390_gisa_init(kvm); + if (use_gisa) + kvm_s390_gisa_init(kvm); KVM_EVENT(3, "vm 0x%pK created by pid %u", kvm, current->pid); return 0; @@ -2518,6 +2746,8 @@ out_err: void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) { + u16 rc, rrc; + VCPU_EVENT(vcpu, 3, "%s", "free cpu"); trace_kvm_s390_destroy_vcpu(vcpu->vcpu_id); kvm_s390_clear_local_irqs(vcpu); @@ -2530,6 +2760,9 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) if (vcpu->kvm->arch.use_cmma) kvm_s390_vcpu_unsetup_cmma(vcpu); + /* We can not hold the vcpu mutex here, we are already dying */ + if (kvm_s390_pv_cpu_get_handle(vcpu)) + kvm_s390_pv_destroy_cpu(vcpu, &rc, &rrc); free_page((unsigned long)(vcpu->arch.sie_block)); } @@ -2551,10 +2784,20 @@ static void kvm_free_vcpus(struct kvm *kvm) void kvm_arch_destroy_vm(struct kvm *kvm) { + u16 rc, rrc; + kvm_free_vcpus(kvm); sca_dispose(kvm); - debug_unregister(kvm->arch.dbf); kvm_s390_gisa_destroy(kvm); + /* + * We are already at the end of life and kvm->lock is not taken. + * This is ok as the file descriptor is closed by now and nobody + * can mess with the pv state. To avoid lockdep_assert_held from + * complaining we do not use kvm_s390_pv_is_protected. + */ + if (kvm_s390_pv_get_handle(kvm)) + kvm_s390_pv_deinit_vm(kvm, &rc, &rrc); + debug_unregister(kvm->arch.dbf); free_page((unsigned long)kvm->arch.sie_page2); if (!kvm_is_ucontrol(kvm)) gmap_remove(kvm->arch.gmap); @@ -2650,6 +2893,9 @@ static int sca_switch_to_extended(struct kvm *kvm) unsigned int vcpu_idx; u32 scaol, scaoh; + if (kvm->arch.use_esca) + return 0; + new_sca = alloc_pages_exact(sizeof(*new_sca), GFP_KERNEL|__GFP_ZERO); if (!new_sca) return -ENOMEM; @@ -2901,6 +3147,7 @@ static void kvm_s390_vcpu_setup_model(struct kvm_vcpu *vcpu) static int kvm_s390_vcpu_setup(struct kvm_vcpu *vcpu) { int rc = 0; + u16 uvrc, uvrrc; atomic_set(&vcpu->arch.sie_block->cpuflags, CPUSTAT_ZARCH | CPUSTAT_SM | @@ -2968,6 +3215,14 @@ static int kvm_s390_vcpu_setup(struct kvm_vcpu *vcpu) kvm_s390_vcpu_crypto_setup(vcpu); + mutex_lock(&vcpu->kvm->lock); + if (kvm_s390_pv_is_protected(vcpu->kvm)) { + rc = kvm_s390_pv_create_cpu(vcpu, &uvrc, &uvrrc); + if (rc) + kvm_s390_vcpu_unsetup_cmma(vcpu); + } + mutex_unlock(&vcpu->kvm->lock); + return rc; } @@ -3268,20 +3523,43 @@ static void kvm_arch_vcpu_ioctl_initial_reset(struct kvm_vcpu *vcpu) /* Initial reset is a superset of the normal reset */ kvm_arch_vcpu_ioctl_normal_reset(vcpu); - /* this equals initial cpu reset in pop, but we don't switch to ESA */ + /* + * This equals initial cpu reset in pop, but we don't switch to ESA. + * We do not only reset the internal data, but also ... + */ vcpu->arch.sie_block->gpsw.mask = 0; vcpu->arch.sie_block->gpsw.addr = 0; kvm_s390_set_prefix(vcpu, 0); kvm_s390_set_cpu_timer(vcpu, 0); vcpu->arch.sie_block->ckc = 0; - vcpu->arch.sie_block->todpr = 0; memset(vcpu->arch.sie_block->gcr, 0, sizeof(vcpu->arch.sie_block->gcr)); vcpu->arch.sie_block->gcr[0] = CR0_INITIAL_MASK; vcpu->arch.sie_block->gcr[14] = CR14_INITIAL_MASK; + + /* ... the data in sync regs */ + memset(vcpu->run->s.regs.crs, 0, sizeof(vcpu->run->s.regs.crs)); + vcpu->run->s.regs.ckc = 0; + vcpu->run->s.regs.crs[0] = CR0_INITIAL_MASK; + vcpu->run->s.regs.crs[14] = CR14_INITIAL_MASK; + vcpu->run->psw_addr = 0; + vcpu->run->psw_mask = 0; + vcpu->run->s.regs.todpr = 0; + vcpu->run->s.regs.cputm = 0; + vcpu->run->s.regs.ckc = 0; + vcpu->run->s.regs.pp = 0; + vcpu->run->s.regs.gbea = 1; vcpu->run->s.regs.fpc = 0; - vcpu->arch.sie_block->gbea = 1; - vcpu->arch.sie_block->pp = 0; - vcpu->arch.sie_block->fpf &= ~FPF_BPBC; + /* + * Do not reset these registers in the protected case, as some of + * them are overlayed and they are not accessible in this case + * anyway. + */ + if (!kvm_s390_pv_cpu_is_protected(vcpu)) { + vcpu->arch.sie_block->gbea = 1; + vcpu->arch.sie_block->pp = 0; + vcpu->arch.sie_block->fpf &= ~FPF_BPBC; + vcpu->arch.sie_block->todpr = 0; + } } static void kvm_arch_vcpu_ioctl_clear_reset(struct kvm_vcpu *vcpu) @@ -3471,14 +3749,20 @@ int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu, switch (mp_state->mp_state) { case KVM_MP_STATE_STOPPED: - kvm_s390_vcpu_stop(vcpu); + rc = kvm_s390_vcpu_stop(vcpu); break; case KVM_MP_STATE_OPERATING: - kvm_s390_vcpu_start(vcpu); + rc = kvm_s390_vcpu_start(vcpu); break; case KVM_MP_STATE_LOAD: + if (!kvm_s390_pv_cpu_is_protected(vcpu)) { + rc = -ENXIO; + break; + } + rc = kvm_s390_pv_set_cpu_state(vcpu, PV_CPU_STATE_OPR_LOAD); + break; case KVM_MP_STATE_CHECK_STOP: - /* fall through - CHECK_STOP and LOAD are not supported yet */ + fallthrough; /* CHECK_STOP and LOAD are not supported yet */ default: rc = -ENXIO; } @@ -3828,9 +4112,11 @@ static int vcpu_post_run(struct kvm_vcpu *vcpu, int exit_reason) return vcpu_post_run_fault_in_sie(vcpu); } +#define PSW_INT_MASK (PSW_MASK_EXT | PSW_MASK_IO | PSW_MASK_MCHECK) static int __vcpu_run(struct kvm_vcpu *vcpu) { int rc, exit_reason; + struct sie_page *sie_page = (struct sie_page *)vcpu->arch.sie_block; /* * We try to hold kvm->srcu during most of vcpu_run (except when run- @@ -3852,8 +4138,28 @@ static int __vcpu_run(struct kvm_vcpu *vcpu) guest_enter_irqoff(); __disable_cpu_timer_accounting(vcpu); local_irq_enable(); + if (kvm_s390_pv_cpu_is_protected(vcpu)) { + memcpy(sie_page->pv_grregs, + vcpu->run->s.regs.gprs, + sizeof(sie_page->pv_grregs)); + } exit_reason = sie64a(vcpu->arch.sie_block, vcpu->run->s.regs.gprs); + if (kvm_s390_pv_cpu_is_protected(vcpu)) { + memcpy(vcpu->run->s.regs.gprs, + sie_page->pv_grregs, + sizeof(sie_page->pv_grregs)); + /* + * We're not allowed to inject interrupts on intercepts + * that leave the guest state in an "in-between" state + * where the next SIE entry will do a continuation. + * Fence interrupts in our "internal" PSW. + */ + if (vcpu->arch.sie_block->icptcode == ICPT_PV_INSTR || + vcpu->arch.sie_block->icptcode == ICPT_PV_PREF) { + vcpu->arch.sie_block->gpsw.mask &= ~PSW_INT_MASK; + } + } local_irq_disable(); __enable_cpu_timer_accounting(vcpu); guest_exit_irqoff(); @@ -3867,7 +4173,7 @@ static int __vcpu_run(struct kvm_vcpu *vcpu) return rc; } -static void sync_regs(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +static void sync_regs_fmt2(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) { struct runtime_instr_cb *riccb; struct gs_cb *gscb; @@ -3876,16 +4182,7 @@ static void sync_regs(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) gscb = (struct gs_cb *) &kvm_run->s.regs.gscb; vcpu->arch.sie_block->gpsw.mask = kvm_run->psw_mask; vcpu->arch.sie_block->gpsw.addr = kvm_run->psw_addr; - if (kvm_run->kvm_dirty_regs & KVM_SYNC_PREFIX) - kvm_s390_set_prefix(vcpu, kvm_run->s.regs.prefix); - if (kvm_run->kvm_dirty_regs & KVM_SYNC_CRS) { - memcpy(&vcpu->arch.sie_block->gcr, &kvm_run->s.regs.crs, 128); - /* some control register changes require a tlb flush */ - kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); - } if (kvm_run->kvm_dirty_regs & KVM_SYNC_ARCH0) { - kvm_s390_set_cpu_timer(vcpu, kvm_run->s.regs.cputm); - vcpu->arch.sie_block->ckc = kvm_run->s.regs.ckc; vcpu->arch.sie_block->todpr = kvm_run->s.regs.todpr; vcpu->arch.sie_block->pp = kvm_run->s.regs.pp; vcpu->arch.sie_block->gbea = kvm_run->s.regs.gbea; @@ -3926,6 +4223,36 @@ static void sync_regs(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) vcpu->arch.sie_block->fpf &= ~FPF_BPBC; vcpu->arch.sie_block->fpf |= kvm_run->s.regs.bpbc ? FPF_BPBC : 0; } + if (MACHINE_HAS_GS) { + preempt_disable(); + __ctl_set_bit(2, 4); + if (current->thread.gs_cb) { + vcpu->arch.host_gscb = current->thread.gs_cb; + save_gs_cb(vcpu->arch.host_gscb); + } + if (vcpu->arch.gs_enabled) { + current->thread.gs_cb = (struct gs_cb *) + &vcpu->run->s.regs.gscb; + restore_gs_cb(current->thread.gs_cb); + } + preempt_enable(); + } + /* SIE will load etoken directly from SDNX and therefore kvm_run */ +} + +static void sync_regs(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +{ + if (kvm_run->kvm_dirty_regs & KVM_SYNC_PREFIX) + kvm_s390_set_prefix(vcpu, kvm_run->s.regs.prefix); + if (kvm_run->kvm_dirty_regs & KVM_SYNC_CRS) { + memcpy(&vcpu->arch.sie_block->gcr, &kvm_run->s.regs.crs, 128); + /* some control register changes require a tlb flush */ + kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); + } + if (kvm_run->kvm_dirty_regs & KVM_SYNC_ARCH0) { + kvm_s390_set_cpu_timer(vcpu, kvm_run->s.regs.cputm); + vcpu->arch.sie_block->ckc = kvm_run->s.regs.ckc; + } save_access_regs(vcpu->arch.host_acrs); restore_access_regs(vcpu->run->s.regs.acrs); /* save host (userspace) fprs/vrs */ @@ -3940,23 +4267,47 @@ static void sync_regs(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) if (test_fp_ctl(current->thread.fpu.fpc)) /* User space provided an invalid FPC, let's clear it */ current->thread.fpu.fpc = 0; + + /* Sync fmt2 only data */ + if (likely(!kvm_s390_pv_cpu_is_protected(vcpu))) { + sync_regs_fmt2(vcpu, kvm_run); + } else { + /* + * In several places we have to modify our internal view to + * not do things that are disallowed by the ultravisor. For + * example we must not inject interrupts after specific exits + * (e.g. 112 prefix page not secure). We do this by turning + * off the machine check, external and I/O interrupt bits + * of our PSW copy. To avoid getting validity intercepts, we + * do only accept the condition code from userspace. + */ + vcpu->arch.sie_block->gpsw.mask &= ~PSW_MASK_CC; + vcpu->arch.sie_block->gpsw.mask |= kvm_run->psw_mask & + PSW_MASK_CC; + } + + kvm_run->kvm_dirty_regs = 0; +} + +static void store_regs_fmt2(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +{ + kvm_run->s.regs.todpr = vcpu->arch.sie_block->todpr; + kvm_run->s.regs.pp = vcpu->arch.sie_block->pp; + kvm_run->s.regs.gbea = vcpu->arch.sie_block->gbea; + kvm_run->s.regs.bpbc = (vcpu->arch.sie_block->fpf & FPF_BPBC) == FPF_BPBC; if (MACHINE_HAS_GS) { - preempt_disable(); __ctl_set_bit(2, 4); - if (current->thread.gs_cb) { - vcpu->arch.host_gscb = current->thread.gs_cb; - save_gs_cb(vcpu->arch.host_gscb); - } - if (vcpu->arch.gs_enabled) { - current->thread.gs_cb = (struct gs_cb *) - &vcpu->run->s.regs.gscb; - restore_gs_cb(current->thread.gs_cb); - } + if (vcpu->arch.gs_enabled) + save_gs_cb(current->thread.gs_cb); + preempt_disable(); + current->thread.gs_cb = vcpu->arch.host_gscb; + restore_gs_cb(vcpu->arch.host_gscb); preempt_enable(); + if (!vcpu->arch.host_gscb) + __ctl_clear_bit(2, 4); + vcpu->arch.host_gscb = NULL; } - /* SIE will load etoken directly from SDNX and therefore kvm_run */ - - kvm_run->kvm_dirty_regs = 0; + /* SIE will save etoken directly into SDNX and therefore kvm_run */ } static void store_regs(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) @@ -3967,13 +4318,9 @@ static void store_regs(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) memcpy(&kvm_run->s.regs.crs, &vcpu->arch.sie_block->gcr, 128); kvm_run->s.regs.cputm = kvm_s390_get_cpu_timer(vcpu); kvm_run->s.regs.ckc = vcpu->arch.sie_block->ckc; - kvm_run->s.regs.todpr = vcpu->arch.sie_block->todpr; - kvm_run->s.regs.pp = vcpu->arch.sie_block->pp; - kvm_run->s.regs.gbea = vcpu->arch.sie_block->gbea; kvm_run->s.regs.pft = vcpu->arch.pfault_token; kvm_run->s.regs.pfs = vcpu->arch.pfault_select; kvm_run->s.regs.pfc = vcpu->arch.pfault_compare; - kvm_run->s.regs.bpbc = (vcpu->arch.sie_block->fpf & FPF_BPBC) == FPF_BPBC; save_access_regs(vcpu->run->s.regs.acrs); restore_access_regs(vcpu->arch.host_acrs); /* Save guest register state */ @@ -3982,19 +4329,8 @@ static void store_regs(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) /* Restore will be done lazily at return */ current->thread.fpu.fpc = vcpu->arch.host_fpregs.fpc; current->thread.fpu.regs = vcpu->arch.host_fpregs.regs; - if (MACHINE_HAS_GS) { - __ctl_set_bit(2, 4); - if (vcpu->arch.gs_enabled) - save_gs_cb(current->thread.gs_cb); - preempt_disable(); - current->thread.gs_cb = vcpu->arch.host_gscb; - restore_gs_cb(vcpu->arch.host_gscb); - preempt_enable(); - if (!vcpu->arch.host_gscb) - __ctl_clear_bit(2, 4); - vcpu->arch.host_gscb = NULL; - } - /* SIE will save etoken directly into SDNX and therefore kvm_run */ + if (likely(!kvm_s390_pv_cpu_is_protected(vcpu))) + store_regs_fmt2(vcpu, kvm_run); } int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) @@ -4018,6 +4354,10 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) kvm_sigset_activate(vcpu); + /* + * no need to check the return value of vcpu_start as it can only have + * an error for protvirt, but protvirt means user cpu state + */ if (!kvm_s390_user_cpu_state_ctrl(vcpu->kvm)) { kvm_s390_vcpu_start(vcpu); } else if (is_vcpu_stopped(vcpu)) { @@ -4155,18 +4495,27 @@ static void __enable_ibs_on_vcpu(struct kvm_vcpu *vcpu) kvm_s390_sync_request(KVM_REQ_ENABLE_IBS, vcpu); } -void kvm_s390_vcpu_start(struct kvm_vcpu *vcpu) +int kvm_s390_vcpu_start(struct kvm_vcpu *vcpu) { - int i, online_vcpus, started_vcpus = 0; + int i, online_vcpus, r = 0, started_vcpus = 0; if (!is_vcpu_stopped(vcpu)) - return; + return 0; trace_kvm_s390_vcpu_start_stop(vcpu->vcpu_id, 1); /* Only one cpu at a time may enter/leave the STOPPED state. */ spin_lock(&vcpu->kvm->arch.start_stop_lock); online_vcpus = atomic_read(&vcpu->kvm->online_vcpus); + /* Let's tell the UV that we want to change into the operating state */ + if (kvm_s390_pv_cpu_is_protected(vcpu)) { + r = kvm_s390_pv_set_cpu_state(vcpu, PV_CPU_STATE_OPR); + if (r) { + spin_unlock(&vcpu->kvm->arch.start_stop_lock); + return r; + } + } + for (i = 0; i < online_vcpus; i++) { if (!is_vcpu_stopped(vcpu->kvm->vcpus[i])) started_vcpus++; @@ -4186,27 +4535,43 @@ void kvm_s390_vcpu_start(struct kvm_vcpu *vcpu) kvm_s390_clear_cpuflags(vcpu, CPUSTAT_STOPPED); /* + * The real PSW might have changed due to a RESTART interpreted by the + * ultravisor. We block all interrupts and let the next sie exit + * refresh our view. + */ + if (kvm_s390_pv_cpu_is_protected(vcpu)) + vcpu->arch.sie_block->gpsw.mask &= ~PSW_INT_MASK; + /* * Another VCPU might have used IBS while we were offline. * Let's play safe and flush the VCPU at startup. */ kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); spin_unlock(&vcpu->kvm->arch.start_stop_lock); - return; + return 0; } -void kvm_s390_vcpu_stop(struct kvm_vcpu *vcpu) +int kvm_s390_vcpu_stop(struct kvm_vcpu *vcpu) { - int i, online_vcpus, started_vcpus = 0; + int i, online_vcpus, r = 0, started_vcpus = 0; struct kvm_vcpu *started_vcpu = NULL; if (is_vcpu_stopped(vcpu)) - return; + return 0; trace_kvm_s390_vcpu_start_stop(vcpu->vcpu_id, 0); /* Only one cpu at a time may enter/leave the STOPPED state. */ spin_lock(&vcpu->kvm->arch.start_stop_lock); online_vcpus = atomic_read(&vcpu->kvm->online_vcpus); + /* Let's tell the UV that we want to change into the stopped state */ + if (kvm_s390_pv_cpu_is_protected(vcpu)) { + r = kvm_s390_pv_set_cpu_state(vcpu, PV_CPU_STATE_STP); + if (r) { + spin_unlock(&vcpu->kvm->arch.start_stop_lock); + return r; + } + } + /* SIGP STOP and SIGP STOP AND STORE STATUS has been fully processed */ kvm_s390_clear_stop_irq(vcpu); @@ -4229,7 +4594,7 @@ void kvm_s390_vcpu_stop(struct kvm_vcpu *vcpu) } spin_unlock(&vcpu->kvm->arch.start_stop_lock); - return; + return 0; } static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu, @@ -4256,12 +4621,40 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu, return r; } +static long kvm_s390_guest_sida_op(struct kvm_vcpu *vcpu, + struct kvm_s390_mem_op *mop) +{ + void __user *uaddr = (void __user *)mop->buf; + int r = 0; + + if (mop->flags || !mop->size) + return -EINVAL; + if (mop->size + mop->sida_offset < mop->size) + return -EINVAL; + if (mop->size + mop->sida_offset > sida_size(vcpu->arch.sie_block)) + return -E2BIG; + + switch (mop->op) { + case KVM_S390_MEMOP_SIDA_READ: + if (copy_to_user(uaddr, (void *)(sida_origin(vcpu->arch.sie_block) + + mop->sida_offset), mop->size)) + r = -EFAULT; + + break; + case KVM_S390_MEMOP_SIDA_WRITE: + if (copy_from_user((void *)(sida_origin(vcpu->arch.sie_block) + + mop->sida_offset), uaddr, mop->size)) + r = -EFAULT; + break; + } + return r; +} static long kvm_s390_guest_mem_op(struct kvm_vcpu *vcpu, struct kvm_s390_mem_op *mop) { void __user *uaddr = (void __user *)mop->buf; void *tmpbuf = NULL; - int r, srcu_idx; + int r = 0; const u64 supported_flags = KVM_S390_MEMOP_F_INJECT_EXCEPTION | KVM_S390_MEMOP_F_CHECK_ONLY; @@ -4271,14 +4664,15 @@ static long kvm_s390_guest_mem_op(struct kvm_vcpu *vcpu, if (mop->size > MEM_OP_MAX_SIZE) return -E2BIG; + if (kvm_s390_pv_cpu_is_protected(vcpu)) + return -EINVAL; + if (!(mop->flags & KVM_S390_MEMOP_F_CHECK_ONLY)) { tmpbuf = vmalloc(mop->size); if (!tmpbuf) return -ENOMEM; } - srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); - switch (mop->op) { case KVM_S390_MEMOP_LOGICAL_READ: if (mop->flags & KVM_S390_MEMOP_F_CHECK_ONLY) { @@ -4304,12 +4698,8 @@ static long kvm_s390_guest_mem_op(struct kvm_vcpu *vcpu, } r = write_guest(vcpu, mop->gaddr, mop->ar, tmpbuf, mop->size); break; - default: - r = -EINVAL; } - srcu_read_unlock(&vcpu->kvm->srcu, srcu_idx); - if (r > 0 && (mop->flags & KVM_S390_MEMOP_F_INJECT_EXCEPTION) != 0) kvm_s390_inject_prog_irq(vcpu, &vcpu->arch.pgm); @@ -4317,6 +4707,31 @@ static long kvm_s390_guest_mem_op(struct kvm_vcpu *vcpu, return r; } +static long kvm_s390_guest_memsida_op(struct kvm_vcpu *vcpu, + struct kvm_s390_mem_op *mop) +{ + int r, srcu_idx; + + srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); + + switch (mop->op) { + case KVM_S390_MEMOP_LOGICAL_READ: + case KVM_S390_MEMOP_LOGICAL_WRITE: + r = kvm_s390_guest_mem_op(vcpu, mop); + break; + case KVM_S390_MEMOP_SIDA_READ: + case KVM_S390_MEMOP_SIDA_WRITE: + /* we are locked against sida going away by the vcpu->mutex */ + r = kvm_s390_guest_sida_op(vcpu, mop); + break; + default: + r = -EINVAL; + } + + srcu_read_unlock(&vcpu->kvm->srcu, srcu_idx); + return r; +} + long kvm_arch_vcpu_async_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) { @@ -4352,6 +4767,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp, void __user *argp = (void __user *)arg; int idx; long r; + u16 rc, rrc; vcpu_load(vcpu); @@ -4373,18 +4789,40 @@ long kvm_arch_vcpu_ioctl(struct file *filp, case KVM_S390_CLEAR_RESET: r = 0; kvm_arch_vcpu_ioctl_clear_reset(vcpu); + if (kvm_s390_pv_cpu_is_protected(vcpu)) { + r = uv_cmd_nodata(kvm_s390_pv_cpu_get_handle(vcpu), + UVC_CMD_CPU_RESET_CLEAR, &rc, &rrc); + VCPU_EVENT(vcpu, 3, "PROTVIRT RESET CLEAR VCPU: rc %x rrc %x", + rc, rrc); + } break; case KVM_S390_INITIAL_RESET: r = 0; kvm_arch_vcpu_ioctl_initial_reset(vcpu); + if (kvm_s390_pv_cpu_is_protected(vcpu)) { + r = uv_cmd_nodata(kvm_s390_pv_cpu_get_handle(vcpu), + UVC_CMD_CPU_RESET_INITIAL, + &rc, &rrc); + VCPU_EVENT(vcpu, 3, "PROTVIRT RESET INITIAL VCPU: rc %x rrc %x", + rc, rrc); + } break; case KVM_S390_NORMAL_RESET: r = 0; kvm_arch_vcpu_ioctl_normal_reset(vcpu); + if (kvm_s390_pv_cpu_is_protected(vcpu)) { + r = uv_cmd_nodata(kvm_s390_pv_cpu_get_handle(vcpu), + UVC_CMD_CPU_RESET, &rc, &rrc); + VCPU_EVENT(vcpu, 3, "PROTVIRT RESET NORMAL VCPU: rc %x rrc %x", + rc, rrc); + } break; case KVM_SET_ONE_REG: case KVM_GET_ONE_REG: { struct kvm_one_reg reg; + r = -EINVAL; + if (kvm_s390_pv_cpu_is_protected(vcpu)) + break; r = -EFAULT; if (copy_from_user(®, argp, sizeof(reg))) break; @@ -4447,7 +4885,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp, struct kvm_s390_mem_op mem_op; if (copy_from_user(&mem_op, argp, sizeof(mem_op)) == 0) - r = kvm_s390_guest_mem_op(vcpu, &mem_op); + r = kvm_s390_guest_memsida_op(vcpu, &mem_op); else r = -EFAULT; break; @@ -4507,12 +4945,6 @@ vm_fault_t kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf) return VM_FAULT_SIGBUS; } -int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot, - unsigned long npages) -{ - return 0; -} - /* Section: memory related */ int kvm_arch_prepare_memory_region(struct kvm *kvm, struct kvm_memory_slot *memslot, @@ -4533,12 +4965,15 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm, if (mem->guest_phys_addr + mem->memory_size > kvm->arch.mem_limit) return -EINVAL; + /* When we are protected, we should not change the memory slots */ + if (kvm_s390_pv_get_handle(kvm)) + return -EINVAL; return 0; } void kvm_arch_commit_memory_region(struct kvm *kvm, const struct kvm_userspace_memory_region *mem, - const struct kvm_memory_slot *old, + struct kvm_memory_slot *old, const struct kvm_memory_slot *new, enum kvm_mr_change change) { @@ -4554,7 +4989,7 @@ void kvm_arch_commit_memory_region(struct kvm *kvm, old->npages * PAGE_SIZE); if (rc) break; - /* FALLTHROUGH */ + fallthrough; case KVM_MR_CREATE: rc = gmap_map_segment(kvm->arch.gmap, mem->userspace_addr, mem->guest_phys_addr, mem->memory_size); diff --git a/arch/s390/kvm/kvm-s390.h b/arch/s390/kvm/kvm-s390.h index 6d9448dbd052..79dcd647b378 100644 --- a/arch/s390/kvm/kvm-s390.h +++ b/arch/s390/kvm/kvm-s390.h @@ -2,7 +2,7 @@ /* * definition for kvm on s390 * - * Copyright IBM Corp. 2008, 2009 + * Copyright IBM Corp. 2008, 2020 * * Author(s): Carsten Otte <cotte@de.ibm.com> * Christian Borntraeger <borntraeger@de.ibm.com> @@ -15,6 +15,7 @@ #include <linux/hrtimer.h> #include <linux/kvm.h> #include <linux/kvm_host.h> +#include <linux/lockdep.h> #include <asm/facility.h> #include <asm/processor.h> #include <asm/sclp.h> @@ -25,6 +26,17 @@ #define IS_ITDB_VALID(vcpu) ((*(char *)vcpu->arch.sie_block->itdba == TDB_FORMAT1)) extern debug_info_t *kvm_s390_dbf; +extern debug_info_t *kvm_s390_dbf_uv; + +#define KVM_UV_EVENT(d_kvm, d_loglevel, d_string, d_args...)\ +do { \ + debug_sprintf_event((d_kvm)->arch.dbf, d_loglevel, d_string "\n", \ + d_args); \ + debug_sprintf_event(kvm_s390_dbf_uv, d_loglevel, \ + "%d: " d_string "\n", (d_kvm)->userspace_pid, \ + d_args); \ +} while (0) + #define KVM_EVENT(d_loglevel, d_string, d_args...)\ do { \ debug_sprintf_event(kvm_s390_dbf, d_loglevel, d_string "\n", \ @@ -196,6 +208,39 @@ static inline int kvm_s390_user_cpu_state_ctrl(struct kvm *kvm) return kvm->arch.user_cpu_state_ctrl != 0; } +/* implemented in pv.c */ +int kvm_s390_pv_destroy_cpu(struct kvm_vcpu *vcpu, u16 *rc, u16 *rrc); +int kvm_s390_pv_create_cpu(struct kvm_vcpu *vcpu, u16 *rc, u16 *rrc); +int kvm_s390_pv_deinit_vm(struct kvm *kvm, u16 *rc, u16 *rrc); +int kvm_s390_pv_init_vm(struct kvm *kvm, u16 *rc, u16 *rrc); +int kvm_s390_pv_set_sec_parms(struct kvm *kvm, void *hdr, u64 length, u16 *rc, + u16 *rrc); +int kvm_s390_pv_unpack(struct kvm *kvm, unsigned long addr, unsigned long size, + unsigned long tweak, u16 *rc, u16 *rrc); +int kvm_s390_pv_set_cpu_state(struct kvm_vcpu *vcpu, u8 state); + +static inline u64 kvm_s390_pv_get_handle(struct kvm *kvm) +{ + return kvm->arch.pv.handle; +} + +static inline u64 kvm_s390_pv_cpu_get_handle(struct kvm_vcpu *vcpu) +{ + return vcpu->arch.pv.handle; +} + +static inline bool kvm_s390_pv_is_protected(struct kvm *kvm) +{ + lockdep_assert_held(&kvm->lock); + return !!kvm_s390_pv_get_handle(kvm); +} + +static inline bool kvm_s390_pv_cpu_is_protected(struct kvm_vcpu *vcpu) +{ + lockdep_assert_held(&vcpu->mutex); + return !!kvm_s390_pv_cpu_get_handle(vcpu); +} + /* implemented in interrupt.c */ int kvm_s390_handle_wait(struct kvm_vcpu *vcpu); void kvm_s390_vcpu_wakeup(struct kvm_vcpu *vcpu); @@ -286,8 +331,8 @@ void kvm_s390_set_tod_clock(struct kvm *kvm, long kvm_arch_fault_in_page(struct kvm_vcpu *vcpu, gpa_t gpa, int writable); int kvm_s390_store_status_unloaded(struct kvm_vcpu *vcpu, unsigned long addr); int kvm_s390_vcpu_store_status(struct kvm_vcpu *vcpu, unsigned long addr); -void kvm_s390_vcpu_start(struct kvm_vcpu *vcpu); -void kvm_s390_vcpu_stop(struct kvm_vcpu *vcpu); +int kvm_s390_vcpu_start(struct kvm_vcpu *vcpu); +int kvm_s390_vcpu_stop(struct kvm_vcpu *vcpu); void kvm_s390_vcpu_block(struct kvm_vcpu *vcpu); void kvm_s390_vcpu_unblock(struct kvm_vcpu *vcpu); bool kvm_s390_vcpu_sie_inhibited(struct kvm_vcpu *vcpu); diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c index ed52ffa8d5d4..69a824f9ef0b 100644 --- a/arch/s390/kvm/priv.c +++ b/arch/s390/kvm/priv.c @@ -2,7 +2,7 @@ /* * handling privileged instructions * - * Copyright IBM Corp. 2008, 2018 + * Copyright IBM Corp. 2008, 2020 * * Author(s): Carsten Otte <cotte@de.ibm.com> * Christian Borntraeger <borntraeger@de.ibm.com> @@ -872,7 +872,7 @@ static int handle_stsi(struct kvm_vcpu *vcpu) operand2 = kvm_s390_get_base_disp_s(vcpu, &ar); - if (operand2 & 0xfff) + if (!kvm_s390_pv_cpu_is_protected(vcpu) && (operand2 & 0xfff)) return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); switch (fc) { @@ -893,8 +893,13 @@ static int handle_stsi(struct kvm_vcpu *vcpu) handle_stsi_3_2_2(vcpu, (void *) mem); break; } - - rc = write_guest(vcpu, operand2, ar, (void *)mem, PAGE_SIZE); + if (kvm_s390_pv_cpu_is_protected(vcpu)) { + memcpy((void *)sida_origin(vcpu->arch.sie_block), (void *)mem, + PAGE_SIZE); + rc = 0; + } else { + rc = write_guest(vcpu, operand2, ar, (void *)mem, PAGE_SIZE); + } if (rc) { rc = kvm_s390_inject_prog_cond(vcpu, rc); goto out; diff --git a/arch/s390/kvm/pv.c b/arch/s390/kvm/pv.c new file mode 100644 index 000000000000..63e330109b63 --- /dev/null +++ b/arch/s390/kvm/pv.c @@ -0,0 +1,303 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Hosting Protected Virtual Machines + * + * Copyright IBM Corp. 2019, 2020 + * Author(s): Janosch Frank <frankja@linux.ibm.com> + */ +#include <linux/kvm.h> +#include <linux/kvm_host.h> +#include <linux/pagemap.h> +#include <linux/sched/signal.h> +#include <asm/pgalloc.h> +#include <asm/gmap.h> +#include <asm/uv.h> +#include <asm/mman.h> +#include "kvm-s390.h" + +int kvm_s390_pv_destroy_cpu(struct kvm_vcpu *vcpu, u16 *rc, u16 *rrc) +{ + int cc = 0; + + if (kvm_s390_pv_cpu_get_handle(vcpu)) { + cc = uv_cmd_nodata(kvm_s390_pv_cpu_get_handle(vcpu), + UVC_CMD_DESTROY_SEC_CPU, rc, rrc); + + KVM_UV_EVENT(vcpu->kvm, 3, + "PROTVIRT DESTROY VCPU %d: rc %x rrc %x", + vcpu->vcpu_id, *rc, *rrc); + WARN_ONCE(cc, "protvirt destroy cpu failed rc %x rrc %x", + *rc, *rrc); + } + /* Intended memory leak for something that should never happen. */ + if (!cc) + free_pages(vcpu->arch.pv.stor_base, + get_order(uv_info.guest_cpu_stor_len)); + + free_page(sida_origin(vcpu->arch.sie_block)); + vcpu->arch.sie_block->pv_handle_cpu = 0; + vcpu->arch.sie_block->pv_handle_config = 0; + memset(&vcpu->arch.pv, 0, sizeof(vcpu->arch.pv)); + vcpu->arch.sie_block->sdf = 0; + /* + * The sidad field (for sdf == 2) is now the gbea field (for sdf == 0). + * Use the reset value of gbea to avoid leaking the kernel pointer of + * the just freed sida. + */ + vcpu->arch.sie_block->gbea = 1; + kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); + + return cc ? EIO : 0; +} + +int kvm_s390_pv_create_cpu(struct kvm_vcpu *vcpu, u16 *rc, u16 *rrc) +{ + struct uv_cb_csc uvcb = { + .header.cmd = UVC_CMD_CREATE_SEC_CPU, + .header.len = sizeof(uvcb), + }; + int cc; + + if (kvm_s390_pv_cpu_get_handle(vcpu)) + return -EINVAL; + + vcpu->arch.pv.stor_base = __get_free_pages(GFP_KERNEL, + get_order(uv_info.guest_cpu_stor_len)); + if (!vcpu->arch.pv.stor_base) + return -ENOMEM; + + /* Input */ + uvcb.guest_handle = kvm_s390_pv_get_handle(vcpu->kvm); + uvcb.num = vcpu->arch.sie_block->icpua; + uvcb.state_origin = (u64)vcpu->arch.sie_block; + uvcb.stor_origin = (u64)vcpu->arch.pv.stor_base; + + /* Alloc Secure Instruction Data Area Designation */ + vcpu->arch.sie_block->sidad = __get_free_page(GFP_KERNEL | __GFP_ZERO); + if (!vcpu->arch.sie_block->sidad) { + free_pages(vcpu->arch.pv.stor_base, + get_order(uv_info.guest_cpu_stor_len)); + return -ENOMEM; + } + + cc = uv_call(0, (u64)&uvcb); + *rc = uvcb.header.rc; + *rrc = uvcb.header.rrc; + KVM_UV_EVENT(vcpu->kvm, 3, + "PROTVIRT CREATE VCPU: cpu %d handle %llx rc %x rrc %x", + vcpu->vcpu_id, uvcb.cpu_handle, uvcb.header.rc, + uvcb.header.rrc); + + if (cc) { + u16 dummy; + + kvm_s390_pv_destroy_cpu(vcpu, &dummy, &dummy); + return -EIO; + } + + /* Output */ + vcpu->arch.pv.handle = uvcb.cpu_handle; + vcpu->arch.sie_block->pv_handle_cpu = uvcb.cpu_handle; + vcpu->arch.sie_block->pv_handle_config = kvm_s390_pv_get_handle(vcpu->kvm); + vcpu->arch.sie_block->sdf = 2; + kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); + return 0; +} + +/* only free resources when the destroy was successful */ +static void kvm_s390_pv_dealloc_vm(struct kvm *kvm) +{ + vfree(kvm->arch.pv.stor_var); + free_pages(kvm->arch.pv.stor_base, + get_order(uv_info.guest_base_stor_len)); + memset(&kvm->arch.pv, 0, sizeof(kvm->arch.pv)); +} + +static int kvm_s390_pv_alloc_vm(struct kvm *kvm) +{ + unsigned long base = uv_info.guest_base_stor_len; + unsigned long virt = uv_info.guest_virt_var_stor_len; + unsigned long npages = 0, vlen = 0; + struct kvm_memory_slot *memslot; + + kvm->arch.pv.stor_var = NULL; + kvm->arch.pv.stor_base = __get_free_pages(GFP_KERNEL, get_order(base)); + if (!kvm->arch.pv.stor_base) + return -ENOMEM; + + /* + * Calculate current guest storage for allocation of the + * variable storage, which is based on the length in MB. + * + * Slots are sorted by GFN + */ + mutex_lock(&kvm->slots_lock); + memslot = kvm_memslots(kvm)->memslots; + npages = memslot->base_gfn + memslot->npages; + mutex_unlock(&kvm->slots_lock); + + kvm->arch.pv.guest_len = npages * PAGE_SIZE; + + /* Allocate variable storage */ + vlen = ALIGN(virt * ((npages * PAGE_SIZE) / HPAGE_SIZE), PAGE_SIZE); + vlen += uv_info.guest_virt_base_stor_len; + kvm->arch.pv.stor_var = vzalloc(vlen); + if (!kvm->arch.pv.stor_var) + goto out_err; + return 0; + +out_err: + kvm_s390_pv_dealloc_vm(kvm); + return -ENOMEM; +} + +/* this should not fail, but if it does, we must not free the donated memory */ +int kvm_s390_pv_deinit_vm(struct kvm *kvm, u16 *rc, u16 *rrc) +{ + int cc; + + /* make all pages accessible before destroying the guest */ + s390_reset_acc(kvm->mm); + + cc = uv_cmd_nodata(kvm_s390_pv_get_handle(kvm), + UVC_CMD_DESTROY_SEC_CONF, rc, rrc); + WRITE_ONCE(kvm->arch.gmap->guest_handle, 0); + atomic_set(&kvm->mm->context.is_protected, 0); + KVM_UV_EVENT(kvm, 3, "PROTVIRT DESTROY VM: rc %x rrc %x", *rc, *rrc); + WARN_ONCE(cc, "protvirt destroy vm failed rc %x rrc %x", *rc, *rrc); + /* Inteded memory leak on "impossible" error */ + if (!cc) + kvm_s390_pv_dealloc_vm(kvm); + return cc ? -EIO : 0; +} + +int kvm_s390_pv_init_vm(struct kvm *kvm, u16 *rc, u16 *rrc) +{ + struct uv_cb_cgc uvcb = { + .header.cmd = UVC_CMD_CREATE_SEC_CONF, + .header.len = sizeof(uvcb) + }; + int cc, ret; + u16 dummy; + + ret = kvm_s390_pv_alloc_vm(kvm); + if (ret) + return ret; + + /* Inputs */ + uvcb.guest_stor_origin = 0; /* MSO is 0 for KVM */ + uvcb.guest_stor_len = kvm->arch.pv.guest_len; + uvcb.guest_asce = kvm->arch.gmap->asce; + uvcb.guest_sca = (unsigned long)kvm->arch.sca; + uvcb.conf_base_stor_origin = (u64)kvm->arch.pv.stor_base; + uvcb.conf_virt_stor_origin = (u64)kvm->arch.pv.stor_var; + + cc = uv_call(0, (u64)&uvcb); + *rc = uvcb.header.rc; + *rrc = uvcb.header.rrc; + KVM_UV_EVENT(kvm, 3, "PROTVIRT CREATE VM: handle %llx len %llx rc %x rrc %x", + uvcb.guest_handle, uvcb.guest_stor_len, *rc, *rrc); + + /* Outputs */ + kvm->arch.pv.handle = uvcb.guest_handle; + + if (cc) { + if (uvcb.header.rc & UVC_RC_NEED_DESTROY) + kvm_s390_pv_deinit_vm(kvm, &dummy, &dummy); + else + kvm_s390_pv_dealloc_vm(kvm); + return -EIO; + } + kvm->arch.gmap->guest_handle = uvcb.guest_handle; + atomic_set(&kvm->mm->context.is_protected, 1); + return 0; +} + +int kvm_s390_pv_set_sec_parms(struct kvm *kvm, void *hdr, u64 length, u16 *rc, + u16 *rrc) +{ + struct uv_cb_ssc uvcb = { + .header.cmd = UVC_CMD_SET_SEC_CONF_PARAMS, + .header.len = sizeof(uvcb), + .sec_header_origin = (u64)hdr, + .sec_header_len = length, + .guest_handle = kvm_s390_pv_get_handle(kvm), + }; + int cc = uv_call(0, (u64)&uvcb); + + *rc = uvcb.header.rc; + *rrc = uvcb.header.rrc; + KVM_UV_EVENT(kvm, 3, "PROTVIRT VM SET PARMS: rc %x rrc %x", + *rc, *rrc); + return cc ? -EINVAL : 0; +} + +static int unpack_one(struct kvm *kvm, unsigned long addr, u64 tweak, + u64 offset, u16 *rc, u16 *rrc) +{ + struct uv_cb_unp uvcb = { + .header.cmd = UVC_CMD_UNPACK_IMG, + .header.len = sizeof(uvcb), + .guest_handle = kvm_s390_pv_get_handle(kvm), + .gaddr = addr, + .tweak[0] = tweak, + .tweak[1] = offset, + }; + int ret = gmap_make_secure(kvm->arch.gmap, addr, &uvcb); + + *rc = uvcb.header.rc; + *rrc = uvcb.header.rrc; + + if (ret && ret != -EAGAIN) + KVM_UV_EVENT(kvm, 3, "PROTVIRT VM UNPACK: failed addr %llx with rc %x rrc %x", + uvcb.gaddr, *rc, *rrc); + return ret; +} + +int kvm_s390_pv_unpack(struct kvm *kvm, unsigned long addr, unsigned long size, + unsigned long tweak, u16 *rc, u16 *rrc) +{ + u64 offset = 0; + int ret = 0; + + if (addr & ~PAGE_MASK || !size || size & ~PAGE_MASK) + return -EINVAL; + + KVM_UV_EVENT(kvm, 3, "PROTVIRT VM UNPACK: start addr %lx size %lx", + addr, size); + + while (offset < size) { + ret = unpack_one(kvm, addr, tweak, offset, rc, rrc); + if (ret == -EAGAIN) { + cond_resched(); + if (fatal_signal_pending(current)) + break; + continue; + } + if (ret) + break; + addr += PAGE_SIZE; + offset += PAGE_SIZE; + } + if (!ret) + KVM_UV_EVENT(kvm, 3, "%s", "PROTVIRT VM UNPACK: successful"); + return ret; +} + +int kvm_s390_pv_set_cpu_state(struct kvm_vcpu *vcpu, u8 state) +{ + struct uv_cb_cpu_set_state uvcb = { + .header.cmd = UVC_CMD_CPU_SET_STATE, + .header.len = sizeof(uvcb), + .cpu_handle = kvm_s390_pv_cpu_get_handle(vcpu), + .state = state, + }; + int cc; + + cc = uv_call(0, (u64)&uvcb); + KVM_UV_EVENT(vcpu->kvm, 3, "PROTVIRT SET CPU %d STATE %d rc %x rrc %x", + vcpu->vcpu_id, state, uvcb.header.rc, uvcb.header.rrc); + if (cc) + return -EINVAL; + return 0; +} diff --git a/arch/s390/mm/cmm.c b/arch/s390/mm/cmm.c index a51c892f14f3..ae989b740376 100644 --- a/arch/s390/mm/cmm.c +++ b/arch/s390/mm/cmm.c @@ -19,7 +19,6 @@ #include <linux/swap.h> #include <linux/kthread.h> #include <linux/oom.h> -#include <linux/suspend.h> #include <linux/uaccess.h> #include <asm/pgalloc.h> @@ -49,7 +48,6 @@ static volatile long cmm_pages_target; static volatile long cmm_timed_pages_target; static long cmm_timeout_pages; static long cmm_timeout_seconds; -static int cmm_suspended; static struct cmm_page_array *cmm_page_list; static struct cmm_page_array *cmm_timed_page_list; @@ -151,9 +149,9 @@ static int cmm_thread(void *dummy) while (1) { rc = wait_event_interruptible(cmm_thread_wait, - (!cmm_suspended && (cmm_pages != cmm_pages_target || - cmm_timed_pages != cmm_timed_pages_target)) || - kthread_should_stop()); + cmm_pages != cmm_pages_target || + cmm_timed_pages != cmm_timed_pages_target || + kthread_should_stop()); if (kthread_should_stop() || rc == -ERESTARTSYS) { cmm_pages_target = cmm_pages; cmm_timed_pages_target = cmm_timed_pages; @@ -390,38 +388,6 @@ static void cmm_smsg_target(const char *from, char *msg) static struct ctl_table_header *cmm_sysctl_header; -static int cmm_suspend(void) -{ - cmm_suspended = 1; - cmm_free_pages(cmm_pages, &cmm_pages, &cmm_page_list); - cmm_free_pages(cmm_timed_pages, &cmm_timed_pages, &cmm_timed_page_list); - return 0; -} - -static int cmm_resume(void) -{ - cmm_suspended = 0; - cmm_kick_thread(); - return 0; -} - -static int cmm_power_event(struct notifier_block *this, - unsigned long event, void *ptr) -{ - switch (event) { - case PM_POST_HIBERNATION: - return cmm_resume(); - case PM_HIBERNATION_PREPARE: - return cmm_suspend(); - default: - return NOTIFY_DONE; - } -} - -static struct notifier_block cmm_power_notifier = { - .notifier_call = cmm_power_event, -}; - static int __init cmm_init(void) { int rc = -ENOMEM; @@ -446,16 +412,11 @@ static int __init cmm_init(void) rc = register_oom_notifier(&cmm_oom_nb); if (rc < 0) goto out_oom_notify; - rc = register_pm_notifier(&cmm_power_notifier); - if (rc) - goto out_pm; cmm_thread_ptr = kthread_run(cmm_thread, NULL, "cmmthread"); if (!IS_ERR(cmm_thread_ptr)) return 0; rc = PTR_ERR(cmm_thread_ptr); - unregister_pm_notifier(&cmm_power_notifier); -out_pm: unregister_oom_notifier(&cmm_oom_nb); out_oom_notify: #ifdef CONFIG_CMM_IUCV @@ -475,7 +436,6 @@ static void __exit cmm_exit(void) #ifdef CONFIG_CMM_IUCV smsg_unregister_callback(SMSG_PREFIX, cmm_smsg_target); #endif - unregister_pm_notifier(&cmm_power_notifier); unregister_oom_notifier(&cmm_oom_nb); kthread_stop(cmm_thread_ptr); del_timer_sync(&cmm_timer); diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c index 7b0bb475c166..d56f67745e3e 100644 --- a/arch/s390/mm/fault.c +++ b/arch/s390/mm/fault.c @@ -38,17 +38,18 @@ #include <asm/irq.h> #include <asm/mmu_context.h> #include <asm/facility.h> +#include <asm/uv.h> #include "../kernel/entry.h" #define __FAIL_ADDR_MASK -4096L #define __SUBCODE_MASK 0x0600 #define __PF_RES_FIELD 0x8000000000000000ULL -#define VM_FAULT_BADCONTEXT 0x010000 -#define VM_FAULT_BADMAP 0x020000 -#define VM_FAULT_BADACCESS 0x040000 -#define VM_FAULT_SIGNAL 0x080000 -#define VM_FAULT_PFAULT 0x100000 +#define VM_FAULT_BADCONTEXT ((__force vm_fault_t) 0x010000) +#define VM_FAULT_BADMAP ((__force vm_fault_t) 0x020000) +#define VM_FAULT_BADACCESS ((__force vm_fault_t) 0x040000) +#define VM_FAULT_SIGNAL ((__force vm_fault_t) 0x080000) +#define VM_FAULT_PFAULT ((__force vm_fault_t) 0x100000) enum fault_type { KERNEL_FAULT, @@ -122,7 +123,7 @@ static void dump_pagetable(unsigned long asce, unsigned long address) if (*table & _REGION_ENTRY_INVALID) goto out; table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN); - /* fallthrough */ + fallthrough; case _ASCE_TYPE_REGION2: table += (address & _REGION2_INDEX) >> _REGION2_SHIFT; if (bad_address(table)) @@ -131,7 +132,7 @@ static void dump_pagetable(unsigned long asce, unsigned long address) if (*table & _REGION_ENTRY_INVALID) goto out; table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN); - /* fallthrough */ + fallthrough; case _ASCE_TYPE_REGION3: table += (address & _REGION3_INDEX) >> _REGION3_SHIFT; if (bad_address(table)) @@ -140,7 +141,7 @@ static void dump_pagetable(unsigned long asce, unsigned long address) if (*table & (_REGION_ENTRY_INVALID | _REGION3_ENTRY_LARGE)) goto out; table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN); - /* fallthrough */ + fallthrough; case _ASCE_TYPE_SEGMENT: table += (address & _SEGMENT_INDEX) >> _SEGMENT_SHIFT; if (bad_address(table)) @@ -327,7 +328,7 @@ static noinline void do_fault_error(struct pt_regs *regs, int access, case VM_FAULT_BADACCESS: if (access == VM_EXEC && signal_return(regs) == 0) break; - /* fallthrough */ + fallthrough; case VM_FAULT_BADMAP: /* Bad memory access. Check if it is kernel or user space. */ if (user_mode(regs)) { @@ -337,9 +338,8 @@ static noinline void do_fault_error(struct pt_regs *regs, int access, do_sigsegv(regs, si_code); break; } - /* fallthrough */ + fallthrough; case VM_FAULT_BADCONTEXT: - /* fallthrough */ case VM_FAULT_PFAULT: do_no_context(regs); break; @@ -429,7 +429,7 @@ static inline vm_fault_t do_exception(struct pt_regs *regs, int access) address = trans_exc_code & __FAIL_ADDR_MASK; perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address); - flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; + flags = FAULT_FLAG_DEFAULT; if (user_mode(regs)) flags |= FAULT_FLAG_USER; if (access == VM_WRITE || (trans_exc_code & store_indication) == 0x400) @@ -480,8 +480,7 @@ retry: * the fault. */ fault = handle_mm_fault(vma, address, flags); - /* No reason to continue if interrupted by SIGKILL. */ - if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current)) { + if (fault_signal_pending(fault, regs)) { fault = VM_FAULT_SIGNAL; if (flags & FAULT_FLAG_RETRY_NOWAIT) goto out_up; @@ -514,10 +513,7 @@ retry: fault = VM_FAULT_PFAULT; goto out_up; } - /* Clear FAULT_FLAG_ALLOW_RETRY to avoid any risk - * of starvation. */ - flags &= ~(FAULT_FLAG_ALLOW_RETRY | - FAULT_FLAG_RETRY_NOWAIT); + flags &= ~FAULT_FLAG_RETRY_NOWAIT; flags |= FAULT_FLAG_TRIED; down_read(&mm->mmap_sem); goto retry; @@ -816,3 +812,80 @@ out_extint: early_initcall(pfault_irq_init); #endif /* CONFIG_PFAULT */ + +#if IS_ENABLED(CONFIG_PGSTE) +void do_secure_storage_access(struct pt_regs *regs) +{ + unsigned long addr = regs->int_parm_long & __FAIL_ADDR_MASK; + struct vm_area_struct *vma; + struct mm_struct *mm; + struct page *page; + int rc; + + switch (get_fault_type(regs)) { + case USER_FAULT: + mm = current->mm; + down_read(&mm->mmap_sem); + vma = find_vma(mm, addr); + if (!vma) { + up_read(&mm->mmap_sem); + do_fault_error(regs, VM_READ | VM_WRITE, VM_FAULT_BADMAP); + break; + } + page = follow_page(vma, addr, FOLL_WRITE | FOLL_GET); + if (IS_ERR_OR_NULL(page)) { + up_read(&mm->mmap_sem); + break; + } + if (arch_make_page_accessible(page)) + send_sig(SIGSEGV, current, 0); + put_page(page); + up_read(&mm->mmap_sem); + break; + case KERNEL_FAULT: + page = phys_to_page(addr); + if (unlikely(!try_get_page(page))) + break; + rc = arch_make_page_accessible(page); + put_page(page); + if (rc) + BUG(); + break; + case VDSO_FAULT: + /* fallthrough */ + case GMAP_FAULT: + /* fallthrough */ + default: + do_fault_error(regs, VM_READ | VM_WRITE, VM_FAULT_BADMAP); + WARN_ON_ONCE(1); + } +} +NOKPROBE_SYMBOL(do_secure_storage_access); + +void do_non_secure_storage_access(struct pt_regs *regs) +{ + unsigned long gaddr = regs->int_parm_long & __FAIL_ADDR_MASK; + struct gmap *gmap = (struct gmap *)S390_lowcore.gmap; + + if (get_fault_type(regs) != GMAP_FAULT) { + do_fault_error(regs, VM_READ | VM_WRITE, VM_FAULT_BADMAP); + WARN_ON_ONCE(1); + return; + } + + if (gmap_convert_to_secure(gmap, gaddr) == -EINVAL) + send_sig(SIGSEGV, current, 0); +} +NOKPROBE_SYMBOL(do_non_secure_storage_access); + +#else +void do_secure_storage_access(struct pt_regs *regs) +{ + default_trap_handler(regs); +} + +void do_non_secure_storage_access(struct pt_regs *regs) +{ + default_trap_handler(regs); +} +#endif diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c index edcdca97e85e..2fbece47ef6f 100644 --- a/arch/s390/mm/gmap.c +++ b/arch/s390/mm/gmap.c @@ -804,7 +804,7 @@ static inline unsigned long *gmap_table_walk(struct gmap *gmap, if (*table & _REGION_ENTRY_INVALID) return NULL; table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN); - /* Fallthrough */ + fallthrough; case _ASCE_TYPE_REGION2: table += (gaddr & _REGION2_INDEX) >> _REGION2_SHIFT; if (level == 3) @@ -812,7 +812,7 @@ static inline unsigned long *gmap_table_walk(struct gmap *gmap, if (*table & _REGION_ENTRY_INVALID) return NULL; table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN); - /* Fallthrough */ + fallthrough; case _ASCE_TYPE_REGION3: table += (gaddr & _REGION3_INDEX) >> _REGION3_SHIFT; if (level == 2) @@ -820,7 +820,7 @@ static inline unsigned long *gmap_table_walk(struct gmap *gmap, if (*table & _REGION_ENTRY_INVALID) return NULL; table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN); - /* Fallthrough */ + fallthrough; case _ASCE_TYPE_SEGMENT: table += (gaddr & _SEGMENT_INDEX) >> _SEGMENT_SHIFT; if (level == 1) @@ -2548,6 +2548,23 @@ int s390_enable_sie(void) } EXPORT_SYMBOL_GPL(s390_enable_sie); +int gmap_mark_unmergeable(void) +{ + struct mm_struct *mm = current->mm; + struct vm_area_struct *vma; + int ret; + + for (vma = mm->mmap; vma; vma = vma->vm_next) { + ret = ksm_madvise(vma, vma->vm_start, vma->vm_end, + MADV_UNMERGEABLE, &vma->vm_flags); + if (ret) + return ret; + } + mm->def_flags &= ~VM_MERGEABLE; + return 0; +} +EXPORT_SYMBOL_GPL(gmap_mark_unmergeable); + /* * Enable storage key handling from now on and initialize the storage * keys with the default key. @@ -2593,7 +2610,6 @@ static const struct mm_walk_ops enable_skey_walk_ops = { int s390_enable_skey(void) { struct mm_struct *mm = current->mm; - struct vm_area_struct *vma; int rc = 0; down_write(&mm->mmap_sem); @@ -2601,16 +2617,11 @@ int s390_enable_skey(void) goto out_up; mm->context.uses_skeys = 1; - for (vma = mm->mmap; vma; vma = vma->vm_next) { - if (ksm_madvise(vma, vma->vm_start, vma->vm_end, - MADV_UNMERGEABLE, &vma->vm_flags)) { - mm->context.uses_skeys = 0; - rc = -ENOMEM; - goto out_up; - } + rc = gmap_mark_unmergeable(); + if (rc) { + mm->context.uses_skeys = 0; + goto out_up; } - mm->def_flags &= ~VM_MERGEABLE; - walk_page_range(mm, 0, TASK_SIZE, &enable_skey_walk_ops, NULL); out_up: @@ -2640,3 +2651,38 @@ void s390_reset_cmma(struct mm_struct *mm) up_write(&mm->mmap_sem); } EXPORT_SYMBOL_GPL(s390_reset_cmma); + +/* + * make inaccessible pages accessible again + */ +static int __s390_reset_acc(pte_t *ptep, unsigned long addr, + unsigned long next, struct mm_walk *walk) +{ + pte_t pte = READ_ONCE(*ptep); + + if (pte_present(pte)) + WARN_ON_ONCE(uv_convert_from_secure(pte_val(pte) & PAGE_MASK)); + return 0; +} + +static const struct mm_walk_ops reset_acc_walk_ops = { + .pte_entry = __s390_reset_acc, +}; + +#include <linux/sched/mm.h> +void s390_reset_acc(struct mm_struct *mm) +{ + /* + * we might be called during + * reset: we walk the pages and clear + * close of all kvm file descriptors: we walk the pages and clear + * exit of process on fd closure: vma already gone, do nothing + */ + if (!mmget_not_zero(mm)) + return; + down_read(&mm->mmap_sem); + walk_page_range(mm, 0, TASK_SIZE, &reset_acc_walk_ops, NULL); + up_read(&mm->mmap_sem); + mmput(mm); +} +EXPORT_SYMBOL_GPL(s390_reset_acc); diff --git a/arch/s390/mm/hugetlbpage.c b/arch/s390/mm/hugetlbpage.c index 5674710a4841..f01daddcbc5e 100644 --- a/arch/s390/mm/hugetlbpage.c +++ b/arch/s390/mm/hugetlbpage.c @@ -326,7 +326,6 @@ unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr, struct hstate *h = hstate_file(file); struct mm_struct *mm = current->mm; struct vm_area_struct *vma; - int rc; if (len & ~huge_page_mask(h)) return -EINVAL; @@ -353,15 +352,9 @@ unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr, else addr = hugetlb_get_unmapped_area_topdown(file, addr, len, pgoff, flags); - if (addr & ~PAGE_MASK) + if (offset_in_page(addr)) return addr; check_asce_limit: - if (addr + len > current->mm->context.asce_limit && - addr + len <= TASK_SIZE) { - rc = crst_table_upgrade(mm, addr + len); - if (rc) - return (unsigned long) rc; - } - return addr; + return check_asce_limit(mm, addr, len); } diff --git a/arch/s390/mm/mmap.c b/arch/s390/mm/mmap.c index cbc718ba6d78..1b78f630a9ca 100644 --- a/arch/s390/mm/mmap.c +++ b/arch/s390/mm/mmap.c @@ -72,14 +72,13 @@ static inline unsigned long mmap_base(unsigned long rnd, return PAGE_ALIGN(STACK_TOP - gap - rnd); } -unsigned long -arch_get_unmapped_area(struct file *filp, unsigned long addr, - unsigned long len, unsigned long pgoff, unsigned long flags) +unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr, + unsigned long len, unsigned long pgoff, + unsigned long flags) { struct mm_struct *mm = current->mm; struct vm_area_struct *vma; struct vm_unmapped_area_info info; - int rc; if (len > TASK_SIZE - mmap_min_addr) return -ENOMEM; @@ -105,30 +104,20 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr, info.align_mask = 0; info.align_offset = pgoff << PAGE_SHIFT; addr = vm_unmapped_area(&info); - if (addr & ~PAGE_MASK) + if (offset_in_page(addr)) return addr; check_asce_limit: - if (addr + len > current->mm->context.asce_limit && - addr + len <= TASK_SIZE) { - rc = crst_table_upgrade(mm, addr + len); - if (rc) - return (unsigned long) rc; - } - - return addr; + return check_asce_limit(mm, addr, len); } -unsigned long -arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, - const unsigned long len, const unsigned long pgoff, - const unsigned long flags) +unsigned long arch_get_unmapped_area_topdown(struct file *filp, unsigned long addr, + unsigned long len, unsigned long pgoff, + unsigned long flags) { struct vm_area_struct *vma; struct mm_struct *mm = current->mm; - unsigned long addr = addr0; struct vm_unmapped_area_info info; - int rc; /* requested length too big for entire address space */ if (len > TASK_SIZE - mmap_min_addr) @@ -163,25 +152,18 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, * can happen with large stack limits and large mmap() * allocations. */ - if (addr & ~PAGE_MASK) { + if (offset_in_page(addr)) { VM_BUG_ON(addr != -ENOMEM); info.flags = 0; info.low_limit = TASK_UNMAPPED_BASE; info.high_limit = TASK_SIZE; addr = vm_unmapped_area(&info); - if (addr & ~PAGE_MASK) + if (offset_in_page(addr)) return addr; } check_asce_limit: - if (addr + len > current->mm->context.asce_limit && - addr + len <= TASK_SIZE) { - rc = crst_table_upgrade(mm, addr + len); - if (rc) - return (unsigned long) rc; - } - - return addr; + return check_asce_limit(mm, addr, len); } /* diff --git a/arch/s390/mm/pageattr.c b/arch/s390/mm/pageattr.c index f8c6faab41f4..e22c06d5f206 100644 --- a/arch/s390/mm/pageattr.c +++ b/arch/s390/mm/pageattr.c @@ -367,20 +367,4 @@ void __kernel_map_pages(struct page *page, int numpages, int enable) } } -#ifdef CONFIG_HIBERNATION -bool kernel_page_present(struct page *page) -{ - unsigned long addr; - int cc; - - addr = page_to_phys(page); - asm volatile( - " lra %1,0(%1)\n" - " ipm %0\n" - " srl %0,28" - : "=d" (cc), "+a" (addr) : : "cc"); - return cc == 0; -} -#endif /* CONFIG_HIBERNATION */ - #endif /* CONFIG_DEBUG_PAGEALLOC */ diff --git a/arch/s390/mm/pgalloc.c b/arch/s390/mm/pgalloc.c index 3dd253f81a77..498c98a312f4 100644 --- a/arch/s390/mm/pgalloc.c +++ b/arch/s390/mm/pgalloc.c @@ -77,67 +77,65 @@ static void __crst_table_upgrade(void *arg) int crst_table_upgrade(struct mm_struct *mm, unsigned long end) { - unsigned long *table, *pgd; - int rc, notify; + unsigned long *pgd = NULL, *p4d = NULL, *__pgd; + unsigned long asce_limit = mm->context.asce_limit; /* upgrade should only happen from 3 to 4, 3 to 5, or 4 to 5 levels */ - VM_BUG_ON(mm->context.asce_limit < _REGION2_SIZE); - rc = 0; - notify = 0; - while (mm->context.asce_limit < end) { - table = crst_table_alloc(mm); - if (!table) { - rc = -ENOMEM; - break; - } - spin_lock_bh(&mm->page_table_lock); - pgd = (unsigned long *) mm->pgd; - if (mm->context.asce_limit == _REGION2_SIZE) { - crst_table_init(table, _REGION2_ENTRY_EMPTY); - p4d_populate(mm, (p4d_t *) table, (pud_t *) pgd); - mm->pgd = (pgd_t *) table; - mm->context.asce_limit = _REGION1_SIZE; - mm->context.asce = __pa(mm->pgd) | _ASCE_TABLE_LENGTH | - _ASCE_USER_BITS | _ASCE_TYPE_REGION2; - mm_inc_nr_puds(mm); - } else { - crst_table_init(table, _REGION1_ENTRY_EMPTY); - pgd_populate(mm, (pgd_t *) table, (p4d_t *) pgd); - mm->pgd = (pgd_t *) table; - mm->context.asce_limit = -PAGE_SIZE; - mm->context.asce = __pa(mm->pgd) | _ASCE_TABLE_LENGTH | - _ASCE_USER_BITS | _ASCE_TYPE_REGION1; - } - notify = 1; - spin_unlock_bh(&mm->page_table_lock); - } - if (notify) - on_each_cpu(__crst_table_upgrade, mm, 0); - return rc; -} + VM_BUG_ON(asce_limit < _REGION2_SIZE); -void crst_table_downgrade(struct mm_struct *mm) -{ - pgd_t *pgd; + if (end <= asce_limit) + return 0; - /* downgrade should only happen from 3 to 2 levels (compat only) */ - VM_BUG_ON(mm->context.asce_limit != _REGION2_SIZE); + if (asce_limit == _REGION2_SIZE) { + p4d = crst_table_alloc(mm); + if (unlikely(!p4d)) + goto err_p4d; + crst_table_init(p4d, _REGION2_ENTRY_EMPTY); + } + if (end > _REGION1_SIZE) { + pgd = crst_table_alloc(mm); + if (unlikely(!pgd)) + goto err_pgd; + crst_table_init(pgd, _REGION1_ENTRY_EMPTY); + } - if (current->active_mm == mm) { - clear_user_asce(); - __tlb_flush_mm(mm); + spin_lock_bh(&mm->page_table_lock); + + /* + * This routine gets called with mmap_sem lock held and there is + * no reason to optimize for the case of otherwise. However, if + * that would ever change, the below check will let us know. + */ + VM_BUG_ON(asce_limit != mm->context.asce_limit); + + if (p4d) { + __pgd = (unsigned long *) mm->pgd; + p4d_populate(mm, (p4d_t *) p4d, (pud_t *) __pgd); + mm->pgd = (pgd_t *) p4d; + mm->context.asce_limit = _REGION1_SIZE; + mm->context.asce = __pa(mm->pgd) | _ASCE_TABLE_LENGTH | + _ASCE_USER_BITS | _ASCE_TYPE_REGION2; + mm_inc_nr_puds(mm); + } + if (pgd) { + __pgd = (unsigned long *) mm->pgd; + pgd_populate(mm, (pgd_t *) pgd, (p4d_t *) __pgd); + mm->pgd = (pgd_t *) pgd; + mm->context.asce_limit = TASK_SIZE_MAX; + mm->context.asce = __pa(mm->pgd) | _ASCE_TABLE_LENGTH | + _ASCE_USER_BITS | _ASCE_TYPE_REGION1; } - pgd = mm->pgd; - mm_dec_nr_pmds(mm); - mm->pgd = (pgd_t *) (pgd_val(*pgd) & _REGION_ENTRY_ORIGIN); - mm->context.asce_limit = _REGION3_SIZE; - mm->context.asce = __pa(mm->pgd) | _ASCE_TABLE_LENGTH | - _ASCE_USER_BITS | _ASCE_TYPE_SEGMENT; - crst_table_free(mm, (unsigned long *) pgd); + spin_unlock_bh(&mm->page_table_lock); - if (current->active_mm == mm) - set_user_asce(mm); + on_each_cpu(__crst_table_upgrade, mm, 0); + + return 0; + +err_pgd: + crst_table_free(mm, p4d); +err_p4d: + return -ENOMEM; } static inline unsigned int atomic_xor_bits(atomic_t *v, unsigned int bits) @@ -304,7 +302,7 @@ void __tlb_remove_table(void *_table) mask >>= 24; if (mask != 0) break; - /* fallthrough */ + fallthrough; case 3: /* 4K page table with pgstes */ if (mask & 3) atomic_xor_bits(&page->_refcount, 3 << 24); @@ -529,7 +527,7 @@ void base_asce_free(unsigned long asce) base_region2_walk(table, 0, _REGION1_SIZE, 0); break; case _ASCE_TYPE_REGION1: - base_region1_walk(table, 0, -_PAGE_SIZE, 0); + base_region1_walk(table, 0, TASK_SIZE_MAX, 0); break; } base_crst_free(table); diff --git a/arch/s390/mm/vmem.c b/arch/s390/mm/vmem.c index b403fa14847d..f810930aff42 100644 --- a/arch/s390/mm/vmem.c +++ b/arch/s390/mm/vmem.c @@ -415,6 +415,10 @@ void __init vmem_map_init(void) SET_MEMORY_RO | SET_MEMORY_X); __set_memory(__stext_dma, (__etext_dma - __stext_dma) >> PAGE_SHIFT, SET_MEMORY_RO | SET_MEMORY_X); + + /* we need lowcore executable for our LPSWE instructions */ + set_memory_x(0, 1); + pr_info("Write protected kernel read-only data: %luk\n", (unsigned long)(__end_rodata - _stext) >> 10); } diff --git a/arch/s390/numa/Makefile b/arch/s390/numa/Makefile index 66c2dff74895..c89d26f4f77d 100644 --- a/arch/s390/numa/Makefile +++ b/arch/s390/numa/Makefile @@ -1,4 +1,2 @@ # SPDX-License-Identifier: GPL-2.0 obj-y += numa.o -obj-y += toptree.o -obj-$(CONFIG_NUMA_EMU) += mode_emu.o diff --git a/arch/s390/numa/mode_emu.c b/arch/s390/numa/mode_emu.c deleted file mode 100644 index 72d742bb2d17..000000000000 --- a/arch/s390/numa/mode_emu.c +++ /dev/null @@ -1,577 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * NUMA support for s390 - * - * NUMA emulation (aka fake NUMA) distributes the available memory to nodes - * without using real topology information about the physical memory of the - * machine. - * - * It distributes the available CPUs to nodes while respecting the original - * machine topology information. This is done by trying to avoid to separate - * CPUs which reside on the same book or even on the same MC. - * - * Because the current Linux scheduler code requires a stable cpu to node - * mapping, cores are pinned to nodes when the first CPU thread is set online. - * - * Copyright IBM Corp. 2015 - */ - -#define KMSG_COMPONENT "numa_emu" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt - -#include <linux/kernel.h> -#include <linux/cpumask.h> -#include <linux/memblock.h> -#include <linux/node.h> -#include <linux/memory.h> -#include <linux/slab.h> -#include <asm/smp.h> -#include <asm/topology.h> -#include "numa_mode.h" -#include "toptree.h" - -/* Distances between the different system components */ -#define DIST_EMPTY 0 -#define DIST_CORE 1 -#define DIST_MC 2 -#define DIST_BOOK 3 -#define DIST_DRAWER 4 -#define DIST_MAX 5 - -/* Node distance reported to common code */ -#define EMU_NODE_DIST 10 - -/* Node ID for free (not yet pinned) cores */ -#define NODE_ID_FREE -1 - -/* Different levels of toptree */ -enum toptree_level {CORE, MC, BOOK, DRAWER, NODE, TOPOLOGY}; - -/* The two toptree IDs */ -enum {TOPTREE_ID_PHYS, TOPTREE_ID_NUMA}; - -/* Number of NUMA nodes */ -static int emu_nodes = 1; -/* NUMA stripe size */ -static unsigned long emu_size; - -/* - * Node to core pinning information updates are protected by - * "sched_domains_mutex". - */ -static struct { - s32 to_node_id[CONFIG_NR_CPUS]; /* Pinned core to node mapping */ - int total; /* Total number of pinned cores */ - int per_node_target; /* Cores per node without extra cores */ - int per_node[MAX_NUMNODES]; /* Number of cores pinned to node */ -} *emu_cores; - -/* - * Pin a core to a node - */ -static void pin_core_to_node(int core_id, int node_id) -{ - if (emu_cores->to_node_id[core_id] == NODE_ID_FREE) { - emu_cores->per_node[node_id]++; - emu_cores->to_node_id[core_id] = node_id; - emu_cores->total++; - } else { - WARN_ON(emu_cores->to_node_id[core_id] != node_id); - } -} - -/* - * Number of pinned cores of a node - */ -static int cores_pinned(struct toptree *node) -{ - return emu_cores->per_node[node->id]; -} - -/* - * ID of the node where the core is pinned (or NODE_ID_FREE) - */ -static int core_pinned_to_node_id(struct toptree *core) -{ - return emu_cores->to_node_id[core->id]; -} - -/* - * Number of cores in the tree that are not yet pinned - */ -static int cores_free(struct toptree *tree) -{ - struct toptree *core; - int count = 0; - - toptree_for_each(core, tree, CORE) { - if (core_pinned_to_node_id(core) == NODE_ID_FREE) - count++; - } - return count; -} - -/* - * Return node of core - */ -static struct toptree *core_node(struct toptree *core) -{ - return core->parent->parent->parent->parent; -} - -/* - * Return drawer of core - */ -static struct toptree *core_drawer(struct toptree *core) -{ - return core->parent->parent->parent; -} - -/* - * Return book of core - */ -static struct toptree *core_book(struct toptree *core) -{ - return core->parent->parent; -} - -/* - * Return mc of core - */ -static struct toptree *core_mc(struct toptree *core) -{ - return core->parent; -} - -/* - * Distance between two cores - */ -static int dist_core_to_core(struct toptree *core1, struct toptree *core2) -{ - if (core_drawer(core1)->id != core_drawer(core2)->id) - return DIST_DRAWER; - if (core_book(core1)->id != core_book(core2)->id) - return DIST_BOOK; - if (core_mc(core1)->id != core_mc(core2)->id) - return DIST_MC; - /* Same core or sibling on same MC */ - return DIST_CORE; -} - -/* - * Distance of a node to a core - */ -static int dist_node_to_core(struct toptree *node, struct toptree *core) -{ - struct toptree *core_node; - int dist_min = DIST_MAX; - - toptree_for_each(core_node, node, CORE) - dist_min = min(dist_min, dist_core_to_core(core_node, core)); - return dist_min == DIST_MAX ? DIST_EMPTY : dist_min; -} - -/* - * Unify will delete empty nodes, therefore recreate nodes. - */ -static void toptree_unify_tree(struct toptree *tree) -{ - int nid; - - toptree_unify(tree); - for (nid = 0; nid < emu_nodes; nid++) - toptree_get_child(tree, nid); -} - -/* - * Find the best/nearest node for a given core and ensure that no node - * gets more than "emu_cores->per_node_target + extra" cores. - */ -static struct toptree *node_for_core(struct toptree *numa, struct toptree *core, - int extra) -{ - struct toptree *node, *node_best = NULL; - int dist_cur, dist_best, cores_target; - - cores_target = emu_cores->per_node_target + extra; - dist_best = DIST_MAX; - node_best = NULL; - toptree_for_each(node, numa, NODE) { - /* Already pinned cores must use their nodes */ - if (core_pinned_to_node_id(core) == node->id) { - node_best = node; - break; - } - /* Skip nodes that already have enough cores */ - if (cores_pinned(node) >= cores_target) - continue; - dist_cur = dist_node_to_core(node, core); - if (dist_cur < dist_best) { - dist_best = dist_cur; - node_best = node; - } - } - return node_best; -} - -/* - * Find the best node for each core with respect to "extra" core count - */ -static void toptree_to_numa_single(struct toptree *numa, struct toptree *phys, - int extra) -{ - struct toptree *node, *core, *tmp; - - toptree_for_each_safe(core, tmp, phys, CORE) { - node = node_for_core(numa, core, extra); - if (!node) - return; - toptree_move(core, node); - pin_core_to_node(core->id, node->id); - } -} - -/* - * Move structures of given level to specified NUMA node - */ -static void move_level_to_numa_node(struct toptree *node, struct toptree *phys, - enum toptree_level level, bool perfect) -{ - int cores_free, cores_target = emu_cores->per_node_target; - struct toptree *cur, *tmp; - - toptree_for_each_safe(cur, tmp, phys, level) { - cores_free = cores_target - toptree_count(node, CORE); - if (perfect) { - if (cores_free == toptree_count(cur, CORE)) - toptree_move(cur, node); - } else { - if (cores_free >= toptree_count(cur, CORE)) - toptree_move(cur, node); - } - } -} - -/* - * Move structures of a given level to NUMA nodes. If "perfect" is specified - * move only perfectly fitting structures. Otherwise move also smaller - * than needed structures. - */ -static void move_level_to_numa(struct toptree *numa, struct toptree *phys, - enum toptree_level level, bool perfect) -{ - struct toptree *node; - - toptree_for_each(node, numa, NODE) - move_level_to_numa_node(node, phys, level, perfect); -} - -/* - * For the first run try to move the big structures - */ -static void toptree_to_numa_first(struct toptree *numa, struct toptree *phys) -{ - struct toptree *core; - - /* Always try to move perfectly fitting structures first */ - move_level_to_numa(numa, phys, DRAWER, true); - move_level_to_numa(numa, phys, DRAWER, false); - move_level_to_numa(numa, phys, BOOK, true); - move_level_to_numa(numa, phys, BOOK, false); - move_level_to_numa(numa, phys, MC, true); - move_level_to_numa(numa, phys, MC, false); - /* Now pin all the moved cores */ - toptree_for_each(core, numa, CORE) - pin_core_to_node(core->id, core_node(core)->id); -} - -/* - * Allocate new topology and create required nodes - */ -static struct toptree *toptree_new(int id, int nodes) -{ - struct toptree *tree; - int nid; - - tree = toptree_alloc(TOPOLOGY, id); - if (!tree) - goto fail; - for (nid = 0; nid < nodes; nid++) { - if (!toptree_get_child(tree, nid)) - goto fail; - } - return tree; -fail: - panic("NUMA emulation could not allocate topology"); -} - -/* - * Allocate and initialize core to node mapping - */ -static void __ref create_core_to_node_map(void) -{ - int i; - - emu_cores = memblock_alloc(sizeof(*emu_cores), 8); - if (!emu_cores) - panic("%s: Failed to allocate %zu bytes align=0x%x\n", - __func__, sizeof(*emu_cores), 8); - for (i = 0; i < ARRAY_SIZE(emu_cores->to_node_id); i++) - emu_cores->to_node_id[i] = NODE_ID_FREE; -} - -/* - * Move cores from physical topology into NUMA target topology - * and try to keep as much of the physical topology as possible. - */ -static struct toptree *toptree_to_numa(struct toptree *phys) -{ - static int first = 1; - struct toptree *numa; - int cores_total; - - cores_total = emu_cores->total + cores_free(phys); - emu_cores->per_node_target = cores_total / emu_nodes; - numa = toptree_new(TOPTREE_ID_NUMA, emu_nodes); - if (first) { - toptree_to_numa_first(numa, phys); - first = 0; - } - toptree_to_numa_single(numa, phys, 0); - toptree_to_numa_single(numa, phys, 1); - toptree_unify_tree(numa); - - WARN_ON(cpumask_weight(&phys->mask)); - return numa; -} - -/* - * Create a toptree out of the physical topology that we got from the hypervisor - */ -static struct toptree *toptree_from_topology(void) -{ - struct toptree *phys, *node, *drawer, *book, *mc, *core; - struct cpu_topology_s390 *top; - int cpu; - - phys = toptree_new(TOPTREE_ID_PHYS, 1); - - for_each_cpu(cpu, &cpus_with_topology) { - top = &cpu_topology[cpu]; - node = toptree_get_child(phys, 0); - drawer = toptree_get_child(node, top->drawer_id); - book = toptree_get_child(drawer, top->book_id); - mc = toptree_get_child(book, top->socket_id); - core = toptree_get_child(mc, smp_get_base_cpu(cpu)); - if (!drawer || !book || !mc || !core) - panic("NUMA emulation could not allocate memory"); - cpumask_set_cpu(cpu, &core->mask); - toptree_update_mask(mc); - } - return phys; -} - -/* - * Add toptree core to topology and create correct CPU masks - */ -static void topology_add_core(struct toptree *core) -{ - struct cpu_topology_s390 *top; - int cpu; - - for_each_cpu(cpu, &core->mask) { - top = &cpu_topology[cpu]; - cpumask_copy(&top->thread_mask, &core->mask); - cpumask_copy(&top->core_mask, &core_mc(core)->mask); - cpumask_copy(&top->book_mask, &core_book(core)->mask); - cpumask_copy(&top->drawer_mask, &core_drawer(core)->mask); - cpumask_set_cpu(cpu, &node_to_cpumask_map[core_node(core)->id]); - top->node_id = core_node(core)->id; - } -} - -/* - * Apply toptree to topology and create CPU masks - */ -static void toptree_to_topology(struct toptree *numa) -{ - struct toptree *core; - int i; - - /* Clear all node masks */ - for (i = 0; i < MAX_NUMNODES; i++) - cpumask_clear(&node_to_cpumask_map[i]); - - /* Rebuild all masks */ - toptree_for_each(core, numa, CORE) - topology_add_core(core); -} - -/* - * Show the node to core mapping - */ -static void print_node_to_core_map(void) -{ - int nid, cid; - - if (!numa_debug_enabled) - return; - printk(KERN_DEBUG "NUMA node to core mapping\n"); - for (nid = 0; nid < emu_nodes; nid++) { - printk(KERN_DEBUG " node %3d: ", nid); - for (cid = 0; cid < ARRAY_SIZE(emu_cores->to_node_id); cid++) { - if (emu_cores->to_node_id[cid] == nid) - printk(KERN_CONT "%d ", cid); - } - printk(KERN_CONT "\n"); - } -} - -static void pin_all_possible_cpus(void) -{ - int core_id, node_id, cpu; - static int initialized; - - if (initialized) - return; - print_node_to_core_map(); - node_id = 0; - for_each_possible_cpu(cpu) { - core_id = smp_get_base_cpu(cpu); - if (emu_cores->to_node_id[core_id] != NODE_ID_FREE) - continue; - pin_core_to_node(core_id, node_id); - cpu_topology[cpu].node_id = node_id; - node_id = (node_id + 1) % emu_nodes; - } - print_node_to_core_map(); - initialized = 1; -} - -/* - * Transfer physical topology into a NUMA topology and modify CPU masks - * according to the NUMA topology. - * - * Must be called with "sched_domains_mutex" lock held. - */ -static void emu_update_cpu_topology(void) -{ - struct toptree *phys, *numa; - - if (emu_cores == NULL) - create_core_to_node_map(); - phys = toptree_from_topology(); - numa = toptree_to_numa(phys); - toptree_free(phys); - toptree_to_topology(numa); - toptree_free(numa); - pin_all_possible_cpus(); -} - -/* - * If emu_size is not set, use CONFIG_EMU_SIZE. Then round to minimum - * alignment (needed for memory hotplug). - */ -static unsigned long emu_setup_size_adjust(unsigned long size) -{ - unsigned long size_new; - - size = size ? : CONFIG_EMU_SIZE; - size_new = roundup(size, memory_block_size_bytes()); - if (size_new == size) - return size; - pr_warn("Increasing memory stripe size from %ld MB to %ld MB\n", - size >> 20, size_new >> 20); - return size_new; -} - -/* - * If we have not enough memory for the specified nodes, reduce the node count. - */ -static int emu_setup_nodes_adjust(int nodes) -{ - int nodes_max; - - nodes_max = memblock.memory.total_size / emu_size; - nodes_max = max(nodes_max, 1); - if (nodes_max >= nodes) - return nodes; - pr_warn("Not enough memory for %d nodes, reducing node count\n", nodes); - return nodes_max; -} - -/* - * Early emu setup - */ -static void emu_setup(void) -{ - int nid; - - emu_size = emu_setup_size_adjust(emu_size); - emu_nodes = emu_setup_nodes_adjust(emu_nodes); - for (nid = 0; nid < emu_nodes; nid++) - node_set(nid, node_possible_map); - pr_info("Creating %d nodes with memory stripe size %ld MB\n", - emu_nodes, emu_size >> 20); -} - -/* - * Return node id for given page number - */ -static int emu_pfn_to_nid(unsigned long pfn) -{ - return (pfn / (emu_size >> PAGE_SHIFT)) % emu_nodes; -} - -/* - * Return stripe size - */ -static unsigned long emu_align(void) -{ - return emu_size; -} - -/* - * Return distance between two nodes - */ -static int emu_distance(int node1, int node2) -{ - return (node1 != node2) * EMU_NODE_DIST; -} - -/* - * Define callbacks for generic s390 NUMA infrastructure - */ -const struct numa_mode numa_mode_emu = { - .name = "emu", - .setup = emu_setup, - .update_cpu_topology = emu_update_cpu_topology, - .__pfn_to_nid = emu_pfn_to_nid, - .align = emu_align, - .distance = emu_distance, -}; - -/* - * Kernel parameter: emu_nodes=<n> - */ -static int __init early_parse_emu_nodes(char *p) -{ - int count; - - if (!p || kstrtoint(p, 0, &count) != 0 || count <= 0) - return 0; - emu_nodes = min(count, MAX_NUMNODES); - return 0; -} -early_param("emu_nodes", early_parse_emu_nodes); - -/* - * Kernel parameter: emu_size=[<n>[k|M|G|T]] - */ -static int __init early_parse_emu_size(char *p) -{ - if (p) - emu_size = memparse(p, NULL); - return 0; -} -early_param("emu_size", early_parse_emu_size); diff --git a/arch/s390/numa/numa.c b/arch/s390/numa/numa.c index d2910fa834c8..51c5a9f6e525 100644 --- a/arch/s390/numa/numa.c +++ b/arch/s390/numa/numa.c @@ -7,165 +7,36 @@ * Copyright IBM Corp. 2015 */ -#define KMSG_COMPONENT "numa" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt - #include <linux/kernel.h> #include <linux/mmzone.h> #include <linux/cpumask.h> #include <linux/memblock.h> -#include <linux/slab.h> #include <linux/node.h> - #include <asm/numa.h> -#include "numa_mode.h" -pg_data_t *node_data[MAX_NUMNODES]; +struct pglist_data *node_data[MAX_NUMNODES]; EXPORT_SYMBOL(node_data); -cpumask_t node_to_cpumask_map[MAX_NUMNODES]; -EXPORT_SYMBOL(node_to_cpumask_map); - -static void plain_setup(void) -{ - node_set(0, node_possible_map); -} - -const struct numa_mode numa_mode_plain = { - .name = "plain", - .setup = plain_setup, -}; - -static const struct numa_mode *mode = &numa_mode_plain; - -int numa_pfn_to_nid(unsigned long pfn) -{ - return mode->__pfn_to_nid ? mode->__pfn_to_nid(pfn) : 0; -} - -void numa_update_cpu_topology(void) -{ - if (mode->update_cpu_topology) - mode->update_cpu_topology(); -} - -int __node_distance(int a, int b) -{ - return mode->distance ? mode->distance(a, b) : 0; -} -EXPORT_SYMBOL(__node_distance); - -int numa_debug_enabled; - -/* - * numa_setup_memory() - Assign bootmem to nodes - * - * The memory is first added to memblock without any respect to nodes. - * This is fixed before remaining memblock memory is handed over to the - * buddy allocator. - * An important side effect is that large bootmem allocations might easily - * cross node boundaries, which can be needed for large allocations with - * smaller memory stripes in each node (i.e. when using NUMA emulation). - * - * Memory defines nodes: - * Therefore this routine also sets the nodes online with memory. - */ -static void __init numa_setup_memory(void) +void __init numa_setup(void) { - unsigned long cur_base, align, end_of_dram; - int nid = 0; - - end_of_dram = memblock_end_of_DRAM(); - align = mode->align ? mode->align() : ULONG_MAX; - - /* - * Step through all available memory and assign it to the nodes - * indicated by the mode implementation. - * All nodes which are seen here will be set online. - */ - cur_base = 0; - do { - nid = numa_pfn_to_nid(PFN_DOWN(cur_base)); - node_set_online(nid); - memblock_set_node(cur_base, align, &memblock.memory, nid); - cur_base += align; - } while (cur_base < end_of_dram); + int nid; - /* Allocate and fill out node_data */ + nodes_clear(node_possible_map); + node_set(0, node_possible_map); + node_set_online(0); for (nid = 0; nid < MAX_NUMNODES; nid++) { NODE_DATA(nid) = memblock_alloc(sizeof(pg_data_t), 8); if (!NODE_DATA(nid)) panic("%s: Failed to allocate %zu bytes align=0x%x\n", __func__, sizeof(pg_data_t), 8); } - - for_each_online_node(nid) { - unsigned long start_pfn, end_pfn; - unsigned long t_start, t_end; - int i; - - start_pfn = ULONG_MAX; - end_pfn = 0; - for_each_mem_pfn_range(i, nid, &t_start, &t_end, NULL) { - if (t_start < start_pfn) - start_pfn = t_start; - if (t_end > end_pfn) - end_pfn = t_end; - } - NODE_DATA(nid)->node_spanned_pages = end_pfn - start_pfn; - NODE_DATA(nid)->node_id = nid; - } -} - -/* - * numa_setup() - Earliest initialization - * - * Assign the mode and call the mode's setup routine. - */ -void __init numa_setup(void) -{ - pr_info("NUMA mode: %s\n", mode->name); - nodes_clear(node_possible_map); - /* Initially attach all possible CPUs to node 0. */ - cpumask_copy(&node_to_cpumask_map[0], cpu_possible_mask); - if (mode->setup) - mode->setup(); - numa_setup_memory(); - memblock_dump_all(); + NODE_DATA(0)->node_spanned_pages = memblock_end_of_DRAM() >> PAGE_SHIFT; + NODE_DATA(0)->node_id = 0; } -/* - * numa_init_late() - Initialization initcall - * - * Register NUMA nodes. - */ static int __init numa_init_late(void) { - int nid; - - for_each_online_node(nid) - register_one_node(nid); + register_one_node(0); return 0; } arch_initcall(numa_init_late); - -static int __init parse_debug(char *parm) -{ - numa_debug_enabled = 1; - return 0; -} -early_param("numa_debug", parse_debug); - -static int __init parse_numa(char *parm) -{ - if (!parm) - return 1; - if (strcmp(parm, numa_mode_plain.name) == 0) - mode = &numa_mode_plain; -#ifdef CONFIG_NUMA_EMU - if (strcmp(parm, numa_mode_emu.name) == 0) - mode = &numa_mode_emu; -#endif - return 0; -} -early_param("numa", parse_numa); diff --git a/arch/s390/numa/numa_mode.h b/arch/s390/numa/numa_mode.h deleted file mode 100644 index dfd3e2784081..000000000000 --- a/arch/s390/numa/numa_mode.h +++ /dev/null @@ -1,25 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * NUMA support for s390 - * - * Define declarations used for communication between NUMA mode - * implementations and NUMA core functionality. - * - * Copyright IBM Corp. 2015 - */ -#ifndef __S390_NUMA_MODE_H -#define __S390_NUMA_MODE_H - -struct numa_mode { - char *name; /* Name of mode */ - void (*setup)(void); /* Initizalize mode */ - void (*update_cpu_topology)(void); /* Called by topology code */ - int (*__pfn_to_nid)(unsigned long pfn); /* PFN to node ID */ - unsigned long (*align)(void); /* Minimum node alignment */ - int (*distance)(int a, int b); /* Distance between two nodes */ -}; - -extern const struct numa_mode numa_mode_plain; -extern const struct numa_mode numa_mode_emu; - -#endif /* __S390_NUMA_MODE_H */ diff --git a/arch/s390/numa/toptree.c b/arch/s390/numa/toptree.c deleted file mode 100644 index 71a608cd4f61..000000000000 --- a/arch/s390/numa/toptree.c +++ /dev/null @@ -1,351 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * NUMA support for s390 - * - * A tree structure used for machine topology mangling - * - * Copyright IBM Corp. 2015 - */ - -#include <linux/kernel.h> -#include <linux/memblock.h> -#include <linux/cpumask.h> -#include <linux/list.h> -#include <linux/list_sort.h> -#include <linux/slab.h> -#include <asm/numa.h> - -#include "toptree.h" - -/** - * toptree_alloc - Allocate and initialize a new tree node. - * @level: The node's vertical level; level 0 contains the leaves. - * @id: ID number, explicitly not unique beyond scope of node's siblings - * - * Allocate a new tree node and initialize it. - * - * RETURNS: - * Pointer to the new tree node or NULL on error - */ -struct toptree __ref *toptree_alloc(int level, int id) -{ - struct toptree *res; - - if (slab_is_available()) - res = kzalloc(sizeof(*res), GFP_KERNEL); - else - res = memblock_alloc(sizeof(*res), 8); - if (!res) - return res; - - INIT_LIST_HEAD(&res->children); - INIT_LIST_HEAD(&res->sibling); - cpumask_clear(&res->mask); - res->level = level; - res->id = id; - return res; -} - -/** - * toptree_remove - Remove a tree node from a tree - * @cand: Pointer to the node to remove - * - * The node is detached from its parent node. The parent node's - * masks will be updated to reflect the loss of the child. - */ -static void toptree_remove(struct toptree *cand) -{ - struct toptree *oldparent; - - list_del_init(&cand->sibling); - oldparent = cand->parent; - cand->parent = NULL; - toptree_update_mask(oldparent); -} - -/** - * toptree_free - discard a tree node - * @cand: Pointer to the tree node to discard - * - * Checks if @cand is attached to a parent node. Detaches it - * cleanly using toptree_remove. Possible children are freed - * recursively. In the end @cand itself is freed. - */ -void __ref toptree_free(struct toptree *cand) -{ - struct toptree *child, *tmp; - - if (cand->parent) - toptree_remove(cand); - toptree_for_each_child_safe(child, tmp, cand) - toptree_free(child); - if (slab_is_available()) - kfree(cand); - else - memblock_free_early((unsigned long)cand, sizeof(*cand)); -} - -/** - * toptree_update_mask - Update node bitmasks - * @cand: Pointer to a tree node - * - * The node's cpumask will be updated by combining all children's - * masks. Then toptree_update_mask is called recursively for the - * parent if applicable. - * - * NOTE: - * This must not be called on leaves. If called on a leaf, its - * CPU mask is cleared and lost. - */ -void toptree_update_mask(struct toptree *cand) -{ - struct toptree *child; - - cpumask_clear(&cand->mask); - list_for_each_entry(child, &cand->children, sibling) - cpumask_or(&cand->mask, &cand->mask, &child->mask); - if (cand->parent) - toptree_update_mask(cand->parent); -} - -/** - * toptree_insert - Insert a tree node into tree - * @cand: Pointer to the node to insert - * @target: Pointer to the node to which @cand will added as a child - * - * Insert a tree node into a tree. Masks will be updated automatically. - * - * RETURNS: - * 0 on success, -1 if NULL is passed as argument or the node levels - * don't fit. - */ -static int toptree_insert(struct toptree *cand, struct toptree *target) -{ - if (!cand || !target) - return -1; - if (target->level != (cand->level + 1)) - return -1; - list_add_tail(&cand->sibling, &target->children); - cand->parent = target; - toptree_update_mask(target); - return 0; -} - -/** - * toptree_move_children - Move all child nodes of a node to a new place - * @cand: Pointer to the node whose children are to be moved - * @target: Pointer to the node to which @cand's children will be attached - * - * Take all child nodes of @cand and move them using toptree_move. - */ -static void toptree_move_children(struct toptree *cand, struct toptree *target) -{ - struct toptree *child, *tmp; - - toptree_for_each_child_safe(child, tmp, cand) - toptree_move(child, target); -} - -/** - * toptree_unify - Merge children with same ID - * @cand: Pointer to node whose direct children should be made unique - * - * When mangling the tree it is possible that a node has two or more children - * which have the same ID. This routine merges these children into one and - * moves all children of the merged nodes into the unified node. - */ -void toptree_unify(struct toptree *cand) -{ - struct toptree *child, *tmp, *cand_copy; - - /* Threads cannot be split, cores are not split */ - if (cand->level < 2) - return; - - cand_copy = toptree_alloc(cand->level, 0); - toptree_for_each_child_safe(child, tmp, cand) { - struct toptree *tmpchild; - - if (!cpumask_empty(&child->mask)) { - tmpchild = toptree_get_child(cand_copy, child->id); - toptree_move_children(child, tmpchild); - } - toptree_free(child); - } - toptree_move_children(cand_copy, cand); - toptree_free(cand_copy); - - toptree_for_each_child(child, cand) - toptree_unify(child); -} - -/** - * toptree_move - Move a node to another context - * @cand: Pointer to the node to move - * @target: Pointer to the node where @cand should go - * - * In the easiest case @cand is exactly on the level below @target - * and will be immediately moved to the target. - * - * If @target's level is not the direct parent level of @cand, - * nodes for the missing levels are created and put between - * @cand and @target. The "stacking" nodes' IDs are taken from - * @cand's parents. - * - * After this it is likely to have redundant nodes in the tree - * which are addressed by means of toptree_unify. - */ -void toptree_move(struct toptree *cand, struct toptree *target) -{ - struct toptree *stack_target, *real_insert_point, *ptr, *tmp; - - if (cand->level + 1 == target->level) { - toptree_remove(cand); - toptree_insert(cand, target); - return; - } - - real_insert_point = NULL; - ptr = cand; - stack_target = NULL; - - do { - tmp = stack_target; - stack_target = toptree_alloc(ptr->level + 1, - ptr->parent->id); - toptree_insert(tmp, stack_target); - if (!real_insert_point) - real_insert_point = stack_target; - ptr = ptr->parent; - } while (stack_target->level < (target->level - 1)); - - toptree_remove(cand); - toptree_insert(cand, real_insert_point); - toptree_insert(stack_target, target); -} - -/** - * toptree_get_child - Access a tree node's child by its ID - * @cand: Pointer to tree node whose child is to access - * @id: The desired child's ID - * - * @cand's children are searched for a child with matching ID. - * If no match can be found, a new child with the desired ID - * is created and returned. - */ -struct toptree *toptree_get_child(struct toptree *cand, int id) -{ - struct toptree *child; - - toptree_for_each_child(child, cand) - if (child->id == id) - return child; - child = toptree_alloc(cand->level-1, id); - toptree_insert(child, cand); - return child; -} - -/** - * toptree_first - Find the first descendant on specified level - * @context: Pointer to tree node whose descendants are to be used - * @level: The level of interest - * - * RETURNS: - * @context's first descendant on the specified level, or NULL - * if there is no matching descendant - */ -struct toptree *toptree_first(struct toptree *context, int level) -{ - struct toptree *child, *tmp; - - if (context->level == level) - return context; - - if (!list_empty(&context->children)) { - list_for_each_entry(child, &context->children, sibling) { - tmp = toptree_first(child, level); - if (tmp) - return tmp; - } - } - return NULL; -} - -/** - * toptree_next_sibling - Return next sibling - * @cur: Pointer to a tree node - * - * RETURNS: - * If @cur has a parent and is not the last in the parent's children list, - * the next sibling is returned. Or NULL when there are no siblings left. - */ -static struct toptree *toptree_next_sibling(struct toptree *cur) -{ - if (cur->parent == NULL) - return NULL; - - if (cur == list_last_entry(&cur->parent->children, - struct toptree, sibling)) - return NULL; - return (struct toptree *) list_next_entry(cur, sibling); -} - -/** - * toptree_next - Tree traversal function - * @cur: Pointer to current element - * @context: Pointer to the root node of the tree or subtree to - * be traversed. - * @level: The level of interest. - * - * RETURNS: - * Pointer to the next node on level @level - * or NULL when there is no next node. - */ -struct toptree *toptree_next(struct toptree *cur, struct toptree *context, - int level) -{ - struct toptree *cur_context, *tmp; - - if (!cur) - return NULL; - - if (context->level == level) - return NULL; - - tmp = toptree_next_sibling(cur); - if (tmp != NULL) - return tmp; - - cur_context = cur; - while (cur_context->level < context->level - 1) { - /* Step up */ - cur_context = cur_context->parent; - /* Step aside */ - tmp = toptree_next_sibling(cur_context); - if (tmp != NULL) { - /* Step down */ - tmp = toptree_first(tmp, level); - if (tmp != NULL) - return tmp; - } - } - return NULL; -} - -/** - * toptree_count - Count descendants on specified level - * @context: Pointer to node whose descendants are to be considered - * @level: Only descendants on the specified level will be counted - * - * RETURNS: - * Number of descendants on the specified level - */ -int toptree_count(struct toptree *context, int level) -{ - struct toptree *cur; - int cnt = 0; - - toptree_for_each(cur, context, level) - cnt++; - return cnt; -} diff --git a/arch/s390/numa/toptree.h b/arch/s390/numa/toptree.h deleted file mode 100644 index 5246371ec713..000000000000 --- a/arch/s390/numa/toptree.h +++ /dev/null @@ -1,61 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * NUMA support for s390 - * - * A tree structure used for machine topology mangling - * - * Copyright IBM Corp. 2015 - */ -#ifndef S390_TOPTREE_H -#define S390_TOPTREE_H - -#include <linux/cpumask.h> -#include <linux/list.h> - -struct toptree { - int level; - int id; - cpumask_t mask; - struct toptree *parent; - struct list_head sibling; - struct list_head children; -}; - -struct toptree *toptree_alloc(int level, int id); -void toptree_free(struct toptree *cand); -void toptree_update_mask(struct toptree *cand); -void toptree_unify(struct toptree *cand); -struct toptree *toptree_get_child(struct toptree *cand, int id); -void toptree_move(struct toptree *cand, struct toptree *target); -int toptree_count(struct toptree *context, int level); - -struct toptree *toptree_first(struct toptree *context, int level); -struct toptree *toptree_next(struct toptree *cur, struct toptree *context, - int level); - -#define toptree_for_each_child(child, ptree) \ - list_for_each_entry(child, &ptree->children, sibling) - -#define toptree_for_each_child_safe(child, ptmp, ptree) \ - list_for_each_entry_safe(child, ptmp, &ptree->children, sibling) - -#define toptree_is_last(ptree) \ - ((ptree->parent == NULL) || \ - (ptree->parent->children.prev == &ptree->sibling)) - -#define toptree_for_each(ptree, cont, ttype) \ - for (ptree = toptree_first(cont, ttype); \ - ptree != NULL; \ - ptree = toptree_next(ptree, cont, ttype)) - -#define toptree_for_each_safe(ptree, tmp, cont, ttype) \ - for (ptree = toptree_first(cont, ttype), \ - tmp = toptree_next(ptree, cont, ttype); \ - ptree != NULL; \ - ptree = tmp, \ - tmp = toptree_next(ptree, cont, ttype)) - -#define toptree_for_each_sibling(ptree, start) \ - toptree_for_each(ptree, start->parent, start->level) - -#endif /* S390_TOPTREE_H */ diff --git a/arch/s390/pci/pci.c b/arch/s390/pci/pci.c index bc61ea18e88d..94ca121933de 100644 --- a/arch/s390/pci/pci.c +++ b/arch/s390/pci/pci.c @@ -40,8 +40,9 @@ static LIST_HEAD(zpci_list); static DEFINE_SPINLOCK(zpci_list_lock); -static DECLARE_BITMAP(zpci_domain, ZPCI_NR_DEVICES); +static DECLARE_BITMAP(zpci_domain, ZPCI_DOMAIN_BITMAP_SIZE); static DEFINE_SPINLOCK(zpci_domain_lock); +static unsigned int zpci_num_domains_allocated; #define ZPCI_IOMAP_ENTRIES \ min(((unsigned long) ZPCI_NR_DEVICES * PCI_STD_NUM_BARS / 2), \ @@ -424,7 +425,7 @@ static void zpci_map_resources(struct pci_dev *pdev) if (zpci_use_mio(zdev)) pdev->resource[i].start = - (resource_size_t __force) zdev->bars[i].mio_wb; + (resource_size_t __force) zdev->bars[i].mio_wt; else pdev->resource[i].start = (resource_size_t __force) pci_iomap_range_fh(pdev, i, 0, 0); @@ -531,7 +532,7 @@ static int zpci_setup_bus_resources(struct zpci_dev *zdev, flags |= IORESOURCE_MEM_64; if (zpci_use_mio(zdev)) - addr = (unsigned long) zdev->bars[i].mio_wb; + addr = (unsigned long) zdev->bars[i].mio_wt; else addr = ZPCI_ADDR(entry); size = 1UL << zdev->bars[i].size; @@ -607,57 +608,25 @@ void pcibios_disable_device(struct pci_dev *pdev) zpci_debug_exit_device(zdev); } -#ifdef CONFIG_HIBERNATE_CALLBACKS -static int zpci_restore(struct device *dev) -{ - struct pci_dev *pdev = to_pci_dev(dev); - struct zpci_dev *zdev = to_zpci(pdev); - int ret = 0; - - if (zdev->state != ZPCI_FN_STATE_ONLINE) - goto out; - - ret = clp_enable_fh(zdev, ZPCI_NR_DMA_SPACES); - if (ret) - goto out; - - zpci_map_resources(pdev); - zpci_register_ioat(zdev, 0, zdev->start_dma, zdev->end_dma, - (u64) zdev->dma_table); - -out: - return ret; -} - -static int zpci_freeze(struct device *dev) -{ - struct pci_dev *pdev = to_pci_dev(dev); - struct zpci_dev *zdev = to_zpci(pdev); - - if (zdev->state != ZPCI_FN_STATE_ONLINE) - return 0; - - zpci_unregister_ioat(zdev, 0); - zpci_unmap_resources(pdev); - return clp_disable_fh(zdev); -} - -struct dev_pm_ops pcibios_pm_ops = { - .thaw_noirq = zpci_restore, - .freeze_noirq = zpci_freeze, - .restore_noirq = zpci_restore, - .poweroff_noirq = zpci_freeze, -}; -#endif /* CONFIG_HIBERNATE_CALLBACKS */ - static int zpci_alloc_domain(struct zpci_dev *zdev) { + spin_lock(&zpci_domain_lock); + if (zpci_num_domains_allocated > (ZPCI_NR_DEVICES - 1)) { + spin_unlock(&zpci_domain_lock); + pr_err("Adding PCI function %08x failed because the configured limit of %d is reached\n", + zdev->fid, ZPCI_NR_DEVICES); + return -ENOSPC; + } + if (zpci_unique_uid) { zdev->domain = (u16) zdev->uid; - if (zdev->domain >= ZPCI_NR_DEVICES) - return 0; + if (zdev->domain == 0) { + pr_warn("UID checking is active but no UID is set for PCI function %08x, so automatic domain allocation is used instead\n", + zdev->fid); + update_uid_checking(false); + goto auto_allocate; + } - spin_lock(&zpci_domain_lock); if (test_bit(zdev->domain, zpci_domain)) { spin_unlock(&zpci_domain_lock); pr_err("Adding PCI function %08x failed because domain %04x is already assigned\n", @@ -665,30 +634,28 @@ static int zpci_alloc_domain(struct zpci_dev *zdev) return -EEXIST; } set_bit(zdev->domain, zpci_domain); + zpci_num_domains_allocated++; spin_unlock(&zpci_domain_lock); return 0; } - - spin_lock(&zpci_domain_lock); +auto_allocate: + /* + * We can always auto allocate domains below ZPCI_NR_DEVICES. + * There is either a free domain or we have reached the maximum in + * which case we would have bailed earlier. + */ zdev->domain = find_first_zero_bit(zpci_domain, ZPCI_NR_DEVICES); - if (zdev->domain == ZPCI_NR_DEVICES) { - spin_unlock(&zpci_domain_lock); - pr_err("Adding PCI function %08x failed because the configured limit of %d is reached\n", - zdev->fid, ZPCI_NR_DEVICES); - return -ENOSPC; - } set_bit(zdev->domain, zpci_domain); + zpci_num_domains_allocated++; spin_unlock(&zpci_domain_lock); return 0; } static void zpci_free_domain(struct zpci_dev *zdev) { - if (zdev->domain >= ZPCI_NR_DEVICES) - return; - spin_lock(&zpci_domain_lock); clear_bit(zdev->domain, zpci_domain); + zpci_num_domains_allocated--; spin_unlock(&zpci_domain_lock); } diff --git a/arch/s390/pci/pci_clp.c b/arch/s390/pci/pci_clp.c index 0d3d8f170ea4..ea794ae755ae 100644 --- a/arch/s390/pci/pci_clp.c +++ b/arch/s390/pci/pci_clp.c @@ -24,7 +24,7 @@ bool zpci_unique_uid; -static void update_uid_checking(bool new) +void update_uid_checking(bool new) { if (zpci_unique_uid != new) zpci_dbg(1, "uid checking:%d\n", new); diff --git a/arch/s390/purgatory/.gitignore b/arch/s390/purgatory/.gitignore index c82157f46b18..97ca52779457 100644 --- a/arch/s390/purgatory/.gitignore +++ b/arch/s390/purgatory/.gitignore @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0-only purgatory purgatory.chk purgatory.lds diff --git a/arch/s390/tools/.gitignore b/arch/s390/tools/.gitignore index 71bd6f8eebaf..ea62f37b79ef 100644 --- a/arch/s390/tools/.gitignore +++ b/arch/s390/tools/.gitignore @@ -1,2 +1,3 @@ +# SPDX-License-Identifier: GPL-2.0-only gen_facilities gen_opcode_table |