diff options
Diffstat (limited to 'arch/s390')
163 files changed, 9863 insertions, 6595 deletions
diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index bf24ab188921..deeadfa291ba 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -68,11 +68,12 @@ config DEBUG_RODATA config S390 def_bool y select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE - select ARCH_HAS_DEBUG_STRICT_USER_COPY_CHECKS select ARCH_HAS_DEVMEM_IS_ALLOWED select ARCH_HAS_ELF_RANDOMIZE select ARCH_HAS_GCOV_PROFILE_ALL + select ARCH_HAS_KCOV select ARCH_HAS_SG_CHAIN + select ARCH_HAS_UBSAN_SANITIZE_ALL select ARCH_HAVE_NMI_SAFE_CMPXCHG select ARCH_INLINE_READ_LOCK select ARCH_INLINE_READ_LOCK_BH @@ -107,7 +108,9 @@ config S390 select ARCH_SUPPORTS_NUMA_BALANCING select ARCH_USE_BUILTIN_BSWAP select ARCH_USE_CMPXCHG_LOCKREF + select ARCH_WANTS_DYNAMIC_TASK_STRUCT select ARCH_WANTS_PROT_NUMA_PROT_NONE + select ARCH_WANTS_UBSAN_NO_NULL select ARCH_WANT_IPC_PARSE_VERSION select BUILDTIME_EXTABLE_SORT select CLONE_BACKWARDS2 @@ -121,18 +124,22 @@ config S390 select HAVE_ALIGNED_STRUCT_PAGE if SLUB select HAVE_ARCH_AUDITSYSCALL select HAVE_ARCH_EARLY_PFN_TO_NID + select HAVE_ARCH_HARDENED_USERCOPY select HAVE_ARCH_JUMP_LABEL + select CPU_NO_EFFICIENT_FFS if !HAVE_MARCH_Z9_109_FEATURES select HAVE_ARCH_SECCOMP_FILTER select HAVE_ARCH_SOFT_DIRTY select HAVE_ARCH_TRACEHOOK select HAVE_ARCH_TRANSPARENT_HUGEPAGE - select HAVE_BPF_JIT if PACK_STACK && HAVE_MARCH_Z196_FEATURES + select HAVE_EBPF_JIT if PACK_STACK && HAVE_MARCH_Z196_FEATURES select HAVE_CMPXCHG_DOUBLE select HAVE_CMPXCHG_LOCAL select HAVE_DEBUG_KMEMLEAK select HAVE_DMA_API_DEBUG select HAVE_DYNAMIC_FTRACE select HAVE_DYNAMIC_FTRACE_WITH_REGS + select HAVE_EFFICIENT_UNALIGNED_ACCESS + select HAVE_EXIT_THREAD select HAVE_FTRACE_MCOUNT_RECORD select HAVE_FUNCTION_GRAPH_TRACER select HAVE_FUNCTION_TRACER @@ -160,10 +167,12 @@ config S390 select NO_BOOTMEM select OLD_SIGACTION select OLD_SIGSUSPEND3 + select SPARSE_IRQ select SYSCTL_EXCEPTION_TRACE select TTY select VIRT_CPU_ACCOUNTING select VIRT_TO_BUS + select HAVE_NMI config SCHED_OMIT_FRAME_POINTER @@ -210,7 +219,7 @@ config HAVE_MARCH_Z13_FEATURES choice prompt "Processor type" - default MARCH_Z900 + default MARCH_Z196 config MARCH_Z900 bool "IBM zSeries model z800 and z900" @@ -473,6 +482,9 @@ config SCHED_MC config SCHED_BOOK def_bool n +config SCHED_DRAWER + def_bool n + config SCHED_TOPOLOGY def_bool y prompt "Topology scheduler support" @@ -480,6 +492,7 @@ config SCHED_TOPOLOGY select SCHED_SMT select SCHED_MC select SCHED_BOOK + select SCHED_DRAWER help Topology scheduler support improves the CPU scheduler's decision making when dealing with machines that have multi-threading, @@ -601,16 +614,6 @@ config PCI_NR_FUNCTIONS This allows you to specify the maximum number of PCI functions which this kernel will support. -config PCI_NR_MSI - int "Maximum number of MSI interrupts (64-32768)" - range 64 32768 - default "256" - help - This defines the number of virtual interrupts the kernel will - provide for MSI interrupts. If you configure your system to have - too few drivers will fail to allocate MSI interrupts for all - PCI devices. - source "drivers/pci/Kconfig" endif # PCI @@ -871,4 +874,17 @@ config S390_GUEST Select this option if you want to run the kernel as a guest under the KVM hypervisor. +config S390_GUEST_OLD_TRANSPORT + def_bool y + prompt "Guest support for old s390 virtio transport (DEPRECATED)" + depends on S390_GUEST + help + Enable this option to add support for the old s390-virtio + transport (i.e. virtio devices NOT based on virtio-ccw). This + type of virtio devices is only available on the experimental + kuli userspace or with old (< 2.6) qemu. If you are running + with a modern version of qemu (which supports virtio-ccw since + 1.4 and uses it by default since version 2.4), you probably won't + need this. + endmenu diff --git a/arch/s390/Makefile b/arch/s390/Makefile index 224b42734f0d..54e00526b8df 100644 --- a/arch/s390/Makefile +++ b/arch/s390/Makefile @@ -46,6 +46,8 @@ cflags-$(CONFIG_MARCH_Z196_TUNE) += -mtune=z196 cflags-$(CONFIG_MARCH_ZEC12_TUNE) += -mtune=zEC12 cflags-$(CONFIG_MARCH_Z13_TUNE) += -mtune=z13 +cflags-y += -Wa,-I$(srctree)/arch/$(ARCH)/include + #KBUILD_IMAGE is necessary for make rpm KBUILD_IMAGE :=arch/s390/boot/image diff --git a/arch/s390/appldata/appldata_base.c b/arch/s390/appldata/appldata_base.c index 15c94246b600..f587c4811faf 100644 --- a/arch/s390/appldata/appldata_base.c +++ b/arch/s390/appldata/appldata_base.c @@ -542,7 +542,7 @@ static int __init appldata_init(void) rc = PTR_ERR(appldata_pdev); goto out_driver; } - appldata_wq = create_singlethread_workqueue("appldata"); + appldata_wq = alloc_ordered_workqueue("appldata", 0); if (!appldata_wq) { rc = -ENOMEM; goto out_device; diff --git a/arch/s390/appldata/appldata_mem.c b/arch/s390/appldata/appldata_mem.c index edcf2a706942..598df5708501 100644 --- a/arch/s390/appldata/appldata_mem.c +++ b/arch/s390/appldata/appldata_mem.c @@ -102,7 +102,7 @@ static void appldata_get_mem_data(void *data) mem_data->totalhigh = P2K(val.totalhigh); mem_data->freehigh = P2K(val.freehigh); mem_data->bufferram = P2K(val.bufferram); - mem_data->cached = P2K(global_page_state(NR_FILE_PAGES) + mem_data->cached = P2K(global_node_page_state(NR_FILE_PAGES) - val.bufferram); si_swapinfo(&val); diff --git a/arch/s390/boot/compressed/Makefile b/arch/s390/boot/compressed/Makefile index fac6ac9790fa..0daa070d6c9d 100644 --- a/arch/s390/boot/compressed/Makefile +++ b/arch/s390/boot/compressed/Makefile @@ -4,6 +4,8 @@ # create a compressed vmlinux image from the original vmlinux # +KCOV_INSTRUMENT := n + targets := vmlinux.lds vmlinux vmlinux.bin vmlinux.bin.gz vmlinux.bin.bz2 targets += vmlinux.bin.xz vmlinux.bin.lzma vmlinux.bin.lzo vmlinux.bin.lz4 targets += misc.o piggy.o sizes.h head.o @@ -15,14 +17,14 @@ KBUILD_CFLAGS += $(call cc-option,-mpacked-stack) KBUILD_CFLAGS += $(call cc-option,-ffreestanding) GCOV_PROFILE := n +UBSAN_SANITIZE := n -OBJECTS := $(addprefix $(objtree)/arch/s390/kernel/, head.o sclp.o ebcdic.o) +OBJECTS := $(addprefix $(objtree)/arch/s390/kernel/, head.o sclp.o ebcdic.o als.o) OBJECTS += $(obj)/head.o $(obj)/misc.o $(obj)/piggy.o LDFLAGS_vmlinux := --oformat $(LD_BFD) -e startup -T $(obj)/vmlinux: $(obj)/vmlinux.lds $(OBJECTS) $(call if_changed,ld) - @: sed-sizes := -e 's/^\([0-9a-fA-F]*\) . \(__bss_start\|_end\)$$/\#define SZ\2 0x\1/p' @@ -32,10 +34,10 @@ quiet_cmd_sizes = GEN $@ $(obj)/sizes.h: vmlinux $(call if_changed,sizes) -AFLAGS_head.o += -I$(obj) +AFLAGS_head.o += -I$(objtree)/$(obj) $(obj)/head.o: $(obj)/sizes.h -CFLAGS_misc.o += -I$(obj) +CFLAGS_misc.o += -I$(objtree)/$(obj) $(obj)/misc.o: $(obj)/sizes.h OBJCOPYFLAGS_vmlinux.bin := -R .comment -S diff --git a/arch/s390/boot/compressed/head.S b/arch/s390/boot/compressed/head.S index f86a4eef28a9..28c4f96a2d9c 100644 --- a/arch/s390/boot/compressed/head.S +++ b/arch/s390/boot/compressed/head.S @@ -21,16 +21,21 @@ ENTRY(startup_continue) lg %r15,.Lstack-.LPG1(%r13) aghi %r15,-160 brasl %r14,decompress_kernel - # setup registers for memory mover & branch to target + # Set up registers for memory mover. We move the decompressed image to + # 0x11000, starting at offset 0x11000 in the decompressed image so + # that code living at 0x11000 in the image will end up at 0x11000 in + # memory. lgr %r4,%r2 lg %r2,.Loffset-.LPG1(%r13) la %r4,0(%r2,%r4) lg %r3,.Lmvsize-.LPG1(%r13) lgr %r5,%r3 - # move the memory mover someplace safe + # Move the memory mover someplace safe so it doesn't overwrite itself. la %r1,0x200 mvc 0(mover_end-mover,%r1),mover-.LPG1(%r13) - # decompress image is started at 0x11000 + # When the memory mover is done we pass control to + # arch/s390/kernel/head64.S:startup_continue which lives at 0x11000 in + # the decompressed image. lgr %r6,%r2 br %r1 mover: diff --git a/arch/s390/configs/default_defconfig b/arch/s390/configs/default_defconfig index 0ac42cc4f880..45968686f918 100644 --- a/arch/s390/configs/default_defconfig +++ b/arch/s390/configs/default_defconfig @@ -1,8 +1,7 @@ CONFIG_SYSVIPC=y CONFIG_POSIX_MQUEUE=y -CONFIG_FHANDLE=y CONFIG_AUDIT=y -CONFIG_NO_HZ=y +CONFIG_NO_HZ_IDLE=y CONFIG_HIGH_RES_TIMERS=y CONFIG_BSD_PROCESS_ACCT=y CONFIG_BSD_PROCESS_ACCT_V3=y @@ -13,19 +12,19 @@ CONFIG_TASK_IO_ACCOUNTING=y CONFIG_IKCONFIG=y CONFIG_IKCONFIG_PROC=y CONFIG_NUMA_BALANCING=y -CONFIG_CGROUP_FREEZER=y -CONFIG_CGROUP_PIDS=y -CONFIG_CGROUP_DEVICE=y -CONFIG_CPUSETS=y -CONFIG_CGROUP_CPUACCT=y CONFIG_MEMCG=y CONFIG_MEMCG_SWAP=y -CONFIG_MEMCG_KMEM=y -CONFIG_CGROUP_HUGETLB=y -CONFIG_CGROUP_PERF=y +CONFIG_BLK_CGROUP=y CONFIG_CFS_BANDWIDTH=y CONFIG_RT_GROUP_SCHED=y -CONFIG_BLK_CGROUP=y +CONFIG_CGROUP_PIDS=y +CONFIG_CGROUP_FREEZER=y +CONFIG_CGROUP_HUGETLB=y +CONFIG_CPUSETS=y +CONFIG_CGROUP_DEVICE=y +CONFIG_CGROUP_CPUACCT=y +CONFIG_CGROUP_PERF=y +CONFIG_CHECKPOINT_RESTORE=y CONFIG_NAMESPACES=y CONFIG_USER_NS=y CONFIG_SCHED_AUTOGROUP=y @@ -55,7 +54,6 @@ CONFIG_UNIXWARE_DISKLABEL=y CONFIG_CFQ_GROUP_IOSCHED=y CONFIG_DEFAULT_DEADLINE=y CONFIG_LIVEPATCH=y -CONFIG_MARCH_Z196=y CONFIG_TUNE_ZEC12=y CONFIG_NR_CPUS=256 CONFIG_NUMA=y @@ -65,6 +63,15 @@ CONFIG_MEMORY_HOTPLUG=y CONFIG_MEMORY_HOTREMOVE=y CONFIG_KSM=y CONFIG_TRANSPARENT_HUGEPAGE=y +CONFIG_CLEANCACHE=y +CONFIG_FRONTSWAP=y +CONFIG_CMA=y +CONFIG_MEM_SOFT_DIRTY=y +CONFIG_ZPOOL=m +CONFIG_ZBUD=m +CONFIG_ZSMALLOC=m +CONFIG_ZSMALLOC_STAT=y +CONFIG_IDLE_PAGE_TRACKING=y CONFIG_PCI=y CONFIG_PCI_DEBUG=y CONFIG_HOTPLUG_PCI=y @@ -253,7 +260,6 @@ CONFIG_NF_CONNTRACK_IPV4=m CONFIG_NF_TABLES_IPV4=m CONFIG_NFT_CHAIN_ROUTE_IPV4=m CONFIG_NF_TABLES_ARP=m -CONFIG_NF_NAT_IPV4=m CONFIG_NFT_CHAIN_NAT_IPV4=m CONFIG_IP_NF_IPTABLES=m CONFIG_IP_NF_MATCH_AH=m @@ -262,6 +268,8 @@ CONFIG_IP_NF_MATCH_RPFILTER=m CONFIG_IP_NF_MATCH_TTL=m CONFIG_IP_NF_FILTER=m CONFIG_IP_NF_TARGET_REJECT=m +CONFIG_IP_NF_NAT=m +CONFIG_IP_NF_TARGET_MASQUERADE=m CONFIG_IP_NF_MANGLE=m CONFIG_IP_NF_TARGET_CLUSTERIP=m CONFIG_IP_NF_TARGET_ECN=m @@ -274,7 +282,6 @@ CONFIG_IP_NF_ARP_MANGLE=m CONFIG_NF_CONNTRACK_IPV6=m CONFIG_NF_TABLES_IPV6=m CONFIG_NFT_CHAIN_ROUTE_IPV6=m -CONFIG_NF_NAT_IPV6=m CONFIG_NFT_CHAIN_NAT_IPV6=m CONFIG_IP6_NF_IPTABLES=m CONFIG_IP6_NF_MATCH_AH=m @@ -292,6 +299,8 @@ CONFIG_IP6_NF_TARGET_REJECT=m CONFIG_IP6_NF_MANGLE=m CONFIG_IP6_NF_RAW=m CONFIG_IP6_NF_SECURITY=m +CONFIG_IP6_NF_NAT=m +CONFIG_IP6_NF_TARGET_MASQUERADE=m CONFIG_NF_TABLES_BRIDGE=m CONFIG_NET_SCTPPROBE=m CONFIG_RDS=m @@ -352,6 +361,7 @@ CONFIG_NET_ACT_SIMP=m CONFIG_NET_ACT_SKBEDIT=m CONFIG_NET_ACT_CSUM=m CONFIG_DNS_RESOLVER=y +CONFIG_CGROUP_NET_PRIO=y CONFIG_BPF_JIT=y CONFIG_NET_PKTGEN=m CONFIG_NET_TCPPROBE=m @@ -402,6 +412,7 @@ CONFIG_MD_FAULTY=m CONFIG_BLK_DEV_DM=m CONFIG_DM_CRYPT=m CONFIG_DM_SNAPSHOT=m +CONFIG_DM_THIN_PROVISIONING=m CONFIG_DM_MIRROR=m CONFIG_DM_LOG_USERSPACE=m CONFIG_DM_RAID=m @@ -421,6 +432,7 @@ CONFIG_EQUALIZER=m CONFIG_IFB=m CONFIG_MACVLAN=m CONFIG_MACVTAP=m +CONFIG_IPVLAN=m CONFIG_VXLAN=m CONFIG_TUN=m CONFIG_VETH=m @@ -446,12 +458,12 @@ CONFIG_PPP_SYNC_TTY=m # CONFIG_INPUT_KEYBOARD is not set # CONFIG_INPUT_MOUSE is not set # CONFIG_SERIO is not set -CONFIG_DEVPTS_MULTIPLE_INSTANCES=y CONFIG_LEGACY_PTY_COUNT=0 CONFIG_HW_RANDOM_VIRTIO=m CONFIG_RAW_DRIVER=m CONFIG_HANGCHECK_TIMER=m CONFIG_TN3270_FS=y +# CONFIG_HWMON is not set CONFIG_WATCHDOG=y CONFIG_WATCHDOG_NOWAYOUT=y CONFIG_SOFT_WATCHDOG=m @@ -487,6 +499,7 @@ CONFIG_QFMT_V2=m CONFIG_AUTOFS4_FS=m CONFIG_FUSE_FS=y CONFIG_CUSE=m +CONFIG_OVERLAY_FS=m CONFIG_FSCACHE=m CONFIG_CACHEFILES=m CONFIG_ISO9660_FS=y @@ -537,6 +550,8 @@ CONFIG_DLM=m CONFIG_PRINTK_TIME=y CONFIG_DYNAMIC_DEBUG=y CONFIG_DEBUG_INFO=y +CONFIG_DEBUG_INFO_DWARF4=y +CONFIG_GDB_SCRIPTS=y CONFIG_FRAME_WARN=1024 CONFIG_READABLE_ASM=y CONFIG_UNUSED_SYMBOLS=y @@ -555,13 +570,17 @@ CONFIG_SLUB_DEBUG_ON=y CONFIG_SLUB_STATS=y CONFIG_DEBUG_STACK_USAGE=y CONFIG_DEBUG_VM=y +CONFIG_DEBUG_VM_VMACACHE=y CONFIG_DEBUG_VM_RB=y +CONFIG_DEBUG_VM_PGFLAGS=y CONFIG_DEBUG_MEMORY_INIT=y CONFIG_MEMORY_NOTIFIER_ERROR_INJECT=m CONFIG_DEBUG_PER_CPU_MAPS=y CONFIG_DEBUG_SHIRQ=y CONFIG_DETECT_HUNG_TASK=y +CONFIG_WQ_WATCHDOG=y CONFIG_PANIC_ON_OOPS=y +CONFIG_DEBUG_TIMEKEEPING=y CONFIG_TIMER_STATS=y CONFIG_DEBUG_RT_MUTEXES=y CONFIG_DEBUG_WW_MUTEX_SLOWPATH=y @@ -588,7 +607,6 @@ CONFIG_FAIL_FUTEX=y CONFIG_FAULT_INJECTION_DEBUG_FS=y CONFIG_FAULT_INJECTION_STACKTRACE_FILTER=y CONFIG_LATENCYTOP=y -CONFIG_DEBUG_STRICT_USER_COPY_CHECKS=y CONFIG_IRQSOFF_TRACER=y CONFIG_PREEMPT_TRACER=y CONFIG_SCHED_TRACER=y @@ -596,6 +614,8 @@ CONFIG_FTRACE_SYSCALLS=y CONFIG_STACK_TRACER=y CONFIG_BLK_DEV_IO_TRACE=y CONFIG_UPROBE_EVENT=y +CONFIG_FUNCTION_PROFILER=y +CONFIG_TRACE_ENUM_MAP_FILE=y CONFIG_LKDTM=m CONFIG_TEST_LIST_SORT=y CONFIG_KPROBES_SANITY_TEST=y @@ -607,7 +627,6 @@ CONFIG_TEST_STRING_HELPERS=y CONFIG_TEST_KSTRTOX=y CONFIG_DMA_API_DEBUG=y CONFIG_TEST_BPF=m -# CONFIG_STRICT_DEVMEM is not set CONFIG_S390_PTDUMP=y CONFIG_ENCRYPTED_KEYS=m CONFIG_SECURITY=y @@ -651,7 +670,6 @@ CONFIG_CRYPTO_SEED=m CONFIG_CRYPTO_SERPENT=m CONFIG_CRYPTO_TEA=m CONFIG_CRYPTO_TWOFISH=m -CONFIG_CRYPTO_ZLIB=y CONFIG_CRYPTO_LZO=m CONFIG_CRYPTO_LZ4=m CONFIG_CRYPTO_LZ4HC=m @@ -664,7 +682,8 @@ CONFIG_CRYPTO_SHA512_S390=m CONFIG_CRYPTO_DES_S390=m CONFIG_CRYPTO_AES_S390=m CONFIG_CRYPTO_GHASH_S390=m -CONFIG_ASYMMETRIC_KEY_TYPE=m +CONFIG_CRYPTO_CRC32_S390=y +CONFIG_ASYMMETRIC_KEY_TYPE=y CONFIG_ASYMMETRIC_PUBLIC_KEY_SUBTYPE=m CONFIG_X509_CERTIFICATE_PARSER=m CONFIG_CRC7=m diff --git a/arch/s390/configs/gcov_defconfig b/arch/s390/configs/gcov_defconfig index a31dcd56f7c0..1dd05e345c4d 100644 --- a/arch/s390/configs/gcov_defconfig +++ b/arch/s390/configs/gcov_defconfig @@ -1,8 +1,7 @@ CONFIG_SYSVIPC=y CONFIG_POSIX_MQUEUE=y -CONFIG_FHANDLE=y CONFIG_AUDIT=y -CONFIG_NO_HZ=y +CONFIG_NO_HZ_IDLE=y CONFIG_HIGH_RES_TIMERS=y CONFIG_BSD_PROCESS_ACCT=y CONFIG_BSD_PROCESS_ACCT_V3=y @@ -13,17 +12,19 @@ CONFIG_TASK_IO_ACCOUNTING=y CONFIG_IKCONFIG=y CONFIG_IKCONFIG_PROC=y CONFIG_NUMA_BALANCING=y -CONFIG_CGROUP_FREEZER=y -CONFIG_CGROUP_PIDS=y -CONFIG_CGROUP_DEVICE=y -CONFIG_CPUSETS=y -CONFIG_CGROUP_CPUACCT=y CONFIG_MEMCG=y CONFIG_MEMCG_SWAP=y -CONFIG_MEMCG_KMEM=y +CONFIG_BLK_CGROUP=y +CONFIG_CFS_BANDWIDTH=y +CONFIG_RT_GROUP_SCHED=y +CONFIG_CGROUP_PIDS=y +CONFIG_CGROUP_FREEZER=y CONFIG_CGROUP_HUGETLB=y +CONFIG_CPUSETS=y +CONFIG_CGROUP_DEVICE=y +CONFIG_CGROUP_CPUACCT=y CONFIG_CGROUP_PERF=y -CONFIG_BLK_CGROUP=y +CONFIG_CHECKPOINT_RESTORE=y CONFIG_NAMESPACES=y CONFIG_USER_NS=y CONFIG_SCHED_AUTOGROUP=y @@ -53,7 +54,6 @@ CONFIG_SOLARIS_X86_PARTITION=y CONFIG_UNIXWARE_DISKLABEL=y CONFIG_CFQ_GROUP_IOSCHED=y CONFIG_DEFAULT_DEADLINE=y -CONFIG_MARCH_Z196=y CONFIG_TUNE_ZEC12=y CONFIG_NR_CPUS=256 CONFIG_NUMA=y @@ -62,6 +62,14 @@ CONFIG_MEMORY_HOTPLUG=y CONFIG_MEMORY_HOTREMOVE=y CONFIG_KSM=y CONFIG_TRANSPARENT_HUGEPAGE=y +CONFIG_CLEANCACHE=y +CONFIG_FRONTSWAP=y +CONFIG_CMA=y +CONFIG_ZSWAP=y +CONFIG_ZBUD=m +CONFIG_ZSMALLOC=m +CONFIG_ZSMALLOC_STAT=y +CONFIG_IDLE_PAGE_TRACKING=y CONFIG_PCI=y CONFIG_HOTPLUG_PCI=y CONFIG_HOTPLUG_PCI_S390=y @@ -249,7 +257,6 @@ CONFIG_NF_CONNTRACK_IPV4=m CONFIG_NF_TABLES_IPV4=m CONFIG_NFT_CHAIN_ROUTE_IPV4=m CONFIG_NF_TABLES_ARP=m -CONFIG_NF_NAT_IPV4=m CONFIG_NFT_CHAIN_NAT_IPV4=m CONFIG_IP_NF_IPTABLES=m CONFIG_IP_NF_MATCH_AH=m @@ -258,6 +265,8 @@ CONFIG_IP_NF_MATCH_RPFILTER=m CONFIG_IP_NF_MATCH_TTL=m CONFIG_IP_NF_FILTER=m CONFIG_IP_NF_TARGET_REJECT=m +CONFIG_IP_NF_NAT=m +CONFIG_IP_NF_TARGET_MASQUERADE=m CONFIG_IP_NF_MANGLE=m CONFIG_IP_NF_TARGET_CLUSTERIP=m CONFIG_IP_NF_TARGET_ECN=m @@ -270,7 +279,6 @@ CONFIG_IP_NF_ARP_MANGLE=m CONFIG_NF_CONNTRACK_IPV6=m CONFIG_NF_TABLES_IPV6=m CONFIG_NFT_CHAIN_ROUTE_IPV6=m -CONFIG_NF_NAT_IPV6=m CONFIG_NFT_CHAIN_NAT_IPV6=m CONFIG_IP6_NF_IPTABLES=m CONFIG_IP6_NF_MATCH_AH=m @@ -288,6 +296,8 @@ CONFIG_IP6_NF_TARGET_REJECT=m CONFIG_IP6_NF_MANGLE=m CONFIG_IP6_NF_RAW=m CONFIG_IP6_NF_SECURITY=m +CONFIG_IP6_NF_NAT=m +CONFIG_IP6_NF_TARGET_MASQUERADE=m CONFIG_NF_TABLES_BRIDGE=m CONFIG_NET_SCTPPROBE=m CONFIG_RDS=m @@ -347,6 +357,7 @@ CONFIG_NET_ACT_SIMP=m CONFIG_NET_ACT_SKBEDIT=m CONFIG_NET_ACT_CSUM=m CONFIG_DNS_RESOLVER=y +CONFIG_CGROUP_NET_PRIO=y CONFIG_BPF_JIT=y CONFIG_NET_PKTGEN=m CONFIG_NET_TCPPROBE=m @@ -397,6 +408,7 @@ CONFIG_MD_FAULTY=m CONFIG_BLK_DEV_DM=m CONFIG_DM_CRYPT=m CONFIG_DM_SNAPSHOT=m +CONFIG_DM_THIN_PROVISIONING=m CONFIG_DM_MIRROR=m CONFIG_DM_LOG_USERSPACE=m CONFIG_DM_RAID=m @@ -416,6 +428,7 @@ CONFIG_EQUALIZER=m CONFIG_IFB=m CONFIG_MACVLAN=m CONFIG_MACVTAP=m +CONFIG_IPVLAN=m CONFIG_VXLAN=m CONFIG_TUN=m CONFIG_VETH=m @@ -441,7 +454,6 @@ CONFIG_PPP_SYNC_TTY=m # CONFIG_INPUT_KEYBOARD is not set # CONFIG_INPUT_MOUSE is not set # CONFIG_SERIO is not set -CONFIG_DEVPTS_MULTIPLE_INSTANCES=y CONFIG_LEGACY_PTY_COUNT=0 CONFIG_HW_RANDOM_VIRTIO=m CONFIG_RAW_DRIVER=m @@ -481,6 +493,7 @@ CONFIG_QFMT_V2=m CONFIG_AUTOFS4_FS=m CONFIG_FUSE_FS=y CONFIG_CUSE=m +CONFIG_OVERLAY_FS=m CONFIG_FSCACHE=m CONFIG_CACHEFILES=m CONFIG_ISO9660_FS=y @@ -530,6 +543,8 @@ CONFIG_NLS_UTF8=m CONFIG_DLM=m CONFIG_PRINTK_TIME=y CONFIG_DEBUG_INFO=y +CONFIG_DEBUG_INFO_DWARF4=y +CONFIG_GDB_SCRIPTS=y # CONFIG_ENABLE_MUST_CHECK is not set CONFIG_FRAME_WARN=1024 CONFIG_UNUSED_SYMBOLS=y @@ -544,16 +559,15 @@ CONFIG_NOTIFIER_ERROR_INJECTION=m CONFIG_CPU_NOTIFIER_ERROR_INJECT=m CONFIG_PM_NOTIFIER_ERROR_INJECT=m CONFIG_LATENCYTOP=y -CONFIG_DEBUG_STRICT_USER_COPY_CHECKS=y CONFIG_BLK_DEV_IO_TRACE=y # CONFIG_KPROBE_EVENT is not set +CONFIG_TRACE_ENUM_MAP_FILE=y CONFIG_LKDTM=m CONFIG_RBTREE_TEST=m CONFIG_INTERVAL_TREE_TEST=m CONFIG_PERCPU_TEST=m CONFIG_ATOMIC64_SELFTEST=y CONFIG_TEST_BPF=m -# CONFIG_STRICT_DEVMEM is not set CONFIG_S390_PTDUMP=y CONFIG_ENCRYPTED_KEYS=m CONFIG_SECURITY=y @@ -597,8 +611,6 @@ CONFIG_CRYPTO_SEED=m CONFIG_CRYPTO_SERPENT=m CONFIG_CRYPTO_TEA=m CONFIG_CRYPTO_TWOFISH=m -CONFIG_CRYPTO_ZLIB=y -CONFIG_CRYPTO_LZO=m CONFIG_CRYPTO_LZ4=m CONFIG_CRYPTO_LZ4HC=m CONFIG_CRYPTO_USER_API_HASH=m @@ -610,7 +622,8 @@ CONFIG_CRYPTO_SHA512_S390=m CONFIG_CRYPTO_DES_S390=m CONFIG_CRYPTO_AES_S390=m CONFIG_CRYPTO_GHASH_S390=m -CONFIG_ASYMMETRIC_KEY_TYPE=m +CONFIG_CRYPTO_CRC32_S390=y +CONFIG_ASYMMETRIC_KEY_TYPE=y CONFIG_ASYMMETRIC_PUBLIC_KEY_SUBTYPE=m CONFIG_X509_CERTIFICATE_PARSER=m CONFIG_CRC7=m diff --git a/arch/s390/configs/performance_defconfig b/arch/s390/configs/performance_defconfig index 7b73bf353345..29d1178666f0 100644 --- a/arch/s390/configs/performance_defconfig +++ b/arch/s390/configs/performance_defconfig @@ -1,8 +1,7 @@ CONFIG_SYSVIPC=y CONFIG_POSIX_MQUEUE=y -CONFIG_FHANDLE=y CONFIG_AUDIT=y -CONFIG_NO_HZ=y +CONFIG_NO_HZ_IDLE=y CONFIG_HIGH_RES_TIMERS=y CONFIG_BSD_PROCESS_ACCT=y CONFIG_BSD_PROCESS_ACCT_V3=y @@ -14,17 +13,19 @@ CONFIG_IKCONFIG=y CONFIG_IKCONFIG_PROC=y CONFIG_NUMA_BALANCING=y # CONFIG_NUMA_BALANCING_DEFAULT_ENABLED is not set -CONFIG_CGROUP_FREEZER=y -CONFIG_CGROUP_PIDS=y -CONFIG_CGROUP_DEVICE=y -CONFIG_CPUSETS=y -CONFIG_CGROUP_CPUACCT=y CONFIG_MEMCG=y CONFIG_MEMCG_SWAP=y -CONFIG_MEMCG_KMEM=y +CONFIG_BLK_CGROUP=y +CONFIG_CFS_BANDWIDTH=y +CONFIG_RT_GROUP_SCHED=y +CONFIG_CGROUP_PIDS=y +CONFIG_CGROUP_FREEZER=y CONFIG_CGROUP_HUGETLB=y +CONFIG_CPUSETS=y +CONFIG_CGROUP_DEVICE=y +CONFIG_CGROUP_CPUACCT=y CONFIG_CGROUP_PERF=y -CONFIG_BLK_CGROUP=y +CONFIG_CHECKPOINT_RESTORE=y CONFIG_NAMESPACES=y CONFIG_USER_NS=y CONFIG_SCHED_AUTOGROUP=y @@ -53,7 +54,6 @@ CONFIG_UNIXWARE_DISKLABEL=y CONFIG_CFQ_GROUP_IOSCHED=y CONFIG_DEFAULT_DEADLINE=y CONFIG_LIVEPATCH=y -CONFIG_MARCH_Z196=y CONFIG_TUNE_ZEC12=y CONFIG_NR_CPUS=512 CONFIG_NUMA=y @@ -62,6 +62,14 @@ CONFIG_MEMORY_HOTPLUG=y CONFIG_MEMORY_HOTREMOVE=y CONFIG_KSM=y CONFIG_TRANSPARENT_HUGEPAGE=y +CONFIG_CLEANCACHE=y +CONFIG_FRONTSWAP=y +CONFIG_CMA=y +CONFIG_ZSWAP=y +CONFIG_ZBUD=m +CONFIG_ZSMALLOC=m +CONFIG_ZSMALLOC_STAT=y +CONFIG_IDLE_PAGE_TRACKING=y CONFIG_PCI=y CONFIG_HOTPLUG_PCI=y CONFIG_HOTPLUG_PCI_S390=y @@ -249,7 +257,6 @@ CONFIG_NF_CONNTRACK_IPV4=m CONFIG_NF_TABLES_IPV4=m CONFIG_NFT_CHAIN_ROUTE_IPV4=m CONFIG_NF_TABLES_ARP=m -CONFIG_NF_NAT_IPV4=m CONFIG_NFT_CHAIN_NAT_IPV4=m CONFIG_IP_NF_IPTABLES=m CONFIG_IP_NF_MATCH_AH=m @@ -258,6 +265,8 @@ CONFIG_IP_NF_MATCH_RPFILTER=m CONFIG_IP_NF_MATCH_TTL=m CONFIG_IP_NF_FILTER=m CONFIG_IP_NF_TARGET_REJECT=m +CONFIG_IP_NF_NAT=m +CONFIG_IP_NF_TARGET_MASQUERADE=m CONFIG_IP_NF_MANGLE=m CONFIG_IP_NF_TARGET_CLUSTERIP=m CONFIG_IP_NF_TARGET_ECN=m @@ -270,7 +279,6 @@ CONFIG_IP_NF_ARP_MANGLE=m CONFIG_NF_CONNTRACK_IPV6=m CONFIG_NF_TABLES_IPV6=m CONFIG_NFT_CHAIN_ROUTE_IPV6=m -CONFIG_NF_NAT_IPV6=m CONFIG_NFT_CHAIN_NAT_IPV6=m CONFIG_IP6_NF_IPTABLES=m CONFIG_IP6_NF_MATCH_AH=m @@ -288,6 +296,8 @@ CONFIG_IP6_NF_TARGET_REJECT=m CONFIG_IP6_NF_MANGLE=m CONFIG_IP6_NF_RAW=m CONFIG_IP6_NF_SECURITY=m +CONFIG_IP6_NF_NAT=m +CONFIG_IP6_NF_TARGET_MASQUERADE=m CONFIG_NF_TABLES_BRIDGE=m CONFIG_NET_SCTPPROBE=m CONFIG_RDS=m @@ -347,6 +357,7 @@ CONFIG_NET_ACT_SIMP=m CONFIG_NET_ACT_SKBEDIT=m CONFIG_NET_ACT_CSUM=m CONFIG_DNS_RESOLVER=y +CONFIG_CGROUP_NET_PRIO=y CONFIG_BPF_JIT=y CONFIG_NET_PKTGEN=m CONFIG_NET_TCPPROBE=m @@ -397,6 +408,7 @@ CONFIG_MD_FAULTY=m CONFIG_BLK_DEV_DM=m CONFIG_DM_CRYPT=m CONFIG_DM_SNAPSHOT=m +CONFIG_DM_THIN_PROVISIONING=m CONFIG_DM_MIRROR=m CONFIG_DM_LOG_USERSPACE=m CONFIG_DM_RAID=m @@ -416,6 +428,7 @@ CONFIG_EQUALIZER=m CONFIG_IFB=m CONFIG_MACVLAN=m CONFIG_MACVTAP=m +CONFIG_IPVLAN=m CONFIG_VXLAN=m CONFIG_TUN=m CONFIG_VETH=m @@ -441,12 +454,12 @@ CONFIG_PPP_SYNC_TTY=m # CONFIG_INPUT_KEYBOARD is not set # CONFIG_INPUT_MOUSE is not set # CONFIG_SERIO is not set -CONFIG_DEVPTS_MULTIPLE_INSTANCES=y CONFIG_LEGACY_PTY_COUNT=0 CONFIG_HW_RANDOM_VIRTIO=m CONFIG_RAW_DRIVER=m CONFIG_HANGCHECK_TIMER=m CONFIG_TN3270_FS=y +# CONFIG_HWMON is not set CONFIG_WATCHDOG=y CONFIG_WATCHDOG_NOWAYOUT=y CONFIG_SOFT_WATCHDOG=m @@ -481,6 +494,7 @@ CONFIG_QFMT_V2=m CONFIG_AUTOFS4_FS=m CONFIG_FUSE_FS=y CONFIG_CUSE=m +CONFIG_OVERLAY_FS=m CONFIG_FSCACHE=m CONFIG_CACHEFILES=m CONFIG_ISO9660_FS=y @@ -530,6 +544,8 @@ CONFIG_NLS_UTF8=m CONFIG_DLM=m CONFIG_PRINTK_TIME=y CONFIG_DEBUG_INFO=y +CONFIG_DEBUG_INFO_DWARF4=y +CONFIG_GDB_SCRIPTS=y # CONFIG_ENABLE_MUST_CHECK is not set CONFIG_FRAME_WARN=1024 CONFIG_UNUSED_SYMBOLS=y @@ -540,17 +556,17 @@ CONFIG_TIMER_STATS=y CONFIG_RCU_TORTURE_TEST=m CONFIG_RCU_CPU_STALL_TIMEOUT=60 CONFIG_LATENCYTOP=y -CONFIG_DEBUG_STRICT_USER_COPY_CHECKS=y CONFIG_SCHED_TRACER=y CONFIG_FTRACE_SYSCALLS=y CONFIG_STACK_TRACER=y CONFIG_BLK_DEV_IO_TRACE=y CONFIG_UPROBE_EVENT=y +CONFIG_FUNCTION_PROFILER=y +CONFIG_TRACE_ENUM_MAP_FILE=y CONFIG_LKDTM=m CONFIG_PERCPU_TEST=m CONFIG_ATOMIC64_SELFTEST=y CONFIG_TEST_BPF=m -# CONFIG_STRICT_DEVMEM is not set CONFIG_S390_PTDUMP=y CONFIG_ENCRYPTED_KEYS=m CONFIG_SECURITY=y @@ -594,8 +610,6 @@ CONFIG_CRYPTO_SEED=m CONFIG_CRYPTO_SERPENT=m CONFIG_CRYPTO_TEA=m CONFIG_CRYPTO_TWOFISH=m -CONFIG_CRYPTO_ZLIB=y -CONFIG_CRYPTO_LZO=m CONFIG_CRYPTO_LZ4=m CONFIG_CRYPTO_LZ4HC=m CONFIG_CRYPTO_USER_API_HASH=m @@ -607,7 +621,8 @@ CONFIG_CRYPTO_SHA512_S390=m CONFIG_CRYPTO_DES_S390=m CONFIG_CRYPTO_AES_S390=m CONFIG_CRYPTO_GHASH_S390=m -CONFIG_ASYMMETRIC_KEY_TYPE=m +CONFIG_CRYPTO_CRC32_S390=y +CONFIG_ASYMMETRIC_KEY_TYPE=y CONFIG_ASYMMETRIC_PUBLIC_KEY_SUBTYPE=m CONFIG_X509_CERTIFICATE_PARSER=m CONFIG_CRC7=m diff --git a/arch/s390/configs/zfcpdump_defconfig b/arch/s390/configs/zfcpdump_defconfig index 1719843a55a2..4366a3e3e754 100644 --- a/arch/s390/configs/zfcpdump_defconfig +++ b/arch/s390/configs/zfcpdump_defconfig @@ -1,5 +1,5 @@ # CONFIG_SWAP is not set -CONFIG_NO_HZ=y +CONFIG_NO_HZ_IDLE=y CONFIG_HIGH_RES_TIMERS=y CONFIG_BLK_DEV_INITRD=y CONFIG_CC_OPTIMIZE_FOR_SIZE=y @@ -7,7 +7,6 @@ CONFIG_CC_OPTIMIZE_FOR_SIZE=y CONFIG_PARTITION_ADVANCED=y CONFIG_IBM_PARTITION=y CONFIG_DEFAULT_DEADLINE=y -CONFIG_MARCH_Z196=y CONFIG_TUNE_ZEC12=y # CONFIG_COMPAT is not set CONFIG_NR_CPUS=2 @@ -64,7 +63,6 @@ CONFIG_PANIC_ON_OOPS=y # CONFIG_SCHED_DEBUG is not set CONFIG_RCU_CPU_STALL_TIMEOUT=60 # CONFIG_FTRACE is not set -# CONFIG_STRICT_DEVMEM is not set # CONFIG_PFAULT is not set # CONFIG_S390_HYPFS_FS is not set # CONFIG_VIRTUALIZATION is not set diff --git a/arch/s390/crypto/Makefile b/arch/s390/crypto/Makefile index 7f0b7cda6259..d1033de4c4ee 100644 --- a/arch/s390/crypto/Makefile +++ b/arch/s390/crypto/Makefile @@ -9,3 +9,6 @@ obj-$(CONFIG_CRYPTO_DES_S390) += des_s390.o obj-$(CONFIG_CRYPTO_AES_S390) += aes_s390.o obj-$(CONFIG_S390_PRNG) += prng.o obj-$(CONFIG_CRYPTO_GHASH_S390) += ghash_s390.o +obj-$(CONFIG_CRYPTO_CRC32_S390) += crc32-vx_s390.o + +crc32-vx_s390-y := crc32-vx.o crc32le-vx.o crc32be-vx.o diff --git a/arch/s390/crypto/aes_s390.c b/arch/s390/crypto/aes_s390.c index 48e1a2d3e318..303d28eb03a2 100644 --- a/arch/s390/crypto/aes_s390.c +++ b/arch/s390/crypto/aes_s390.c @@ -22,77 +22,38 @@ #include <crypto/aes.h> #include <crypto/algapi.h> +#include <crypto/internal/skcipher.h> #include <linux/err.h> #include <linux/module.h> #include <linux/cpufeature.h> #include <linux/init.h> #include <linux/spinlock.h> #include <crypto/xts.h> -#include "crypt_s390.h" - -#define AES_KEYLEN_128 1 -#define AES_KEYLEN_192 2 -#define AES_KEYLEN_256 4 +#include <asm/cpacf.h> static u8 *ctrblk; static DEFINE_SPINLOCK(ctrblk_lock); -static char keylen_flag; + +static cpacf_mask_t km_functions, kmc_functions, kmctr_functions; struct s390_aes_ctx { u8 key[AES_MAX_KEY_SIZE]; - long enc; - long dec; int key_len; + unsigned long fc; union { - struct crypto_blkcipher *blk; + struct crypto_skcipher *blk; struct crypto_cipher *cip; } fallback; }; -struct pcc_param { - u8 key[32]; - u8 tweak[16]; - u8 block[16]; - u8 bit[16]; - u8 xts[16]; -}; - struct s390_xts_ctx { u8 key[32]; u8 pcc_key[32]; - long enc; - long dec; int key_len; - struct crypto_blkcipher *fallback; + unsigned long fc; + struct crypto_skcipher *fallback; }; -/* - * Check if the key_len is supported by the HW. - * Returns 0 if it is, a positive number if it is not and software fallback is - * required or a negative number in case the key size is not valid - */ -static int need_fallback(unsigned int key_len) -{ - switch (key_len) { - case 16: - if (!(keylen_flag & AES_KEYLEN_128)) - return 1; - break; - case 24: - if (!(keylen_flag & AES_KEYLEN_192)) - return 1; - break; - case 32: - if (!(keylen_flag & AES_KEYLEN_256)) - return 1; - break; - default: - return -1; - break; - } - return 0; -} - static int setkey_fallback_cip(struct crypto_tfm *tfm, const u8 *in_key, unsigned int key_len) { @@ -116,72 +77,44 @@ static int aes_set_key(struct crypto_tfm *tfm, const u8 *in_key, unsigned int key_len) { struct s390_aes_ctx *sctx = crypto_tfm_ctx(tfm); - u32 *flags = &tfm->crt_flags; - int ret; + unsigned long fc; - ret = need_fallback(key_len); - if (ret < 0) { - *flags |= CRYPTO_TFM_RES_BAD_KEY_LEN; - return -EINVAL; - } + /* Pick the correct function code based on the key length */ + fc = (key_len == 16) ? CPACF_KM_AES_128 : + (key_len == 24) ? CPACF_KM_AES_192 : + (key_len == 32) ? CPACF_KM_AES_256 : 0; - sctx->key_len = key_len; - if (!ret) { - memcpy(sctx->key, in_key, key_len); - return 0; - } + /* Check if the function code is available */ + sctx->fc = (fc && cpacf_test_func(&km_functions, fc)) ? fc : 0; + if (!sctx->fc) + return setkey_fallback_cip(tfm, in_key, key_len); - return setkey_fallback_cip(tfm, in_key, key_len); + sctx->key_len = key_len; + memcpy(sctx->key, in_key, key_len); + return 0; } static void aes_encrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in) { struct s390_aes_ctx *sctx = crypto_tfm_ctx(tfm); - if (unlikely(need_fallback(sctx->key_len))) { + if (unlikely(!sctx->fc)) { crypto_cipher_encrypt_one(sctx->fallback.cip, out, in); return; } - - switch (sctx->key_len) { - case 16: - crypt_s390_km(KM_AES_128_ENCRYPT, &sctx->key, out, in, - AES_BLOCK_SIZE); - break; - case 24: - crypt_s390_km(KM_AES_192_ENCRYPT, &sctx->key, out, in, - AES_BLOCK_SIZE); - break; - case 32: - crypt_s390_km(KM_AES_256_ENCRYPT, &sctx->key, out, in, - AES_BLOCK_SIZE); - break; - } + cpacf_km(sctx->fc, &sctx->key, out, in, AES_BLOCK_SIZE); } static void aes_decrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in) { struct s390_aes_ctx *sctx = crypto_tfm_ctx(tfm); - if (unlikely(need_fallback(sctx->key_len))) { + if (unlikely(!sctx->fc)) { crypto_cipher_decrypt_one(sctx->fallback.cip, out, in); return; } - - switch (sctx->key_len) { - case 16: - crypt_s390_km(KM_AES_128_DECRYPT, &sctx->key, out, in, - AES_BLOCK_SIZE); - break; - case 24: - crypt_s390_km(KM_AES_192_DECRYPT, &sctx->key, out, in, - AES_BLOCK_SIZE); - break; - case 32: - crypt_s390_km(KM_AES_256_DECRYPT, &sctx->key, out, in, - AES_BLOCK_SIZE); - break; - } + cpacf_km(sctx->fc | CPACF_DECRYPT, + &sctx->key, out, in, AES_BLOCK_SIZE); } static int fallback_init_cip(struct crypto_tfm *tfm) @@ -212,7 +145,7 @@ static void fallback_exit_cip(struct crypto_tfm *tfm) static struct crypto_alg aes_alg = { .cra_name = "aes", .cra_driver_name = "aes-s390", - .cra_priority = CRYPT_S390_PRIORITY, + .cra_priority = 300, .cra_flags = CRYPTO_ALG_TYPE_CIPHER | CRYPTO_ALG_NEED_FALLBACK, .cra_blocksize = AES_BLOCK_SIZE, @@ -237,16 +170,16 @@ static int setkey_fallback_blk(struct crypto_tfm *tfm, const u8 *key, struct s390_aes_ctx *sctx = crypto_tfm_ctx(tfm); unsigned int ret; - sctx->fallback.blk->base.crt_flags &= ~CRYPTO_TFM_REQ_MASK; - sctx->fallback.blk->base.crt_flags |= (tfm->crt_flags & - CRYPTO_TFM_REQ_MASK); + crypto_skcipher_clear_flags(sctx->fallback.blk, CRYPTO_TFM_REQ_MASK); + crypto_skcipher_set_flags(sctx->fallback.blk, tfm->crt_flags & + CRYPTO_TFM_REQ_MASK); + + ret = crypto_skcipher_setkey(sctx->fallback.blk, key, len); + + tfm->crt_flags &= ~CRYPTO_TFM_RES_MASK; + tfm->crt_flags |= crypto_skcipher_get_flags(sctx->fallback.blk) & + CRYPTO_TFM_RES_MASK; - ret = crypto_blkcipher_setkey(sctx->fallback.blk, key, len); - if (ret) { - tfm->crt_flags &= ~CRYPTO_TFM_RES_MASK; - tfm->crt_flags |= (sctx->fallback.blk->base.crt_flags & - CRYPTO_TFM_RES_MASK); - } return ret; } @@ -255,15 +188,17 @@ static int fallback_blk_dec(struct blkcipher_desc *desc, unsigned int nbytes) { unsigned int ret; - struct crypto_blkcipher *tfm; - struct s390_aes_ctx *sctx = crypto_blkcipher_ctx(desc->tfm); + struct crypto_blkcipher *tfm = desc->tfm; + struct s390_aes_ctx *sctx = crypto_blkcipher_ctx(tfm); + SKCIPHER_REQUEST_ON_STACK(req, sctx->fallback.blk); - tfm = desc->tfm; - desc->tfm = sctx->fallback.blk; + skcipher_request_set_tfm(req, sctx->fallback.blk); + skcipher_request_set_callback(req, desc->flags, NULL, NULL); + skcipher_request_set_crypt(req, src, dst, nbytes, desc->info); - ret = crypto_blkcipher_decrypt_iv(desc, dst, src, nbytes); + ret = crypto_skcipher_decrypt(req); - desc->tfm = tfm; + skcipher_request_zero(req); return ret; } @@ -272,15 +207,15 @@ static int fallback_blk_enc(struct blkcipher_desc *desc, unsigned int nbytes) { unsigned int ret; - struct crypto_blkcipher *tfm; - struct s390_aes_ctx *sctx = crypto_blkcipher_ctx(desc->tfm); - - tfm = desc->tfm; - desc->tfm = sctx->fallback.blk; + struct crypto_blkcipher *tfm = desc->tfm; + struct s390_aes_ctx *sctx = crypto_blkcipher_ctx(tfm); + SKCIPHER_REQUEST_ON_STACK(req, sctx->fallback.blk); - ret = crypto_blkcipher_encrypt_iv(desc, dst, src, nbytes); + skcipher_request_set_tfm(req, sctx->fallback.blk); + skcipher_request_set_callback(req, desc->flags, NULL, NULL); + skcipher_request_set_crypt(req, src, dst, nbytes, desc->info); - desc->tfm = tfm; + ret = crypto_skcipher_encrypt(req); return ret; } @@ -288,50 +223,37 @@ static int ecb_aes_set_key(struct crypto_tfm *tfm, const u8 *in_key, unsigned int key_len) { struct s390_aes_ctx *sctx = crypto_tfm_ctx(tfm); - int ret; + unsigned long fc; - ret = need_fallback(key_len); - if (ret > 0) { - sctx->key_len = key_len; - return setkey_fallback_blk(tfm, in_key, key_len); - } + /* Pick the correct function code based on the key length */ + fc = (key_len == 16) ? CPACF_KM_AES_128 : + (key_len == 24) ? CPACF_KM_AES_192 : + (key_len == 32) ? CPACF_KM_AES_256 : 0; - switch (key_len) { - case 16: - sctx->enc = KM_AES_128_ENCRYPT; - sctx->dec = KM_AES_128_DECRYPT; - break; - case 24: - sctx->enc = KM_AES_192_ENCRYPT; - sctx->dec = KM_AES_192_DECRYPT; - break; - case 32: - sctx->enc = KM_AES_256_ENCRYPT; - sctx->dec = KM_AES_256_DECRYPT; - break; - } + /* Check if the function code is available */ + sctx->fc = (fc && cpacf_test_func(&km_functions, fc)) ? fc : 0; + if (!sctx->fc) + return setkey_fallback_blk(tfm, in_key, key_len); - return aes_set_key(tfm, in_key, key_len); + sctx->key_len = key_len; + memcpy(sctx->key, in_key, key_len); + return 0; } -static int ecb_aes_crypt(struct blkcipher_desc *desc, long func, void *param, +static int ecb_aes_crypt(struct blkcipher_desc *desc, unsigned long modifier, struct blkcipher_walk *walk) { - int ret = blkcipher_walk_virt(desc, walk); - unsigned int nbytes; + struct s390_aes_ctx *sctx = crypto_blkcipher_ctx(desc->tfm); + unsigned int nbytes, n; + int ret; - while ((nbytes = walk->nbytes)) { + ret = blkcipher_walk_virt(desc, walk); + while ((nbytes = walk->nbytes) >= AES_BLOCK_SIZE) { /* only use complete blocks */ - unsigned int n = nbytes & ~(AES_BLOCK_SIZE - 1); - u8 *out = walk->dst.virt.addr; - u8 *in = walk->src.virt.addr; - - ret = crypt_s390_km(func, param, out, in, n); - if (ret < 0 || ret != n) - return -EIO; - - nbytes &= AES_BLOCK_SIZE - 1; - ret = blkcipher_walk_done(desc, walk, nbytes); + n = nbytes & ~(AES_BLOCK_SIZE - 1); + cpacf_km(sctx->fc | modifier, sctx->key, + walk->dst.virt.addr, walk->src.virt.addr, n); + ret = blkcipher_walk_done(desc, walk, nbytes - n); } return ret; @@ -344,11 +266,11 @@ static int ecb_aes_encrypt(struct blkcipher_desc *desc, struct s390_aes_ctx *sctx = crypto_blkcipher_ctx(desc->tfm); struct blkcipher_walk walk; - if (unlikely(need_fallback(sctx->key_len))) + if (unlikely(!sctx->fc)) return fallback_blk_enc(desc, dst, src, nbytes); blkcipher_walk_init(&walk, dst, src, nbytes); - return ecb_aes_crypt(desc, sctx->enc, sctx->key, &walk); + return ecb_aes_crypt(desc, 0, &walk); } static int ecb_aes_decrypt(struct blkcipher_desc *desc, @@ -358,11 +280,11 @@ static int ecb_aes_decrypt(struct blkcipher_desc *desc, struct s390_aes_ctx *sctx = crypto_blkcipher_ctx(desc->tfm); struct blkcipher_walk walk; - if (unlikely(need_fallback(sctx->key_len))) + if (unlikely(!sctx->fc)) return fallback_blk_dec(desc, dst, src, nbytes); blkcipher_walk_init(&walk, dst, src, nbytes); - return ecb_aes_crypt(desc, sctx->dec, sctx->key, &walk); + return ecb_aes_crypt(desc, CPACF_DECRYPT, &walk); } static int fallback_init_blk(struct crypto_tfm *tfm) @@ -370,8 +292,9 @@ static int fallback_init_blk(struct crypto_tfm *tfm) const char *name = tfm->__crt_alg->cra_name; struct s390_aes_ctx *sctx = crypto_tfm_ctx(tfm); - sctx->fallback.blk = crypto_alloc_blkcipher(name, 0, - CRYPTO_ALG_ASYNC | CRYPTO_ALG_NEED_FALLBACK); + sctx->fallback.blk = crypto_alloc_skcipher(name, 0, + CRYPTO_ALG_ASYNC | + CRYPTO_ALG_NEED_FALLBACK); if (IS_ERR(sctx->fallback.blk)) { pr_err("Allocating AES fallback algorithm %s failed\n", @@ -386,14 +309,13 @@ static void fallback_exit_blk(struct crypto_tfm *tfm) { struct s390_aes_ctx *sctx = crypto_tfm_ctx(tfm); - crypto_free_blkcipher(sctx->fallback.blk); - sctx->fallback.blk = NULL; + crypto_free_skcipher(sctx->fallback.blk); } static struct crypto_alg ecb_aes_alg = { .cra_name = "ecb(aes)", .cra_driver_name = "ecb-aes-s390", - .cra_priority = CRYPT_S390_COMPOSITE_PRIORITY, + .cra_priority = 400, /* combo: aes + ecb */ .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER | CRYPTO_ALG_NEED_FALLBACK, .cra_blocksize = AES_BLOCK_SIZE, @@ -417,64 +339,45 @@ static int cbc_aes_set_key(struct crypto_tfm *tfm, const u8 *in_key, unsigned int key_len) { struct s390_aes_ctx *sctx = crypto_tfm_ctx(tfm); - int ret; + unsigned long fc; - ret = need_fallback(key_len); - if (ret > 0) { - sctx->key_len = key_len; - return setkey_fallback_blk(tfm, in_key, key_len); - } + /* Pick the correct function code based on the key length */ + fc = (key_len == 16) ? CPACF_KMC_AES_128 : + (key_len == 24) ? CPACF_KMC_AES_192 : + (key_len == 32) ? CPACF_KMC_AES_256 : 0; - switch (key_len) { - case 16: - sctx->enc = KMC_AES_128_ENCRYPT; - sctx->dec = KMC_AES_128_DECRYPT; - break; - case 24: - sctx->enc = KMC_AES_192_ENCRYPT; - sctx->dec = KMC_AES_192_DECRYPT; - break; - case 32: - sctx->enc = KMC_AES_256_ENCRYPT; - sctx->dec = KMC_AES_256_DECRYPT; - break; - } + /* Check if the function code is available */ + sctx->fc = (fc && cpacf_test_func(&kmc_functions, fc)) ? fc : 0; + if (!sctx->fc) + return setkey_fallback_blk(tfm, in_key, key_len); - return aes_set_key(tfm, in_key, key_len); + sctx->key_len = key_len; + memcpy(sctx->key, in_key, key_len); + return 0; } -static int cbc_aes_crypt(struct blkcipher_desc *desc, long func, +static int cbc_aes_crypt(struct blkcipher_desc *desc, unsigned long modifier, struct blkcipher_walk *walk) { struct s390_aes_ctx *sctx = crypto_blkcipher_ctx(desc->tfm); - int ret = blkcipher_walk_virt(desc, walk); - unsigned int nbytes = walk->nbytes; + unsigned int nbytes, n; + int ret; struct { u8 iv[AES_BLOCK_SIZE]; u8 key[AES_MAX_KEY_SIZE]; } param; - if (!nbytes) - goto out; - + ret = blkcipher_walk_virt(desc, walk); memcpy(param.iv, walk->iv, AES_BLOCK_SIZE); memcpy(param.key, sctx->key, sctx->key_len); - do { + while ((nbytes = walk->nbytes) >= AES_BLOCK_SIZE) { /* only use complete blocks */ - unsigned int n = nbytes & ~(AES_BLOCK_SIZE - 1); - u8 *out = walk->dst.virt.addr; - u8 *in = walk->src.virt.addr; - - ret = crypt_s390_kmc(func, ¶m, out, in, n); - if (ret < 0 || ret != n) - return -EIO; - - nbytes &= AES_BLOCK_SIZE - 1; - ret = blkcipher_walk_done(desc, walk, nbytes); - } while ((nbytes = walk->nbytes)); + n = nbytes & ~(AES_BLOCK_SIZE - 1); + cpacf_kmc(sctx->fc | modifier, ¶m, + walk->dst.virt.addr, walk->src.virt.addr, n); + ret = blkcipher_walk_done(desc, walk, nbytes - n); + } memcpy(walk->iv, param.iv, AES_BLOCK_SIZE); - -out: return ret; } @@ -485,11 +388,11 @@ static int cbc_aes_encrypt(struct blkcipher_desc *desc, struct s390_aes_ctx *sctx = crypto_blkcipher_ctx(desc->tfm); struct blkcipher_walk walk; - if (unlikely(need_fallback(sctx->key_len))) + if (unlikely(!sctx->fc)) return fallback_blk_enc(desc, dst, src, nbytes); blkcipher_walk_init(&walk, dst, src, nbytes); - return cbc_aes_crypt(desc, sctx->enc, &walk); + return cbc_aes_crypt(desc, 0, &walk); } static int cbc_aes_decrypt(struct blkcipher_desc *desc, @@ -499,17 +402,17 @@ static int cbc_aes_decrypt(struct blkcipher_desc *desc, struct s390_aes_ctx *sctx = crypto_blkcipher_ctx(desc->tfm); struct blkcipher_walk walk; - if (unlikely(need_fallback(sctx->key_len))) + if (unlikely(!sctx->fc)) return fallback_blk_dec(desc, dst, src, nbytes); blkcipher_walk_init(&walk, dst, src, nbytes); - return cbc_aes_crypt(desc, sctx->dec, &walk); + return cbc_aes_crypt(desc, CPACF_DECRYPT, &walk); } static struct crypto_alg cbc_aes_alg = { .cra_name = "cbc(aes)", .cra_driver_name = "cbc-aes-s390", - .cra_priority = CRYPT_S390_COMPOSITE_PRIORITY, + .cra_priority = 400, /* combo: aes + cbc */ .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER | CRYPTO_ALG_NEED_FALLBACK, .cra_blocksize = AES_BLOCK_SIZE, @@ -536,16 +439,16 @@ static int xts_fallback_setkey(struct crypto_tfm *tfm, const u8 *key, struct s390_xts_ctx *xts_ctx = crypto_tfm_ctx(tfm); unsigned int ret; - xts_ctx->fallback->base.crt_flags &= ~CRYPTO_TFM_REQ_MASK; - xts_ctx->fallback->base.crt_flags |= (tfm->crt_flags & - CRYPTO_TFM_REQ_MASK); + crypto_skcipher_clear_flags(xts_ctx->fallback, CRYPTO_TFM_REQ_MASK); + crypto_skcipher_set_flags(xts_ctx->fallback, tfm->crt_flags & + CRYPTO_TFM_REQ_MASK); + + ret = crypto_skcipher_setkey(xts_ctx->fallback, key, len); + + tfm->crt_flags &= ~CRYPTO_TFM_RES_MASK; + tfm->crt_flags |= crypto_skcipher_get_flags(xts_ctx->fallback) & + CRYPTO_TFM_RES_MASK; - ret = crypto_blkcipher_setkey(xts_ctx->fallback, key, len); - if (ret) { - tfm->crt_flags &= ~CRYPTO_TFM_RES_MASK; - tfm->crt_flags |= (xts_ctx->fallback->base.crt_flags & - CRYPTO_TFM_RES_MASK); - } return ret; } @@ -553,16 +456,18 @@ static int xts_fallback_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, struct scatterlist *src, unsigned int nbytes) { - struct s390_xts_ctx *xts_ctx = crypto_blkcipher_ctx(desc->tfm); - struct crypto_blkcipher *tfm; + struct crypto_blkcipher *tfm = desc->tfm; + struct s390_xts_ctx *xts_ctx = crypto_blkcipher_ctx(tfm); + SKCIPHER_REQUEST_ON_STACK(req, xts_ctx->fallback); unsigned int ret; - tfm = desc->tfm; - desc->tfm = xts_ctx->fallback; + skcipher_request_set_tfm(req, xts_ctx->fallback); + skcipher_request_set_callback(req, desc->flags, NULL, NULL); + skcipher_request_set_crypt(req, src, dst, nbytes, desc->info); - ret = crypto_blkcipher_decrypt_iv(desc, dst, src, nbytes); + ret = crypto_skcipher_decrypt(req); - desc->tfm = tfm; + skcipher_request_zero(req); return ret; } @@ -570,16 +475,18 @@ static int xts_fallback_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, struct scatterlist *src, unsigned int nbytes) { - struct s390_xts_ctx *xts_ctx = crypto_blkcipher_ctx(desc->tfm); - struct crypto_blkcipher *tfm; + struct crypto_blkcipher *tfm = desc->tfm; + struct s390_xts_ctx *xts_ctx = crypto_blkcipher_ctx(tfm); + SKCIPHER_REQUEST_ON_STACK(req, xts_ctx->fallback); unsigned int ret; - tfm = desc->tfm; - desc->tfm = xts_ctx->fallback; + skcipher_request_set_tfm(req, xts_ctx->fallback); + skcipher_request_set_callback(req, desc->flags, NULL, NULL); + skcipher_request_set_crypt(req, src, dst, nbytes, desc->info); - ret = crypto_blkcipher_encrypt_iv(desc, dst, src, nbytes); + ret = crypto_skcipher_encrypt(req); - desc->tfm = tfm; + skcipher_request_zero(req); return ret; } @@ -587,82 +494,67 @@ static int xts_aes_set_key(struct crypto_tfm *tfm, const u8 *in_key, unsigned int key_len) { struct s390_xts_ctx *xts_ctx = crypto_tfm_ctx(tfm); - u32 *flags = &tfm->crt_flags; + unsigned long fc; int err; err = xts_check_key(tfm, in_key, key_len); if (err) return err; - switch (key_len) { - case 32: - xts_ctx->enc = KM_XTS_128_ENCRYPT; - xts_ctx->dec = KM_XTS_128_DECRYPT; - memcpy(xts_ctx->key + 16, in_key, 16); - memcpy(xts_ctx->pcc_key + 16, in_key + 16, 16); - break; - case 48: - xts_ctx->enc = 0; - xts_ctx->dec = 0; - xts_fallback_setkey(tfm, in_key, key_len); - break; - case 64: - xts_ctx->enc = KM_XTS_256_ENCRYPT; - xts_ctx->dec = KM_XTS_256_DECRYPT; - memcpy(xts_ctx->key, in_key, 32); - memcpy(xts_ctx->pcc_key, in_key + 32, 32); - break; - default: - *flags |= CRYPTO_TFM_RES_BAD_KEY_LEN; - return -EINVAL; - } + /* Pick the correct function code based on the key length */ + fc = (key_len == 32) ? CPACF_KM_XTS_128 : + (key_len == 64) ? CPACF_KM_XTS_256 : 0; + + /* Check if the function code is available */ + xts_ctx->fc = (fc && cpacf_test_func(&km_functions, fc)) ? fc : 0; + if (!xts_ctx->fc) + return xts_fallback_setkey(tfm, in_key, key_len); + + /* Split the XTS key into the two subkeys */ + key_len = key_len / 2; xts_ctx->key_len = key_len; + memcpy(xts_ctx->key, in_key, key_len); + memcpy(xts_ctx->pcc_key, in_key + key_len, key_len); return 0; } -static int xts_aes_crypt(struct blkcipher_desc *desc, long func, - struct s390_xts_ctx *xts_ctx, +static int xts_aes_crypt(struct blkcipher_desc *desc, unsigned long modifier, struct blkcipher_walk *walk) { - unsigned int offset = (xts_ctx->key_len >> 1) & 0x10; - int ret = blkcipher_walk_virt(desc, walk); - unsigned int nbytes = walk->nbytes; - unsigned int n; - u8 *in, *out; - struct pcc_param pcc_param; + struct s390_xts_ctx *xts_ctx = crypto_blkcipher_ctx(desc->tfm); + unsigned int offset, nbytes, n; + int ret; + struct { + u8 key[32]; + u8 tweak[16]; + u8 block[16]; + u8 bit[16]; + u8 xts[16]; + } pcc_param; struct { u8 key[32]; u8 init[16]; } xts_param; - if (!nbytes) - goto out; - + ret = blkcipher_walk_virt(desc, walk); + offset = xts_ctx->key_len & 0x10; memset(pcc_param.block, 0, sizeof(pcc_param.block)); memset(pcc_param.bit, 0, sizeof(pcc_param.bit)); memset(pcc_param.xts, 0, sizeof(pcc_param.xts)); memcpy(pcc_param.tweak, walk->iv, sizeof(pcc_param.tweak)); - memcpy(pcc_param.key, xts_ctx->pcc_key, 32); - ret = crypt_s390_pcc(func, &pcc_param.key[offset]); - if (ret < 0) - return -EIO; + memcpy(pcc_param.key + offset, xts_ctx->pcc_key, xts_ctx->key_len); + cpacf_pcc(xts_ctx->fc, pcc_param.key + offset); - memcpy(xts_param.key, xts_ctx->key, 32); + memcpy(xts_param.key + offset, xts_ctx->key, xts_ctx->key_len); memcpy(xts_param.init, pcc_param.xts, 16); - do { + + while ((nbytes = walk->nbytes) >= AES_BLOCK_SIZE) { /* only use complete blocks */ n = nbytes & ~(AES_BLOCK_SIZE - 1); - out = walk->dst.virt.addr; - in = walk->src.virt.addr; - - ret = crypt_s390_km(func, &xts_param.key[offset], out, in, n); - if (ret < 0 || ret != n) - return -EIO; - - nbytes &= AES_BLOCK_SIZE - 1; - ret = blkcipher_walk_done(desc, walk, nbytes); - } while ((nbytes = walk->nbytes)); -out: + cpacf_km(xts_ctx->fc | modifier, xts_param.key + offset, + walk->dst.virt.addr, walk->src.virt.addr, n); + ret = blkcipher_walk_done(desc, walk, nbytes - n); + } return ret; } @@ -673,11 +565,11 @@ static int xts_aes_encrypt(struct blkcipher_desc *desc, struct s390_xts_ctx *xts_ctx = crypto_blkcipher_ctx(desc->tfm); struct blkcipher_walk walk; - if (unlikely(xts_ctx->key_len == 48)) + if (unlikely(!xts_ctx->fc)) return xts_fallback_encrypt(desc, dst, src, nbytes); blkcipher_walk_init(&walk, dst, src, nbytes); - return xts_aes_crypt(desc, xts_ctx->enc, xts_ctx, &walk); + return xts_aes_crypt(desc, 0, &walk); } static int xts_aes_decrypt(struct blkcipher_desc *desc, @@ -687,11 +579,11 @@ static int xts_aes_decrypt(struct blkcipher_desc *desc, struct s390_xts_ctx *xts_ctx = crypto_blkcipher_ctx(desc->tfm); struct blkcipher_walk walk; - if (unlikely(xts_ctx->key_len == 48)) + if (unlikely(!xts_ctx->fc)) return xts_fallback_decrypt(desc, dst, src, nbytes); blkcipher_walk_init(&walk, dst, src, nbytes); - return xts_aes_crypt(desc, xts_ctx->dec, xts_ctx, &walk); + return xts_aes_crypt(desc, CPACF_DECRYPT, &walk); } static int xts_fallback_init(struct crypto_tfm *tfm) @@ -699,8 +591,9 @@ static int xts_fallback_init(struct crypto_tfm *tfm) const char *name = tfm->__crt_alg->cra_name; struct s390_xts_ctx *xts_ctx = crypto_tfm_ctx(tfm); - xts_ctx->fallback = crypto_alloc_blkcipher(name, 0, - CRYPTO_ALG_ASYNC | CRYPTO_ALG_NEED_FALLBACK); + xts_ctx->fallback = crypto_alloc_skcipher(name, 0, + CRYPTO_ALG_ASYNC | + CRYPTO_ALG_NEED_FALLBACK); if (IS_ERR(xts_ctx->fallback)) { pr_err("Allocating XTS fallback algorithm %s failed\n", @@ -714,14 +607,13 @@ static void xts_fallback_exit(struct crypto_tfm *tfm) { struct s390_xts_ctx *xts_ctx = crypto_tfm_ctx(tfm); - crypto_free_blkcipher(xts_ctx->fallback); - xts_ctx->fallback = NULL; + crypto_free_skcipher(xts_ctx->fallback); } static struct crypto_alg xts_aes_alg = { .cra_name = "xts(aes)", .cra_driver_name = "xts-aes-s390", - .cra_priority = CRYPT_S390_COMPOSITE_PRIORITY, + .cra_priority = 400, /* combo: aes + xts */ .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER | CRYPTO_ALG_NEED_FALLBACK, .cra_blocksize = AES_BLOCK_SIZE, @@ -742,109 +634,79 @@ static struct crypto_alg xts_aes_alg = { } }; -static int xts_aes_alg_reg; - static int ctr_aes_set_key(struct crypto_tfm *tfm, const u8 *in_key, unsigned int key_len) { struct s390_aes_ctx *sctx = crypto_tfm_ctx(tfm); + unsigned long fc; - switch (key_len) { - case 16: - sctx->enc = KMCTR_AES_128_ENCRYPT; - sctx->dec = KMCTR_AES_128_DECRYPT; - break; - case 24: - sctx->enc = KMCTR_AES_192_ENCRYPT; - sctx->dec = KMCTR_AES_192_DECRYPT; - break; - case 32: - sctx->enc = KMCTR_AES_256_ENCRYPT; - sctx->dec = KMCTR_AES_256_DECRYPT; - break; - } + /* Pick the correct function code based on the key length */ + fc = (key_len == 16) ? CPACF_KMCTR_AES_128 : + (key_len == 24) ? CPACF_KMCTR_AES_192 : + (key_len == 32) ? CPACF_KMCTR_AES_256 : 0; + + /* Check if the function code is available */ + sctx->fc = (fc && cpacf_test_func(&kmctr_functions, fc)) ? fc : 0; + if (!sctx->fc) + return setkey_fallback_blk(tfm, in_key, key_len); - return aes_set_key(tfm, in_key, key_len); + sctx->key_len = key_len; + memcpy(sctx->key, in_key, key_len); + return 0; } -static unsigned int __ctrblk_init(u8 *ctrptr, unsigned int nbytes) +static unsigned int __ctrblk_init(u8 *ctrptr, u8 *iv, unsigned int nbytes) { unsigned int i, n; /* only use complete blocks, max. PAGE_SIZE */ + memcpy(ctrptr, iv, AES_BLOCK_SIZE); n = (nbytes > PAGE_SIZE) ? PAGE_SIZE : nbytes & ~(AES_BLOCK_SIZE - 1); - for (i = AES_BLOCK_SIZE; i < n; i += AES_BLOCK_SIZE) { - memcpy(ctrptr + i, ctrptr + i - AES_BLOCK_SIZE, - AES_BLOCK_SIZE); - crypto_inc(ctrptr + i, AES_BLOCK_SIZE); + for (i = (n / AES_BLOCK_SIZE) - 1; i > 0; i--) { + memcpy(ctrptr + AES_BLOCK_SIZE, ctrptr, AES_BLOCK_SIZE); + crypto_inc(ctrptr + AES_BLOCK_SIZE, AES_BLOCK_SIZE); + ctrptr += AES_BLOCK_SIZE; } return n; } -static int ctr_aes_crypt(struct blkcipher_desc *desc, long func, - struct s390_aes_ctx *sctx, struct blkcipher_walk *walk) +static int ctr_aes_crypt(struct blkcipher_desc *desc, unsigned long modifier, + struct blkcipher_walk *walk) { - int ret = blkcipher_walk_virt_block(desc, walk, AES_BLOCK_SIZE); + struct s390_aes_ctx *sctx = crypto_blkcipher_ctx(desc->tfm); + u8 buf[AES_BLOCK_SIZE], *ctrptr; unsigned int n, nbytes; - u8 buf[AES_BLOCK_SIZE], ctrbuf[AES_BLOCK_SIZE]; - u8 *out, *in, *ctrptr = ctrbuf; - - if (!walk->nbytes) - return ret; + int ret, locked; - if (spin_trylock(&ctrblk_lock)) - ctrptr = ctrblk; + locked = spin_trylock(&ctrblk_lock); - memcpy(ctrptr, walk->iv, AES_BLOCK_SIZE); + ret = blkcipher_walk_virt_block(desc, walk, AES_BLOCK_SIZE); while ((nbytes = walk->nbytes) >= AES_BLOCK_SIZE) { - out = walk->dst.virt.addr; - in = walk->src.virt.addr; - while (nbytes >= AES_BLOCK_SIZE) { - if (ctrptr == ctrblk) - n = __ctrblk_init(ctrptr, nbytes); - else - n = AES_BLOCK_SIZE; - ret = crypt_s390_kmctr(func, sctx->key, out, in, - n, ctrptr); - if (ret < 0 || ret != n) { - if (ctrptr == ctrblk) - spin_unlock(&ctrblk_lock); - return -EIO; - } - if (n > AES_BLOCK_SIZE) - memcpy(ctrptr, ctrptr + n - AES_BLOCK_SIZE, - AES_BLOCK_SIZE); - crypto_inc(ctrptr, AES_BLOCK_SIZE); - out += n; - in += n; - nbytes -= n; - } - ret = blkcipher_walk_done(desc, walk, nbytes); - } - if (ctrptr == ctrblk) { - if (nbytes) - memcpy(ctrbuf, ctrptr, AES_BLOCK_SIZE); - else - memcpy(walk->iv, ctrptr, AES_BLOCK_SIZE); + n = AES_BLOCK_SIZE; + if (nbytes >= 2*AES_BLOCK_SIZE && locked) + n = __ctrblk_init(ctrblk, walk->iv, nbytes); + ctrptr = (n > AES_BLOCK_SIZE) ? ctrblk : walk->iv; + cpacf_kmctr(sctx->fc | modifier, sctx->key, + walk->dst.virt.addr, walk->src.virt.addr, + n, ctrptr); + if (ctrptr == ctrblk) + memcpy(walk->iv, ctrptr + n - AES_BLOCK_SIZE, + AES_BLOCK_SIZE); + crypto_inc(walk->iv, AES_BLOCK_SIZE); + ret = blkcipher_walk_done(desc, walk, nbytes - n); + } + if (locked) spin_unlock(&ctrblk_lock); - } else { - if (!nbytes) - memcpy(walk->iv, ctrptr, AES_BLOCK_SIZE); - } /* * final block may be < AES_BLOCK_SIZE, copy only nbytes */ if (nbytes) { - out = walk->dst.virt.addr; - in = walk->src.virt.addr; - ret = crypt_s390_kmctr(func, sctx->key, buf, in, - AES_BLOCK_SIZE, ctrbuf); - if (ret < 0 || ret != AES_BLOCK_SIZE) - return -EIO; - memcpy(out, buf, nbytes); - crypto_inc(ctrbuf, AES_BLOCK_SIZE); + cpacf_kmctr(sctx->fc | modifier, sctx->key, + buf, walk->src.virt.addr, + AES_BLOCK_SIZE, walk->iv); + memcpy(walk->dst.virt.addr, buf, nbytes); + crypto_inc(walk->iv, AES_BLOCK_SIZE); ret = blkcipher_walk_done(desc, walk, 0); - memcpy(walk->iv, ctrbuf, AES_BLOCK_SIZE); } return ret; @@ -857,8 +719,11 @@ static int ctr_aes_encrypt(struct blkcipher_desc *desc, struct s390_aes_ctx *sctx = crypto_blkcipher_ctx(desc->tfm); struct blkcipher_walk walk; + if (unlikely(!sctx->fc)) + return fallback_blk_enc(desc, dst, src, nbytes); + blkcipher_walk_init(&walk, dst, src, nbytes); - return ctr_aes_crypt(desc, sctx->enc, sctx, &walk); + return ctr_aes_crypt(desc, 0, &walk); } static int ctr_aes_decrypt(struct blkcipher_desc *desc, @@ -868,19 +733,25 @@ static int ctr_aes_decrypt(struct blkcipher_desc *desc, struct s390_aes_ctx *sctx = crypto_blkcipher_ctx(desc->tfm); struct blkcipher_walk walk; + if (unlikely(!sctx->fc)) + return fallback_blk_dec(desc, dst, src, nbytes); + blkcipher_walk_init(&walk, dst, src, nbytes); - return ctr_aes_crypt(desc, sctx->dec, sctx, &walk); + return ctr_aes_crypt(desc, CPACF_DECRYPT, &walk); } static struct crypto_alg ctr_aes_alg = { .cra_name = "ctr(aes)", .cra_driver_name = "ctr-aes-s390", - .cra_priority = CRYPT_S390_COMPOSITE_PRIORITY, - .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, + .cra_priority = 400, /* combo: aes + ctr */ + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER | + CRYPTO_ALG_NEED_FALLBACK, .cra_blocksize = 1, .cra_ctxsize = sizeof(struct s390_aes_ctx), .cra_type = &crypto_blkcipher_type, .cra_module = THIS_MODULE, + .cra_init = fallback_init_blk, + .cra_exit = fallback_exit_blk, .cra_u = { .blkcipher = { .min_keysize = AES_MIN_KEY_SIZE, @@ -893,94 +764,79 @@ static struct crypto_alg ctr_aes_alg = { } }; -static int ctr_aes_alg_reg; +static struct crypto_alg *aes_s390_algs_ptr[5]; +static int aes_s390_algs_num; + +static int aes_s390_register_alg(struct crypto_alg *alg) +{ + int ret; + + ret = crypto_register_alg(alg); + if (!ret) + aes_s390_algs_ptr[aes_s390_algs_num++] = alg; + return ret; +} + +static void aes_s390_fini(void) +{ + while (aes_s390_algs_num--) + crypto_unregister_alg(aes_s390_algs_ptr[aes_s390_algs_num]); + if (ctrblk) + free_page((unsigned long) ctrblk); +} static int __init aes_s390_init(void) { int ret; - if (crypt_s390_func_available(KM_AES_128_ENCRYPT, CRYPT_S390_MSA)) - keylen_flag |= AES_KEYLEN_128; - if (crypt_s390_func_available(KM_AES_192_ENCRYPT, CRYPT_S390_MSA)) - keylen_flag |= AES_KEYLEN_192; - if (crypt_s390_func_available(KM_AES_256_ENCRYPT, CRYPT_S390_MSA)) - keylen_flag |= AES_KEYLEN_256; - - if (!keylen_flag) - return -EOPNOTSUPP; - - /* z9 109 and z9 BC/EC only support 128 bit key length */ - if (keylen_flag == AES_KEYLEN_128) - pr_info("AES hardware acceleration is only available for" - " 128-bit keys\n"); - - ret = crypto_register_alg(&aes_alg); - if (ret) - goto aes_err; - - ret = crypto_register_alg(&ecb_aes_alg); - if (ret) - goto ecb_aes_err; - - ret = crypto_register_alg(&cbc_aes_alg); - if (ret) - goto cbc_aes_err; - - if (crypt_s390_func_available(KM_XTS_128_ENCRYPT, - CRYPT_S390_MSA | CRYPT_S390_MSA4) && - crypt_s390_func_available(KM_XTS_256_ENCRYPT, - CRYPT_S390_MSA | CRYPT_S390_MSA4)) { - ret = crypto_register_alg(&xts_aes_alg); + /* Query available functions for KM, KMC and KMCTR */ + cpacf_query(CPACF_KM, &km_functions); + cpacf_query(CPACF_KMC, &kmc_functions); + cpacf_query(CPACF_KMCTR, &kmctr_functions); + + if (cpacf_test_func(&km_functions, CPACF_KM_AES_128) || + cpacf_test_func(&km_functions, CPACF_KM_AES_192) || + cpacf_test_func(&km_functions, CPACF_KM_AES_256)) { + ret = aes_s390_register_alg(&aes_alg); if (ret) - goto xts_aes_err; - xts_aes_alg_reg = 1; + goto out_err; + ret = aes_s390_register_alg(&ecb_aes_alg); + if (ret) + goto out_err; + } + + if (cpacf_test_func(&kmc_functions, CPACF_KMC_AES_128) || + cpacf_test_func(&kmc_functions, CPACF_KMC_AES_192) || + cpacf_test_func(&kmc_functions, CPACF_KMC_AES_256)) { + ret = aes_s390_register_alg(&cbc_aes_alg); + if (ret) + goto out_err; } - if (crypt_s390_func_available(KMCTR_AES_128_ENCRYPT, - CRYPT_S390_MSA | CRYPT_S390_MSA4) && - crypt_s390_func_available(KMCTR_AES_192_ENCRYPT, - CRYPT_S390_MSA | CRYPT_S390_MSA4) && - crypt_s390_func_available(KMCTR_AES_256_ENCRYPT, - CRYPT_S390_MSA | CRYPT_S390_MSA4)) { + if (cpacf_test_func(&km_functions, CPACF_KM_XTS_128) || + cpacf_test_func(&km_functions, CPACF_KM_XTS_256)) { + ret = aes_s390_register_alg(&xts_aes_alg); + if (ret) + goto out_err; + } + + if (cpacf_test_func(&kmctr_functions, CPACF_KMCTR_AES_128) || + cpacf_test_func(&kmctr_functions, CPACF_KMCTR_AES_192) || + cpacf_test_func(&kmctr_functions, CPACF_KMCTR_AES_256)) { ctrblk = (u8 *) __get_free_page(GFP_KERNEL); if (!ctrblk) { ret = -ENOMEM; - goto ctr_aes_err; - } - ret = crypto_register_alg(&ctr_aes_alg); - if (ret) { - free_page((unsigned long) ctrblk); - goto ctr_aes_err; + goto out_err; } - ctr_aes_alg_reg = 1; + ret = aes_s390_register_alg(&ctr_aes_alg); + if (ret) + goto out_err; } -out: + return 0; +out_err: + aes_s390_fini(); return ret; - -ctr_aes_err: - crypto_unregister_alg(&xts_aes_alg); -xts_aes_err: - crypto_unregister_alg(&cbc_aes_alg); -cbc_aes_err: - crypto_unregister_alg(&ecb_aes_alg); -ecb_aes_err: - crypto_unregister_alg(&aes_alg); -aes_err: - goto out; -} - -static void __exit aes_s390_fini(void) -{ - if (ctr_aes_alg_reg) { - crypto_unregister_alg(&ctr_aes_alg); - free_page((unsigned long) ctrblk); - } - if (xts_aes_alg_reg) - crypto_unregister_alg(&xts_aes_alg); - crypto_unregister_alg(&cbc_aes_alg); - crypto_unregister_alg(&ecb_aes_alg); - crypto_unregister_alg(&aes_alg); } module_cpu_feature_match(MSA, aes_s390_init); diff --git a/arch/s390/crypto/crc32-vx.c b/arch/s390/crypto/crc32-vx.c new file mode 100644 index 000000000000..992e630c227b --- /dev/null +++ b/arch/s390/crypto/crc32-vx.c @@ -0,0 +1,310 @@ +/* + * Crypto-API module for CRC-32 algorithms implemented with the + * z/Architecture Vector Extension Facility. + * + * Copyright IBM Corp. 2015 + * Author(s): Hendrik Brueckner <brueckner@linux.vnet.ibm.com> + */ +#define KMSG_COMPONENT "crc32-vx" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + +#include <linux/module.h> +#include <linux/cpufeature.h> +#include <linux/crc32.h> +#include <crypto/internal/hash.h> +#include <asm/fpu/api.h> + + +#define CRC32_BLOCK_SIZE 1 +#define CRC32_DIGEST_SIZE 4 + +#define VX_MIN_LEN 64 +#define VX_ALIGNMENT 16L +#define VX_ALIGN_MASK (VX_ALIGNMENT - 1) + +struct crc_ctx { + u32 key; +}; + +struct crc_desc_ctx { + u32 crc; +}; + +/* Prototypes for functions in assembly files */ +u32 crc32_le_vgfm_16(u32 crc, unsigned char const *buf, size_t size); +u32 crc32_be_vgfm_16(u32 crc, unsigned char const *buf, size_t size); +u32 crc32c_le_vgfm_16(u32 crc, unsigned char const *buf, size_t size); + +/* + * DEFINE_CRC32_VX() - Define a CRC-32 function using the vector extension + * + * Creates a function to perform a particular CRC-32 computation. Depending + * on the message buffer, the hardware-accelerated or software implementation + * is used. Note that the message buffer is aligned to improve fetch + * operations of VECTOR LOAD MULTIPLE instructions. + * + */ +#define DEFINE_CRC32_VX(___fname, ___crc32_vx, ___crc32_sw) \ + static u32 __pure ___fname(u32 crc, \ + unsigned char const *data, size_t datalen) \ + { \ + struct kernel_fpu vxstate; \ + unsigned long prealign, aligned, remaining; \ + \ + if (datalen < VX_MIN_LEN + VX_ALIGN_MASK) \ + return ___crc32_sw(crc, data, datalen); \ + \ + if ((unsigned long)data & VX_ALIGN_MASK) { \ + prealign = VX_ALIGNMENT - \ + ((unsigned long)data & VX_ALIGN_MASK); \ + datalen -= prealign; \ + crc = ___crc32_sw(crc, data, prealign); \ + data = (void *)((unsigned long)data + prealign); \ + } \ + \ + aligned = datalen & ~VX_ALIGN_MASK; \ + remaining = datalen & VX_ALIGN_MASK; \ + \ + kernel_fpu_begin(&vxstate, KERNEL_VXR_LOW); \ + crc = ___crc32_vx(crc, data, aligned); \ + kernel_fpu_end(&vxstate, KERNEL_VXR_LOW); \ + \ + if (remaining) \ + crc = ___crc32_sw(crc, data + aligned, remaining); \ + \ + return crc; \ + } + +DEFINE_CRC32_VX(crc32_le_vx, crc32_le_vgfm_16, crc32_le) +DEFINE_CRC32_VX(crc32_be_vx, crc32_be_vgfm_16, crc32_be) +DEFINE_CRC32_VX(crc32c_le_vx, crc32c_le_vgfm_16, __crc32c_le) + + +static int crc32_vx_cra_init_zero(struct crypto_tfm *tfm) +{ + struct crc_ctx *mctx = crypto_tfm_ctx(tfm); + + mctx->key = 0; + return 0; +} + +static int crc32_vx_cra_init_invert(struct crypto_tfm *tfm) +{ + struct crc_ctx *mctx = crypto_tfm_ctx(tfm); + + mctx->key = ~0; + return 0; +} + +static int crc32_vx_init(struct shash_desc *desc) +{ + struct crc_ctx *mctx = crypto_shash_ctx(desc->tfm); + struct crc_desc_ctx *ctx = shash_desc_ctx(desc); + + ctx->crc = mctx->key; + return 0; +} + +static int crc32_vx_setkey(struct crypto_shash *tfm, const u8 *newkey, + unsigned int newkeylen) +{ + struct crc_ctx *mctx = crypto_shash_ctx(tfm); + + if (newkeylen != sizeof(mctx->key)) { + crypto_shash_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN); + return -EINVAL; + } + mctx->key = le32_to_cpu(*(__le32 *)newkey); + return 0; +} + +static int crc32be_vx_setkey(struct crypto_shash *tfm, const u8 *newkey, + unsigned int newkeylen) +{ + struct crc_ctx *mctx = crypto_shash_ctx(tfm); + + if (newkeylen != sizeof(mctx->key)) { + crypto_shash_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN); + return -EINVAL; + } + mctx->key = be32_to_cpu(*(__be32 *)newkey); + return 0; +} + +static int crc32le_vx_final(struct shash_desc *desc, u8 *out) +{ + struct crc_desc_ctx *ctx = shash_desc_ctx(desc); + + *(__le32 *)out = cpu_to_le32p(&ctx->crc); + return 0; +} + +static int crc32be_vx_final(struct shash_desc *desc, u8 *out) +{ + struct crc_desc_ctx *ctx = shash_desc_ctx(desc); + + *(__be32 *)out = cpu_to_be32p(&ctx->crc); + return 0; +} + +static int crc32c_vx_final(struct shash_desc *desc, u8 *out) +{ + struct crc_desc_ctx *ctx = shash_desc_ctx(desc); + + /* + * Perform a final XOR with 0xFFFFFFFF to be in sync + * with the generic crc32c shash implementation. + */ + *(__le32 *)out = ~cpu_to_le32p(&ctx->crc); + return 0; +} + +static int __crc32le_vx_finup(u32 *crc, const u8 *data, unsigned int len, + u8 *out) +{ + *(__le32 *)out = cpu_to_le32(crc32_le_vx(*crc, data, len)); + return 0; +} + +static int __crc32be_vx_finup(u32 *crc, const u8 *data, unsigned int len, + u8 *out) +{ + *(__be32 *)out = cpu_to_be32(crc32_be_vx(*crc, data, len)); + return 0; +} + +static int __crc32c_vx_finup(u32 *crc, const u8 *data, unsigned int len, + u8 *out) +{ + /* + * Perform a final XOR with 0xFFFFFFFF to be in sync + * with the generic crc32c shash implementation. + */ + *(__le32 *)out = ~cpu_to_le32(crc32c_le_vx(*crc, data, len)); + return 0; +} + + +#define CRC32_VX_FINUP(alg, func) \ + static int alg ## _vx_finup(struct shash_desc *desc, const u8 *data, \ + unsigned int datalen, u8 *out) \ + { \ + return __ ## alg ## _vx_finup(shash_desc_ctx(desc), \ + data, datalen, out); \ + } + +CRC32_VX_FINUP(crc32le, crc32_le_vx) +CRC32_VX_FINUP(crc32be, crc32_be_vx) +CRC32_VX_FINUP(crc32c, crc32c_le_vx) + +#define CRC32_VX_DIGEST(alg, func) \ + static int alg ## _vx_digest(struct shash_desc *desc, const u8 *data, \ + unsigned int len, u8 *out) \ + { \ + return __ ## alg ## _vx_finup(crypto_shash_ctx(desc->tfm), \ + data, len, out); \ + } + +CRC32_VX_DIGEST(crc32le, crc32_le_vx) +CRC32_VX_DIGEST(crc32be, crc32_be_vx) +CRC32_VX_DIGEST(crc32c, crc32c_le_vx) + +#define CRC32_VX_UPDATE(alg, func) \ + static int alg ## _vx_update(struct shash_desc *desc, const u8 *data, \ + unsigned int datalen) \ + { \ + struct crc_desc_ctx *ctx = shash_desc_ctx(desc); \ + ctx->crc = func(ctx->crc, data, datalen); \ + return 0; \ + } + +CRC32_VX_UPDATE(crc32le, crc32_le_vx) +CRC32_VX_UPDATE(crc32be, crc32_be_vx) +CRC32_VX_UPDATE(crc32c, crc32c_le_vx) + + +static struct shash_alg crc32_vx_algs[] = { + /* CRC-32 LE */ + { + .init = crc32_vx_init, + .setkey = crc32_vx_setkey, + .update = crc32le_vx_update, + .final = crc32le_vx_final, + .finup = crc32le_vx_finup, + .digest = crc32le_vx_digest, + .descsize = sizeof(struct crc_desc_ctx), + .digestsize = CRC32_DIGEST_SIZE, + .base = { + .cra_name = "crc32", + .cra_driver_name = "crc32-vx", + .cra_priority = 200, + .cra_blocksize = CRC32_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct crc_ctx), + .cra_module = THIS_MODULE, + .cra_init = crc32_vx_cra_init_zero, + }, + }, + /* CRC-32 BE */ + { + .init = crc32_vx_init, + .setkey = crc32be_vx_setkey, + .update = crc32be_vx_update, + .final = crc32be_vx_final, + .finup = crc32be_vx_finup, + .digest = crc32be_vx_digest, + .descsize = sizeof(struct crc_desc_ctx), + .digestsize = CRC32_DIGEST_SIZE, + .base = { + .cra_name = "crc32be", + .cra_driver_name = "crc32be-vx", + .cra_priority = 200, + .cra_blocksize = CRC32_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct crc_ctx), + .cra_module = THIS_MODULE, + .cra_init = crc32_vx_cra_init_zero, + }, + }, + /* CRC-32C LE */ + { + .init = crc32_vx_init, + .setkey = crc32_vx_setkey, + .update = crc32c_vx_update, + .final = crc32c_vx_final, + .finup = crc32c_vx_finup, + .digest = crc32c_vx_digest, + .descsize = sizeof(struct crc_desc_ctx), + .digestsize = CRC32_DIGEST_SIZE, + .base = { + .cra_name = "crc32c", + .cra_driver_name = "crc32c-vx", + .cra_priority = 200, + .cra_blocksize = CRC32_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct crc_ctx), + .cra_module = THIS_MODULE, + .cra_init = crc32_vx_cra_init_invert, + }, + }, +}; + + +static int __init crc_vx_mod_init(void) +{ + return crypto_register_shashes(crc32_vx_algs, + ARRAY_SIZE(crc32_vx_algs)); +} + +static void __exit crc_vx_mod_exit(void) +{ + crypto_unregister_shashes(crc32_vx_algs, ARRAY_SIZE(crc32_vx_algs)); +} + +module_cpu_feature_match(VXRS, crc_vx_mod_init); +module_exit(crc_vx_mod_exit); + +MODULE_AUTHOR("Hendrik Brueckner <brueckner@linux.vnet.ibm.com>"); +MODULE_LICENSE("GPL"); + +MODULE_ALIAS_CRYPTO("crc32"); +MODULE_ALIAS_CRYPTO("crc32-vx"); +MODULE_ALIAS_CRYPTO("crc32c"); +MODULE_ALIAS_CRYPTO("crc32c-vx"); diff --git a/arch/s390/crypto/crc32be-vx.S b/arch/s390/crypto/crc32be-vx.S new file mode 100644 index 000000000000..8013989cd2e5 --- /dev/null +++ b/arch/s390/crypto/crc32be-vx.S @@ -0,0 +1,207 @@ +/* + * Hardware-accelerated CRC-32 variants for Linux on z Systems + * + * Use the z/Architecture Vector Extension Facility to accelerate the + * computing of CRC-32 checksums. + * + * This CRC-32 implementation algorithm processes the most-significant + * bit first (BE). + * + * Copyright IBM Corp. 2015 + * Author(s): Hendrik Brueckner <brueckner@linux.vnet.ibm.com> + */ + +#include <linux/linkage.h> +#include <asm/vx-insn.h> + +/* Vector register range containing CRC-32 constants */ +#define CONST_R1R2 %v9 +#define CONST_R3R4 %v10 +#define CONST_R5 %v11 +#define CONST_R6 %v12 +#define CONST_RU_POLY %v13 +#define CONST_CRC_POLY %v14 + +.data +.align 8 + +/* + * The CRC-32 constant block contains reduction constants to fold and + * process particular chunks of the input data stream in parallel. + * + * For the CRC-32 variants, the constants are precomputed according to + * these defintions: + * + * R1 = x4*128+64 mod P(x) + * R2 = x4*128 mod P(x) + * R3 = x128+64 mod P(x) + * R4 = x128 mod P(x) + * R5 = x96 mod P(x) + * R6 = x64 mod P(x) + * + * Barret reduction constant, u, is defined as floor(x**64 / P(x)). + * + * where P(x) is the polynomial in the normal domain and the P'(x) is the + * polynomial in the reversed (bitreflected) domain. + * + * Note that the constant definitions below are extended in order to compute + * intermediate results with a single VECTOR GALOIS FIELD MULTIPLY instruction. + * The righmost doubleword can be 0 to prevent contribution to the result or + * can be multiplied by 1 to perform an XOR without the need for a separate + * VECTOR EXCLUSIVE OR instruction. + * + * CRC-32 (IEEE 802.3 Ethernet, ...) polynomials: + * + * P(x) = 0x04C11DB7 + * P'(x) = 0xEDB88320 + */ + +.Lconstants_CRC_32_BE: + .quad 0x08833794c, 0x0e6228b11 # R1, R2 + .quad 0x0c5b9cd4c, 0x0e8a45605 # R3, R4 + .quad 0x0f200aa66, 1 << 32 # R5, x32 + .quad 0x0490d678d, 1 # R6, 1 + .quad 0x104d101df, 0 # u + .quad 0x104C11DB7, 0 # P(x) + +.previous + +.text +/* + * The CRC-32 function(s) use these calling conventions: + * + * Parameters: + * + * %r2: Initial CRC value, typically ~0; and final CRC (return) value. + * %r3: Input buffer pointer, performance might be improved if the + * buffer is on a doubleword boundary. + * %r4: Length of the buffer, must be 64 bytes or greater. + * + * Register usage: + * + * %r5: CRC-32 constant pool base pointer. + * V0: Initial CRC value and intermediate constants and results. + * V1..V4: Data for CRC computation. + * V5..V8: Next data chunks that are fetched from the input buffer. + * + * V9..V14: CRC-32 constants. + */ +ENTRY(crc32_be_vgfm_16) + /* Load CRC-32 constants */ + larl %r5,.Lconstants_CRC_32_BE + VLM CONST_R1R2,CONST_CRC_POLY,0,%r5 + + /* Load the initial CRC value into the leftmost word of V0. */ + VZERO %v0 + VLVGF %v0,%r2,0 + + /* Load a 64-byte data chunk and XOR with CRC */ + VLM %v1,%v4,0,%r3 /* 64-bytes into V1..V4 */ + VX %v1,%v0,%v1 /* V1 ^= CRC */ + aghi %r3,64 /* BUF = BUF + 64 */ + aghi %r4,-64 /* LEN = LEN - 64 */ + + /* Check remaining buffer size and jump to proper folding method */ + cghi %r4,64 + jl .Lless_than_64bytes + +.Lfold_64bytes_loop: + /* Load the next 64-byte data chunk into V5 to V8 */ + VLM %v5,%v8,0,%r3 + + /* + * Perform a GF(2) multiplication of the doublewords in V1 with + * the reduction constants in V0. The intermediate result is + * then folded (accumulated) with the next data chunk in V5 and + * stored in V1. Repeat this step for the register contents + * in V2, V3, and V4 respectively. + */ + VGFMAG %v1,CONST_R1R2,%v1,%v5 + VGFMAG %v2,CONST_R1R2,%v2,%v6 + VGFMAG %v3,CONST_R1R2,%v3,%v7 + VGFMAG %v4,CONST_R1R2,%v4,%v8 + + /* Adjust buffer pointer and length for next loop */ + aghi %r3,64 /* BUF = BUF + 64 */ + aghi %r4,-64 /* LEN = LEN - 64 */ + + cghi %r4,64 + jnl .Lfold_64bytes_loop + +.Lless_than_64bytes: + /* Fold V1 to V4 into a single 128-bit value in V1 */ + VGFMAG %v1,CONST_R3R4,%v1,%v2 + VGFMAG %v1,CONST_R3R4,%v1,%v3 + VGFMAG %v1,CONST_R3R4,%v1,%v4 + + /* Check whether to continue with 64-bit folding */ + cghi %r4,16 + jl .Lfinal_fold + +.Lfold_16bytes_loop: + + VL %v2,0,,%r3 /* Load next data chunk */ + VGFMAG %v1,CONST_R3R4,%v1,%v2 /* Fold next data chunk */ + + /* Adjust buffer pointer and size for folding next data chunk */ + aghi %r3,16 + aghi %r4,-16 + + /* Process remaining data chunks */ + cghi %r4,16 + jnl .Lfold_16bytes_loop + +.Lfinal_fold: + /* + * The R5 constant is used to fold a 128-bit value into an 96-bit value + * that is XORed with the next 96-bit input data chunk. To use a single + * VGFMG instruction, multiply the rightmost 64-bit with x^32 (1<<32) to + * form an intermediate 96-bit value (with appended zeros) which is then + * XORed with the intermediate reduction result. + */ + VGFMG %v1,CONST_R5,%v1 + + /* + * Further reduce the remaining 96-bit value to a 64-bit value using a + * single VGFMG, the rightmost doubleword is multiplied with 0x1. The + * intermediate result is then XORed with the product of the leftmost + * doubleword with R6. The result is a 64-bit value and is subject to + * the Barret reduction. + */ + VGFMG %v1,CONST_R6,%v1 + + /* + * The input values to the Barret reduction are the degree-63 polynomial + * in V1 (R(x)), degree-32 generator polynomial, and the reduction + * constant u. The Barret reduction result is the CRC value of R(x) mod + * P(x). + * + * The Barret reduction algorithm is defined as: + * + * 1. T1(x) = floor( R(x) / x^32 ) GF2MUL u + * 2. T2(x) = floor( T1(x) / x^32 ) GF2MUL P(x) + * 3. C(x) = R(x) XOR T2(x) mod x^32 + * + * Note: To compensate the division by x^32, use the vector unpack + * instruction to move the leftmost word into the leftmost doubleword + * of the vector register. The rightmost doubleword is multiplied + * with zero to not contribute to the intermedate results. + */ + + /* T1(x) = floor( R(x) / x^32 ) GF2MUL u */ + VUPLLF %v2,%v1 + VGFMG %v2,CONST_RU_POLY,%v2 + + /* + * Compute the GF(2) product of the CRC polynomial in VO with T1(x) in + * V2 and XOR the intermediate result, T2(x), with the value in V1. + * The final result is in the rightmost word of V2. + */ + VUPLLF %v2,%v2 + VGFMAG %v2,CONST_CRC_POLY,%v2,%v1 + +.Ldone: + VLGVF %r2,%v2,3 + br %r14 + +.previous diff --git a/arch/s390/crypto/crc32le-vx.S b/arch/s390/crypto/crc32le-vx.S new file mode 100644 index 000000000000..17f2504c2633 --- /dev/null +++ b/arch/s390/crypto/crc32le-vx.S @@ -0,0 +1,268 @@ +/* + * Hardware-accelerated CRC-32 variants for Linux on z Systems + * + * Use the z/Architecture Vector Extension Facility to accelerate the + * computing of bitreflected CRC-32 checksums for IEEE 802.3 Ethernet + * and Castagnoli. + * + * This CRC-32 implementation algorithm is bitreflected and processes + * the least-significant bit first (Little-Endian). + * + * Copyright IBM Corp. 2015 + * Author(s): Hendrik Brueckner <brueckner@linux.vnet.ibm.com> + */ + +#include <linux/linkage.h> +#include <asm/vx-insn.h> + +/* Vector register range containing CRC-32 constants */ +#define CONST_PERM_LE2BE %v9 +#define CONST_R2R1 %v10 +#define CONST_R4R3 %v11 +#define CONST_R5 %v12 +#define CONST_RU_POLY %v13 +#define CONST_CRC_POLY %v14 + +.data +.align 8 + +/* + * The CRC-32 constant block contains reduction constants to fold and + * process particular chunks of the input data stream in parallel. + * + * For the CRC-32 variants, the constants are precomputed according to + * these definitions: + * + * R1 = [(x4*128+32 mod P'(x) << 32)]' << 1 + * R2 = [(x4*128-32 mod P'(x) << 32)]' << 1 + * R3 = [(x128+32 mod P'(x) << 32)]' << 1 + * R4 = [(x128-32 mod P'(x) << 32)]' << 1 + * R5 = [(x64 mod P'(x) << 32)]' << 1 + * R6 = [(x32 mod P'(x) << 32)]' << 1 + * + * The bitreflected Barret reduction constant, u', is defined as + * the bit reversal of floor(x**64 / P(x)). + * + * where P(x) is the polynomial in the normal domain and the P'(x) is the + * polynomial in the reversed (bitreflected) domain. + * + * CRC-32 (IEEE 802.3 Ethernet, ...) polynomials: + * + * P(x) = 0x04C11DB7 + * P'(x) = 0xEDB88320 + * + * CRC-32C (Castagnoli) polynomials: + * + * P(x) = 0x1EDC6F41 + * P'(x) = 0x82F63B78 + */ + +.Lconstants_CRC_32_LE: + .octa 0x0F0E0D0C0B0A09080706050403020100 # BE->LE mask + .quad 0x1c6e41596, 0x154442bd4 # R2, R1 + .quad 0x0ccaa009e, 0x1751997d0 # R4, R3 + .octa 0x163cd6124 # R5 + .octa 0x1F7011641 # u' + .octa 0x1DB710641 # P'(x) << 1 + +.Lconstants_CRC_32C_LE: + .octa 0x0F0E0D0C0B0A09080706050403020100 # BE->LE mask + .quad 0x09e4addf8, 0x740eef02 # R2, R1 + .quad 0x14cd00bd6, 0xf20c0dfe # R4, R3 + .octa 0x0dd45aab8 # R5 + .octa 0x0dea713f1 # u' + .octa 0x105ec76f0 # P'(x) << 1 + +.previous + + +.text + +/* + * The CRC-32 functions use these calling conventions: + * + * Parameters: + * + * %r2: Initial CRC value, typically ~0; and final CRC (return) value. + * %r3: Input buffer pointer, performance might be improved if the + * buffer is on a doubleword boundary. + * %r4: Length of the buffer, must be 64 bytes or greater. + * + * Register usage: + * + * %r5: CRC-32 constant pool base pointer. + * V0: Initial CRC value and intermediate constants and results. + * V1..V4: Data for CRC computation. + * V5..V8: Next data chunks that are fetched from the input buffer. + * V9: Constant for BE->LE conversion and shift operations + * + * V10..V14: CRC-32 constants. + */ + +ENTRY(crc32_le_vgfm_16) + larl %r5,.Lconstants_CRC_32_LE + j crc32_le_vgfm_generic + +ENTRY(crc32c_le_vgfm_16) + larl %r5,.Lconstants_CRC_32C_LE + j crc32_le_vgfm_generic + + +crc32_le_vgfm_generic: + /* Load CRC-32 constants */ + VLM CONST_PERM_LE2BE,CONST_CRC_POLY,0,%r5 + + /* + * Load the initial CRC value. + * + * The CRC value is loaded into the rightmost word of the + * vector register and is later XORed with the LSB portion + * of the loaded input data. + */ + VZERO %v0 /* Clear V0 */ + VLVGF %v0,%r2,3 /* Load CRC into rightmost word */ + + /* Load a 64-byte data chunk and XOR with CRC */ + VLM %v1,%v4,0,%r3 /* 64-bytes into V1..V4 */ + VPERM %v1,%v1,%v1,CONST_PERM_LE2BE + VPERM %v2,%v2,%v2,CONST_PERM_LE2BE + VPERM %v3,%v3,%v3,CONST_PERM_LE2BE + VPERM %v4,%v4,%v4,CONST_PERM_LE2BE + + VX %v1,%v0,%v1 /* V1 ^= CRC */ + aghi %r3,64 /* BUF = BUF + 64 */ + aghi %r4,-64 /* LEN = LEN - 64 */ + + cghi %r4,64 + jl .Lless_than_64bytes + +.Lfold_64bytes_loop: + /* Load the next 64-byte data chunk into V5 to V8 */ + VLM %v5,%v8,0,%r3 + VPERM %v5,%v5,%v5,CONST_PERM_LE2BE + VPERM %v6,%v6,%v6,CONST_PERM_LE2BE + VPERM %v7,%v7,%v7,CONST_PERM_LE2BE + VPERM %v8,%v8,%v8,CONST_PERM_LE2BE + + /* + * Perform a GF(2) multiplication of the doublewords in V1 with + * the R1 and R2 reduction constants in V0. The intermediate result + * is then folded (accumulated) with the next data chunk in V5 and + * stored in V1. Repeat this step for the register contents + * in V2, V3, and V4 respectively. + */ + VGFMAG %v1,CONST_R2R1,%v1,%v5 + VGFMAG %v2,CONST_R2R1,%v2,%v6 + VGFMAG %v3,CONST_R2R1,%v3,%v7 + VGFMAG %v4,CONST_R2R1,%v4,%v8 + + aghi %r3,64 /* BUF = BUF + 64 */ + aghi %r4,-64 /* LEN = LEN - 64 */ + + cghi %r4,64 + jnl .Lfold_64bytes_loop + +.Lless_than_64bytes: + /* + * Fold V1 to V4 into a single 128-bit value in V1. Multiply V1 with R3 + * and R4 and accumulating the next 128-bit chunk until a single 128-bit + * value remains. + */ + VGFMAG %v1,CONST_R4R3,%v1,%v2 + VGFMAG %v1,CONST_R4R3,%v1,%v3 + VGFMAG %v1,CONST_R4R3,%v1,%v4 + + cghi %r4,16 + jl .Lfinal_fold + +.Lfold_16bytes_loop: + + VL %v2,0,,%r3 /* Load next data chunk */ + VPERM %v2,%v2,%v2,CONST_PERM_LE2BE + VGFMAG %v1,CONST_R4R3,%v1,%v2 /* Fold next data chunk */ + + aghi %r3,16 + aghi %r4,-16 + + cghi %r4,16 + jnl .Lfold_16bytes_loop + +.Lfinal_fold: + /* + * Set up a vector register for byte shifts. The shift value must + * be loaded in bits 1-4 in byte element 7 of a vector register. + * Shift by 8 bytes: 0x40 + * Shift by 4 bytes: 0x20 + */ + VLEIB %v9,0x40,7 + + /* + * Prepare V0 for the next GF(2) multiplication: shift V0 by 8 bytes + * to move R4 into the rightmost doubleword and set the leftmost + * doubleword to 0x1. + */ + VSRLB %v0,CONST_R4R3,%v9 + VLEIG %v0,1,0 + + /* + * Compute GF(2) product of V1 and V0. The rightmost doubleword + * of V1 is multiplied with R4. The leftmost doubleword of V1 is + * multiplied by 0x1 and is then XORed with rightmost product. + * Implicitly, the intermediate leftmost product becomes padded + */ + VGFMG %v1,%v0,%v1 + + /* + * Now do the final 32-bit fold by multiplying the rightmost word + * in V1 with R5 and XOR the result with the remaining bits in V1. + * + * To achieve this by a single VGFMAG, right shift V1 by a word + * and store the result in V2 which is then accumulated. Use the + * vector unpack instruction to load the rightmost half of the + * doubleword into the rightmost doubleword element of V1; the other + * half is loaded in the leftmost doubleword. + * The vector register with CONST_R5 contains the R5 constant in the + * rightmost doubleword and the leftmost doubleword is zero to ignore + * the leftmost product of V1. + */ + VLEIB %v9,0x20,7 /* Shift by words */ + VSRLB %v2,%v1,%v9 /* Store remaining bits in V2 */ + VUPLLF %v1,%v1 /* Split rightmost doubleword */ + VGFMAG %v1,CONST_R5,%v1,%v2 /* V1 = (V1 * R5) XOR V2 */ + + /* + * Apply a Barret reduction to compute the final 32-bit CRC value. + * + * The input values to the Barret reduction are the degree-63 polynomial + * in V1 (R(x)), degree-32 generator polynomial, and the reduction + * constant u. The Barret reduction result is the CRC value of R(x) mod + * P(x). + * + * The Barret reduction algorithm is defined as: + * + * 1. T1(x) = floor( R(x) / x^32 ) GF2MUL u + * 2. T2(x) = floor( T1(x) / x^32 ) GF2MUL P(x) + * 3. C(x) = R(x) XOR T2(x) mod x^32 + * + * Note: The leftmost doubleword of vector register containing + * CONST_RU_POLY is zero and, thus, the intermediate GF(2) product + * is zero and does not contribute to the final result. + */ + + /* T1(x) = floor( R(x) / x^32 ) GF2MUL u */ + VUPLLF %v2,%v1 + VGFMG %v2,CONST_RU_POLY,%v2 + + /* + * Compute the GF(2) product of the CRC polynomial with T1(x) in + * V2 and XOR the intermediate result, T2(x), with the value in V1. + * The final result is stored in word element 2 of V2. + */ + VUPLLF %v2,%v2 + VGFMAG %v2,CONST_CRC_POLY,%v2,%v1 + +.Ldone: + VLGVF %r2,%v2,2 + br %r14 + +.previous diff --git a/arch/s390/crypto/crypt_s390.h b/arch/s390/crypto/crypt_s390.h deleted file mode 100644 index d9c4c313fbc6..000000000000 --- a/arch/s390/crypto/crypt_s390.h +++ /dev/null @@ -1,493 +0,0 @@ -/* - * Cryptographic API. - * - * Support for s390 cryptographic instructions. - * - * Copyright IBM Corp. 2003, 2015 - * Author(s): Thomas Spatzier - * Jan Glauber (jan.glauber@de.ibm.com) - * Harald Freudenberger (freude@de.ibm.com) - * - * This program is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License as published by the Free - * Software Foundation; either version 2 of the License, or (at your option) - * any later version. - * - */ -#ifndef _CRYPTO_ARCH_S390_CRYPT_S390_H -#define _CRYPTO_ARCH_S390_CRYPT_S390_H - -#include <asm/errno.h> -#include <asm/facility.h> - -#define CRYPT_S390_OP_MASK 0xFF00 -#define CRYPT_S390_FUNC_MASK 0x00FF - -#define CRYPT_S390_PRIORITY 300 -#define CRYPT_S390_COMPOSITE_PRIORITY 400 - -#define CRYPT_S390_MSA 0x1 -#define CRYPT_S390_MSA3 0x2 -#define CRYPT_S390_MSA4 0x4 -#define CRYPT_S390_MSA5 0x8 - -/* s390 cryptographic operations */ -enum crypt_s390_operations { - CRYPT_S390_KM = 0x0100, - CRYPT_S390_KMC = 0x0200, - CRYPT_S390_KIMD = 0x0300, - CRYPT_S390_KLMD = 0x0400, - CRYPT_S390_KMAC = 0x0500, - CRYPT_S390_KMCTR = 0x0600, - CRYPT_S390_PPNO = 0x0700 -}; - -/* - * function codes for KM (CIPHER MESSAGE) instruction - * 0x80 is the decipher modifier bit - */ -enum crypt_s390_km_func { - KM_QUERY = CRYPT_S390_KM | 0x0, - KM_DEA_ENCRYPT = CRYPT_S390_KM | 0x1, - KM_DEA_DECRYPT = CRYPT_S390_KM | 0x1 | 0x80, - KM_TDEA_128_ENCRYPT = CRYPT_S390_KM | 0x2, - KM_TDEA_128_DECRYPT = CRYPT_S390_KM | 0x2 | 0x80, - KM_TDEA_192_ENCRYPT = CRYPT_S390_KM | 0x3, - KM_TDEA_192_DECRYPT = CRYPT_S390_KM | 0x3 | 0x80, - KM_AES_128_ENCRYPT = CRYPT_S390_KM | 0x12, - KM_AES_128_DECRYPT = CRYPT_S390_KM | 0x12 | 0x80, - KM_AES_192_ENCRYPT = CRYPT_S390_KM | 0x13, - KM_AES_192_DECRYPT = CRYPT_S390_KM | 0x13 | 0x80, - KM_AES_256_ENCRYPT = CRYPT_S390_KM | 0x14, - KM_AES_256_DECRYPT = CRYPT_S390_KM | 0x14 | 0x80, - KM_XTS_128_ENCRYPT = CRYPT_S390_KM | 0x32, - KM_XTS_128_DECRYPT = CRYPT_S390_KM | 0x32 | 0x80, - KM_XTS_256_ENCRYPT = CRYPT_S390_KM | 0x34, - KM_XTS_256_DECRYPT = CRYPT_S390_KM | 0x34 | 0x80, -}; - -/* - * function codes for KMC (CIPHER MESSAGE WITH CHAINING) - * instruction - */ -enum crypt_s390_kmc_func { - KMC_QUERY = CRYPT_S390_KMC | 0x0, - KMC_DEA_ENCRYPT = CRYPT_S390_KMC | 0x1, - KMC_DEA_DECRYPT = CRYPT_S390_KMC | 0x1 | 0x80, - KMC_TDEA_128_ENCRYPT = CRYPT_S390_KMC | 0x2, - KMC_TDEA_128_DECRYPT = CRYPT_S390_KMC | 0x2 | 0x80, - KMC_TDEA_192_ENCRYPT = CRYPT_S390_KMC | 0x3, - KMC_TDEA_192_DECRYPT = CRYPT_S390_KMC | 0x3 | 0x80, - KMC_AES_128_ENCRYPT = CRYPT_S390_KMC | 0x12, - KMC_AES_128_DECRYPT = CRYPT_S390_KMC | 0x12 | 0x80, - KMC_AES_192_ENCRYPT = CRYPT_S390_KMC | 0x13, - KMC_AES_192_DECRYPT = CRYPT_S390_KMC | 0x13 | 0x80, - KMC_AES_256_ENCRYPT = CRYPT_S390_KMC | 0x14, - KMC_AES_256_DECRYPT = CRYPT_S390_KMC | 0x14 | 0x80, - KMC_PRNG = CRYPT_S390_KMC | 0x43, -}; - -/* - * function codes for KMCTR (CIPHER MESSAGE WITH COUNTER) - * instruction - */ -enum crypt_s390_kmctr_func { - KMCTR_QUERY = CRYPT_S390_KMCTR | 0x0, - KMCTR_DEA_ENCRYPT = CRYPT_S390_KMCTR | 0x1, - KMCTR_DEA_DECRYPT = CRYPT_S390_KMCTR | 0x1 | 0x80, - KMCTR_TDEA_128_ENCRYPT = CRYPT_S390_KMCTR | 0x2, - KMCTR_TDEA_128_DECRYPT = CRYPT_S390_KMCTR | 0x2 | 0x80, - KMCTR_TDEA_192_ENCRYPT = CRYPT_S390_KMCTR | 0x3, - KMCTR_TDEA_192_DECRYPT = CRYPT_S390_KMCTR | 0x3 | 0x80, - KMCTR_AES_128_ENCRYPT = CRYPT_S390_KMCTR | 0x12, - KMCTR_AES_128_DECRYPT = CRYPT_S390_KMCTR | 0x12 | 0x80, - KMCTR_AES_192_ENCRYPT = CRYPT_S390_KMCTR | 0x13, - KMCTR_AES_192_DECRYPT = CRYPT_S390_KMCTR | 0x13 | 0x80, - KMCTR_AES_256_ENCRYPT = CRYPT_S390_KMCTR | 0x14, - KMCTR_AES_256_DECRYPT = CRYPT_S390_KMCTR | 0x14 | 0x80, -}; - -/* - * function codes for KIMD (COMPUTE INTERMEDIATE MESSAGE DIGEST) - * instruction - */ -enum crypt_s390_kimd_func { - KIMD_QUERY = CRYPT_S390_KIMD | 0, - KIMD_SHA_1 = CRYPT_S390_KIMD | 1, - KIMD_SHA_256 = CRYPT_S390_KIMD | 2, - KIMD_SHA_512 = CRYPT_S390_KIMD | 3, - KIMD_GHASH = CRYPT_S390_KIMD | 65, -}; - -/* - * function codes for KLMD (COMPUTE LAST MESSAGE DIGEST) - * instruction - */ -enum crypt_s390_klmd_func { - KLMD_QUERY = CRYPT_S390_KLMD | 0, - KLMD_SHA_1 = CRYPT_S390_KLMD | 1, - KLMD_SHA_256 = CRYPT_S390_KLMD | 2, - KLMD_SHA_512 = CRYPT_S390_KLMD | 3, -}; - -/* - * function codes for KMAC (COMPUTE MESSAGE AUTHENTICATION CODE) - * instruction - */ -enum crypt_s390_kmac_func { - KMAC_QUERY = CRYPT_S390_KMAC | 0, - KMAC_DEA = CRYPT_S390_KMAC | 1, - KMAC_TDEA_128 = CRYPT_S390_KMAC | 2, - KMAC_TDEA_192 = CRYPT_S390_KMAC | 3 -}; - -/* - * function codes for PPNO (PERFORM PSEUDORANDOM NUMBER - * OPERATION) instruction - */ -enum crypt_s390_ppno_func { - PPNO_QUERY = CRYPT_S390_PPNO | 0, - PPNO_SHA512_DRNG_GEN = CRYPT_S390_PPNO | 3, - PPNO_SHA512_DRNG_SEED = CRYPT_S390_PPNO | 0x83 -}; - -/** - * crypt_s390_km: - * @func: the function code passed to KM; see crypt_s390_km_func - * @param: address of parameter block; see POP for details on each func - * @dest: address of destination memory area - * @src: address of source memory area - * @src_len: length of src operand in bytes - * - * Executes the KM (CIPHER MESSAGE) operation of the CPU. - * - * Returns -1 for failure, 0 for the query func, number of processed - * bytes for encryption/decryption funcs - */ -static inline int crypt_s390_km(long func, void *param, - u8 *dest, const u8 *src, long src_len) -{ - register long __func asm("0") = func & CRYPT_S390_FUNC_MASK; - register void *__param asm("1") = param; - register const u8 *__src asm("2") = src; - register long __src_len asm("3") = src_len; - register u8 *__dest asm("4") = dest; - int ret; - - asm volatile( - "0: .insn rre,0xb92e0000,%3,%1\n" /* KM opcode */ - "1: brc 1,0b\n" /* handle partial completion */ - " la %0,0\n" - "2:\n" - EX_TABLE(0b, 2b) EX_TABLE(1b, 2b) - : "=d" (ret), "+a" (__src), "+d" (__src_len), "+a" (__dest) - : "d" (__func), "a" (__param), "0" (-1) : "cc", "memory"); - if (ret < 0) - return ret; - return (func & CRYPT_S390_FUNC_MASK) ? src_len - __src_len : __src_len; -} - -/** - * crypt_s390_kmc: - * @func: the function code passed to KM; see crypt_s390_kmc_func - * @param: address of parameter block; see POP for details on each func - * @dest: address of destination memory area - * @src: address of source memory area - * @src_len: length of src operand in bytes - * - * Executes the KMC (CIPHER MESSAGE WITH CHAINING) operation of the CPU. - * - * Returns -1 for failure, 0 for the query func, number of processed - * bytes for encryption/decryption funcs - */ -static inline int crypt_s390_kmc(long func, void *param, - u8 *dest, const u8 *src, long src_len) -{ - register long __func asm("0") = func & CRYPT_S390_FUNC_MASK; - register void *__param asm("1") = param; - register const u8 *__src asm("2") = src; - register long __src_len asm("3") = src_len; - register u8 *__dest asm("4") = dest; - int ret; - - asm volatile( - "0: .insn rre,0xb92f0000,%3,%1\n" /* KMC opcode */ - "1: brc 1,0b\n" /* handle partial completion */ - " la %0,0\n" - "2:\n" - EX_TABLE(0b, 2b) EX_TABLE(1b, 2b) - : "=d" (ret), "+a" (__src), "+d" (__src_len), "+a" (__dest) - : "d" (__func), "a" (__param), "0" (-1) : "cc", "memory"); - if (ret < 0) - return ret; - return (func & CRYPT_S390_FUNC_MASK) ? src_len - __src_len : __src_len; -} - -/** - * crypt_s390_kimd: - * @func: the function code passed to KM; see crypt_s390_kimd_func - * @param: address of parameter block; see POP for details on each func - * @src: address of source memory area - * @src_len: length of src operand in bytes - * - * Executes the KIMD (COMPUTE INTERMEDIATE MESSAGE DIGEST) operation - * of the CPU. - * - * Returns -1 for failure, 0 for the query func, number of processed - * bytes for digest funcs - */ -static inline int crypt_s390_kimd(long func, void *param, - const u8 *src, long src_len) -{ - register long __func asm("0") = func & CRYPT_S390_FUNC_MASK; - register void *__param asm("1") = param; - register const u8 *__src asm("2") = src; - register long __src_len asm("3") = src_len; - int ret; - - asm volatile( - "0: .insn rre,0xb93e0000,%1,%1\n" /* KIMD opcode */ - "1: brc 1,0b\n" /* handle partial completion */ - " la %0,0\n" - "2:\n" - EX_TABLE(0b, 2b) EX_TABLE(1b, 2b) - : "=d" (ret), "+a" (__src), "+d" (__src_len) - : "d" (__func), "a" (__param), "0" (-1) : "cc", "memory"); - if (ret < 0) - return ret; - return (func & CRYPT_S390_FUNC_MASK) ? src_len - __src_len : __src_len; -} - -/** - * crypt_s390_klmd: - * @func: the function code passed to KM; see crypt_s390_klmd_func - * @param: address of parameter block; see POP for details on each func - * @src: address of source memory area - * @src_len: length of src operand in bytes - * - * Executes the KLMD (COMPUTE LAST MESSAGE DIGEST) operation of the CPU. - * - * Returns -1 for failure, 0 for the query func, number of processed - * bytes for digest funcs - */ -static inline int crypt_s390_klmd(long func, void *param, - const u8 *src, long src_len) -{ - register long __func asm("0") = func & CRYPT_S390_FUNC_MASK; - register void *__param asm("1") = param; - register const u8 *__src asm("2") = src; - register long __src_len asm("3") = src_len; - int ret; - - asm volatile( - "0: .insn rre,0xb93f0000,%1,%1\n" /* KLMD opcode */ - "1: brc 1,0b\n" /* handle partial completion */ - " la %0,0\n" - "2:\n" - EX_TABLE(0b, 2b) EX_TABLE(1b, 2b) - : "=d" (ret), "+a" (__src), "+d" (__src_len) - : "d" (__func), "a" (__param), "0" (-1) : "cc", "memory"); - if (ret < 0) - return ret; - return (func & CRYPT_S390_FUNC_MASK) ? src_len - __src_len : __src_len; -} - -/** - * crypt_s390_kmac: - * @func: the function code passed to KM; see crypt_s390_klmd_func - * @param: address of parameter block; see POP for details on each func - * @src: address of source memory area - * @src_len: length of src operand in bytes - * - * Executes the KMAC (COMPUTE MESSAGE AUTHENTICATION CODE) operation - * of the CPU. - * - * Returns -1 for failure, 0 for the query func, number of processed - * bytes for digest funcs - */ -static inline int crypt_s390_kmac(long func, void *param, - const u8 *src, long src_len) -{ - register long __func asm("0") = func & CRYPT_S390_FUNC_MASK; - register void *__param asm("1") = param; - register const u8 *__src asm("2") = src; - register long __src_len asm("3") = src_len; - int ret; - - asm volatile( - "0: .insn rre,0xb91e0000,%1,%1\n" /* KLAC opcode */ - "1: brc 1,0b\n" /* handle partial completion */ - " la %0,0\n" - "2:\n" - EX_TABLE(0b, 2b) EX_TABLE(1b, 2b) - : "=d" (ret), "+a" (__src), "+d" (__src_len) - : "d" (__func), "a" (__param), "0" (-1) : "cc", "memory"); - if (ret < 0) - return ret; - return (func & CRYPT_S390_FUNC_MASK) ? src_len - __src_len : __src_len; -} - -/** - * crypt_s390_kmctr: - * @func: the function code passed to KMCTR; see crypt_s390_kmctr_func - * @param: address of parameter block; see POP for details on each func - * @dest: address of destination memory area - * @src: address of source memory area - * @src_len: length of src operand in bytes - * @counter: address of counter value - * - * Executes the KMCTR (CIPHER MESSAGE WITH COUNTER) operation of the CPU. - * - * Returns -1 for failure, 0 for the query func, number of processed - * bytes for encryption/decryption funcs - */ -static inline int crypt_s390_kmctr(long func, void *param, u8 *dest, - const u8 *src, long src_len, u8 *counter) -{ - register long __func asm("0") = func & CRYPT_S390_FUNC_MASK; - register void *__param asm("1") = param; - register const u8 *__src asm("2") = src; - register long __src_len asm("3") = src_len; - register u8 *__dest asm("4") = dest; - register u8 *__ctr asm("6") = counter; - int ret = -1; - - asm volatile( - "0: .insn rrf,0xb92d0000,%3,%1,%4,0\n" /* KMCTR opcode */ - "1: brc 1,0b\n" /* handle partial completion */ - " la %0,0\n" - "2:\n" - EX_TABLE(0b, 2b) EX_TABLE(1b, 2b) - : "+d" (ret), "+a" (__src), "+d" (__src_len), "+a" (__dest), - "+a" (__ctr) - : "d" (__func), "a" (__param) : "cc", "memory"); - if (ret < 0) - return ret; - return (func & CRYPT_S390_FUNC_MASK) ? src_len - __src_len : __src_len; -} - -/** - * crypt_s390_ppno: - * @func: the function code passed to PPNO; see crypt_s390_ppno_func - * @param: address of parameter block; see POP for details on each func - * @dest: address of destination memory area - * @dest_len: size of destination memory area in bytes - * @seed: address of seed data - * @seed_len: size of seed data in bytes - * - * Executes the PPNO (PERFORM PSEUDORANDOM NUMBER OPERATION) - * operation of the CPU. - * - * Returns -1 for failure, 0 for the query func, number of random - * bytes stored in dest buffer for generate function - */ -static inline int crypt_s390_ppno(long func, void *param, - u8 *dest, long dest_len, - const u8 *seed, long seed_len) -{ - register long __func asm("0") = func & CRYPT_S390_FUNC_MASK; - register void *__param asm("1") = param; /* param block (240 bytes) */ - register u8 *__dest asm("2") = dest; /* buf for recv random bytes */ - register long __dest_len asm("3") = dest_len; /* requested random bytes */ - register const u8 *__seed asm("4") = seed; /* buf with seed data */ - register long __seed_len asm("5") = seed_len; /* bytes in seed buf */ - int ret = -1; - - asm volatile ( - "0: .insn rre,0xb93c0000,%1,%5\n" /* PPNO opcode */ - "1: brc 1,0b\n" /* handle partial completion */ - " la %0,0\n" - "2:\n" - EX_TABLE(0b, 2b) EX_TABLE(1b, 2b) - : "+d" (ret), "+a"(__dest), "+d"(__dest_len) - : "d"(__func), "a"(__param), "a"(__seed), "d"(__seed_len) - : "cc", "memory"); - if (ret < 0) - return ret; - return (func & CRYPT_S390_FUNC_MASK) ? dest_len - __dest_len : 0; -} - -/** - * crypt_s390_func_available: - * @func: the function code of the specific function; 0 if op in general - * - * Tests if a specific crypto function is implemented on the machine. - * - * Returns 1 if func available; 0 if func or op in general not available - */ -static inline int crypt_s390_func_available(int func, - unsigned int facility_mask) -{ - unsigned char status[16]; - int ret; - - if (facility_mask & CRYPT_S390_MSA && !test_facility(17)) - return 0; - if (facility_mask & CRYPT_S390_MSA3 && !test_facility(76)) - return 0; - if (facility_mask & CRYPT_S390_MSA4 && !test_facility(77)) - return 0; - if (facility_mask & CRYPT_S390_MSA5 && !test_facility(57)) - return 0; - - switch (func & CRYPT_S390_OP_MASK) { - case CRYPT_S390_KM: - ret = crypt_s390_km(KM_QUERY, &status, NULL, NULL, 0); - break; - case CRYPT_S390_KMC: - ret = crypt_s390_kmc(KMC_QUERY, &status, NULL, NULL, 0); - break; - case CRYPT_S390_KIMD: - ret = crypt_s390_kimd(KIMD_QUERY, &status, NULL, 0); - break; - case CRYPT_S390_KLMD: - ret = crypt_s390_klmd(KLMD_QUERY, &status, NULL, 0); - break; - case CRYPT_S390_KMAC: - ret = crypt_s390_kmac(KMAC_QUERY, &status, NULL, 0); - break; - case CRYPT_S390_KMCTR: - ret = crypt_s390_kmctr(KMCTR_QUERY, &status, - NULL, NULL, 0, NULL); - break; - case CRYPT_S390_PPNO: - ret = crypt_s390_ppno(PPNO_QUERY, &status, - NULL, 0, NULL, 0); - break; - default: - return 0; - } - if (ret < 0) - return 0; - func &= CRYPT_S390_FUNC_MASK; - func &= 0x7f; /* mask modifier bit */ - return (status[func >> 3] & (0x80 >> (func & 7))) != 0; -} - -/** - * crypt_s390_pcc: - * @func: the function code passed to KM; see crypt_s390_km_func - * @param: address of parameter block; see POP for details on each func - * - * Executes the PCC (PERFORM CRYPTOGRAPHIC COMPUTATION) operation of the CPU. - * - * Returns -1 for failure, 0 for success. - */ -static inline int crypt_s390_pcc(long func, void *param) -{ - register long __func asm("0") = func & 0x7f; /* encrypt or decrypt */ - register void *__param asm("1") = param; - int ret = -1; - - asm volatile( - "0: .insn rre,0xb92c0000,0,0\n" /* PCC opcode */ - "1: brc 1,0b\n" /* handle partial completion */ - " la %0,0\n" - "2:\n" - EX_TABLE(0b, 2b) EX_TABLE(1b, 2b) - : "+d" (ret) - : "d" (__func), "a" (__param) : "cc", "memory"); - return ret; -} - -#endif /* _CRYPTO_ARCH_S390_CRYPT_S390_H */ diff --git a/arch/s390/crypto/des_s390.c b/arch/s390/crypto/des_s390.c index fba1c10a2dd0..8b83144206eb 100644 --- a/arch/s390/crypto/des_s390.c +++ b/arch/s390/crypto/des_s390.c @@ -20,14 +20,15 @@ #include <linux/crypto.h> #include <crypto/algapi.h> #include <crypto/des.h> - -#include "crypt_s390.h" +#include <asm/cpacf.h> #define DES3_KEY_SIZE (3 * DES_KEY_SIZE) static u8 *ctrblk; static DEFINE_SPINLOCK(ctrblk_lock); +static cpacf_mask_t km_functions, kmc_functions, kmctr_functions; + struct s390_des_ctx { u8 iv[DES_BLOCK_SIZE]; u8 key[DES3_KEY_SIZE]; @@ -37,12 +38,12 @@ static int des_setkey(struct crypto_tfm *tfm, const u8 *key, unsigned int key_len) { struct s390_des_ctx *ctx = crypto_tfm_ctx(tfm); - u32 *flags = &tfm->crt_flags; u32 tmp[DES_EXPKEY_WORDS]; /* check for weak keys */ - if (!des_ekey(tmp, key) && (*flags & CRYPTO_TFM_REQ_WEAK_KEY)) { - *flags |= CRYPTO_TFM_RES_WEAK_KEY; + if (!des_ekey(tmp, key) && + (tfm->crt_flags & CRYPTO_TFM_REQ_WEAK_KEY)) { + tfm->crt_flags |= CRYPTO_TFM_RES_WEAK_KEY; return -EINVAL; } @@ -54,20 +55,21 @@ static void des_encrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in) { struct s390_des_ctx *ctx = crypto_tfm_ctx(tfm); - crypt_s390_km(KM_DEA_ENCRYPT, ctx->key, out, in, DES_BLOCK_SIZE); + cpacf_km(CPACF_KM_DEA, ctx->key, out, in, DES_BLOCK_SIZE); } static void des_decrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in) { struct s390_des_ctx *ctx = crypto_tfm_ctx(tfm); - crypt_s390_km(KM_DEA_DECRYPT, ctx->key, out, in, DES_BLOCK_SIZE); + cpacf_km(CPACF_KM_DEA | CPACF_DECRYPT, + ctx->key, out, in, DES_BLOCK_SIZE); } static struct crypto_alg des_alg = { .cra_name = "des", .cra_driver_name = "des-s390", - .cra_priority = CRYPT_S390_PRIORITY, + .cra_priority = 300, .cra_flags = CRYPTO_ALG_TYPE_CIPHER, .cra_blocksize = DES_BLOCK_SIZE, .cra_ctxsize = sizeof(struct s390_des_ctx), @@ -83,61 +85,46 @@ static struct crypto_alg des_alg = { } }; -static int ecb_desall_crypt(struct blkcipher_desc *desc, long func, - u8 *key, struct blkcipher_walk *walk) +static int ecb_desall_crypt(struct blkcipher_desc *desc, unsigned long fc, + struct blkcipher_walk *walk) { - int ret = blkcipher_walk_virt(desc, walk); - unsigned int nbytes; + struct s390_des_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); + unsigned int nbytes, n; + int ret; - while ((nbytes = walk->nbytes)) { + ret = blkcipher_walk_virt(desc, walk); + while ((nbytes = walk->nbytes) >= DES_BLOCK_SIZE) { /* only use complete blocks */ - unsigned int n = nbytes & ~(DES_BLOCK_SIZE - 1); - u8 *out = walk->dst.virt.addr; - u8 *in = walk->src.virt.addr; - - ret = crypt_s390_km(func, key, out, in, n); - if (ret < 0 || ret != n) - return -EIO; - - nbytes &= DES_BLOCK_SIZE - 1; - ret = blkcipher_walk_done(desc, walk, nbytes); + n = nbytes & ~(DES_BLOCK_SIZE - 1); + cpacf_km(fc, ctx->key, walk->dst.virt.addr, + walk->src.virt.addr, n); + ret = blkcipher_walk_done(desc, walk, nbytes - n); } - return ret; } -static int cbc_desall_crypt(struct blkcipher_desc *desc, long func, +static int cbc_desall_crypt(struct blkcipher_desc *desc, unsigned long fc, struct blkcipher_walk *walk) { struct s390_des_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); - int ret = blkcipher_walk_virt(desc, walk); - unsigned int nbytes = walk->nbytes; + unsigned int nbytes, n; + int ret; struct { u8 iv[DES_BLOCK_SIZE]; u8 key[DES3_KEY_SIZE]; } param; - if (!nbytes) - goto out; - + ret = blkcipher_walk_virt(desc, walk); memcpy(param.iv, walk->iv, DES_BLOCK_SIZE); memcpy(param.key, ctx->key, DES3_KEY_SIZE); - do { + while ((nbytes = walk->nbytes) >= DES_BLOCK_SIZE) { /* only use complete blocks */ - unsigned int n = nbytes & ~(DES_BLOCK_SIZE - 1); - u8 *out = walk->dst.virt.addr; - u8 *in = walk->src.virt.addr; - - ret = crypt_s390_kmc(func, ¶m, out, in, n); - if (ret < 0 || ret != n) - return -EIO; - - nbytes &= DES_BLOCK_SIZE - 1; - ret = blkcipher_walk_done(desc, walk, nbytes); - } while ((nbytes = walk->nbytes)); + n = nbytes & ~(DES_BLOCK_SIZE - 1); + cpacf_kmc(fc, ¶m, walk->dst.virt.addr, + walk->src.virt.addr, n); + ret = blkcipher_walk_done(desc, walk, nbytes - n); + } memcpy(walk->iv, param.iv, DES_BLOCK_SIZE); - -out: return ret; } @@ -145,28 +132,26 @@ static int ecb_des_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, struct scatterlist *src, unsigned int nbytes) { - struct s390_des_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); struct blkcipher_walk walk; blkcipher_walk_init(&walk, dst, src, nbytes); - return ecb_desall_crypt(desc, KM_DEA_ENCRYPT, ctx->key, &walk); + return ecb_desall_crypt(desc, CPACF_KM_DEA, &walk); } static int ecb_des_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, struct scatterlist *src, unsigned int nbytes) { - struct s390_des_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); struct blkcipher_walk walk; blkcipher_walk_init(&walk, dst, src, nbytes); - return ecb_desall_crypt(desc, KM_DEA_DECRYPT, ctx->key, &walk); + return ecb_desall_crypt(desc, CPACF_KM_DEA | CPACF_DECRYPT, &walk); } static struct crypto_alg ecb_des_alg = { .cra_name = "ecb(des)", .cra_driver_name = "ecb-des-s390", - .cra_priority = CRYPT_S390_COMPOSITE_PRIORITY, + .cra_priority = 400, /* combo: des + ecb */ .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, .cra_blocksize = DES_BLOCK_SIZE, .cra_ctxsize = sizeof(struct s390_des_ctx), @@ -190,7 +175,7 @@ static int cbc_des_encrypt(struct blkcipher_desc *desc, struct blkcipher_walk walk; blkcipher_walk_init(&walk, dst, src, nbytes); - return cbc_desall_crypt(desc, KMC_DEA_ENCRYPT, &walk); + return cbc_desall_crypt(desc, CPACF_KMC_DEA, &walk); } static int cbc_des_decrypt(struct blkcipher_desc *desc, @@ -200,13 +185,13 @@ static int cbc_des_decrypt(struct blkcipher_desc *desc, struct blkcipher_walk walk; blkcipher_walk_init(&walk, dst, src, nbytes); - return cbc_desall_crypt(desc, KMC_DEA_DECRYPT, &walk); + return cbc_desall_crypt(desc, CPACF_KMC_DEA | CPACF_DECRYPT, &walk); } static struct crypto_alg cbc_des_alg = { .cra_name = "cbc(des)", .cra_driver_name = "cbc-des-s390", - .cra_priority = CRYPT_S390_COMPOSITE_PRIORITY, + .cra_priority = 400, /* combo: des + cbc */ .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, .cra_blocksize = DES_BLOCK_SIZE, .cra_ctxsize = sizeof(struct s390_des_ctx), @@ -241,13 +226,12 @@ static int des3_setkey(struct crypto_tfm *tfm, const u8 *key, unsigned int key_len) { struct s390_des_ctx *ctx = crypto_tfm_ctx(tfm); - u32 *flags = &tfm->crt_flags; if (!(crypto_memneq(key, &key[DES_KEY_SIZE], DES_KEY_SIZE) && crypto_memneq(&key[DES_KEY_SIZE], &key[DES_KEY_SIZE * 2], DES_KEY_SIZE)) && - (*flags & CRYPTO_TFM_REQ_WEAK_KEY)) { - *flags |= CRYPTO_TFM_RES_WEAK_KEY; + (tfm->crt_flags & CRYPTO_TFM_REQ_WEAK_KEY)) { + tfm->crt_flags |= CRYPTO_TFM_RES_WEAK_KEY; return -EINVAL; } memcpy(ctx->key, key, key_len); @@ -258,20 +242,21 @@ static void des3_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src) { struct s390_des_ctx *ctx = crypto_tfm_ctx(tfm); - crypt_s390_km(KM_TDEA_192_ENCRYPT, ctx->key, dst, src, DES_BLOCK_SIZE); + cpacf_km(CPACF_KM_TDEA_192, ctx->key, dst, src, DES_BLOCK_SIZE); } static void des3_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src) { struct s390_des_ctx *ctx = crypto_tfm_ctx(tfm); - crypt_s390_km(KM_TDEA_192_DECRYPT, ctx->key, dst, src, DES_BLOCK_SIZE); + cpacf_km(CPACF_KM_TDEA_192 | CPACF_DECRYPT, + ctx->key, dst, src, DES_BLOCK_SIZE); } static struct crypto_alg des3_alg = { .cra_name = "des3_ede", .cra_driver_name = "des3_ede-s390", - .cra_priority = CRYPT_S390_PRIORITY, + .cra_priority = 300, .cra_flags = CRYPTO_ALG_TYPE_CIPHER, .cra_blocksize = DES_BLOCK_SIZE, .cra_ctxsize = sizeof(struct s390_des_ctx), @@ -291,28 +276,27 @@ static int ecb_des3_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, struct scatterlist *src, unsigned int nbytes) { - struct s390_des_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); struct blkcipher_walk walk; blkcipher_walk_init(&walk, dst, src, nbytes); - return ecb_desall_crypt(desc, KM_TDEA_192_ENCRYPT, ctx->key, &walk); + return ecb_desall_crypt(desc, CPACF_KM_TDEA_192, &walk); } static int ecb_des3_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, struct scatterlist *src, unsigned int nbytes) { - struct s390_des_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); struct blkcipher_walk walk; blkcipher_walk_init(&walk, dst, src, nbytes); - return ecb_desall_crypt(desc, KM_TDEA_192_DECRYPT, ctx->key, &walk); + return ecb_desall_crypt(desc, CPACF_KM_TDEA_192 | CPACF_DECRYPT, + &walk); } static struct crypto_alg ecb_des3_alg = { .cra_name = "ecb(des3_ede)", .cra_driver_name = "ecb-des3_ede-s390", - .cra_priority = CRYPT_S390_COMPOSITE_PRIORITY, + .cra_priority = 400, /* combo: des3 + ecb */ .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, .cra_blocksize = DES_BLOCK_SIZE, .cra_ctxsize = sizeof(struct s390_des_ctx), @@ -336,7 +320,7 @@ static int cbc_des3_encrypt(struct blkcipher_desc *desc, struct blkcipher_walk walk; blkcipher_walk_init(&walk, dst, src, nbytes); - return cbc_desall_crypt(desc, KMC_TDEA_192_ENCRYPT, &walk); + return cbc_desall_crypt(desc, CPACF_KMC_TDEA_192, &walk); } static int cbc_des3_decrypt(struct blkcipher_desc *desc, @@ -346,13 +330,14 @@ static int cbc_des3_decrypt(struct blkcipher_desc *desc, struct blkcipher_walk walk; blkcipher_walk_init(&walk, dst, src, nbytes); - return cbc_desall_crypt(desc, KMC_TDEA_192_DECRYPT, &walk); + return cbc_desall_crypt(desc, CPACF_KMC_TDEA_192 | CPACF_DECRYPT, + &walk); } static struct crypto_alg cbc_des3_alg = { .cra_name = "cbc(des3_ede)", .cra_driver_name = "cbc-des3_ede-s390", - .cra_priority = CRYPT_S390_COMPOSITE_PRIORITY, + .cra_priority = 400, /* combo: des3 + cbc */ .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, .cra_blocksize = DES_BLOCK_SIZE, .cra_ctxsize = sizeof(struct s390_des_ctx), @@ -370,82 +355,54 @@ static struct crypto_alg cbc_des3_alg = { } }; -static unsigned int __ctrblk_init(u8 *ctrptr, unsigned int nbytes) +static unsigned int __ctrblk_init(u8 *ctrptr, u8 *iv, unsigned int nbytes) { unsigned int i, n; /* align to block size, max. PAGE_SIZE */ n = (nbytes > PAGE_SIZE) ? PAGE_SIZE : nbytes & ~(DES_BLOCK_SIZE - 1); - for (i = DES_BLOCK_SIZE; i < n; i += DES_BLOCK_SIZE) { - memcpy(ctrptr + i, ctrptr + i - DES_BLOCK_SIZE, DES_BLOCK_SIZE); - crypto_inc(ctrptr + i, DES_BLOCK_SIZE); + memcpy(ctrptr, iv, DES_BLOCK_SIZE); + for (i = (n / DES_BLOCK_SIZE) - 1; i > 0; i--) { + memcpy(ctrptr + DES_BLOCK_SIZE, ctrptr, DES_BLOCK_SIZE); + crypto_inc(ctrptr + DES_BLOCK_SIZE, DES_BLOCK_SIZE); + ctrptr += DES_BLOCK_SIZE; } return n; } -static int ctr_desall_crypt(struct blkcipher_desc *desc, long func, - struct s390_des_ctx *ctx, +static int ctr_desall_crypt(struct blkcipher_desc *desc, unsigned long fc, struct blkcipher_walk *walk) { - int ret = blkcipher_walk_virt_block(desc, walk, DES_BLOCK_SIZE); + struct s390_des_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); + u8 buf[DES_BLOCK_SIZE], *ctrptr; unsigned int n, nbytes; - u8 buf[DES_BLOCK_SIZE], ctrbuf[DES_BLOCK_SIZE]; - u8 *out, *in, *ctrptr = ctrbuf; + int ret, locked; - if (!walk->nbytes) - return ret; + locked = spin_trylock(&ctrblk_lock); - if (spin_trylock(&ctrblk_lock)) - ctrptr = ctrblk; - - memcpy(ctrptr, walk->iv, DES_BLOCK_SIZE); + ret = blkcipher_walk_virt_block(desc, walk, DES_BLOCK_SIZE); while ((nbytes = walk->nbytes) >= DES_BLOCK_SIZE) { - out = walk->dst.virt.addr; - in = walk->src.virt.addr; - while (nbytes >= DES_BLOCK_SIZE) { - if (ctrptr == ctrblk) - n = __ctrblk_init(ctrptr, nbytes); - else - n = DES_BLOCK_SIZE; - ret = crypt_s390_kmctr(func, ctx->key, out, in, - n, ctrptr); - if (ret < 0 || ret != n) { - if (ctrptr == ctrblk) - spin_unlock(&ctrblk_lock); - return -EIO; - } - if (n > DES_BLOCK_SIZE) - memcpy(ctrptr, ctrptr + n - DES_BLOCK_SIZE, - DES_BLOCK_SIZE); - crypto_inc(ctrptr, DES_BLOCK_SIZE); - out += n; - in += n; - nbytes -= n; - } - ret = blkcipher_walk_done(desc, walk, nbytes); + n = DES_BLOCK_SIZE; + if (nbytes >= 2*DES_BLOCK_SIZE && locked) + n = __ctrblk_init(ctrblk, walk->iv, nbytes); + ctrptr = (n > DES_BLOCK_SIZE) ? ctrblk : walk->iv; + cpacf_kmctr(fc, ctx->key, walk->dst.virt.addr, + walk->src.virt.addr, n, ctrptr); + if (ctrptr == ctrblk) + memcpy(walk->iv, ctrptr + n - DES_BLOCK_SIZE, + DES_BLOCK_SIZE); + crypto_inc(walk->iv, DES_BLOCK_SIZE); + ret = blkcipher_walk_done(desc, walk, nbytes - n); } - if (ctrptr == ctrblk) { - if (nbytes) - memcpy(ctrbuf, ctrptr, DES_BLOCK_SIZE); - else - memcpy(walk->iv, ctrptr, DES_BLOCK_SIZE); + if (locked) spin_unlock(&ctrblk_lock); - } else { - if (!nbytes) - memcpy(walk->iv, ctrptr, DES_BLOCK_SIZE); - } /* final block may be < DES_BLOCK_SIZE, copy only nbytes */ if (nbytes) { - out = walk->dst.virt.addr; - in = walk->src.virt.addr; - ret = crypt_s390_kmctr(func, ctx->key, buf, in, - DES_BLOCK_SIZE, ctrbuf); - if (ret < 0 || ret != DES_BLOCK_SIZE) - return -EIO; - memcpy(out, buf, nbytes); - crypto_inc(ctrbuf, DES_BLOCK_SIZE); + cpacf_kmctr(fc, ctx->key, buf, walk->src.virt.addr, + DES_BLOCK_SIZE, walk->iv); + memcpy(walk->dst.virt.addr, buf, nbytes); + crypto_inc(walk->iv, DES_BLOCK_SIZE); ret = blkcipher_walk_done(desc, walk, 0); - memcpy(walk->iv, ctrbuf, DES_BLOCK_SIZE); } return ret; } @@ -454,28 +411,26 @@ static int ctr_des_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, struct scatterlist *src, unsigned int nbytes) { - struct s390_des_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); struct blkcipher_walk walk; blkcipher_walk_init(&walk, dst, src, nbytes); - return ctr_desall_crypt(desc, KMCTR_DEA_ENCRYPT, ctx, &walk); + return ctr_desall_crypt(desc, CPACF_KMCTR_DEA, &walk); } static int ctr_des_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, struct scatterlist *src, unsigned int nbytes) { - struct s390_des_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); struct blkcipher_walk walk; blkcipher_walk_init(&walk, dst, src, nbytes); - return ctr_desall_crypt(desc, KMCTR_DEA_DECRYPT, ctx, &walk); + return ctr_desall_crypt(desc, CPACF_KMCTR_DEA | CPACF_DECRYPT, &walk); } static struct crypto_alg ctr_des_alg = { .cra_name = "ctr(des)", .cra_driver_name = "ctr-des-s390", - .cra_priority = CRYPT_S390_COMPOSITE_PRIORITY, + .cra_priority = 400, /* combo: des + ctr */ .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, .cra_blocksize = 1, .cra_ctxsize = sizeof(struct s390_des_ctx), @@ -497,28 +452,27 @@ static int ctr_des3_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, struct scatterlist *src, unsigned int nbytes) { - struct s390_des_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); struct blkcipher_walk walk; blkcipher_walk_init(&walk, dst, src, nbytes); - return ctr_desall_crypt(desc, KMCTR_TDEA_192_ENCRYPT, ctx, &walk); + return ctr_desall_crypt(desc, CPACF_KMCTR_TDEA_192, &walk); } static int ctr_des3_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, struct scatterlist *src, unsigned int nbytes) { - struct s390_des_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); struct blkcipher_walk walk; blkcipher_walk_init(&walk, dst, src, nbytes); - return ctr_desall_crypt(desc, KMCTR_TDEA_192_DECRYPT, ctx, &walk); + return ctr_desall_crypt(desc, CPACF_KMCTR_TDEA_192 | CPACF_DECRYPT, + &walk); } static struct crypto_alg ctr_des3_alg = { .cra_name = "ctr(des3_ede)", .cra_driver_name = "ctr-des3_ede-s390", - .cra_priority = CRYPT_S390_COMPOSITE_PRIORITY, + .cra_priority = 400, /* combo: des3 + ede */ .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, .cra_blocksize = 1, .cra_ctxsize = sizeof(struct s390_des_ctx), @@ -536,85 +490,87 @@ static struct crypto_alg ctr_des3_alg = { } }; +static struct crypto_alg *des_s390_algs_ptr[8]; +static int des_s390_algs_num; + +static int des_s390_register_alg(struct crypto_alg *alg) +{ + int ret; + + ret = crypto_register_alg(alg); + if (!ret) + des_s390_algs_ptr[des_s390_algs_num++] = alg; + return ret; +} + +static void des_s390_exit(void) +{ + while (des_s390_algs_num--) + crypto_unregister_alg(des_s390_algs_ptr[des_s390_algs_num]); + if (ctrblk) + free_page((unsigned long) ctrblk); +} + static int __init des_s390_init(void) { int ret; - if (!crypt_s390_func_available(KM_DEA_ENCRYPT, CRYPT_S390_MSA) || - !crypt_s390_func_available(KM_TDEA_192_ENCRYPT, CRYPT_S390_MSA)) - return -EOPNOTSUPP; - - ret = crypto_register_alg(&des_alg); - if (ret) - goto des_err; - ret = crypto_register_alg(&ecb_des_alg); - if (ret) - goto ecb_des_err; - ret = crypto_register_alg(&cbc_des_alg); - if (ret) - goto cbc_des_err; - ret = crypto_register_alg(&des3_alg); - if (ret) - goto des3_err; - ret = crypto_register_alg(&ecb_des3_alg); - if (ret) - goto ecb_des3_err; - ret = crypto_register_alg(&cbc_des3_alg); - if (ret) - goto cbc_des3_err; - - if (crypt_s390_func_available(KMCTR_DEA_ENCRYPT, - CRYPT_S390_MSA | CRYPT_S390_MSA4) && - crypt_s390_func_available(KMCTR_TDEA_192_ENCRYPT, - CRYPT_S390_MSA | CRYPT_S390_MSA4)) { - ret = crypto_register_alg(&ctr_des_alg); + /* Query available functions for KM, KMC and KMCTR */ + cpacf_query(CPACF_KM, &km_functions); + cpacf_query(CPACF_KMC, &kmc_functions); + cpacf_query(CPACF_KMCTR, &kmctr_functions); + + if (cpacf_test_func(&km_functions, CPACF_KM_DEA)) { + ret = des_s390_register_alg(&des_alg); + if (ret) + goto out_err; + ret = des_s390_register_alg(&ecb_des_alg); + if (ret) + goto out_err; + } + if (cpacf_test_func(&kmc_functions, CPACF_KMC_DEA)) { + ret = des_s390_register_alg(&cbc_des_alg); + if (ret) + goto out_err; + } + if (cpacf_test_func(&km_functions, CPACF_KM_TDEA_192)) { + ret = des_s390_register_alg(&des3_alg); if (ret) - goto ctr_des_err; - ret = crypto_register_alg(&ctr_des3_alg); + goto out_err; + ret = des_s390_register_alg(&ecb_des3_alg); if (ret) - goto ctr_des3_err; + goto out_err; + } + if (cpacf_test_func(&kmc_functions, CPACF_KMC_TDEA_192)) { + ret = des_s390_register_alg(&cbc_des3_alg); + if (ret) + goto out_err; + } + + if (cpacf_test_func(&kmctr_functions, CPACF_KMCTR_DEA) || + cpacf_test_func(&kmctr_functions, CPACF_KMCTR_TDEA_192)) { ctrblk = (u8 *) __get_free_page(GFP_KERNEL); if (!ctrblk) { ret = -ENOMEM; - goto ctr_mem_err; + goto out_err; } } -out: - return ret; - -ctr_mem_err: - crypto_unregister_alg(&ctr_des3_alg); -ctr_des3_err: - crypto_unregister_alg(&ctr_des_alg); -ctr_des_err: - crypto_unregister_alg(&cbc_des3_alg); -cbc_des3_err: - crypto_unregister_alg(&ecb_des3_alg); -ecb_des3_err: - crypto_unregister_alg(&des3_alg); -des3_err: - crypto_unregister_alg(&cbc_des_alg); -cbc_des_err: - crypto_unregister_alg(&ecb_des_alg); -ecb_des_err: - crypto_unregister_alg(&des_alg); -des_err: - goto out; -} -static void __exit des_s390_exit(void) -{ - if (ctrblk) { - crypto_unregister_alg(&ctr_des_alg); - crypto_unregister_alg(&ctr_des3_alg); - free_page((unsigned long) ctrblk); + if (cpacf_test_func(&kmctr_functions, CPACF_KMCTR_DEA)) { + ret = des_s390_register_alg(&ctr_des_alg); + if (ret) + goto out_err; + } + if (cpacf_test_func(&kmctr_functions, CPACF_KMCTR_TDEA_192)) { + ret = des_s390_register_alg(&ctr_des3_alg); + if (ret) + goto out_err; } - crypto_unregister_alg(&cbc_des3_alg); - crypto_unregister_alg(&ecb_des3_alg); - crypto_unregister_alg(&des3_alg); - crypto_unregister_alg(&cbc_des_alg); - crypto_unregister_alg(&ecb_des_alg); - crypto_unregister_alg(&des_alg); + + return 0; +out_err: + des_s390_exit(); + return ret; } module_cpu_feature_match(MSA, des_s390_init); diff --git a/arch/s390/crypto/ghash_s390.c b/arch/s390/crypto/ghash_s390.c index 26e14efd30a7..564616d48d8b 100644 --- a/arch/s390/crypto/ghash_s390.c +++ b/arch/s390/crypto/ghash_s390.c @@ -10,8 +10,7 @@ #include <crypto/internal/hash.h> #include <linux/module.h> #include <linux/cpufeature.h> - -#include "crypt_s390.h" +#include <asm/cpacf.h> #define GHASH_BLOCK_SIZE 16 #define GHASH_DIGEST_SIZE 16 @@ -59,7 +58,6 @@ static int ghash_update(struct shash_desc *desc, struct ghash_desc_ctx *dctx = shash_desc_ctx(desc); unsigned int n; u8 *buf = dctx->buffer; - int ret; if (dctx->bytes) { u8 *pos = buf + (GHASH_BLOCK_SIZE - dctx->bytes); @@ -72,18 +70,14 @@ static int ghash_update(struct shash_desc *desc, src += n; if (!dctx->bytes) { - ret = crypt_s390_kimd(KIMD_GHASH, dctx, buf, - GHASH_BLOCK_SIZE); - if (ret != GHASH_BLOCK_SIZE) - return -EIO; + cpacf_kimd(CPACF_KIMD_GHASH, dctx, buf, + GHASH_BLOCK_SIZE); } } n = srclen & ~(GHASH_BLOCK_SIZE - 1); if (n) { - ret = crypt_s390_kimd(KIMD_GHASH, dctx, src, n); - if (ret != n) - return -EIO; + cpacf_kimd(CPACF_KIMD_GHASH, dctx, src, n); src += n; srclen -= n; } @@ -99,17 +93,12 @@ static int ghash_update(struct shash_desc *desc, static int ghash_flush(struct ghash_desc_ctx *dctx) { u8 *buf = dctx->buffer; - int ret; if (dctx->bytes) { u8 *pos = buf + (GHASH_BLOCK_SIZE - dctx->bytes); memset(pos, 0, dctx->bytes); - - ret = crypt_s390_kimd(KIMD_GHASH, dctx, buf, GHASH_BLOCK_SIZE); - if (ret != GHASH_BLOCK_SIZE) - return -EIO; - + cpacf_kimd(CPACF_KIMD_GHASH, dctx, buf, GHASH_BLOCK_SIZE); dctx->bytes = 0; } @@ -137,7 +126,7 @@ static struct shash_alg ghash_alg = { .base = { .cra_name = "ghash", .cra_driver_name = "ghash-s390", - .cra_priority = CRYPT_S390_PRIORITY, + .cra_priority = 300, .cra_flags = CRYPTO_ALG_TYPE_SHASH, .cra_blocksize = GHASH_BLOCK_SIZE, .cra_ctxsize = sizeof(struct ghash_ctx), @@ -147,8 +136,7 @@ static struct shash_alg ghash_alg = { static int __init ghash_mod_init(void) { - if (!crypt_s390_func_available(KIMD_GHASH, - CRYPT_S390_MSA | CRYPT_S390_MSA4)) + if (!cpacf_query_func(CPACF_KIMD, CPACF_KIMD_GHASH)) return -EOPNOTSUPP; return crypto_register_shash(&ghash_alg); diff --git a/arch/s390/crypto/prng.c b/arch/s390/crypto/prng.c index d750cc0dfe30..9cc050f9536c 100644 --- a/arch/s390/crypto/prng.c +++ b/arch/s390/crypto/prng.c @@ -23,8 +23,7 @@ #include <asm/debug.h> #include <asm/uaccess.h> #include <asm/timex.h> - -#include "crypt_s390.h" +#include <asm/cpacf.h> MODULE_LICENSE("GPL"); MODULE_AUTHOR("IBM Corporation"); @@ -136,12 +135,7 @@ static int generate_entropy(u8 *ebuf, size_t nbytes) else h = ebuf; /* generate sha256 from this page */ - if (crypt_s390_kimd(KIMD_SHA_256, h, - pg, PAGE_SIZE) != PAGE_SIZE) { - prng_errorflag = PRNG_GEN_ENTROPY_FAILED; - ret = -EIO; - goto out; - } + cpacf_kimd(CPACF_KIMD_SHA_256, h, pg, PAGE_SIZE); if (n < sizeof(hash)) memcpy(ebuf, hash, n); ret += n; @@ -149,7 +143,6 @@ static int generate_entropy(u8 *ebuf, size_t nbytes) nbytes -= n; } -out: free_page((unsigned long)pg); return ret; } @@ -161,13 +154,11 @@ static void prng_tdes_add_entropy(void) { __u64 entropy[4]; unsigned int i; - int ret; for (i = 0; i < 16; i++) { - ret = crypt_s390_kmc(KMC_PRNG, prng_data->prngws.parm_block, - (char *)entropy, (char *)entropy, - sizeof(entropy)); - BUG_ON(ret < 0 || ret != sizeof(entropy)); + cpacf_kmc(CPACF_KMC_PRNG, prng_data->prngws.parm_block, + (char *) entropy, (char *) entropy, + sizeof(entropy)); memcpy(prng_data->prngws.parm_block, entropy, sizeof(entropy)); } } @@ -304,22 +295,14 @@ static int __init prng_sha512_selftest(void) 0x21, 0xe4, 0xb0, 0x86, 0x44, 0xf6, 0x72, 0x7c, 0x36, 0x8c, 0x5a, 0x9f, 0x7a, 0x4b, 0x3e, 0xe2 }; - int ret = 0; u8 buf[sizeof(random)]; struct ppno_ws_s ws; memset(&ws, 0, sizeof(ws)); /* initial seed */ - ret = crypt_s390_ppno(PPNO_SHA512_DRNG_SEED, - &ws, NULL, 0, - seed, sizeof(seed)); - if (ret < 0) { - pr_err("The prng self test seed operation for the " - "SHA-512 mode failed with rc=%d\n", ret); - prng_errorflag = PRNG_SELFTEST_FAILED; - return -EIO; - } + cpacf_ppno(CPACF_PPNO_SHA512_DRNG_SEED, + &ws, NULL, 0, seed, sizeof(seed)); /* check working states V and C */ if (memcmp(ws.V, V0, sizeof(V0)) != 0 @@ -331,24 +314,10 @@ static int __init prng_sha512_selftest(void) } /* generate random bytes */ - ret = crypt_s390_ppno(PPNO_SHA512_DRNG_GEN, - &ws, buf, sizeof(buf), - NULL, 0); - if (ret < 0) { - pr_err("The prng self test generate operation for " - "the SHA-512 mode failed with rc=%d\n", ret); - prng_errorflag = PRNG_SELFTEST_FAILED; - return -EIO; - } - ret = crypt_s390_ppno(PPNO_SHA512_DRNG_GEN, - &ws, buf, sizeof(buf), - NULL, 0); - if (ret < 0) { - pr_err("The prng self test generate operation for " - "the SHA-512 mode failed with rc=%d\n", ret); - prng_errorflag = PRNG_SELFTEST_FAILED; - return -EIO; - } + cpacf_ppno(CPACF_PPNO_SHA512_DRNG_GEN, + &ws, buf, sizeof(buf), NULL, 0); + cpacf_ppno(CPACF_PPNO_SHA512_DRNG_GEN, + &ws, buf, sizeof(buf), NULL, 0); /* check against expected data */ if (memcmp(buf, random, sizeof(random)) != 0) { @@ -396,29 +365,16 @@ static int __init prng_sha512_instantiate(void) get_tod_clock_ext(seed + 48); /* initial seed of the ppno drng */ - ret = crypt_s390_ppno(PPNO_SHA512_DRNG_SEED, - &prng_data->ppnows, NULL, 0, - seed, sizeof(seed)); - if (ret < 0) { - prng_errorflag = PRNG_SEED_FAILED; - ret = -EIO; - goto outfree; - } + cpacf_ppno(CPACF_PPNO_SHA512_DRNG_SEED, + &prng_data->ppnows, NULL, 0, seed, sizeof(seed)); /* if fips mode is enabled, generate a first block of random bytes for the FIPS 140-2 Conditional Self Test */ if (fips_enabled) { prng_data->prev = prng_data->buf + prng_chunk_size; - ret = crypt_s390_ppno(PPNO_SHA512_DRNG_GEN, - &prng_data->ppnows, - prng_data->prev, - prng_chunk_size, - NULL, 0); - if (ret < 0 || ret != prng_chunk_size) { - prng_errorflag = PRNG_GEN_FAILED; - ret = -EIO; - goto outfree; - } + cpacf_ppno(CPACF_PPNO_SHA512_DRNG_GEN, + &prng_data->ppnows, + prng_data->prev, prng_chunk_size, NULL, 0); } return 0; @@ -447,13 +403,8 @@ static int prng_sha512_reseed(void) return ret; /* do a reseed of the ppno drng with this bytestring */ - ret = crypt_s390_ppno(PPNO_SHA512_DRNG_SEED, - &prng_data->ppnows, NULL, 0, - seed, sizeof(seed)); - if (ret) { - prng_errorflag = PRNG_RESEED_FAILED; - return -EIO; - } + cpacf_ppno(CPACF_PPNO_SHA512_DRNG_SEED, + &prng_data->ppnows, NULL, 0, seed, sizeof(seed)); return 0; } @@ -471,13 +422,8 @@ static int prng_sha512_generate(u8 *buf, size_t nbytes) } /* PPNO generate */ - ret = crypt_s390_ppno(PPNO_SHA512_DRNG_GEN, - &prng_data->ppnows, buf, nbytes, - NULL, 0); - if (ret < 0 || ret != nbytes) { - prng_errorflag = PRNG_GEN_FAILED; - return -EIO; - } + cpacf_ppno(CPACF_PPNO_SHA512_DRNG_GEN, + &prng_data->ppnows, buf, nbytes, NULL, 0); /* FIPS 140-2 Conditional Self Test */ if (fips_enabled) { @@ -488,7 +434,7 @@ static int prng_sha512_generate(u8 *buf, size_t nbytes) memcpy(prng_data->prev, buf, nbytes); } - return ret; + return nbytes; } @@ -503,7 +449,7 @@ static int prng_open(struct inode *inode, struct file *file) static ssize_t prng_tdes_read(struct file *file, char __user *ubuf, size_t nbytes, loff_t *ppos) { - int chunk, n, tmp, ret = 0; + int chunk, n, ret = 0; /* lock prng_data struct */ if (mutex_lock_interruptible(&prng_data->mutex)) @@ -554,13 +500,9 @@ static ssize_t prng_tdes_read(struct file *file, char __user *ubuf, * * Note: you can still get strict X9.17 conformity by setting * prng_chunk_size to 8 bytes. - */ - tmp = crypt_s390_kmc(KMC_PRNG, prng_data->prngws.parm_block, - prng_data->buf, prng_data->buf, n); - if (tmp < 0 || tmp != n) { - ret = -EIO; - break; - } + */ + cpacf_kmc(CPACF_KMC_PRNG, prng_data->prngws.parm_block, + prng_data->buf, prng_data->buf, n); prng_data->prngws.byte_counter += n; prng_data->prngws.reseed_counter += n; @@ -815,14 +757,13 @@ static int __init prng_init(void) int ret; /* check if the CPU has a PRNG */ - if (!crypt_s390_func_available(KMC_PRNG, CRYPT_S390_MSA)) + if (!cpacf_query_func(CPACF_KMC, CPACF_KMC_PRNG)) return -EOPNOTSUPP; /* choose prng mode */ if (prng_mode != PRNG_MODE_TDES) { /* check for MSA5 support for PPNO operations */ - if (!crypt_s390_func_available(PPNO_SHA512_DRNG_GEN, - CRYPT_S390_MSA5)) { + if (!cpacf_query_func(CPACF_PPNO, CPACF_PPNO_SHA512_DRNG_GEN)) { if (prng_mode == PRNG_MODE_SHA512) { pr_err("The prng module cannot " "start in SHA-512 mode\n"); diff --git a/arch/s390/crypto/sha1_s390.c b/arch/s390/crypto/sha1_s390.c index 9208eadae9f0..c7de53d8da75 100644 --- a/arch/s390/crypto/sha1_s390.c +++ b/arch/s390/crypto/sha1_s390.c @@ -28,8 +28,8 @@ #include <linux/module.h> #include <linux/cpufeature.h> #include <crypto/sha.h> +#include <asm/cpacf.h> -#include "crypt_s390.h" #include "sha.h" static int sha1_init(struct shash_desc *desc) @@ -42,7 +42,7 @@ static int sha1_init(struct shash_desc *desc) sctx->state[3] = SHA1_H3; sctx->state[4] = SHA1_H4; sctx->count = 0; - sctx->func = KIMD_SHA_1; + sctx->func = CPACF_KIMD_SHA_1; return 0; } @@ -66,7 +66,7 @@ static int sha1_import(struct shash_desc *desc, const void *in) sctx->count = ictx->count; memcpy(sctx->state, ictx->state, sizeof(ictx->state)); memcpy(sctx->buf, ictx->buffer, sizeof(ictx->buffer)); - sctx->func = KIMD_SHA_1; + sctx->func = CPACF_KIMD_SHA_1; return 0; } @@ -82,7 +82,7 @@ static struct shash_alg alg = { .base = { .cra_name = "sha1", .cra_driver_name= "sha1-s390", - .cra_priority = CRYPT_S390_PRIORITY, + .cra_priority = 300, .cra_flags = CRYPTO_ALG_TYPE_SHASH, .cra_blocksize = SHA1_BLOCK_SIZE, .cra_module = THIS_MODULE, @@ -91,7 +91,7 @@ static struct shash_alg alg = { static int __init sha1_s390_init(void) { - if (!crypt_s390_func_available(KIMD_SHA_1, CRYPT_S390_MSA)) + if (!cpacf_query_func(CPACF_KIMD, CPACF_KIMD_SHA_1)) return -EOPNOTSUPP; return crypto_register_shash(&alg); } diff --git a/arch/s390/crypto/sha256_s390.c b/arch/s390/crypto/sha256_s390.c index 667888f5c964..53c277999a28 100644 --- a/arch/s390/crypto/sha256_s390.c +++ b/arch/s390/crypto/sha256_s390.c @@ -18,8 +18,8 @@ #include <linux/module.h> #include <linux/cpufeature.h> #include <crypto/sha.h> +#include <asm/cpacf.h> -#include "crypt_s390.h" #include "sha.h" static int sha256_init(struct shash_desc *desc) @@ -35,7 +35,7 @@ static int sha256_init(struct shash_desc *desc) sctx->state[6] = SHA256_H6; sctx->state[7] = SHA256_H7; sctx->count = 0; - sctx->func = KIMD_SHA_256; + sctx->func = CPACF_KIMD_SHA_256; return 0; } @@ -59,7 +59,7 @@ static int sha256_import(struct shash_desc *desc, const void *in) sctx->count = ictx->count; memcpy(sctx->state, ictx->state, sizeof(ictx->state)); memcpy(sctx->buf, ictx->buf, sizeof(ictx->buf)); - sctx->func = KIMD_SHA_256; + sctx->func = CPACF_KIMD_SHA_256; return 0; } @@ -75,7 +75,7 @@ static struct shash_alg sha256_alg = { .base = { .cra_name = "sha256", .cra_driver_name= "sha256-s390", - .cra_priority = CRYPT_S390_PRIORITY, + .cra_priority = 300, .cra_flags = CRYPTO_ALG_TYPE_SHASH, .cra_blocksize = SHA256_BLOCK_SIZE, .cra_module = THIS_MODULE, @@ -95,7 +95,7 @@ static int sha224_init(struct shash_desc *desc) sctx->state[6] = SHA224_H6; sctx->state[7] = SHA224_H7; sctx->count = 0; - sctx->func = KIMD_SHA_256; + sctx->func = CPACF_KIMD_SHA_256; return 0; } @@ -112,7 +112,7 @@ static struct shash_alg sha224_alg = { .base = { .cra_name = "sha224", .cra_driver_name= "sha224-s390", - .cra_priority = CRYPT_S390_PRIORITY, + .cra_priority = 300, .cra_flags = CRYPTO_ALG_TYPE_SHASH, .cra_blocksize = SHA224_BLOCK_SIZE, .cra_module = THIS_MODULE, @@ -123,7 +123,7 @@ static int __init sha256_s390_init(void) { int ret; - if (!crypt_s390_func_available(KIMD_SHA_256, CRYPT_S390_MSA)) + if (!cpacf_query_func(CPACF_KIMD, CPACF_KIMD_SHA_256)) return -EOPNOTSUPP; ret = crypto_register_shash(&sha256_alg); if (ret < 0) diff --git a/arch/s390/crypto/sha512_s390.c b/arch/s390/crypto/sha512_s390.c index 2ba66b1518f0..2f4caa1ef123 100644 --- a/arch/s390/crypto/sha512_s390.c +++ b/arch/s390/crypto/sha512_s390.c @@ -19,9 +19,9 @@ #include <linux/kernel.h> #include <linux/module.h> #include <linux/cpufeature.h> +#include <asm/cpacf.h> #include "sha.h" -#include "crypt_s390.h" static int sha512_init(struct shash_desc *desc) { @@ -36,7 +36,7 @@ static int sha512_init(struct shash_desc *desc) *(__u64 *)&ctx->state[12] = 0x1f83d9abfb41bd6bULL; *(__u64 *)&ctx->state[14] = 0x5be0cd19137e2179ULL; ctx->count = 0; - ctx->func = KIMD_SHA_512; + ctx->func = CPACF_KIMD_SHA_512; return 0; } @@ -64,7 +64,7 @@ static int sha512_import(struct shash_desc *desc, const void *in) memcpy(sctx->state, ictx->state, sizeof(ictx->state)); memcpy(sctx->buf, ictx->buf, sizeof(ictx->buf)); - sctx->func = KIMD_SHA_512; + sctx->func = CPACF_KIMD_SHA_512; return 0; } @@ -80,7 +80,7 @@ static struct shash_alg sha512_alg = { .base = { .cra_name = "sha512", .cra_driver_name= "sha512-s390", - .cra_priority = CRYPT_S390_PRIORITY, + .cra_priority = 300, .cra_flags = CRYPTO_ALG_TYPE_SHASH, .cra_blocksize = SHA512_BLOCK_SIZE, .cra_module = THIS_MODULE, @@ -102,7 +102,7 @@ static int sha384_init(struct shash_desc *desc) *(__u64 *)&ctx->state[12] = 0xdb0c2e0d64f98fa7ULL; *(__u64 *)&ctx->state[14] = 0x47b5481dbefa4fa4ULL; ctx->count = 0; - ctx->func = KIMD_SHA_512; + ctx->func = CPACF_KIMD_SHA_512; return 0; } @@ -119,7 +119,7 @@ static struct shash_alg sha384_alg = { .base = { .cra_name = "sha384", .cra_driver_name= "sha384-s390", - .cra_priority = CRYPT_S390_PRIORITY, + .cra_priority = 300, .cra_flags = CRYPTO_ALG_TYPE_SHASH, .cra_blocksize = SHA384_BLOCK_SIZE, .cra_ctxsize = sizeof(struct s390_sha_ctx), @@ -133,7 +133,7 @@ static int __init init(void) { int ret; - if (!crypt_s390_func_available(KIMD_SHA_512, CRYPT_S390_MSA)) + if (!cpacf_query_func(CPACF_KIMD, CPACF_KIMD_SHA_512)) return -EOPNOTSUPP; if ((ret = crypto_register_shash(&sha512_alg)) < 0) goto out; diff --git a/arch/s390/crypto/sha_common.c b/arch/s390/crypto/sha_common.c index 8620b0ec9c42..c740f77285b2 100644 --- a/arch/s390/crypto/sha_common.c +++ b/arch/s390/crypto/sha_common.c @@ -15,15 +15,14 @@ #include <crypto/internal/hash.h> #include <linux/module.h> +#include <asm/cpacf.h> #include "sha.h" -#include "crypt_s390.h" int s390_sha_update(struct shash_desc *desc, const u8 *data, unsigned int len) { struct s390_sha_ctx *ctx = shash_desc_ctx(desc); unsigned int bsize = crypto_shash_blocksize(desc->tfm); - unsigned int index; - int ret; + unsigned int index, n; /* how much is already in the buffer? */ index = ctx->count & (bsize - 1); @@ -35,9 +34,7 @@ int s390_sha_update(struct shash_desc *desc, const u8 *data, unsigned int len) /* process one stored block */ if (index) { memcpy(ctx->buf + index, data, bsize - index); - ret = crypt_s390_kimd(ctx->func, ctx->state, ctx->buf, bsize); - if (ret != bsize) - return -EIO; + cpacf_kimd(ctx->func, ctx->state, ctx->buf, bsize); data += bsize - index; len -= bsize - index; index = 0; @@ -45,12 +42,10 @@ int s390_sha_update(struct shash_desc *desc, const u8 *data, unsigned int len) /* process as many blocks as possible */ if (len >= bsize) { - ret = crypt_s390_kimd(ctx->func, ctx->state, data, - len & ~(bsize - 1)); - if (ret != (len & ~(bsize - 1))) - return -EIO; - data += ret; - len -= ret; + n = len & ~(bsize - 1); + cpacf_kimd(ctx->func, ctx->state, data, n); + data += n; + len -= n; } store: if (len) @@ -66,7 +61,6 @@ int s390_sha_final(struct shash_desc *desc, u8 *out) unsigned int bsize = crypto_shash_blocksize(desc->tfm); u64 bits; unsigned int index, end, plen; - int ret; /* SHA-512 uses 128 bit padding length */ plen = (bsize > SHA256_BLOCK_SIZE) ? 16 : 8; @@ -88,10 +82,7 @@ int s390_sha_final(struct shash_desc *desc, u8 *out) */ bits = ctx->count * 8; memcpy(ctx->buf + end - 8, &bits, sizeof(bits)); - - ret = crypt_s390_kimd(ctx->func, ctx->state, ctx->buf, end); - if (ret != end) - return -EIO; + cpacf_kimd(ctx->func, ctx->state, ctx->buf, end); /* copy digest to out */ memcpy(out, ctx->state, crypto_shash_digestsize(desc->tfm)); diff --git a/arch/s390/defconfig b/arch/s390/defconfig index e24f2af4c73b..2d40ef0a6295 100644 --- a/arch/s390/defconfig +++ b/arch/s390/defconfig @@ -1,8 +1,8 @@ CONFIG_SYSVIPC=y CONFIG_POSIX_MQUEUE=y -CONFIG_FHANDLE=y +CONFIG_USELIB=y CONFIG_AUDIT=y -CONFIG_NO_HZ=y +CONFIG_NO_HZ_IDLE=y CONFIG_HIGH_RES_TIMERS=y CONFIG_TASKSTATS=y CONFIG_TASK_DELAY_ACCT=y @@ -11,19 +11,19 @@ CONFIG_TASK_IO_ACCOUNTING=y CONFIG_IKCONFIG=y CONFIG_IKCONFIG_PROC=y CONFIG_CGROUPS=y -CONFIG_CGROUP_FREEZER=y -CONFIG_CGROUP_PIDS=y -CONFIG_CGROUP_DEVICE=y -CONFIG_CPUSETS=y -CONFIG_CGROUP_CPUACCT=y CONFIG_MEMCG=y CONFIG_MEMCG_SWAP=y -CONFIG_MEMCG_KMEM=y -CONFIG_CGROUP_HUGETLB=y -CONFIG_CGROUP_PERF=y +CONFIG_BLK_CGROUP=y CONFIG_CGROUP_SCHED=y CONFIG_RT_GROUP_SCHED=y -CONFIG_BLK_CGROUP=y +CONFIG_CGROUP_PIDS=y +CONFIG_CGROUP_FREEZER=y +CONFIG_CGROUP_HUGETLB=y +CONFIG_CPUSETS=y +CONFIG_CGROUP_DEVICE=y +CONFIG_CGROUP_CPUACCT=y +CONFIG_CGROUP_PERF=y +CONFIG_CHECKPOINT_RESTORE=y CONFIG_NAMESPACES=y CONFIG_USER_NS=y CONFIG_BLK_DEV_INITRD=y @@ -44,7 +44,6 @@ CONFIG_PARTITION_ADVANCED=y CONFIG_IBM_PARTITION=y CONFIG_DEFAULT_DEADLINE=y CONFIG_LIVEPATCH=y -CONFIG_MARCH_Z196=y CONFIG_NR_CPUS=256 CONFIG_NUMA=y CONFIG_HZ_100=y @@ -52,6 +51,14 @@ CONFIG_MEMORY_HOTPLUG=y CONFIG_MEMORY_HOTREMOVE=y CONFIG_KSM=y CONFIG_TRANSPARENT_HUGEPAGE=y +CONFIG_CLEANCACHE=y +CONFIG_FRONTSWAP=y +CONFIG_CMA=y +CONFIG_ZSWAP=y +CONFIG_ZBUD=m +CONFIG_ZSMALLOC=m +CONFIG_ZSMALLOC_STAT=y +CONFIG_IDLE_PAGE_TRACKING=y CONFIG_CRASH_DUMP=y CONFIG_BINFMT_MISC=m CONFIG_HIBERNATION=y @@ -61,7 +68,6 @@ CONFIG_UNIX=y CONFIG_NET_KEY=y CONFIG_INET=y CONFIG_IP_MULTICAST=y -# CONFIG_INET_LRO is not set CONFIG_L2TP=m CONFIG_L2TP_DEBUGFS=m CONFIG_VLAN_8021Q=y @@ -144,6 +150,9 @@ CONFIG_TMPFS=y CONFIG_TMPFS_POSIX_ACL=y CONFIG_HUGETLBFS=y # CONFIG_NETWORK_FILESYSTEMS is not set +CONFIG_DEBUG_INFO=y +CONFIG_DEBUG_INFO_DWARF4=y +CONFIG_GDB_SCRIPTS=y CONFIG_UNUSED_SYMBOLS=y CONFIG_DEBUG_SECTION_MISMATCH=y CONFIG_DEBUG_FORCE_WEAK_PER_CPU=y @@ -158,20 +167,20 @@ CONFIG_LOCK_STAT=y CONFIG_DEBUG_LOCKDEP=y CONFIG_DEBUG_ATOMIC_SLEEP=y CONFIG_DEBUG_LIST=y -CONFIG_DEBUG_PI_LIST=y CONFIG_DEBUG_SG=y CONFIG_DEBUG_NOTIFIERS=y CONFIG_RCU_CPU_STALL_TIMEOUT=60 CONFIG_RCU_TRACE=y CONFIG_LATENCYTOP=y -CONFIG_DEBUG_STRICT_USER_COPY_CHECKS=y -CONFIG_TRACER_SNAPSHOT=y +CONFIG_SCHED_TRACER=y +CONFIG_FTRACE_SYSCALLS=y CONFIG_TRACER_SNAPSHOT_PER_CPU_SWAP=y CONFIG_STACK_TRACER=y CONFIG_BLK_DEV_IO_TRACE=y CONFIG_UPROBE_EVENT=y +CONFIG_FUNCTION_PROFILER=y +CONFIG_TRACE_ENUM_MAP_FILE=y CONFIG_KPROBES_SANITY_TEST=y -# CONFIG_STRICT_DEVMEM is not set CONFIG_S390_PTDUMP=y CONFIG_CRYPTO_CRYPTD=m CONFIG_CRYPTO_AUTHENC=m @@ -212,17 +221,19 @@ CONFIG_CRYPTO_SERPENT=m CONFIG_CRYPTO_TEA=m CONFIG_CRYPTO_TWOFISH=m CONFIG_CRYPTO_DEFLATE=m -CONFIG_CRYPTO_ZLIB=m -CONFIG_CRYPTO_LZO=m CONFIG_CRYPTO_LZ4=m CONFIG_CRYPTO_LZ4HC=m CONFIG_CRYPTO_ANSI_CPRNG=m +CONFIG_CRYPTO_USER_API_HASH=m +CONFIG_CRYPTO_USER_API_SKCIPHER=m +CONFIG_CRYPTO_USER_API_RNG=m CONFIG_ZCRYPT=m CONFIG_CRYPTO_SHA1_S390=m CONFIG_CRYPTO_SHA256_S390=m CONFIG_CRYPTO_SHA512_S390=m CONFIG_CRYPTO_DES_S390=m CONFIG_CRYPTO_AES_S390=m +CONFIG_CRYPTO_CRC32_S390=y CONFIG_CRC7=m # CONFIG_XZ_DEC_X86 is not set # CONFIG_XZ_DEC_POWERPC is not set diff --git a/arch/s390/hypfs/hypfs_diag.c b/arch/s390/hypfs/hypfs_diag.c index 045035796ca7..28f03ca60100 100644 --- a/arch/s390/hypfs/hypfs_diag.c +++ b/arch/s390/hypfs/hypfs_diag.c @@ -19,29 +19,10 @@ #include <asm/ebcdic.h> #include "hypfs.h" -#define LPAR_NAME_LEN 8 /* lpar name len in diag 204 data */ -#define CPU_NAME_LEN 16 /* type name len of cpus in diag224 name table */ #define TMP_SIZE 64 /* size of temporary buffers */ #define DBFS_D204_HDR_VERSION 0 -/* diag 204 subcodes */ -enum diag204_sc { - SUBC_STIB4 = 4, - SUBC_RSI = 5, - SUBC_STIB6 = 6, - SUBC_STIB7 = 7 -}; - -/* The two available diag 204 data formats */ -enum diag204_format { - INFO_SIMPLE = 0, - INFO_EXT = 0x00010000 -}; - -/* bit is set in flags, when physical cpu info is included in diag 204 data */ -#define LPAR_PHYS_FLG 0x80 - static char *diag224_cpu_names; /* diag 224 name table */ static enum diag204_sc diag204_store_sc; /* used subcode for store */ static enum diag204_format diag204_info_type; /* used diag 204 data format */ @@ -53,7 +34,7 @@ static int diag204_buf_pages; /* number of pages for diag204 data */ static struct dentry *dbfs_d204_file; /* - * DIAG 204 data structures and member access functions. + * DIAG 204 member access functions. * * Since we have two different diag 204 data formats for old and new s390 * machines, we do not access the structs directly, but use getter functions for @@ -62,302 +43,173 @@ static struct dentry *dbfs_d204_file; /* Time information block */ -struct info_blk_hdr { - __u8 npar; - __u8 flags; - __u16 tslice; - __u16 phys_cpus; - __u16 this_part; - __u64 curtod; -} __attribute__ ((packed)); - -struct x_info_blk_hdr { - __u8 npar; - __u8 flags; - __u16 tslice; - __u16 phys_cpus; - __u16 this_part; - __u64 curtod1; - __u64 curtod2; - char reserved[40]; -} __attribute__ ((packed)); - static inline int info_blk_hdr__size(enum diag204_format type) { - if (type == INFO_SIMPLE) - return sizeof(struct info_blk_hdr); - else /* INFO_EXT */ - return sizeof(struct x_info_blk_hdr); + if (type == DIAG204_INFO_SIMPLE) + return sizeof(struct diag204_info_blk_hdr); + else /* DIAG204_INFO_EXT */ + return sizeof(struct diag204_x_info_blk_hdr); } static inline __u8 info_blk_hdr__npar(enum diag204_format type, void *hdr) { - if (type == INFO_SIMPLE) - return ((struct info_blk_hdr *)hdr)->npar; - else /* INFO_EXT */ - return ((struct x_info_blk_hdr *)hdr)->npar; + if (type == DIAG204_INFO_SIMPLE) + return ((struct diag204_info_blk_hdr *)hdr)->npar; + else /* DIAG204_INFO_EXT */ + return ((struct diag204_x_info_blk_hdr *)hdr)->npar; } static inline __u8 info_blk_hdr__flags(enum diag204_format type, void *hdr) { - if (type == INFO_SIMPLE) - return ((struct info_blk_hdr *)hdr)->flags; - else /* INFO_EXT */ - return ((struct x_info_blk_hdr *)hdr)->flags; + if (type == DIAG204_INFO_SIMPLE) + return ((struct diag204_info_blk_hdr *)hdr)->flags; + else /* DIAG204_INFO_EXT */ + return ((struct diag204_x_info_blk_hdr *)hdr)->flags; } static inline __u16 info_blk_hdr__pcpus(enum diag204_format type, void *hdr) { - if (type == INFO_SIMPLE) - return ((struct info_blk_hdr *)hdr)->phys_cpus; - else /* INFO_EXT */ - return ((struct x_info_blk_hdr *)hdr)->phys_cpus; + if (type == DIAG204_INFO_SIMPLE) + return ((struct diag204_info_blk_hdr *)hdr)->phys_cpus; + else /* DIAG204_INFO_EXT */ + return ((struct diag204_x_info_blk_hdr *)hdr)->phys_cpus; } /* Partition header */ -struct part_hdr { - __u8 pn; - __u8 cpus; - char reserved[6]; - char part_name[LPAR_NAME_LEN]; -} __attribute__ ((packed)); - -struct x_part_hdr { - __u8 pn; - __u8 cpus; - __u8 rcpus; - __u8 pflag; - __u32 mlu; - char part_name[LPAR_NAME_LEN]; - char lpc_name[8]; - char os_name[8]; - __u64 online_cs; - __u64 online_es; - __u8 upid; - char reserved1[3]; - __u32 group_mlu; - char group_name[8]; - char reserved2[32]; -} __attribute__ ((packed)); - static inline int part_hdr__size(enum diag204_format type) { - if (type == INFO_SIMPLE) - return sizeof(struct part_hdr); - else /* INFO_EXT */ - return sizeof(struct x_part_hdr); + if (type == DIAG204_INFO_SIMPLE) + return sizeof(struct diag204_part_hdr); + else /* DIAG204_INFO_EXT */ + return sizeof(struct diag204_x_part_hdr); } static inline __u8 part_hdr__rcpus(enum diag204_format type, void *hdr) { - if (type == INFO_SIMPLE) - return ((struct part_hdr *)hdr)->cpus; - else /* INFO_EXT */ - return ((struct x_part_hdr *)hdr)->rcpus; + if (type == DIAG204_INFO_SIMPLE) + return ((struct diag204_part_hdr *)hdr)->cpus; + else /* DIAG204_INFO_EXT */ + return ((struct diag204_x_part_hdr *)hdr)->rcpus; } static inline void part_hdr__part_name(enum diag204_format type, void *hdr, char *name) { - if (type == INFO_SIMPLE) - memcpy(name, ((struct part_hdr *)hdr)->part_name, - LPAR_NAME_LEN); - else /* INFO_EXT */ - memcpy(name, ((struct x_part_hdr *)hdr)->part_name, - LPAR_NAME_LEN); - EBCASC(name, LPAR_NAME_LEN); - name[LPAR_NAME_LEN] = 0; + if (type == DIAG204_INFO_SIMPLE) + memcpy(name, ((struct diag204_part_hdr *)hdr)->part_name, + DIAG204_LPAR_NAME_LEN); + else /* DIAG204_INFO_EXT */ + memcpy(name, ((struct diag204_x_part_hdr *)hdr)->part_name, + DIAG204_LPAR_NAME_LEN); + EBCASC(name, DIAG204_LPAR_NAME_LEN); + name[DIAG204_LPAR_NAME_LEN] = 0; strim(name); } -struct cpu_info { - __u16 cpu_addr; - char reserved1[2]; - __u8 ctidx; - __u8 cflag; - __u16 weight; - __u64 acc_time; - __u64 lp_time; -} __attribute__ ((packed)); - -struct x_cpu_info { - __u16 cpu_addr; - char reserved1[2]; - __u8 ctidx; - __u8 cflag; - __u16 weight; - __u64 acc_time; - __u64 lp_time; - __u16 min_weight; - __u16 cur_weight; - __u16 max_weight; - char reseved2[2]; - __u64 online_time; - __u64 wait_time; - __u32 pma_weight; - __u32 polar_weight; - char reserved3[40]; -} __attribute__ ((packed)); - /* CPU info block */ static inline int cpu_info__size(enum diag204_format type) { - if (type == INFO_SIMPLE) - return sizeof(struct cpu_info); - else /* INFO_EXT */ - return sizeof(struct x_cpu_info); + if (type == DIAG204_INFO_SIMPLE) + return sizeof(struct diag204_cpu_info); + else /* DIAG204_INFO_EXT */ + return sizeof(struct diag204_x_cpu_info); } static inline __u8 cpu_info__ctidx(enum diag204_format type, void *hdr) { - if (type == INFO_SIMPLE) - return ((struct cpu_info *)hdr)->ctidx; - else /* INFO_EXT */ - return ((struct x_cpu_info *)hdr)->ctidx; + if (type == DIAG204_INFO_SIMPLE) + return ((struct diag204_cpu_info *)hdr)->ctidx; + else /* DIAG204_INFO_EXT */ + return ((struct diag204_x_cpu_info *)hdr)->ctidx; } static inline __u16 cpu_info__cpu_addr(enum diag204_format type, void *hdr) { - if (type == INFO_SIMPLE) - return ((struct cpu_info *)hdr)->cpu_addr; - else /* INFO_EXT */ - return ((struct x_cpu_info *)hdr)->cpu_addr; + if (type == DIAG204_INFO_SIMPLE) + return ((struct diag204_cpu_info *)hdr)->cpu_addr; + else /* DIAG204_INFO_EXT */ + return ((struct diag204_x_cpu_info *)hdr)->cpu_addr; } static inline __u64 cpu_info__acc_time(enum diag204_format type, void *hdr) { - if (type == INFO_SIMPLE) - return ((struct cpu_info *)hdr)->acc_time; - else /* INFO_EXT */ - return ((struct x_cpu_info *)hdr)->acc_time; + if (type == DIAG204_INFO_SIMPLE) + return ((struct diag204_cpu_info *)hdr)->acc_time; + else /* DIAG204_INFO_EXT */ + return ((struct diag204_x_cpu_info *)hdr)->acc_time; } static inline __u64 cpu_info__lp_time(enum diag204_format type, void *hdr) { - if (type == INFO_SIMPLE) - return ((struct cpu_info *)hdr)->lp_time; - else /* INFO_EXT */ - return ((struct x_cpu_info *)hdr)->lp_time; + if (type == DIAG204_INFO_SIMPLE) + return ((struct diag204_cpu_info *)hdr)->lp_time; + else /* DIAG204_INFO_EXT */ + return ((struct diag204_x_cpu_info *)hdr)->lp_time; } static inline __u64 cpu_info__online_time(enum diag204_format type, void *hdr) { - if (type == INFO_SIMPLE) + if (type == DIAG204_INFO_SIMPLE) return 0; /* online_time not available in simple info */ - else /* INFO_EXT */ - return ((struct x_cpu_info *)hdr)->online_time; + else /* DIAG204_INFO_EXT */ + return ((struct diag204_x_cpu_info *)hdr)->online_time; } /* Physical header */ -struct phys_hdr { - char reserved1[1]; - __u8 cpus; - char reserved2[6]; - char mgm_name[8]; -} __attribute__ ((packed)); - -struct x_phys_hdr { - char reserved1[1]; - __u8 cpus; - char reserved2[6]; - char mgm_name[8]; - char reserved3[80]; -} __attribute__ ((packed)); - static inline int phys_hdr__size(enum diag204_format type) { - if (type == INFO_SIMPLE) - return sizeof(struct phys_hdr); - else /* INFO_EXT */ - return sizeof(struct x_phys_hdr); + if (type == DIAG204_INFO_SIMPLE) + return sizeof(struct diag204_phys_hdr); + else /* DIAG204_INFO_EXT */ + return sizeof(struct diag204_x_phys_hdr); } static inline __u8 phys_hdr__cpus(enum diag204_format type, void *hdr) { - if (type == INFO_SIMPLE) - return ((struct phys_hdr *)hdr)->cpus; - else /* INFO_EXT */ - return ((struct x_phys_hdr *)hdr)->cpus; + if (type == DIAG204_INFO_SIMPLE) + return ((struct diag204_phys_hdr *)hdr)->cpus; + else /* DIAG204_INFO_EXT */ + return ((struct diag204_x_phys_hdr *)hdr)->cpus; } /* Physical CPU info block */ -struct phys_cpu { - __u16 cpu_addr; - char reserved1[2]; - __u8 ctidx; - char reserved2[3]; - __u64 mgm_time; - char reserved3[8]; -} __attribute__ ((packed)); - -struct x_phys_cpu { - __u16 cpu_addr; - char reserved1[2]; - __u8 ctidx; - char reserved2[3]; - __u64 mgm_time; - char reserved3[80]; -} __attribute__ ((packed)); - static inline int phys_cpu__size(enum diag204_format type) { - if (type == INFO_SIMPLE) - return sizeof(struct phys_cpu); - else /* INFO_EXT */ - return sizeof(struct x_phys_cpu); + if (type == DIAG204_INFO_SIMPLE) + return sizeof(struct diag204_phys_cpu); + else /* DIAG204_INFO_EXT */ + return sizeof(struct diag204_x_phys_cpu); } static inline __u16 phys_cpu__cpu_addr(enum diag204_format type, void *hdr) { - if (type == INFO_SIMPLE) - return ((struct phys_cpu *)hdr)->cpu_addr; - else /* INFO_EXT */ - return ((struct x_phys_cpu *)hdr)->cpu_addr; + if (type == DIAG204_INFO_SIMPLE) + return ((struct diag204_phys_cpu *)hdr)->cpu_addr; + else /* DIAG204_INFO_EXT */ + return ((struct diag204_x_phys_cpu *)hdr)->cpu_addr; } static inline __u64 phys_cpu__mgm_time(enum diag204_format type, void *hdr) { - if (type == INFO_SIMPLE) - return ((struct phys_cpu *)hdr)->mgm_time; - else /* INFO_EXT */ - return ((struct x_phys_cpu *)hdr)->mgm_time; + if (type == DIAG204_INFO_SIMPLE) + return ((struct diag204_phys_cpu *)hdr)->mgm_time; + else /* DIAG204_INFO_EXT */ + return ((struct diag204_x_phys_cpu *)hdr)->mgm_time; } static inline __u64 phys_cpu__ctidx(enum diag204_format type, void *hdr) { - if (type == INFO_SIMPLE) - return ((struct phys_cpu *)hdr)->ctidx; - else /* INFO_EXT */ - return ((struct x_phys_cpu *)hdr)->ctidx; + if (type == DIAG204_INFO_SIMPLE) + return ((struct diag204_phys_cpu *)hdr)->ctidx; + else /* DIAG204_INFO_EXT */ + return ((struct diag204_x_phys_cpu *)hdr)->ctidx; } /* Diagnose 204 functions */ - -static inline int __diag204(unsigned long subcode, unsigned long size, void *addr) -{ - register unsigned long _subcode asm("0") = subcode; - register unsigned long _size asm("1") = size; - - asm volatile( - " diag %2,%0,0x204\n" - "0:\n" - EX_TABLE(0b,0b) - : "+d" (_subcode), "+d" (_size) : "d" (addr) : "memory"); - if (_subcode) - return -1; - return _size; -} - -static int diag204(unsigned long subcode, unsigned long size, void *addr) -{ - diag_stat_inc(DIAG_STAT_X204); - return __diag204(subcode, size, addr); -} - /* * For the old diag subcode 4 with simple data format we have to use real * memory. If we use subcode 6 or 7 with extended data format, we can (and @@ -409,12 +261,12 @@ static void *diag204_get_buffer(enum diag204_format fmt, int *pages) *pages = diag204_buf_pages; return diag204_buf; } - if (fmt == INFO_SIMPLE) { + if (fmt == DIAG204_INFO_SIMPLE) { *pages = 1; return diag204_alloc_rbuf(); - } else {/* INFO_EXT */ - *pages = diag204((unsigned long)SUBC_RSI | - (unsigned long)INFO_EXT, 0, NULL); + } else {/* DIAG204_INFO_EXT */ + *pages = diag204((unsigned long)DIAG204_SUBC_RSI | + (unsigned long)DIAG204_INFO_EXT, 0, NULL); if (*pages <= 0) return ERR_PTR(-ENOSYS); else @@ -441,18 +293,18 @@ static int diag204_probe(void) void *buf; int pages, rc; - buf = diag204_get_buffer(INFO_EXT, &pages); + buf = diag204_get_buffer(DIAG204_INFO_EXT, &pages); if (!IS_ERR(buf)) { - if (diag204((unsigned long)SUBC_STIB7 | - (unsigned long)INFO_EXT, pages, buf) >= 0) { - diag204_store_sc = SUBC_STIB7; - diag204_info_type = INFO_EXT; + if (diag204((unsigned long)DIAG204_SUBC_STIB7 | + (unsigned long)DIAG204_INFO_EXT, pages, buf) >= 0) { + diag204_store_sc = DIAG204_SUBC_STIB7; + diag204_info_type = DIAG204_INFO_EXT; goto out; } - if (diag204((unsigned long)SUBC_STIB6 | - (unsigned long)INFO_EXT, pages, buf) >= 0) { - diag204_store_sc = SUBC_STIB6; - diag204_info_type = INFO_EXT; + if (diag204((unsigned long)DIAG204_SUBC_STIB6 | + (unsigned long)DIAG204_INFO_EXT, pages, buf) >= 0) { + diag204_store_sc = DIAG204_SUBC_STIB6; + diag204_info_type = DIAG204_INFO_EXT; goto out; } diag204_free_buffer(); @@ -460,15 +312,15 @@ static int diag204_probe(void) /* subcodes 6 and 7 failed, now try subcode 4 */ - buf = diag204_get_buffer(INFO_SIMPLE, &pages); + buf = diag204_get_buffer(DIAG204_INFO_SIMPLE, &pages); if (IS_ERR(buf)) { rc = PTR_ERR(buf); goto fail_alloc; } - if (diag204((unsigned long)SUBC_STIB4 | - (unsigned long)INFO_SIMPLE, pages, buf) >= 0) { - diag204_store_sc = SUBC_STIB4; - diag204_info_type = INFO_SIMPLE; + if (diag204((unsigned long)DIAG204_SUBC_STIB4 | + (unsigned long)DIAG204_INFO_SIMPLE, pages, buf) >= 0) { + diag204_store_sc = DIAG204_SUBC_STIB4; + diag204_info_type = DIAG204_INFO_SIMPLE; goto out; } else { rc = -ENOSYS; @@ -508,20 +360,6 @@ out: /* Diagnose 224 functions */ -static int diag224(void *ptr) -{ - int rc = -EOPNOTSUPP; - - diag_stat_inc(DIAG_STAT_X224); - asm volatile( - " diag %1,%2,0x224\n" - "0: lhi %0,0x0\n" - "1:\n" - EX_TABLE(0b,1b) - : "+d" (rc) :"d" (0), "d" (ptr) : "memory"); - return rc; -} - static int diag224_get_name_table(void) { /* memory must be below 2GB */ @@ -543,9 +381,9 @@ static void diag224_delete_name_table(void) static int diag224_idx2name(int index, char *name) { - memcpy(name, diag224_cpu_names + ((index + 1) * CPU_NAME_LEN), - CPU_NAME_LEN); - name[CPU_NAME_LEN] = 0; + memcpy(name, diag224_cpu_names + ((index + 1) * DIAG204_CPU_NAME_LEN), + DIAG204_CPU_NAME_LEN); + name[DIAG204_CPU_NAME_LEN] = 0; strim(name); return 0; } @@ -601,7 +439,7 @@ __init int hypfs_diag_init(void) pr_err("The hardware system does not support hypfs\n"); return -ENODATA; } - if (diag204_info_type == INFO_EXT) { + if (diag204_info_type == DIAG204_INFO_EXT) { rc = hypfs_dbfs_create_file(&dbfs_file_d204); if (rc) return rc; @@ -649,7 +487,7 @@ static int hypfs_create_cpu_files(struct dentry *cpus_dir, void *cpu_info) cpu_info__lp_time(diag204_info_type, cpu_info)); if (IS_ERR(rc)) return PTR_ERR(rc); - if (diag204_info_type == INFO_EXT) { + if (diag204_info_type == DIAG204_INFO_EXT) { rc = hypfs_create_u64(cpu_dir, "onlinetime", cpu_info__online_time(diag204_info_type, cpu_info)); @@ -665,12 +503,12 @@ static void *hypfs_create_lpar_files(struct dentry *systems_dir, void *part_hdr) { struct dentry *cpus_dir; struct dentry *lpar_dir; - char lpar_name[LPAR_NAME_LEN + 1]; + char lpar_name[DIAG204_LPAR_NAME_LEN + 1]; void *cpu_info; int i; part_hdr__part_name(diag204_info_type, part_hdr, lpar_name); - lpar_name[LPAR_NAME_LEN] = 0; + lpar_name[DIAG204_LPAR_NAME_LEN] = 0; lpar_dir = hypfs_mkdir(systems_dir, lpar_name); if (IS_ERR(lpar_dir)) return lpar_dir; @@ -753,7 +591,8 @@ int hypfs_diag_create_files(struct dentry *root) goto err_out; } } - if (info_blk_hdr__flags(diag204_info_type, time_hdr) & LPAR_PHYS_FLG) { + if (info_blk_hdr__flags(diag204_info_type, time_hdr) & + DIAG204_LPAR_PHYS_FLG) { ptr = hypfs_create_phys_files(root, part_hdr); if (IS_ERR(ptr)) { rc = PTR_ERR(ptr); diff --git a/arch/s390/hypfs/hypfs_vm.c b/arch/s390/hypfs/hypfs_vm.c index 44feac38ccfc..012919d9833b 100644 --- a/arch/s390/hypfs/hypfs_vm.c +++ b/arch/s390/hypfs/hypfs_vm.c @@ -70,7 +70,7 @@ static int diag2fc(int size, char* query, void *addr) diag_stat_inc(DIAG_STAT_X2FC); asm volatile( " diag %0,%1,0x2fc\n" - "0:\n" + "0: nopr %%r7\n" EX_TABLE(0b,0b) : "=d" (residual_cnt), "+d" (rc) : "0" (&parm_list) : "memory"); diff --git a/arch/s390/include/asm/atomic.h b/arch/s390/include/asm/atomic.h index 911064aa59b2..d28cc2f5b7b2 100644 --- a/arch/s390/include/asm/atomic.h +++ b/arch/s390/include/asm/atomic.h @@ -93,6 +93,11 @@ static inline int atomic_add_return(int i, atomic_t *v) return __ATOMIC_LOOP(v, i, __ATOMIC_ADD, __ATOMIC_BARRIER) + i; } +static inline int atomic_fetch_add(int i, atomic_t *v) +{ + return __ATOMIC_LOOP(v, i, __ATOMIC_ADD, __ATOMIC_BARRIER); +} + static inline void atomic_add(int i, atomic_t *v) { #ifdef CONFIG_HAVE_MARCH_Z196_FEATURES @@ -114,22 +119,27 @@ static inline void atomic_add(int i, atomic_t *v) #define atomic_inc_and_test(_v) (atomic_add_return(1, _v) == 0) #define atomic_sub(_i, _v) atomic_add(-(int)(_i), _v) #define atomic_sub_return(_i, _v) atomic_add_return(-(int)(_i), _v) +#define atomic_fetch_sub(_i, _v) atomic_fetch_add(-(int)(_i), _v) #define atomic_sub_and_test(_i, _v) (atomic_sub_return(_i, _v) == 0) #define atomic_dec(_v) atomic_sub(1, _v) #define atomic_dec_return(_v) atomic_sub_return(1, _v) #define atomic_dec_and_test(_v) (atomic_sub_return(1, _v) == 0) -#define ATOMIC_OP(op, OP) \ +#define ATOMIC_OPS(op, OP) \ static inline void atomic_##op(int i, atomic_t *v) \ { \ __ATOMIC_LOOP(v, i, __ATOMIC_##OP, __ATOMIC_NO_BARRIER); \ +} \ +static inline int atomic_fetch_##op(int i, atomic_t *v) \ +{ \ + return __ATOMIC_LOOP(v, i, __ATOMIC_##OP, __ATOMIC_BARRIER); \ } -ATOMIC_OP(and, AND) -ATOMIC_OP(or, OR) -ATOMIC_OP(xor, XOR) +ATOMIC_OPS(and, AND) +ATOMIC_OPS(or, OR) +ATOMIC_OPS(xor, XOR) -#undef ATOMIC_OP +#undef ATOMIC_OPS #define atomic_xchg(v, new) (xchg(&((v)->counter), new)) @@ -236,6 +246,11 @@ static inline long long atomic64_add_return(long long i, atomic64_t *v) return __ATOMIC64_LOOP(v, i, __ATOMIC64_ADD, __ATOMIC64_BARRIER) + i; } +static inline long long atomic64_fetch_add(long long i, atomic64_t *v) +{ + return __ATOMIC64_LOOP(v, i, __ATOMIC64_ADD, __ATOMIC64_BARRIER); +} + static inline void atomic64_add(long long i, atomic64_t *v) { #ifdef CONFIG_HAVE_MARCH_Z196_FEATURES @@ -264,17 +279,21 @@ static inline long long atomic64_cmpxchg(atomic64_t *v, return old; } -#define ATOMIC64_OP(op, OP) \ +#define ATOMIC64_OPS(op, OP) \ static inline void atomic64_##op(long i, atomic64_t *v) \ { \ __ATOMIC64_LOOP(v, i, __ATOMIC64_##OP, __ATOMIC64_NO_BARRIER); \ +} \ +static inline long atomic64_fetch_##op(long i, atomic64_t *v) \ +{ \ + return __ATOMIC64_LOOP(v, i, __ATOMIC64_##OP, __ATOMIC64_BARRIER); \ } -ATOMIC64_OP(and, AND) -ATOMIC64_OP(or, OR) -ATOMIC64_OP(xor, XOR) +ATOMIC64_OPS(and, AND) +ATOMIC64_OPS(or, OR) +ATOMIC64_OPS(xor, XOR) -#undef ATOMIC64_OP +#undef ATOMIC64_OPS #undef __ATOMIC64_LOOP static inline int atomic64_add_unless(atomic64_t *v, long long i, long long u) @@ -315,6 +334,7 @@ static inline long long atomic64_dec_if_positive(atomic64_t *v) #define atomic64_inc_return(_v) atomic64_add_return(1, _v) #define atomic64_inc_and_test(_v) (atomic64_add_return(1, _v) == 0) #define atomic64_sub_return(_i, _v) atomic64_add_return(-(long long)(_i), _v) +#define atomic64_fetch_sub(_i, _v) atomic64_fetch_add(-(long long)(_i), _v) #define atomic64_sub(_i, _v) atomic64_add(-(long long)(_i), _v) #define atomic64_sub_and_test(_i, _v) (atomic64_sub_return(_i, _v) == 0) #define atomic64_dec(_v) atomic64_sub(1, _v) diff --git a/arch/s390/include/asm/cache.h b/arch/s390/include/asm/cache.h index 22da3b34c655..05219a5e0b2f 100644 --- a/arch/s390/include/asm/cache.h +++ b/arch/s390/include/asm/cache.h @@ -13,9 +13,6 @@ #define L1_CACHE_SHIFT 8 #define NET_SKB_PAD 32 -#define __read_mostly __attribute__((__section__(".data..read_mostly"))) - -/* Read-only memory is marked before mark_rodata_ro() is called. */ -#define __ro_after_init __read_mostly +#define __read_mostly __section(.data..read_mostly) #endif diff --git a/arch/s390/include/asm/cio.h b/arch/s390/include/asm/cio.h index d1e7b0a0feeb..f7ed88cc066e 100644 --- a/arch/s390/include/asm/cio.h +++ b/arch/s390/include/asm/cio.h @@ -320,7 +320,7 @@ struct cio_iplinfo { extern int cio_get_iplinfo(struct cio_iplinfo *iplinfo); /* Function from drivers/s390/cio/chsc.c */ -int chsc_sstpc(void *page, unsigned int op, u16 ctrl); +int chsc_sstpc(void *page, unsigned int op, u16 ctrl, u64 *clock_delta); int chsc_sstpi(void *page, void *result, size_t size); #endif diff --git a/arch/s390/include/asm/cpacf.h b/arch/s390/include/asm/cpacf.h new file mode 100644 index 000000000000..2c680db7e5c1 --- /dev/null +++ b/arch/s390/include/asm/cpacf.h @@ -0,0 +1,400 @@ +/* + * CP Assist for Cryptographic Functions (CPACF) + * + * Copyright IBM Corp. 2003, 2016 + * Author(s): Thomas Spatzier + * Jan Glauber + * Harald Freudenberger (freude@de.ibm.com) + * Martin Schwidefsky <schwidefsky@de.ibm.com> + */ +#ifndef _ASM_S390_CPACF_H +#define _ASM_S390_CPACF_H + +#include <asm/facility.h> + +/* + * Instruction opcodes for the CPACF instructions + */ +#define CPACF_KMAC 0xb91e /* MSA */ +#define CPACF_KM 0xb92e /* MSA */ +#define CPACF_KMC 0xb92f /* MSA */ +#define CPACF_KIMD 0xb93e /* MSA */ +#define CPACF_KLMD 0xb93f /* MSA */ +#define CPACF_PCKMO 0xb928 /* MSA3 */ +#define CPACF_KMF 0xb92a /* MSA4 */ +#define CPACF_KMO 0xb92b /* MSA4 */ +#define CPACF_PCC 0xb92c /* MSA4 */ +#define CPACF_KMCTR 0xb92d /* MSA4 */ +#define CPACF_PPNO 0xb93c /* MSA5 */ + +/* + * Decryption modifier bit + */ +#define CPACF_DECRYPT 0x80 + +/* + * Function codes for the KM (CIPHER MESSAGE) instruction + */ +#define CPACF_KM_QUERY 0x00 +#define CPACF_KM_DEA 0x01 +#define CPACF_KM_TDEA_128 0x02 +#define CPACF_KM_TDEA_192 0x03 +#define CPACF_KM_AES_128 0x12 +#define CPACF_KM_AES_192 0x13 +#define CPACF_KM_AES_256 0x14 +#define CPACF_KM_XTS_128 0x32 +#define CPACF_KM_XTS_256 0x34 + +/* + * Function codes for the KMC (CIPHER MESSAGE WITH CHAINING) + * instruction + */ +#define CPACF_KMC_QUERY 0x00 +#define CPACF_KMC_DEA 0x01 +#define CPACF_KMC_TDEA_128 0x02 +#define CPACF_KMC_TDEA_192 0x03 +#define CPACF_KMC_AES_128 0x12 +#define CPACF_KMC_AES_192 0x13 +#define CPACF_KMC_AES_256 0x14 +#define CPACF_KMC_PRNG 0x43 + +/* + * Function codes for the KMCTR (CIPHER MESSAGE WITH COUNTER) + * instruction + */ +#define CPACF_KMCTR_QUERY 0x00 +#define CPACF_KMCTR_DEA 0x01 +#define CPACF_KMCTR_TDEA_128 0x02 +#define CPACF_KMCTR_TDEA_192 0x03 +#define CPACF_KMCTR_AES_128 0x12 +#define CPACF_KMCTR_AES_192 0x13 +#define CPACF_KMCTR_AES_256 0x14 + +/* + * Function codes for the KIMD (COMPUTE INTERMEDIATE MESSAGE DIGEST) + * instruction + */ +#define CPACF_KIMD_QUERY 0x00 +#define CPACF_KIMD_SHA_1 0x01 +#define CPACF_KIMD_SHA_256 0x02 +#define CPACF_KIMD_SHA_512 0x03 +#define CPACF_KIMD_GHASH 0x41 + +/* + * Function codes for the KLMD (COMPUTE LAST MESSAGE DIGEST) + * instruction + */ +#define CPACF_KLMD_QUERY 0x00 +#define CPACF_KLMD_SHA_1 0x01 +#define CPACF_KLMD_SHA_256 0x02 +#define CPACF_KLMD_SHA_512 0x03 + +/* + * function codes for the KMAC (COMPUTE MESSAGE AUTHENTICATION CODE) + * instruction + */ +#define CPACF_KMAC_QUERY 0x00 +#define CPACF_KMAC_DEA 0x01 +#define CPACF_KMAC_TDEA_128 0x02 +#define CPACF_KMAC_TDEA_192 0x03 + +/* + * Function codes for the PPNO (PERFORM PSEUDORANDOM NUMBER OPERATION) + * instruction + */ +#define CPACF_PPNO_QUERY 0x00 +#define CPACF_PPNO_SHA512_DRNG_GEN 0x03 +#define CPACF_PPNO_SHA512_DRNG_SEED 0x83 + +typedef struct { unsigned char bytes[16]; } cpacf_mask_t; + +/** + * cpacf_query() - check if a specific CPACF function is available + * @opcode: the opcode of the crypto instruction + * @func: the function code to test for + * + * Executes the query function for the given crypto instruction @opcode + * and checks if @func is available + * + * Returns 1 if @func is available for @opcode, 0 otherwise + */ +static inline void __cpacf_query(unsigned int opcode, cpacf_mask_t *mask) +{ + register unsigned long r0 asm("0") = 0; /* query function */ + register unsigned long r1 asm("1") = (unsigned long) mask; + + asm volatile( + " spm 0\n" /* pckmo doesn't change the cc */ + /* Parameter registers are ignored, but may not be 0 */ + "0: .insn rrf,%[opc] << 16,2,2,2,0\n" + " brc 1,0b\n" /* handle partial completion */ + : "=m" (*mask) + : [fc] "d" (r0), [pba] "a" (r1), [opc] "i" (opcode) + : "cc"); +} + +static inline int __cpacf_check_opcode(unsigned int opcode) +{ + switch (opcode) { + case CPACF_KMAC: + case CPACF_KM: + case CPACF_KMC: + case CPACF_KIMD: + case CPACF_KLMD: + return test_facility(17); /* check for MSA */ + case CPACF_PCKMO: + return test_facility(76); /* check for MSA3 */ + case CPACF_KMF: + case CPACF_KMO: + case CPACF_PCC: + case CPACF_KMCTR: + return test_facility(77); /* check for MSA4 */ + case CPACF_PPNO: + return test_facility(57); /* check for MSA5 */ + default: + BUG(); + } +} + +static inline int cpacf_query(unsigned int opcode, cpacf_mask_t *mask) +{ + if (__cpacf_check_opcode(opcode)) { + __cpacf_query(opcode, mask); + return 1; + } + memset(mask, 0, sizeof(*mask)); + return 0; +} + +static inline int cpacf_test_func(cpacf_mask_t *mask, unsigned int func) +{ + return (mask->bytes[func >> 3] & (0x80 >> (func & 7))) != 0; +} + +static inline int cpacf_query_func(unsigned int opcode, unsigned int func) +{ + cpacf_mask_t mask; + + if (cpacf_query(opcode, &mask)) + return cpacf_test_func(&mask, func); + return 0; +} + +/** + * cpacf_km() - executes the KM (CIPHER MESSAGE) instruction + * @func: the function code passed to KM; see CPACF_KM_xxx defines + * @param: address of parameter block; see POP for details on each func + * @dest: address of destination memory area + * @src: address of source memory area + * @src_len: length of src operand in bytes + * + * Returns 0 for the query func, number of processed bytes for + * encryption/decryption funcs + */ +static inline int cpacf_km(unsigned long func, void *param, + u8 *dest, const u8 *src, long src_len) +{ + register unsigned long r0 asm("0") = (unsigned long) func; + register unsigned long r1 asm("1") = (unsigned long) param; + register unsigned long r2 asm("2") = (unsigned long) src; + register unsigned long r3 asm("3") = (unsigned long) src_len; + register unsigned long r4 asm("4") = (unsigned long) dest; + + asm volatile( + "0: .insn rre,%[opc] << 16,%[dst],%[src]\n" + " brc 1,0b\n" /* handle partial completion */ + : [src] "+a" (r2), [len] "+d" (r3), [dst] "+a" (r4) + : [fc] "d" (r0), [pba] "a" (r1), [opc] "i" (CPACF_KM) + : "cc", "memory"); + + return src_len - r3; +} + +/** + * cpacf_kmc() - executes the KMC (CIPHER MESSAGE WITH CHAINING) instruction + * @func: the function code passed to KM; see CPACF_KMC_xxx defines + * @param: address of parameter block; see POP for details on each func + * @dest: address of destination memory area + * @src: address of source memory area + * @src_len: length of src operand in bytes + * + * Returns 0 for the query func, number of processed bytes for + * encryption/decryption funcs + */ +static inline int cpacf_kmc(unsigned long func, void *param, + u8 *dest, const u8 *src, long src_len) +{ + register unsigned long r0 asm("0") = (unsigned long) func; + register unsigned long r1 asm("1") = (unsigned long) param; + register unsigned long r2 asm("2") = (unsigned long) src; + register unsigned long r3 asm("3") = (unsigned long) src_len; + register unsigned long r4 asm("4") = (unsigned long) dest; + + asm volatile( + "0: .insn rre,%[opc] << 16,%[dst],%[src]\n" + " brc 1,0b\n" /* handle partial completion */ + : [src] "+a" (r2), [len] "+d" (r3), [dst] "+a" (r4) + : [fc] "d" (r0), [pba] "a" (r1), [opc] "i" (CPACF_KMC) + : "cc", "memory"); + + return src_len - r3; +} + +/** + * cpacf_kimd() - executes the KIMD (COMPUTE INTERMEDIATE MESSAGE DIGEST) + * instruction + * @func: the function code passed to KM; see CPACF_KIMD_xxx defines + * @param: address of parameter block; see POP for details on each func + * @src: address of source memory area + * @src_len: length of src operand in bytes + */ +static inline void cpacf_kimd(unsigned long func, void *param, + const u8 *src, long src_len) +{ + register unsigned long r0 asm("0") = (unsigned long) func; + register unsigned long r1 asm("1") = (unsigned long) param; + register unsigned long r2 asm("2") = (unsigned long) src; + register unsigned long r3 asm("3") = (unsigned long) src_len; + + asm volatile( + "0: .insn rre,%[opc] << 16,0,%[src]\n" + " brc 1,0b\n" /* handle partial completion */ + : [src] "+a" (r2), [len] "+d" (r3) + : [fc] "d" (r0), [pba] "a" (r1), [opc] "i" (CPACF_KIMD) + : "cc", "memory"); +} + +/** + * cpacf_klmd() - executes the KLMD (COMPUTE LAST MESSAGE DIGEST) instruction + * @func: the function code passed to KM; see CPACF_KLMD_xxx defines + * @param: address of parameter block; see POP for details on each func + * @src: address of source memory area + * @src_len: length of src operand in bytes + */ +static inline void cpacf_klmd(unsigned long func, void *param, + const u8 *src, long src_len) +{ + register unsigned long r0 asm("0") = (unsigned long) func; + register unsigned long r1 asm("1") = (unsigned long) param; + register unsigned long r2 asm("2") = (unsigned long) src; + register unsigned long r3 asm("3") = (unsigned long) src_len; + + asm volatile( + "0: .insn rre,%[opc] << 16,0,%[src]\n" + " brc 1,0b\n" /* handle partial completion */ + : [src] "+a" (r2), [len] "+d" (r3) + : [fc] "d" (r0), [pba] "a" (r1), [opc] "i" (CPACF_KLMD) + : "cc", "memory"); +} + +/** + * cpacf_kmac() - executes the KMAC (COMPUTE MESSAGE AUTHENTICATION CODE) + * instruction + * @func: the function code passed to KM; see CPACF_KMAC_xxx defines + * @param: address of parameter block; see POP for details on each func + * @src: address of source memory area + * @src_len: length of src operand in bytes + * + * Returns 0 for the query func, number of processed bytes for digest funcs + */ +static inline int cpacf_kmac(unsigned long func, void *param, + const u8 *src, long src_len) +{ + register unsigned long r0 asm("0") = (unsigned long) func; + register unsigned long r1 asm("1") = (unsigned long) param; + register unsigned long r2 asm("2") = (unsigned long) src; + register unsigned long r3 asm("3") = (unsigned long) src_len; + + asm volatile( + "0: .insn rre,%[opc] << 16,0,%[src]\n" + " brc 1,0b\n" /* handle partial completion */ + : [src] "+a" (r2), [len] "+d" (r3) + : [fc] "d" (r0), [pba] "a" (r1), [opc] "i" (CPACF_KMAC) + : "cc", "memory"); + + return src_len - r3; +} + +/** + * cpacf_kmctr() - executes the KMCTR (CIPHER MESSAGE WITH COUNTER) instruction + * @func: the function code passed to KMCTR; see CPACF_KMCTR_xxx defines + * @param: address of parameter block; see POP for details on each func + * @dest: address of destination memory area + * @src: address of source memory area + * @src_len: length of src operand in bytes + * @counter: address of counter value + * + * Returns 0 for the query func, number of processed bytes for + * encryption/decryption funcs + */ +static inline int cpacf_kmctr(unsigned long func, void *param, u8 *dest, + const u8 *src, long src_len, u8 *counter) +{ + register unsigned long r0 asm("0") = (unsigned long) func; + register unsigned long r1 asm("1") = (unsigned long) param; + register unsigned long r2 asm("2") = (unsigned long) src; + register unsigned long r3 asm("3") = (unsigned long) src_len; + register unsigned long r4 asm("4") = (unsigned long) dest; + register unsigned long r6 asm("6") = (unsigned long) counter; + + asm volatile( + "0: .insn rrf,%[opc] << 16,%[dst],%[src],%[ctr],0\n" + " brc 1,0b\n" /* handle partial completion */ + : [src] "+a" (r2), [len] "+d" (r3), + [dst] "+a" (r4), [ctr] "+a" (r6) + : [fc] "d" (r0), [pba] "a" (r1), [opc] "i" (CPACF_KMCTR) + : "cc", "memory"); + + return src_len - r3; +} + +/** + * cpacf_ppno() - executes the PPNO (PERFORM PSEUDORANDOM NUMBER OPERATION) + * instruction + * @func: the function code passed to PPNO; see CPACF_PPNO_xxx defines + * @param: address of parameter block; see POP for details on each func + * @dest: address of destination memory area + * @dest_len: size of destination memory area in bytes + * @seed: address of seed data + * @seed_len: size of seed data in bytes + */ +static inline void cpacf_ppno(unsigned long func, void *param, + u8 *dest, long dest_len, + const u8 *seed, long seed_len) +{ + register unsigned long r0 asm("0") = (unsigned long) func; + register unsigned long r1 asm("1") = (unsigned long) param; + register unsigned long r2 asm("2") = (unsigned long) dest; + register unsigned long r3 asm("3") = (unsigned long) dest_len; + register unsigned long r4 asm("4") = (unsigned long) seed; + register unsigned long r5 asm("5") = (unsigned long) seed_len; + + asm volatile ( + "0: .insn rre,%[opc] << 16,%[dst],%[seed]\n" + " brc 1,0b\n" /* handle partial completion */ + : [dst] "+a" (r2), [dlen] "+d" (r3) + : [fc] "d" (r0), [pba] "a" (r1), + [seed] "a" (r4), [slen] "d" (r5), [opc] "i" (CPACF_PPNO) + : "cc", "memory"); +} + +/** + * cpacf_pcc() - executes the PCC (PERFORM CRYPTOGRAPHIC COMPUTATION) + * instruction + * @func: the function code passed to PCC; see CPACF_KM_xxx defines + * @param: address of parameter block; see POP for details on each func + */ +static inline void cpacf_pcc(unsigned long func, void *param) +{ + register unsigned long r0 asm("0") = (unsigned long) func; + register unsigned long r1 asm("1") = (unsigned long) param; + + asm volatile( + "0: .insn rre,%[opc] << 16,0,0\n" /* PCC opcode */ + " brc 1,0b\n" /* handle partial completion */ + : + : [fc] "d" (r0), [pba] "a" (r1), [opc] "i" (CPACF_PCC) + : "cc", "memory"); +} + +#endif /* _ASM_S390_CPACF_H */ diff --git a/arch/s390/include/asm/cpu_mf.h b/arch/s390/include/asm/cpu_mf.h index 9dd04b9e9782..03516476127b 100644 --- a/arch/s390/include/asm/cpu_mf.h +++ b/arch/s390/include/asm/cpu_mf.h @@ -169,16 +169,27 @@ static inline int lcctl(u64 ctl) } /* Extract CPU counter */ -static inline int ecctr(u64 ctr, u64 *val) +static inline int __ecctr(u64 ctr, u64 *content) { - register u64 content asm("4") = 0; + register u64 _content asm("4") = 0; int cc; asm volatile ( " .insn rre,0xb2e40000,%0,%2\n" " ipm %1\n" " srl %1,28\n" - : "=d" (content), "=d" (cc) : "d" (ctr) : "cc"); + : "=d" (_content), "=d" (cc) : "d" (ctr) : "cc"); + *content = _content; + return cc; +} + +/* Extract CPU counter */ +static inline int ecctr(u64 ctr, u64 *val) +{ + u64 content; + int cc; + + cc = __ecctr(ctr, &content); if (!cc) *val = content; return cc; diff --git a/arch/s390/include/asm/diag.h b/arch/s390/include/asm/diag.h index 5fac921c1c42..8acf482162ed 100644 --- a/arch/s390/include/asm/diag.h +++ b/arch/s390/include/asm/diag.h @@ -49,7 +49,7 @@ static inline void diag10_range(unsigned long start_pfn, unsigned long num_pfn) diag_stat_inc(DIAG_STAT_X010); asm volatile( "0: diag %0,%1,0x10\n" - "1:\n" + "1: nopr %%r7\n" EX_TABLE(0b, 1b) EX_TABLE(1b, 1b) : : "a" (start_addr), "a" (end_addr)); @@ -78,4 +78,153 @@ struct diag210 { extern int diag210(struct diag210 *addr); +/* bit is set in flags, when physical cpu info is included in diag 204 data */ +#define DIAG204_LPAR_PHYS_FLG 0x80 +#define DIAG204_LPAR_NAME_LEN 8 /* lpar name len in diag 204 data */ +#define DIAG204_CPU_NAME_LEN 16 /* type name len of cpus in diag224 name table */ + +/* diag 204 subcodes */ +enum diag204_sc { + DIAG204_SUBC_STIB4 = 4, + DIAG204_SUBC_RSI = 5, + DIAG204_SUBC_STIB6 = 6, + DIAG204_SUBC_STIB7 = 7 +}; + +/* The two available diag 204 data formats */ +enum diag204_format { + DIAG204_INFO_SIMPLE = 0, + DIAG204_INFO_EXT = 0x00010000 +}; + +enum diag204_cpu_flags { + DIAG204_CPU_ONLINE = 0x20, + DIAG204_CPU_CAPPED = 0x40, +}; + +struct diag204_info_blk_hdr { + __u8 npar; + __u8 flags; + __u16 tslice; + __u16 phys_cpus; + __u16 this_part; + __u64 curtod; +} __packed; + +struct diag204_x_info_blk_hdr { + __u8 npar; + __u8 flags; + __u16 tslice; + __u16 phys_cpus; + __u16 this_part; + __u64 curtod1; + __u64 curtod2; + char reserved[40]; +} __packed; + +struct diag204_part_hdr { + __u8 pn; + __u8 cpus; + char reserved[6]; + char part_name[DIAG204_LPAR_NAME_LEN]; +} __packed; + +struct diag204_x_part_hdr { + __u8 pn; + __u8 cpus; + __u8 rcpus; + __u8 pflag; + __u32 mlu; + char part_name[DIAG204_LPAR_NAME_LEN]; + char lpc_name[8]; + char os_name[8]; + __u64 online_cs; + __u64 online_es; + __u8 upid; + __u8 reserved:3; + __u8 mtid:5; + char reserved1[2]; + __u32 group_mlu; + char group_name[8]; + char hardware_group_name[8]; + char reserved2[24]; +} __packed; + +struct diag204_cpu_info { + __u16 cpu_addr; + char reserved1[2]; + __u8 ctidx; + __u8 cflag; + __u16 weight; + __u64 acc_time; + __u64 lp_time; +} __packed; + +struct diag204_x_cpu_info { + __u16 cpu_addr; + char reserved1[2]; + __u8 ctidx; + __u8 cflag; + __u16 weight; + __u64 acc_time; + __u64 lp_time; + __u16 min_weight; + __u16 cur_weight; + __u16 max_weight; + char reseved2[2]; + __u64 online_time; + __u64 wait_time; + __u32 pma_weight; + __u32 polar_weight; + __u32 cpu_type_cap; + __u32 group_cpu_type_cap; + char reserved3[32]; +} __packed; + +struct diag204_phys_hdr { + char reserved1[1]; + __u8 cpus; + char reserved2[6]; + char mgm_name[8]; +} __packed; + +struct diag204_x_phys_hdr { + char reserved1[1]; + __u8 cpus; + char reserved2[6]; + char mgm_name[8]; + char reserved3[80]; +} __packed; + +struct diag204_phys_cpu { + __u16 cpu_addr; + char reserved1[2]; + __u8 ctidx; + char reserved2[3]; + __u64 mgm_time; + char reserved3[8]; +} __packed; + +struct diag204_x_phys_cpu { + __u16 cpu_addr; + char reserved1[2]; + __u8 ctidx; + char reserved2[1]; + __u16 weight; + __u64 mgm_time; + char reserved3[80]; +} __packed; + +struct diag204_x_part_block { + struct diag204_x_part_hdr hdr; + struct diag204_x_cpu_info cpus[]; +} __packed; + +struct diag204_x_phys_block { + struct diag204_x_phys_hdr hdr; + struct diag204_x_phys_cpu cpus[]; +} __packed; + +int diag204(unsigned long subcode, unsigned long size, void *addr); +int diag224(void *ptr); #endif /* _ASM_S390_DIAG_H */ diff --git a/arch/s390/include/asm/dma-mapping.h b/arch/s390/include/asm/dma-mapping.h index 3249b7464889..ffaba07f50ab 100644 --- a/arch/s390/include/asm/dma-mapping.h +++ b/arch/s390/include/asm/dma-mapping.h @@ -5,7 +5,6 @@ #include <linux/types.h> #include <linux/mm.h> #include <linux/scatterlist.h> -#include <linux/dma-attrs.h> #include <linux/dma-debug.h> #include <linux/io.h> diff --git a/arch/s390/include/asm/elf.h b/arch/s390/include/asm/elf.h index 563ab9f44874..1736c7d3c94c 100644 --- a/arch/s390/include/asm/elf.h +++ b/arch/s390/include/asm/elf.h @@ -225,6 +225,7 @@ do { \ #define MMAP_ALIGN_MASK (is_compat_task() ? 0 : 0x7fUL) #define STACK_RND_MASK MMAP_RND_MASK +/* update AT_VECTOR_SIZE_ARCH if the number of NEW_AUX_ENT entries changes */ #define ARCH_DLINFO \ do { \ if (vdso_enabled) \ diff --git a/arch/s390/include/asm/etr.h b/arch/s390/include/asm/etr.h deleted file mode 100644 index 105f90e63a0e..000000000000 --- a/arch/s390/include/asm/etr.h +++ /dev/null @@ -1,261 +0,0 @@ -/* - * Copyright IBM Corp. 2006 - * Author(s): Martin Schwidefsky (schwidefsky@de.ibm.com) - */ -#ifndef __S390_ETR_H -#define __S390_ETR_H - -/* ETR attachment control register */ -struct etr_eacr { - unsigned int e0 : 1; /* port 0 stepping control */ - unsigned int e1 : 1; /* port 1 stepping control */ - unsigned int _pad0 : 5; /* must be 00100 */ - unsigned int dp : 1; /* data port control */ - unsigned int p0 : 1; /* port 0 change recognition control */ - unsigned int p1 : 1; /* port 1 change recognition control */ - unsigned int _pad1 : 3; /* must be 000 */ - unsigned int ea : 1; /* ETR alert control */ - unsigned int es : 1; /* ETR sync check control */ - unsigned int sl : 1; /* switch to local control */ -} __attribute__ ((packed)); - -/* Port state returned by steai */ -enum etr_psc { - etr_psc_operational = 0, - etr_psc_semi_operational = 1, - etr_psc_protocol_error = 4, - etr_psc_no_symbols = 8, - etr_psc_no_signal = 12, - etr_psc_pps_mode = 13 -}; - -/* Logical port state returned by stetr */ -enum etr_lpsc { - etr_lpsc_operational_step = 0, - etr_lpsc_operational_alt = 1, - etr_lpsc_semi_operational = 2, - etr_lpsc_protocol_error = 4, - etr_lpsc_no_symbol_sync = 8, - etr_lpsc_no_signal = 12, - etr_lpsc_pps_mode = 13 -}; - -/* ETR status words */ -struct etr_esw { - struct etr_eacr eacr; /* attachment control register */ - unsigned int y : 1; /* stepping mode */ - unsigned int _pad0 : 5; /* must be 00000 */ - unsigned int p : 1; /* stepping port number */ - unsigned int q : 1; /* data port number */ - unsigned int psc0 : 4; /* port 0 state code */ - unsigned int psc1 : 4; /* port 1 state code */ -} __attribute__ ((packed)); - -/* Second level data register status word */ -struct etr_slsw { - unsigned int vv1 : 1; /* copy of validity bit data frame 1 */ - unsigned int vv2 : 1; /* copy of validity bit data frame 2 */ - unsigned int vv3 : 1; /* copy of validity bit data frame 3 */ - unsigned int vv4 : 1; /* copy of validity bit data frame 4 */ - unsigned int _pad0 : 19; /* must by all zeroes */ - unsigned int n : 1; /* EAF port number */ - unsigned int v1 : 1; /* validity bit ETR data frame 1 */ - unsigned int v2 : 1; /* validity bit ETR data frame 2 */ - unsigned int v3 : 1; /* validity bit ETR data frame 3 */ - unsigned int v4 : 1; /* validity bit ETR data frame 4 */ - unsigned int _pad1 : 4; /* must be 0000 */ -} __attribute__ ((packed)); - -/* ETR data frames */ -struct etr_edf1 { - unsigned int u : 1; /* untuned bit */ - unsigned int _pad0 : 1; /* must be 0 */ - unsigned int r : 1; /* service request bit */ - unsigned int _pad1 : 4; /* must be 0000 */ - unsigned int a : 1; /* time adjustment bit */ - unsigned int net_id : 8; /* ETR network id */ - unsigned int etr_id : 8; /* id of ETR which sends data frames */ - unsigned int etr_pn : 8; /* port number of ETR output port */ -} __attribute__ ((packed)); - -struct etr_edf2 { - unsigned int etv : 32; /* Upper 32 bits of TOD. */ -} __attribute__ ((packed)); - -struct etr_edf3 { - unsigned int rc : 8; /* failure reason code */ - unsigned int _pad0 : 3; /* must be 000 */ - unsigned int c : 1; /* ETR coupled bit */ - unsigned int tc : 4; /* ETR type code */ - unsigned int blto : 8; /* biased local time offset */ - /* (blto - 128) * 15 = minutes */ - unsigned int buo : 8; /* biased utc offset */ - /* (buo - 128) = leap seconds */ -} __attribute__ ((packed)); - -struct etr_edf4 { - unsigned int ed : 8; /* ETS device dependent data */ - unsigned int _pad0 : 1; /* must be 0 */ - unsigned int buc : 5; /* biased ut1 correction */ - /* (buc - 16) * 0.1 seconds */ - unsigned int em : 6; /* ETS error magnitude */ - unsigned int dc : 6; /* ETS drift code */ - unsigned int sc : 6; /* ETS steering code */ -} __attribute__ ((packed)); - -/* - * ETR attachment information block, two formats - * format 1 has 4 reserved words with a size of 64 bytes - * format 2 has 16 reserved words with a size of 96 bytes - */ -struct etr_aib { - struct etr_esw esw; - struct etr_slsw slsw; - unsigned long long tsp; - struct etr_edf1 edf1; - struct etr_edf2 edf2; - struct etr_edf3 edf3; - struct etr_edf4 edf4; - unsigned int reserved[16]; -} __attribute__ ((packed,aligned(8))); - -/* ETR interruption parameter */ -struct etr_irq_parm { - unsigned int _pad0 : 8; - unsigned int pc0 : 1; /* port 0 state change */ - unsigned int pc1 : 1; /* port 1 state change */ - unsigned int _pad1 : 3; - unsigned int eai : 1; /* ETR alert indication */ - unsigned int _pad2 : 18; -} __attribute__ ((packed)); - -/* Query TOD offset result */ -struct etr_ptff_qto { - unsigned long long physical_clock; - unsigned long long tod_offset; - unsigned long long logical_tod_offset; - unsigned long long tod_epoch_difference; -} __attribute__ ((packed)); - -/* Inline assembly helper functions */ -static inline int etr_setr(struct etr_eacr *ctrl) -{ - int rc = -EOPNOTSUPP; - - asm volatile( - " .insn s,0xb2160000,%1\n" - "0: la %0,0\n" - "1:\n" - EX_TABLE(0b,1b) - : "+d" (rc) : "Q" (*ctrl)); - return rc; -} - -/* Stores a format 1 aib with 64 bytes */ -static inline int etr_stetr(struct etr_aib *aib) -{ - int rc = -EOPNOTSUPP; - - asm volatile( - " .insn s,0xb2170000,%1\n" - "0: la %0,0\n" - "1:\n" - EX_TABLE(0b,1b) - : "+d" (rc) : "Q" (*aib)); - return rc; -} - -/* Stores a format 2 aib with 96 bytes for specified port */ -static inline int etr_steai(struct etr_aib *aib, unsigned int func) -{ - register unsigned int reg0 asm("0") = func; - int rc = -EOPNOTSUPP; - - asm volatile( - " .insn s,0xb2b30000,%1\n" - "0: la %0,0\n" - "1:\n" - EX_TABLE(0b,1b) - : "+d" (rc) : "Q" (*aib), "d" (reg0)); - return rc; -} - -/* Function codes for the steai instruction. */ -#define ETR_STEAI_STEPPING_PORT 0x10 -#define ETR_STEAI_ALTERNATE_PORT 0x11 -#define ETR_STEAI_PORT_0 0x12 -#define ETR_STEAI_PORT_1 0x13 - -static inline int etr_ptff(void *ptff_block, unsigned int func) -{ - register unsigned int reg0 asm("0") = func; - register unsigned long reg1 asm("1") = (unsigned long) ptff_block; - int rc = -EOPNOTSUPP; - - asm volatile( - " .word 0x0104\n" - " ipm %0\n" - " srl %0,28\n" - : "=d" (rc), "=m" (ptff_block) - : "d" (reg0), "d" (reg1), "m" (ptff_block) : "cc"); - return rc; -} - -/* Function codes for the ptff instruction. */ -#define ETR_PTFF_QAF 0x00 /* query available functions */ -#define ETR_PTFF_QTO 0x01 /* query tod offset */ -#define ETR_PTFF_QSI 0x02 /* query steering information */ -#define ETR_PTFF_ATO 0x40 /* adjust tod offset */ -#define ETR_PTFF_STO 0x41 /* set tod offset */ -#define ETR_PTFF_SFS 0x42 /* set fine steering rate */ -#define ETR_PTFF_SGS 0x43 /* set gross steering rate */ - -/* Functions needed by the machine check handler */ -int etr_switch_to_local(void); -int etr_sync_check(void); -void etr_queue_work(void); - -/* notifier for syncs */ -extern struct atomic_notifier_head s390_epoch_delta_notifier; - -/* STP interruption parameter */ -struct stp_irq_parm { - unsigned int _pad0 : 14; - unsigned int tsc : 1; /* Timing status change */ - unsigned int lac : 1; /* Link availability change */ - unsigned int tcpc : 1; /* Time control parameter change */ - unsigned int _pad2 : 15; -} __attribute__ ((packed)); - -#define STP_OP_SYNC 1 -#define STP_OP_CTRL 3 - -struct stp_sstpi { - unsigned int rsvd0; - unsigned int rsvd1 : 8; - unsigned int stratum : 8; - unsigned int vbits : 16; - unsigned int leaps : 16; - unsigned int tmd : 4; - unsigned int ctn : 4; - unsigned int rsvd2 : 3; - unsigned int c : 1; - unsigned int tst : 4; - unsigned int tzo : 16; - unsigned int dsto : 16; - unsigned int ctrl : 16; - unsigned int rsvd3 : 16; - unsigned int tto; - unsigned int rsvd4; - unsigned int ctnid[3]; - unsigned int rsvd5; - unsigned int todoff[4]; - unsigned int rsvd6[48]; -} __attribute__ ((packed)); - -/* Functions needed by the machine check handler */ -int stp_sync_check(void); -int stp_island_check(void); -void stp_queue_work(void); - -#endif /* __S390_ETR_H */ diff --git a/arch/s390/include/asm/facilities_src.h b/arch/s390/include/asm/facilities_src.h index 4917728e5828..3b758f66e48b 100644 --- a/arch/s390/include/asm/facilities_src.h +++ b/arch/s390/include/asm/facilities_src.h @@ -55,4 +55,28 @@ static struct facility_def facility_defs[] = { -1 /* END */ } }, + { + .name = "FACILITIES_KVM", + .bits = (int[]){ + 0, /* N3 instructions */ + 1, /* z/Arch mode installed */ + 2, /* z/Arch mode active */ + 3, /* DAT-enhancement */ + 4, /* idte segment table */ + 5, /* idte region table */ + 6, /* ASN-and-LX reuse */ + 7, /* stfle */ + 8, /* enhanced-DAT 1 */ + 9, /* sense-running-status */ + 10, /* conditional sske */ + 13, /* ipte-range */ + 14, /* nonquiescing key-setting */ + 73, /* transactional execution */ + 75, /* access-exception-fetch/store indication */ + 76, /* msa extension 3 */ + 77, /* msa extension 4 */ + 78, /* enhanced-DAT 2 */ + -1 /* END */ + } + }, }; diff --git a/arch/s390/include/asm/fcx.h b/arch/s390/include/asm/fcx.h index 7ecb92b469b6..04cb4b4bcc5f 100644 --- a/arch/s390/include/asm/fcx.h +++ b/arch/s390/include/asm/fcx.h @@ -6,7 +6,7 @@ */ #ifndef _ASM_S390_FCX_H -#define _ASM_S390_FCX_H _ASM_S390_FCX_H +#define _ASM_S390_FCX_H #include <linux/types.h> diff --git a/arch/s390/include/asm/fpu/api.h b/arch/s390/include/asm/fpu/api.h index 5e04f3cbd320..02124d66bfb5 100644 --- a/arch/s390/include/asm/fpu/api.h +++ b/arch/s390/include/asm/fpu/api.h @@ -1,6 +1,41 @@ /* * In-kernel FPU support functions * + * + * Consider these guidelines before using in-kernel FPU functions: + * + * 1. Use kernel_fpu_begin() and kernel_fpu_end() to enclose all in-kernel + * use of floating-point or vector registers and instructions. + * + * 2. For kernel_fpu_begin(), specify the vector register range you want to + * use with the KERNEL_VXR_* constants. Consider these usage guidelines: + * + * a) If your function typically runs in process-context, use the lower + * half of the vector registers, for example, specify KERNEL_VXR_LOW. + * b) If your function typically runs in soft-irq or hard-irq context, + * prefer using the upper half of the vector registers, for example, + * specify KERNEL_VXR_HIGH. + * + * If you adhere to these guidelines, an interrupted process context + * does not require to save and restore vector registers because of + * disjoint register ranges. + * + * Also note that the __kernel_fpu_begin()/__kernel_fpu_end() functions + * includes logic to save and restore up to 16 vector registers at once. + * + * 3. You can nest kernel_fpu_begin()/kernel_fpu_end() by using different + * struct kernel_fpu states. Vector registers that are in use by outer + * levels are saved and restored. You can minimize the save and restore + * effort by choosing disjoint vector register ranges. + * + * 5. To use vector floating-point instructions, specify the KERNEL_FPC + * flag to save and restore floating-point controls in addition to any + * vector register range. + * + * 6. To use floating-point registers and instructions only, specify the + * KERNEL_FPR flag. This flag triggers a save and restore of vector + * registers V0 to V15 and floating-point controls. + * * Copyright IBM Corp. 2015 * Author(s): Hendrik Brueckner <brueckner@linux.vnet.ibm.com> */ @@ -8,6 +43,8 @@ #ifndef _ASM_S390_FPU_API_H #define _ASM_S390_FPU_API_H +#include <linux/preempt.h> + void save_fpu_regs(void); static inline int test_fp_ctl(u32 fpc) @@ -22,9 +59,57 @@ static inline int test_fp_ctl(u32 fpc) " la %0,0\n" "1:\n" EX_TABLE(0b,1b) - : "=d" (rc), "=d" (orig_fpc) + : "=d" (rc), "=&d" (orig_fpc) : "d" (fpc), "0" (-EINVAL)); return rc; } +#define KERNEL_FPC 1 +#define KERNEL_VXR_V0V7 2 +#define KERNEL_VXR_V8V15 4 +#define KERNEL_VXR_V16V23 8 +#define KERNEL_VXR_V24V31 16 + +#define KERNEL_VXR_LOW (KERNEL_VXR_V0V7|KERNEL_VXR_V8V15) +#define KERNEL_VXR_MID (KERNEL_VXR_V8V15|KERNEL_VXR_V16V23) +#define KERNEL_VXR_HIGH (KERNEL_VXR_V16V23|KERNEL_VXR_V24V31) + +#define KERNEL_VXR (KERNEL_VXR_LOW|KERNEL_VXR_HIGH) +#define KERNEL_FPR (KERNEL_FPC|KERNEL_VXR_V0V7) + +struct kernel_fpu; + +/* + * Note the functions below must be called with preemption disabled. + * Do not enable preemption before calling __kernel_fpu_end() to prevent + * an corruption of an existing kernel FPU state. + * + * Prefer using the kernel_fpu_begin()/kernel_fpu_end() pair of functions. + */ +void __kernel_fpu_begin(struct kernel_fpu *state, u32 flags); +void __kernel_fpu_end(struct kernel_fpu *state, u32 flags); + + +static inline void kernel_fpu_begin(struct kernel_fpu *state, u32 flags) +{ + preempt_disable(); + state->mask = S390_lowcore.fpu_flags; + if (!test_cpu_flag(CIF_FPU)) + /* Save user space FPU state and register contents */ + save_fpu_regs(); + else if (state->mask & flags) + /* Save FPU/vector register in-use by the kernel */ + __kernel_fpu_begin(state, flags); + S390_lowcore.fpu_flags |= flags; +} + +static inline void kernel_fpu_end(struct kernel_fpu *state, u32 flags) +{ + S390_lowcore.fpu_flags = state->mask; + if (state->mask & flags) + /* Restore FPU/vector register in-use by the kernel */ + __kernel_fpu_end(state, flags); + preempt_enable(); +} + #endif /* _ASM_S390_FPU_API_H */ diff --git a/arch/s390/include/asm/fpu/types.h b/arch/s390/include/asm/fpu/types.h index 14a8b0c14f87..bce255ead72b 100644 --- a/arch/s390/include/asm/fpu/types.h +++ b/arch/s390/include/asm/fpu/types.h @@ -11,15 +11,27 @@ #include <asm/sigcontext.h> struct fpu { - __u32 fpc; /* Floating-point control */ + __u32 fpc; /* Floating-point control */ + void *regs; /* Pointer to the current save area */ union { - void *regs; - freg_t *fprs; /* Floating-point register save area */ - __vector128 *vxrs; /* Vector register save area */ + /* Floating-point register save area */ + freg_t fprs[__NUM_FPRS]; + /* Vector register save area */ + __vector128 vxrs[__NUM_VXRS]; }; }; /* VX array structure for address operand constraints in inline assemblies */ struct vx_array { __vector128 _[__NUM_VXRS]; }; +/* In-kernel FPU state structure */ +struct kernel_fpu { + u32 mask; + u32 fpc; + union { + freg_t fprs[__NUM_FPRS]; + __vector128 vxrs[__NUM_VXRS]; + }; +}; + #endif /* _ASM_S390_FPU_TYPES_H */ diff --git a/arch/s390/include/asm/ftrace.h b/arch/s390/include/asm/ftrace.h index 836c56290499..64053d9ac3f2 100644 --- a/arch/s390/include/asm/ftrace.h +++ b/arch/s390/include/asm/ftrace.h @@ -12,7 +12,9 @@ #ifndef __ASSEMBLY__ -#define ftrace_return_address(n) __builtin_return_address(n) +unsigned long return_address(int depth); + +#define ftrace_return_address(n) return_address(n) void _mcount(void); void ftrace_caller(void); diff --git a/arch/s390/include/asm/gmap.h b/arch/s390/include/asm/gmap.h index d054c1b07a3c..741ddba0bf11 100644 --- a/arch/s390/include/asm/gmap.h +++ b/arch/s390/include/asm/gmap.h @@ -10,14 +10,25 @@ /** * struct gmap_struct - guest address space + * @list: list head for the mm->context gmap list * @crst_list: list of all crst tables used in the guest address space * @mm: pointer to the parent mm_struct * @guest_to_host: radix tree with guest to host address translation * @host_to_guest: radix tree with pointer to segment table entries * @guest_table_lock: spinlock to protect all entries in the guest page table + * @ref_count: reference counter for the gmap structure * @table: pointer to the page directory * @asce: address space control element for gmap page table * @pfault_enabled: defines if pfaults are applicable for the guest + * @host_to_rmap: radix tree with gmap_rmap lists + * @children: list of shadow gmap structures + * @pt_list: list of all page tables used in the shadow guest address space + * @shadow_lock: spinlock to protect the shadow gmap list + * @parent: pointer to the parent gmap for shadow guest address spaces + * @orig_asce: ASCE for which the shadow page table has been created + * @edat_level: edat level to be used for the shadow translation + * @removed: flag to indicate if a shadow guest address space has been removed + * @initialized: flag to indicate if a shadow guest address space can be used */ struct gmap { struct list_head list; @@ -26,26 +37,64 @@ struct gmap { struct radix_tree_root guest_to_host; struct radix_tree_root host_to_guest; spinlock_t guest_table_lock; + atomic_t ref_count; unsigned long *table; unsigned long asce; unsigned long asce_end; void *private; bool pfault_enabled; + /* Additional data for shadow guest address spaces */ + struct radix_tree_root host_to_rmap; + struct list_head children; + struct list_head pt_list; + spinlock_t shadow_lock; + struct gmap *parent; + unsigned long orig_asce; + int edat_level; + bool removed; + bool initialized; }; /** + * struct gmap_rmap - reverse mapping for shadow page table entries + * @next: pointer to next rmap in the list + * @raddr: virtual rmap address in the shadow guest address space + */ +struct gmap_rmap { + struct gmap_rmap *next; + unsigned long raddr; +}; + +#define gmap_for_each_rmap(pos, head) \ + for (pos = (head); pos; pos = pos->next) + +#define gmap_for_each_rmap_safe(pos, n, head) \ + for (pos = (head); n = pos ? pos->next : NULL, pos; pos = n) + +/** * struct gmap_notifier - notify function block for page invalidation * @notifier_call: address of callback function */ struct gmap_notifier { struct list_head list; - void (*notifier_call)(struct gmap *gmap, unsigned long gaddr); + struct rcu_head rcu; + void (*notifier_call)(struct gmap *gmap, unsigned long start, + unsigned long end); }; -struct gmap *gmap_alloc(struct mm_struct *mm, unsigned long limit); -void gmap_free(struct gmap *gmap); +static inline int gmap_is_shadow(struct gmap *gmap) +{ + return !!gmap->parent; +} + +struct gmap *gmap_create(struct mm_struct *mm, unsigned long limit); +void gmap_remove(struct gmap *gmap); +struct gmap *gmap_get(struct gmap *gmap); +void gmap_put(struct gmap *gmap); + void gmap_enable(struct gmap *gmap); void gmap_disable(struct gmap *gmap); +struct gmap *gmap_get_enabled(void); int gmap_map_segment(struct gmap *gmap, unsigned long from, unsigned long to, unsigned long len); int gmap_unmap_segment(struct gmap *gmap, unsigned long to, unsigned long len); @@ -57,8 +106,29 @@ void gmap_discard(struct gmap *, unsigned long from, unsigned long to); void __gmap_zap(struct gmap *, unsigned long gaddr); void gmap_unlink(struct mm_struct *, unsigned long *table, unsigned long vmaddr); -void gmap_register_ipte_notifier(struct gmap_notifier *); -void gmap_unregister_ipte_notifier(struct gmap_notifier *); -int gmap_ipte_notify(struct gmap *, unsigned long start, unsigned long len); +int gmap_read_table(struct gmap *gmap, unsigned long gaddr, unsigned long *val); + +struct gmap *gmap_shadow(struct gmap *parent, unsigned long asce, + int edat_level); +int gmap_shadow_valid(struct gmap *sg, unsigned long asce, int edat_level); +int gmap_shadow_r2t(struct gmap *sg, unsigned long saddr, unsigned long r2t, + int fake); +int gmap_shadow_r3t(struct gmap *sg, unsigned long saddr, unsigned long r3t, + int fake); +int gmap_shadow_sgt(struct gmap *sg, unsigned long saddr, unsigned long sgt, + int fake); +int gmap_shadow_pgt(struct gmap *sg, unsigned long saddr, unsigned long pgt, + int fake); +int gmap_shadow_pgt_lookup(struct gmap *sg, unsigned long saddr, + unsigned long *pgt, int *dat_protection, int *fake); +int gmap_shadow_page(struct gmap *sg, unsigned long saddr, pte_t pte); + +void gmap_register_pte_notifier(struct gmap_notifier *); +void gmap_unregister_pte_notifier(struct gmap_notifier *); +void gmap_pte_notify(struct mm_struct *, unsigned long addr, pte_t *, + unsigned long bits); + +int gmap_mprotect_notify(struct gmap *, unsigned long start, + unsigned long len, int prot); #endif /* _ASM_S390_GMAP_H */ diff --git a/arch/s390/include/asm/hugetlb.h b/arch/s390/include/asm/hugetlb.h index d9be7c0c1291..4c7fac75090e 100644 --- a/arch/s390/include/asm/hugetlb.h +++ b/arch/s390/include/asm/hugetlb.h @@ -41,7 +41,10 @@ static inline int prepare_hugepage_range(struct file *file, static inline void huge_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { - pte_val(*ptep) = _SEGMENT_ENTRY_EMPTY; + if ((pte_val(*ptep) & _REGION_ENTRY_TYPE_MASK) == _REGION_ENTRY_TYPE_R3) + pte_val(*ptep) = _REGION3_ENTRY_EMPTY; + else + pte_val(*ptep) = _SEGMENT_ENTRY_EMPTY; } static inline void huge_ptep_clear_flush(struct vm_area_struct *vma, diff --git a/arch/s390/include/asm/ipl.h b/arch/s390/include/asm/ipl.h index 6fc44dca193e..4da22b2f0521 100644 --- a/arch/s390/include/asm/ipl.h +++ b/arch/s390/include/asm/ipl.h @@ -141,11 +141,11 @@ extern void setup_ipl(void); * DIAG 308 support */ enum diag308_subcode { - DIAG308_REL_HSA = 2, - DIAG308_IPL = 3, - DIAG308_DUMP = 4, - DIAG308_SET = 5, - DIAG308_STORE = 6, + DIAG308_REL_HSA = 2, + DIAG308_LOAD_CLEAR = 3, + DIAG308_LOAD_NORMAL_DUMP = 4, + DIAG308_SET = 5, + DIAG308_STORE = 6, }; enum diag308_ipl_type { diff --git a/arch/s390/include/asm/irq.h b/arch/s390/include/asm/irq.h index f97b055de76a..70c9bce766f5 100644 --- a/arch/s390/include/asm/irq.h +++ b/arch/s390/include/asm/irq.h @@ -7,11 +7,8 @@ #define NR_IRQS_BASE 3 -#ifdef CONFIG_PCI_NR_MSI -# define NR_IRQS (NR_IRQS_BASE + CONFIG_PCI_NR_MSI) -#else -# define NR_IRQS NR_IRQS_BASE -#endif +#define NR_IRQS NR_IRQS_BASE +#define NR_IRQS_LEGACY NR_IRQS_BASE /* External interruption codes */ #define EXT_IRQ_INTERRUPT_KEY 0x0040 diff --git a/arch/s390/include/asm/jump_label.h b/arch/s390/include/asm/jump_label.h index 7f9fd5e3f1bf..9be198f5ee79 100644 --- a/arch/s390/include/asm/jump_label.h +++ b/arch/s390/include/asm/jump_label.h @@ -4,6 +4,7 @@ #ifndef __ASSEMBLY__ #include <linux/types.h> +#include <linux/stringify.h> #define JUMP_LABEL_NOP_SIZE 6 #define JUMP_LABEL_NOP_OFFSET 2 diff --git a/arch/s390/include/asm/kprobes.h b/arch/s390/include/asm/kprobes.h index b47ad3b642cc..591e5a5279b0 100644 --- a/arch/s390/include/asm/kprobes.h +++ b/arch/s390/include/asm/kprobes.h @@ -43,9 +43,9 @@ typedef u16 kprobe_opcode_t; #define MAX_INSN_SIZE 0x0003 #define MAX_STACK_SIZE 64 #define MIN_STACK_SIZE(ADDR) (((MAX_STACK_SIZE) < \ - (((unsigned long)current_thread_info()) + THREAD_SIZE - (ADDR))) \ + (((unsigned long)task_stack_page(current)) + THREAD_SIZE - (ADDR))) \ ? (MAX_STACK_SIZE) \ - : (((unsigned long)current_thread_info()) + THREAD_SIZE - (ADDR))) + : (((unsigned long)task_stack_page(current)) + THREAD_SIZE - (ADDR))) #define kretprobe_blacklist_size 0 diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h index 6da41fab70fb..a41faf34b034 100644 --- a/arch/s390/include/asm/kvm_host.h +++ b/arch/s390/include/asm/kvm_host.h @@ -28,7 +28,7 @@ #define KVM_S390_BSCA_CPU_SLOTS 64 #define KVM_S390_ESCA_CPU_SLOTS 248 -#define KVM_MAX_VCPUS KVM_S390_ESCA_CPU_SLOTS +#define KVM_MAX_VCPUS 255 #define KVM_USER_MEM_SLOTS 32 /* @@ -38,11 +38,12 @@ */ #define KVM_NR_IRQCHIPS 1 #define KVM_IRQCHIP_NUM_PINS 4096 -#define KVM_HALT_POLL_NS_DEFAULT 0 +#define KVM_HALT_POLL_NS_DEFAULT 80000 /* s390-specific vcpu->requests bit members */ #define KVM_REQ_ENABLE_IBS 8 #define KVM_REQ_DISABLE_IBS 9 +#define KVM_REQ_ICPT_OPEREXC 10 #define SIGP_CTRL_C 0x80 #define SIGP_CTRL_SCN_MASK 0x3f @@ -145,7 +146,7 @@ struct kvm_s390_sie_block { __u64 cputm; /* 0x0028 */ __u64 ckc; /* 0x0030 */ __u64 epoch; /* 0x0038 */ - __u8 reserved40[4]; /* 0x0040 */ + __u32 svcc; /* 0x0040 */ #define LCTL_CR0 0x8000 #define LCTL_CR6 0x0200 #define LCTL_CR9 0x0040 @@ -154,6 +155,7 @@ struct kvm_s390_sie_block { #define LCTL_CR14 0x0002 __u16 lctl; /* 0x0044 */ __s16 icpua; /* 0x0046 */ +#define ICTL_OPEREXC 0x80000000 #define ICTL_PINT 0x20000000 #define ICTL_LPSW 0x00400000 #define ICTL_STCTL 0x00040000 @@ -166,6 +168,9 @@ struct kvm_s390_sie_block { #define ICPT_INST 0x04 #define ICPT_PROGI 0x08 #define ICPT_INSTPROGI 0x0C +#define ICPT_EXTINT 0x14 +#define ICPT_VALIDITY 0x20 +#define ICPT_STOP 0x28 #define ICPT_OPEREXC 0x2C #define ICPT_PARTEXEC 0x38 #define ICPT_IOINST 0x40 @@ -185,7 +190,9 @@ struct kvm_s390_sie_block { __u32 scaol; /* 0x0064 */ __u8 reserved68[4]; /* 0x0068 */ __u32 todpr; /* 0x006c */ - __u8 reserved70[32]; /* 0x0070 */ + __u8 reserved70[16]; /* 0x0070 */ + __u64 mso; /* 0x0080 */ + __u64 msl; /* 0x0088 */ psw_t gpsw; /* 0x0090 */ __u64 gg14; /* 0x00a0 */ __u64 gg15; /* 0x00a8 */ @@ -223,7 +230,7 @@ struct kvm_s390_sie_block { __u8 reserved1e6[2]; /* 0x01e6 */ __u64 itdba; /* 0x01e8 */ __u64 riccbd; /* 0x01f0 */ - __u8 reserved1f8[8]; /* 0x01f8 */ + __u64 gvrd; /* 0x01f8 */ } __attribute__((packed)); struct kvm_s390_itdb { @@ -238,67 +245,72 @@ struct sie_page { } __packed; struct kvm_vcpu_stat { - u32 exit_userspace; - u32 exit_null; - u32 exit_external_request; - u32 exit_external_interrupt; - u32 exit_stop_request; - u32 exit_validity; - u32 exit_instruction; - u32 halt_successful_poll; - u32 halt_attempted_poll; - u32 halt_wakeup; - u32 instruction_lctl; - u32 instruction_lctlg; - u32 instruction_stctl; - u32 instruction_stctg; - u32 exit_program_interruption; - u32 exit_instr_and_program; - u32 deliver_external_call; - u32 deliver_emergency_signal; - u32 deliver_service_signal; - u32 deliver_virtio_interrupt; - u32 deliver_stop_signal; - u32 deliver_prefix_signal; - u32 deliver_restart_signal; - u32 deliver_program_int; - u32 deliver_io_int; - u32 exit_wait_state; - u32 instruction_pfmf; - u32 instruction_stidp; - u32 instruction_spx; - u32 instruction_stpx; - u32 instruction_stap; - u32 instruction_storage_key; - u32 instruction_ipte_interlock; - u32 instruction_stsch; - u32 instruction_chsc; - u32 instruction_stsi; - u32 instruction_stfl; - u32 instruction_tprot; - u32 instruction_essa; - u32 instruction_sigp_sense; - u32 instruction_sigp_sense_running; - u32 instruction_sigp_external_call; - u32 instruction_sigp_emergency; - u32 instruction_sigp_cond_emergency; - u32 instruction_sigp_start; - u32 instruction_sigp_stop; - u32 instruction_sigp_stop_store_status; - u32 instruction_sigp_store_status; - u32 instruction_sigp_store_adtl_status; - u32 instruction_sigp_arch; - u32 instruction_sigp_prefix; - u32 instruction_sigp_restart; - u32 instruction_sigp_init_cpu_reset; - u32 instruction_sigp_cpu_reset; - u32 instruction_sigp_unknown; - u32 diagnose_10; - u32 diagnose_44; - u32 diagnose_9c; - u32 diagnose_258; - u32 diagnose_308; - u32 diagnose_500; + u64 exit_userspace; + u64 exit_null; + u64 exit_external_request; + u64 exit_external_interrupt; + u64 exit_stop_request; + u64 exit_validity; + u64 exit_instruction; + u64 exit_pei; + u64 halt_successful_poll; + u64 halt_attempted_poll; + u64 halt_poll_invalid; + u64 halt_wakeup; + u64 instruction_lctl; + u64 instruction_lctlg; + u64 instruction_stctl; + u64 instruction_stctg; + u64 exit_program_interruption; + u64 exit_instr_and_program; + u64 exit_operation_exception; + u64 deliver_external_call; + u64 deliver_emergency_signal; + u64 deliver_service_signal; + u64 deliver_virtio_interrupt; + u64 deliver_stop_signal; + u64 deliver_prefix_signal; + u64 deliver_restart_signal; + u64 deliver_program_int; + u64 deliver_io_int; + u64 exit_wait_state; + u64 instruction_pfmf; + u64 instruction_stidp; + u64 instruction_spx; + u64 instruction_stpx; + u64 instruction_stap; + u64 instruction_storage_key; + u64 instruction_ipte_interlock; + u64 instruction_stsch; + u64 instruction_chsc; + u64 instruction_stsi; + u64 instruction_stfl; + u64 instruction_tprot; + u64 instruction_sie; + u64 instruction_essa; + u64 instruction_sthyi; + u64 instruction_sigp_sense; + u64 instruction_sigp_sense_running; + u64 instruction_sigp_external_call; + u64 instruction_sigp_emergency; + u64 instruction_sigp_cond_emergency; + u64 instruction_sigp_start; + u64 instruction_sigp_stop; + u64 instruction_sigp_stop_store_status; + u64 instruction_sigp_store_status; + u64 instruction_sigp_store_adtl_status; + u64 instruction_sigp_arch; + u64 instruction_sigp_prefix; + u64 instruction_sigp_restart; + u64 instruction_sigp_init_cpu_reset; + u64 instruction_sigp_cpu_reset; + u64 instruction_sigp_unknown; + u64 diagnose_10; + u64 diagnose_44; + u64 diagnose_9c; + u64 diagnose_258; + u64 diagnose_308; + u64 diagnose_500; }; #define PGM_OPERATION 0x01 @@ -539,16 +551,16 @@ struct kvm_guestdbg_info_arch { struct kvm_vcpu_arch { struct kvm_s390_sie_block *sie_block; + /* if vsie is active, currently executed shadow sie control block */ + struct kvm_s390_sie_block *vsie_block; unsigned int host_acrs[NUM_ACRS]; struct fpu host_fpregs; struct kvm_s390_local_interrupt local_int; struct hrtimer ckc_timer; struct kvm_s390_pgm_info pgm; - union { - struct cpuid cpu_id; - u64 stidp_data; - }; struct gmap *gmap; + /* backup location for the currently enabled gmap when scheduled out */ + struct gmap *enabled_gmap; struct kvm_guestdbg_info_arch guestdbg; unsigned long pfault_token; unsigned long pfault_select; @@ -565,7 +577,7 @@ struct kvm_vcpu_arch { }; struct kvm_vm_stat { - u32 remote_tlb_flush; + ulong remote_tlb_flush; }; struct kvm_arch_memory_slot { @@ -605,7 +617,7 @@ struct kvm_s390_cpu_model { __u64 fac_mask[S390_ARCH_FAC_LIST_SIZE_U64]; /* facility list requested by guest (in dma page) */ __u64 *fac_list; - struct cpuid cpu_id; + u64 cpuid; unsigned short ibc; }; @@ -633,6 +645,14 @@ struct sie_page2 { u8 reserved900[0x1000 - 0x900]; /* 0x0900 */ } __packed; +struct kvm_s390_vsie { + struct mutex mutex; + struct radix_tree_root addr_to_page; + int page_count; + int next; + struct page *pages[KVM_MAX_VCPUS]; +}; + struct kvm_arch{ void *sca; int use_esca; @@ -648,15 +668,20 @@ struct kvm_arch{ int user_cpu_state_ctrl; int user_sigp; int user_stsi; + int user_instr0; struct s390_io_adapter *adapters[MAX_S390_IO_ADAPTERS]; wait_queue_head_t ipte_wq; int ipte_lock_count; struct mutex ipte_mutex; + struct ratelimit_state sthyi_limit; spinlock_t start_stop_lock; struct sie_page2 *sie_page2; struct kvm_s390_cpu_model model; struct kvm_s390_crypto crypto; + struct kvm_s390_vsie vsie; u64 epoch; + /* subset of available cpu features enabled by user space */ + DECLARE_BITMAP(cpu_feat, KVM_S390_VM_CPU_FEAT_NR_BITS); }; #define KVM_HVA_ERR_BAD (-1UL) @@ -700,4 +725,6 @@ static inline void kvm_arch_flush_shadow_memslot(struct kvm *kvm, static inline void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu) {} static inline void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu) {} +void kvm_arch_vcpu_block_finish(struct kvm_vcpu *vcpu); + #endif diff --git a/arch/s390/include/asm/livepatch.h b/arch/s390/include/asm/livepatch.h index d5427c78b1b3..2c1213785892 100644 --- a/arch/s390/include/asm/livepatch.h +++ b/arch/s390/include/asm/livepatch.h @@ -24,13 +24,6 @@ static inline int klp_check_compiler_support(void) return 0; } -static inline int klp_write_module_reloc(struct module *mod, unsigned long - type, unsigned long loc, unsigned long value) -{ - /* not supported yet */ - return -ENOSYS; -} - static inline void klp_arch_set_pc(struct pt_regs *regs, unsigned long ip) { regs->psw.addr = ip; diff --git a/arch/s390/include/asm/lowcore.h b/arch/s390/include/asm/lowcore.h index d79ba7cf75b0..7b93b78f423c 100644 --- a/arch/s390/include/asm/lowcore.h +++ b/arch/s390/include/asm/lowcore.h @@ -129,7 +129,8 @@ struct lowcore { __u8 pad_0x0390[0x0398-0x0390]; /* 0x0390 */ __u64 gmap; /* 0x0398 */ __u32 spinlock_lockval; /* 0x03a0 */ - __u8 pad_0x03a0[0x0400-0x03a4]; /* 0x03a4 */ + __u32 fpu_flags; /* 0x03a4 */ + __u8 pad_0x03a8[0x0400-0x03a8]; /* 0x03a8 */ /* Per cpu primary space access list */ __u32 paste[16]; /* 0x0400 */ diff --git a/arch/s390/include/asm/mathemu.h b/arch/s390/include/asm/mathemu.h deleted file mode 100644 index 614dfaf47f71..000000000000 --- a/arch/s390/include/asm/mathemu.h +++ /dev/null @@ -1,28 +0,0 @@ -/* - * IEEE floating point emulation. - * - * S390 version - * Copyright IBM Corp. 1999 - * Author(s): Martin Schwidefsky (schwidefsky@de.ibm.com) - */ - -#ifndef __MATHEMU__ -#define __MATHEMU__ - -extern int math_emu_b3(__u8 *, struct pt_regs *); -extern int math_emu_ed(__u8 *, struct pt_regs *); -extern int math_emu_ldr(__u8 *); -extern int math_emu_ler(__u8 *); -extern int math_emu_std(__u8 *, struct pt_regs *); -extern int math_emu_ld(__u8 *, struct pt_regs *); -extern int math_emu_ste(__u8 *, struct pt_regs *); -extern int math_emu_le(__u8 *, struct pt_regs *); -extern int math_emu_lfpc(__u8 *, struct pt_regs *); -extern int math_emu_stfpc(__u8 *, struct pt_regs *); -extern int math_emu_srnm(__u8 *, struct pt_regs *); - -#endif /* __MATHEMU__ */ - - - - diff --git a/arch/s390/include/asm/mmu.h b/arch/s390/include/asm/mmu.h index 081b2ad99d73..bea785d7f853 100644 --- a/arch/s390/include/asm/mmu.h +++ b/arch/s390/include/asm/mmu.h @@ -6,11 +6,13 @@ typedef struct { cpumask_t cpu_attach_mask; - atomic_t attach_count; + atomic_t flush_count; unsigned int flush_mm; - spinlock_t list_lock; + spinlock_t pgtable_lock; struct list_head pgtable_list; + spinlock_t gmap_lock; struct list_head gmap_list; + unsigned long gmap_asce; unsigned long asce; unsigned long asce_limit; unsigned long vdso_base; @@ -22,9 +24,11 @@ typedef struct { unsigned int use_skey:1; } mm_context_t; -#define INIT_MM_CONTEXT(name) \ - .context.list_lock = __SPIN_LOCK_UNLOCKED(name.context.list_lock), \ - .context.pgtable_list = LIST_HEAD_INIT(name.context.pgtable_list), \ +#define INIT_MM_CONTEXT(name) \ + .context.pgtable_lock = \ + __SPIN_LOCK_UNLOCKED(name.context.pgtable_lock), \ + .context.pgtable_list = LIST_HEAD_INIT(name.context.pgtable_list), \ + .context.gmap_lock = __SPIN_LOCK_UNLOCKED(name.context.gmap_lock), \ .context.gmap_list = LIST_HEAD_INIT(name.context.gmap_list), static inline int tprot(unsigned long addr) diff --git a/arch/s390/include/asm/mmu_context.h b/arch/s390/include/asm/mmu_context.h index c837b79b455d..515fea5a3fc4 100644 --- a/arch/s390/include/asm/mmu_context.h +++ b/arch/s390/include/asm/mmu_context.h @@ -15,11 +15,13 @@ static inline int init_new_context(struct task_struct *tsk, struct mm_struct *mm) { - spin_lock_init(&mm->context.list_lock); + spin_lock_init(&mm->context.pgtable_lock); INIT_LIST_HEAD(&mm->context.pgtable_list); + spin_lock_init(&mm->context.gmap_lock); INIT_LIST_HEAD(&mm->context.gmap_list); cpumask_clear(&mm->context.cpu_attach_mask); - atomic_set(&mm->context.attach_count, 0); + atomic_set(&mm->context.flush_count, 0); + mm->context.gmap_asce = 0; mm->context.flush_mm = 0; #ifdef CONFIG_PGSTE mm->context.alloc_pgste = page_table_allocate_pgste; @@ -90,15 +92,12 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next, S390_lowcore.user_asce = next->context.asce; if (prev == next) return; - if (MACHINE_HAS_TLB_LC) - cpumask_set_cpu(cpu, &next->context.cpu_attach_mask); + cpumask_set_cpu(cpu, &next->context.cpu_attach_mask); + cpumask_set_cpu(cpu, mm_cpumask(next)); /* Clear old ASCE by loading the kernel ASCE. */ __ctl_load(S390_lowcore.kernel_asce, 1, 1); __ctl_load(S390_lowcore.kernel_asce, 7, 7); - atomic_inc(&next->context.attach_count); - atomic_dec(&prev->context.attach_count); - if (MACHINE_HAS_TLB_LC) - cpumask_clear_cpu(cpu, &prev->context.cpu_attach_mask); + cpumask_clear_cpu(cpu, &prev->context.cpu_attach_mask); } #define finish_arch_post_lock_switch finish_arch_post_lock_switch @@ -110,10 +109,9 @@ static inline void finish_arch_post_lock_switch(void) load_kernel_asce(); if (mm) { preempt_disable(); - while (atomic_read(&mm->context.attach_count) >> 16) + while (atomic_read(&mm->context.flush_count)) cpu_relax(); - cpumask_set_cpu(smp_processor_id(), mm_cpumask(mm)); if (mm->context.flush_mm) __tlb_flush_mm(mm); preempt_enable(); @@ -128,7 +126,6 @@ static inline void activate_mm(struct mm_struct *prev, struct mm_struct *next) { switch_mm(prev, next, current); - cpumask_set_cpu(smp_processor_id(), mm_cpumask(next)); set_user_asce(next); } diff --git a/arch/s390/include/asm/page.h b/arch/s390/include/asm/page.h index 53eacbd4f09b..69b8a41fca84 100644 --- a/arch/s390/include/asm/page.h +++ b/arch/s390/include/asm/page.h @@ -21,6 +21,7 @@ #define HPAGE_SIZE (1UL << HPAGE_SHIFT) #define HPAGE_MASK (~(HPAGE_SIZE - 1)) #define HUGETLB_PAGE_ORDER (HPAGE_SHIFT - PAGE_SHIFT) +#define HUGE_MAX_HSTATE 2 #define ARCH_HAS_SETCLEAR_HUGE_PTE #define ARCH_HAS_HUGE_PTE_TYPE @@ -30,11 +31,12 @@ #include <asm/setup.h> #ifndef __ASSEMBLY__ +void __storage_key_init_range(unsigned long start, unsigned long end); + static inline void storage_key_init_range(unsigned long start, unsigned long end) { -#if PAGE_DEFAULT_KEY - __storage_key_init_range(start, end); -#endif + if (PAGE_DEFAULT_KEY) + __storage_key_init_range(start, end); } #define clear_page(page) memset((page), 0, PAGE_SIZE) @@ -109,13 +111,14 @@ static inline unsigned char page_get_storage_key(unsigned long addr) static inline int page_reset_referenced(unsigned long addr) { - unsigned int ipm; + int cc; asm volatile( " rrbe 0,%1\n" " ipm %0\n" - : "=d" (ipm) : "a" (addr) : "cc"); - return !!(ipm & 0x20000000); + " srl %0,28\n" + : "=d" (cc) : "a" (addr) : "cc"); + return cc; } /* Bits int the storage key */ @@ -146,6 +149,8 @@ static inline int devmem_is_allowed(unsigned long pfn) #define virt_to_page(kaddr) pfn_to_page(__pa(kaddr) >> PAGE_SHIFT) #define page_to_phys(page) (page_to_pfn(page) << PAGE_SHIFT) #define virt_addr_valid(kaddr) pfn_valid(__pa(kaddr) >> PAGE_SHIFT) +#define pfn_to_virt(pfn) __va((pfn) << PAGE_SHIFT) +#define page_to_virt(page) pfn_to_virt(page_to_pfn(page)) #define VM_DATA_DEFAULT_FLAGS (VM_READ | VM_WRITE | \ VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC) diff --git a/arch/s390/include/asm/pci.h b/arch/s390/include/asm/pci.h index 535a46d46d28..6611f798d2be 100644 --- a/arch/s390/include/asm/pci.h +++ b/arch/s390/include/asm/pci.h @@ -11,6 +11,7 @@ #include <asm-generic/pci.h> #include <asm/pci_clp.h> #include <asm/pci_debug.h> +#include <asm/sclp.h> #define PCIBIOS_MIN_IO 0x1000 #define PCIBIOS_MIN_MEM 0x10000000 @@ -31,20 +32,41 @@ int pci_proc_domain(struct pci_bus *); #define ZPCI_FC_BLOCKED 0x20 #define ZPCI_FC_DMA_ENABLED 0x10 +#define ZPCI_FMB_DMA_COUNTER_VALID (1 << 23) + +struct zpci_fmb_fmt0 { + u64 dma_rbytes; + u64 dma_wbytes; +}; + +struct zpci_fmb_fmt1 { + u64 rx_bytes; + u64 rx_packets; + u64 tx_bytes; + u64 tx_packets; +}; + +struct zpci_fmb_fmt2 { + u64 consumed_work_units; + u64 max_work_units; +}; + struct zpci_fmb { - u32 format : 8; - u32 dma_valid : 1; - u32 : 23; + u32 format : 8; + u32 fmt_ind : 24; u32 samples; u64 last_update; - /* hardware counters */ + /* common counters */ u64 ld_ops; u64 st_ops; u64 stb_ops; u64 rpcit_ops; - u64 dma_rbytes; - u64 dma_wbytes; - u64 pad[2]; + /* format specific counters */ + union { + struct zpci_fmb_fmt0 fmt0; + struct zpci_fmb_fmt1 fmt1; + struct zpci_fmb_fmt2 fmt2; + }; } __packed __aligned(128); enum zpci_state { @@ -96,6 +118,7 @@ struct zpci_dev { spinlock_t iommu_bitmap_lock; unsigned long *iommu_bitmap; + unsigned long *lazy_bitmap; unsigned long iommu_size; unsigned long iommu_pages; unsigned int next_bit; @@ -195,6 +218,9 @@ void zpci_debug_init_device(struct zpci_dev *, const char *); void zpci_debug_exit_device(struct zpci_dev *); void zpci_debug_info(struct zpci_dev *, struct seq_file *); +/* Error reporting */ +int zpci_report_error(struct pci_dev *, struct zpci_report_error_header *); + #ifdef CONFIG_NUMA /* Returns the node based on PCI bus */ diff --git a/arch/s390/include/asm/perf_event.h b/arch/s390/include/asm/perf_event.h index 1f7ff85c5e4c..c64c0befd3f3 100644 --- a/arch/s390/include/asm/perf_event.h +++ b/arch/s390/include/asm/perf_event.h @@ -86,16 +86,4 @@ struct sf_raw_sample { u8 padding[]; /* Padding to next multiple of 8 */ } __packed; -/* Perf hardware reserve and release functions */ -#ifdef CONFIG_PERF_EVENTS -int perf_reserve_sampling(void); -void perf_release_sampling(void); -#else /* CONFIG_PERF_EVENTS */ -static inline int perf_reserve_sampling(void) -{ - return 0; -} -static inline void perf_release_sampling(void) {} -#endif /* CONFIG_PERF_EVENTS */ - #endif /* _ASM_S390_PERF_EVENT_H */ diff --git a/arch/s390/include/asm/pgalloc.h b/arch/s390/include/asm/pgalloc.h index da34cb6b1f3b..f4eb9843eed4 100644 --- a/arch/s390/include/asm/pgalloc.h +++ b/arch/s390/include/asm/pgalloc.h @@ -19,8 +19,10 @@ unsigned long *crst_table_alloc(struct mm_struct *); void crst_table_free(struct mm_struct *, unsigned long *); unsigned long *page_table_alloc(struct mm_struct *); +struct page *page_table_alloc_pgste(struct mm_struct *mm); void page_table_free(struct mm_struct *, unsigned long *); void page_table_free_rcu(struct mmu_gather *, unsigned long *, unsigned long); +void page_table_free_pgste(struct page *page); extern int page_table_allocate_pgste; static inline void clear_table(unsigned long *s, unsigned long val, size_t n) diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h index 2f66645587a2..0362cd5fa187 100644 --- a/arch/s390/include/asm/pgtable.h +++ b/arch/s390/include/asm/pgtable.h @@ -28,12 +28,33 @@ #include <linux/mm_types.h> #include <linux/page-flags.h> #include <linux/radix-tree.h> +#include <linux/atomic.h> #include <asm/bug.h> #include <asm/page.h> -extern pgd_t swapper_pg_dir[] __attribute__ ((aligned (4096))); +extern pgd_t swapper_pg_dir[]; extern void paging_init(void); extern void vmem_map_init(void); +pmd_t *vmem_pmd_alloc(void); +pte_t *vmem_pte_alloc(void); + +enum { + PG_DIRECT_MAP_4K = 0, + PG_DIRECT_MAP_1M, + PG_DIRECT_MAP_2G, + PG_DIRECT_MAP_MAX +}; + +extern atomic_long_t direct_pages_count[PG_DIRECT_MAP_MAX]; + +static inline void update_page_count(int level, long count) +{ + if (IS_ENABLED(CONFIG_PROC_FS)) + atomic_long_add(count, &direct_pages_count[level]); +} + +struct seq_file; +void arch_report_meminfo(struct seq_file *m); /* * The S390 doesn't have any external MMU info: the kernel page @@ -221,8 +242,8 @@ static inline int is_module_addr(void *addr) * swap .11..ttttt.0 * prot-none, clean, old .11.xx0000.1 * prot-none, clean, young .11.xx0001.1 - * prot-none, dirty, old .10.xx0010.1 - * prot-none, dirty, young .10.xx0011.1 + * prot-none, dirty, old .11.xx0010.1 + * prot-none, dirty, young .11.xx0011.1 * read-only, clean, old .11.xx0100.1 * read-only, clean, young .01.xx0101.1 * read-only, dirty, old .11.xx0110.1 @@ -256,6 +277,7 @@ static inline int is_module_addr(void *addr) /* Bits in the region table entry */ #define _REGION_ENTRY_ORIGIN ~0xfffUL/* region/segment table origin */ #define _REGION_ENTRY_PROTECT 0x200 /* region protection bit */ +#define _REGION_ENTRY_OFFSET 0xc0 /* region table offset */ #define _REGION_ENTRY_INVALID 0x20 /* invalid region table entry */ #define _REGION_ENTRY_TYPE_MASK 0x0c /* region/segment table type mask */ #define _REGION_ENTRY_TYPE_R1 0x0c /* region first table type */ @@ -270,8 +292,23 @@ static inline int is_module_addr(void *addr) #define _REGION3_ENTRY (_REGION_ENTRY_TYPE_R3 | _REGION_ENTRY_LENGTH) #define _REGION3_ENTRY_EMPTY (_REGION_ENTRY_TYPE_R3 | _REGION_ENTRY_INVALID) -#define _REGION3_ENTRY_LARGE 0x400 /* RTTE-format control, large page */ -#define _REGION3_ENTRY_RO 0x200 /* page protection bit */ +#define _REGION3_ENTRY_ORIGIN_LARGE ~0x7fffffffUL /* large page address */ +#define _REGION3_ENTRY_ORIGIN ~0x7ffUL/* region third table origin */ + +#define _REGION3_ENTRY_DIRTY 0x2000 /* SW region dirty bit */ +#define _REGION3_ENTRY_YOUNG 0x1000 /* SW region young bit */ +#define _REGION3_ENTRY_LARGE 0x0400 /* RTTE-format control, large page */ +#define _REGION3_ENTRY_READ 0x0002 /* SW region read bit */ +#define _REGION3_ENTRY_WRITE 0x0001 /* SW region write bit */ + +#ifdef CONFIG_MEM_SOFT_DIRTY +#define _REGION3_ENTRY_SOFT_DIRTY 0x4000 /* SW region soft dirty bit */ +#else +#define _REGION3_ENTRY_SOFT_DIRTY 0x0000 /* SW region soft dirty bit */ +#endif + +#define _REGION_ENTRY_BITS 0xfffffffffffff227UL +#define _REGION_ENTRY_BITS_LARGE 0xffffffff8000fe27UL /* Bits in the segment table entry */ #define _SEGMENT_ENTRY_BITS 0xfffffffffffffe33UL @@ -287,8 +324,8 @@ static inline int is_module_addr(void *addr) #define _SEGMENT_ENTRY_DIRTY 0x2000 /* SW segment dirty bit */ #define _SEGMENT_ENTRY_YOUNG 0x1000 /* SW segment young bit */ #define _SEGMENT_ENTRY_LARGE 0x0400 /* STE-format control, large page */ -#define _SEGMENT_ENTRY_READ 0x0002 /* SW segment read bit */ -#define _SEGMENT_ENTRY_WRITE 0x0001 /* SW segment write bit */ +#define _SEGMENT_ENTRY_WRITE 0x0002 /* SW segment write bit */ +#define _SEGMENT_ENTRY_READ 0x0001 /* SW segment read bit */ #ifdef CONFIG_MEM_SOFT_DIRTY #define _SEGMENT_ENTRY_SOFT_DIRTY 0x4000 /* SW segment soft dirty bit */ @@ -297,16 +334,17 @@ static inline int is_module_addr(void *addr) #endif /* - * Segment table entry encoding (R = read-only, I = invalid, y = young bit): - * dy..R...I...rw + * Segment table and region3 table entry encoding + * (R = read-only, I = invalid, y = young bit): + * dy..R...I...wr * prot-none, clean, old 00..1...1...00 * prot-none, clean, young 01..1...1...00 * prot-none, dirty, old 10..1...1...00 * prot-none, dirty, young 11..1...1...00 - * read-only, clean, old 00..1...1...10 - * read-only, clean, young 01..1...0...10 - * read-only, dirty, old 10..1...1...10 - * read-only, dirty, young 11..1...0...10 + * read-only, clean, old 00..1...1...01 + * read-only, clean, young 01..1...0...01 + * read-only, dirty, old 10..1...1...01 + * read-only, dirty, young 11..1...0...01 * read-write, clean, old 00..1...1...11 * read-write, clean, young 01..1...0...11 * read-write, dirty, old 10..0...1...11 @@ -327,6 +365,7 @@ static inline int is_module_addr(void *addr) #define PGSTE_GC_BIT 0x0002000000000000UL #define PGSTE_UC_BIT 0x0000800000000000UL /* user dirty (migration) */ #define PGSTE_IN_BIT 0x0000400000000000UL /* IPTE notify bit */ +#define PGSTE_VSIE_BIT 0x0000200000000000UL /* ref'd in a shadow table */ /* Guest Page State used for virtualization */ #define _PGSTE_GPS_ZERO 0x0000000080000000UL @@ -345,7 +384,7 @@ static inline int is_module_addr(void *addr) /* * Page protection definitions. */ -#define PAGE_NONE __pgprot(_PAGE_PRESENT | _PAGE_INVALID) +#define PAGE_NONE __pgprot(_PAGE_PRESENT | _PAGE_INVALID | _PAGE_PROTECT) #define PAGE_READ __pgprot(_PAGE_PRESENT | _PAGE_READ | \ _PAGE_INVALID | _PAGE_PROTECT) #define PAGE_WRITE __pgprot(_PAGE_PRESENT | _PAGE_READ | _PAGE_WRITE | \ @@ -391,6 +430,33 @@ static inline int is_module_addr(void *addr) _SEGMENT_ENTRY_READ) #define SEGMENT_WRITE __pgprot(_SEGMENT_ENTRY_READ | \ _SEGMENT_ENTRY_WRITE) +#define SEGMENT_KERNEL __pgprot(_SEGMENT_ENTRY | \ + _SEGMENT_ENTRY_LARGE | \ + _SEGMENT_ENTRY_READ | \ + _SEGMENT_ENTRY_WRITE | \ + _SEGMENT_ENTRY_YOUNG | \ + _SEGMENT_ENTRY_DIRTY) +#define SEGMENT_KERNEL_RO __pgprot(_SEGMENT_ENTRY | \ + _SEGMENT_ENTRY_LARGE | \ + _SEGMENT_ENTRY_READ | \ + _SEGMENT_ENTRY_YOUNG | \ + _SEGMENT_ENTRY_PROTECT) + +/* + * Region3 entry (large page) protection definitions. + */ + +#define REGION3_KERNEL __pgprot(_REGION_ENTRY_TYPE_R3 | \ + _REGION3_ENTRY_LARGE | \ + _REGION3_ENTRY_READ | \ + _REGION3_ENTRY_WRITE | \ + _REGION3_ENTRY_YOUNG | \ + _REGION3_ENTRY_DIRTY) +#define REGION3_KERNEL_RO __pgprot(_REGION_ENTRY_TYPE_R3 | \ + _REGION3_ENTRY_LARGE | \ + _REGION3_ENTRY_READ | \ + _REGION3_ENTRY_YOUNG | \ + _REGION_ENTRY_PROTECT) static inline int mm_has_pgste(struct mm_struct *mm) { @@ -424,6 +490,53 @@ static inline int mm_use_skey(struct mm_struct *mm) return 0; } +static inline void csp(unsigned int *ptr, unsigned int old, unsigned int new) +{ + register unsigned long reg2 asm("2") = old; + register unsigned long reg3 asm("3") = new; + unsigned long address = (unsigned long)ptr | 1; + + asm volatile( + " csp %0,%3" + : "+d" (reg2), "+m" (*ptr) + : "d" (reg3), "d" (address) + : "cc"); +} + +static inline void cspg(unsigned long *ptr, unsigned long old, unsigned long new) +{ + register unsigned long reg2 asm("2") = old; + register unsigned long reg3 asm("3") = new; + unsigned long address = (unsigned long)ptr | 1; + + asm volatile( + " .insn rre,0xb98a0000,%0,%3" + : "+d" (reg2), "+m" (*ptr) + : "d" (reg3), "d" (address) + : "cc"); +} + +#define CRDTE_DTT_PAGE 0x00UL +#define CRDTE_DTT_SEGMENT 0x10UL +#define CRDTE_DTT_REGION3 0x14UL +#define CRDTE_DTT_REGION2 0x18UL +#define CRDTE_DTT_REGION1 0x1cUL + +static inline void crdte(unsigned long old, unsigned long new, + unsigned long table, unsigned long dtt, + unsigned long address, unsigned long asce) +{ + register unsigned long reg2 asm("2") = old; + register unsigned long reg3 asm("3") = new; + register unsigned long reg4 asm("4") = table | dtt; + register unsigned long reg5 asm("5") = address; + + asm volatile(".insn rrf,0xb98f0000,%0,%2,%4,0" + : "+d" (reg2) + : "d" (reg3), "d" (reg4), "d" (reg5), "a" (asce) + : "memory", "cc"); +} + /* * pgd/pmd/pte query functions */ @@ -465,7 +578,7 @@ static inline int pud_none(pud_t pud) { if ((pud_val(pud) & _REGION_ENTRY_TYPE_MASK) < _REGION_ENTRY_TYPE_R3) return 0; - return (pud_val(pud) & _REGION_ENTRY_INVALID) != 0UL; + return pud_val(pud) == _REGION3_ENTRY_EMPTY; } static inline int pud_large(pud_t pud) @@ -475,17 +588,35 @@ static inline int pud_large(pud_t pud) return !!(pud_val(pud) & _REGION3_ENTRY_LARGE); } +static inline unsigned long pud_pfn(pud_t pud) +{ + unsigned long origin_mask; + + origin_mask = _REGION3_ENTRY_ORIGIN; + if (pud_large(pud)) + origin_mask = _REGION3_ENTRY_ORIGIN_LARGE; + return (pud_val(pud) & origin_mask) >> PAGE_SHIFT; +} + +static inline int pmd_large(pmd_t pmd) +{ + return (pmd_val(pmd) & _SEGMENT_ENTRY_LARGE) != 0; +} + +static inline int pmd_bad(pmd_t pmd) +{ + if (pmd_large(pmd)) + return (pmd_val(pmd) & ~_SEGMENT_ENTRY_BITS_LARGE) != 0; + return (pmd_val(pmd) & ~_SEGMENT_ENTRY_BITS) != 0; +} + static inline int pud_bad(pud_t pud) { - /* - * With dynamic page table levels the pud can be a region table - * entry or a segment table entry. Check for the bit that are - * invalid for either table entry. - */ - unsigned long mask = - ~_SEGMENT_ENTRY_ORIGIN & ~_REGION_ENTRY_INVALID & - ~_REGION_ENTRY_TYPE_MASK & ~_REGION_ENTRY_LENGTH; - return (pud_val(pud) & mask) != 0; + if ((pud_val(pud) & _REGION_ENTRY_TYPE_MASK) < _REGION_ENTRY_TYPE_R3) + return pmd_bad(__pmd(pud_val(pud))); + if (pud_large(pud)) + return (pud_val(pud) & ~_REGION_ENTRY_BITS_LARGE) != 0; + return (pud_val(pud) & ~_REGION_ENTRY_BITS) != 0; } static inline int pmd_present(pmd_t pmd) @@ -498,11 +629,6 @@ static inline int pmd_none(pmd_t pmd) return pmd_val(pmd) == _SEGMENT_ENTRY_INVALID; } -static inline int pmd_large(pmd_t pmd) -{ - return (pmd_val(pmd) & _SEGMENT_ENTRY_LARGE) != 0; -} - static inline unsigned long pmd_pfn(pmd_t pmd) { unsigned long origin_mask; @@ -513,13 +639,6 @@ static inline unsigned long pmd_pfn(pmd_t pmd) return (pmd_val(pmd) & origin_mask) >> PAGE_SHIFT; } -static inline int pmd_bad(pmd_t pmd) -{ - if (pmd_large(pmd)) - return (pmd_val(pmd) & ~_SEGMENT_ENTRY_BITS_LARGE) != 0; - return (pmd_val(pmd) & ~_SEGMENT_ENTRY_BITS) != 0; -} - #define __HAVE_ARCH_PMD_WRITE static inline int pmd_write(pmd_t pmd) { @@ -755,35 +874,31 @@ static inline pte_t pte_mkhuge(pte_t pte) } #endif -static inline void __ptep_ipte(unsigned long address, pte_t *ptep) -{ - unsigned long pto = (unsigned long) ptep; - - /* Invalidation + global TLB flush for the pte */ - asm volatile( - " ipte %2,%3" - : "=m" (*ptep) : "m" (*ptep), "a" (pto), "a" (address)); -} +#define IPTE_GLOBAL 0 +#define IPTE_LOCAL 1 -static inline void __ptep_ipte_local(unsigned long address, pte_t *ptep) +static inline void __ptep_ipte(unsigned long address, pte_t *ptep, int local) { unsigned long pto = (unsigned long) ptep; - /* Invalidation + local TLB flush for the pte */ + /* Invalidation + TLB flush for the pte */ asm volatile( - " .insn rrf,0xb2210000,%2,%3,0,1" - : "=m" (*ptep) : "m" (*ptep), "a" (pto), "a" (address)); + " .insn rrf,0xb2210000,%[r1],%[r2],0,%[m4]" + : "+m" (*ptep) : [r1] "a" (pto), [r2] "a" (address), + [m4] "i" (local)); } -static inline void __ptep_ipte_range(unsigned long address, int nr, pte_t *ptep) +static inline void __ptep_ipte_range(unsigned long address, int nr, + pte_t *ptep, int local) { unsigned long pto = (unsigned long) ptep; - /* Invalidate a range of ptes + global TLB flush of the ptes */ + /* Invalidate a range of ptes + TLB flush of the ptes */ do { asm volatile( - " .insn rrf,0xb2210000,%2,%0,%1,0" - : "+a" (address), "+a" (nr) : "a" (pto) : "memory"); + " .insn rrf,0xb2210000,%[r1],%[r2],%[r3],%[m4]" + : [r2] "+a" (address), [r3] "+a" (nr) + : [r1] "a" (pto), [m4] "i" (local) : "memory"); } while (nr != 255); } @@ -885,15 +1000,26 @@ static inline int ptep_set_access_flags(struct vm_area_struct *vma, void ptep_set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t entry); void ptep_set_notify(struct mm_struct *mm, unsigned long addr, pte_t *ptep); -void ptep_notify(struct mm_struct *mm, unsigned long addr, pte_t *ptep); +void ptep_notify(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, unsigned long bits); +int ptep_force_prot(struct mm_struct *mm, unsigned long gaddr, + pte_t *ptep, int prot, unsigned long bit); void ptep_zap_unused(struct mm_struct *mm, unsigned long addr, pte_t *ptep , int reset); void ptep_zap_key(struct mm_struct *mm, unsigned long addr, pte_t *ptep); +int ptep_shadow_pte(struct mm_struct *mm, unsigned long saddr, + pte_t *sptep, pte_t *tptep, pte_t pte); +void ptep_unshadow_pte(struct mm_struct *mm, unsigned long saddr, pte_t *ptep); bool test_and_clear_guest_dirty(struct mm_struct *mm, unsigned long address); int set_guest_storage_key(struct mm_struct *mm, unsigned long addr, unsigned char key, bool nq); -unsigned char get_guest_storage_key(struct mm_struct *mm, unsigned long addr); +int cond_set_guest_storage_key(struct mm_struct *mm, unsigned long addr, + unsigned char key, unsigned char *oldkey, + bool nq, bool mr, bool mc); +int reset_guest_reference_bit(struct mm_struct *mm, unsigned long addr); +int get_guest_storage_key(struct mm_struct *mm, unsigned long addr, + unsigned char *key); /* * Certain architectures need to do special things when PTEs @@ -963,6 +1089,7 @@ static inline pmd_t *pmd_offset(pud_t *pud, unsigned long address) #define pte_page(x) pfn_to_page(pte_pfn(x)) #define pmd_page(pmd) pfn_to_page(pmd_pfn(pmd)) +#define pud_page(pud) pfn_to_page(pud_pfn(pud)) /* Find an entry in the lowest level page table.. */ #define pte_offset(pmd, addr) ((pte_t *) pmd_deref(*(pmd)) + pte_index(addr)) @@ -970,20 +1097,6 @@ static inline pmd_t *pmd_offset(pud_t *pud, unsigned long address) #define pte_offset_map(pmd, address) pte_offset_kernel(pmd, address) #define pte_unmap(pte) do { } while (0) -#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLB_PAGE) -static inline unsigned long massage_pgprot_pmd(pgprot_t pgprot) -{ - /* - * pgprot is PAGE_NONE, PAGE_READ, or PAGE_WRITE (see __Pxxx / __Sxxx) - * Convert to segment table entry format. - */ - if (pgprot_val(pgprot) == pgprot_val(PAGE_NONE)) - return pgprot_val(SEGMENT_NONE); - if (pgprot_val(pgprot) == pgprot_val(PAGE_READ)) - return pgprot_val(SEGMENT_READ); - return pgprot_val(SEGMENT_WRITE); -} - static inline pmd_t pmd_wrprotect(pmd_t pmd) { pmd_val(pmd) &= ~_SEGMENT_ENTRY_WRITE; @@ -1020,6 +1133,56 @@ static inline pmd_t pmd_mkdirty(pmd_t pmd) return pmd; } +static inline pud_t pud_wrprotect(pud_t pud) +{ + pud_val(pud) &= ~_REGION3_ENTRY_WRITE; + pud_val(pud) |= _REGION_ENTRY_PROTECT; + return pud; +} + +static inline pud_t pud_mkwrite(pud_t pud) +{ + pud_val(pud) |= _REGION3_ENTRY_WRITE; + if (pud_large(pud) && !(pud_val(pud) & _REGION3_ENTRY_DIRTY)) + return pud; + pud_val(pud) &= ~_REGION_ENTRY_PROTECT; + return pud; +} + +static inline pud_t pud_mkclean(pud_t pud) +{ + if (pud_large(pud)) { + pud_val(pud) &= ~_REGION3_ENTRY_DIRTY; + pud_val(pud) |= _REGION_ENTRY_PROTECT; + } + return pud; +} + +static inline pud_t pud_mkdirty(pud_t pud) +{ + if (pud_large(pud)) { + pud_val(pud) |= _REGION3_ENTRY_DIRTY | + _REGION3_ENTRY_SOFT_DIRTY; + if (pud_val(pud) & _REGION3_ENTRY_WRITE) + pud_val(pud) &= ~_REGION_ENTRY_PROTECT; + } + return pud; +} + +#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLB_PAGE) +static inline unsigned long massage_pgprot_pmd(pgprot_t pgprot) +{ + /* + * pgprot is PAGE_NONE, PAGE_READ, or PAGE_WRITE (see __Pxxx / __Sxxx) + * Convert to segment table entry format. + */ + if (pgprot_val(pgprot) == pgprot_val(PAGE_NONE)) + return pgprot_val(SEGMENT_NONE); + if (pgprot_val(pgprot) == pgprot_val(PAGE_READ)) + return pgprot_val(SEGMENT_READ); + return pgprot_val(SEGMENT_WRITE); +} + static inline pmd_t pmd_mkyoung(pmd_t pmd) { if (pmd_large(pmd)) { @@ -1068,43 +1231,43 @@ static inline pmd_t mk_pmd_phys(unsigned long physpage, pgprot_t pgprot) static inline void __pmdp_csp(pmd_t *pmdp) { - register unsigned long reg2 asm("2") = pmd_val(*pmdp); - register unsigned long reg3 asm("3") = pmd_val(*pmdp) | - _SEGMENT_ENTRY_INVALID; - register unsigned long reg4 asm("4") = ((unsigned long) pmdp) + 5; - - asm volatile( - " csp %1,%3" - : "=m" (*pmdp) - : "d" (reg2), "d" (reg3), "d" (reg4), "m" (*pmdp) : "cc"); + csp((unsigned int *)pmdp + 1, pmd_val(*pmdp), + pmd_val(*pmdp) | _SEGMENT_ENTRY_INVALID); } -static inline void __pmdp_idte(unsigned long address, pmd_t *pmdp) +#define IDTE_GLOBAL 0 +#define IDTE_LOCAL 1 + +static inline void __pmdp_idte(unsigned long address, pmd_t *pmdp, int local) { unsigned long sto; sto = (unsigned long) pmdp - pmd_index(address) * sizeof(pmd_t); asm volatile( - " .insn rrf,0xb98e0000,%2,%3,0,0" - : "=m" (*pmdp) - : "m" (*pmdp), "a" (sto), "a" ((address & HPAGE_MASK)) + " .insn rrf,0xb98e0000,%[r1],%[r2],0,%[m4]" + : "+m" (*pmdp) + : [r1] "a" (sto), [r2] "a" ((address & HPAGE_MASK)), + [m4] "i" (local) : "cc" ); } -static inline void __pmdp_idte_local(unsigned long address, pmd_t *pmdp) +static inline void __pudp_idte(unsigned long address, pud_t *pudp, int local) { - unsigned long sto; + unsigned long r3o; - sto = (unsigned long) pmdp - pmd_index(address) * sizeof(pmd_t); + r3o = (unsigned long) pudp - pud_index(address) * sizeof(pud_t); + r3o |= _ASCE_TYPE_REGION3; asm volatile( - " .insn rrf,0xb98e0000,%2,%3,0,1" - : "=m" (*pmdp) - : "m" (*pmdp), "a" (sto), "a" ((address & HPAGE_MASK)) - : "cc" ); + " .insn rrf,0xb98e0000,%[r1],%[r2],0,%[m4]" + : "+m" (*pudp) + : [r1] "a" (r3o), [r2] "a" ((address & PUD_MASK)), + [m4] "i" (local) + : "cc"); } pmd_t pmdp_xchg_direct(struct mm_struct *, unsigned long, pmd_t *, pmd_t); pmd_t pmdp_xchg_lazy(struct mm_struct *, unsigned long, pmd_t *, pmd_t); +pud_t pudp_xchg_direct(struct mm_struct *, unsigned long, pud_t *, pud_t); #ifdef CONFIG_TRANSPARENT_HUGEPAGE @@ -1223,6 +1386,7 @@ static inline int pmd_trans_huge(pmd_t pmd) return pmd_val(pmd) & _SEGMENT_ENTRY_LARGE; } +#define has_transparent_hugepage has_transparent_hugepage static inline int has_transparent_hugepage(void) { return MACHINE_HAS_HPAGE ? 1 : 0; diff --git a/arch/s390/include/asm/processor.h b/arch/s390/include/asm/processor.h index 18cdede1aeda..03323175de30 100644 --- a/arch/s390/include/asm/processor.h +++ b/arch/s390/include/asm/processor.h @@ -77,7 +77,10 @@ static inline void get_cpu_id(struct cpuid *ptr) asm volatile("stidp %0" : "=Q" (*ptr)); } -extern void s390_adjust_jiffies(void); +void s390_adjust_jiffies(void); +void s390_update_cpu_mhz(void); +void cpu_detect_mhz_feature(void); + extern const struct seq_operations cpuinfo_op; extern int sysctl_ieee_emulation_warnings; extern void execve_tail(void); @@ -105,11 +108,12 @@ typedef struct { * Thread structure */ struct thread_struct { - struct fpu fpu; /* FP and VX register save area */ unsigned int acrs[NUM_ACRS]; unsigned long ksp; /* kernel stack pointer */ mm_segment_t mm_segment; unsigned long gmap_addr; /* address of last gmap fault. */ + unsigned int gmap_write_flag; /* gmap fault write indication */ + unsigned int gmap_int_code; /* int code of last gmap fault */ unsigned int gmap_pfault; /* signal of a pending guest pfault */ struct per_regs per_user; /* User specified PER registers */ struct per_event per_event; /* Cause of the last PER trap */ @@ -120,6 +124,11 @@ struct thread_struct { /* cpu runtime instrumentation */ struct runtime_instr_cb *ri_cb; unsigned char trap_tdb[256]; /* Transaction abort diagnose block */ + /* + * Warning: 'fpu' is dynamically-sized. It *MUST* be at + * the end. + */ + struct fpu fpu; /* FP and VX register save area */ }; /* Flag to disable transactions. */ @@ -155,10 +164,9 @@ struct stack_frame { #define ARCH_MIN_TASKALIGN 8 -extern __vector128 init_task_fpu_regs[__NUM_VXRS]; #define INIT_THREAD { \ .ksp = sizeof(init_stack) + (unsigned long) &init_stack, \ - .fpu.regs = (void *)&init_task_fpu_regs, \ + .fpu.regs = (void *) init_task.thread.fpu.fprs, \ } /* @@ -230,6 +238,18 @@ void cpu_relax(void); #define cpu_relax_lowlatency() barrier() +#define ECAG_CACHE_ATTRIBUTE 0 +#define ECAG_CPU_ATTRIBUTE 1 + +static inline unsigned long __ecag(unsigned int asi, unsigned char parm) +{ + unsigned long val; + + asm volatile(".insn rsy,0xeb000000004c,%0,0,0(%1)" /* ecag */ + : "=d" (val) : "a" (asi << 8 | parm)); + return val; +} + static inline void psw_set_key(unsigned int key) { asm volatile("spka 0(%0)" : : "d" (key)); diff --git a/arch/s390/include/asm/rwsem.h b/arch/s390/include/asm/rwsem.h index fead491dfc28..597e7e96b59e 100644 --- a/arch/s390/include/asm/rwsem.h +++ b/arch/s390/include/asm/rwsem.h @@ -90,7 +90,7 @@ static inline int __down_read_trylock(struct rw_semaphore *sem) /* * lock for writing */ -static inline void __down_write_nested(struct rw_semaphore *sem, int subclass) +static inline long ___down_write(struct rw_semaphore *sem) { signed long old, new, tmp; @@ -104,13 +104,23 @@ static inline void __down_write_nested(struct rw_semaphore *sem, int subclass) : "=&d" (old), "=&d" (new), "=Q" (sem->count) : "Q" (sem->count), "m" (tmp) : "cc", "memory"); - if (old != 0) - rwsem_down_write_failed(sem); + + return old; } static inline void __down_write(struct rw_semaphore *sem) { - __down_write_nested(sem, 0); + if (___down_write(sem)) + rwsem_down_write_failed(sem); +} + +static inline int __down_write_killable(struct rw_semaphore *sem) +{ + if (___down_write(sem)) + if (IS_ERR(rwsem_down_write_failed_killable(sem))) + return -EINTR; + + return 0; } /* @@ -197,41 +207,4 @@ static inline void __downgrade_write(struct rw_semaphore *sem) rwsem_downgrade_wake(sem); } -/* - * implement atomic add functionality - */ -static inline void rwsem_atomic_add(long delta, struct rw_semaphore *sem) -{ - signed long old, new; - - asm volatile( - " lg %0,%2\n" - "0: lgr %1,%0\n" - " agr %1,%4\n" - " csg %0,%1,%2\n" - " jl 0b" - : "=&d" (old), "=&d" (new), "=Q" (sem->count) - : "Q" (sem->count), "d" (delta) - : "cc", "memory"); -} - -/* - * implement exchange and add functionality - */ -static inline long rwsem_atomic_update(long delta, struct rw_semaphore *sem) -{ - signed long old, new; - - asm volatile( - " lg %0,%2\n" - "0: lgr %1,%0\n" - " agr %1,%4\n" - " csg %0,%1,%2\n" - " jl 0b" - : "=&d" (old), "=&d" (new), "=Q" (sem->count) - : "Q" (sem->count), "d" (delta) - : "cc", "memory"); - return new; -} - #endif /* _S390_RWSEM_H */ diff --git a/arch/s390/include/asm/sclp.h b/arch/s390/include/asm/sclp.h index bab456be9a4f..2ad9c204b1a2 100644 --- a/arch/s390/include/asm/sclp.h +++ b/arch/s390/include/asm/sclp.h @@ -32,12 +32,19 @@ struct sclp_core_entry { u8 reserved0; u8 : 4; u8 sief2 : 1; - u8 : 3; - u8 : 3; + u8 skey : 1; + u8 : 2; + u8 : 2; + u8 gpere : 1; u8 siif : 1; u8 sigpif : 1; u8 : 3; - u8 reserved2[10]; + u8 reserved2[3]; + u8 : 2; + u8 ib : 1; + u8 cei : 1; + u8 : 4; + u8 reserved3[6]; u8 type; u8 reserved1; } __attribute__((packed)); @@ -59,6 +66,15 @@ struct sclp_info { unsigned char has_hvs : 1; unsigned char has_esca : 1; unsigned char has_sief2 : 1; + unsigned char has_64bscao : 1; + unsigned char has_gpere : 1; + unsigned char has_cmma : 1; + unsigned char has_gsls : 1; + unsigned char has_ib : 1; + unsigned char has_cei : 1; + unsigned char has_pfmfi : 1; + unsigned char has_ibs : 1; + unsigned char has_skey : 1; unsigned int ibc; unsigned int mtid; unsigned int mtid_cp; @@ -69,9 +85,22 @@ struct sclp_info { unsigned int max_cores; unsigned long hsa_size; unsigned long facilities; + unsigned int hmfai; }; extern struct sclp_info sclp; +struct zpci_report_error_header { + u8 version; /* Interface version byte */ + u8 action; /* Action qualifier byte + * 1: Deconfigure and repair action requested + * (OpenCrypto Problem Call Home) + * 2: Informational Report + * (OpenCrypto Successful Diagnostics Execution) + */ + u16 length; /* Length of Subsequent Data (up to 4K – SCLP header */ + u8 data[0]; /* Subsequent Data passed verbatim to SCLP ET 24 */ +} __packed; + int sclp_get_core_info(struct sclp_core_info *info); int sclp_core_configure(u8 core); int sclp_core_deconfigure(u8 core); @@ -83,9 +112,11 @@ int sclp_chp_read_info(struct sclp_chp_info *info); void sclp_get_ipl_info(struct sclp_ipl_info *info); int sclp_pci_configure(u32 fid); int sclp_pci_deconfigure(u32 fid); +int sclp_pci_report(struct zpci_report_error_header *report, u32 fh, u32 fid); int memcpy_hsa_kernel(void *dest, unsigned long src, size_t count); int memcpy_hsa_user(void __user *dest, unsigned long src, size_t count); void sclp_early_detect(void); void _sclp_print_early(const char *); +void sclp_ocf_cpc_name_copy(char *dst); #endif /* _ASM_S390_SCLP_H */ diff --git a/arch/s390/include/asm/sections.h b/arch/s390/include/asm/sections.h index fbd9116eb17b..5ce29fe100ba 100644 --- a/arch/s390/include/asm/sections.h +++ b/arch/s390/include/asm/sections.h @@ -4,5 +4,6 @@ #include <asm-generic/sections.h> extern char _eshared[], _ehead[]; +extern char __start_ro_after_init[], __end_ro_after_init[]; #endif diff --git a/arch/s390/include/asm/setup.h b/arch/s390/include/asm/setup.h index c0f0efbb6ab5..5e8d57e1cc5e 100644 --- a/arch/s390/include/asm/setup.h +++ b/arch/s390/include/asm/setup.h @@ -86,9 +86,13 @@ extern char vmpoff_cmd[]; #define CONSOLE_IS_SCLP (console_mode == 1) #define CONSOLE_IS_3215 (console_mode == 2) #define CONSOLE_IS_3270 (console_mode == 3) +#define CONSOLE_IS_VT220 (console_mode == 4) +#define CONSOLE_IS_HVC (console_mode == 5) #define SET_CONSOLE_SCLP do { console_mode = 1; } while (0) #define SET_CONSOLE_3215 do { console_mode = 2; } while (0) #define SET_CONSOLE_3270 do { console_mode = 3; } while (0) +#define SET_CONSOLE_VT220 do { console_mode = 4; } while (0) +#define SET_CONSOLE_HVC do { console_mode = 5; } while (0) #define NSS_NAME_SIZE 8 extern char kernel_nss_name[]; diff --git a/arch/s390/include/asm/sfp-machine.h b/arch/s390/include/asm/sfp-machine.h deleted file mode 100644 index 4e16aede4b06..000000000000 --- a/arch/s390/include/asm/sfp-machine.h +++ /dev/null @@ -1,142 +0,0 @@ -/* Machine-dependent software floating-point definitions. - S/390 kernel version. - Copyright (C) 1997,1998,1999 Free Software Foundation, Inc. - This file is part of the GNU C Library. - Contributed by Richard Henderson (rth@cygnus.com), - Jakub Jelinek (jj@ultra.linux.cz), - David S. Miller (davem@redhat.com) and - Peter Maydell (pmaydell@chiark.greenend.org.uk). - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Library General Public License as - published by the Free Software Foundation; either version 2 of the - License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Library General Public License for more details. - - You should have received a copy of the GNU Library General Public - License along with the GNU C Library; see the file COPYING.LIB. If - not, write to the Free Software Foundation, Inc., - 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ - -#ifndef _SFP_MACHINE_H -#define _SFP_MACHINE_H - - -#define _FP_W_TYPE_SIZE 32 -#define _FP_W_TYPE unsigned int -#define _FP_WS_TYPE signed int -#define _FP_I_TYPE int - -#define _FP_MUL_MEAT_S(R,X,Y) \ - _FP_MUL_MEAT_1_wide(_FP_WFRACBITS_S,R,X,Y,umul_ppmm) -#define _FP_MUL_MEAT_D(R,X,Y) \ - _FP_MUL_MEAT_2_wide(_FP_WFRACBITS_D,R,X,Y,umul_ppmm) -#define _FP_MUL_MEAT_Q(R,X,Y) \ - _FP_MUL_MEAT_4_wide(_FP_WFRACBITS_Q,R,X,Y,umul_ppmm) - -#define _FP_DIV_MEAT_S(R,X,Y) _FP_DIV_MEAT_1_udiv(S,R,X,Y) -#define _FP_DIV_MEAT_D(R,X,Y) _FP_DIV_MEAT_2_udiv(D,R,X,Y) -#define _FP_DIV_MEAT_Q(R,X,Y) _FP_DIV_MEAT_4_udiv(Q,R,X,Y) - -#define _FP_NANFRAC_S ((_FP_QNANBIT_S << 1) - 1) -#define _FP_NANFRAC_D ((_FP_QNANBIT_D << 1) - 1), -1 -#define _FP_NANFRAC_Q ((_FP_QNANBIT_Q << 1) - 1), -1, -1, -1 -#define _FP_NANSIGN_S 0 -#define _FP_NANSIGN_D 0 -#define _FP_NANSIGN_Q 0 - -#define _FP_KEEPNANFRACP 1 - -/* - * If one NaN is signaling and the other is not, - * we choose that one, otherwise we choose X. - */ -#define _FP_CHOOSENAN(fs, wc, R, X, Y, OP) \ - do { \ - if ((_FP_FRAC_HIGH_RAW_##fs(X) & _FP_QNANBIT_##fs) \ - && !(_FP_FRAC_HIGH_RAW_##fs(Y) & _FP_QNANBIT_##fs)) \ - { \ - R##_s = Y##_s; \ - _FP_FRAC_COPY_##wc(R,Y); \ - } \ - else \ - { \ - R##_s = X##_s; \ - _FP_FRAC_COPY_##wc(R,X); \ - } \ - R##_c = FP_CLS_NAN; \ - } while (0) - -/* Some assembly to speed things up. */ -#define __FP_FRAC_ADD_3(r2,r1,r0,x2,x1,x0,y2,y1,y0) ({ \ - unsigned int __r2 = (x2) + (y2); \ - unsigned int __r1 = (x1); \ - unsigned int __r0 = (x0); \ - asm volatile( \ - " alr %2,%3\n" \ - " brc 12,0f\n" \ - " lhi 0,1\n" \ - " alr %1,0\n" \ - " brc 12,0f\n" \ - " alr %0,0\n" \ - "0:" \ - : "+&d" (__r2), "+&d" (__r1), "+&d" (__r0) \ - : "d" (y0), "i" (1) : "cc", "0" ); \ - asm volatile( \ - " alr %1,%2\n" \ - " brc 12,0f\n" \ - " ahi %0,1\n" \ - "0:" \ - : "+&d" (__r2), "+&d" (__r1) \ - : "d" (y1) : "cc"); \ - (r2) = __r2; \ - (r1) = __r1; \ - (r0) = __r0; \ -}) - -#define __FP_FRAC_SUB_3(r2,r1,r0,x2,x1,x0,y2,y1,y0) ({ \ - unsigned int __r2 = (x2) - (y2); \ - unsigned int __r1 = (x1); \ - unsigned int __r0 = (x0); \ - asm volatile( \ - " slr %2,%3\n" \ - " brc 3,0f\n" \ - " lhi 0,1\n" \ - " slr %1,0\n" \ - " brc 3,0f\n" \ - " slr %0,0\n" \ - "0:" \ - : "+&d" (__r2), "+&d" (__r1), "+&d" (__r0) \ - : "d" (y0) : "cc", "0"); \ - asm volatile( \ - " slr %1,%2\n" \ - " brc 3,0f\n" \ - " ahi %0,-1\n" \ - "0:" \ - : "+&d" (__r2), "+&d" (__r1) \ - : "d" (y1) : "cc"); \ - (r2) = __r2; \ - (r1) = __r1; \ - (r0) = __r0; \ -}) - -#define __FP_FRAC_DEC_3(x2,x1,x0,y2,y1,y0) __FP_FRAC_SUB_3(x2,x1,x0,x2,x1,x0,y2,y1,y0) - -/* Obtain the current rounding mode. */ -#define FP_ROUNDMODE mode - -/* Exception flags. */ -#define FP_EX_INVALID 0x800000 -#define FP_EX_DIVZERO 0x400000 -#define FP_EX_OVERFLOW 0x200000 -#define FP_EX_UNDERFLOW 0x100000 -#define FP_EX_INEXACT 0x080000 - -/* We write the results always */ -#define FP_INHIBIT_RESULTS 0 - -#endif diff --git a/arch/s390/include/asm/sfp-util.h b/arch/s390/include/asm/sfp-util.h deleted file mode 100644 index c8b7cf9d6279..000000000000 --- a/arch/s390/include/asm/sfp-util.h +++ /dev/null @@ -1,67 +0,0 @@ -#include <linux/kernel.h> -#include <linux/sched.h> -#include <linux/types.h> -#include <asm/byteorder.h> - -#define add_ssaaaa(sh, sl, ah, al, bh, bl) ({ \ - unsigned int __sh = (ah); \ - unsigned int __sl = (al); \ - asm volatile( \ - " alr %1,%3\n" \ - " brc 12,0f\n" \ - " ahi %0,1\n" \ - "0: alr %0,%2" \ - : "+&d" (__sh), "+d" (__sl) \ - : "d" (bh), "d" (bl) : "cc"); \ - (sh) = __sh; \ - (sl) = __sl; \ -}) - -#define sub_ddmmss(sh, sl, ah, al, bh, bl) ({ \ - unsigned int __sh = (ah); \ - unsigned int __sl = (al); \ - asm volatile( \ - " slr %1,%3\n" \ - " brc 3,0f\n" \ - " ahi %0,-1\n" \ - "0: slr %0,%2" \ - : "+&d" (__sh), "+d" (__sl) \ - : "d" (bh), "d" (bl) : "cc"); \ - (sh) = __sh; \ - (sl) = __sl; \ -}) - -/* a umul b = a mul b + (a>=2<<31) ? b<<32:0 + (b>=2<<31) ? a<<32:0 */ -#define umul_ppmm(wh, wl, u, v) ({ \ - unsigned int __wh = u; \ - unsigned int __wl = v; \ - asm volatile( \ - " ltr 1,%0\n" \ - " mr 0,%1\n" \ - " jnm 0f\n" \ - " alr 0,%1\n" \ - "0: ltr %1,%1\n" \ - " jnm 1f\n" \ - " alr 0,%0\n" \ - "1: lr %0,0\n" \ - " lr %1,1\n" \ - : "+d" (__wh), "+d" (__wl) \ - : : "0", "1", "cc"); \ - wh = __wh; \ - wl = __wl; \ -}) - -#define udiv_qrnnd(q, r, n1, n0, d) \ - do { unsigned long __n; \ - unsigned int __r, __d; \ - __n = ((unsigned long)(n1) << 32) + n0; \ - __d = (d); \ - (q) = __n / __d; \ - (r) = __n % __d; \ - } while (0) - -#define UDIV_NEEDS_NORMALIZATION 0 - -#define abort() BUG() - -#define __BYTE_ORDER __BIG_ENDIAN diff --git a/arch/s390/include/asm/sigp.h b/arch/s390/include/asm/sigp.h index ec60cf7fa0a2..72df5f2de6b0 100644 --- a/arch/s390/include/asm/sigp.h +++ b/arch/s390/include/asm/sigp.h @@ -27,6 +27,7 @@ /* SIGP cpu status bits */ +#define SIGP_STATUS_INVALID_ORDER 0x00000002UL #define SIGP_STATUS_CHECK_STOP 0x00000010UL #define SIGP_STATUS_STOPPED 0x00000040UL #define SIGP_STATUS_EXT_CALL_PENDING 0x00000080UL @@ -36,8 +37,8 @@ #ifndef __ASSEMBLY__ -static inline int __pcpu_sigp(u16 addr, u8 order, unsigned long parm, - u32 *status) +static inline int ____pcpu_sigp(u16 addr, u8 order, unsigned long parm, + u32 *status) { register unsigned long reg1 asm ("1") = parm; int cc; @@ -47,8 +48,19 @@ static inline int __pcpu_sigp(u16 addr, u8 order, unsigned long parm, " ipm %0\n" " srl %0,28\n" : "=d" (cc), "+d" (reg1) : "d" (addr), "a" (order) : "cc"); + *status = reg1; + return cc; +} + +static inline int __pcpu_sigp(u16 addr, u8 order, unsigned long parm, + u32 *status) +{ + u32 _status; + int cc; + + cc = ____pcpu_sigp(addr, order, parm, &_status); if (status && cc == 1) - *status = reg1; + *status = _status; return cc; } diff --git a/arch/s390/include/asm/spinlock.h b/arch/s390/include/asm/spinlock.h index 63ebf37d3143..7e9e09f600fa 100644 --- a/arch/s390/include/asm/spinlock.h +++ b/arch/s390/include/asm/spinlock.h @@ -10,6 +10,8 @@ #define __ASM_SPINLOCK_H #include <linux/smp.h> +#include <asm/barrier.h> +#include <asm/processor.h> #define SPINLOCK_LOCKVAL (S390_lowcore.spinlock_lockval) @@ -97,6 +99,7 @@ static inline void arch_spin_unlock_wait(arch_spinlock_t *lock) { while (arch_spin_is_locked(lock)) arch_spin_relax(lock); + smp_acquire__after_ctrl_dep(); } /* diff --git a/arch/s390/include/asm/stp.h b/arch/s390/include/asm/stp.h new file mode 100644 index 000000000000..7689727585b2 --- /dev/null +++ b/arch/s390/include/asm/stp.h @@ -0,0 +1,51 @@ +/* + * Copyright IBM Corp. 2006 + * Author(s): Martin Schwidefsky (schwidefsky@de.ibm.com) + */ +#ifndef __S390_STP_H +#define __S390_STP_H + +/* notifier for syncs */ +extern struct atomic_notifier_head s390_epoch_delta_notifier; + +/* STP interruption parameter */ +struct stp_irq_parm { + unsigned int _pad0 : 14; + unsigned int tsc : 1; /* Timing status change */ + unsigned int lac : 1; /* Link availability change */ + unsigned int tcpc : 1; /* Time control parameter change */ + unsigned int _pad2 : 15; +} __attribute__ ((packed)); + +#define STP_OP_SYNC 1 +#define STP_OP_CTRL 3 + +struct stp_sstpi { + unsigned int rsvd0; + unsigned int rsvd1 : 8; + unsigned int stratum : 8; + unsigned int vbits : 16; + unsigned int leaps : 16; + unsigned int tmd : 4; + unsigned int ctn : 4; + unsigned int rsvd2 : 3; + unsigned int c : 1; + unsigned int tst : 4; + unsigned int tzo : 16; + unsigned int dsto : 16; + unsigned int ctrl : 16; + unsigned int rsvd3 : 16; + unsigned int tto; + unsigned int rsvd4; + unsigned int ctnid[3]; + unsigned int rsvd5; + unsigned int todoff[4]; + unsigned int rsvd6[48]; +} __attribute__ ((packed)); + +/* Functions needed by the machine check handler */ +int stp_sync_check(void); +int stp_island_check(void); +void stp_queue_work(void); + +#endif /* __S390_STP_H */ diff --git a/arch/s390/include/asm/thread_info.h b/arch/s390/include/asm/thread_info.h index 2fffc2c27581..f15c0398c363 100644 --- a/arch/s390/include/asm/thread_info.h +++ b/arch/s390/include/asm/thread_info.h @@ -62,6 +62,7 @@ static inline struct thread_info *current_thread_info(void) } void arch_release_task_struct(struct task_struct *tsk); +int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src); #define THREAD_SIZE_ORDER THREAD_ORDER diff --git a/arch/s390/include/asm/timex.h b/arch/s390/include/asm/timex.h index dcb6312a0b91..0bb08f341c09 100644 --- a/arch/s390/include/asm/timex.h +++ b/arch/s390/include/asm/timex.h @@ -52,6 +52,70 @@ static inline void store_clock_comparator(__u64 *time) void clock_comparator_work(void); +void __init ptff_init(void); + +extern unsigned char ptff_function_mask[16]; +extern unsigned long lpar_offset; +extern unsigned long initial_leap_seconds; + +/* Function codes for the ptff instruction. */ +#define PTFF_QAF 0x00 /* query available functions */ +#define PTFF_QTO 0x01 /* query tod offset */ +#define PTFF_QSI 0x02 /* query steering information */ +#define PTFF_QUI 0x04 /* query UTC information */ +#define PTFF_ATO 0x40 /* adjust tod offset */ +#define PTFF_STO 0x41 /* set tod offset */ +#define PTFF_SFS 0x42 /* set fine steering rate */ +#define PTFF_SGS 0x43 /* set gross steering rate */ + +/* Query TOD offset result */ +struct ptff_qto { + unsigned long long physical_clock; + unsigned long long tod_offset; + unsigned long long logical_tod_offset; + unsigned long long tod_epoch_difference; +} __packed; + +static inline int ptff_query(unsigned int nr) +{ + unsigned char *ptr; + + ptr = ptff_function_mask + (nr >> 3); + return (*ptr & (0x80 >> (nr & 7))) != 0; +} + +/* Query UTC information result */ +struct ptff_qui { + unsigned int tm : 2; + unsigned int ts : 2; + unsigned int : 28; + unsigned int pad_0x04; + unsigned long leap_event; + short old_leap; + short new_leap; + unsigned int pad_0x14; + unsigned long prt[5]; + unsigned long cst[3]; + unsigned int skew; + unsigned int pad_0x5c[41]; +} __packed; + +static inline int ptff(void *ptff_block, size_t len, unsigned int func) +{ + typedef struct { char _[len]; } addrtype; + register unsigned int reg0 asm("0") = func; + register unsigned long reg1 asm("1") = (unsigned long) ptff_block; + int rc; + + asm volatile( + " .word 0x0104\n" + " ipm %0\n" + " srl %0,28\n" + : "=d" (rc), "+m" (*(addrtype *) ptff_block) + : "d" (reg0), "d" (reg1) : "cc"); + return rc; +} + static inline unsigned long long local_tick_disable(void) { unsigned long long old; @@ -105,7 +169,7 @@ static inline cycles_t get_cycles(void) return (cycles_t) get_tod_clock() >> 2; } -int get_sync_clock(unsigned long long *clock); +int get_phys_clock(unsigned long long *clock); void init_cpu_timer(void); unsigned long long monotonic_clock(void); diff --git a/arch/s390/include/asm/tlb.h b/arch/s390/include/asm/tlb.h index 7a92e69c50bc..15711de10403 100644 --- a/arch/s390/include/asm/tlb.h +++ b/arch/s390/include/asm/tlb.h @@ -87,10 +87,10 @@ static inline void tlb_finish_mmu(struct mmu_gather *tlb, * tlb_ptep_clear_flush. In both flush modes the tlb for a page cache page * has already been freed, so just do free_page_and_swap_cache. */ -static inline int __tlb_remove_page(struct mmu_gather *tlb, struct page *page) +static inline bool __tlb_remove_page(struct mmu_gather *tlb, struct page *page) { free_page_and_swap_cache(page); - return 1; /* avoid calling tlb_flush_mmu */ + return false; /* avoid calling tlb_flush_mmu */ } static inline void tlb_remove_page(struct mmu_gather *tlb, struct page *page) @@ -98,6 +98,24 @@ static inline void tlb_remove_page(struct mmu_gather *tlb, struct page *page) free_page_and_swap_cache(page); } +static inline bool __tlb_remove_page_size(struct mmu_gather *tlb, + struct page *page, int page_size) +{ + return __tlb_remove_page(tlb, page); +} + +static inline bool __tlb_remove_pte_page(struct mmu_gather *tlb, + struct page *page) +{ + return __tlb_remove_page(tlb, page); +} + +static inline void tlb_remove_page_size(struct mmu_gather *tlb, + struct page *page, int page_size) +{ + return tlb_remove_page(tlb, page); +} + /* * pte_free_tlb frees a pte table and clears the CRSTE for the * page table from the tlb. diff --git a/arch/s390/include/asm/tlbflush.h b/arch/s390/include/asm/tlbflush.h index a2e6ef32e054..39846100682a 100644 --- a/arch/s390/include/asm/tlbflush.h +++ b/arch/s390/include/asm/tlbflush.h @@ -5,6 +5,7 @@ #include <linux/sched.h> #include <asm/processor.h> #include <asm/pgalloc.h> +#include <asm/pgtable.h> /* * Flush all TLB entries on the local CPU. @@ -25,17 +26,6 @@ static inline void __tlb_flush_idte(unsigned long asce) : : "a" (2048), "a" (asce) : "cc"); } -/* - * Flush TLB entries for a specific ASCE on the local CPU - */ -static inline void __tlb_flush_idte_local(unsigned long asce) -{ - /* Local TLB flush for the mm */ - asm volatile( - " .insn rrf,0xb98e0000,0,%0,%1,1" - : : "a" (2048), "a" (asce) : "cc"); -} - #ifdef CONFIG_SMP void smp_ptlb_all(void); @@ -44,17 +34,9 @@ void smp_ptlb_all(void); */ static inline void __tlb_flush_global(void) { - register unsigned long reg2 asm("2"); - register unsigned long reg3 asm("3"); - register unsigned long reg4 asm("4"); - long dummy; - - dummy = 0; - reg2 = reg3 = 0; - reg4 = ((unsigned long) &dummy) + 1; - asm volatile( - " csp %0,%2" - : : "d" (reg2), "d" (reg3), "d" (reg4), "m" (dummy) : "cc" ); + unsigned int dummy = 0; + + csp(&dummy, 0, 0); } /* @@ -64,7 +46,7 @@ static inline void __tlb_flush_global(void) static inline void __tlb_flush_full(struct mm_struct *mm) { preempt_disable(); - atomic_add(0x10000, &mm->context.attach_count); + atomic_inc(&mm->context.flush_count); if (cpumask_equal(mm_cpumask(mm), cpumask_of(smp_processor_id()))) { /* Local TLB flush */ __tlb_flush_local(); @@ -72,38 +54,34 @@ static inline void __tlb_flush_full(struct mm_struct *mm) /* Global TLB flush */ __tlb_flush_global(); /* Reset TLB flush mask */ - if (MACHINE_HAS_TLB_LC) - cpumask_copy(mm_cpumask(mm), - &mm->context.cpu_attach_mask); + cpumask_copy(mm_cpumask(mm), &mm->context.cpu_attach_mask); } - atomic_sub(0x10000, &mm->context.attach_count); + atomic_dec(&mm->context.flush_count); preempt_enable(); } -/* - * Flush TLB entries for a specific ASCE on all CPUs. - */ -static inline void __tlb_flush_asce(struct mm_struct *mm, unsigned long asce) +static inline void __tlb_flush_mm(struct mm_struct *mm) { - int active, count; + unsigned long gmap_asce; + /* + * If the machine has IDTE we prefer to do a per mm flush + * on all cpus instead of doing a local flush if the mm + * only ran on the local cpu. + */ preempt_disable(); - active = (mm == current->active_mm) ? 1 : 0; - count = atomic_add_return(0x10000, &mm->context.attach_count); - if (MACHINE_HAS_TLB_LC && (count & 0xffff) <= active && - cpumask_equal(mm_cpumask(mm), cpumask_of(smp_processor_id()))) { - __tlb_flush_idte_local(asce); + atomic_inc(&mm->context.flush_count); + gmap_asce = READ_ONCE(mm->context.gmap_asce); + if (MACHINE_HAS_IDTE && gmap_asce != -1UL) { + if (gmap_asce) + __tlb_flush_idte(gmap_asce); + __tlb_flush_idte(mm->context.asce); } else { - if (MACHINE_HAS_IDTE) - __tlb_flush_idte(asce); - else - __tlb_flush_global(); - /* Reset TLB flush mask */ - if (MACHINE_HAS_TLB_LC) - cpumask_copy(mm_cpumask(mm), - &mm->context.cpu_attach_mask); + __tlb_flush_full(mm); } - atomic_sub(0x10000, &mm->context.attach_count); + /* Reset TLB flush mask */ + cpumask_copy(mm_cpumask(mm), &mm->context.cpu_attach_mask); + atomic_dec(&mm->context.flush_count); preempt_enable(); } @@ -121,36 +99,17 @@ static inline void __tlb_flush_kernel(void) /* * Flush TLB entries for a specific ASCE on all CPUs. */ -static inline void __tlb_flush_asce(struct mm_struct *mm, unsigned long asce) +static inline void __tlb_flush_mm(struct mm_struct *mm) { - if (MACHINE_HAS_TLB_LC) - __tlb_flush_idte_local(asce); - else - __tlb_flush_local(); + __tlb_flush_local(); } static inline void __tlb_flush_kernel(void) { - if (MACHINE_HAS_TLB_LC) - __tlb_flush_idte_local(init_mm.context.asce); - else - __tlb_flush_local(); + __tlb_flush_local(); } #endif -static inline void __tlb_flush_mm(struct mm_struct * mm) -{ - /* - * If the machine has IDTE we prefer to do a per mm flush - * on all cpus instead of doing a local flush if the mm - * only ran on the local cpu. - */ - if (MACHINE_HAS_IDTE && list_empty(&mm->context.gmap_list)) - __tlb_flush_asce(mm, mm->context.asce); - else - __tlb_flush_full(mm); -} - static inline void __tlb_flush_mm_lazy(struct mm_struct * mm) { if (mm->context.flush_mm) { diff --git a/arch/s390/include/asm/topology.h b/arch/s390/include/asm/topology.h index 6b53962e807e..f15f5571ca2b 100644 --- a/arch/s390/include/asm/topology.h +++ b/arch/s390/include/asm/topology.h @@ -14,10 +14,12 @@ struct cpu_topology_s390 { unsigned short core_id; unsigned short socket_id; unsigned short book_id; + unsigned short drawer_id; unsigned short node_id; cpumask_t thread_mask; cpumask_t core_mask; cpumask_t book_mask; + cpumask_t drawer_mask; }; DECLARE_PER_CPU(struct cpu_topology_s390, cpu_topology); @@ -30,6 +32,8 @@ DECLARE_PER_CPU(struct cpu_topology_s390, cpu_topology); #define topology_core_cpumask(cpu) (&per_cpu(cpu_topology, cpu).core_mask) #define topology_book_id(cpu) (per_cpu(cpu_topology, cpu).book_id) #define topology_book_cpumask(cpu) (&per_cpu(cpu_topology, cpu).book_mask) +#define topology_drawer_id(cpu) (per_cpu(cpu_topology, cpu).drawer_id) +#define topology_drawer_cpumask(cpu) (&per_cpu(cpu_topology, cpu).drawer_mask) #define mc_capable() 1 diff --git a/arch/s390/include/asm/uaccess.h b/arch/s390/include/asm/uaccess.h index e0900ddf91dd..52d7c8709279 100644 --- a/arch/s390/include/asm/uaccess.h +++ b/arch/s390/include/asm/uaccess.h @@ -151,8 +151,65 @@ unsigned long __must_check __copy_to_user(void __user *to, const void *from, __rc; \ }) -#define __put_user_fn(x, ptr, size) __put_get_user_asm(ptr, x, size, 0x810000UL) -#define __get_user_fn(x, ptr, size) __put_get_user_asm(x, ptr, size, 0x81UL) +static inline int __put_user_fn(void *x, void __user *ptr, unsigned long size) +{ + unsigned long spec = 0x810000UL; + int rc; + + switch (size) { + case 1: + rc = __put_get_user_asm((unsigned char __user *)ptr, + (unsigned char *)x, + size, spec); + break; + case 2: + rc = __put_get_user_asm((unsigned short __user *)ptr, + (unsigned short *)x, + size, spec); + break; + case 4: + rc = __put_get_user_asm((unsigned int __user *)ptr, + (unsigned int *)x, + size, spec); + break; + case 8: + rc = __put_get_user_asm((unsigned long __user *)ptr, + (unsigned long *)x, + size, spec); + break; + }; + return rc; +} + +static inline int __get_user_fn(void *x, const void __user *ptr, unsigned long size) +{ + unsigned long spec = 0x81UL; + int rc; + + switch (size) { + case 1: + rc = __put_get_user_asm((unsigned char *)x, + (unsigned char __user *)ptr, + size, spec); + break; + case 2: + rc = __put_get_user_asm((unsigned short *)x, + (unsigned short __user *)ptr, + size, spec); + break; + case 4: + rc = __put_get_user_asm((unsigned int *)x, + (unsigned int __user *)ptr, + size, spec); + break; + case 8: + rc = __put_get_user_asm((unsigned long *)x, + (unsigned long __user *)ptr, + size, spec); + break; + }; + return rc; +} #else /* CONFIG_HAVE_MARCH_Z10_FEATURES */ @@ -191,7 +248,7 @@ static inline int __get_user_fn(void *x, const void __user *ptr, unsigned long s __put_user_bad(); \ break; \ } \ - __pu_err; \ + __builtin_expect(__pu_err, 0); \ }) #define put_user(x, ptr) \ @@ -209,28 +266,28 @@ int __put_user_bad(void) __attribute__((noreturn)); __chk_user_ptr(ptr); \ switch (sizeof(*(ptr))) { \ case 1: { \ - unsigned char __x; \ + unsigned char __x = 0; \ __gu_err = __get_user_fn(&__x, ptr, \ sizeof(*(ptr))); \ (x) = *(__force __typeof__(*(ptr)) *) &__x; \ break; \ }; \ case 2: { \ - unsigned short __x; \ + unsigned short __x = 0; \ __gu_err = __get_user_fn(&__x, ptr, \ sizeof(*(ptr))); \ (x) = *(__force __typeof__(*(ptr)) *) &__x; \ break; \ }; \ case 4: { \ - unsigned int __x; \ + unsigned int __x = 0; \ __gu_err = __get_user_fn(&__x, ptr, \ sizeof(*(ptr))); \ (x) = *(__force __typeof__(*(ptr)) *) &__x; \ break; \ }; \ case 8: { \ - unsigned long long __x; \ + unsigned long long __x = 0; \ __gu_err = __get_user_fn(&__x, ptr, \ sizeof(*(ptr))); \ (x) = *(__force __typeof__(*(ptr)) *) &__x; \ @@ -240,7 +297,7 @@ int __put_user_bad(void) __attribute__((noreturn)); __get_user_bad(); \ break; \ } \ - __gu_err; \ + __builtin_expect(__gu_err, 0); \ }) #define get_user(x, ptr) \ @@ -254,6 +311,14 @@ int __get_user_bad(void) __attribute__((noreturn)); #define __put_user_unaligned __put_user #define __get_user_unaligned __get_user +extern void __compiletime_error("usercopy buffer size is too small") +__bad_copy_user(void); + +static inline void copy_user_overflow(int size, unsigned long count) +{ + WARN(1, "Buffer overflow detected (%d < %lu)!\n", size, count); +} + /** * copy_to_user: - Copy a block of data into user space. * @to: Destination address, in user space. @@ -275,12 +340,6 @@ copy_to_user(void __user *to, const void *from, unsigned long n) return __copy_to_user(to, from, n); } -void copy_from_user_overflow(void) -#ifdef CONFIG_DEBUG_STRICT_USER_COPY_CHECKS -__compiletime_warning("copy_from_user() buffer size is not provably correct") -#endif -; - /** * copy_from_user: - Copy a block of data from user space. * @to: Destination address, in kernel space. @@ -305,7 +364,10 @@ copy_from_user(void *to, const void __user *from, unsigned long n) might_fault(); if (unlikely(sz != -1 && sz < n)) { - copy_from_user_overflow(); + if (!__builtin_constant_p(n)) + copy_user_overflow(sz, n); + else + __bad_copy_user(); return n; } return __copy_from_user(to, from, n); diff --git a/arch/s390/include/asm/vx-insn.h b/arch/s390/include/asm/vx-insn.h index 4a3135620f5e..49c24a2afce0 100644 --- a/arch/s390/include/asm/vx-insn.h +++ b/arch/s390/include/asm/vx-insn.h @@ -16,15 +16,13 @@ /* Macros to generate vector instruction byte code */ -#define REG_NUM_INVALID 255 - /* GR_NUM - Retrieve general-purpose register number * * @opd: Operand to store register number * @r64: String designation register in the format "%rN" */ .macro GR_NUM opd gr - \opd = REG_NUM_INVALID + \opd = 255 .ifc \gr,%r0 \opd = 0 .endif @@ -73,14 +71,11 @@ .ifc \gr,%r15 \opd = 15 .endif - .if \opd == REG_NUM_INVALID - .error "Invalid general-purpose register designation: \gr" + .if \opd == 255 + \opd = \gr .endif .endm -/* VX_R() - Macro to encode the VX_NUM into the instruction */ -#define VX_R(v) (v & 0x0F) - /* VX_NUM - Retrieve vector register number * * @opd: Operand to store register number @@ -88,11 +83,10 @@ * * The vector register number is used for as input number to the * instruction and, as well as, to compute the RXB field of the - * instruction. To encode the particular vector register number, - * use the VX_R(v) macro to extract the instruction opcode. + * instruction. */ .macro VX_NUM opd vxr - \opd = REG_NUM_INVALID + \opd = 255 .ifc \vxr,%v0 \opd = 0 .endif @@ -189,8 +183,8 @@ .ifc \vxr,%v31 \opd = 31 .endif - .if \opd == REG_NUM_INVALID - .error "Invalid vector register designation: \vxr" + .if \opd == 255 + \opd = \vxr .endif .endm @@ -251,7 +245,7 @@ /* VECTOR GENERATE BYTE MASK */ .macro VGBM vr imm2 VX_NUM v1, \vr - .word (0xE700 | (VX_R(v1) << 4)) + .word (0xE700 | ((v1&15) << 4)) .word \imm2 MRXBOPC 0, 0x44, v1 .endm @@ -267,7 +261,7 @@ VX_NUM v1, \v GR_NUM b2, "%r0" GR_NUM r3, \gr - .word 0xE700 | (VX_R(v1) << 4) | r3 + .word 0xE700 | ((v1&15) << 4) | r3 .word (b2 << 12) | (\disp) MRXBOPC \m, 0x22, v1 .endm @@ -284,12 +278,21 @@ VLVG \v, \gr, \index, 3 .endm +/* VECTOR LOAD REGISTER */ +.macro VLR v1, v2 + VX_NUM v1, \v1 + VX_NUM v2, \v2 + .word 0xE700 | ((v1&15) << 4) | (v2&15) + .word 0 + MRXBOPC 0, 0x56, v1, v2 +.endm + /* VECTOR LOAD */ .macro VL v, disp, index="%r0", base VX_NUM v1, \v GR_NUM x2, \index GR_NUM b2, \base - .word 0xE700 | (VX_R(v1) << 4) | x2 + .word 0xE700 | ((v1&15) << 4) | x2 .word (b2 << 12) | (\disp) MRXBOPC 0, 0x06, v1 .endm @@ -299,7 +302,7 @@ VX_NUM v1, \vr1 GR_NUM x2, \index GR_NUM b2, \base - .word 0xE700 | (VX_R(v1) << 4) | x2 + .word 0xE700 | ((v1&15) << 4) | x2 .word (b2 << 12) | (\disp) MRXBOPC \m3, \opc, v1 .endm @@ -319,7 +322,7 @@ /* VECTOR LOAD ELEMENT IMMEDIATE */ .macro VLEIx vr1, imm2, m3, opc VX_NUM v1, \vr1 - .word 0xE700 | (VX_R(v1) << 4) + .word 0xE700 | ((v1&15) << 4) .word \imm2 MRXBOPC \m3, \opc, v1 .endm @@ -341,7 +344,7 @@ GR_NUM r1, \gr GR_NUM b2, \base VX_NUM v3, \vr - .word 0xE700 | (r1 << 4) | VX_R(v3) + .word 0xE700 | (r1 << 4) | (v3&15) .word (b2 << 12) | (\disp) MRXBOPC \m, 0x21, v3 .endm @@ -363,7 +366,7 @@ VX_NUM v1, \vfrom VX_NUM v3, \vto GR_NUM b2, \base /* Base register */ - .word 0xE700 | (VX_R(v1) << 4) | VX_R(v3) + .word 0xE700 | ((v1&15) << 4) | (v3&15) .word (b2 << 12) | (\disp) MRXBOPC 0, 0x36, v1, v3 .endm @@ -373,7 +376,7 @@ VX_NUM v1, \vfrom VX_NUM v3, \vto GR_NUM b2, \base /* Base register */ - .word 0xE700 | (VX_R(v1) << 4) | VX_R(v3) + .word 0xE700 | ((v1&15) << 4) | (v3&15) .word (b2 << 12) | (\disp) MRXBOPC 0, 0x3E, v1, v3 .endm @@ -384,16 +387,16 @@ VX_NUM v2, \vr2 VX_NUM v3, \vr3 VX_NUM v4, \vr4 - .word 0xE700 | (VX_R(v1) << 4) | VX_R(v2) - .word (VX_R(v3) << 12) - MRXBOPC VX_R(v4), 0x8C, v1, v2, v3, v4 + .word 0xE700 | ((v1&15) << 4) | (v2&15) + .word ((v3&15) << 12) + MRXBOPC (v4&15), 0x8C, v1, v2, v3, v4 .endm /* VECTOR UNPACK LOGICAL LOW */ .macro VUPLL vr1, vr2, m3 VX_NUM v1, \vr1 VX_NUM v2, \vr2 - .word 0xE700 | (VX_R(v1) << 4) | VX_R(v2) + .word 0xE700 | ((v1&15) << 4) | (v2&15) .word 0x0000 MRXBOPC \m3, 0xD4, v1, v2 .endm @@ -410,13 +413,23 @@ /* Vector integer instructions */ +/* VECTOR AND */ +.macro VN vr1, vr2, vr3 + VX_NUM v1, \vr1 + VX_NUM v2, \vr2 + VX_NUM v3, \vr3 + .word 0xE700 | ((v1&15) << 4) | (v2&15) + .word ((v3&15) << 12) + MRXBOPC 0, 0x68, v1, v2, v3 +.endm + /* VECTOR EXCLUSIVE OR */ .macro VX vr1, vr2, vr3 VX_NUM v1, \vr1 VX_NUM v2, \vr2 VX_NUM v3, \vr3 - .word 0xE700 | (VX_R(v1) << 4) | VX_R(v2) - .word (VX_R(v3) << 12) + .word 0xE700 | ((v1&15) << 4) | (v2&15) + .word ((v3&15) << 12) MRXBOPC 0, 0x6D, v1, v2, v3 .endm @@ -425,8 +438,8 @@ VX_NUM v1, \vr1 VX_NUM v2, \vr2 VX_NUM v3, \vr3 - .word 0xE700 | (VX_R(v1) << 4) | VX_R(v2) - .word (VX_R(v3) << 12) + .word 0xE700 | ((v1&15) << 4) | (v2&15) + .word ((v3&15) << 12) MRXBOPC \m4, 0xB4, v1, v2, v3 .endm .macro VGFMB vr1, vr2, vr3 @@ -448,9 +461,9 @@ VX_NUM v2, \vr2 VX_NUM v3, \vr3 VX_NUM v4, \vr4 - .word 0xE700 | (VX_R(v1) << 4) | VX_R(v2) - .word (VX_R(v3) << 12) | (\m5 << 8) - MRXBOPC VX_R(v4), 0xBC, v1, v2, v3, v4 + .word 0xE700 | ((v1&15) << 4) | (v2&15) + .word ((v3&15) << 12) | (\m5 << 8) + MRXBOPC (v4&15), 0xBC, v1, v2, v3, v4 .endm .macro VGFMAB vr1, vr2, vr3, vr4 VGFMA \vr1, \vr2, \vr3, \vr4, 0 @@ -470,11 +483,78 @@ VX_NUM v1, \vr1 VX_NUM v2, \vr2 VX_NUM v3, \vr3 - .word 0xE700 | (VX_R(v1) << 4) | VX_R(v2) - .word (VX_R(v3) << 12) + .word 0xE700 | ((v1&15) << 4) | (v2&15) + .word ((v3&15) << 12) MRXBOPC 0, 0x7D, v1, v2, v3 .endm +/* VECTOR REPLICATE IMMEDIATE */ +.macro VREPI vr1, imm2, m3 + VX_NUM v1, \vr1 + .word 0xE700 | ((v1&15) << 4) + .word \imm2 + MRXBOPC \m3, 0x45, v1 +.endm +.macro VREPIB vr1, imm2 + VREPI \vr1, \imm2, 0 +.endm +.macro VREPIH vr1, imm2 + VREPI \vr1, \imm2, 1 +.endm +.macro VREPIF vr1, imm2 + VREPI \vr1, \imm2, 2 +.endm +.macro VREPIG vr1, imm2 + VREP \vr1, \imm2, 3 +.endm + +/* VECTOR ADD */ +.macro VA vr1, vr2, vr3, m4 + VX_NUM v1, \vr1 + VX_NUM v2, \vr2 + VX_NUM v3, \vr3 + .word 0xE700 | ((v1&15) << 4) | (v2&15) + .word ((v3&15) << 12) + MRXBOPC \m4, 0xF3, v1, v2, v3 +.endm +.macro VAB vr1, vr2, vr3 + VA \vr1, \vr2, \vr3, 0 +.endm +.macro VAH vr1, vr2, vr3 + VA \vr1, \vr2, \vr3, 1 +.endm +.macro VAF vr1, vr2, vr3 + VA \vr1, \vr2, \vr3, 2 +.endm +.macro VAG vr1, vr2, vr3 + VA \vr1, \vr2, \vr3, 3 +.endm +.macro VAQ vr1, vr2, vr3 + VA \vr1, \vr2, \vr3, 4 +.endm + +/* VECTOR ELEMENT SHIFT RIGHT ARITHMETIC */ +.macro VESRAV vr1, vr2, vr3, m4 + VX_NUM v1, \vr1 + VX_NUM v2, \vr2 + VX_NUM v3, \vr3 + .word 0xE700 | ((v1&15) << 4) | (v2&15) + .word ((v3&15) << 12) + MRXBOPC \m4, 0x7A, v1, v2, v3 +.endm + +.macro VESRAVB vr1, vr2, vr3 + VESRAV \vr1, \vr2, \vr3, 0 +.endm +.macro VESRAVH vr1, vr2, vr3 + VESRAV \vr1, \vr2, \vr3, 1 +.endm +.macro VESRAVF vr1, vr2, vr3 + VESRAV \vr1, \vr2, \vr3, 2 +.endm +.macro VESRAVG vr1, vr2, vr3 + VESRAV \vr1, \vr2, \vr3, 3 +.endm #endif /* __ASSEMBLY__ */ #endif /* __ASM_S390_VX_INSN_H */ diff --git a/arch/s390/include/uapi/asm/Kbuild b/arch/s390/include/uapi/asm/Kbuild index 08fe6dad9026..cc44b09c25fc 100644 --- a/arch/s390/include/uapi/asm/Kbuild +++ b/arch/s390/include/uapi/asm/Kbuild @@ -6,6 +6,7 @@ header-y += bitsperlong.h header-y += byteorder.h header-y += chpid.h header-y += chsc.h +header-y += clp.h header-y += cmb.h header-y += dasd.h header-y += debug.h diff --git a/arch/s390/include/uapi/asm/auxvec.h b/arch/s390/include/uapi/asm/auxvec.h index a1f153e89133..c53e08442255 100644 --- a/arch/s390/include/uapi/asm/auxvec.h +++ b/arch/s390/include/uapi/asm/auxvec.h @@ -3,4 +3,6 @@ #define AT_SYSINFO_EHDR 33 +#define AT_VECTOR_SIZE_ARCH 1 /* entries in ARCH_DLINFO */ + #endif diff --git a/arch/s390/include/uapi/asm/dasd.h b/arch/s390/include/uapi/asm/dasd.h index 5812a3b2df9e..1340311dab77 100644 --- a/arch/s390/include/uapi/asm/dasd.h +++ b/arch/s390/include/uapi/asm/dasd.h @@ -187,6 +187,36 @@ typedef struct format_data_t { #define DASD_FMT_INT_INVAL 4 /* invalidate tracks */ #define DASD_FMT_INT_COMPAT 8 /* use OS/390 compatible disk layout */ +/* + * struct format_check_t + * represents all data necessary to evaluate the format of + * different tracks of a dasd + */ +typedef struct format_check_t { + /* Input */ + struct format_data_t expect; + + /* Output */ + unsigned int result; /* Error indication (DASD_FMT_ERR_*) */ + unsigned int unit; /* Track that is in error */ + unsigned int rec; /* Record that is in error */ + unsigned int num_records; /* Records in the track in error */ + unsigned int blksize; /* Blocksize of first record in error */ + unsigned int key_length; /* Key length of first record in error */ +} format_check_t; + +/* Values returned in format_check_t when a format error is detected: */ +/* Too few records were found on a single track */ +#define DASD_FMT_ERR_TOO_FEW_RECORDS 1 +/* Too many records were found on a single track */ +#define DASD_FMT_ERR_TOO_MANY_RECORDS 2 +/* Blocksize/data-length of a record was wrong */ +#define DASD_FMT_ERR_BLKSIZE 3 +/* A record ID is defined by cylinder, head, and record number (CHR). */ +/* On mismatch, this error is set */ +#define DASD_FMT_ERR_RECORD_ID 4 +/* If key-length was != 0 */ +#define DASD_FMT_ERR_KEY_LENGTH 5 /* * struct attrib_data_t @@ -288,6 +318,8 @@ struct dasd_snid_ioctl_data { /* Get Sense Path Group ID (SNID) data */ #define BIODASDSNID _IOWR(DASD_IOCTL_LETTER, 1, struct dasd_snid_ioctl_data) +/* Check device format according to format_check_t */ +#define BIODASDCHECKFMT _IOWR(DASD_IOCTL_LETTER, 2, format_check_t) #define BIODASDSYMMIO _IOWR(DASD_IOCTL_LETTER, 240, dasd_symmio_parms_t) diff --git a/arch/s390/include/uapi/asm/kvm.h b/arch/s390/include/uapi/asm/kvm.h index 347fe5afa419..a2ffec4139ad 100644 --- a/arch/s390/include/uapi/asm/kvm.h +++ b/arch/s390/include/uapi/asm/kvm.h @@ -25,6 +25,7 @@ #define KVM_DEV_FLIC_APF_DISABLE_WAIT 5 #define KVM_DEV_FLIC_ADAPTER_REGISTER 6 #define KVM_DEV_FLIC_ADAPTER_MODIFY 7 +#define KVM_DEV_FLIC_CLEAR_IO_IRQ 8 /* * We can have up to 4*64k pending subchannels + 8 adapter interrupts, * as well as up to ASYNC_PF_PER_VCPU*KVM_MAX_VCPUS pfault done interrupts. @@ -92,6 +93,47 @@ struct kvm_s390_vm_cpu_machine { __u64 fac_list[256]; }; +#define KVM_S390_VM_CPU_PROCESSOR_FEAT 2 +#define KVM_S390_VM_CPU_MACHINE_FEAT 3 + +#define KVM_S390_VM_CPU_FEAT_NR_BITS 1024 +#define KVM_S390_VM_CPU_FEAT_ESOP 0 +#define KVM_S390_VM_CPU_FEAT_SIEF2 1 +#define KVM_S390_VM_CPU_FEAT_64BSCAO 2 +#define KVM_S390_VM_CPU_FEAT_SIIF 3 +#define KVM_S390_VM_CPU_FEAT_GPERE 4 +#define KVM_S390_VM_CPU_FEAT_GSLS 5 +#define KVM_S390_VM_CPU_FEAT_IB 6 +#define KVM_S390_VM_CPU_FEAT_CEI 7 +#define KVM_S390_VM_CPU_FEAT_IBS 8 +#define KVM_S390_VM_CPU_FEAT_SKEY 9 +#define KVM_S390_VM_CPU_FEAT_CMMA 10 +#define KVM_S390_VM_CPU_FEAT_PFMFI 11 +#define KVM_S390_VM_CPU_FEAT_SIGPIF 12 +struct kvm_s390_vm_cpu_feat { + __u64 feat[16]; +}; + +#define KVM_S390_VM_CPU_PROCESSOR_SUBFUNC 4 +#define KVM_S390_VM_CPU_MACHINE_SUBFUNC 5 +/* for "test bit" instructions MSB 0 bit ordering, for "query" raw blocks */ +struct kvm_s390_vm_cpu_subfunc { + __u8 plo[32]; /* always */ + __u8 ptff[16]; /* with TOD-clock steering */ + __u8 kmac[16]; /* with MSA */ + __u8 kmc[16]; /* with MSA */ + __u8 km[16]; /* with MSA */ + __u8 kimd[16]; /* with MSA */ + __u8 klmd[16]; /* with MSA */ + __u8 pckmo[16]; /* with MSA3 */ + __u8 kmctr[16]; /* with MSA4 */ + __u8 kmf[16]; /* with MSA4 */ + __u8 kmo[16]; /* with MSA4 */ + __u8 pcc[16]; /* with MSA4 */ + __u8 ppno[16]; /* with MSA5 */ + __u8 reserved[1824]; +}; + /* kvm attributes for crypto */ #define KVM_S390_VM_CRYPTO_ENABLE_AES_KW 0 #define KVM_S390_VM_CRYPTO_ENABLE_DEA_KW 1 diff --git a/arch/s390/include/uapi/asm/ptrace.h b/arch/s390/include/uapi/asm/ptrace.h index a150f4fabe43..77630c74f13b 100644 --- a/arch/s390/include/uapi/asm/ptrace.h +++ b/arch/s390/include/uapi/asm/ptrace.h @@ -359,9 +359,9 @@ typedef struct per_cr_bits bits; } control_regs; /* - * Use these flags instead of setting em_instruction_fetch - * directly they are used so that single stepping can be - * switched on & off while not affecting other tracing + * The single_step and instruction_fetch bits are obsolete, + * the kernel always sets them to zero. To enable single + * stepping use ptrace(PTRACE_SINGLESTEP) instead. */ unsigned single_step : 1; unsigned instruction_fetch : 1; diff --git a/arch/s390/include/uapi/asm/sie.h b/arch/s390/include/uapi/asm/sie.h index 5dbaa72baa64..3ac634368939 100644 --- a/arch/s390/include/uapi/asm/sie.h +++ b/arch/s390/include/uapi/asm/sie.h @@ -16,14 +16,19 @@ { 0x01, "SIGP sense" }, \ { 0x02, "SIGP external call" }, \ { 0x03, "SIGP emergency signal" }, \ + { 0x04, "SIGP start" }, \ { 0x05, "SIGP stop" }, \ { 0x06, "SIGP restart" }, \ { 0x09, "SIGP stop and store status" }, \ { 0x0b, "SIGP initial cpu reset" }, \ + { 0x0c, "SIGP cpu reset" }, \ { 0x0d, "SIGP set prefix" }, \ { 0x0e, "SIGP store status at address" }, \ { 0x12, "SIGP set architecture" }, \ - { 0x15, "SIGP sense running" } + { 0x13, "SIGP conditional emergency signal" }, \ + { 0x15, "SIGP sense running" }, \ + { 0x16, "SIGP set multithreading"}, \ + { 0x17, "SIGP store additional status ait address"} #define icpt_prog_codes \ { 0x0001, "Prog Operation" }, \ @@ -135,6 +140,7 @@ exit_code_ipa0(0xB2, 0x4c, "TAR"), \ exit_code_ipa0(0xB2, 0x50, "CSP"), \ exit_code_ipa0(0xB2, 0x54, "MVPG"), \ + exit_code_ipa0(0xB2, 0x56, "STHYI"), \ exit_code_ipa0(0xB2, 0x58, "BSG"), \ exit_code_ipa0(0xB2, 0x5a, "BSA"), \ exit_code_ipa0(0xB2, 0x5f, "CHSC"), \ diff --git a/arch/s390/kernel/Makefile b/arch/s390/kernel/Makefile index 2f5586ab8a6a..72ccc41444dc 100644 --- a/arch/s390/kernel/Makefile +++ b/arch/s390/kernel/Makefile @@ -2,6 +2,10 @@ # Makefile for the linux kernel. # +KCOV_INSTRUMENT_early.o := n +KCOV_INSTRUMENT_sclp.o := n +KCOV_INSTRUMENT_als.o := n + ifdef CONFIG_FUNCTION_TRACER # Don't trace early setup code and tracing code CFLAGS_REMOVE_early.o = $(CC_FLAGS_FTRACE) @@ -29,23 +33,30 @@ CFLAGS_ptrace.o += -DUTS_MACHINE='"$(UTS_MACHINE)"' CFLAGS_sysinfo.o += -w # -# Use -march=z900 for sclp.c to be able to print an error message if -# the kernel is started on a machine which is too old +# Use -march=z900 for sclp.c and als.c to be able to print an error +# message if the kernel is started on a machine which is too old # CFLAGS_REMOVE_sclp.o = $(CC_FLAGS_FTRACE) +CFLAGS_REMOVE_als.o = $(CC_FLAGS_FTRACE) ifneq ($(CC_FLAGS_MARCH),-march=z900) CFLAGS_REMOVE_sclp.o += $(CC_FLAGS_MARCH) CFLAGS_sclp.o += -march=z900 +CFLAGS_REMOVE_als.o += $(CC_FLAGS_MARCH) +CFLAGS_als.o += -march=z900 AFLAGS_REMOVE_head.o += $(CC_FLAGS_MARCH) AFLAGS_head.o += -march=z900 endif GCOV_PROFILE_sclp.o := n +GCOV_PROFILE_als.o := n +UBSAN_SANITIZE_als.o := n +UBSAN_SANITIZE_early.o := n +UBSAN_SANITIZE_sclp.o := n obj-y := traps.o time.o process.o base.o early.o setup.o idle.o vtime.o obj-y += processor.o sys_s390.o ptrace.o signal.o cpcmd.o ebcdic.o nmi.o -obj-y += debug.o irq.o ipl.o dis.o diag.o sclp.o vdso.o +obj-y += debug.o irq.o ipl.o dis.o diag.o sclp.o vdso.o als.o obj-y += sysinfo.o jump_label.o lgr.o os_info.o machine_kexec.o pgm_check.o -obj-y += runtime_instr.o cache.o dumpstack.o +obj-y += runtime_instr.o cache.o fpu.o dumpstack.o obj-y += entry.o reipl.o relocate_kernel.o extra-y += head.o head64.o vmlinux.lds diff --git a/arch/s390/kernel/als.c b/arch/s390/kernel/als.c new file mode 100644 index 000000000000..a16e9d1bf9e3 --- /dev/null +++ b/arch/s390/kernel/als.c @@ -0,0 +1,124 @@ +/* + * Copyright IBM Corp. 2016 + */ +#include <linux/kernel.h> +#include <linux/init.h> +#include <asm/processor.h> +#include <asm/facility.h> +#include <asm/lowcore.h> +#include <asm/sclp.h> +#include "entry.h" + +/* + * The code within this file will be called very early. It may _not_ + * access anything within the bss section, since that is not cleared + * yet and may contain data (e.g. initrd) that must be saved by other + * code. + * For temporary objects the stack (16k) should be used. + */ + +static unsigned long als[] __initdata = { FACILITIES_ALS }; + +static void __init u16_to_hex(char *str, u16 val) +{ + int i, num; + + for (i = 1; i <= 4; i++) { + num = (val >> (16 - 4 * i)) & 0xf; + if (num >= 10) + num += 7; + *str++ = '0' + num; + } + *str = '\0'; +} + +static void __init print_machine_type(void) +{ + static char mach_str[80] __initdata = "Detected machine-type number: "; + char type_str[5]; + struct cpuid id; + + get_cpu_id(&id); + u16_to_hex(type_str, id.machine); + strcat(mach_str, type_str); + _sclp_print_early(mach_str); +} + +static void __init u16_to_decimal(char *str, u16 val) +{ + int div = 1; + + while (div * 10 <= val) + div *= 10; + while (div) { + *str++ = '0' + val / div; + val %= div; + div /= 10; + } + *str = '\0'; +} + +static void __init print_missing_facilities(void) +{ + static char als_str[80] __initdata = "Missing facilities: "; + unsigned long val; + char val_str[6]; + int i, j, first; + + first = 1; + for (i = 0; i < ARRAY_SIZE(als); i++) { + val = ~S390_lowcore.stfle_fac_list[i] & als[i]; + for (j = 0; j < BITS_PER_LONG; j++) { + if (!(val & (1UL << (BITS_PER_LONG - 1 - j)))) + continue; + if (!first) + strcat(als_str, ","); + /* + * Make sure we stay within one line. Consider that + * each facility bit adds up to five characters and + * z/VM adds a four character prefix. + */ + if (strlen(als_str) > 70) { + _sclp_print_early(als_str); + *als_str = '\0'; + } + u16_to_decimal(val_str, i * BITS_PER_LONG + j); + strcat(als_str, val_str); + first = 0; + } + } + _sclp_print_early(als_str); + _sclp_print_early("See Principles of Operations for facility bits"); +} + +static void __init facility_mismatch(void) +{ + _sclp_print_early("The Linux kernel requires more recent processor hardware"); + print_machine_type(); + print_missing_facilities(); + disabled_wait(0x8badcccc); +} + +void __init verify_facilities(void) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(S390_lowcore.stfle_fac_list); i++) + S390_lowcore.stfle_fac_list[i] = 0; + asm volatile( + " stfl 0(0)\n" + : "=m" (S390_lowcore.stfl_fac_list)); + S390_lowcore.stfle_fac_list[0] = (u64)S390_lowcore.stfl_fac_list << 32; + if (S390_lowcore.stfl_fac_list & 0x01000000) { + register unsigned long reg0 asm("0") = ARRAY_SIZE(als) - 1; + + asm volatile(".insn s,0xb2b00000,0(%1)" /* stfle */ + : "+d" (reg0) + : "a" (&S390_lowcore.stfle_fac_list) + : "memory", "cc"); + } + for (i = 0; i < ARRAY_SIZE(als); i++) { + if ((S390_lowcore.stfle_fac_list[i] & als[i]) != als[i]) + facility_mismatch(); + } +} diff --git a/arch/s390/kernel/asm-offsets.c b/arch/s390/kernel/asm-offsets.c index 1f95cc1faeb7..f3df9e0a5dec 100644 --- a/arch/s390/kernel/asm-offsets.c +++ b/arch/s390/kernel/asm-offsets.c @@ -125,6 +125,7 @@ int main(void) OFFSET(__LC_STFL_FAC_LIST, lowcore, stfl_fac_list); OFFSET(__LC_STFLE_FAC_LIST, lowcore, stfle_fac_list); OFFSET(__LC_MCCK_CODE, lowcore, mcck_interruption_code); + OFFSET(__LC_EXT_DAMAGE_CODE, lowcore, external_damage_code); OFFSET(__LC_MCCK_FAIL_STOR_ADDR, lowcore, failing_storage_address); OFFSET(__LC_LAST_BREAK, lowcore, breaking_event_addr); OFFSET(__LC_RST_OLD_PSW, lowcore, restart_old_psw); diff --git a/arch/s390/kernel/cache.c b/arch/s390/kernel/cache.c index 8ba32436effe..c8a83276a4dc 100644 --- a/arch/s390/kernel/cache.c +++ b/arch/s390/kernel/cache.c @@ -72,7 +72,6 @@ void show_cacheinfo(struct seq_file *m) if (!test_facility(34)) return; - get_online_cpus(); this_cpu_ci = get_cpu_cacheinfo(cpumask_any(cpu_online_mask)); for (idx = 0; idx < this_cpu_ci->num_leaves; idx++) { cache = this_cpu_ci->info_list + idx; @@ -86,7 +85,6 @@ void show_cacheinfo(struct seq_file *m) seq_printf(m, "associativity=%d", cache->ways_of_associativity); seq_puts(m, "\n"); } - put_online_cpus(); } static inline enum cache_type get_cache_type(struct cache_info *ci, int level) @@ -101,12 +99,7 @@ static inline enum cache_type get_cache_type(struct cache_info *ci, int level) static inline unsigned long ecag(int ai, int li, int ti) { - unsigned long cmd, val; - - cmd = ai << 4 | li << 1 | ti; - asm volatile(".insn rsy,0xeb000000004c,%0,0,0(%1)" /* ecag */ - : "=d" (val) : "a" (cmd)); - return val; + return __ecag(ECAG_CACHE_ATTRIBUTE, ai << 4 | li << 1 | ti); } static void ci_leaf_init(struct cacheinfo *this_leaf, int private, diff --git a/arch/s390/kernel/crash_dump.c b/arch/s390/kernel/crash_dump.c index 3986c9f62191..f9293bfefb7f 100644 --- a/arch/s390/kernel/crash_dump.c +++ b/arch/s390/kernel/crash_dump.c @@ -71,9 +71,7 @@ struct save_area * __init save_area_alloc(bool is_boot_cpu) */ struct save_area * __init save_area_boot_cpu(void) { - if (list_empty(&dump_save_areas)) - return NULL; - return list_first_entry(&dump_save_areas, struct save_area, list); + return list_first_entry_or_null(&dump_save_areas, struct save_area, list); } /* @@ -173,7 +171,7 @@ int copy_oldmem_kernel(void *dst, void *src, size_t count) /* * Copy memory of the old, dumped system to a user space virtual address */ -int copy_oldmem_user(void __user *dst, void *src, size_t count) +static int copy_oldmem_user(void __user *dst, void *src, size_t count) { unsigned long from, len; int rc; diff --git a/arch/s390/kernel/diag.c b/arch/s390/kernel/diag.c index 48b37b8357e6..a97354c8c667 100644 --- a/arch/s390/kernel/diag.c +++ b/arch/s390/kernel/diag.c @@ -162,6 +162,30 @@ int diag14(unsigned long rx, unsigned long ry1, unsigned long subcode) } EXPORT_SYMBOL(diag14); +static inline int __diag204(unsigned long *subcode, unsigned long size, void *addr) +{ + register unsigned long _subcode asm("0") = *subcode; + register unsigned long _size asm("1") = size; + + asm volatile( + " diag %2,%0,0x204\n" + "0: nopr %%r7\n" + EX_TABLE(0b,0b) + : "+d" (_subcode), "+d" (_size) : "d" (addr) : "memory"); + *subcode = _subcode; + return _size; +} + +int diag204(unsigned long subcode, unsigned long size, void *addr) +{ + diag_stat_inc(DIAG_STAT_X204); + size = __diag204(&subcode, size, addr); + if (subcode) + return -1; + return size; +} +EXPORT_SYMBOL(diag204); + /* * Diagnose 210: Get information about a virtual device */ @@ -196,3 +220,18 @@ int diag210(struct diag210 *addr) return ccode; } EXPORT_SYMBOL(diag210); + +int diag224(void *ptr) +{ + int rc = -EOPNOTSUPP; + + diag_stat_inc(DIAG_STAT_X224); + asm volatile( + " diag %1,%2,0x224\n" + "0: lhi %0,0x0\n" + "1:\n" + EX_TABLE(0b,1b) + : "+d" (rc) :"d" (0), "d" (ptr) : "memory"); + return rc; +} +EXPORT_SYMBOL(diag224); diff --git a/arch/s390/kernel/dis.c b/arch/s390/kernel/dis.c index 8cb9bfdd3ea8..43446fa2a4e5 100644 --- a/arch/s390/kernel/dis.c +++ b/arch/s390/kernel/dis.c @@ -26,7 +26,6 @@ #include <asm/dis.h> #include <asm/io.h> #include <linux/atomic.h> -#include <asm/mathemu.h> #include <asm/cpcmd.h> #include <asm/lowcore.h> #include <asm/debug.h> diff --git a/arch/s390/kernel/dumpstack.c b/arch/s390/kernel/dumpstack.c index 1b6081c0aff9..6693383bc01b 100644 --- a/arch/s390/kernel/dumpstack.c +++ b/arch/s390/kernel/dumpstack.c @@ -78,17 +78,37 @@ void dump_trace(dump_trace_func_t func, void *data, struct task_struct *task, sp = __dump_trace(func, data, sp, S390_lowcore.async_stack + frame_size - ASYNC_SIZE, S390_lowcore.async_stack + frame_size); - if (task) - __dump_trace(func, data, sp, - (unsigned long)task_stack_page(task), - (unsigned long)task_stack_page(task) + THREAD_SIZE); - else - __dump_trace(func, data, sp, - S390_lowcore.thread_info, - S390_lowcore.thread_info + THREAD_SIZE); + task = task ?: current; + __dump_trace(func, data, sp, + (unsigned long)task_stack_page(task), + (unsigned long)task_stack_page(task) + THREAD_SIZE); } EXPORT_SYMBOL_GPL(dump_trace); +struct return_address_data { + unsigned long address; + int depth; +}; + +static int __return_address(void *data, unsigned long address) +{ + struct return_address_data *rd = data; + + if (rd->depth--) + return 0; + rd->address = address; + return 1; +} + +unsigned long return_address(int depth) +{ + struct return_address_data rd = { .depth = depth + 2 }; + + dump_trace(__return_address, &rd, NULL, current_stack_pointer()); + return rd.address; +} +EXPORT_SYMBOL_GPL(return_address); + static int show_address(void *data, unsigned long address) { printk("([<%016lx>] %pSR)\n", address, (void *)address); diff --git a/arch/s390/kernel/early.c b/arch/s390/kernel/early.c index a0684de5a93b..2374c5b46bbc 100644 --- a/arch/s390/kernel/early.c +++ b/arch/s390/kernel/early.c @@ -13,7 +13,7 @@ #include <linux/string.h> #include <linux/ctype.h> #include <linux/lockdep.h> -#include <linux/module.h> +#include <linux/extable.h> #include <linux/pfn.h> #include <linux/uaccess.h> #include <linux/kernel.h> @@ -231,6 +231,26 @@ static noinline __init void detect_machine_type(void) S390_lowcore.machine_flags |= MACHINE_FLAG_VM; } +static noinline __init void setup_arch_string(void) +{ + struct sysinfo_1_1_1 *mach = (struct sysinfo_1_1_1 *)&sysinfo_page; + + if (stsi(mach, 1, 1, 1)) + return; + EBCASC(mach->manufacturer, sizeof(mach->manufacturer)); + EBCASC(mach->type, sizeof(mach->type)); + EBCASC(mach->model, sizeof(mach->model)); + EBCASC(mach->model_capacity, sizeof(mach->model_capacity)); + dump_stack_set_arch_desc("%-16.16s %-4.4s %-16.16s %-16.16s (%s)", + mach->manufacturer, + mach->type, + mach->model, + mach->model_capacity, + MACHINE_IS_LPAR ? "LPAR" : + MACHINE_IS_VM ? "z/VM" : + MACHINE_IS_KVM ? "KVM" : "unknown"); +} + static __init void setup_topology(void) { int max_mnest; @@ -447,11 +467,13 @@ void __init startup_init(void) ipl_save_parameters(); rescue_initrd(); clear_bss_section(); + ptff_init(); init_kernel_storage_key(); lockdep_off(); setup_lowcore_early(); setup_facility_list(); detect_machine_type(); + setup_arch_string(); ipl_update_parameters(); setup_boot_command_line(); create_kernel_nss(); diff --git a/arch/s390/kernel/entry.S b/arch/s390/kernel/entry.S index 2d47f9cfcb36..c51650a1ed16 100644 --- a/arch/s390/kernel/entry.S +++ b/arch/s390/kernel/entry.S @@ -163,6 +163,16 @@ _PIF_WORK = (_PIF_PER_TRAP) .endm .section .kprobes.text, "ax" +.Ldummy: + /* + * This nop exists only in order to avoid that __switch_to starts at + * the beginning of the kprobes text section. In that case we would + * have several symbols at the same address. E.g. objdump would take + * an arbitrary symbol name when disassembling this code. + * With the added nop in between the __switch_to symbol is unique + * again. + */ + nop 0 /* * Scheduler resume function, called by switch_to @@ -175,7 +185,6 @@ ENTRY(__switch_to) stmg %r6,%r15,__SF_GPRS(%r15) # store gprs of prev task lgr %r1,%r2 aghi %r1,__TASK_thread # thread_struct of prev task - lg %r4,__TASK_thread_info(%r2) # get thread_info of prev lg %r5,__TASK_thread_info(%r3) # get thread_info of next stg %r15,__THREAD_ksp(%r1) # store kernel stack of prev lgr %r1,%r3 diff --git a/arch/s390/kernel/entry.h b/arch/s390/kernel/entry.h index b7019ab74070..e79f030dd276 100644 --- a/arch/s390/kernel/entry.h +++ b/arch/s390/kernel/entry.h @@ -1,6 +1,7 @@ #ifndef _ENTRY_H #define _ENTRY_H +#include <linux/percpu.h> #include <linux/types.h> #include <linux/signal.h> #include <asm/ptrace.h> @@ -75,4 +76,9 @@ long sys_s390_personality(unsigned int personality); long sys_s390_runtime_instr(int command, int signum); long sys_s390_pci_mmio_write(unsigned long, const void __user *, size_t); long sys_s390_pci_mmio_read(unsigned long, void __user *, size_t); + +DECLARE_PER_CPU(u64, mt_cycles[8]); + +void verify_facilities(void); + #endif /* _ENTRY_H */ diff --git a/arch/s390/kernel/fpu.c b/arch/s390/kernel/fpu.c new file mode 100644 index 000000000000..1235b9438df4 --- /dev/null +++ b/arch/s390/kernel/fpu.c @@ -0,0 +1,176 @@ +/* + * In-kernel vector facility support functions + * + * Copyright IBM Corp. 2015 + * Author(s): Hendrik Brueckner <brueckner@linux.vnet.ibm.com> + */ +#include <linux/kernel.h> +#include <linux/cpu.h> +#include <linux/sched.h> +#include <asm/fpu/types.h> +#include <asm/fpu/api.h> + +asm(".include \"asm/vx-insn.h\"\n"); + +void __kernel_fpu_begin(struct kernel_fpu *state, u32 flags) +{ + /* + * Limit the save to the FPU/vector registers already + * in use by the previous context + */ + flags &= state->mask; + + if (flags & KERNEL_FPC) + /* Save floating point control */ + asm volatile("stfpc %0" : "=m" (state->fpc)); + + if (!MACHINE_HAS_VX) { + if (flags & KERNEL_VXR_V0V7) { + /* Save floating-point registers */ + asm volatile("std 0,%0" : "=Q" (state->fprs[0])); + asm volatile("std 1,%0" : "=Q" (state->fprs[1])); + asm volatile("std 2,%0" : "=Q" (state->fprs[2])); + asm volatile("std 3,%0" : "=Q" (state->fprs[3])); + asm volatile("std 4,%0" : "=Q" (state->fprs[4])); + asm volatile("std 5,%0" : "=Q" (state->fprs[5])); + asm volatile("std 6,%0" : "=Q" (state->fprs[6])); + asm volatile("std 7,%0" : "=Q" (state->fprs[7])); + asm volatile("std 8,%0" : "=Q" (state->fprs[8])); + asm volatile("std 9,%0" : "=Q" (state->fprs[9])); + asm volatile("std 10,%0" : "=Q" (state->fprs[10])); + asm volatile("std 11,%0" : "=Q" (state->fprs[11])); + asm volatile("std 12,%0" : "=Q" (state->fprs[12])); + asm volatile("std 13,%0" : "=Q" (state->fprs[13])); + asm volatile("std 14,%0" : "=Q" (state->fprs[14])); + asm volatile("std 15,%0" : "=Q" (state->fprs[15])); + } + return; + } + + /* Test and save vector registers */ + asm volatile ( + /* + * Test if any vector register must be saved and, if so, + * test if all register can be saved. + */ + " la 1,%[vxrs]\n" /* load save area */ + " tmll %[m],30\n" /* KERNEL_VXR */ + " jz 7f\n" /* no work -> done */ + " jo 5f\n" /* -> save V0..V31 */ + /* + * Test for special case KERNEL_FPU_MID only. In this + * case a vstm V8..V23 is the best instruction + */ + " chi %[m],12\n" /* KERNEL_VXR_MID */ + " jne 0f\n" /* -> save V8..V23 */ + " VSTM 8,23,128,1\n" /* vstm %v8,%v23,128(%r1) */ + " j 7f\n" + /* Test and save the first half of 16 vector registers */ + "0: tmll %[m],6\n" /* KERNEL_VXR_LOW */ + " jz 3f\n" /* -> KERNEL_VXR_HIGH */ + " jo 2f\n" /* 11 -> save V0..V15 */ + " brc 2,1f\n" /* 10 -> save V8..V15 */ + " VSTM 0,7,0,1\n" /* vstm %v0,%v7,0(%r1) */ + " j 3f\n" + "1: VSTM 8,15,128,1\n" /* vstm %v8,%v15,128(%r1) */ + " j 3f\n" + "2: VSTM 0,15,0,1\n" /* vstm %v0,%v15,0(%r1) */ + /* Test and save the second half of 16 vector registers */ + "3: tmll %[m],24\n" /* KERNEL_VXR_HIGH */ + " jz 7f\n" + " jo 6f\n" /* 11 -> save V16..V31 */ + " brc 2,4f\n" /* 10 -> save V24..V31 */ + " VSTM 16,23,256,1\n" /* vstm %v16,%v23,256(%r1) */ + " j 7f\n" + "4: VSTM 24,31,384,1\n" /* vstm %v24,%v31,384(%r1) */ + " j 7f\n" + "5: VSTM 0,15,0,1\n" /* vstm %v0,%v15,0(%r1) */ + "6: VSTM 16,31,256,1\n" /* vstm %v16,%v31,256(%r1) */ + "7:" + : [vxrs] "=Q" (*(struct vx_array *) &state->vxrs) + : [m] "d" (flags) + : "1", "cc"); +} +EXPORT_SYMBOL(__kernel_fpu_begin); + +void __kernel_fpu_end(struct kernel_fpu *state, u32 flags) +{ + /* + * Limit the restore to the FPU/vector registers of the + * previous context that have been overwritte by the + * current context + */ + flags &= state->mask; + + if (flags & KERNEL_FPC) + /* Restore floating-point controls */ + asm volatile("lfpc %0" : : "Q" (state->fpc)); + + if (!MACHINE_HAS_VX) { + if (flags & KERNEL_VXR_V0V7) { + /* Restore floating-point registers */ + asm volatile("ld 0,%0" : : "Q" (state->fprs[0])); + asm volatile("ld 1,%0" : : "Q" (state->fprs[1])); + asm volatile("ld 2,%0" : : "Q" (state->fprs[2])); + asm volatile("ld 3,%0" : : "Q" (state->fprs[3])); + asm volatile("ld 4,%0" : : "Q" (state->fprs[4])); + asm volatile("ld 5,%0" : : "Q" (state->fprs[5])); + asm volatile("ld 6,%0" : : "Q" (state->fprs[6])); + asm volatile("ld 7,%0" : : "Q" (state->fprs[7])); + asm volatile("ld 8,%0" : : "Q" (state->fprs[8])); + asm volatile("ld 9,%0" : : "Q" (state->fprs[9])); + asm volatile("ld 10,%0" : : "Q" (state->fprs[10])); + asm volatile("ld 11,%0" : : "Q" (state->fprs[11])); + asm volatile("ld 12,%0" : : "Q" (state->fprs[12])); + asm volatile("ld 13,%0" : : "Q" (state->fprs[13])); + asm volatile("ld 14,%0" : : "Q" (state->fprs[14])); + asm volatile("ld 15,%0" : : "Q" (state->fprs[15])); + } + return; + } + + /* Test and restore (load) vector registers */ + asm volatile ( + /* + * Test if any vector register must be loaded and, if so, + * test if all registers can be loaded at once. + */ + " la 1,%[vxrs]\n" /* load restore area */ + " tmll %[m],30\n" /* KERNEL_VXR */ + " jz 7f\n" /* no work -> done */ + " jo 5f\n" /* -> restore V0..V31 */ + /* + * Test for special case KERNEL_FPU_MID only. In this + * case a vlm V8..V23 is the best instruction + */ + " chi %[m],12\n" /* KERNEL_VXR_MID */ + " jne 0f\n" /* -> restore V8..V23 */ + " VLM 8,23,128,1\n" /* vlm %v8,%v23,128(%r1) */ + " j 7f\n" + /* Test and restore the first half of 16 vector registers */ + "0: tmll %[m],6\n" /* KERNEL_VXR_LOW */ + " jz 3f\n" /* -> KERNEL_VXR_HIGH */ + " jo 2f\n" /* 11 -> restore V0..V15 */ + " brc 2,1f\n" /* 10 -> restore V8..V15 */ + " VLM 0,7,0,1\n" /* vlm %v0,%v7,0(%r1) */ + " j 3f\n" + "1: VLM 8,15,128,1\n" /* vlm %v8,%v15,128(%r1) */ + " j 3f\n" + "2: VLM 0,15,0,1\n" /* vlm %v0,%v15,0(%r1) */ + /* Test and restore the second half of 16 vector registers */ + "3: tmll %[m],24\n" /* KERNEL_VXR_HIGH */ + " jz 7f\n" + " jo 6f\n" /* 11 -> restore V16..V31 */ + " brc 2,4f\n" /* 10 -> restore V24..V31 */ + " VLM 16,23,256,1\n" /* vlm %v16,%v23,256(%r1) */ + " j 7f\n" + "4: VLM 24,31,384,1\n" /* vlm %v24,%v31,384(%r1) */ + " j 7f\n" + "5: VLM 0,15,0,1\n" /* vlm %v0,%v15,0(%r1) */ + "6: VLM 16,31,256,1\n" /* vlm %v16,%v31,256(%r1) */ + "7:" + : [vxrs] "=Q" (*(struct vx_array *) &state->vxrs) + : [m] "d" (flags) + : "1", "cc"); +} +EXPORT_SYMBOL(__kernel_fpu_end); diff --git a/arch/s390/kernel/ftrace.c b/arch/s390/kernel/ftrace.c index 0f7bfeba6da6..60a8a4e207ed 100644 --- a/arch/s390/kernel/ftrace.c +++ b/arch/s390/kernel/ftrace.c @@ -209,7 +209,8 @@ unsigned long prepare_ftrace_return(unsigned long parent, unsigned long ip) /* Only trace if the calling function expects to. */ if (!ftrace_graph_entry(&trace)) goto out; - if (ftrace_push_return_trace(parent, ip, &trace.depth, 0) == -EBUSY) + if (ftrace_push_return_trace(parent, ip, &trace.depth, 0, + NULL) == -EBUSY) goto out; parent = (unsigned long) return_to_handler; out: diff --git a/arch/s390/kernel/head.S b/arch/s390/kernel/head.S index fcaefb041364..4431905f8cfa 100644 --- a/arch/s390/kernel/head.S +++ b/arch/s390/kernel/head.S @@ -306,49 +306,16 @@ ENTRY(startup_kdump) stck __LC_LAST_UPDATE_CLOCK spt 6f-.LPG0(%r13) mvc __LC_LAST_UPDATE_TIMER(8),6f-.LPG0(%r13) - stfl 0(%r0) # store facilities @ __LC_STFL_FAC_LIST - mvc __LC_STFLE_FAC_LIST(4),__LC_STFL_FAC_LIST - tm __LC_STFLE_FAC_LIST,0x01 # stfle available ? - jz 0f - lghi %r0,FACILITIES_ALS_DWORDS-1 - .insn s,0xb2b00000,__LC_STFLE_FAC_LIST # store facility list extended - # verify if all required facilities are supported by the machine -0: la %r1,__LC_STFLE_FAC_LIST - la %r2,3f+8-.LPG0(%r13) - lhi %r3,FACILITIES_ALS_DWORDS -1: lg %r0,0(%r1) - ng %r0,0(%r2) - clg %r0,0(%r2) - jne 2f - la %r1,8(%r1) - la %r2,8(%r2) - ahi %r3,-1 - jnz 1b - j 4f -2: l %r15,.Lstack-.LPG0(%r13) + l %r15,.Lstack-.LPG0(%r13) ahi %r15,-STACK_FRAME_OVERHEAD - la %r2,.Lals_string-.LPG0(%r13) - l %r3,.Lsclp_print-.LPG0(%r13) - basr %r14,%r3 - lpsw 3f-.LPG0(%r13) # machine type not good enough, crash -.Lals_string: - .asciz "The Linux kernel requires more recent processor hardware" -.Lsclp_print: - .long _sclp_print_early -.Lstack: - .long 0x8000 + (1<<(PAGE_SHIFT+THREAD_ORDER)) - .align 16 -3: .long 0x000a0000,0x8badcccc - -# List of facilities that are required. If not all facilities are present -# the kernel will crash. - - .quad FACILITIES_ALS - -4: - /* Continue with startup code in head64.S */ + brasl %r14,verify_facilities +# For uncompressed images, continue in +# arch/s390/kernel/head64.S. For compressed images, continue in +# arch/s390/boot/compressed/head.S. jg startup_continue +.Lstack: + .long 0x8000 + (1<<(PAGE_SHIFT+THREAD_ORDER)) .align 8 6: .long 0x7fffffff,0xffffffff diff --git a/arch/s390/kernel/ipl.c b/arch/s390/kernel/ipl.c index f20abdb5630a..295bfb7124bc 100644 --- a/arch/s390/kernel/ipl.c +++ b/arch/s390/kernel/ipl.c @@ -121,9 +121,9 @@ static char *dump_type_str(enum dump_type type) * Must be in data section since the bss section * is not cleared when these are accessed. */ -static u8 ipl_ssid __attribute__((__section__(".data"))) = 0; -static u16 ipl_devno __attribute__((__section__(".data"))) = 0; -u32 ipl_flags __attribute__((__section__(".data"))) = 0; +static u8 ipl_ssid __section(.data) = 0; +static u16 ipl_devno __section(.data) = 0; +u32 ipl_flags __section(.data) = 0; enum ipl_method { REIPL_METHOD_CCW_CIO, @@ -174,7 +174,7 @@ static inline int __diag308(unsigned long subcode, void *addr) asm volatile( " diag %0,%2,0x308\n" - "0:\n" + "0: nopr %%r7\n" EX_TABLE(0b,0b) : "+d" (_addr), "+d" (_rc) : "d" (subcode) : "cc", "memory"); @@ -563,7 +563,7 @@ static struct kset *ipl_kset; static void __ipl_run(void *unused) { - diag308(DIAG308_IPL, NULL); + diag308(DIAG308_LOAD_CLEAR, NULL); if (MACHINE_IS_VM) __cpcmd("IPL", NULL, 0, NULL); else if (ipl_info.type == IPL_TYPE_CCW) @@ -1085,21 +1085,24 @@ static void __reipl_run(void *unused) break; case REIPL_METHOD_CCW_DIAG: diag308(DIAG308_SET, reipl_block_ccw); - diag308(DIAG308_IPL, NULL); + if (MACHINE_IS_LPAR) + diag308(DIAG308_LOAD_NORMAL_DUMP, NULL); + else + diag308(DIAG308_LOAD_CLEAR, NULL); break; case REIPL_METHOD_FCP_RW_DIAG: diag308(DIAG308_SET, reipl_block_fcp); - diag308(DIAG308_IPL, NULL); + diag308(DIAG308_LOAD_CLEAR, NULL); break; case REIPL_METHOD_FCP_RO_DIAG: - diag308(DIAG308_IPL, NULL); + diag308(DIAG308_LOAD_CLEAR, NULL); break; case REIPL_METHOD_FCP_RO_VM: __cpcmd("IPL", NULL, 0, NULL); break; case REIPL_METHOD_NSS_DIAG: diag308(DIAG308_SET, reipl_block_nss); - diag308(DIAG308_IPL, NULL); + diag308(DIAG308_LOAD_CLEAR, NULL); break; case REIPL_METHOD_NSS: get_ipl_string(buf, reipl_block_nss, REIPL_METHOD_NSS); @@ -1108,7 +1111,7 @@ static void __reipl_run(void *unused) case REIPL_METHOD_DEFAULT: if (MACHINE_IS_VM) __cpcmd("IPL", NULL, 0, NULL); - diag308(DIAG308_IPL, NULL); + diag308(DIAG308_LOAD_CLEAR, NULL); break; case REIPL_METHOD_FCP_DUMP: break; @@ -1423,7 +1426,7 @@ static void diag308_dump(void *dump_block) { diag308(DIAG308_SET, dump_block); while (1) { - if (diag308(DIAG308_DUMP, NULL) != 0x302) + if (diag308(DIAG308_LOAD_NORMAL_DUMP, NULL) != 0x302) break; udelay_simple(USEC_PER_SEC); } @@ -2064,12 +2067,5 @@ void s390_reset_system(void) S390_lowcore.program_new_psw.addr = (unsigned long) s390_base_pgm_handler; - /* - * Clear subchannel ID and number to signal new kernel that no CCW or - * SCSI IPL has been done (for kexec and kdump) - */ - S390_lowcore.subchannel_id = 0; - S390_lowcore.subchannel_nr = 0; - do_reset_calls(); } diff --git a/arch/s390/kernel/irq.c b/arch/s390/kernel/irq.c index c373a1d41d10..285d6561076d 100644 --- a/arch/s390/kernel/irq.c +++ b/arch/s390/kernel/irq.c @@ -127,9 +127,7 @@ int show_interrupts(struct seq_file *p, void *v) seq_printf(p, "CPU%d ", cpu); seq_putc(p, '\n'); } - if (index < NR_IRQS) { - if (index >= NR_IRQS_BASE) - goto out; + if (index < NR_IRQS_BASE) { seq_printf(p, "%s: ", irqclass_main_desc[index].name); irq = irqclass_main_desc[index].irq; for_each_online_cpu(cpu) @@ -137,6 +135,9 @@ int show_interrupts(struct seq_file *p, void *v) seq_putc(p, '\n'); goto out; } + if (index > NR_IRQS_BASE) + goto out; + for (index = 0; index < NR_ARCH_IRQS; index++) { seq_printf(p, "%s: ", irqclass_sub_desc[index].name); irq = irqclass_sub_desc[index].irq; diff --git a/arch/s390/kernel/kprobes.c b/arch/s390/kernel/kprobes.c index 250f5972536a..fdb40424acfe 100644 --- a/arch/s390/kernel/kprobes.c +++ b/arch/s390/kernel/kprobes.c @@ -26,12 +26,14 @@ #include <linux/stop_machine.h> #include <linux/kdebug.h> #include <linux/uaccess.h> +#include <linux/extable.h> #include <linux/module.h> #include <linux/slab.h> #include <linux/hardirq.h> #include <linux/ftrace.h> #include <asm/cacheflush.h> #include <asm/sections.h> +#include <asm/uaccess.h> #include <asm/dis.h> DEFINE_PER_CPU(struct kprobe *, current_kprobe); @@ -690,6 +692,15 @@ int setjmp_pre_handler(struct kprobe *p, struct pt_regs *regs) stack = (unsigned long) regs->gprs[15]; memcpy(kcb->jprobes_stack, (void *) stack, MIN_STACK_SIZE(stack)); + + /* + * jprobes use jprobe_return() which skips the normal return + * path of the function, and this messes up the accounting of the + * function graph tracer to get messed up. + * + * Pause function graph tracing while performing the jprobe function. + */ + pause_graph_tracing(); return 1; } NOKPROBE_SYMBOL(setjmp_pre_handler); @@ -705,6 +716,9 @@ int longjmp_break_handler(struct kprobe *p, struct pt_regs *regs) struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); unsigned long stack; + /* It's OK to start function graph tracing again */ + unpause_graph_tracing(); + stack = (unsigned long) kcb->jprobe_saved_regs.gprs[15]; /* Put the regs back */ diff --git a/arch/s390/kernel/machine_kexec.c b/arch/s390/kernel/machine_kexec.c index 2f1b7217c25c..3074c1d83829 100644 --- a/arch/s390/kernel/machine_kexec.c +++ b/arch/s390/kernel/machine_kexec.c @@ -24,6 +24,7 @@ #include <asm/diag.h> #include <asm/elf.h> #include <asm/asm-offsets.h> +#include <asm/cacheflush.h> #include <asm/os_info.h> #include <asm/switch_to.h> @@ -43,13 +44,13 @@ static int machine_kdump_pm_cb(struct notifier_block *nb, unsigned long action, switch (action) { case PM_SUSPEND_PREPARE: case PM_HIBERNATION_PREPARE: - if (crashk_res.start) - crash_map_reserved_pages(); + if (kexec_crash_image) + arch_kexec_unprotect_crashkres(); break; case PM_POST_SUSPEND: case PM_POST_HIBERNATION: - if (crashk_res.start) - crash_unmap_reserved_pages(); + if (kexec_crash_image) + arch_kexec_protect_crashkres(); break; default: return NOTIFY_DONE; @@ -146,42 +147,46 @@ static int kdump_csum_valid(struct kimage *image) #endif } -/* - * Map or unmap crashkernel memory - */ -static void crash_map_pages(int enable) +#ifdef CONFIG_CRASH_DUMP + +void crash_free_reserved_phys_range(unsigned long begin, unsigned long end) { - unsigned long size = resource_size(&crashk_res); - - BUG_ON(crashk_res.start % KEXEC_CRASH_MEM_ALIGN || - size % KEXEC_CRASH_MEM_ALIGN); - if (enable) - vmem_add_mapping(crashk_res.start, size); - else { - vmem_remove_mapping(crashk_res.start, size); - if (size) - os_info_crashkernel_add(crashk_res.start, size); - else - os_info_crashkernel_add(0, 0); - } + unsigned long addr, size; + + for (addr = begin; addr < end; addr += PAGE_SIZE) + free_reserved_page(pfn_to_page(addr >> PAGE_SHIFT)); + size = begin - crashk_res.start; + if (size) + os_info_crashkernel_add(crashk_res.start, size); + else + os_info_crashkernel_add(0, 0); } -/* - * Map crashkernel memory - */ -void crash_map_reserved_pages(void) +static void crash_protect_pages(int protect) +{ + unsigned long size; + + if (!crashk_res.end) + return; + size = resource_size(&crashk_res); + if (protect) + set_memory_ro(crashk_res.start, size >> PAGE_SHIFT); + else + set_memory_rw(crashk_res.start, size >> PAGE_SHIFT); +} + +void arch_kexec_protect_crashkres(void) { - crash_map_pages(1); + crash_protect_pages(1); } -/* - * Unmap crashkernel memory - */ -void crash_unmap_reserved_pages(void) +void arch_kexec_unprotect_crashkres(void) { - crash_map_pages(0); + crash_protect_pages(0); } +#endif + /* * Give back memory to hypervisor before new kdump is loaded */ diff --git a/arch/s390/kernel/module.c b/arch/s390/kernel/module.c index 7873e171457c..fbc07891f9e7 100644 --- a/arch/s390/kernel/module.c +++ b/arch/s390/kernel/module.c @@ -51,6 +51,10 @@ void *module_alloc(unsigned long size) void module_arch_freeing_init(struct module *mod) { + if (is_livepatch_module(mod) && + mod->state == MODULE_STATE_LIVE) + return; + vfree(mod->arch.syminfo); mod->arch.syminfo = NULL; } @@ -425,7 +429,5 @@ int module_finalize(const Elf_Ehdr *hdr, struct module *me) { jump_label_apply_nops(me); - vfree(me->arch.syminfo); - me->arch.syminfo = NULL; return 0; } diff --git a/arch/s390/kernel/nmi.c b/arch/s390/kernel/nmi.c index 07302ce37648..9a32f7419d78 100644 --- a/arch/s390/kernel/nmi.c +++ b/arch/s390/kernel/nmi.c @@ -16,7 +16,7 @@ #include <linux/module.h> #include <asm/lowcore.h> #include <asm/smp.h> -#include <asm/etr.h> +#include <asm/stp.h> #include <asm/cputime.h> #include <asm/nmi.h> #include <asm/crw.h> @@ -27,7 +27,6 @@ struct mcck_struct { unsigned int kill_task : 1; unsigned int channel_report : 1; unsigned int warning : 1; - unsigned int etr_queue : 1; unsigned int stp_queue : 1; unsigned long mcck_code; }; @@ -82,8 +81,6 @@ void s390_handle_mcck(void) if (xchg(&mchchk_wng_posted, 1) == 0) kill_cad_pid(SIGPWR, 1); } - if (mcck.etr_queue) - etr_queue_work(); if (mcck.stp_queue) stp_queue_work(); if (mcck.kill_task) { @@ -101,7 +98,7 @@ EXPORT_SYMBOL_GPL(s390_handle_mcck); * returns 0 if all registers could be validated * returns 1 otherwise */ -static int notrace s390_validate_registers(union mci mci) +static int notrace s390_validate_registers(union mci mci, int umode) { int kill_task; u64 zero; @@ -113,26 +110,41 @@ static int notrace s390_validate_registers(union mci mci) if (!mci.gr) { /* * General purpose registers couldn't be restored and have - * unknown contents. Process needs to be terminated. + * unknown contents. Stop system or terminate process. */ + if (!umode) + s390_handle_damage(); kill_task = 1; } if (!mci.fp) { /* - * Floating point registers can't be restored and - * therefore the process needs to be terminated. + * Floating point registers can't be restored. If the + * kernel currently uses floating point registers the + * system is stopped. If the process has its floating + * pointer registers loaded it is terminated. + * Otherwise just revalidate the registers. */ - kill_task = 1; + if (S390_lowcore.fpu_flags & KERNEL_VXR_V0V7) + s390_handle_damage(); + if (!test_cpu_flag(CIF_FPU)) + kill_task = 1; } fpt_save_area = &S390_lowcore.floating_pt_save_area; fpt_creg_save_area = &S390_lowcore.fpt_creg_save_area; if (!mci.fc) { /* * Floating point control register can't be restored. - * Task will be terminated. + * If the kernel currently uses the floating pointer + * registers and needs the FPC register the system is + * stopped. If the process has its floating pointer + * registers loaded it is terminated. Otherwiese the + * FPC is just revalidated. */ + if (S390_lowcore.fpu_flags & KERNEL_FPC) + s390_handle_damage(); asm volatile("lfpc 0(%0)" : : "a" (&zero), "m" (zero)); - kill_task = 1; + if (!test_cpu_flag(CIF_FPU)) + kill_task = 1; } else asm volatile("lfpc 0(%0)" : : "a" (fpt_creg_save_area)); @@ -162,10 +174,16 @@ static int notrace s390_validate_registers(union mci mci) if (!mci.vr) { /* - * Vector registers can't be restored and therefore - * the process needs to be terminated. + * Vector registers can't be restored. If the kernel + * currently uses vector registers the system is + * stopped. If the process has its vector registers + * loaded it is terminated. Otherwise just revalidate + * the registers. */ - kill_task = 1; + if (S390_lowcore.fpu_flags & KERNEL_VXR) + s390_handle_damage(); + if (!test_cpu_flag(CIF_FPU)) + kill_task = 1; } cr0.val = S390_lowcore.cregs_save_area[0]; cr0.afp = cr0.vx = 1; @@ -241,8 +259,6 @@ static int notrace s390_validate_registers(union mci mci) #define ED_STP_ISLAND 6 /* External damage STP island check */ #define ED_STP_SYNC 7 /* External damage STP sync check */ -#define ED_ETR_SYNC 12 /* External damage ETR sync check */ -#define ED_ETR_SWITCH 13 /* External damage ETR switch to local */ /* * machine check handler. @@ -255,13 +271,11 @@ void notrace s390_do_machine_check(struct pt_regs *regs) struct mcck_struct *mcck; unsigned long long tmp; union mci mci; - int umode; nmi_enter(); inc_irq_stat(NMI_NMI); mci.val = S390_lowcore.mcck_interruption_code; mcck = this_cpu_ptr(&cpu_mcck); - umode = user_mode(regs); if (mci.sd) { /* System damage -> stopping machine */ @@ -302,22 +316,14 @@ void notrace s390_do_machine_check(struct pt_regs *regs) s390_handle_damage(); } } - if (s390_validate_registers(mci)) { - if (umode) { - /* - * Couldn't restore all register contents while in - * user mode -> mark task for termination. - */ - mcck->kill_task = 1; - mcck->mcck_code = mci.val; - set_cpu_flag(CIF_MCCK_PENDING); - } else { - /* - * Couldn't restore all register contents while in - * kernel mode -> stopping machine. - */ - s390_handle_damage(); - } + if (s390_validate_registers(mci, user_mode(regs))) { + /* + * Couldn't restore all register contents for the + * user space process -> mark task for termination. + */ + mcck->kill_task = 1; + mcck->mcck_code = mci.val; + set_cpu_flag(CIF_MCCK_PENDING); } if (mci.cd) { /* Timing facility damage */ @@ -325,15 +331,11 @@ void notrace s390_do_machine_check(struct pt_regs *regs) } if (mci.ed && mci.ec) { /* External damage */ - if (S390_lowcore.external_damage_code & (1U << ED_ETR_SYNC)) - mcck->etr_queue |= etr_sync_check(); - if (S390_lowcore.external_damage_code & (1U << ED_ETR_SWITCH)) - mcck->etr_queue |= etr_switch_to_local(); if (S390_lowcore.external_damage_code & (1U << ED_STP_SYNC)) mcck->stp_queue |= stp_sync_check(); if (S390_lowcore.external_damage_code & (1U << ED_STP_ISLAND)) mcck->stp_queue |= stp_island_check(); - if (mcck->etr_queue || mcck->stp_queue) + if (mcck->stp_queue) set_cpu_flag(CIF_MCCK_PENDING); } if (mci.se) diff --git a/arch/s390/kernel/perf_cpum_cf.c b/arch/s390/kernel/perf_cpum_cf.c index 62f066b5259e..037c2a253ae4 100644 --- a/arch/s390/kernel/perf_cpum_cf.c +++ b/arch/s390/kernel/perf_cpum_cf.c @@ -649,6 +649,8 @@ static int cpumf_pmu_commit_txn(struct pmu *pmu) /* Performance monitoring unit for s390x */ static struct pmu cpumf_pmu = { + .task_ctx_nr = perf_sw_context, + .capabilities = PERF_PMU_CAP_NO_INTERRUPT, .pmu_enable = cpumf_pmu_enable, .pmu_disable = cpumf_pmu_disable, .event_init = cpumf_pmu_event_init, @@ -662,27 +664,22 @@ static struct pmu cpumf_pmu = { .cancel_txn = cpumf_pmu_cancel_txn, }; -static int cpumf_pmu_notifier(struct notifier_block *self, unsigned long action, - void *hcpu) +static int cpumf_pmf_setup(unsigned int cpu, int flags) { - unsigned int cpu = (long) hcpu; - int flags; - - switch (action & ~CPU_TASKS_FROZEN) { - case CPU_ONLINE: - case CPU_DOWN_FAILED: - flags = PMC_INIT; - smp_call_function_single(cpu, setup_pmc_cpu, &flags, 1); - break; - case CPU_DOWN_PREPARE: - flags = PMC_RELEASE; - smp_call_function_single(cpu, setup_pmc_cpu, &flags, 1); - break; - default: - break; - } + local_irq_disable(); + setup_pmc_cpu(&flags); + local_irq_enable(); + return 0; +} - return NOTIFY_OK; +static int s390_pmu_online_cpu(unsigned int cpu) +{ + return cpumf_pmf_setup(cpu, PMC_INIT); +} + +static int s390_pmu_offline_cpu(unsigned int cpu) +{ + return cpumf_pmf_setup(cpu, PMC_RELEASE); } static int __init cpumf_pmu_init(void) @@ -702,25 +699,19 @@ static int __init cpumf_pmu_init(void) if (rc) { pr_err("Registering for CPU-measurement alerts " "failed with rc=%i\n", rc); - goto out; + return rc; } - /* The CPU measurement counter facility does not have overflow - * interrupts to do sampling. Sampling must be provided by - * external means, for example, by timers. - */ - cpumf_pmu.capabilities |= PERF_PMU_CAP_NO_INTERRUPT; - cpumf_pmu.attr_groups = cpumf_cf_event_group(); rc = perf_pmu_register(&cpumf_pmu, "cpum_cf", PERF_TYPE_RAW); if (rc) { pr_err("Registering the cpum_cf PMU failed with rc=%i\n", rc); unregister_external_irq(EXT_IRQ_MEASURE_ALERT, cpumf_measurement_alert); - goto out; + return rc; } - perf_cpu_notifier(cpumf_pmu_notifier); -out: - return rc; + return cpuhp_setup_state(CPUHP_AP_PERF_S390_CF_ONLINE, + "AP_PERF_S390_CF_ONLINE", + s390_pmu_online_cpu, s390_pmu_offline_cpu); } early_initcall(cpumf_pmu_init); diff --git a/arch/s390/kernel/perf_cpum_sf.c b/arch/s390/kernel/perf_cpum_sf.c index eaab9a7cb3be..fcc634c1479a 100644 --- a/arch/s390/kernel/perf_cpum_sf.c +++ b/arch/s390/kernel/perf_cpum_sf.c @@ -601,17 +601,12 @@ static void release_pmc_hardware(void) irq_subclass_unregister(IRQ_SUBCLASS_MEASUREMENT_ALERT); on_each_cpu(setup_pmc_cpu, &flags, 1); - perf_release_sampling(); } static int reserve_pmc_hardware(void) { int flags = PMC_INIT; - int err; - err = perf_reserve_sampling(); - if (err) - return err; on_each_cpu(setup_pmc_cpu, &flags, 1); if (flags & PMC_FAILURE) { release_pmc_hardware(); @@ -979,12 +974,15 @@ static int perf_push_sample(struct perf_event *event, struct sf_raw_sample *sfr) struct pt_regs regs; struct perf_sf_sde_regs *sde_regs; struct perf_sample_data data; - struct perf_raw_record raw; + struct perf_raw_record raw = { + .frag = { + .size = sfr->size, + .data = sfr, + }, + }; /* Setup perf sample */ perf_sample_data_init(&data, 0, event->hw.last_period); - raw.size = sfr->size; - raw.data = sfr; data.raw = &raw; /* Setup pt_regs to look like an CPU-measurement external interrupt @@ -1506,34 +1504,28 @@ static void cpumf_measurement_alert(struct ext_code ext_code, sf_disable(); } } - -static int cpumf_pmu_notifier(struct notifier_block *self, - unsigned long action, void *hcpu) +static int cpusf_pmu_setup(unsigned int cpu, int flags) { - unsigned int cpu = (long) hcpu; - int flags; - /* Ignore the notification if no events are scheduled on the PMU. * This might be racy... */ if (!atomic_read(&num_events)) - return NOTIFY_OK; + return 0; - switch (action & ~CPU_TASKS_FROZEN) { - case CPU_ONLINE: - case CPU_DOWN_FAILED: - flags = PMC_INIT; - smp_call_function_single(cpu, setup_pmc_cpu, &flags, 1); - break; - case CPU_DOWN_PREPARE: - flags = PMC_RELEASE; - smp_call_function_single(cpu, setup_pmc_cpu, &flags, 1); - break; - default: - break; - } + local_irq_disable(); + setup_pmc_cpu(&flags); + local_irq_enable(); + return 0; +} + +static int s390_pmu_sf_online_cpu(unsigned int cpu) +{ + return cpusf_pmu_setup(cpu, PMC_INIT); +} - return NOTIFY_OK; +static int s390_pmu_sf_offline_cpu(unsigned int cpu) +{ + return cpusf_pmu_setup(cpu, PMC_RELEASE); } static int param_get_sfb_size(char *buffer, const struct kernel_param *kp) @@ -1633,7 +1625,9 @@ static int __init init_cpum_sampling_pmu(void) cpumf_measurement_alert); goto out; } - perf_cpu_notifier(cpumf_pmu_notifier); + + cpuhp_setup_state(CPUHP_AP_PERF_S390_SF_ONLINE, "AP_PERF_S390_SF_ONLINE", + s390_pmu_sf_online_cpu, s390_pmu_sf_offline_cpu); out: return err; } diff --git a/arch/s390/kernel/perf_event.c b/arch/s390/kernel/perf_event.c index c3e4099b60a5..17431f63de00 100644 --- a/arch/s390/kernel/perf_event.c +++ b/arch/s390/kernel/perf_event.c @@ -224,13 +224,13 @@ arch_initcall(service_level_perf_register); static int __perf_callchain_kernel(void *data, unsigned long address) { - struct perf_callchain_entry *entry = data; + struct perf_callchain_entry_ctx *entry = data; perf_callchain_store(entry, address); return 0; } -void perf_callchain_kernel(struct perf_callchain_entry *entry, +void perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs) { if (user_mode(regs)) @@ -248,33 +248,3 @@ ssize_t cpumf_events_sysfs_show(struct device *dev, return sprintf(page, "event=0x%04llx,name=%s\n", pmu_attr->id, attr->attr.name); } - -/* Reserve/release functions for sharing perf hardware */ -static DEFINE_SPINLOCK(perf_hw_owner_lock); -static void *perf_sampling_owner; - -int perf_reserve_sampling(void) -{ - int err; - - err = 0; - spin_lock(&perf_hw_owner_lock); - if (perf_sampling_owner) { - pr_warn("The sampling facility is already reserved by %p\n", - perf_sampling_owner); - err = -EBUSY; - } else - perf_sampling_owner = __builtin_return_address(0); - spin_unlock(&perf_hw_owner_lock); - return err; -} -EXPORT_SYMBOL(perf_reserve_sampling); - -void perf_release_sampling(void) -{ - spin_lock(&perf_hw_owner_lock); - WARN_ON(!perf_sampling_owner); - perf_sampling_owner = NULL; - spin_unlock(&perf_hw_owner_lock); -} -EXPORT_SYMBOL(perf_release_sampling); diff --git a/arch/s390/kernel/process.c b/arch/s390/kernel/process.c index 2bba7df4ac51..bba4fa74b321 100644 --- a/arch/s390/kernel/process.c +++ b/arch/s390/kernel/process.c @@ -7,6 +7,7 @@ * Denis Joseph Barrow, */ +#include <linux/elf-randomize.h> #include <linux/compiler.h> #include <linux/cpu.h> #include <linux/sched.h> @@ -37,9 +38,6 @@ asmlinkage void ret_from_fork(void) asm ("ret_from_fork"); -/* FPU save area for the init task */ -__vector128 init_task_fpu_regs[__NUM_VXRS] __init_task_data; - /* * Return saved PC of a blocked thread. used in kernel/sched. * resume in entry.S does not create a new stack frame, it @@ -70,9 +68,10 @@ extern void kernel_thread_starter(void); /* * Free current thread data structures etc.. */ -void exit_thread(void) +void exit_thread(struct task_struct *tsk) { - exit_thread_runtime_instr(); + if (tsk == current) + exit_thread_runtime_instr(); } void flush_thread(void) @@ -85,35 +84,19 @@ void release_thread(struct task_struct *dead_task) void arch_release_task_struct(struct task_struct *tsk) { - /* Free either the floating-point or the vector register save area */ - kfree(tsk->thread.fpu.regs); } int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) { - size_t fpu_regs_size; - - *dst = *src; - - /* - * If the vector extension is available, it is enabled for all tasks, - * and, thus, the FPU register save area must be allocated accordingly. - */ - fpu_regs_size = MACHINE_HAS_VX ? sizeof(__vector128) * __NUM_VXRS - : sizeof(freg_t) * __NUM_FPRS; - dst->thread.fpu.regs = kzalloc(fpu_regs_size, GFP_KERNEL|__GFP_REPEAT); - if (!dst->thread.fpu.regs) - return -ENOMEM; - /* * Save the floating-point or vector register state of the current * task and set the CIF_FPU flag to lazy restore the FPU register * state when returning to user space. */ save_fpu_regs(); - dst->thread.fpu.fpc = current->thread.fpu.fpc; - memcpy(dst->thread.fpu.regs, current->thread.fpu.regs, fpu_regs_size); + memcpy(dst, src, arch_task_struct_size); + dst->thread.fpu.regs = dst->thread.fpu.fprs; return 0; } diff --git a/arch/s390/kernel/processor.c b/arch/s390/kernel/processor.c index 647128d5b983..81d0808085e6 100644 --- a/arch/s390/kernel/processor.c +++ b/arch/s390/kernel/processor.c @@ -6,18 +6,52 @@ #define KMSG_COMPONENT "cpu" #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#include <linux/cpufeature.h> #include <linux/kernel.h> #include <linux/init.h> #include <linux/seq_file.h> #include <linux/delay.h> #include <linux/cpu.h> #include <asm/diag.h> +#include <asm/facility.h> #include <asm/elf.h> #include <asm/lowcore.h> #include <asm/param.h> #include <asm/smp.h> -static DEFINE_PER_CPU(struct cpuid, cpu_id); +struct cpu_info { + unsigned int cpu_mhz_dynamic; + unsigned int cpu_mhz_static; + struct cpuid cpu_id; +}; + +static DEFINE_PER_CPU(struct cpu_info, cpu_info); + +static bool machine_has_cpu_mhz; + +void __init cpu_detect_mhz_feature(void) +{ + if (test_facility(34) && __ecag(ECAG_CPU_ATTRIBUTE, 0) != -1UL) + machine_has_cpu_mhz = 1; +} + +static void update_cpu_mhz(void *arg) +{ + unsigned long mhz; + struct cpu_info *c; + + mhz = __ecag(ECAG_CPU_ATTRIBUTE, 0); + c = this_cpu_ptr(&cpu_info); + c->cpu_mhz_dynamic = mhz >> 32; + c->cpu_mhz_static = mhz & 0xffffffff; +} + +void s390_update_cpu_mhz(void) +{ + s390_adjust_jiffies(); + if (machine_has_cpu_mhz) + on_each_cpu(update_cpu_mhz, NULL, 0); +} void notrace cpu_relax(void) { @@ -34,9 +68,11 @@ EXPORT_SYMBOL(cpu_relax); */ void cpu_init(void) { - struct cpuid *id = this_cpu_ptr(&cpu_id); + struct cpuid *id = this_cpu_ptr(&cpu_info.cpu_id); get_cpu_id(id); + if (machine_has_cpu_mhz) + update_cpu_mhz(NULL); atomic_inc(&init_mm.mm_count); current->active_mm = &init_mm; BUG_ON(current->mm); @@ -52,10 +88,7 @@ int cpu_have_feature(unsigned int num) } EXPORT_SYMBOL(cpu_have_feature); -/* - * show_cpuinfo - Get information on one CPU for use by procfs. - */ -static int show_cpuinfo(struct seq_file *m, void *v) +static void show_cpu_summary(struct seq_file *m, void *v) { static const char *hwcap_str[] = { "esan3", "zarch", "stfle", "msa", "ldisp", "eimm", "dfp", @@ -64,52 +97,80 @@ static int show_cpuinfo(struct seq_file *m, void *v) static const char * const int_hwcap_str[] = { "sie" }; - unsigned long n = (unsigned long) v - 1; - int i; - - if (!n) { - s390_adjust_jiffies(); - seq_printf(m, "vendor_id : IBM/S390\n" - "# processors : %i\n" - "bogomips per cpu: %lu.%02lu\n", - num_online_cpus(), loops_per_jiffy/(500000/HZ), - (loops_per_jiffy/(5000/HZ))%100); - seq_puts(m, "features\t: "); - for (i = 0; i < ARRAY_SIZE(hwcap_str); i++) - if (hwcap_str[i] && (elf_hwcap & (1UL << i))) - seq_printf(m, "%s ", hwcap_str[i]); - for (i = 0; i < ARRAY_SIZE(int_hwcap_str); i++) - if (int_hwcap_str[i] && (int_hwcap & (1UL << i))) - seq_printf(m, "%s ", int_hwcap_str[i]); - seq_puts(m, "\n"); - show_cacheinfo(m); - } - get_online_cpus(); - if (cpu_online(n)) { - struct cpuid *id = &per_cpu(cpu_id, n); - seq_printf(m, "processor %li: " + int i, cpu; + + seq_printf(m, "vendor_id : IBM/S390\n" + "# processors : %i\n" + "bogomips per cpu: %lu.%02lu\n", + num_online_cpus(), loops_per_jiffy/(500000/HZ), + (loops_per_jiffy/(5000/HZ))%100); + seq_printf(m, "max thread id : %d\n", smp_cpu_mtid); + seq_puts(m, "features\t: "); + for (i = 0; i < ARRAY_SIZE(hwcap_str); i++) + if (hwcap_str[i] && (elf_hwcap & (1UL << i))) + seq_printf(m, "%s ", hwcap_str[i]); + for (i = 0; i < ARRAY_SIZE(int_hwcap_str); i++) + if (int_hwcap_str[i] && (int_hwcap & (1UL << i))) + seq_printf(m, "%s ", int_hwcap_str[i]); + seq_puts(m, "\n"); + show_cacheinfo(m); + for_each_online_cpu(cpu) { + struct cpuid *id = &per_cpu(cpu_info.cpu_id, cpu); + + seq_printf(m, "processor %d: " "version = %02X, " "identification = %06X, " "machine = %04X\n", - n, id->version, id->ident, id->machine); + cpu, id->version, id->ident, id->machine); } - put_online_cpus(); +} + +static void show_cpu_mhz(struct seq_file *m, unsigned long n) +{ + struct cpu_info *c = per_cpu_ptr(&cpu_info, n); + + seq_printf(m, "cpu MHz dynamic : %d\n", c->cpu_mhz_dynamic); + seq_printf(m, "cpu MHz static : %d\n", c->cpu_mhz_static); +} + +/* + * show_cpuinfo - Get information on one CPU for use by procfs. + */ +static int show_cpuinfo(struct seq_file *m, void *v) +{ + unsigned long n = (unsigned long) v - 1; + + if (!n) + show_cpu_summary(m, v); + if (!machine_has_cpu_mhz) + return 0; + seq_printf(m, "\ncpu number : %ld\n", n); + show_cpu_mhz(m, n); return 0; } +static inline void *c_update(loff_t *pos) +{ + if (*pos) + *pos = cpumask_next(*pos - 1, cpu_online_mask); + return *pos < nr_cpu_ids ? (void *)*pos + 1 : NULL; +} + static void *c_start(struct seq_file *m, loff_t *pos) { - return *pos < nr_cpu_ids ? (void *)((unsigned long) *pos + 1) : NULL; + get_online_cpus(); + return c_update(pos); } static void *c_next(struct seq_file *m, void *v, loff_t *pos) { ++*pos; - return c_start(m, pos); + return c_update(pos); } static void c_stop(struct seq_file *m, void *v) { + put_online_cpus(); } const struct seq_operations cpuinfo_op = { @@ -118,4 +179,3 @@ const struct seq_operations cpuinfo_op = { .stop = c_stop, .show = show_cpuinfo, }; - diff --git a/arch/s390/kernel/ptrace.c b/arch/s390/kernel/ptrace.c index 49b1c13bf6c9..9336e824e2db 100644 --- a/arch/s390/kernel/ptrace.c +++ b/arch/s390/kernel/ptrace.c @@ -821,14 +821,7 @@ long compat_arch_ptrace(struct task_struct *child, compat_long_t request, asmlinkage long do_syscall_trace_enter(struct pt_regs *regs) { - long ret = 0; - - /* Do the secure computing check first. */ - if (secure_computing()) { - /* seccomp failures shouldn't expose any additional code. */ - ret = -1; - goto out; - } + unsigned long mask = -1UL; /* * The sysc_tracesys code in entry.S stored the system @@ -843,17 +836,26 @@ asmlinkage long do_syscall_trace_enter(struct pt_regs *regs) * the system call and the system call restart handling. */ clear_pt_regs_flag(regs, PIF_SYSCALL); - ret = -1; + return -1; + } + + /* Do the secure computing check after ptrace. */ + if (secure_computing(NULL)) { + /* seccomp failures shouldn't expose any additional code. */ + return -1; } if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT))) trace_sys_enter(regs, regs->gprs[2]); - audit_syscall_entry(regs->gprs[2], regs->orig_gpr2, - regs->gprs[3], regs->gprs[4], - regs->gprs[5]); -out: - return ret ?: regs->gprs[2]; + if (is_compat_task()) + mask = 0xffffffff; + + audit_syscall_entry(regs->gprs[2], regs->orig_gpr2 & mask, + regs->gprs[3] &mask, regs->gprs[4] &mask, + regs->gprs[5] &mask); + + return regs->gprs[2]; } asmlinkage void do_syscall_trace_exit(struct pt_regs *regs) diff --git a/arch/s390/kernel/sclp.c b/arch/s390/kernel/sclp.c index d88db40bdf15..f08af675f36f 100644 --- a/arch/s390/kernel/sclp.c +++ b/arch/s390/kernel/sclp.c @@ -12,8 +12,9 @@ #define EVTYP_VT220MSG_MASK 0x00000040 #define EVTYP_MSG_MASK 0x40000000 -static char _sclp_work_area[4096] __aligned(PAGE_SIZE); -static bool have_vt220, have_linemode; +static char _sclp_work_area[4096] __aligned(PAGE_SIZE) __section(data); +static bool have_vt220 __section(data); +static bool have_linemode __section(data); static void _sclp_wait_int(void) { diff --git a/arch/s390/kernel/setup.c b/arch/s390/kernel/setup.c index d3f9688f26b5..7f7ba5f23f13 100644 --- a/arch/s390/kernel/setup.c +++ b/arch/s390/kernel/setup.c @@ -130,17 +130,14 @@ __setup("condev=", condev_setup); static void __init set_preferred_console(void) { - if (MACHINE_IS_KVM) { - if (sclp.has_vt220) - add_preferred_console("ttyS", 1, NULL); - else if (sclp.has_linemode) - add_preferred_console("ttyS", 0, NULL); - else - add_preferred_console("hvc", 0, NULL); - } else if (CONSOLE_IS_3215 || CONSOLE_IS_SCLP) + if (CONSOLE_IS_3215 || CONSOLE_IS_SCLP) add_preferred_console("ttyS", 0, NULL); else if (CONSOLE_IS_3270) add_preferred_console("tty3270", 0, NULL); + else if (CONSOLE_IS_VT220) + add_preferred_console("ttyS", 1, NULL); + else if (CONSOLE_IS_HVC) + add_preferred_console("hvc", 0, NULL); } static int __init conmode_setup(char *str) @@ -206,6 +203,13 @@ static void __init conmode_default(void) SET_CONSOLE_SCLP; #endif } + } else if (MACHINE_IS_KVM) { + if (sclp.has_vt220 && IS_ENABLED(CONFIG_SCLP_VT220_CONSOLE)) + SET_CONSOLE_VT220; + else if (sclp.has_linemode && IS_ENABLED(CONFIG_SCLP_CONSOLE)) + SET_CONSOLE_SCLP; + else + SET_CONSOLE_HVC; } else { #if defined(CONFIG_SCLP_CONSOLE) || defined(CONFIG_SCLP_VT220_CONSOLE) SET_CONSOLE_SCLP; @@ -289,7 +293,7 @@ static int __init parse_vmalloc(char *arg) } early_param("vmalloc", parse_vmalloc); -void *restart_stack __attribute__((__section__(".data"))); +void *restart_stack __section(.data); static void __init setup_lowcore(void) { @@ -432,6 +436,20 @@ static void __init setup_resources(void) } } } +#ifdef CONFIG_CRASH_DUMP + /* + * Re-add removed crash kernel memory as reserved memory. This makes + * sure it will be mapped with the identity mapping and struct pages + * will be created, so it can be resized later on. + * However add it later since the crash kernel resource should not be + * part of the System RAM resource. + */ + if (crashk_res.end) { + memblock_add(crashk_res.start, resource_size(&crashk_res)); + memblock_reserve(crashk_res.start, resource_size(&crashk_res)); + insert_resource(&iomem_resource, &crashk_res); + } +#endif } static void __init setup_memory_end(void) @@ -602,7 +620,6 @@ static void __init reserve_crashkernel(void) diag10_range(PFN_DOWN(crash_base), PFN_DOWN(crash_size)); crashk_res.start = crash_base; crashk_res.end = crash_base + crash_size - 1; - insert_resource(&iomem_resource, &crashk_res); memblock_remove(crash_base, crash_size); pr_info("Reserving %lluMB of memory at %lluMB " "for crashkernel (System RAM: %luMB)\n", @@ -809,6 +826,22 @@ static void __init setup_randomness(void) } /* + * Find the correct size for the task_struct. This depends on + * the size of the struct fpu at the end of the thread_struct + * which is embedded in the task_struct. + */ +static void __init setup_task_size(void) +{ + int task_size = sizeof(struct task_struct); + + if (!MACHINE_HAS_VX) { + task_size -= sizeof(__vector128) * __NUM_VXRS; + task_size += sizeof(freg_t) * __NUM_FPRS; + } + arch_task_struct_size = task_size; +} + +/* * Setup function called from init/main.c just after the banner * was printed. */ @@ -846,6 +879,7 @@ void __init setup_arch(char **cmdline_p) os_info_init(); setup_ipl(); + setup_task_size(); /* Do some memory reservations *before* memory is added to memblock */ reserve_memory_end(); @@ -884,6 +918,7 @@ void __init setup_arch(char **cmdline_p) setup_vmcoreinfo(); setup_lowcore(); smp_fill_possible_mask(); + cpu_detect_mhz_feature(); cpu_init(); numa_setup(); diff --git a/arch/s390/kernel/smp.c b/arch/s390/kernel/smp.c index 40a6b4f9c36c..35531fe1c5ea 100644 --- a/arch/s390/kernel/smp.c +++ b/arch/s390/kernel/smp.c @@ -242,10 +242,8 @@ static void pcpu_prepare_secondary(struct pcpu *pcpu, int cpu) { struct lowcore *lc = pcpu->lowcore; - if (MACHINE_HAS_TLB_LC) - cpumask_set_cpu(cpu, &init_mm.context.cpu_attach_mask); + cpumask_set_cpu(cpu, &init_mm.context.cpu_attach_mask); cpumask_set_cpu(cpu, mm_cpumask(&init_mm)); - atomic_inc(&init_mm.context.attach_count); lc->cpu_nr = cpu; lc->spinlock_lockval = arch_spin_lockval(cpu); lc->percpu_offset = __per_cpu_offset[cpu]; @@ -320,17 +318,11 @@ static void pcpu_delegate(struct pcpu *pcpu, void (*func)(void *), */ static int pcpu_set_smt(unsigned int mtid) { - register unsigned long reg1 asm ("1") = (unsigned long) mtid; int cc; if (smp_cpu_mtid == mtid) return 0; - asm volatile( - " sigp %1,0,%2 # sigp set multi-threading\n" - " ipm %0\n" - " srl %0,28\n" - : "=d" (cc) : "d" (reg1), "K" (SIGP_SET_MULTI_THREADING) - : "cc"); + cc = __pcpu_sigp(0, SIGP_SET_MULTI_THREADING, mtid, NULL); if (cc == 0) { smp_cpu_mtid = mtid; smp_cpu_mt_shift = 0; @@ -832,7 +824,7 @@ int __cpu_up(unsigned int cpu, struct task_struct *tidle) pcpu_attach_task(pcpu, tidle); pcpu_start_fn(pcpu, smp_start_secondary, NULL); /* Wait until cpu puts itself in the online & active maps */ - while (!cpu_online(cpu) || !cpu_active(cpu)) + while (!cpu_online(cpu)) cpu_relax(); return 0; } @@ -876,10 +868,8 @@ void __cpu_die(unsigned int cpu) while (!pcpu_stopped(pcpu)) cpu_relax(); pcpu_free_lowcore(pcpu); - atomic_dec(&init_mm.context.attach_count); cpumask_clear_cpu(cpu, mm_cpumask(&init_mm)); - if (MACHINE_HAS_TLB_LC) - cpumask_clear_cpu(cpu, &init_mm.context.cpu_attach_mask); + cpumask_clear_cpu(cpu, &init_mm.context.cpu_attach_mask); } void __noreturn cpu_die(void) @@ -897,7 +887,7 @@ void __init smp_fill_possible_mask(void) sclp_max = max(sclp.mtid, sclp.mtid_cp) + 1; sclp_max = min(smp_max_threads, sclp_max); - sclp_max = sclp.max_cores * sclp_max ?: nr_cpu_ids; + sclp_max = (sclp.max_cores * sclp_max) ?: nr_cpu_ids; possible = setup_possible_cpus ?: nr_cpu_ids; possible = min(possible, sclp_max); for (cpu = 0; cpu < possible && cpu < nr_cpu_ids; cpu++) diff --git a/arch/s390/kernel/sysinfo.c b/arch/s390/kernel/sysinfo.c index f7dba3887a54..bfda6aa40280 100644 --- a/arch/s390/kernel/sysinfo.c +++ b/arch/s390/kernel/sysinfo.c @@ -16,21 +16,11 @@ #include <asm/sysinfo.h> #include <asm/cpcmd.h> #include <asm/topology.h> - -/* Sigh, math-emu. Don't ask. */ -#include <asm/sfp-util.h> -#include <math-emu/soft-fp.h> -#include <math-emu/single.h> +#include <asm/fpu/api.h> int topology_max_mnest; -/* - * stsi - store system information - * - * Returns the current configuration level if function code 0 was specified. - * Otherwise returns 0 on success or a negative value on error. - */ -int stsi(void *sysinfo, int fc, int sel1, int sel2) +static inline int __stsi(void *sysinfo, int fc, int sel1, int sel2, int *lvl) { register int r0 asm("0") = (fc << 28) | sel1; register int r1 asm("1") = sel2; @@ -45,9 +35,24 @@ int stsi(void *sysinfo, int fc, int sel1, int sel2) : "+d" (r0), "+d" (rc) : "d" (r1), "a" (sysinfo), "K" (-EOPNOTSUPP) : "cc", "memory"); + *lvl = ((unsigned int) r0) >> 28; + return rc; +} + +/* + * stsi - store system information + * + * Returns the current configuration level if function code 0 was specified. + * Otherwise returns 0 on success or a negative value on error. + */ +int stsi(void *sysinfo, int fc, int sel1, int sel2) +{ + int lvl, rc; + + rc = __stsi(sysinfo, fc, sel1, sel2, &lvl); if (rc) return rc; - return fc ? 0 : ((unsigned int) r0) >> 28; + return fc ? 0 : lvl; } EXPORT_SYMBOL(stsi); @@ -414,10 +419,8 @@ subsys_initcall(create_proc_service_level); void s390_adjust_jiffies(void) { struct sysinfo_1_2_2 *info; - const unsigned int fmil = 0x4b189680; /* 1e7 as 32-bit float. */ - FP_DECL_S(SA); FP_DECL_S(SB); FP_DECL_S(SR); - FP_DECL_EX; - unsigned int capability; + unsigned long capability; + struct kernel_fpu fpu; info = (void *) get_zeroed_page(GFP_KERNEL); if (!info) @@ -433,15 +436,25 @@ void s390_adjust_jiffies(void) * higher cpu capacity. Bogomips are the other way round. * To get to a halfway suitable number we divide 1e7 * by the cpu capability number. Yes, that means a floating - * point division .. math-emu here we come :-) + * point division .. */ - FP_UNPACK_SP(SA, &fmil); - if ((info->capability >> 23) == 0) - FP_FROM_INT_S(SB, (long) info->capability, 64, long); - else - FP_UNPACK_SP(SB, &info->capability); - FP_DIV_S(SR, SA, SB); - FP_TO_INT_S(capability, SR, 32, 0); + kernel_fpu_begin(&fpu, KERNEL_FPR); + asm volatile( + " sfpc %3\n" + " l %0,%1\n" + " tmlh %0,0xff80\n" + " jnz 0f\n" + " cefbr %%f2,%0\n" + " j 1f\n" + "0: le %%f2,%1\n" + "1: cefbr %%f0,%2\n" + " debr %%f0,%%f2\n" + " cgebr %0,5,%%f0\n" + : "=&d" (capability) + : "Q" (info->capability), "d" (10000000), "d" (0) + : "cc" + ); + kernel_fpu_end(&fpu, KERNEL_FPR); } else /* * Really old machine without stsi block for basic diff --git a/arch/s390/kernel/time.c b/arch/s390/kernel/time.c index 9409d32f285e..0bfcc492987e 100644 --- a/arch/s390/kernel/time.c +++ b/arch/s390/kernel/time.c @@ -39,20 +39,17 @@ #include <linux/gfp.h> #include <linux/kprobes.h> #include <asm/uaccess.h> +#include <asm/facility.h> #include <asm/delay.h> #include <asm/div64.h> #include <asm/vdso.h> #include <asm/irq.h> #include <asm/irq_regs.h> #include <asm/vtimer.h> -#include <asm/etr.h> +#include <asm/stp.h> #include <asm/cio.h> #include "entry.h" -/* change this if you have some constant time drift */ -#define USECS_PER_JIFFY ((unsigned long) 1000000/HZ) -#define CLK_TICKS_PER_JIFFY ((unsigned long) USECS_PER_JIFFY << 12) - u64 sched_clock_base_cc = -1; /* Force to data section. */ EXPORT_SYMBOL_GPL(sched_clock_base_cc); @@ -61,6 +58,32 @@ static DEFINE_PER_CPU(struct clock_event_device, comparators); ATOMIC_NOTIFIER_HEAD(s390_epoch_delta_notifier); EXPORT_SYMBOL(s390_epoch_delta_notifier); +unsigned char ptff_function_mask[16]; +unsigned long lpar_offset; +unsigned long initial_leap_seconds; + +/* + * Get time offsets with PTFF + */ +void __init ptff_init(void) +{ + struct ptff_qto qto; + struct ptff_qui qui; + + if (!test_facility(28)) + return; + ptff(&ptff_function_mask, sizeof(ptff_function_mask), PTFF_QAF); + + /* get LPAR offset */ + if (ptff_query(PTFF_QTO) && ptff(&qto, sizeof(qto), PTFF_QTO) == 0) + lpar_offset = qto.tod_epoch_difference; + + /* get initial leap seconds */ + if (ptff_query(PTFF_QUI) && ptff(&qui, sizeof(qui), PTFF_QUI) == 0) + initial_leap_seconds = (unsigned long) + ((long) qui.old_leap * 4096000000L); +} + /* * Scheduler clock - returns current time in nanosec units. */ @@ -162,30 +185,32 @@ static void clock_comparator_interrupt(struct ext_code ext_code, set_clock_comparator(S390_lowcore.clock_comparator); } -static void etr_timing_alert(struct etr_irq_parm *); static void stp_timing_alert(struct stp_irq_parm *); static void timing_alert_interrupt(struct ext_code ext_code, unsigned int param32, unsigned long param64) { inc_irq_stat(IRQEXT_TLA); - if (param32 & 0x00c40000) - etr_timing_alert((struct etr_irq_parm *) ¶m32); if (param32 & 0x00038000) stp_timing_alert((struct stp_irq_parm *) ¶m32); } -static void etr_reset(void); static void stp_reset(void); void read_persistent_clock64(struct timespec64 *ts) { - tod_to_timeval(get_tod_clock() - TOD_UNIX_EPOCH, ts); + __u64 clock; + + clock = get_tod_clock() - initial_leap_seconds; + tod_to_timeval(clock - TOD_UNIX_EPOCH, ts); } void read_boot_clock64(struct timespec64 *ts) { - tod_to_timeval(sched_clock_base_cc - TOD_UNIX_EPOCH, ts); + __u64 clock; + + clock = sched_clock_base_cc - initial_leap_seconds; + tod_to_timeval(clock - TOD_UNIX_EPOCH, ts); } static cycle_t read_tod_clock(struct clocksource *cs) @@ -253,13 +278,8 @@ extern struct timezone sys_tz; void update_vsyscall_tz(void) { - /* Make userspace gettimeofday spin until we're done. */ - ++vdso_data->tb_update_count; - smp_wmb(); vdso_data->tz_minuteswest = sys_tz.tz_minuteswest; vdso_data->tz_dsttime = sys_tz.tz_dsttime; - smp_wmb(); - ++vdso_data->tb_update_count; } /* @@ -269,7 +289,6 @@ void update_vsyscall_tz(void) void __init time_init(void) { /* Reset time synchronization interfaces. */ - etr_reset(); stp_reset(); /* request the clock comparator external interrupt */ @@ -290,100 +309,59 @@ void __init time_init(void) vtime_init(); } -/* - * The time is "clock". old is what we think the time is. - * Adjust the value by a multiple of jiffies and add the delta to ntp. - * "delay" is an approximation how long the synchronization took. If - * the time correction is positive, then "delay" is subtracted from - * the time difference and only the remaining part is passed to ntp. - */ -static unsigned long long adjust_time(unsigned long long old, - unsigned long long clock, - unsigned long long delay) -{ - unsigned long long delta, ticks; - struct timex adjust; - - if (clock > old) { - /* It is later than we thought. */ - delta = ticks = clock - old; - delta = ticks = (delta < delay) ? 0 : delta - delay; - delta -= do_div(ticks, CLK_TICKS_PER_JIFFY); - adjust.offset = ticks * (1000000 / HZ); - } else { - /* It is earlier than we thought. */ - delta = ticks = old - clock; - delta -= do_div(ticks, CLK_TICKS_PER_JIFFY); - delta = -delta; - adjust.offset = -ticks * (1000000 / HZ); - } - sched_clock_base_cc += delta; - if (adjust.offset != 0) { - pr_notice("The ETR interface has adjusted the clock " - "by %li microseconds\n", adjust.offset); - adjust.modes = ADJ_OFFSET_SINGLESHOT; - do_adjtimex(&adjust); - } - return delta; -} - static DEFINE_PER_CPU(atomic_t, clock_sync_word); static DEFINE_MUTEX(clock_sync_mutex); static unsigned long clock_sync_flags; -#define CLOCK_SYNC_HAS_ETR 0 -#define CLOCK_SYNC_HAS_STP 1 -#define CLOCK_SYNC_ETR 2 -#define CLOCK_SYNC_STP 3 +#define CLOCK_SYNC_HAS_STP 0 +#define CLOCK_SYNC_STP 1 /* - * The synchronous get_clock function. It will write the current clock - * value to the clock pointer and return 0 if the clock is in sync with - * the external time source. If the clock mode is local it will return - * -EOPNOTSUPP and -EAGAIN if the clock is not in sync with the external - * reference. + * The get_clock function for the physical clock. It will get the current + * TOD clock, subtract the LPAR offset and write the result to *clock. + * The function returns 0 if the clock is in sync with the external time + * source. If the clock mode is local it will return -EOPNOTSUPP and + * -EAGAIN if the clock is not in sync with the external reference. */ -int get_sync_clock(unsigned long long *clock) +int get_phys_clock(unsigned long long *clock) { atomic_t *sw_ptr; unsigned int sw0, sw1; sw_ptr = &get_cpu_var(clock_sync_word); sw0 = atomic_read(sw_ptr); - *clock = get_tod_clock(); + *clock = get_tod_clock() - lpar_offset; sw1 = atomic_read(sw_ptr); put_cpu_var(clock_sync_word); if (sw0 == sw1 && (sw0 & 0x80000000U)) /* Success: time is in sync. */ return 0; - if (!test_bit(CLOCK_SYNC_HAS_ETR, &clock_sync_flags) && - !test_bit(CLOCK_SYNC_HAS_STP, &clock_sync_flags)) + if (!test_bit(CLOCK_SYNC_HAS_STP, &clock_sync_flags)) return -EOPNOTSUPP; - if (!test_bit(CLOCK_SYNC_ETR, &clock_sync_flags) && - !test_bit(CLOCK_SYNC_STP, &clock_sync_flags)) + if (!test_bit(CLOCK_SYNC_STP, &clock_sync_flags)) return -EACCES; return -EAGAIN; } -EXPORT_SYMBOL(get_sync_clock); +EXPORT_SYMBOL(get_phys_clock); /* - * Make get_sync_clock return -EAGAIN. + * Make get_phys_clock() return -EAGAIN. */ static void disable_sync_clock(void *dummy) { atomic_t *sw_ptr = this_cpu_ptr(&clock_sync_word); /* - * Clear the in-sync bit 2^31. All get_sync_clock calls will + * Clear the in-sync bit 2^31. All get_phys_clock calls will * fail until the sync bit is turned back on. In addition * increase the "sequence" counter to avoid the race of an - * etr event and the complete recovery against get_sync_clock. + * stp event and the complete recovery against get_phys_clock. */ atomic_andnot(0x80000000, sw_ptr); atomic_inc(sw_ptr); } /* - * Make get_sync_clock return 0 again. + * Make get_phys_clock() return 0 again. * Needs to be called from a context disabled for preemption. */ static void enable_sync_clock(void) @@ -406,7 +384,7 @@ static inline int check_sync_clock(void) return rc; } -/* Single threaded workqueue used for etr and stp sync events */ +/* Single threaded workqueue used for stp sync events */ static struct workqueue_struct *time_sync_wq; static void __init time_init_wq(void) @@ -416,319 +394,16 @@ static void __init time_init_wq(void) time_sync_wq = create_singlethread_workqueue("timesync"); } -/* - * External Time Reference (ETR) code. - */ -static int etr_port0_online; -static int etr_port1_online; -static int etr_steai_available; - -static int __init early_parse_etr(char *p) -{ - if (strncmp(p, "off", 3) == 0) - etr_port0_online = etr_port1_online = 0; - else if (strncmp(p, "port0", 5) == 0) - etr_port0_online = 1; - else if (strncmp(p, "port1", 5) == 0) - etr_port1_online = 1; - else if (strncmp(p, "on", 2) == 0) - etr_port0_online = etr_port1_online = 1; - return 0; -} -early_param("etr", early_parse_etr); - -enum etr_event { - ETR_EVENT_PORT0_CHANGE, - ETR_EVENT_PORT1_CHANGE, - ETR_EVENT_PORT_ALERT, - ETR_EVENT_SYNC_CHECK, - ETR_EVENT_SWITCH_LOCAL, - ETR_EVENT_UPDATE, -}; - -/* - * Valid bit combinations of the eacr register are (x = don't care): - * e0 e1 dp p0 p1 ea es sl - * 0 0 x 0 0 0 0 0 initial, disabled state - * 0 0 x 0 1 1 0 0 port 1 online - * 0 0 x 1 0 1 0 0 port 0 online - * 0 0 x 1 1 1 0 0 both ports online - * 0 1 x 0 1 1 0 0 port 1 online and usable, ETR or PPS mode - * 0 1 x 0 1 1 0 1 port 1 online, usable and ETR mode - * 0 1 x 0 1 1 1 0 port 1 online, usable, PPS mode, in-sync - * 0 1 x 0 1 1 1 1 port 1 online, usable, ETR mode, in-sync - * 0 1 x 1 1 1 0 0 both ports online, port 1 usable - * 0 1 x 1 1 1 1 0 both ports online, port 1 usable, PPS mode, in-sync - * 0 1 x 1 1 1 1 1 both ports online, port 1 usable, ETR mode, in-sync - * 1 0 x 1 0 1 0 0 port 0 online and usable, ETR or PPS mode - * 1 0 x 1 0 1 0 1 port 0 online, usable and ETR mode - * 1 0 x 1 0 1 1 0 port 0 online, usable, PPS mode, in-sync - * 1 0 x 1 0 1 1 1 port 0 online, usable, ETR mode, in-sync - * 1 0 x 1 1 1 0 0 both ports online, port 0 usable - * 1 0 x 1 1 1 1 0 both ports online, port 0 usable, PPS mode, in-sync - * 1 0 x 1 1 1 1 1 both ports online, port 0 usable, ETR mode, in-sync - * 1 1 x 1 1 1 1 0 both ports online & usable, ETR, in-sync - * 1 1 x 1 1 1 1 1 both ports online & usable, ETR, in-sync - */ -static struct etr_eacr etr_eacr; -static u64 etr_tolec; /* time of last eacr update */ -static struct etr_aib etr_port0; -static int etr_port0_uptodate; -static struct etr_aib etr_port1; -static int etr_port1_uptodate; -static unsigned long etr_events; -static struct timer_list etr_timer; - -static void etr_timeout(unsigned long dummy); -static void etr_work_fn(struct work_struct *work); -static DEFINE_MUTEX(etr_work_mutex); -static DECLARE_WORK(etr_work, etr_work_fn); - -/* - * Reset ETR attachment. - */ -static void etr_reset(void) -{ - etr_eacr = (struct etr_eacr) { - .e0 = 0, .e1 = 0, ._pad0 = 4, .dp = 0, - .p0 = 0, .p1 = 0, ._pad1 = 0, .ea = 0, - .es = 0, .sl = 0 }; - if (etr_setr(&etr_eacr) == 0) { - etr_tolec = get_tod_clock(); - set_bit(CLOCK_SYNC_HAS_ETR, &clock_sync_flags); - if (etr_port0_online && etr_port1_online) - set_bit(CLOCK_SYNC_ETR, &clock_sync_flags); - } else if (etr_port0_online || etr_port1_online) { - pr_warn("The real or virtual hardware system does not provide an ETR interface\n"); - etr_port0_online = etr_port1_online = 0; - } -} - -static int __init etr_init(void) -{ - struct etr_aib aib; - - if (!test_bit(CLOCK_SYNC_HAS_ETR, &clock_sync_flags)) - return 0; - time_init_wq(); - /* Check if this machine has the steai instruction. */ - if (etr_steai(&aib, ETR_STEAI_STEPPING_PORT) == 0) - etr_steai_available = 1; - setup_timer(&etr_timer, etr_timeout, 0UL); - if (etr_port0_online) { - set_bit(ETR_EVENT_PORT0_CHANGE, &etr_events); - queue_work(time_sync_wq, &etr_work); - } - if (etr_port1_online) { - set_bit(ETR_EVENT_PORT1_CHANGE, &etr_events); - queue_work(time_sync_wq, &etr_work); - } - return 0; -} - -arch_initcall(etr_init); - -/* - * Two sorts of ETR machine checks. The architecture reads: - * "When a machine-check niterruption occurs and if a switch-to-local or - * ETR-sync-check interrupt request is pending but disabled, this pending - * disabled interruption request is indicated and is cleared". - * Which means that we can get etr_switch_to_local events from the machine - * check handler although the interruption condition is disabled. Lovely.. - */ - -/* - * Switch to local machine check. This is called when the last usable - * ETR port goes inactive. After switch to local the clock is not in sync. - */ -int etr_switch_to_local(void) -{ - if (!etr_eacr.sl) - return 0; - disable_sync_clock(NULL); - if (!test_and_set_bit(ETR_EVENT_SWITCH_LOCAL, &etr_events)) { - etr_eacr.es = etr_eacr.sl = 0; - etr_setr(&etr_eacr); - return 1; - } - return 0; -} - -/* - * ETR sync check machine check. This is called when the ETR OTE and the - * local clock OTE are farther apart than the ETR sync check tolerance. - * After a ETR sync check the clock is not in sync. The machine check - * is broadcasted to all cpus at the same time. - */ -int etr_sync_check(void) -{ - if (!etr_eacr.es) - return 0; - disable_sync_clock(NULL); - if (!test_and_set_bit(ETR_EVENT_SYNC_CHECK, &etr_events)) { - etr_eacr.es = 0; - etr_setr(&etr_eacr); - return 1; - } - return 0; -} - -void etr_queue_work(void) -{ - queue_work(time_sync_wq, &etr_work); -} - -/* - * ETR timing alert. There are two causes: - * 1) port state change, check the usability of the port - * 2) port alert, one of the ETR-data-validity bits (v1-v2 bits of the - * sldr-status word) or ETR-data word 1 (edf1) or ETR-data word 3 (edf3) - * or ETR-data word 4 (edf4) has changed. - */ -static void etr_timing_alert(struct etr_irq_parm *intparm) -{ - if (intparm->pc0) - /* ETR port 0 state change. */ - set_bit(ETR_EVENT_PORT0_CHANGE, &etr_events); - if (intparm->pc1) - /* ETR port 1 state change. */ - set_bit(ETR_EVENT_PORT1_CHANGE, &etr_events); - if (intparm->eai) - /* - * ETR port alert on either port 0, 1 or both. - * Both ports are not up-to-date now. - */ - set_bit(ETR_EVENT_PORT_ALERT, &etr_events); - queue_work(time_sync_wq, &etr_work); -} - -static void etr_timeout(unsigned long dummy) -{ - set_bit(ETR_EVENT_UPDATE, &etr_events); - queue_work(time_sync_wq, &etr_work); -} - -/* - * Check if the etr mode is pss. - */ -static inline int etr_mode_is_pps(struct etr_eacr eacr) -{ - return eacr.es && !eacr.sl; -} - -/* - * Check if the etr mode is etr. - */ -static inline int etr_mode_is_etr(struct etr_eacr eacr) -{ - return eacr.es && eacr.sl; -} - -/* - * Check if the port can be used for TOD synchronization. - * For PPS mode the port has to receive OTEs. For ETR mode - * the port has to receive OTEs, the ETR stepping bit has to - * be zero and the validity bits for data frame 1, 2, and 3 - * have to be 1. - */ -static int etr_port_valid(struct etr_aib *aib, int port) -{ - unsigned int psc; - - /* Check that this port is receiving OTEs. */ - if (aib->tsp == 0) - return 0; - - psc = port ? aib->esw.psc1 : aib->esw.psc0; - if (psc == etr_lpsc_pps_mode) - return 1; - if (psc == etr_lpsc_operational_step) - return !aib->esw.y && aib->slsw.v1 && - aib->slsw.v2 && aib->slsw.v3; - return 0; -} - -/* - * Check if two ports are on the same network. - */ -static int etr_compare_network(struct etr_aib *aib1, struct etr_aib *aib2) -{ - // FIXME: any other fields we have to compare? - return aib1->edf1.net_id == aib2->edf1.net_id; -} - -/* - * Wrapper for etr_stei that converts physical port states - * to logical port states to be consistent with the output - * of stetr (see etr_psc vs. etr_lpsc). - */ -static void etr_steai_cv(struct etr_aib *aib, unsigned int func) -{ - BUG_ON(etr_steai(aib, func) != 0); - /* Convert port state to logical port state. */ - if (aib->esw.psc0 == 1) - aib->esw.psc0 = 2; - else if (aib->esw.psc0 == 0 && aib->esw.p == 0) - aib->esw.psc0 = 1; - if (aib->esw.psc1 == 1) - aib->esw.psc1 = 2; - else if (aib->esw.psc1 == 0 && aib->esw.p == 1) - aib->esw.psc1 = 1; -} - -/* - * Check if the aib a2 is still connected to the same attachment as - * aib a1, the etv values differ by one and a2 is valid. - */ -static int etr_aib_follows(struct etr_aib *a1, struct etr_aib *a2, int p) -{ - int state_a1, state_a2; - - /* Paranoia check: e0/e1 should better be the same. */ - if (a1->esw.eacr.e0 != a2->esw.eacr.e0 || - a1->esw.eacr.e1 != a2->esw.eacr.e1) - return 0; - - /* Still connected to the same etr ? */ - state_a1 = p ? a1->esw.psc1 : a1->esw.psc0; - state_a2 = p ? a2->esw.psc1 : a2->esw.psc0; - if (state_a1 == etr_lpsc_operational_step) { - if (state_a2 != etr_lpsc_operational_step || - a1->edf1.net_id != a2->edf1.net_id || - a1->edf1.etr_id != a2->edf1.etr_id || - a1->edf1.etr_pn != a2->edf1.etr_pn) - return 0; - } else if (state_a2 != etr_lpsc_pps_mode) - return 0; - - /* The ETV value of a2 needs to be ETV of a1 + 1. */ - if (a1->edf2.etv + 1 != a2->edf2.etv) - return 0; - - if (!etr_port_valid(a2, p)) - return 0; - - return 1; -} - struct clock_sync_data { atomic_t cpus; int in_sync; unsigned long long fixup_cc; - int etr_port; - struct etr_aib *etr_aib; }; static void clock_sync_cpu(struct clock_sync_data *sync) { atomic_dec(&sync->cpus); enable_sync_clock(); - /* - * This looks like a busy wait loop but it isn't. etr_sync_cpus - * is called on all other cpus while the TOD clocks is stopped. - * __udelay will stop the cpu on an enabled wait psw until the - * TOD is running again. - */ while (sync->in_sync == 0) { __udelay(1); /* @@ -748,688 +423,6 @@ static void clock_sync_cpu(struct clock_sync_data *sync) } /* - * Sync the TOD clock using the port referred to by aibp. This port - * has to be enabled and the other port has to be disabled. The - * last eacr update has to be more than 1.6 seconds in the past. - */ -static int etr_sync_clock(void *data) -{ - static int first; - unsigned long long clock, old_clock, clock_delta, delay, delta; - struct clock_sync_data *etr_sync; - struct etr_aib *sync_port, *aib; - int port; - int rc; - - etr_sync = data; - - if (xchg(&first, 1) == 1) { - /* Slave */ - clock_sync_cpu(etr_sync); - return 0; - } - - /* Wait until all other cpus entered the sync function. */ - while (atomic_read(&etr_sync->cpus) != 0) - cpu_relax(); - - port = etr_sync->etr_port; - aib = etr_sync->etr_aib; - sync_port = (port == 0) ? &etr_port0 : &etr_port1; - enable_sync_clock(); - - /* Set clock to next OTE. */ - __ctl_set_bit(14, 21); - __ctl_set_bit(0, 29); - clock = ((unsigned long long) (aib->edf2.etv + 1)) << 32; - old_clock = get_tod_clock(); - if (set_tod_clock(clock) == 0) { - __udelay(1); /* Wait for the clock to start. */ - __ctl_clear_bit(0, 29); - __ctl_clear_bit(14, 21); - etr_stetr(aib); - /* Adjust Linux timing variables. */ - delay = (unsigned long long) - (aib->edf2.etv - sync_port->edf2.etv) << 32; - delta = adjust_time(old_clock, clock, delay); - clock_delta = clock - old_clock; - atomic_notifier_call_chain(&s390_epoch_delta_notifier, 0, - &clock_delta); - etr_sync->fixup_cc = delta; - fixup_clock_comparator(delta); - /* Verify that the clock is properly set. */ - if (!etr_aib_follows(sync_port, aib, port)) { - /* Didn't work. */ - disable_sync_clock(NULL); - etr_sync->in_sync = -EAGAIN; - rc = -EAGAIN; - } else { - etr_sync->in_sync = 1; - rc = 0; - } - } else { - /* Could not set the clock ?!? */ - __ctl_clear_bit(0, 29); - __ctl_clear_bit(14, 21); - disable_sync_clock(NULL); - etr_sync->in_sync = -EAGAIN; - rc = -EAGAIN; - } - xchg(&first, 0); - return rc; -} - -static int etr_sync_clock_stop(struct etr_aib *aib, int port) -{ - struct clock_sync_data etr_sync; - struct etr_aib *sync_port; - int follows; - int rc; - - /* Check if the current aib is adjacent to the sync port aib. */ - sync_port = (port == 0) ? &etr_port0 : &etr_port1; - follows = etr_aib_follows(sync_port, aib, port); - memcpy(sync_port, aib, sizeof(*aib)); - if (!follows) - return -EAGAIN; - memset(&etr_sync, 0, sizeof(etr_sync)); - etr_sync.etr_aib = aib; - etr_sync.etr_port = port; - get_online_cpus(); - atomic_set(&etr_sync.cpus, num_online_cpus() - 1); - rc = stop_machine(etr_sync_clock, &etr_sync, cpu_online_mask); - put_online_cpus(); - return rc; -} - -/* - * Handle the immediate effects of the different events. - * The port change event is used for online/offline changes. - */ -static struct etr_eacr etr_handle_events(struct etr_eacr eacr) -{ - if (test_and_clear_bit(ETR_EVENT_SYNC_CHECK, &etr_events)) - eacr.es = 0; - if (test_and_clear_bit(ETR_EVENT_SWITCH_LOCAL, &etr_events)) - eacr.es = eacr.sl = 0; - if (test_and_clear_bit(ETR_EVENT_PORT_ALERT, &etr_events)) - etr_port0_uptodate = etr_port1_uptodate = 0; - - if (test_and_clear_bit(ETR_EVENT_PORT0_CHANGE, &etr_events)) { - if (eacr.e0) - /* - * Port change of an enabled port. We have to - * assume that this can have caused an stepping - * port switch. - */ - etr_tolec = get_tod_clock(); - eacr.p0 = etr_port0_online; - if (!eacr.p0) - eacr.e0 = 0; - etr_port0_uptodate = 0; - } - if (test_and_clear_bit(ETR_EVENT_PORT1_CHANGE, &etr_events)) { - if (eacr.e1) - /* - * Port change of an enabled port. We have to - * assume that this can have caused an stepping - * port switch. - */ - etr_tolec = get_tod_clock(); - eacr.p1 = etr_port1_online; - if (!eacr.p1) - eacr.e1 = 0; - etr_port1_uptodate = 0; - } - clear_bit(ETR_EVENT_UPDATE, &etr_events); - return eacr; -} - -/* - * Set up a timer that expires after the etr_tolec + 1.6 seconds if - * one of the ports needs an update. - */ -static void etr_set_tolec_timeout(unsigned long long now) -{ - unsigned long micros; - - if ((!etr_eacr.p0 || etr_port0_uptodate) && - (!etr_eacr.p1 || etr_port1_uptodate)) - return; - micros = (now > etr_tolec) ? ((now - etr_tolec) >> 12) : 0; - micros = (micros > 1600000) ? 0 : 1600000 - micros; - mod_timer(&etr_timer, jiffies + (micros * HZ) / 1000000 + 1); -} - -/* - * Set up a time that expires after 1/2 second. - */ -static void etr_set_sync_timeout(void) -{ - mod_timer(&etr_timer, jiffies + HZ/2); -} - -/* - * Update the aib information for one or both ports. - */ -static struct etr_eacr etr_handle_update(struct etr_aib *aib, - struct etr_eacr eacr) -{ - /* With both ports disabled the aib information is useless. */ - if (!eacr.e0 && !eacr.e1) - return eacr; - - /* Update port0 or port1 with aib stored in etr_work_fn. */ - if (aib->esw.q == 0) { - /* Information for port 0 stored. */ - if (eacr.p0 && !etr_port0_uptodate) { - etr_port0 = *aib; - if (etr_port0_online) - etr_port0_uptodate = 1; - } - } else { - /* Information for port 1 stored. */ - if (eacr.p1 && !etr_port1_uptodate) { - etr_port1 = *aib; - if (etr_port0_online) - etr_port1_uptodate = 1; - } - } - - /* - * Do not try to get the alternate port aib if the clock - * is not in sync yet. - */ - if (!eacr.es || !check_sync_clock()) - return eacr; - - /* - * If steai is available we can get the information about - * the other port immediately. If only stetr is available the - * data-port bit toggle has to be used. - */ - if (etr_steai_available) { - if (eacr.p0 && !etr_port0_uptodate) { - etr_steai_cv(&etr_port0, ETR_STEAI_PORT_0); - etr_port0_uptodate = 1; - } - if (eacr.p1 && !etr_port1_uptodate) { - etr_steai_cv(&etr_port1, ETR_STEAI_PORT_1); - etr_port1_uptodate = 1; - } - } else { - /* - * One port was updated above, if the other - * port is not uptodate toggle dp bit. - */ - if ((eacr.p0 && !etr_port0_uptodate) || - (eacr.p1 && !etr_port1_uptodate)) - eacr.dp ^= 1; - else - eacr.dp = 0; - } - return eacr; -} - -/* - * Write new etr control register if it differs from the current one. - * Return 1 if etr_tolec has been updated as well. - */ -static void etr_update_eacr(struct etr_eacr eacr) -{ - int dp_changed; - - if (memcmp(&etr_eacr, &eacr, sizeof(eacr)) == 0) - /* No change, return. */ - return; - /* - * The disable of an active port of the change of the data port - * bit can/will cause a change in the data port. - */ - dp_changed = etr_eacr.e0 > eacr.e0 || etr_eacr.e1 > eacr.e1 || - (etr_eacr.dp ^ eacr.dp) != 0; - etr_eacr = eacr; - etr_setr(&etr_eacr); - if (dp_changed) - etr_tolec = get_tod_clock(); -} - -/* - * ETR work. In this function you'll find the main logic. In - * particular this is the only function that calls etr_update_eacr(), - * it "controls" the etr control register. - */ -static void etr_work_fn(struct work_struct *work) -{ - unsigned long long now; - struct etr_eacr eacr; - struct etr_aib aib; - int sync_port; - - /* prevent multiple execution. */ - mutex_lock(&etr_work_mutex); - - /* Create working copy of etr_eacr. */ - eacr = etr_eacr; - - /* Check for the different events and their immediate effects. */ - eacr = etr_handle_events(eacr); - - /* Check if ETR is supposed to be active. */ - eacr.ea = eacr.p0 || eacr.p1; - if (!eacr.ea) { - /* Both ports offline. Reset everything. */ - eacr.dp = eacr.es = eacr.sl = 0; - on_each_cpu(disable_sync_clock, NULL, 1); - del_timer_sync(&etr_timer); - etr_update_eacr(eacr); - goto out_unlock; - } - - /* Store aib to get the current ETR status word. */ - BUG_ON(etr_stetr(&aib) != 0); - etr_port0.esw = etr_port1.esw = aib.esw; /* Copy status word. */ - now = get_tod_clock(); - - /* - * Update the port information if the last stepping port change - * or data port change is older than 1.6 seconds. - */ - if (now >= etr_tolec + (1600000 << 12)) - eacr = etr_handle_update(&aib, eacr); - - /* - * Select ports to enable. The preferred synchronization mode is PPS. - * If a port can be enabled depends on a number of things: - * 1) The port needs to be online and uptodate. A port is not - * disabled just because it is not uptodate, but it is only - * enabled if it is uptodate. - * 2) The port needs to have the same mode (pps / etr). - * 3) The port needs to be usable -> etr_port_valid() == 1 - * 4) To enable the second port the clock needs to be in sync. - * 5) If both ports are useable and are ETR ports, the network id - * has to be the same. - * The eacr.sl bit is used to indicate etr mode vs. pps mode. - */ - if (eacr.p0 && aib.esw.psc0 == etr_lpsc_pps_mode) { - eacr.sl = 0; - eacr.e0 = 1; - if (!etr_mode_is_pps(etr_eacr)) - eacr.es = 0; - if (!eacr.es || !eacr.p1 || aib.esw.psc1 != etr_lpsc_pps_mode) - eacr.e1 = 0; - // FIXME: uptodate checks ? - else if (etr_port0_uptodate && etr_port1_uptodate) - eacr.e1 = 1; - sync_port = (etr_port0_uptodate && - etr_port_valid(&etr_port0, 0)) ? 0 : -1; - } else if (eacr.p1 && aib.esw.psc1 == etr_lpsc_pps_mode) { - eacr.sl = 0; - eacr.e0 = 0; - eacr.e1 = 1; - if (!etr_mode_is_pps(etr_eacr)) - eacr.es = 0; - sync_port = (etr_port1_uptodate && - etr_port_valid(&etr_port1, 1)) ? 1 : -1; - } else if (eacr.p0 && aib.esw.psc0 == etr_lpsc_operational_step) { - eacr.sl = 1; - eacr.e0 = 1; - if (!etr_mode_is_etr(etr_eacr)) - eacr.es = 0; - if (!eacr.es || !eacr.p1 || - aib.esw.psc1 != etr_lpsc_operational_alt) - eacr.e1 = 0; - else if (etr_port0_uptodate && etr_port1_uptodate && - etr_compare_network(&etr_port0, &etr_port1)) - eacr.e1 = 1; - sync_port = (etr_port0_uptodate && - etr_port_valid(&etr_port0, 0)) ? 0 : -1; - } else if (eacr.p1 && aib.esw.psc1 == etr_lpsc_operational_step) { - eacr.sl = 1; - eacr.e0 = 0; - eacr.e1 = 1; - if (!etr_mode_is_etr(etr_eacr)) - eacr.es = 0; - sync_port = (etr_port1_uptodate && - etr_port_valid(&etr_port1, 1)) ? 1 : -1; - } else { - /* Both ports not usable. */ - eacr.es = eacr.sl = 0; - sync_port = -1; - } - - /* - * If the clock is in sync just update the eacr and return. - * If there is no valid sync port wait for a port update. - */ - if ((eacr.es && check_sync_clock()) || sync_port < 0) { - etr_update_eacr(eacr); - etr_set_tolec_timeout(now); - goto out_unlock; - } - - /* - * Prepare control register for clock syncing - * (reset data port bit, set sync check control. - */ - eacr.dp = 0; - eacr.es = 1; - - /* - * Update eacr and try to synchronize the clock. If the update - * of eacr caused a stepping port switch (or if we have to - * assume that a stepping port switch has occurred) or the - * clock syncing failed, reset the sync check control bit - * and set up a timer to try again after 0.5 seconds - */ - etr_update_eacr(eacr); - if (now < etr_tolec + (1600000 << 12) || - etr_sync_clock_stop(&aib, sync_port) != 0) { - /* Sync failed. Try again in 1/2 second. */ - eacr.es = 0; - etr_update_eacr(eacr); - etr_set_sync_timeout(); - } else - etr_set_tolec_timeout(now); -out_unlock: - mutex_unlock(&etr_work_mutex); -} - -/* - * Sysfs interface functions - */ -static struct bus_type etr_subsys = { - .name = "etr", - .dev_name = "etr", -}; - -static struct device etr_port0_dev = { - .id = 0, - .bus = &etr_subsys, -}; - -static struct device etr_port1_dev = { - .id = 1, - .bus = &etr_subsys, -}; - -/* - * ETR subsys attributes - */ -static ssize_t etr_stepping_port_show(struct device *dev, - struct device_attribute *attr, - char *buf) -{ - return sprintf(buf, "%i\n", etr_port0.esw.p); -} - -static DEVICE_ATTR(stepping_port, 0400, etr_stepping_port_show, NULL); - -static ssize_t etr_stepping_mode_show(struct device *dev, - struct device_attribute *attr, - char *buf) -{ - char *mode_str; - - if (etr_mode_is_pps(etr_eacr)) - mode_str = "pps"; - else if (etr_mode_is_etr(etr_eacr)) - mode_str = "etr"; - else - mode_str = "local"; - return sprintf(buf, "%s\n", mode_str); -} - -static DEVICE_ATTR(stepping_mode, 0400, etr_stepping_mode_show, NULL); - -/* - * ETR port attributes - */ -static inline struct etr_aib *etr_aib_from_dev(struct device *dev) -{ - if (dev == &etr_port0_dev) - return etr_port0_online ? &etr_port0 : NULL; - else - return etr_port1_online ? &etr_port1 : NULL; -} - -static ssize_t etr_online_show(struct device *dev, - struct device_attribute *attr, - char *buf) -{ - unsigned int online; - - online = (dev == &etr_port0_dev) ? etr_port0_online : etr_port1_online; - return sprintf(buf, "%i\n", online); -} - -static ssize_t etr_online_store(struct device *dev, - struct device_attribute *attr, - const char *buf, size_t count) -{ - unsigned int value; - - value = simple_strtoul(buf, NULL, 0); - if (value != 0 && value != 1) - return -EINVAL; - if (!test_bit(CLOCK_SYNC_HAS_ETR, &clock_sync_flags)) - return -EOPNOTSUPP; - mutex_lock(&clock_sync_mutex); - if (dev == &etr_port0_dev) { - if (etr_port0_online == value) - goto out; /* Nothing to do. */ - etr_port0_online = value; - if (etr_port0_online && etr_port1_online) - set_bit(CLOCK_SYNC_ETR, &clock_sync_flags); - else - clear_bit(CLOCK_SYNC_ETR, &clock_sync_flags); - set_bit(ETR_EVENT_PORT0_CHANGE, &etr_events); - queue_work(time_sync_wq, &etr_work); - } else { - if (etr_port1_online == value) - goto out; /* Nothing to do. */ - etr_port1_online = value; - if (etr_port0_online && etr_port1_online) - set_bit(CLOCK_SYNC_ETR, &clock_sync_flags); - else - clear_bit(CLOCK_SYNC_ETR, &clock_sync_flags); - set_bit(ETR_EVENT_PORT1_CHANGE, &etr_events); - queue_work(time_sync_wq, &etr_work); - } -out: - mutex_unlock(&clock_sync_mutex); - return count; -} - -static DEVICE_ATTR(online, 0600, etr_online_show, etr_online_store); - -static ssize_t etr_stepping_control_show(struct device *dev, - struct device_attribute *attr, - char *buf) -{ - return sprintf(buf, "%i\n", (dev == &etr_port0_dev) ? - etr_eacr.e0 : etr_eacr.e1); -} - -static DEVICE_ATTR(stepping_control, 0400, etr_stepping_control_show, NULL); - -static ssize_t etr_mode_code_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - if (!etr_port0_online && !etr_port1_online) - /* Status word is not uptodate if both ports are offline. */ - return -ENODATA; - return sprintf(buf, "%i\n", (dev == &etr_port0_dev) ? - etr_port0.esw.psc0 : etr_port0.esw.psc1); -} - -static DEVICE_ATTR(state_code, 0400, etr_mode_code_show, NULL); - -static ssize_t etr_untuned_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct etr_aib *aib = etr_aib_from_dev(dev); - - if (!aib || !aib->slsw.v1) - return -ENODATA; - return sprintf(buf, "%i\n", aib->edf1.u); -} - -static DEVICE_ATTR(untuned, 0400, etr_untuned_show, NULL); - -static ssize_t etr_network_id_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct etr_aib *aib = etr_aib_from_dev(dev); - - if (!aib || !aib->slsw.v1) - return -ENODATA; - return sprintf(buf, "%i\n", aib->edf1.net_id); -} - -static DEVICE_ATTR(network, 0400, etr_network_id_show, NULL); - -static ssize_t etr_id_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct etr_aib *aib = etr_aib_from_dev(dev); - - if (!aib || !aib->slsw.v1) - return -ENODATA; - return sprintf(buf, "%i\n", aib->edf1.etr_id); -} - -static DEVICE_ATTR(id, 0400, etr_id_show, NULL); - -static ssize_t etr_port_number_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct etr_aib *aib = etr_aib_from_dev(dev); - - if (!aib || !aib->slsw.v1) - return -ENODATA; - return sprintf(buf, "%i\n", aib->edf1.etr_pn); -} - -static DEVICE_ATTR(port, 0400, etr_port_number_show, NULL); - -static ssize_t etr_coupled_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct etr_aib *aib = etr_aib_from_dev(dev); - - if (!aib || !aib->slsw.v3) - return -ENODATA; - return sprintf(buf, "%i\n", aib->edf3.c); -} - -static DEVICE_ATTR(coupled, 0400, etr_coupled_show, NULL); - -static ssize_t etr_local_time_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct etr_aib *aib = etr_aib_from_dev(dev); - - if (!aib || !aib->slsw.v3) - return -ENODATA; - return sprintf(buf, "%i\n", aib->edf3.blto); -} - -static DEVICE_ATTR(local_time, 0400, etr_local_time_show, NULL); - -static ssize_t etr_utc_offset_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct etr_aib *aib = etr_aib_from_dev(dev); - - if (!aib || !aib->slsw.v3) - return -ENODATA; - return sprintf(buf, "%i\n", aib->edf3.buo); -} - -static DEVICE_ATTR(utc_offset, 0400, etr_utc_offset_show, NULL); - -static struct device_attribute *etr_port_attributes[] = { - &dev_attr_online, - &dev_attr_stepping_control, - &dev_attr_state_code, - &dev_attr_untuned, - &dev_attr_network, - &dev_attr_id, - &dev_attr_port, - &dev_attr_coupled, - &dev_attr_local_time, - &dev_attr_utc_offset, - NULL -}; - -static int __init etr_register_port(struct device *dev) -{ - struct device_attribute **attr; - int rc; - - rc = device_register(dev); - if (rc) - goto out; - for (attr = etr_port_attributes; *attr; attr++) { - rc = device_create_file(dev, *attr); - if (rc) - goto out_unreg; - } - return 0; -out_unreg: - for (; attr >= etr_port_attributes; attr--) - device_remove_file(dev, *attr); - device_unregister(dev); -out: - return rc; -} - -static void __init etr_unregister_port(struct device *dev) -{ - struct device_attribute **attr; - - for (attr = etr_port_attributes; *attr; attr++) - device_remove_file(dev, *attr); - device_unregister(dev); -} - -static int __init etr_init_sysfs(void) -{ - int rc; - - rc = subsys_system_register(&etr_subsys, NULL); - if (rc) - goto out; - rc = device_create_file(etr_subsys.dev_root, &dev_attr_stepping_port); - if (rc) - goto out_unreg_subsys; - rc = device_create_file(etr_subsys.dev_root, &dev_attr_stepping_mode); - if (rc) - goto out_remove_stepping_port; - rc = etr_register_port(&etr_port0_dev); - if (rc) - goto out_remove_stepping_mode; - rc = etr_register_port(&etr_port1_dev); - if (rc) - goto out_remove_port0; - return 0; - -out_remove_port0: - etr_unregister_port(&etr_port0_dev); -out_remove_stepping_mode: - device_remove_file(etr_subsys.dev_root, &dev_attr_stepping_mode); -out_remove_stepping_port: - device_remove_file(etr_subsys.dev_root, &dev_attr_stepping_port); -out_unreg_subsys: - bus_unregister(&etr_subsys); -out: - return rc; -} - -device_initcall(etr_init_sysfs); - -/* * Server Time Protocol (STP) code. */ static bool stp_online; @@ -1455,7 +448,7 @@ static void __init stp_reset(void) int rc; stp_page = (void *) get_zeroed_page(GFP_ATOMIC); - rc = chsc_sstpc(stp_page, STP_OP_CTRL, 0x0000); + rc = chsc_sstpc(stp_page, STP_OP_CTRL, 0x0000, NULL); if (rc == 0) set_bit(CLOCK_SYNC_HAS_STP, &clock_sync_flags); else if (stp_online) { @@ -1531,8 +524,9 @@ void stp_queue_work(void) static int stp_sync_clock(void *data) { static int first; - unsigned long long old_clock, delta, new_clock, clock_delta; + unsigned long long clock_delta; struct clock_sync_data *stp_sync; + struct ptff_qto qto; int rc; stp_sync = data; @@ -1553,15 +547,18 @@ static int stp_sync_clock(void *data) if (stp_info.todoff[0] || stp_info.todoff[1] || stp_info.todoff[2] || stp_info.todoff[3] || stp_info.tmd != 2) { - old_clock = get_tod_clock(); - rc = chsc_sstpc(stp_page, STP_OP_SYNC, 0); + rc = chsc_sstpc(stp_page, STP_OP_SYNC, 0, &clock_delta); if (rc == 0) { - new_clock = get_tod_clock(); - delta = adjust_time(old_clock, new_clock, 0); - clock_delta = new_clock - old_clock; + /* fixup the monotonic sched clock */ + sched_clock_base_cc += clock_delta; + if (ptff_query(PTFF_QTO) && + ptff(&qto, sizeof(qto), PTFF_QTO) == 0) + /* Update LPAR offset */ + lpar_offset = qto.tod_epoch_difference; atomic_notifier_call_chain(&s390_epoch_delta_notifier, 0, &clock_delta); - fixup_clock_comparator(delta); + stp_sync->fixup_cc = clock_delta; + fixup_clock_comparator(clock_delta); rc = chsc_sstpi(stp_page, &stp_info, sizeof(struct stp_sstpi)); if (rc == 0 && stp_info.tmd != 2) @@ -1590,12 +587,12 @@ static void stp_work_fn(struct work_struct *work) mutex_lock(&stp_work_mutex); if (!stp_online) { - chsc_sstpc(stp_page, STP_OP_CTRL, 0x0000); + chsc_sstpc(stp_page, STP_OP_CTRL, 0x0000, NULL); del_timer_sync(&stp_timer); goto out_unlock; } - rc = chsc_sstpc(stp_page, STP_OP_CTRL, 0xb0e0); + rc = chsc_sstpc(stp_page, STP_OP_CTRL, 0xb0e0, NULL); if (rc) goto out_unlock; diff --git a/arch/s390/kernel/topology.c b/arch/s390/kernel/topology.c index 64298a867589..e959c02e0cac 100644 --- a/arch/s390/kernel/topology.c +++ b/arch/s390/kernel/topology.c @@ -46,6 +46,7 @@ static DECLARE_WORK(topology_work, topology_work_fn); */ static struct mask_info socket_info; static struct mask_info book_info; +static struct mask_info drawer_info; DEFINE_PER_CPU(struct cpu_topology_s390, cpu_topology); EXPORT_PER_CPU_SYMBOL_GPL(cpu_topology); @@ -79,10 +80,10 @@ static cpumask_t cpu_thread_map(unsigned int cpu) return mask; } -static struct mask_info *add_cpus_to_mask(struct topology_core *tl_core, - struct mask_info *book, - struct mask_info *socket, - int one_socket_per_cpu) +static void add_cpus_to_mask(struct topology_core *tl_core, + struct mask_info *drawer, + struct mask_info *book, + struct mask_info *socket) { struct cpu_topology_s390 *topo; unsigned int core; @@ -97,21 +98,17 @@ static struct mask_info *add_cpus_to_mask(struct topology_core *tl_core, continue; for (i = 0; i <= smp_cpu_mtid; i++) { topo = &per_cpu(cpu_topology, lcpu + i); + topo->drawer_id = drawer->id; topo->book_id = book->id; + topo->socket_id = socket->id; topo->core_id = rcore; topo->thread_id = lcpu + i; + cpumask_set_cpu(lcpu + i, &drawer->mask); cpumask_set_cpu(lcpu + i, &book->mask); cpumask_set_cpu(lcpu + i, &socket->mask); - if (one_socket_per_cpu) - topo->socket_id = rcore; - else - topo->socket_id = socket->id; smp_cpu_set_polarization(lcpu + i, tl_core->pp); } - if (one_socket_per_cpu) - socket = socket->next; } - return socket; } static void clear_masks(void) @@ -128,6 +125,11 @@ static void clear_masks(void) cpumask_clear(&info->mask); info = info->next; } + info = &drawer_info; + while (info) { + cpumask_clear(&info->mask); + info = info->next; + } } static union topology_entry *next_tle(union topology_entry *tle) @@ -137,16 +139,22 @@ static union topology_entry *next_tle(union topology_entry *tle) return (union topology_entry *)((struct topology_container *)tle + 1); } -static void __tl_to_masks_generic(struct sysinfo_15_1_x *info) +static void tl_to_masks(struct sysinfo_15_1_x *info) { struct mask_info *socket = &socket_info; struct mask_info *book = &book_info; + struct mask_info *drawer = &drawer_info; union topology_entry *tle, *end; + clear_masks(); tle = info->tle; end = (union topology_entry *)((unsigned long)info + info->length); while (tle < end) { switch (tle->nl) { + case 3: + drawer = drawer->next; + drawer->id = tle->container.id; + break; case 2: book = book->next; book->id = tle->container.id; @@ -156,32 +164,7 @@ static void __tl_to_masks_generic(struct sysinfo_15_1_x *info) socket->id = tle->container.id; break; case 0: - add_cpus_to_mask(&tle->cpu, book, socket, 0); - break; - default: - clear_masks(); - return; - } - tle = next_tle(tle); - } -} - -static void __tl_to_masks_z10(struct sysinfo_15_1_x *info) -{ - struct mask_info *socket = &socket_info; - struct mask_info *book = &book_info; - union topology_entry *tle, *end; - - tle = info->tle; - end = (union topology_entry *)((unsigned long)info + info->length); - while (tle < end) { - switch (tle->nl) { - case 1: - book = book->next; - book->id = tle->container.id; - break; - case 0: - socket = add_cpus_to_mask(&tle->cpu, book, socket, 1); + add_cpus_to_mask(&tle->cpu, drawer, book, socket); break; default: clear_masks(); @@ -191,22 +174,6 @@ static void __tl_to_masks_z10(struct sysinfo_15_1_x *info) } } -static void tl_to_masks(struct sysinfo_15_1_x *info) -{ - struct cpuid cpu_id; - - get_cpu_id(&cpu_id); - clear_masks(); - switch (cpu_id.machine) { - case 0x2097: - case 0x2098: - __tl_to_masks_z10(info); - break; - default: - __tl_to_masks_generic(info); - } -} - static void topology_update_polarization_simple(void) { int cpu; @@ -257,11 +224,13 @@ static void update_cpu_masks(void) topo->thread_mask = cpu_thread_map(cpu); topo->core_mask = cpu_group_map(&socket_info, cpu); topo->book_mask = cpu_group_map(&book_info, cpu); + topo->drawer_mask = cpu_group_map(&drawer_info, cpu); if (!MACHINE_HAS_TOPOLOGY) { topo->thread_id = cpu; topo->core_id = cpu; topo->socket_id = cpu; topo->book_id = cpu; + topo->drawer_id = cpu; } } numa_update_cpu_topology(); @@ -269,10 +238,7 @@ static void update_cpu_masks(void) void store_topology(struct sysinfo_15_1_x *info) { - if (topology_max_mnest >= 3) - stsi(info, 15, 1, 3); - else - stsi(info, 15, 1, 2); + stsi(info, 15, 1, min(topology_max_mnest, 4)); } int arch_update_cpu_topology(void) @@ -442,6 +408,11 @@ static const struct cpumask *cpu_book_mask(int cpu) return &per_cpu(cpu_topology, cpu).book_mask; } +static const struct cpumask *cpu_drawer_mask(int cpu) +{ + return &per_cpu(cpu_topology, cpu).drawer_mask; +} + static int __init early_parse_topology(char *p) { return kstrtobool(p, &topology_enabled); @@ -452,6 +423,7 @@ static struct sched_domain_topology_level s390_topology[] = { { cpu_thread_mask, cpu_smt_flags, SD_INIT_NAME(SMT) }, { cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) }, { cpu_book_mask, SD_INIT_NAME(BOOK) }, + { cpu_drawer_mask, SD_INIT_NAME(DRAWER) }, { cpu_cpu_mask, SD_INIT_NAME(DIE) }, { NULL, }, }; @@ -487,6 +459,7 @@ static int __init s390_topology_init(void) printk(KERN_CONT " / %d\n", info->mnest); alloc_masks(info, &socket_info, 1); alloc_masks(info, &book_info, 2); + alloc_masks(info, &drawer_info, 3); set_sched_topology(s390_topology); return 0; } diff --git a/arch/s390/kernel/traps.c b/arch/s390/kernel/traps.c index dd97a3e8a34a..d0539f76fd24 100644 --- a/arch/s390/kernel/traps.c +++ b/arch/s390/kernel/traps.c @@ -14,11 +14,12 @@ */ #include <linux/kprobes.h> #include <linux/kdebug.h> -#include <linux/module.h> +#include <linux/extable.h> #include <linux/ptrace.h> #include <linux/sched.h> #include <linux/mm.h> #include <linux/slab.h> +#include <asm/uaccess.h> #include <asm/fpu/api.h> #include "entry.h" diff --git a/arch/s390/kernel/vdso.c b/arch/s390/kernel/vdso.c index 94495cac8be3..5904abf6b1ae 100644 --- a/arch/s390/kernel/vdso.c +++ b/arch/s390/kernel/vdso.c @@ -216,7 +216,8 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) * it at vdso_base which is the "natural" base for it, but we might * fail and end up putting it elsewhere. */ - down_write(&mm->mmap_sem); + if (down_write_killable(&mm->mmap_sem)) + return -EINTR; vdso_base = get_unmapped_area(NULL, 0, vdso_pages << PAGE_SHIFT, 0, 0); if (IS_ERR_VALUE(vdso_base)) { rc = vdso_base; diff --git a/arch/s390/kernel/vdso32/Makefile b/arch/s390/kernel/vdso32/Makefile index f9c459586649..6cc947896c77 100644 --- a/arch/s390/kernel/vdso32/Makefile +++ b/arch/s390/kernel/vdso32/Makefile @@ -1,5 +1,7 @@ # List of files in the vdso, has to be asm only for now +KCOV_INSTRUMENT := n + obj-vdso32 = gettimeofday.o clock_getres.o clock_gettime.o note.o getcpu.o # Build rules @@ -22,8 +24,9 @@ obj-y += vdso32_wrapper.o extra-y += vdso32.lds CPPFLAGS_vdso32.lds += -P -C -U$(ARCH) -# Disable gcov profiling for VDSO code +# Disable gcov profiling and ubsan for VDSO code GCOV_PROFILE := n +UBSAN_SANITIZE := n # Force dependency (incbin is bad) $(obj)/vdso32_wrapper.o : $(obj)/vdso32.so diff --git a/arch/s390/kernel/vdso64/Makefile b/arch/s390/kernel/vdso64/Makefile index 058659c1b8cf..2d54c18089eb 100644 --- a/arch/s390/kernel/vdso64/Makefile +++ b/arch/s390/kernel/vdso64/Makefile @@ -1,5 +1,7 @@ # List of files in the vdso, has to be asm only for now +KCOV_INSTRUMENT := n + obj-vdso64 = gettimeofday.o clock_getres.o clock_gettime.o note.o getcpu.o # Build rules @@ -22,8 +24,9 @@ obj-y += vdso64_wrapper.o extra-y += vdso64.lds CPPFLAGS_vdso64.lds += -P -C -U$(ARCH) -# Disable gcov profiling for VDSO code +# Disable gcov profiling and ubsan for VDSO code GCOV_PROFILE := n +UBSAN_SANITIZE := n # Force dependency (incbin is bad) $(obj)/vdso64_wrapper.o : $(obj)/vdso64.so diff --git a/arch/s390/kernel/vmlinux.lds.S b/arch/s390/kernel/vmlinux.lds.S index 0f41a8286378..429bfd111961 100644 --- a/arch/s390/kernel/vmlinux.lds.S +++ b/arch/s390/kernel/vmlinux.lds.S @@ -4,6 +4,16 @@ #include <asm/thread_info.h> #include <asm/page.h> + +/* + * Put .bss..swapper_pg_dir as the first thing in .bss. This will + * make sure it has 16k alignment. + */ +#define BSS_FIRST_SECTIONS *(.bss..swapper_pg_dir) + +/* Handle ro_after_init data on our own. */ +#define RO_AFTER_INIT_DATA + #include <asm-generic/vmlinux.lds.h> OUTPUT_FORMAT("elf64-s390", "elf64-s390", "elf64-s390") @@ -49,7 +59,14 @@ SECTIONS _eshared = .; /* End of shareable data */ _sdata = .; /* Start of data section */ - EXCEPTION_TABLE(16) :data + . = ALIGN(PAGE_SIZE); + __start_ro_after_init = .; + .data..ro_after_init : { + *(.data..ro_after_init) + } + EXCEPTION_TABLE(16) + . = ALIGN(PAGE_SIZE); + __end_ro_after_init = .; RW_DATA_SECTION(0x100, PAGE_SIZE, THREAD_SIZE) @@ -81,7 +98,7 @@ SECTIONS . = ALIGN(PAGE_SIZE); __init_end = .; /* freed after init ends here */ - BSS_SECTION(0, 2, 0) + BSS_SECTION(PAGE_SIZE, 4 * PAGE_SIZE, PAGE_SIZE) _end = . ; diff --git a/arch/s390/kernel/vtime.c b/arch/s390/kernel/vtime.c index dafc44f519c3..856e30d8463f 100644 --- a/arch/s390/kernel/vtime.c +++ b/arch/s390/kernel/vtime.c @@ -18,6 +18,8 @@ #include <asm/cpu_mf.h> #include <asm/smp.h> +#include "entry.h" + static void virt_timer_expire(void); static LIST_HEAD(virt_timer_list); diff --git a/arch/s390/kvm/Kconfig b/arch/s390/kvm/Kconfig index 5ea5af3c7db7..b1900239b0ab 100644 --- a/arch/s390/kvm/Kconfig +++ b/arch/s390/kvm/Kconfig @@ -28,6 +28,7 @@ config KVM select HAVE_KVM_IRQCHIP select HAVE_KVM_IRQFD select HAVE_KVM_IRQ_ROUTING + select HAVE_KVM_INVALID_WAKEUPS select SRCU select KVM_VFIO ---help--- diff --git a/arch/s390/kvm/Makefile b/arch/s390/kvm/Makefile index d42fa38c2429..09a9e6dfc09f 100644 --- a/arch/s390/kvm/Makefile +++ b/arch/s390/kvm/Makefile @@ -12,6 +12,6 @@ common-objs = $(KVM)/kvm_main.o $(KVM)/eventfd.o $(KVM)/async_pf.o $(KVM)/irqch ccflags-y := -Ivirt/kvm -Iarch/s390/kvm kvm-objs := $(common-objs) kvm-s390.o intercept.o interrupt.o priv.o sigp.o -kvm-objs += diag.o gaccess.o guestdbg.o +kvm-objs += diag.o gaccess.o guestdbg.o sthyi.o vsie.o obj-$(CONFIG_KVM) += kvm.o diff --git a/arch/s390/kvm/diag.c b/arch/s390/kvm/diag.c index 1ea4095b67d7..ce865bd4f81d 100644 --- a/arch/s390/kvm/diag.c +++ b/arch/s390/kvm/diag.c @@ -212,6 +212,11 @@ static int __diag_virtio_hypercall(struct kvm_vcpu *vcpu) (vcpu->run->s.regs.gprs[1] != KVM_S390_VIRTIO_CCW_NOTIFY)) return -EOPNOTSUPP; + VCPU_EVENT(vcpu, 4, "diag 0x500 schid 0x%8.8x queue 0x%x cookie 0x%llx", + (u32) vcpu->run->s.regs.gprs[2], + (u32) vcpu->run->s.regs.gprs[3], + vcpu->run->s.regs.gprs[4]); + /* * The layout is as follows: * - gpr 2 contains the subchannel id (passed as addr) diff --git a/arch/s390/kvm/gaccess.c b/arch/s390/kvm/gaccess.c index 66938d283b77..4aa8a7e2a1da 100644 --- a/arch/s390/kvm/gaccess.c +++ b/arch/s390/kvm/gaccess.c @@ -8,6 +8,7 @@ #include <linux/vmalloc.h> #include <linux/err.h> #include <asm/pgtable.h> +#include <asm/gmap.h> #include "kvm-s390.h" #include "gaccess.h" #include <asm/switch_to.h> @@ -476,18 +477,72 @@ enum { FSI_FETCH = 2 /* Exception was due to fetch operation */ }; -static int get_vcpu_asce(struct kvm_vcpu *vcpu, union asce *asce, - ar_t ar, enum gacc_mode mode) +enum prot_type { + PROT_TYPE_LA = 0, + PROT_TYPE_KEYC = 1, + PROT_TYPE_ALC = 2, + PROT_TYPE_DAT = 3, +}; + +static int trans_exc(struct kvm_vcpu *vcpu, int code, unsigned long gva, + ar_t ar, enum gacc_mode mode, enum prot_type prot) { - int rc; - struct psw_bits psw = psw_bits(vcpu->arch.sie_block->gpsw); struct kvm_s390_pgm_info *pgm = &vcpu->arch.pgm; - struct trans_exc_code_bits *tec_bits; + struct trans_exc_code_bits *tec; memset(pgm, 0, sizeof(*pgm)); - tec_bits = (struct trans_exc_code_bits *)&pgm->trans_exc_code; - tec_bits->fsi = mode == GACC_STORE ? FSI_STORE : FSI_FETCH; - tec_bits->as = psw.as; + pgm->code = code; + tec = (struct trans_exc_code_bits *)&pgm->trans_exc_code; + + switch (code) { + case PGM_PROTECTION: + switch (prot) { + case PROT_TYPE_ALC: + tec->b60 = 1; + /* FALL THROUGH */ + case PROT_TYPE_DAT: + tec->b61 = 1; + break; + default: /* LA and KEYC set b61 to 0, other params undefined */ + return code; + } + /* FALL THROUGH */ + case PGM_ASCE_TYPE: + case PGM_PAGE_TRANSLATION: + case PGM_REGION_FIRST_TRANS: + case PGM_REGION_SECOND_TRANS: + case PGM_REGION_THIRD_TRANS: + case PGM_SEGMENT_TRANSLATION: + /* + * op_access_id only applies to MOVE_PAGE -> set bit 61 + * exc_access_id has to be set to 0 for some instructions. Both + * cases have to be handled by the caller. + */ + tec->addr = gva >> PAGE_SHIFT; + tec->fsi = mode == GACC_STORE ? FSI_STORE : FSI_FETCH; + tec->as = psw_bits(vcpu->arch.sie_block->gpsw).as; + /* FALL THROUGH */ + case PGM_ALEN_TRANSLATION: + case PGM_ALE_SEQUENCE: + case PGM_ASTE_VALIDITY: + case PGM_ASTE_SEQUENCE: + case PGM_EXTENDED_AUTHORITY: + /* + * We can always store exc_access_id, as it is + * undefined for non-ar cases. It is undefined for + * most DAT protection exceptions. + */ + pgm->exc_access_id = ar; + break; + } + return code; +} + +static int get_vcpu_asce(struct kvm_vcpu *vcpu, union asce *asce, + unsigned long ga, ar_t ar, enum gacc_mode mode) +{ + int rc; + struct psw_bits psw = psw_bits(vcpu->arch.sie_block->gpsw); if (!psw.t) { asce->val = 0; @@ -510,21 +565,8 @@ static int get_vcpu_asce(struct kvm_vcpu *vcpu, union asce *asce, return 0; case PSW_AS_ACCREG: rc = ar_translation(vcpu, asce, ar, mode); - switch (rc) { - case PGM_ALEN_TRANSLATION: - case PGM_ALE_SEQUENCE: - case PGM_ASTE_VALIDITY: - case PGM_ASTE_SEQUENCE: - case PGM_EXTENDED_AUTHORITY: - vcpu->arch.pgm.exc_access_id = ar; - break; - case PGM_PROTECTION: - tec_bits->b60 = 1; - tec_bits->b61 = 1; - break; - } if (rc > 0) - pgm->code = rc; + return trans_exc(vcpu, rc, ga, ar, mode, PROT_TYPE_ALC); return rc; } return 0; @@ -729,40 +771,31 @@ static int low_address_protection_enabled(struct kvm_vcpu *vcpu, return 1; } -static int guest_page_range(struct kvm_vcpu *vcpu, unsigned long ga, +static int guest_page_range(struct kvm_vcpu *vcpu, unsigned long ga, ar_t ar, unsigned long *pages, unsigned long nr_pages, const union asce asce, enum gacc_mode mode) { - struct kvm_s390_pgm_info *pgm = &vcpu->arch.pgm; psw_t *psw = &vcpu->arch.sie_block->gpsw; - struct trans_exc_code_bits *tec_bits; - int lap_enabled, rc; + int lap_enabled, rc = 0; - tec_bits = (struct trans_exc_code_bits *)&pgm->trans_exc_code; lap_enabled = low_address_protection_enabled(vcpu, asce); while (nr_pages) { ga = kvm_s390_logical_to_effective(vcpu, ga); - tec_bits->addr = ga >> PAGE_SHIFT; - if (mode == GACC_STORE && lap_enabled && is_low_address(ga)) { - pgm->code = PGM_PROTECTION; - return pgm->code; - } + if (mode == GACC_STORE && lap_enabled && is_low_address(ga)) + return trans_exc(vcpu, PGM_PROTECTION, ga, ar, mode, + PROT_TYPE_LA); ga &= PAGE_MASK; if (psw_bits(*psw).t) { rc = guest_translate(vcpu, ga, pages, asce, mode); if (rc < 0) return rc; - if (rc == PGM_PROTECTION) - tec_bits->b61 = 1; - if (rc) - pgm->code = rc; } else { *pages = kvm_s390_real_to_abs(vcpu, ga); if (kvm_is_error_gpa(vcpu->kvm, *pages)) - pgm->code = PGM_ADDRESSING; + rc = PGM_ADDRESSING; } - if (pgm->code) - return pgm->code; + if (rc) + return trans_exc(vcpu, rc, ga, ar, mode, PROT_TYPE_DAT); ga += PAGE_SIZE; pages++; nr_pages--; @@ -783,7 +816,8 @@ int access_guest(struct kvm_vcpu *vcpu, unsigned long ga, ar_t ar, void *data, if (!len) return 0; - rc = get_vcpu_asce(vcpu, &asce, ar, mode); + ga = kvm_s390_logical_to_effective(vcpu, ga); + rc = get_vcpu_asce(vcpu, &asce, ga, ar, mode); if (rc) return rc; nr_pages = (((ga & ~PAGE_MASK) + len - 1) >> PAGE_SHIFT) + 1; @@ -795,7 +829,7 @@ int access_guest(struct kvm_vcpu *vcpu, unsigned long ga, ar_t ar, void *data, need_ipte_lock = psw_bits(*psw).t && !asce.r; if (need_ipte_lock) ipte_lock(vcpu); - rc = guest_page_range(vcpu, ga, pages, nr_pages, asce, mode); + rc = guest_page_range(vcpu, ga, ar, pages, nr_pages, asce, mode); for (idx = 0; idx < nr_pages && !rc; idx++) { gpa = *(pages + idx) + (ga & ~PAGE_MASK); _len = min(PAGE_SIZE - (gpa & ~PAGE_MASK), len); @@ -846,37 +880,28 @@ int access_guest_real(struct kvm_vcpu *vcpu, unsigned long gra, int guest_translate_address(struct kvm_vcpu *vcpu, unsigned long gva, ar_t ar, unsigned long *gpa, enum gacc_mode mode) { - struct kvm_s390_pgm_info *pgm = &vcpu->arch.pgm; psw_t *psw = &vcpu->arch.sie_block->gpsw; - struct trans_exc_code_bits *tec; union asce asce; int rc; gva = kvm_s390_logical_to_effective(vcpu, gva); - tec = (struct trans_exc_code_bits *)&pgm->trans_exc_code; - rc = get_vcpu_asce(vcpu, &asce, ar, mode); - tec->addr = gva >> PAGE_SHIFT; + rc = get_vcpu_asce(vcpu, &asce, gva, ar, mode); if (rc) return rc; if (is_low_address(gva) && low_address_protection_enabled(vcpu, asce)) { - if (mode == GACC_STORE) { - rc = pgm->code = PGM_PROTECTION; - return rc; - } + if (mode == GACC_STORE) + return trans_exc(vcpu, PGM_PROTECTION, gva, 0, + mode, PROT_TYPE_LA); } if (psw_bits(*psw).t && !asce.r) { /* Use DAT? */ rc = guest_translate(vcpu, gva, gpa, asce, mode); - if (rc > 0) { - if (rc == PGM_PROTECTION) - tec->b61 = 1; - pgm->code = rc; - } + if (rc > 0) + return trans_exc(vcpu, rc, gva, 0, mode, PROT_TYPE_DAT); } else { - rc = 0; *gpa = kvm_s390_real_to_abs(vcpu, gva); if (kvm_is_error_gpa(vcpu->kvm, *gpa)) - rc = pgm->code = PGM_ADDRESSING; + return trans_exc(vcpu, rc, gva, PGM_ADDRESSING, mode, 0); } return rc; @@ -915,20 +940,247 @@ int check_gva_range(struct kvm_vcpu *vcpu, unsigned long gva, ar_t ar, */ int kvm_s390_check_low_addr_prot_real(struct kvm_vcpu *vcpu, unsigned long gra) { - struct kvm_s390_pgm_info *pgm = &vcpu->arch.pgm; - psw_t *psw = &vcpu->arch.sie_block->gpsw; - struct trans_exc_code_bits *tec_bits; union ctlreg0 ctlreg0 = {.val = vcpu->arch.sie_block->gcr[0]}; if (!ctlreg0.lap || !is_low_address(gra)) return 0; + return trans_exc(vcpu, PGM_PROTECTION, gra, 0, GACC_STORE, PROT_TYPE_LA); +} - memset(pgm, 0, sizeof(*pgm)); - tec_bits = (struct trans_exc_code_bits *)&pgm->trans_exc_code; - tec_bits->fsi = FSI_STORE; - tec_bits->as = psw_bits(*psw).as; - tec_bits->addr = gra >> PAGE_SHIFT; - pgm->code = PGM_PROTECTION; +/** + * kvm_s390_shadow_tables - walk the guest page table and create shadow tables + * @sg: pointer to the shadow guest address space structure + * @saddr: faulting address in the shadow gmap + * @pgt: pointer to the page table address result + * @fake: pgt references contiguous guest memory block, not a pgtable + */ +static int kvm_s390_shadow_tables(struct gmap *sg, unsigned long saddr, + unsigned long *pgt, int *dat_protection, + int *fake) +{ + struct gmap *parent; + union asce asce; + union vaddress vaddr; + unsigned long ptr; + int rc; + + *fake = 0; + *dat_protection = 0; + parent = sg->parent; + vaddr.addr = saddr; + asce.val = sg->orig_asce; + ptr = asce.origin * 4096; + if (asce.r) { + *fake = 1; + asce.dt = ASCE_TYPE_REGION1; + } + switch (asce.dt) { + case ASCE_TYPE_REGION1: + if (vaddr.rfx01 > asce.tl && !asce.r) + return PGM_REGION_FIRST_TRANS; + break; + case ASCE_TYPE_REGION2: + if (vaddr.rfx) + return PGM_ASCE_TYPE; + if (vaddr.rsx01 > asce.tl) + return PGM_REGION_SECOND_TRANS; + break; + case ASCE_TYPE_REGION3: + if (vaddr.rfx || vaddr.rsx) + return PGM_ASCE_TYPE; + if (vaddr.rtx01 > asce.tl) + return PGM_REGION_THIRD_TRANS; + break; + case ASCE_TYPE_SEGMENT: + if (vaddr.rfx || vaddr.rsx || vaddr.rtx) + return PGM_ASCE_TYPE; + if (vaddr.sx01 > asce.tl) + return PGM_SEGMENT_TRANSLATION; + break; + } - return pgm->code; + switch (asce.dt) { + case ASCE_TYPE_REGION1: { + union region1_table_entry rfte; + + if (*fake) { + /* offset in 16EB guest memory block */ + ptr = ptr + ((unsigned long) vaddr.rsx << 53UL); + rfte.val = ptr; + goto shadow_r2t; + } + rc = gmap_read_table(parent, ptr + vaddr.rfx * 8, &rfte.val); + if (rc) + return rc; + if (rfte.i) + return PGM_REGION_FIRST_TRANS; + if (rfte.tt != TABLE_TYPE_REGION1) + return PGM_TRANSLATION_SPEC; + if (vaddr.rsx01 < rfte.tf || vaddr.rsx01 > rfte.tl) + return PGM_REGION_SECOND_TRANS; + if (sg->edat_level >= 1) + *dat_protection |= rfte.p; + ptr = rfte.rto << 12UL; +shadow_r2t: + rc = gmap_shadow_r2t(sg, saddr, rfte.val, *fake); + if (rc) + return rc; + /* fallthrough */ + } + case ASCE_TYPE_REGION2: { + union region2_table_entry rste; + + if (*fake) { + /* offset in 8PB guest memory block */ + ptr = ptr + ((unsigned long) vaddr.rtx << 42UL); + rste.val = ptr; + goto shadow_r3t; + } + rc = gmap_read_table(parent, ptr + vaddr.rsx * 8, &rste.val); + if (rc) + return rc; + if (rste.i) + return PGM_REGION_SECOND_TRANS; + if (rste.tt != TABLE_TYPE_REGION2) + return PGM_TRANSLATION_SPEC; + if (vaddr.rtx01 < rste.tf || vaddr.rtx01 > rste.tl) + return PGM_REGION_THIRD_TRANS; + if (sg->edat_level >= 1) + *dat_protection |= rste.p; + ptr = rste.rto << 12UL; +shadow_r3t: + rste.p |= *dat_protection; + rc = gmap_shadow_r3t(sg, saddr, rste.val, *fake); + if (rc) + return rc; + /* fallthrough */ + } + case ASCE_TYPE_REGION3: { + union region3_table_entry rtte; + + if (*fake) { + /* offset in 4TB guest memory block */ + ptr = ptr + ((unsigned long) vaddr.sx << 31UL); + rtte.val = ptr; + goto shadow_sgt; + } + rc = gmap_read_table(parent, ptr + vaddr.rtx * 8, &rtte.val); + if (rc) + return rc; + if (rtte.i) + return PGM_REGION_THIRD_TRANS; + if (rtte.tt != TABLE_TYPE_REGION3) + return PGM_TRANSLATION_SPEC; + if (rtte.cr && asce.p && sg->edat_level >= 2) + return PGM_TRANSLATION_SPEC; + if (rtte.fc && sg->edat_level >= 2) { + *dat_protection |= rtte.fc0.p; + *fake = 1; + ptr = rtte.fc1.rfaa << 31UL; + rtte.val = ptr; + goto shadow_sgt; + } + if (vaddr.sx01 < rtte.fc0.tf || vaddr.sx01 > rtte.fc0.tl) + return PGM_SEGMENT_TRANSLATION; + if (sg->edat_level >= 1) + *dat_protection |= rtte.fc0.p; + ptr = rtte.fc0.sto << 12UL; +shadow_sgt: + rtte.fc0.p |= *dat_protection; + rc = gmap_shadow_sgt(sg, saddr, rtte.val, *fake); + if (rc) + return rc; + /* fallthrough */ + } + case ASCE_TYPE_SEGMENT: { + union segment_table_entry ste; + + if (*fake) { + /* offset in 2G guest memory block */ + ptr = ptr + ((unsigned long) vaddr.sx << 20UL); + ste.val = ptr; + goto shadow_pgt; + } + rc = gmap_read_table(parent, ptr + vaddr.sx * 8, &ste.val); + if (rc) + return rc; + if (ste.i) + return PGM_SEGMENT_TRANSLATION; + if (ste.tt != TABLE_TYPE_SEGMENT) + return PGM_TRANSLATION_SPEC; + if (ste.cs && asce.p) + return PGM_TRANSLATION_SPEC; + *dat_protection |= ste.fc0.p; + if (ste.fc && sg->edat_level >= 1) { + *fake = 1; + ptr = ste.fc1.sfaa << 20UL; + ste.val = ptr; + goto shadow_pgt; + } + ptr = ste.fc0.pto << 11UL; +shadow_pgt: + ste.fc0.p |= *dat_protection; + rc = gmap_shadow_pgt(sg, saddr, ste.val, *fake); + if (rc) + return rc; + } + } + /* Return the parent address of the page table */ + *pgt = ptr; + return 0; +} + +/** + * kvm_s390_shadow_fault - handle fault on a shadow page table + * @vcpu: virtual cpu + * @sg: pointer to the shadow guest address space structure + * @saddr: faulting address in the shadow gmap + * + * Returns: - 0 if the shadow fault was successfully resolved + * - > 0 (pgm exception code) on exceptions while faulting + * - -EAGAIN if the caller can retry immediately + * - -EFAULT when accessing invalid guest addresses + * - -ENOMEM if out of memory + */ +int kvm_s390_shadow_fault(struct kvm_vcpu *vcpu, struct gmap *sg, + unsigned long saddr) +{ + union vaddress vaddr; + union page_table_entry pte; + unsigned long pgt; + int dat_protection, fake; + int rc; + + down_read(&sg->mm->mmap_sem); + /* + * We don't want any guest-2 tables to change - so the parent + * tables/pointers we read stay valid - unshadowing is however + * always possible - only guest_table_lock protects us. + */ + ipte_lock(vcpu); + + rc = gmap_shadow_pgt_lookup(sg, saddr, &pgt, &dat_protection, &fake); + if (rc) + rc = kvm_s390_shadow_tables(sg, saddr, &pgt, &dat_protection, + &fake); + + vaddr.addr = saddr; + if (fake) { + /* offset in 1MB guest memory block */ + pte.val = pgt + ((unsigned long) vaddr.px << 12UL); + goto shadow_page; + } + if (!rc) + rc = gmap_read_table(sg->parent, pgt + vaddr.px * 8, &pte.val); + if (!rc && pte.i) + rc = PGM_PAGE_TRANSLATION; + if (!rc && (pte.z || (pte.co && sg->edat_level < 1))) + rc = PGM_TRANSLATION_SPEC; +shadow_page: + pte.p |= dat_protection; + if (!rc) + rc = gmap_shadow_page(sg, saddr, __pte(pte.val)); + ipte_unlock(vcpu); + up_read(&sg->mm->mmap_sem); + return rc; } diff --git a/arch/s390/kvm/gaccess.h b/arch/s390/kvm/gaccess.h index df0a79dd8159..8756569ad938 100644 --- a/arch/s390/kvm/gaccess.h +++ b/arch/s390/kvm/gaccess.h @@ -361,4 +361,7 @@ void ipte_unlock(struct kvm_vcpu *vcpu); int ipte_lock_held(struct kvm_vcpu *vcpu); int kvm_s390_check_low_addr_prot_real(struct kvm_vcpu *vcpu, unsigned long gra); +int kvm_s390_shadow_fault(struct kvm_vcpu *vcpu, struct gmap *shadow, + unsigned long saddr); + #endif /* __KVM_S390_GACCESS_H */ diff --git a/arch/s390/kvm/guestdbg.c b/arch/s390/kvm/guestdbg.c index e8c6843b9600..d7c6a7f53ced 100644 --- a/arch/s390/kvm/guestdbg.c +++ b/arch/s390/kvm/guestdbg.c @@ -206,7 +206,7 @@ static int __import_wp_info(struct kvm_vcpu *vcpu, int kvm_s390_import_bp_data(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg) { - int ret = 0, nr_wp = 0, nr_bp = 0, i, size; + int ret = 0, nr_wp = 0, nr_bp = 0, i; struct kvm_hw_breakpoint *bp_data = NULL; struct kvm_hw_wp_info_arch *wp_info = NULL; struct kvm_hw_bp_info_arch *bp_info = NULL; @@ -216,17 +216,10 @@ int kvm_s390_import_bp_data(struct kvm_vcpu *vcpu, else if (dbg->arch.nr_hw_bp > MAX_BP_COUNT) return -EINVAL; - size = dbg->arch.nr_hw_bp * sizeof(struct kvm_hw_breakpoint); - bp_data = kmalloc(size, GFP_KERNEL); - if (!bp_data) { - ret = -ENOMEM; - goto error; - } - - if (copy_from_user(bp_data, dbg->arch.hw_bp, size)) { - ret = -EFAULT; - goto error; - } + bp_data = memdup_user(dbg->arch.hw_bp, + sizeof(*bp_data) * dbg->arch.nr_hw_bp); + if (IS_ERR(bp_data)) + return PTR_ERR(bp_data); for (i = 0; i < dbg->arch.nr_hw_bp; i++) { switch (bp_data[i].type) { @@ -241,17 +234,19 @@ int kvm_s390_import_bp_data(struct kvm_vcpu *vcpu, } } - size = nr_wp * sizeof(struct kvm_hw_wp_info_arch); - if (size > 0) { - wp_info = kmalloc(size, GFP_KERNEL); + if (nr_wp > 0) { + wp_info = kmalloc_array(nr_wp, + sizeof(*wp_info), + GFP_KERNEL); if (!wp_info) { ret = -ENOMEM; goto error; } } - size = nr_bp * sizeof(struct kvm_hw_bp_info_arch); - if (size > 0) { - bp_info = kmalloc(size, GFP_KERNEL); + if (nr_bp > 0) { + bp_info = kmalloc_array(nr_bp, + sizeof(*bp_info), + GFP_KERNEL); if (!bp_info) { ret = -ENOMEM; goto error; @@ -382,14 +377,20 @@ void kvm_s390_prepare_debug_exit(struct kvm_vcpu *vcpu) vcpu->guest_debug &= ~KVM_GUESTDBG_EXIT_PENDING; } +#define PER_CODE_MASK (PER_EVENT_MASK >> 24) +#define PER_CODE_BRANCH (PER_EVENT_BRANCH >> 24) +#define PER_CODE_IFETCH (PER_EVENT_IFETCH >> 24) +#define PER_CODE_STORE (PER_EVENT_STORE >> 24) +#define PER_CODE_STORE_REAL (PER_EVENT_STORE_REAL >> 24) + #define per_bp_event(code) \ - (code & (PER_EVENT_IFETCH | PER_EVENT_BRANCH)) + (code & (PER_CODE_IFETCH | PER_CODE_BRANCH)) #define per_write_wp_event(code) \ - (code & (PER_EVENT_STORE | PER_EVENT_STORE_REAL)) + (code & (PER_CODE_STORE | PER_CODE_STORE_REAL)) static int debug_exit_required(struct kvm_vcpu *vcpu) { - u32 perc = (vcpu->arch.sie_block->perc << 24); + u8 perc = vcpu->arch.sie_block->perc; struct kvm_debug_exit_arch *debug_exit = &vcpu->run->debug.arch; struct kvm_hw_wp_info_arch *wp_info = NULL; struct kvm_hw_bp_info_arch *bp_info = NULL; @@ -439,35 +440,52 @@ exit_required: #define guest_per_enabled(vcpu) \ (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PER) +int kvm_s390_handle_per_ifetch_icpt(struct kvm_vcpu *vcpu) +{ + const u8 ilen = kvm_s390_get_ilen(vcpu); + struct kvm_s390_pgm_info pgm_info = { + .code = PGM_PER, + .per_code = PER_CODE_IFETCH, + .per_address = __rewind_psw(vcpu->arch.sie_block->gpsw, ilen), + }; + + /* + * The PSW points to the next instruction, therefore the intercepted + * instruction generated a PER i-fetch event. PER address therefore + * points at the previous PSW address (could be an EXECUTE function). + */ + return kvm_s390_inject_prog_irq(vcpu, &pgm_info); +} + static void filter_guest_per_event(struct kvm_vcpu *vcpu) { - u32 perc = vcpu->arch.sie_block->perc << 24; + const u8 perc = vcpu->arch.sie_block->perc; u64 peraddr = vcpu->arch.sie_block->peraddr; u64 addr = vcpu->arch.sie_block->gpsw.addr; u64 cr9 = vcpu->arch.sie_block->gcr[9]; u64 cr10 = vcpu->arch.sie_block->gcr[10]; u64 cr11 = vcpu->arch.sie_block->gcr[11]; /* filter all events, demanded by the guest */ - u32 guest_perc = perc & cr9 & PER_EVENT_MASK; + u8 guest_perc = perc & (cr9 >> 24) & PER_CODE_MASK; if (!guest_per_enabled(vcpu)) guest_perc = 0; /* filter "successful-branching" events */ - if (guest_perc & PER_EVENT_BRANCH && + if (guest_perc & PER_CODE_BRANCH && cr9 & PER_CONTROL_BRANCH_ADDRESS && !in_addr_range(addr, cr10, cr11)) - guest_perc &= ~PER_EVENT_BRANCH; + guest_perc &= ~PER_CODE_BRANCH; /* filter "instruction-fetching" events */ - if (guest_perc & PER_EVENT_IFETCH && + if (guest_perc & PER_CODE_IFETCH && !in_addr_range(peraddr, cr10, cr11)) - guest_perc &= ~PER_EVENT_IFETCH; + guest_perc &= ~PER_CODE_IFETCH; /* All other PER events will be given to the guest */ - /* TODO: Check alterated address/address space */ + /* TODO: Check altered address/address space */ - vcpu->arch.sie_block->perc = guest_perc >> 24; + vcpu->arch.sie_block->perc = guest_perc; if (!guest_perc) vcpu->arch.sie_block->iprcc &= ~PGM_PER; diff --git a/arch/s390/kvm/intercept.c b/arch/s390/kvm/intercept.c index 2e6b54e4d3f9..1cab8a177d0e 100644 --- a/arch/s390/kvm/intercept.c +++ b/arch/s390/kvm/intercept.c @@ -29,6 +29,7 @@ static const intercept_handler_t instruction_handlers[256] = { [0x01] = kvm_s390_handle_01, [0x82] = kvm_s390_handle_lpsw, [0x83] = kvm_s390_handle_diag, + [0xaa] = kvm_s390_handle_aa, [0xae] = kvm_s390_handle_sigp, [0xb2] = kvm_s390_handle_b2, [0xb6] = kvm_s390_handle_stctl, @@ -341,6 +342,8 @@ static int handle_mvpg_pei(struct kvm_vcpu *vcpu) static int handle_partial_execution(struct kvm_vcpu *vcpu) { + vcpu->stat.exit_pei++; + if (vcpu->arch.sie_block->ipa == 0xb254) /* MVPG */ return handle_mvpg_pei(vcpu); if (vcpu->arch.sie_block->ipa >> 8 == 0xae) /* SIGP */ @@ -349,8 +352,26 @@ static int handle_partial_execution(struct kvm_vcpu *vcpu) return -EOPNOTSUPP; } +static int handle_operexc(struct kvm_vcpu *vcpu) +{ + vcpu->stat.exit_operation_exception++; + trace_kvm_s390_handle_operexc(vcpu, vcpu->arch.sie_block->ipa, + vcpu->arch.sie_block->ipb); + + if (vcpu->arch.sie_block->ipa == 0xb256 && + test_kvm_facility(vcpu->kvm, 74)) + return handle_sthyi(vcpu); + + if (vcpu->arch.sie_block->ipa == 0 && vcpu->kvm->arch.user_instr0) + return -EOPNOTSUPP; + + return kvm_s390_inject_program_int(vcpu, PGM_OPERATION); +} + int kvm_handle_sie_intercept(struct kvm_vcpu *vcpu) { + int rc, per_rc = 0; + if (kvm_is_ucontrol(vcpu->kvm)) return -EOPNOTSUPP; @@ -359,7 +380,8 @@ int kvm_handle_sie_intercept(struct kvm_vcpu *vcpu) case 0x18: return handle_noop(vcpu); case 0x04: - return handle_instruction(vcpu); + rc = handle_instruction(vcpu); + break; case 0x08: return handle_prog(vcpu); case 0x14: @@ -370,9 +392,19 @@ int kvm_handle_sie_intercept(struct kvm_vcpu *vcpu) return handle_validity(vcpu); case 0x28: return handle_stop(vcpu); + case 0x2c: + rc = handle_operexc(vcpu); + break; case 0x38: - return handle_partial_execution(vcpu); + rc = handle_partial_execution(vcpu); + break; default: return -EOPNOTSUPP; } + + /* process PER, also if the instrution is processed in user space */ + if (vcpu->arch.sie_block->icptstatus & 0x02 && + (!rc || rc == -EOPNOTSUPP)) + per_rc = kvm_s390_handle_per_ifetch_icpt(vcpu); + return per_rc ? per_rc : rc; } diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c index 84efc2ba6a90..be4db07f70d3 100644 --- a/arch/s390/kvm/interrupt.c +++ b/arch/s390/kvm/interrupt.c @@ -24,13 +24,12 @@ #include <asm/sclp.h> #include <asm/isc.h> #include <asm/gmap.h> +#include <asm/switch_to.h> +#include <asm/nmi.h> #include "kvm-s390.h" #include "gaccess.h" #include "trace-s390.h" -#define IOINT_SCHID_MASK 0x0000ffff -#define IOINT_SSID_MASK 0x00030000 -#define IOINT_CSSID_MASK 0x03fc0000 #define PFAULT_INIT 0x0600 #define PFAULT_DONE 0x0680 #define VIRTIO_PARAM 0x0d00 @@ -43,6 +42,7 @@ static int sca_ext_call_pending(struct kvm_vcpu *vcpu, int *src_id) if (!(atomic_read(&vcpu->arch.sie_block->cpuflags) & CPUSTAT_ECALL_PEND)) return 0; + BUG_ON(!kvm_s390_use_sca_entries()); read_lock(&vcpu->kvm->arch.sca_lock); if (vcpu->kvm->arch.use_esca) { struct esca_block *sca = vcpu->kvm->arch.sca; @@ -71,6 +71,7 @@ static int sca_inject_ext_call(struct kvm_vcpu *vcpu, int src_id) { int expect, rc; + BUG_ON(!kvm_s390_use_sca_entries()); read_lock(&vcpu->kvm->arch.sca_lock); if (vcpu->kvm->arch.use_esca) { struct esca_block *sca = vcpu->kvm->arch.sca; @@ -112,6 +113,8 @@ static void sca_clear_ext_call(struct kvm_vcpu *vcpu) struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int; int rc, expect; + if (!kvm_s390_use_sca_entries()) + return; atomic_andnot(CPUSTAT_ECALL_PEND, li->cpuflags); read_lock(&vcpu->kvm->arch.sca_lock); if (vcpu->kvm->arch.use_esca) { @@ -403,12 +406,78 @@ static int __must_check __deliver_pfault_init(struct kvm_vcpu *vcpu) return rc ? -EFAULT : 0; } +static int __write_machine_check(struct kvm_vcpu *vcpu, + struct kvm_s390_mchk_info *mchk) +{ + unsigned long ext_sa_addr; + freg_t fprs[NUM_FPRS]; + union mci mci; + int rc; + + mci.val = mchk->mcic; + /* take care of lazy register loading via vcpu load/put */ + save_fpu_regs(); + save_access_regs(vcpu->run->s.regs.acrs); + + /* Extended save area */ + rc = read_guest_lc(vcpu, __LC_VX_SAVE_AREA_ADDR, &ext_sa_addr, + sizeof(unsigned long)); + /* Only bits 0-53 are used for address formation */ + ext_sa_addr &= ~0x3ffUL; + if (!rc && mci.vr && ext_sa_addr && test_kvm_facility(vcpu->kvm, 129)) { + if (write_guest_abs(vcpu, ext_sa_addr, vcpu->run->s.regs.vrs, + 512)) + mci.vr = 0; + } else { + mci.vr = 0; + } + + /* General interruption information */ + rc |= put_guest_lc(vcpu, 1, (u8 __user *) __LC_AR_MODE_ID); + rc |= write_guest_lc(vcpu, __LC_MCK_OLD_PSW, + &vcpu->arch.sie_block->gpsw, sizeof(psw_t)); + rc |= read_guest_lc(vcpu, __LC_MCK_NEW_PSW, + &vcpu->arch.sie_block->gpsw, sizeof(psw_t)); + rc |= put_guest_lc(vcpu, mci.val, (u64 __user *) __LC_MCCK_CODE); + + /* Register-save areas */ + if (MACHINE_HAS_VX) { + convert_vx_to_fp(fprs, (__vector128 *) vcpu->run->s.regs.vrs); + rc |= write_guest_lc(vcpu, __LC_FPREGS_SAVE_AREA, fprs, 128); + } else { + rc |= write_guest_lc(vcpu, __LC_FPREGS_SAVE_AREA, + vcpu->run->s.regs.fprs, 128); + } + rc |= write_guest_lc(vcpu, __LC_GPREGS_SAVE_AREA, + vcpu->run->s.regs.gprs, 128); + rc |= put_guest_lc(vcpu, current->thread.fpu.fpc, + (u32 __user *) __LC_FP_CREG_SAVE_AREA); + rc |= put_guest_lc(vcpu, vcpu->arch.sie_block->todpr, + (u32 __user *) __LC_TOD_PROGREG_SAVE_AREA); + rc |= put_guest_lc(vcpu, kvm_s390_get_cpu_timer(vcpu), + (u64 __user *) __LC_CPU_TIMER_SAVE_AREA); + rc |= put_guest_lc(vcpu, vcpu->arch.sie_block->ckc >> 8, + (u64 __user *) __LC_CLOCK_COMP_SAVE_AREA); + rc |= write_guest_lc(vcpu, __LC_AREGS_SAVE_AREA, + &vcpu->run->s.regs.acrs, 64); + rc |= write_guest_lc(vcpu, __LC_CREGS_SAVE_AREA, + &vcpu->arch.sie_block->gcr, 128); + + /* Extended interruption information */ + rc |= put_guest_lc(vcpu, mchk->ext_damage_code, + (u32 __user *) __LC_EXT_DAMAGE_CODE); + rc |= put_guest_lc(vcpu, mchk->failing_storage_address, + (u64 __user *) __LC_MCCK_FAIL_STOR_ADDR); + rc |= write_guest_lc(vcpu, __LC_PSW_SAVE_AREA, &mchk->fixed_logout, + sizeof(mchk->fixed_logout)); + return rc ? -EFAULT : 0; +} + static int __must_check __deliver_machine_check(struct kvm_vcpu *vcpu) { struct kvm_s390_float_interrupt *fi = &vcpu->kvm->arch.float_int; struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int; struct kvm_s390_mchk_info mchk = {}; - unsigned long adtl_status_addr; int deliver = 0; int rc = 0; @@ -449,29 +518,9 @@ static int __must_check __deliver_machine_check(struct kvm_vcpu *vcpu) trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, KVM_S390_MCHK, mchk.cr14, mchk.mcic); - - rc = kvm_s390_vcpu_store_status(vcpu, - KVM_S390_STORE_STATUS_PREFIXED); - rc |= read_guest_lc(vcpu, __LC_VX_SAVE_AREA_ADDR, - &adtl_status_addr, - sizeof(unsigned long)); - rc |= kvm_s390_vcpu_store_adtl_status(vcpu, - adtl_status_addr); - rc |= put_guest_lc(vcpu, mchk.mcic, - (u64 __user *) __LC_MCCK_CODE); - rc |= put_guest_lc(vcpu, mchk.failing_storage_address, - (u64 __user *) __LC_MCCK_FAIL_STOR_ADDR); - rc |= write_guest_lc(vcpu, __LC_PSW_SAVE_AREA, - &mchk.fixed_logout, - sizeof(mchk.fixed_logout)); - rc |= write_guest_lc(vcpu, __LC_MCK_OLD_PSW, - &vcpu->arch.sie_block->gpsw, - sizeof(psw_t)); - rc |= read_guest_lc(vcpu, __LC_MCK_NEW_PSW, - &vcpu->arch.sie_block->gpsw, - sizeof(psw_t)); + rc = __write_machine_check(vcpu, &mchk); } - return rc ? -EFAULT : 0; + return rc; } static int __must_check __deliver_restart(struct kvm_vcpu *vcpu) @@ -821,7 +870,14 @@ static int __must_check __deliver_io(struct kvm_vcpu *vcpu, struct kvm_s390_interrupt_info, list); if (inti) { - VCPU_EVENT(vcpu, 4, "deliver: I/O 0x%llx", inti->type); + if (inti->type & KVM_S390_INT_IO_AI_MASK) + VCPU_EVENT(vcpu, 4, "%s", "deliver: I/O (AI)"); + else + VCPU_EVENT(vcpu, 4, "deliver: I/O %x ss %x schid %04x", + inti->io.subchannel_id >> 8, + inti->io.subchannel_id >> 1 & 0x3, + inti->io.subchannel_nr); + vcpu->stat.deliver_io_int++; trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, inti->type, @@ -977,6 +1033,11 @@ no_timer: void kvm_s390_vcpu_wakeup(struct kvm_vcpu *vcpu) { + /* + * We cannot move this into the if, as the CPU might be already + * in kvm_vcpu_block without having the waitqueue set (polling) + */ + vcpu->valid_wakeup = true; if (swait_active(&vcpu->wq)) { /* * The vcpu gave up the cpu voluntarily, mark it as a good @@ -986,6 +1047,11 @@ void kvm_s390_vcpu_wakeup(struct kvm_vcpu *vcpu) swake_up(&vcpu->wq); vcpu->stat.halt_wakeup++; } + /* + * The VCPU might not be sleeping but is executing the VSIE. Let's + * kick it, so it leaves the SIE to process the request. + */ + kvm_s390_vsie_kick(vcpu); } enum hrtimer_restart kvm_s390_idle_wakeup(struct hrtimer *timer) @@ -1410,6 +1476,13 @@ static int __inject_io(struct kvm *kvm, struct kvm_s390_interrupt_info *inti) } fi->counters[FIRQ_CNTR_IO] += 1; + if (inti->type & KVM_S390_INT_IO_AI_MASK) + VM_EVENT(kvm, 4, "%s", "inject: I/O (AI)"); + else + VM_EVENT(kvm, 4, "inject: I/O %x ss %x schid %04x", + inti->io.subchannel_id >> 8, + inti->io.subchannel_id >> 1 & 0x3, + inti->io.subchannel_nr); isc = int_word_to_isc(inti->io.io_int_word); list = &fi->lists[FIRQ_LIST_IO_ISC_0 + isc]; list_add_tail(&inti->list, list); @@ -1526,13 +1599,6 @@ int kvm_s390_inject_vm(struct kvm *kvm, inti->mchk.mcic = s390int->parm64; break; case KVM_S390_INT_IO_MIN...KVM_S390_INT_IO_MAX: - if (inti->type & KVM_S390_INT_IO_AI_MASK) - VM_EVENT(kvm, 5, "%s", "inject: I/O (AI)"); - else - VM_EVENT(kvm, 5, "inject: I/O css %x ss %x schid %04x", - s390int->type & IOINT_CSSID_MASK, - s390int->type & IOINT_SSID_MASK, - s390int->type & IOINT_SCHID_MASK); inti->io.subchannel_id = s390int->parm >> 16; inti->io.subchannel_nr = s390int->parm & 0x0000ffffu; inti->io.io_int_parm = s390int->parm64 >> 32; @@ -2034,6 +2100,27 @@ static int modify_io_adapter(struct kvm_device *dev, return ret; } +static int clear_io_irq(struct kvm *kvm, struct kvm_device_attr *attr) + +{ + const u64 isc_mask = 0xffUL << 24; /* all iscs set */ + u32 schid; + + if (attr->flags) + return -EINVAL; + if (attr->attr != sizeof(schid)) + return -EINVAL; + if (copy_from_user(&schid, (void __user *) attr->addr, sizeof(schid))) + return -EFAULT; + kfree(kvm_s390_get_io_int(kvm, isc_mask, schid)); + /* + * If userspace is conforming to the architecture, we can have at most + * one pending I/O interrupt per subchannel, so this is effectively a + * clear all. + */ + return 0; +} + static int flic_set_attr(struct kvm_device *dev, struct kvm_device_attr *attr) { int r = 0; @@ -2067,6 +2154,9 @@ static int flic_set_attr(struct kvm_device *dev, struct kvm_device_attr *attr) case KVM_DEV_FLIC_ADAPTER_MODIFY: r = modify_io_adapter(dev, attr); break; + case KVM_DEV_FLIC_CLEAR_IO_IRQ: + r = clear_io_irq(dev->kvm, attr); + break; default: r = -EINVAL; } @@ -2074,6 +2164,23 @@ static int flic_set_attr(struct kvm_device *dev, struct kvm_device_attr *attr) return r; } +static int flic_has_attr(struct kvm_device *dev, + struct kvm_device_attr *attr) +{ + switch (attr->group) { + case KVM_DEV_FLIC_GET_ALL_IRQS: + case KVM_DEV_FLIC_ENQUEUE: + case KVM_DEV_FLIC_CLEAR_IRQS: + case KVM_DEV_FLIC_APF_ENABLE: + case KVM_DEV_FLIC_APF_DISABLE_WAIT: + case KVM_DEV_FLIC_ADAPTER_REGISTER: + case KVM_DEV_FLIC_ADAPTER_MODIFY: + case KVM_DEV_FLIC_CLEAR_IO_IRQ: + return 0; + } + return -ENXIO; +} + static int flic_create(struct kvm_device *dev, u32 type) { if (!dev) @@ -2095,6 +2202,7 @@ struct kvm_device_ops kvm_flic_ops = { .name = "kvm-flic", .get_attr = flic_get_attr, .set_attr = flic_set_attr, + .has_attr = flic_has_attr, .create = flic_create, .destroy = flic_destroy, }; @@ -2190,7 +2298,8 @@ static int set_adapter_int(struct kvm_kernel_irq_routing_entry *e, return ret; } -int kvm_set_routing_entry(struct kvm_kernel_irq_routing_entry *e, +int kvm_set_routing_entry(struct kvm *kvm, + struct kvm_kernel_irq_routing_entry *e, const struct kvm_irq_routing_entry *ue) { int ret; diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 668c087513e5..9c7a1ecfe6bd 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -21,20 +21,24 @@ #include <linux/init.h> #include <linux/kvm.h> #include <linux/kvm_host.h> +#include <linux/mman.h> #include <linux/module.h> #include <linux/random.h> #include <linux/slab.h> #include <linux/timer.h> #include <linux/vmalloc.h> +#include <linux/bitmap.h> #include <asm/asm-offsets.h> #include <asm/lowcore.h> -#include <asm/etr.h> +#include <asm/stp.h> #include <asm/pgtable.h> #include <asm/gmap.h> #include <asm/nmi.h> #include <asm/switch_to.h> #include <asm/isc.h> #include <asm/sclp.h> +#include <asm/cpacf.h> +#include <asm/timex.h> #include "kvm-s390.h" #include "gaccess.h" @@ -61,10 +65,13 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { { "exit_external_request", VCPU_STAT(exit_external_request) }, { "exit_external_interrupt", VCPU_STAT(exit_external_interrupt) }, { "exit_instruction", VCPU_STAT(exit_instruction) }, + { "exit_pei", VCPU_STAT(exit_pei) }, { "exit_program_interruption", VCPU_STAT(exit_program_interruption) }, { "exit_instr_and_program_int", VCPU_STAT(exit_instr_and_program) }, + { "exit_operation_exception", VCPU_STAT(exit_operation_exception) }, { "halt_successful_poll", VCPU_STAT(halt_successful_poll) }, { "halt_attempted_poll", VCPU_STAT(halt_attempted_poll) }, + { "halt_poll_invalid", VCPU_STAT(halt_poll_invalid) }, { "halt_wakeup", VCPU_STAT(halt_wakeup) }, { "instruction_lctlg", VCPU_STAT(instruction_lctlg) }, { "instruction_lctl", VCPU_STAT(instruction_lctl) }, @@ -92,6 +99,8 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { { "instruction_stsi", VCPU_STAT(instruction_stsi) }, { "instruction_stfl", VCPU_STAT(instruction_stfl) }, { "instruction_tprot", VCPU_STAT(instruction_tprot) }, + { "instruction_sthyi", VCPU_STAT(instruction_sthyi) }, + { "instruction_sie", VCPU_STAT(instruction_sie) }, { "instruction_sigp_sense", VCPU_STAT(instruction_sigp_sense) }, { "instruction_sigp_sense_running", VCPU_STAT(instruction_sigp_sense_running) }, { "instruction_sigp_external_call", VCPU_STAT(instruction_sigp_external_call) }, @@ -117,11 +126,13 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { { NULL } }; +/* allow nested virtualization in KVM (if enabled by user space) */ +static int nested; +module_param(nested, int, S_IRUGO); +MODULE_PARM_DESC(nested, "Nested virtualization support"); + /* upper facilities limit for kvm */ -unsigned long kvm_s390_fac_list_mask[] = { - 0xffe6fffbfcfdfc40UL, - 0x005e800000000000UL, -}; +unsigned long kvm_s390_fac_list_mask[16] = { FACILITIES_KVM }; unsigned long kvm_s390_fac_list_mask_size(void) { @@ -129,7 +140,13 @@ unsigned long kvm_s390_fac_list_mask_size(void) return ARRAY_SIZE(kvm_s390_fac_list_mask); } +/* available cpu features supported by kvm */ +static DECLARE_BITMAP(kvm_s390_available_cpu_feat, KVM_S390_VM_CPU_FEAT_NR_BITS); +/* available subfunctions indicated via query / "test bit" */ +static struct kvm_s390_vm_cpu_subfunc kvm_s390_available_subfunc; + static struct gmap_notifier gmap_notifier; +static struct gmap_notifier vsie_gmap_notifier; debug_info_t *kvm_s390_dbf; /* Section: not file related */ @@ -139,7 +156,8 @@ int kvm_arch_hardware_enable(void) return 0; } -static void kvm_gmap_notifier(struct gmap *gmap, unsigned long address); +static void kvm_gmap_notifier(struct gmap *gmap, unsigned long start, + unsigned long end); /* * This callback is executed during stop_machine(). All CPUs are therefore @@ -161,6 +179,8 @@ static int kvm_clock_sync(struct notifier_block *notifier, unsigned long val, vcpu->arch.sie_block->epoch -= *delta; if (vcpu->arch.cputm_enabled) vcpu->arch.cputm_start += *delta; + if (vcpu->arch.vsie_block) + vcpu->arch.vsie_block->epoch -= *delta; } } return NOTIFY_OK; @@ -173,7 +193,9 @@ static struct notifier_block kvm_clock_notifier = { int kvm_arch_hardware_setup(void) { gmap_notifier.notifier_call = kvm_gmap_notifier; - gmap_register_ipte_notifier(&gmap_notifier); + gmap_register_pte_notifier(&gmap_notifier); + vsie_gmap_notifier.notifier_call = kvm_s390_vsie_gmap_notifier; + gmap_register_pte_notifier(&vsie_gmap_notifier); atomic_notifier_chain_register(&s390_epoch_delta_notifier, &kvm_clock_notifier); return 0; @@ -181,11 +203,120 @@ int kvm_arch_hardware_setup(void) void kvm_arch_hardware_unsetup(void) { - gmap_unregister_ipte_notifier(&gmap_notifier); + gmap_unregister_pte_notifier(&gmap_notifier); + gmap_unregister_pte_notifier(&vsie_gmap_notifier); atomic_notifier_chain_unregister(&s390_epoch_delta_notifier, &kvm_clock_notifier); } +static void allow_cpu_feat(unsigned long nr) +{ + set_bit_inv(nr, kvm_s390_available_cpu_feat); +} + +static inline int plo_test_bit(unsigned char nr) +{ + register unsigned long r0 asm("0") = (unsigned long) nr | 0x100; + int cc = 3; /* subfunction not available */ + + asm volatile( + /* Parameter registers are ignored for "test bit" */ + " plo 0,0,0,0(0)\n" + " ipm %0\n" + " srl %0,28\n" + : "=d" (cc) + : "d" (r0) + : "cc"); + return cc == 0; +} + +static void kvm_s390_cpu_feat_init(void) +{ + int i; + + for (i = 0; i < 256; ++i) { + if (plo_test_bit(i)) + kvm_s390_available_subfunc.plo[i >> 3] |= 0x80 >> (i & 7); + } + + if (test_facility(28)) /* TOD-clock steering */ + ptff(kvm_s390_available_subfunc.ptff, + sizeof(kvm_s390_available_subfunc.ptff), + PTFF_QAF); + + if (test_facility(17)) { /* MSA */ + __cpacf_query(CPACF_KMAC, (cpacf_mask_t *) + kvm_s390_available_subfunc.kmac); + __cpacf_query(CPACF_KMC, (cpacf_mask_t *) + kvm_s390_available_subfunc.kmc); + __cpacf_query(CPACF_KM, (cpacf_mask_t *) + kvm_s390_available_subfunc.km); + __cpacf_query(CPACF_KIMD, (cpacf_mask_t *) + kvm_s390_available_subfunc.kimd); + __cpacf_query(CPACF_KLMD, (cpacf_mask_t *) + kvm_s390_available_subfunc.klmd); + } + if (test_facility(76)) /* MSA3 */ + __cpacf_query(CPACF_PCKMO, (cpacf_mask_t *) + kvm_s390_available_subfunc.pckmo); + if (test_facility(77)) { /* MSA4 */ + __cpacf_query(CPACF_KMCTR, (cpacf_mask_t *) + kvm_s390_available_subfunc.kmctr); + __cpacf_query(CPACF_KMF, (cpacf_mask_t *) + kvm_s390_available_subfunc.kmf); + __cpacf_query(CPACF_KMO, (cpacf_mask_t *) + kvm_s390_available_subfunc.kmo); + __cpacf_query(CPACF_PCC, (cpacf_mask_t *) + kvm_s390_available_subfunc.pcc); + } + if (test_facility(57)) /* MSA5 */ + __cpacf_query(CPACF_PPNO, (cpacf_mask_t *) + kvm_s390_available_subfunc.ppno); + + if (MACHINE_HAS_ESOP) + allow_cpu_feat(KVM_S390_VM_CPU_FEAT_ESOP); + /* + * We need SIE support, ESOP (PROT_READ protection for gmap_shadow), + * 64bit SCAO (SCA passthrough) and IDTE (for gmap_shadow unshadowing). + */ + if (!sclp.has_sief2 || !MACHINE_HAS_ESOP || !sclp.has_64bscao || + !test_facility(3) || !nested) + return; + allow_cpu_feat(KVM_S390_VM_CPU_FEAT_SIEF2); + if (sclp.has_64bscao) + allow_cpu_feat(KVM_S390_VM_CPU_FEAT_64BSCAO); + if (sclp.has_siif) + allow_cpu_feat(KVM_S390_VM_CPU_FEAT_SIIF); + if (sclp.has_gpere) + allow_cpu_feat(KVM_S390_VM_CPU_FEAT_GPERE); + if (sclp.has_gsls) + allow_cpu_feat(KVM_S390_VM_CPU_FEAT_GSLS); + if (sclp.has_ib) + allow_cpu_feat(KVM_S390_VM_CPU_FEAT_IB); + if (sclp.has_cei) + allow_cpu_feat(KVM_S390_VM_CPU_FEAT_CEI); + if (sclp.has_ibs) + allow_cpu_feat(KVM_S390_VM_CPU_FEAT_IBS); + /* + * KVM_S390_VM_CPU_FEAT_SKEY: Wrong shadow of PTE.I bits will make + * all skey handling functions read/set the skey from the PGSTE + * instead of the real storage key. + * + * KVM_S390_VM_CPU_FEAT_CMMA: Wrong shadow of PTE.I bits will make + * pages being detected as preserved although they are resident. + * + * KVM_S390_VM_CPU_FEAT_PFMFI: Wrong shadow of PTE.I bits will + * have the same effect as for KVM_S390_VM_CPU_FEAT_SKEY. + * + * For KVM_S390_VM_CPU_FEAT_SKEY, KVM_S390_VM_CPU_FEAT_CMMA and + * KVM_S390_VM_CPU_FEAT_PFMFI, all PTE.I and PGSTE bits have to be + * correctly shadowed. We can do that for the PGSTE but not for PTE.I. + * + * KVM_S390_VM_CPU_FEAT_SIGPIF: Wrong SCB addresses in the SCA. We + * cannot easily shadow the SCA because of the ipte lock. + */ +} + int kvm_arch_init(void *opaque) { kvm_s390_dbf = debug_register("kvm-trace", 32, 1, 7 * sizeof(long)); @@ -197,6 +328,8 @@ int kvm_arch_init(void *opaque) return -ENOMEM; } + kvm_s390_cpu_feat_init(); + /* Register floating interrupt controller interface. */ return kvm_register_device_ops(&kvm_flic_ops, KVM_DEV_TYPE_FLIC); } @@ -242,6 +375,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_S390_USER_STSI: case KVM_CAP_S390_SKEYS: case KVM_CAP_S390_IRQ_STATE: + case KVM_CAP_S390_USER_INSTR0: r = 1; break; case KVM_CAP_S390_MEM_OP: @@ -249,8 +383,11 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) break; case KVM_CAP_NR_VCPUS: case KVM_CAP_MAX_VCPUS: - r = sclp.has_esca ? KVM_S390_ESCA_CPU_SLOTS - : KVM_S390_BSCA_CPU_SLOTS; + r = KVM_S390_BSCA_CPU_SLOTS; + if (!kvm_s390_use_sca_entries()) + r = KVM_MAX_VCPUS; + else if (sclp.has_esca && sclp.has_64bscao) + r = KVM_S390_ESCA_CPU_SLOTS; break; case KVM_CAP_NR_MEMSLOTS: r = KVM_USER_MEM_SLOTS; @@ -333,6 +470,16 @@ out: return r; } +static void icpt_operexc_on_all_vcpus(struct kvm *kvm) +{ + unsigned int i; + struct kvm_vcpu *vcpu; + + kvm_for_each_vcpu(i, vcpu, kvm) { + kvm_s390_sync_request(KVM_REQ_ICPT_OPEREXC, vcpu); + } +} + static int kvm_vm_ioctl_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap) { int r; @@ -353,7 +500,7 @@ static int kvm_vm_ioctl_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap) break; case KVM_CAP_S390_VECTOR_REGISTERS: mutex_lock(&kvm->lock); - if (atomic_read(&kvm->online_vcpus)) { + if (kvm->created_vcpus) { r = -EBUSY; } else if (MACHINE_HAS_VX) { set_kvm_facility(kvm->arch.model.fac_mask, 129); @@ -368,7 +515,7 @@ static int kvm_vm_ioctl_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap) case KVM_CAP_S390_RI: r = -EINVAL; mutex_lock(&kvm->lock); - if (atomic_read(&kvm->online_vcpus)) { + if (kvm->created_vcpus) { r = -EBUSY; } else if (test_facility(64)) { set_kvm_facility(kvm->arch.model.fac_mask, 64); @@ -384,6 +531,12 @@ static int kvm_vm_ioctl_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap) kvm->arch.user_stsi = 1; r = 0; break; + case KVM_CAP_S390_USER_INSTR0: + VM_EVENT(kvm, 3, "%s", "ENABLE: CAP_S390_USER_INSTR0"); + kvm->arch.user_instr0 = 1; + icpt_operexc_on_all_vcpus(kvm); + r = 0; + break; default: r = -EINVAL; break; @@ -416,21 +569,23 @@ static int kvm_s390_set_mem_control(struct kvm *kvm, struct kvm_device_attr *att unsigned int idx; switch (attr->attr) { case KVM_S390_VM_MEM_ENABLE_CMMA: - /* enable CMMA only for z10 and later (EDAT_1) */ - ret = -EINVAL; - if (!MACHINE_IS_LPAR || !MACHINE_HAS_EDAT1) + ret = -ENXIO; + if (!sclp.has_cmma) break; ret = -EBUSY; VM_EVENT(kvm, 3, "%s", "ENABLE: CMMA support"); mutex_lock(&kvm->lock); - if (atomic_read(&kvm->online_vcpus) == 0) { + if (!kvm->created_vcpus) { kvm->arch.use_cmma = 1; ret = 0; } mutex_unlock(&kvm->lock); break; case KVM_S390_VM_MEM_CLR_CMMA: + ret = -ENXIO; + if (!sclp.has_cmma) + break; ret = -EINVAL; if (!kvm->arch.use_cmma) break; @@ -459,20 +614,20 @@ static int kvm_s390_set_mem_control(struct kvm *kvm, struct kvm_device_attr *att if (!new_limit) return -EINVAL; - /* gmap_alloc takes last usable address */ + /* gmap_create takes last usable address */ if (new_limit != KVM_S390_NO_MEM_LIMIT) new_limit -= 1; ret = -EBUSY; mutex_lock(&kvm->lock); - if (atomic_read(&kvm->online_vcpus) == 0) { - /* gmap_alloc will round the limit up */ - struct gmap *new = gmap_alloc(current->mm, new_limit); + if (!kvm->created_vcpus) { + /* gmap_create will round the limit up */ + struct gmap *new = gmap_create(current->mm, new_limit); if (!new) { ret = -ENOMEM; } else { - gmap_free(kvm->arch.gmap); + gmap_remove(kvm->arch.gmap); new->private = kvm; kvm->arch.gmap = new; ret = 0; @@ -638,10 +793,11 @@ static int kvm_s390_get_tod(struct kvm *kvm, struct kvm_device_attr *attr) static int kvm_s390_set_processor(struct kvm *kvm, struct kvm_device_attr *attr) { struct kvm_s390_vm_cpu_processor *proc; + u16 lowest_ibc, unblocked_ibc; int ret = 0; mutex_lock(&kvm->lock); - if (atomic_read(&kvm->online_vcpus)) { + if (kvm->created_vcpus) { ret = -EBUSY; goto out; } @@ -652,9 +808,17 @@ static int kvm_s390_set_processor(struct kvm *kvm, struct kvm_device_attr *attr) } if (!copy_from_user(proc, (void __user *)attr->addr, sizeof(*proc))) { - memcpy(&kvm->arch.model.cpu_id, &proc->cpuid, - sizeof(struct cpuid)); - kvm->arch.model.ibc = proc->ibc; + kvm->arch.model.cpuid = proc->cpuid; + lowest_ibc = sclp.ibc >> 16 & 0xfff; + unblocked_ibc = sclp.ibc & 0xfff; + if (lowest_ibc && proc->ibc) { + if (proc->ibc > unblocked_ibc) + kvm->arch.model.ibc = unblocked_ibc; + else if (proc->ibc < lowest_ibc) + kvm->arch.model.ibc = lowest_ibc; + else + kvm->arch.model.ibc = proc->ibc; + } memcpy(kvm->arch.model.fac_list, proc->fac_list, S390_ARCH_FAC_LIST_SIZE_BYTE); } else @@ -665,6 +829,39 @@ out: return ret; } +static int kvm_s390_set_processor_feat(struct kvm *kvm, + struct kvm_device_attr *attr) +{ + struct kvm_s390_vm_cpu_feat data; + int ret = -EBUSY; + + if (copy_from_user(&data, (void __user *)attr->addr, sizeof(data))) + return -EFAULT; + if (!bitmap_subset((unsigned long *) data.feat, + kvm_s390_available_cpu_feat, + KVM_S390_VM_CPU_FEAT_NR_BITS)) + return -EINVAL; + + mutex_lock(&kvm->lock); + if (!atomic_read(&kvm->online_vcpus)) { + bitmap_copy(kvm->arch.cpu_feat, (unsigned long *) data.feat, + KVM_S390_VM_CPU_FEAT_NR_BITS); + ret = 0; + } + mutex_unlock(&kvm->lock); + return ret; +} + +static int kvm_s390_set_processor_subfunc(struct kvm *kvm, + struct kvm_device_attr *attr) +{ + /* + * Once supported by kernel + hw, we have to store the subfunctions + * in kvm->arch and remember that user space configured them. + */ + return -ENXIO; +} + static int kvm_s390_set_cpu_model(struct kvm *kvm, struct kvm_device_attr *attr) { int ret = -ENXIO; @@ -673,6 +870,12 @@ static int kvm_s390_set_cpu_model(struct kvm *kvm, struct kvm_device_attr *attr) case KVM_S390_VM_CPU_PROCESSOR: ret = kvm_s390_set_processor(kvm, attr); break; + case KVM_S390_VM_CPU_PROCESSOR_FEAT: + ret = kvm_s390_set_processor_feat(kvm, attr); + break; + case KVM_S390_VM_CPU_PROCESSOR_SUBFUNC: + ret = kvm_s390_set_processor_subfunc(kvm, attr); + break; } return ret; } @@ -687,7 +890,7 @@ static int kvm_s390_get_processor(struct kvm *kvm, struct kvm_device_attr *attr) ret = -ENOMEM; goto out; } - memcpy(&proc->cpuid, &kvm->arch.model.cpu_id, sizeof(struct cpuid)); + proc->cpuid = kvm->arch.model.cpuid; proc->ibc = kvm->arch.model.ibc; memcpy(&proc->fac_list, kvm->arch.model.fac_list, S390_ARCH_FAC_LIST_SIZE_BYTE); @@ -721,6 +924,50 @@ out: return ret; } +static int kvm_s390_get_processor_feat(struct kvm *kvm, + struct kvm_device_attr *attr) +{ + struct kvm_s390_vm_cpu_feat data; + + bitmap_copy((unsigned long *) data.feat, kvm->arch.cpu_feat, + KVM_S390_VM_CPU_FEAT_NR_BITS); + if (copy_to_user((void __user *)attr->addr, &data, sizeof(data))) + return -EFAULT; + return 0; +} + +static int kvm_s390_get_machine_feat(struct kvm *kvm, + struct kvm_device_attr *attr) +{ + struct kvm_s390_vm_cpu_feat data; + + bitmap_copy((unsigned long *) data.feat, + kvm_s390_available_cpu_feat, + KVM_S390_VM_CPU_FEAT_NR_BITS); + if (copy_to_user((void __user *)attr->addr, &data, sizeof(data))) + return -EFAULT; + return 0; +} + +static int kvm_s390_get_processor_subfunc(struct kvm *kvm, + struct kvm_device_attr *attr) +{ + /* + * Once we can actually configure subfunctions (kernel + hw support), + * we have to check if they were already set by user space, if so copy + * them from kvm->arch. + */ + return -ENXIO; +} + +static int kvm_s390_get_machine_subfunc(struct kvm *kvm, + struct kvm_device_attr *attr) +{ + if (copy_to_user((void __user *)attr->addr, &kvm_s390_available_subfunc, + sizeof(struct kvm_s390_vm_cpu_subfunc))) + return -EFAULT; + return 0; +} static int kvm_s390_get_cpu_model(struct kvm *kvm, struct kvm_device_attr *attr) { int ret = -ENXIO; @@ -732,6 +979,18 @@ static int kvm_s390_get_cpu_model(struct kvm *kvm, struct kvm_device_attr *attr) case KVM_S390_VM_CPU_MACHINE: ret = kvm_s390_get_machine(kvm, attr); break; + case KVM_S390_VM_CPU_PROCESSOR_FEAT: + ret = kvm_s390_get_processor_feat(kvm, attr); + break; + case KVM_S390_VM_CPU_MACHINE_FEAT: + ret = kvm_s390_get_machine_feat(kvm, attr); + break; + case KVM_S390_VM_CPU_PROCESSOR_SUBFUNC: + ret = kvm_s390_get_processor_subfunc(kvm, attr); + break; + case KVM_S390_VM_CPU_MACHINE_SUBFUNC: + ret = kvm_s390_get_machine_subfunc(kvm, attr); + break; } return ret; } @@ -792,6 +1051,8 @@ static int kvm_s390_vm_has_attr(struct kvm *kvm, struct kvm_device_attr *attr) switch (attr->attr) { case KVM_S390_VM_MEM_ENABLE_CMMA: case KVM_S390_VM_MEM_CLR_CMMA: + ret = sclp.has_cmma ? 0 : -ENXIO; + break; case KVM_S390_VM_MEM_LIMIT_SIZE: ret = 0; break; @@ -815,8 +1076,13 @@ static int kvm_s390_vm_has_attr(struct kvm *kvm, struct kvm_device_attr *attr) switch (attr->attr) { case KVM_S390_VM_CPU_PROCESSOR: case KVM_S390_VM_CPU_MACHINE: + case KVM_S390_VM_CPU_PROCESSOR_FEAT: + case KVM_S390_VM_CPU_MACHINE_FEAT: + case KVM_S390_VM_CPU_MACHINE_SUBFUNC: ret = 0; break; + /* configuring subfunctions is not supported yet */ + case KVM_S390_VM_CPU_PROCESSOR_SUBFUNC: default: ret = -ENXIO; break; @@ -847,7 +1113,6 @@ static long kvm_s390_get_skeys(struct kvm *kvm, struct kvm_s390_skeys *args) { uint8_t *keys; uint64_t hva; - unsigned long curkey; int i, r = 0; if (args->flags != 0) @@ -868,26 +1133,27 @@ static long kvm_s390_get_skeys(struct kvm *kvm, struct kvm_s390_skeys *args) if (!keys) return -ENOMEM; + down_read(¤t->mm->mmap_sem); for (i = 0; i < args->count; i++) { hva = gfn_to_hva(kvm, args->start_gfn + i); if (kvm_is_error_hva(hva)) { r = -EFAULT; - goto out; + break; } - curkey = get_guest_storage_key(current->mm, hva); - if (IS_ERR_VALUE(curkey)) { - r = curkey; - goto out; - } - keys[i] = curkey; + r = get_guest_storage_key(current->mm, hva, &keys[i]); + if (r) + break; + } + up_read(¤t->mm->mmap_sem); + + if (!r) { + r = copy_to_user((uint8_t __user *)args->skeydata_addr, keys, + sizeof(uint8_t) * args->count); + if (r) + r = -EFAULT; } - r = copy_to_user((uint8_t __user *)args->skeydata_addr, keys, - sizeof(uint8_t) * args->count); - if (r) - r = -EFAULT; -out: kvfree(keys); return r; } @@ -924,24 +1190,25 @@ static long kvm_s390_set_skeys(struct kvm *kvm, struct kvm_s390_skeys *args) if (r) goto out; + down_read(¤t->mm->mmap_sem); for (i = 0; i < args->count; i++) { hva = gfn_to_hva(kvm, args->start_gfn + i); if (kvm_is_error_hva(hva)) { r = -EFAULT; - goto out; + break; } /* Lowest order bit is reserved */ if (keys[i] & 0x01) { r = -EINVAL; - goto out; + break; } - r = set_guest_storage_key(current->mm, hva, - (unsigned long)keys[i], 0); + r = set_guest_storage_key(current->mm, hva, keys[i], 0); if (r) - goto out; + break; } + up_read(¤t->mm->mmap_sem); out: kvfree(keys); return r; @@ -1081,10 +1348,13 @@ static void kvm_s390_set_crycb_format(struct kvm *kvm) kvm->arch.crypto.crycbd |= CRYCB_FORMAT1; } -static void kvm_s390_get_cpu_id(struct cpuid *cpu_id) +static u64 kvm_s390_get_initial_cpuid(void) { - get_cpu_id(cpu_id); - cpu_id->version = 0xff; + struct cpuid cpuid; + + get_cpu_id(&cpuid); + cpuid.version = 0xff; + return *((u64 *) &cpuid); } static void kvm_s390_crypto_init(struct kvm *kvm) @@ -1115,6 +1385,7 @@ static void sca_dispose(struct kvm *kvm) int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) { + gfp_t alloc_flags = GFP_KERNEL; int i, rc; char debug_name[16]; static unsigned long sca_offset; @@ -1136,9 +1407,13 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) rc = -ENOMEM; + ratelimit_state_init(&kvm->arch.sthyi_limit, 5 * HZ, 500); + kvm->arch.use_esca = 0; /* start with basic SCA */ + if (!sclp.has_64bscao) + alloc_flags |= GFP_DMA; rwlock_init(&kvm->arch.sca_lock); - kvm->arch.sca = (struct bsca_block *) get_zeroed_page(GFP_KERNEL); + kvm->arch.sca = (struct bsca_block *) get_zeroed_page(alloc_flags); if (!kvm->arch.sca) goto out_err; spin_lock(&kvm_lock); @@ -1175,7 +1450,10 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) memcpy(kvm->arch.model.fac_list, kvm->arch.model.fac_mask, S390_ARCH_FAC_LIST_SIZE_BYTE); - kvm_s390_get_cpu_id(&kvm->arch.model.cpu_id); + set_kvm_facility(kvm->arch.model.fac_mask, 74); + set_kvm_facility(kvm->arch.model.fac_list, 74); + + kvm->arch.model.cpuid = kvm_s390_get_initial_cpuid(); kvm->arch.model.ibc = sclp.ibc & 0x0fff; kvm_s390_crypto_init(kvm); @@ -1198,7 +1476,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) else kvm->arch.mem_limit = min_t(unsigned long, TASK_MAX_SIZE, sclp.hamax + 1); - kvm->arch.gmap = gmap_alloc(current->mm, kvm->arch.mem_limit - 1); + kvm->arch.gmap = gmap_create(current->mm, kvm->arch.mem_limit - 1); if (!kvm->arch.gmap) goto out_err; kvm->arch.gmap->private = kvm; @@ -1210,6 +1488,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) kvm->arch.epoch = 0; spin_lock_init(&kvm->arch.start_stop_lock); + kvm_s390_vsie_init(kvm); KVM_EVENT(3, "vm 0x%pK created by pid %u", kvm, current->pid); return 0; @@ -1221,6 +1500,16 @@ out_err: return rc; } +bool kvm_arch_has_vcpu_debugfs(void) +{ + return false; +} + +int kvm_arch_create_vcpu_debugfs(struct kvm_vcpu *vcpu) +{ + return 0; +} + void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) { VCPU_EVENT(vcpu, 3, "%s", "free cpu"); @@ -1231,7 +1520,7 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) sca_del_vcpu(vcpu); if (kvm_is_ucontrol(vcpu->kvm)) - gmap_free(vcpu->arch.gmap); + gmap_remove(vcpu->arch.gmap); if (vcpu->kvm->arch.use_cmma) kvm_s390_vcpu_unsetup_cmma(vcpu); @@ -1264,16 +1553,17 @@ void kvm_arch_destroy_vm(struct kvm *kvm) debug_unregister(kvm->arch.dbf); free_page((unsigned long)kvm->arch.sie_page2); if (!kvm_is_ucontrol(kvm)) - gmap_free(kvm->arch.gmap); + gmap_remove(kvm->arch.gmap); kvm_s390_destroy_adapters(kvm); kvm_s390_clear_float_irqs(kvm); + kvm_s390_vsie_destroy(kvm); KVM_EVENT(3, "vm 0x%pK destroyed", kvm); } /* Section: vcpu related */ static int __kvm_ucontrol_vcpu_init(struct kvm_vcpu *vcpu) { - vcpu->arch.gmap = gmap_alloc(current->mm, -1UL); + vcpu->arch.gmap = gmap_create(current->mm, -1UL); if (!vcpu->arch.gmap) return -ENOMEM; vcpu->arch.gmap->private = vcpu->kvm; @@ -1283,6 +1573,8 @@ static int __kvm_ucontrol_vcpu_init(struct kvm_vcpu *vcpu) static void sca_del_vcpu(struct kvm_vcpu *vcpu) { + if (!kvm_s390_use_sca_entries()) + return; read_lock(&vcpu->kvm->arch.sca_lock); if (vcpu->kvm->arch.use_esca) { struct esca_block *sca = vcpu->kvm->arch.sca; @@ -1300,6 +1592,13 @@ static void sca_del_vcpu(struct kvm_vcpu *vcpu) static void sca_add_vcpu(struct kvm_vcpu *vcpu) { + if (!kvm_s390_use_sca_entries()) { + struct bsca_block *sca = vcpu->kvm->arch.sca; + + /* we still need the basic sca for the ipte control */ + vcpu->arch.sie_block->scaoh = (__u32)(((__u64)sca) >> 32); + vcpu->arch.sie_block->scaol = (__u32)(__u64)sca; + } read_lock(&vcpu->kvm->arch.sca_lock); if (vcpu->kvm->arch.use_esca) { struct esca_block *sca = vcpu->kvm->arch.sca; @@ -1380,9 +1679,14 @@ static int sca_can_add_vcpu(struct kvm *kvm, unsigned int id) { int rc; + if (!kvm_s390_use_sca_entries()) { + if (id < KVM_MAX_VCPUS) + return true; + return false; + } if (id < KVM_S390_BSCA_CPU_SLOTS) return true; - if (!sclp.has_esca) + if (!sclp.has_esca || !sclp.has_64bscao) return false; mutex_lock(&kvm->lock); @@ -1402,6 +1706,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) KVM_SYNC_CRS | KVM_SYNC_ARCH0 | KVM_SYNC_PFAULT; + kvm_s390_set_prefix(vcpu, 0); if (test_kvm_facility(vcpu->kvm, 64)) vcpu->run->kvm_valid_regs |= KVM_SYNC_RICCB; /* fprs can be synchronized via vrs, even if the guest has no vx. With @@ -1523,7 +1828,7 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) save_access_regs(vcpu->arch.host_acrs); restore_access_regs(vcpu->run->s.regs.acrs); - gmap_enable(vcpu->arch.gmap); + gmap_enable(vcpu->arch.enabled_gmap); atomic_or(CPUSTAT_RUNNING, &vcpu->arch.sie_block->cpuflags); if (vcpu->arch.cputm_enabled && !is_vcpu_idle(vcpu)) __start_cpu_timer_accounting(vcpu); @@ -1536,7 +1841,8 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) if (vcpu->arch.cputm_enabled && !is_vcpu_idle(vcpu)) __stop_cpu_timer_accounting(vcpu); atomic_andnot(CPUSTAT_RUNNING, &vcpu->arch.sie_block->cpuflags); - gmap_disable(vcpu->arch.gmap); + vcpu->arch.enabled_gmap = gmap_get_enabled(); + gmap_disable(vcpu->arch.enabled_gmap); /* Save guest register state */ save_fpu_regs(); @@ -1585,7 +1891,10 @@ void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu) vcpu->arch.gmap = vcpu->kvm->arch.gmap; sca_add_vcpu(vcpu); } - + if (test_kvm_facility(vcpu->kvm, 74) || vcpu->kvm->arch.user_instr0) + vcpu->arch.sie_block->ictl |= ICTL_OPEREXC; + /* make vcpu_load load the right gmap on the first trigger */ + vcpu->arch.enabled_gmap = vcpu->arch.gmap; } static void kvm_s390_vcpu_crypto_setup(struct kvm_vcpu *vcpu) @@ -1624,7 +1933,6 @@ static void kvm_s390_vcpu_setup_model(struct kvm_vcpu *vcpu) { struct kvm_s390_cpu_model *model = &vcpu->kvm->arch.model; - vcpu->arch.cpu_id = model->cpu_id; vcpu->arch.sie_block->ibc = model->ibc; if (test_kvm_facility(vcpu->kvm, 7)) vcpu->arch.sie_block->fac = (u32)(u64) model->fac_list; @@ -1645,18 +1953,25 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) kvm_s390_vcpu_setup_model(vcpu); - vcpu->arch.sie_block->ecb = 6; - if (test_kvm_facility(vcpu->kvm, 50) && test_kvm_facility(vcpu->kvm, 73)) + /* pgste_set_pte has special handling for !MACHINE_HAS_ESOP */ + if (MACHINE_HAS_ESOP) + vcpu->arch.sie_block->ecb |= 0x02; + if (test_kvm_facility(vcpu->kvm, 9)) + vcpu->arch.sie_block->ecb |= 0x04; + if (test_kvm_facility(vcpu->kvm, 73)) vcpu->arch.sie_block->ecb |= 0x10; - vcpu->arch.sie_block->ecb2 = 8; - vcpu->arch.sie_block->eca = 0xC1002000U; + if (test_kvm_facility(vcpu->kvm, 8) && sclp.has_pfmfi) + vcpu->arch.sie_block->ecb2 |= 0x08; + vcpu->arch.sie_block->eca = 0x1002000U; + if (sclp.has_cei) + vcpu->arch.sie_block->eca |= 0x80000000U; + if (sclp.has_ib) + vcpu->arch.sie_block->eca |= 0x40000000U; if (sclp.has_siif) vcpu->arch.sie_block->eca |= 1; if (sclp.has_sigpif) vcpu->arch.sie_block->eca |= 0x10000000U; - if (test_kvm_facility(vcpu->kvm, 64)) - vcpu->arch.sie_block->ecb3 |= 0x01; if (test_kvm_facility(vcpu->kvm, 129)) { vcpu->arch.sie_block->eca |= 0x00020000; vcpu->arch.sie_block->ecd |= 0x20000000; @@ -1700,6 +2015,10 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, vcpu->arch.sie_block = &sie_page->sie_block; vcpu->arch.sie_block->itdba = (unsigned long) &sie_page->itdb; + /* the real guest size will always be smaller than msl */ + vcpu->arch.sie_block->mso = 0; + vcpu->arch.sie_block->msl = sclp.hamax; + vcpu->arch.sie_block->icpua = id; spin_lock_init(&vcpu->arch.local_int.lock); vcpu->arch.local_int.float_int = &kvm->arch.float_int; @@ -1768,16 +2087,25 @@ void kvm_s390_sync_request(int req, struct kvm_vcpu *vcpu) kvm_s390_vcpu_request(vcpu); } -static void kvm_gmap_notifier(struct gmap *gmap, unsigned long address) +static void kvm_gmap_notifier(struct gmap *gmap, unsigned long start, + unsigned long end) { - int i; struct kvm *kvm = gmap->private; struct kvm_vcpu *vcpu; + unsigned long prefix; + int i; + if (gmap_is_shadow(gmap)) + return; + if (start >= 1UL << 31) + /* We are only interested in prefix pages */ + return; kvm_for_each_vcpu(i, vcpu, kvm) { /* match against both prefix pages */ - if (kvm_s390_get_prefix(vcpu) == (address & ~0x1000UL)) { - VCPU_EVENT(vcpu, 2, "gmap notifier for %lx", address); + prefix = kvm_s390_get_prefix(vcpu); + if (prefix <= end && start <= prefix + 2*PAGE_SIZE - 1) { + VCPU_EVENT(vcpu, 2, "gmap notifier for %lx-%lx", + start, end); kvm_s390_sync_request(KVM_REQ_MMU_RELOAD, vcpu); } } @@ -1935,9 +2263,10 @@ int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) return -EINVAL; current->thread.fpu.fpc = fpu->fpc; if (MACHINE_HAS_VX) - convert_fp_to_vx(current->thread.fpu.vxrs, (freg_t *)fpu->fprs); + convert_fp_to_vx((__vector128 *) vcpu->run->s.regs.vrs, + (freg_t *) fpu->fprs); else - memcpy(current->thread.fpu.fprs, &fpu->fprs, sizeof(fpu->fprs)); + memcpy(vcpu->run->s.regs.fprs, &fpu->fprs, sizeof(fpu->fprs)); return 0; } @@ -1946,9 +2275,10 @@ int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) /* make sure we have the latest values */ save_fpu_regs(); if (MACHINE_HAS_VX) - convert_vx_to_fp((freg_t *)fpu->fprs, current->thread.fpu.vxrs); + convert_vx_to_fp((freg_t *) fpu->fprs, + (__vector128 *) vcpu->run->s.regs.vrs); else - memcpy(fpu->fprs, current->thread.fpu.fprs, sizeof(fpu->fprs)); + memcpy(fpu->fprs, vcpu->run->s.regs.fprs, sizeof(fpu->fprs)); fpu->fpc = current->thread.fpu.fpc; return 0; } @@ -1986,6 +2316,8 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, if (dbg->control & ~VALID_GUESTDBG_FLAGS) return -EINVAL; + if (!sclp.has_gpere) + return -EINVAL; if (dbg->control & KVM_GUESTDBG_ENABLE) { vcpu->guest_debug = dbg->control; @@ -2054,18 +2386,20 @@ retry: return 0; /* * We use MMU_RELOAD just to re-arm the ipte notifier for the - * guest prefix page. gmap_ipte_notify will wait on the ptl lock. + * guest prefix page. gmap_mprotect_notify will wait on the ptl lock. * This ensures that the ipte instruction for this request has * already finished. We might race against a second unmapper that * wants to set the blocking bit. Lets just retry the request loop. */ if (kvm_check_request(KVM_REQ_MMU_RELOAD, vcpu)) { int rc; - rc = gmap_ipte_notify(vcpu->arch.gmap, - kvm_s390_get_prefix(vcpu), - PAGE_SIZE * 2); - if (rc) + rc = gmap_mprotect_notify(vcpu->arch.gmap, + kvm_s390_get_prefix(vcpu), + PAGE_SIZE * 2, PROT_WRITE); + if (rc) { + kvm_make_request(KVM_REQ_MMU_RELOAD, vcpu); return rc; + } goto retry; } @@ -2092,6 +2426,11 @@ retry: goto retry; } + if (kvm_check_request(KVM_REQ_ICPT_OPEREXC, vcpu)) { + vcpu->arch.sie_block->ictl |= ICTL_OPEREXC; + goto retry; + } + /* nothing to do, just clear the request */ clear_bit(KVM_REQ_UNHALT, &vcpu->requests); @@ -2346,14 +2685,14 @@ static int __vcpu_run(struct kvm_vcpu *vcpu) * guest_enter and guest_exit should be no uaccess. */ local_irq_disable(); - __kvm_guest_enter(); + guest_enter_irqoff(); __disable_cpu_timer_accounting(vcpu); local_irq_enable(); exit_reason = sie64a(vcpu->arch.sie_block, vcpu->run->s.regs.gprs); local_irq_disable(); __enable_cpu_timer_accounting(vcpu); - __kvm_guest_exit(); + guest_exit_irqoff(); local_irq_enable(); vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); @@ -2389,6 +2728,19 @@ static void sync_regs(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) if (vcpu->arch.pfault_token == KVM_S390_PFAULT_TOKEN_INVALID) kvm_clear_async_pf_completion_queue(vcpu); } + /* + * If userspace sets the riccb (e.g. after migration) to a valid state, + * we should enable RI here instead of doing the lazy enablement. + */ + if ((kvm_run->kvm_dirty_regs & KVM_SYNC_RICCB) && + test_kvm_facility(vcpu->kvm, 64)) { + struct runtime_instr_cb *riccb = + (struct runtime_instr_cb *) &kvm_run->s.regs.riccb; + + if (riccb->valid) + vcpu->arch.sie_block->ecb3 |= 0x01; + } + kvm_run->kvm_dirty_regs = 0; } @@ -2532,38 +2884,6 @@ int kvm_s390_vcpu_store_status(struct kvm_vcpu *vcpu, unsigned long addr) return kvm_s390_store_status_unloaded(vcpu, addr); } -/* - * store additional status at address - */ -int kvm_s390_store_adtl_status_unloaded(struct kvm_vcpu *vcpu, - unsigned long gpa) -{ - /* Only bits 0-53 are used for address formation */ - if (!(gpa & ~0x3ff)) - return 0; - - return write_guest_abs(vcpu, gpa & ~0x3ff, - (void *)&vcpu->run->s.regs.vrs, 512); -} - -int kvm_s390_vcpu_store_adtl_status(struct kvm_vcpu *vcpu, unsigned long addr) -{ - if (!test_kvm_facility(vcpu->kvm, 129)) - return 0; - - /* - * The guest VXRS are in the host VXRs due to the lazy - * copying in vcpu load/put. We can simply call save_fpu_regs() - * to save the current register state because we are in the - * middle of a load/put cycle. - * - * Let's update our copies before we save it into the save area. - */ - save_fpu_regs(); - - return kvm_s390_store_adtl_status_unloaded(vcpu, addr); -} - static void __disable_ibs_on_vcpu(struct kvm_vcpu *vcpu) { kvm_check_request(KVM_REQ_ENABLE_IBS, vcpu); @@ -2582,6 +2902,8 @@ static void __disable_ibs_on_all_vcpus(struct kvm *kvm) static void __enable_ibs_on_vcpu(struct kvm_vcpu *vcpu) { + if (!sclp.has_ibs) + return; kvm_check_request(KVM_REQ_DISABLE_IBS, vcpu); kvm_s390_sync_request(KVM_REQ_ENABLE_IBS, vcpu); } @@ -2971,13 +3293,31 @@ void kvm_arch_commit_memory_region(struct kvm *kvm, return; } +static inline unsigned long nonhyp_mask(int i) +{ + unsigned int nonhyp_fai = (sclp.hmfai << i * 2) >> 30; + + return 0x0000ffffffffffffUL >> (nonhyp_fai << 4); +} + +void kvm_arch_vcpu_block_finish(struct kvm_vcpu *vcpu) +{ + vcpu->valid_wakeup = false; +} + static int __init kvm_s390_init(void) { + int i; + if (!sclp.has_sief2) { pr_info("SIE not available\n"); return -ENODEV; } + for (i = 0; i < 16; i++) + kvm_s390_fac_list_mask[i] |= + S390_lowcore.stfle_fac_list[i] & nonhyp_mask(i); + return kvm_init(NULL, sizeof(struct kvm_vcpu), 0, THIS_MODULE); } diff --git a/arch/s390/kvm/kvm-s390.h b/arch/s390/kvm/kvm-s390.h index 8621ab00ec8e..3a4e97f1a9e6 100644 --- a/arch/s390/kvm/kvm-s390.h +++ b/arch/s390/kvm/kvm-s390.h @@ -20,6 +20,7 @@ #include <linux/kvm_host.h> #include <asm/facility.h> #include <asm/processor.h> +#include <asm/sclp.h> typedef int (*intercept_handler_t)(struct kvm_vcpu *vcpu); @@ -56,7 +57,7 @@ static inline int is_vcpu_stopped(struct kvm_vcpu *vcpu) static inline int is_vcpu_idle(struct kvm_vcpu *vcpu) { - return atomic_read(&vcpu->arch.sie_block->cpuflags) & CPUSTAT_WAIT; + return test_bit(vcpu->vcpu_id, vcpu->arch.local_int.float_int->idle_mask); } static inline int kvm_is_ucontrol(struct kvm *kvm) @@ -175,6 +176,12 @@ static inline int set_kvm_facility(u64 *fac_list, unsigned long nr) return 0; } +static inline int test_kvm_cpu_feat(struct kvm *kvm, unsigned long nr) +{ + WARN_ON_ONCE(nr >= KVM_S390_VM_CPU_FEAT_NR_BITS); + return test_bit_inv(nr, kvm->arch.cpu_feat); +} + /* are cpu states controlled by user space */ static inline int kvm_s390_user_cpu_state_ctrl(struct kvm *kvm) { @@ -232,11 +239,14 @@ static inline void kvm_s390_forward_psw(struct kvm_vcpu *vcpu, int ilen) } static inline void kvm_s390_retry_instr(struct kvm_vcpu *vcpu) { + /* don't inject PER events if we re-execute the instruction */ + vcpu->arch.sie_block->icptstatus &= ~0x02; kvm_s390_rewind_psw(vcpu, kvm_s390_get_ilen(vcpu)); } /* implemented in priv.c */ int is_valid_psw(psw_t *psw); +int kvm_s390_handle_aa(struct kvm_vcpu *vcpu); int kvm_s390_handle_b2(struct kvm_vcpu *vcpu); int kvm_s390_handle_e5(struct kvm_vcpu *vcpu); int kvm_s390_handle_01(struct kvm_vcpu *vcpu); @@ -246,18 +256,26 @@ int kvm_s390_handle_stctl(struct kvm_vcpu *vcpu); int kvm_s390_handle_lctl(struct kvm_vcpu *vcpu); int kvm_s390_handle_eb(struct kvm_vcpu *vcpu); +/* implemented in vsie.c */ +int kvm_s390_handle_vsie(struct kvm_vcpu *vcpu); +void kvm_s390_vsie_kick(struct kvm_vcpu *vcpu); +void kvm_s390_vsie_gmap_notifier(struct gmap *gmap, unsigned long start, + unsigned long end); +void kvm_s390_vsie_init(struct kvm *kvm); +void kvm_s390_vsie_destroy(struct kvm *kvm); + /* implemented in sigp.c */ int kvm_s390_handle_sigp(struct kvm_vcpu *vcpu); int kvm_s390_handle_sigp_pei(struct kvm_vcpu *vcpu); +/* implemented in sthyi.c */ +int handle_sthyi(struct kvm_vcpu *vcpu); + /* implemented in kvm-s390.c */ void kvm_s390_set_tod_clock(struct kvm *kvm, u64 tod); long kvm_arch_fault_in_page(struct kvm_vcpu *vcpu, gpa_t gpa, int writable); int kvm_s390_store_status_unloaded(struct kvm_vcpu *vcpu, unsigned long addr); -int kvm_s390_store_adtl_status_unloaded(struct kvm_vcpu *vcpu, - unsigned long addr); int kvm_s390_vcpu_store_status(struct kvm_vcpu *vcpu, unsigned long addr); -int kvm_s390_vcpu_store_adtl_status(struct kvm_vcpu *vcpu, unsigned long addr); void kvm_s390_vcpu_start(struct kvm_vcpu *vcpu); void kvm_s390_vcpu_stop(struct kvm_vcpu *vcpu); void kvm_s390_vcpu_block(struct kvm_vcpu *vcpu); @@ -360,6 +378,7 @@ int kvm_s390_import_bp_data(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg); void kvm_s390_clear_bp_data(struct kvm_vcpu *vcpu); void kvm_s390_prepare_debug_exit(struct kvm_vcpu *vcpu); +int kvm_s390_handle_per_ifetch_icpt(struct kvm_vcpu *vcpu); void kvm_s390_handle_per_event(struct kvm_vcpu *vcpu); /* support for Basic/Extended SCA handling */ @@ -369,4 +388,13 @@ static inline union ipte_control *kvm_s390_get_ipte_control(struct kvm *kvm) return &sca->ipte_control; } +static inline int kvm_s390_use_sca_entries(void) +{ + /* + * Without SIGP interpretation, only SRS interpretation (if available) + * might use the entries. By not setting the entries and keeping them + * invalid, hardware will not access them but intercept. + */ + return sclp.has_sigpif; +} #endif diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c index 0a1591d3d25d..e18435355c16 100644 --- a/arch/s390/kvm/priv.c +++ b/arch/s390/kvm/priv.c @@ -27,10 +27,29 @@ #include <asm/io.h> #include <asm/ptrace.h> #include <asm/compat.h> +#include <asm/sclp.h> #include "gaccess.h" #include "kvm-s390.h" #include "trace.h" +static int handle_ri(struct kvm_vcpu *vcpu) +{ + if (test_kvm_facility(vcpu->kvm, 64)) { + vcpu->arch.sie_block->ecb3 |= 0x01; + kvm_s390_retry_instr(vcpu); + return 0; + } else + return kvm_s390_inject_program_int(vcpu, PGM_OPERATION); +} + +int kvm_s390_handle_aa(struct kvm_vcpu *vcpu) +{ + if ((vcpu->arch.sie_block->ipa & 0xf) <= 4) + return handle_ri(vcpu); + else + return -EOPNOTSUPP; +} + /* Handle SCK (SET CLOCK) interception */ static int handle_set_clock(struct kvm_vcpu *vcpu) { @@ -152,30 +171,166 @@ static int handle_store_cpu_address(struct kvm_vcpu *vcpu) static int __skey_check_enable(struct kvm_vcpu *vcpu) { int rc = 0; + + trace_kvm_s390_skey_related_inst(vcpu); if (!(vcpu->arch.sie_block->ictl & (ICTL_ISKE | ICTL_SSKE | ICTL_RRBE))) return rc; rc = s390_enable_skey(); - VCPU_EVENT(vcpu, 3, "%s", "enabling storage keys for guest"); - trace_kvm_s390_skey_related_inst(vcpu); - vcpu->arch.sie_block->ictl &= ~(ICTL_ISKE | ICTL_SSKE | ICTL_RRBE); + VCPU_EVENT(vcpu, 3, "enabling storage keys for guest: %d", rc); + if (!rc) + vcpu->arch.sie_block->ictl &= ~(ICTL_ISKE | ICTL_SSKE | ICTL_RRBE); return rc; } - -static int handle_skey(struct kvm_vcpu *vcpu) +static int try_handle_skey(struct kvm_vcpu *vcpu) { - int rc = __skey_check_enable(vcpu); + int rc; + vcpu->stat.instruction_storage_key++; + rc = __skey_check_enable(vcpu); if (rc) return rc; - vcpu->stat.instruction_storage_key++; - + if (sclp.has_skey) { + /* with storage-key facility, SIE interprets it for us */ + kvm_s390_retry_instr(vcpu); + VCPU_EVENT(vcpu, 4, "%s", "retrying storage key operation"); + return -EAGAIN; + } if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE) return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP); + return 0; +} - kvm_s390_retry_instr(vcpu); - VCPU_EVENT(vcpu, 4, "%s", "retrying storage key operation"); +static int handle_iske(struct kvm_vcpu *vcpu) +{ + unsigned long addr; + unsigned char key; + int reg1, reg2; + int rc; + + rc = try_handle_skey(vcpu); + if (rc) + return rc != -EAGAIN ? rc : 0; + + kvm_s390_get_regs_rre(vcpu, ®1, ®2); + + addr = vcpu->run->s.regs.gprs[reg2] & PAGE_MASK; + addr = kvm_s390_logical_to_effective(vcpu, addr); + addr = kvm_s390_real_to_abs(vcpu, addr); + addr = gfn_to_hva(vcpu->kvm, gpa_to_gfn(addr)); + if (kvm_is_error_hva(addr)) + return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); + + down_read(¤t->mm->mmap_sem); + rc = get_guest_storage_key(current->mm, addr, &key); + up_read(¤t->mm->mmap_sem); + if (rc) + return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); + vcpu->run->s.regs.gprs[reg1] &= ~0xff; + vcpu->run->s.regs.gprs[reg1] |= key; + return 0; +} + +static int handle_rrbe(struct kvm_vcpu *vcpu) +{ + unsigned long addr; + int reg1, reg2; + int rc; + + rc = try_handle_skey(vcpu); + if (rc) + return rc != -EAGAIN ? rc : 0; + + kvm_s390_get_regs_rre(vcpu, ®1, ®2); + + addr = vcpu->run->s.regs.gprs[reg2] & PAGE_MASK; + addr = kvm_s390_logical_to_effective(vcpu, addr); + addr = kvm_s390_real_to_abs(vcpu, addr); + addr = gfn_to_hva(vcpu->kvm, gpa_to_gfn(addr)); + if (kvm_is_error_hva(addr)) + return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); + + down_read(¤t->mm->mmap_sem); + rc = reset_guest_reference_bit(current->mm, addr); + up_read(¤t->mm->mmap_sem); + if (rc < 0) + return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); + + kvm_s390_set_psw_cc(vcpu, rc); + return 0; +} + +#define SSKE_NQ 0x8 +#define SSKE_MR 0x4 +#define SSKE_MC 0x2 +#define SSKE_MB 0x1 +static int handle_sske(struct kvm_vcpu *vcpu) +{ + unsigned char m3 = vcpu->arch.sie_block->ipb >> 28; + unsigned long start, end; + unsigned char key, oldkey; + int reg1, reg2; + int rc; + + rc = try_handle_skey(vcpu); + if (rc) + return rc != -EAGAIN ? rc : 0; + + if (!test_kvm_facility(vcpu->kvm, 8)) + m3 &= ~SSKE_MB; + if (!test_kvm_facility(vcpu->kvm, 10)) + m3 &= ~(SSKE_MC | SSKE_MR); + if (!test_kvm_facility(vcpu->kvm, 14)) + m3 &= ~SSKE_NQ; + + kvm_s390_get_regs_rre(vcpu, ®1, ®2); + + key = vcpu->run->s.regs.gprs[reg1] & 0xfe; + start = vcpu->run->s.regs.gprs[reg2] & PAGE_MASK; + start = kvm_s390_logical_to_effective(vcpu, start); + if (m3 & SSKE_MB) { + /* start already designates an absolute address */ + end = (start + (1UL << 20)) & ~((1UL << 20) - 1); + } else { + start = kvm_s390_real_to_abs(vcpu, start); + end = start + PAGE_SIZE; + } + + while (start != end) { + unsigned long addr = gfn_to_hva(vcpu->kvm, gpa_to_gfn(start)); + + if (kvm_is_error_hva(addr)) + return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); + + down_read(¤t->mm->mmap_sem); + rc = cond_set_guest_storage_key(current->mm, addr, key, &oldkey, + m3 & SSKE_NQ, m3 & SSKE_MR, + m3 & SSKE_MC); + up_read(¤t->mm->mmap_sem); + if (rc < 0) + return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); + start += PAGE_SIZE; + }; + + if (m3 & (SSKE_MC | SSKE_MR)) { + if (m3 & SSKE_MB) { + /* skey in reg1 is unpredictable */ + kvm_s390_set_psw_cc(vcpu, 3); + } else { + kvm_s390_set_psw_cc(vcpu, rc); + vcpu->run->s.regs.gprs[reg1] &= ~0xff00UL; + vcpu->run->s.regs.gprs[reg1] |= (u64) oldkey << 8; + } + } + if (m3 & SSKE_MB) { + if (psw_bits(vcpu->arch.sie_block->gpsw).eaba == PSW_AMODE_64BIT) + vcpu->run->s.regs.gprs[reg2] &= ~PAGE_MASK; + else + vcpu->run->s.regs.gprs[reg2] &= ~0xfffff000UL; + end = kvm_s390_logical_to_effective(vcpu, end); + vcpu->run->s.regs.gprs[reg2] |= end; + } return 0; } @@ -439,7 +594,7 @@ static int handle_lpswe(struct kvm_vcpu *vcpu) static int handle_stidp(struct kvm_vcpu *vcpu) { - u64 stidp_data = vcpu->arch.stidp_data; + u64 stidp_data = vcpu->kvm->arch.model.cpuid; u64 operand2; int rc; ar_t ar; @@ -582,10 +737,11 @@ static const intercept_handler_t b2_handlers[256] = { [0x10] = handle_set_prefix, [0x11] = handle_store_prefix, [0x12] = handle_store_cpu_address, + [0x14] = kvm_s390_handle_vsie, [0x21] = handle_ipte_interlock, - [0x29] = handle_skey, - [0x2a] = handle_skey, - [0x2b] = handle_skey, + [0x29] = handle_iske, + [0x2a] = handle_rrbe, + [0x2b] = handle_sske, [0x2c] = handle_test_block, [0x30] = handle_io_inst, [0x31] = handle_io_inst, @@ -654,8 +810,10 @@ static int handle_epsw(struct kvm_vcpu *vcpu) static int handle_pfmf(struct kvm_vcpu *vcpu) { + bool mr = false, mc = false, nq; int reg1, reg2; unsigned long start, end; + unsigned char key; vcpu->stat.instruction_pfmf++; @@ -670,19 +828,32 @@ static int handle_pfmf(struct kvm_vcpu *vcpu) if (vcpu->run->s.regs.gprs[reg1] & PFMF_RESERVED) return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); - /* Only provide non-quiescing support if the host supports it */ - if (vcpu->run->s.regs.gprs[reg1] & PFMF_NQ && !test_facility(14)) + /* Only provide non-quiescing support if enabled for the guest */ + if (vcpu->run->s.regs.gprs[reg1] & PFMF_NQ && + !test_kvm_facility(vcpu->kvm, 14)) return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); - /* No support for conditional-SSKE */ - if (vcpu->run->s.regs.gprs[reg1] & (PFMF_MR | PFMF_MC)) - return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); + /* Only provide conditional-SSKE support if enabled for the guest */ + if (vcpu->run->s.regs.gprs[reg1] & PFMF_SK && + test_kvm_facility(vcpu->kvm, 10)) { + mr = vcpu->run->s.regs.gprs[reg1] & PFMF_MR; + mc = vcpu->run->s.regs.gprs[reg1] & PFMF_MC; + } + nq = vcpu->run->s.regs.gprs[reg1] & PFMF_NQ; + key = vcpu->run->s.regs.gprs[reg1] & PFMF_KEY; start = vcpu->run->s.regs.gprs[reg2] & PAGE_MASK; start = kvm_s390_logical_to_effective(vcpu, start); + if (vcpu->run->s.regs.gprs[reg1] & PFMF_CF) { + if (kvm_s390_check_low_addr_prot_real(vcpu, start)) + return kvm_s390_inject_prog_irq(vcpu, &vcpu->arch.pgm); + } + switch (vcpu->run->s.regs.gprs[reg1] & PFMF_FSC) { case 0x00000000: + /* only 4k frames specify a real address */ + start = kvm_s390_real_to_abs(vcpu, start); end = (start + (1UL << 12)) & ~((1UL << 12) - 1); break; case 0x00001000: @@ -700,20 +871,11 @@ static int handle_pfmf(struct kvm_vcpu *vcpu) return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); } - if (vcpu->run->s.regs.gprs[reg1] & PFMF_CF) { - if (kvm_s390_check_low_addr_prot_real(vcpu, start)) - return kvm_s390_inject_prog_irq(vcpu, &vcpu->arch.pgm); - } - - while (start < end) { - unsigned long useraddr, abs_addr; + while (start != end) { + unsigned long useraddr; /* Translate guest address to host address */ - if ((vcpu->run->s.regs.gprs[reg1] & PFMF_FSC) == 0) - abs_addr = kvm_s390_real_to_abs(vcpu, start); - else - abs_addr = start; - useraddr = gfn_to_hva(vcpu->kvm, gpa_to_gfn(abs_addr)); + useraddr = gfn_to_hva(vcpu->kvm, gpa_to_gfn(start)); if (kvm_is_error_hva(useraddr)) return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); @@ -727,16 +889,25 @@ static int handle_pfmf(struct kvm_vcpu *vcpu) if (rc) return rc; - if (set_guest_storage_key(current->mm, useraddr, - vcpu->run->s.regs.gprs[reg1] & PFMF_KEY, - vcpu->run->s.regs.gprs[reg1] & PFMF_NQ)) + down_read(¤t->mm->mmap_sem); + rc = cond_set_guest_storage_key(current->mm, useraddr, + key, NULL, nq, mr, mc); + up_read(¤t->mm->mmap_sem); + if (rc < 0) return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); } start += PAGE_SIZE; } - if (vcpu->run->s.regs.gprs[reg1] & PFMF_FSC) - vcpu->run->s.regs.gprs[reg2] = end; + if (vcpu->run->s.regs.gprs[reg1] & PFMF_FSC) { + if (psw_bits(vcpu->arch.sie_block->gpsw).eaba == PSW_AMODE_64BIT) { + vcpu->run->s.regs.gprs[reg2] = end; + } else { + vcpu->run->s.regs.gprs[reg2] &= ~0xffffffffUL; + end = kvm_s390_logical_to_effective(vcpu, end); + vcpu->run->s.regs.gprs[reg2] |= end; + } + } return 0; } @@ -744,7 +915,7 @@ static int handle_essa(struct kvm_vcpu *vcpu) { /* entries expected to be 1FF */ int entries = (vcpu->arch.sie_block->cbrlo & ~PAGE_MASK) >> 3; - unsigned long *cbrlo, cbrle; + unsigned long *cbrlo; struct gmap *gmap; int i; @@ -765,17 +936,9 @@ static int handle_essa(struct kvm_vcpu *vcpu) vcpu->arch.sie_block->cbrlo &= PAGE_MASK; /* reset nceo */ cbrlo = phys_to_virt(vcpu->arch.sie_block->cbrlo); down_read(&gmap->mm->mmap_sem); - for (i = 0; i < entries; ++i) { - cbrle = cbrlo[i]; - if (unlikely(cbrle & ~PAGE_MASK || cbrle < 2 * PAGE_SIZE)) - /* invalid entry */ - break; - /* try to free backing */ - __gmap_zap(gmap, cbrle); - } + for (i = 0; i < entries; ++i) + __gmap_zap(gmap, cbrlo[i]); up_read(&gmap->mm->mmap_sem); - if (i < entries) - return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); return 0; } @@ -948,6 +1111,9 @@ static int handle_stctg(struct kvm_vcpu *vcpu) static const intercept_handler_t eb_handlers[256] = { [0x2f] = handle_lctlg, [0x25] = handle_stctg, + [0x60] = handle_ri, + [0x61] = handle_ri, + [0x62] = handle_ri, }; int kvm_s390_handle_eb(struct kvm_vcpu *vcpu) @@ -1040,7 +1206,15 @@ static int handle_sckpf(struct kvm_vcpu *vcpu) return 0; } +static int handle_ptff(struct kvm_vcpu *vcpu) +{ + /* we don't emulate any control instructions yet */ + kvm_s390_set_psw_cc(vcpu, 3); + return 0; +} + static const intercept_handler_t x01_handlers[256] = { + [0x04] = handle_ptff, [0x07] = handle_sckpf, }; diff --git a/arch/s390/kvm/sigp.c b/arch/s390/kvm/sigp.c index 77c22d685c7a..1a252f537081 100644 --- a/arch/s390/kvm/sigp.c +++ b/arch/s390/kvm/sigp.c @@ -77,18 +77,18 @@ static int __sigp_conditional_emergency(struct kvm_vcpu *vcpu, const u64 psw_int_mask = PSW_MASK_IO | PSW_MASK_EXT; u16 p_asn, s_asn; psw_t *psw; - u32 flags; + bool idle; - flags = atomic_read(&dst_vcpu->arch.sie_block->cpuflags); + idle = is_vcpu_idle(vcpu); psw = &dst_vcpu->arch.sie_block->gpsw; p_asn = dst_vcpu->arch.sie_block->gcr[4] & 0xffff; /* Primary ASN */ s_asn = dst_vcpu->arch.sie_block->gcr[3] & 0xffff; /* Secondary ASN */ /* Inject the emergency signal? */ - if (!(flags & CPUSTAT_STOPPED) + if (!is_vcpu_stopped(vcpu) || (psw->mask & psw_int_mask) != psw_int_mask - || ((flags & CPUSTAT_WAIT) && psw->addr != 0) - || (!(flags & CPUSTAT_WAIT) && (asn == p_asn || asn == s_asn))) { + || (idle && psw->addr != 0) + || (!idle && (asn == p_asn || asn == s_asn))) { return __inject_sigp_emergency(vcpu, dst_vcpu); } else { *reg &= 0xffffffff00000000UL; @@ -240,6 +240,12 @@ static int __sigp_sense_running(struct kvm_vcpu *vcpu, struct kvm_s390_local_interrupt *li; int rc; + if (!test_kvm_facility(vcpu->kvm, 9)) { + *reg &= 0xffffffff00000000UL; + *reg |= SIGP_STATUS_INVALID_ORDER; + return SIGP_CC_STATUS_STORED; + } + li = &dst_vcpu->arch.local_int; if (atomic_read(li->cpuflags) & CPUSTAT_RUNNING) { /* running */ diff --git a/arch/s390/kvm/sthyi.c b/arch/s390/kvm/sthyi.c new file mode 100644 index 000000000000..bd98b7d25200 --- /dev/null +++ b/arch/s390/kvm/sthyi.c @@ -0,0 +1,471 @@ +/* + * store hypervisor information instruction emulation functions. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License (version 2 only) + * as published by the Free Software Foundation. + * + * Copyright IBM Corp. 2016 + * Author(s): Janosch Frank <frankja@linux.vnet.ibm.com> + */ +#include <linux/kvm_host.h> +#include <linux/errno.h> +#include <linux/pagemap.h> +#include <linux/vmalloc.h> +#include <linux/ratelimit.h> + +#include <asm/kvm_host.h> +#include <asm/asm-offsets.h> +#include <asm/sclp.h> +#include <asm/diag.h> +#include <asm/sysinfo.h> +#include <asm/ebcdic.h> + +#include "kvm-s390.h" +#include "gaccess.h" +#include "trace.h" + +#define DED_WEIGHT 0xffff +/* + * CP and IFL as EBCDIC strings, SP/0x40 determines the end of string + * as they are justified with spaces. + */ +#define CP 0xc3d7404040404040UL +#define IFL 0xc9c6d34040404040UL + +enum hdr_flags { + HDR_NOT_LPAR = 0x10, + HDR_STACK_INCM = 0x20, + HDR_STSI_UNAV = 0x40, + HDR_PERF_UNAV = 0x80, +}; + +enum mac_validity { + MAC_NAME_VLD = 0x20, + MAC_ID_VLD = 0x40, + MAC_CNT_VLD = 0x80, +}; + +enum par_flag { + PAR_MT_EN = 0x80, +}; + +enum par_validity { + PAR_GRP_VLD = 0x08, + PAR_ID_VLD = 0x10, + PAR_ABS_VLD = 0x20, + PAR_WGHT_VLD = 0x40, + PAR_PCNT_VLD = 0x80, +}; + +struct hdr_sctn { + u8 infhflg1; + u8 infhflg2; /* reserved */ + u8 infhval1; /* reserved */ + u8 infhval2; /* reserved */ + u8 reserved[3]; + u8 infhygct; + u16 infhtotl; + u16 infhdln; + u16 infmoff; + u16 infmlen; + u16 infpoff; + u16 infplen; + u16 infhoff1; + u16 infhlen1; + u16 infgoff1; + u16 infglen1; + u16 infhoff2; + u16 infhlen2; + u16 infgoff2; + u16 infglen2; + u16 infhoff3; + u16 infhlen3; + u16 infgoff3; + u16 infglen3; + u8 reserved2[4]; +} __packed; + +struct mac_sctn { + u8 infmflg1; /* reserved */ + u8 infmflg2; /* reserved */ + u8 infmval1; + u8 infmval2; /* reserved */ + u16 infmscps; + u16 infmdcps; + u16 infmsifl; + u16 infmdifl; + char infmname[8]; + char infmtype[4]; + char infmmanu[16]; + char infmseq[16]; + char infmpman[4]; + u8 reserved[4]; +} __packed; + +struct par_sctn { + u8 infpflg1; + u8 infpflg2; /* reserved */ + u8 infpval1; + u8 infpval2; /* reserved */ + u16 infppnum; + u16 infpscps; + u16 infpdcps; + u16 infpsifl; + u16 infpdifl; + u16 reserved; + char infppnam[8]; + u32 infpwbcp; + u32 infpabcp; + u32 infpwbif; + u32 infpabif; + char infplgnm[8]; + u32 infplgcp; + u32 infplgif; +} __packed; + +struct sthyi_sctns { + struct hdr_sctn hdr; + struct mac_sctn mac; + struct par_sctn par; +} __packed; + +struct cpu_inf { + u64 lpar_cap; + u64 lpar_grp_cap; + u64 lpar_weight; + u64 all_weight; + int cpu_num_ded; + int cpu_num_shd; +}; + +struct lpar_cpu_inf { + struct cpu_inf cp; + struct cpu_inf ifl; +}; + +static inline u64 cpu_id(u8 ctidx, void *diag224_buf) +{ + return *((u64 *)(diag224_buf + (ctidx + 1) * DIAG204_CPU_NAME_LEN)); +} + +/* + * Scales the cpu capping from the lpar range to the one expected in + * sthyi data. + * + * diag204 reports a cap in hundredths of processor units. + * z/VM's range for one core is 0 - 0x10000. + */ +static u32 scale_cap(u32 in) +{ + return (0x10000 * in) / 100; +} + +static void fill_hdr(struct sthyi_sctns *sctns) +{ + sctns->hdr.infhdln = sizeof(sctns->hdr); + sctns->hdr.infmoff = sizeof(sctns->hdr); + sctns->hdr.infmlen = sizeof(sctns->mac); + sctns->hdr.infplen = sizeof(sctns->par); + sctns->hdr.infpoff = sctns->hdr.infhdln + sctns->hdr.infmlen; + sctns->hdr.infhtotl = sctns->hdr.infpoff + sctns->hdr.infplen; +} + +static void fill_stsi_mac(struct sthyi_sctns *sctns, + struct sysinfo_1_1_1 *sysinfo) +{ + if (stsi(sysinfo, 1, 1, 1)) + return; + + sclp_ocf_cpc_name_copy(sctns->mac.infmname); + + memcpy(sctns->mac.infmtype, sysinfo->type, sizeof(sctns->mac.infmtype)); + memcpy(sctns->mac.infmmanu, sysinfo->manufacturer, sizeof(sctns->mac.infmmanu)); + memcpy(sctns->mac.infmpman, sysinfo->plant, sizeof(sctns->mac.infmpman)); + memcpy(sctns->mac.infmseq, sysinfo->sequence, sizeof(sctns->mac.infmseq)); + + sctns->mac.infmval1 |= MAC_ID_VLD | MAC_NAME_VLD; +} + +static void fill_stsi_par(struct sthyi_sctns *sctns, + struct sysinfo_2_2_2 *sysinfo) +{ + if (stsi(sysinfo, 2, 2, 2)) + return; + + sctns->par.infppnum = sysinfo->lpar_number; + memcpy(sctns->par.infppnam, sysinfo->name, sizeof(sctns->par.infppnam)); + + sctns->par.infpval1 |= PAR_ID_VLD; +} + +static void fill_stsi(struct sthyi_sctns *sctns) +{ + void *sysinfo; + + /* Errors are handled through the validity bits in the response. */ + sysinfo = (void *)__get_free_page(GFP_KERNEL); + if (!sysinfo) + return; + + fill_stsi_mac(sctns, sysinfo); + fill_stsi_par(sctns, sysinfo); + + free_pages((unsigned long)sysinfo, 0); +} + +static void fill_diag_mac(struct sthyi_sctns *sctns, + struct diag204_x_phys_block *block, + void *diag224_buf) +{ + int i; + + for (i = 0; i < block->hdr.cpus; i++) { + switch (cpu_id(block->cpus[i].ctidx, diag224_buf)) { + case CP: + if (block->cpus[i].weight == DED_WEIGHT) + sctns->mac.infmdcps++; + else + sctns->mac.infmscps++; + break; + case IFL: + if (block->cpus[i].weight == DED_WEIGHT) + sctns->mac.infmdifl++; + else + sctns->mac.infmsifl++; + break; + } + } + sctns->mac.infmval1 |= MAC_CNT_VLD; +} + +/* Returns a pointer to the the next partition block. */ +static struct diag204_x_part_block *lpar_cpu_inf(struct lpar_cpu_inf *part_inf, + bool this_lpar, + void *diag224_buf, + struct diag204_x_part_block *block) +{ + int i, capped = 0, weight_cp = 0, weight_ifl = 0; + struct cpu_inf *cpu_inf; + + for (i = 0; i < block->hdr.rcpus; i++) { + if (!(block->cpus[i].cflag & DIAG204_CPU_ONLINE)) + continue; + + switch (cpu_id(block->cpus[i].ctidx, diag224_buf)) { + case CP: + cpu_inf = &part_inf->cp; + if (block->cpus[i].cur_weight < DED_WEIGHT) + weight_cp |= block->cpus[i].cur_weight; + break; + case IFL: + cpu_inf = &part_inf->ifl; + if (block->cpus[i].cur_weight < DED_WEIGHT) + weight_ifl |= block->cpus[i].cur_weight; + break; + default: + continue; + } + + if (!this_lpar) + continue; + + capped |= block->cpus[i].cflag & DIAG204_CPU_CAPPED; + cpu_inf->lpar_cap |= block->cpus[i].cpu_type_cap; + cpu_inf->lpar_grp_cap |= block->cpus[i].group_cpu_type_cap; + + if (block->cpus[i].weight == DED_WEIGHT) + cpu_inf->cpu_num_ded += 1; + else + cpu_inf->cpu_num_shd += 1; + } + + if (this_lpar && capped) { + part_inf->cp.lpar_weight = weight_cp; + part_inf->ifl.lpar_weight = weight_ifl; + } + part_inf->cp.all_weight += weight_cp; + part_inf->ifl.all_weight += weight_ifl; + return (struct diag204_x_part_block *)&block->cpus[i]; +} + +static void fill_diag(struct sthyi_sctns *sctns) +{ + int i, r, pages; + bool this_lpar; + void *diag204_buf; + void *diag224_buf = NULL; + struct diag204_x_info_blk_hdr *ti_hdr; + struct diag204_x_part_block *part_block; + struct diag204_x_phys_block *phys_block; + struct lpar_cpu_inf lpar_inf = {}; + + /* Errors are handled through the validity bits in the response. */ + pages = diag204((unsigned long)DIAG204_SUBC_RSI | + (unsigned long)DIAG204_INFO_EXT, 0, NULL); + if (pages <= 0) + return; + + diag204_buf = vmalloc(PAGE_SIZE * pages); + if (!diag204_buf) + return; + + r = diag204((unsigned long)DIAG204_SUBC_STIB7 | + (unsigned long)DIAG204_INFO_EXT, pages, diag204_buf); + if (r < 0) + goto out; + + diag224_buf = kmalloc(PAGE_SIZE, GFP_KERNEL | GFP_DMA); + if (!diag224_buf || diag224(diag224_buf)) + goto out; + + ti_hdr = diag204_buf; + part_block = diag204_buf + sizeof(*ti_hdr); + + for (i = 0; i < ti_hdr->npar; i++) { + /* + * For the calling lpar we also need to get the cpu + * caps and weights. The time information block header + * specifies the offset to the partition block of the + * caller lpar, so we know when we process its data. + */ + this_lpar = (void *)part_block - diag204_buf == ti_hdr->this_part; + part_block = lpar_cpu_inf(&lpar_inf, this_lpar, diag224_buf, + part_block); + } + + phys_block = (struct diag204_x_phys_block *)part_block; + part_block = diag204_buf + ti_hdr->this_part; + if (part_block->hdr.mtid) + sctns->par.infpflg1 = PAR_MT_EN; + + sctns->par.infpval1 |= PAR_GRP_VLD; + sctns->par.infplgcp = scale_cap(lpar_inf.cp.lpar_grp_cap); + sctns->par.infplgif = scale_cap(lpar_inf.ifl.lpar_grp_cap); + memcpy(sctns->par.infplgnm, part_block->hdr.hardware_group_name, + sizeof(sctns->par.infplgnm)); + + sctns->par.infpscps = lpar_inf.cp.cpu_num_shd; + sctns->par.infpdcps = lpar_inf.cp.cpu_num_ded; + sctns->par.infpsifl = lpar_inf.ifl.cpu_num_shd; + sctns->par.infpdifl = lpar_inf.ifl.cpu_num_ded; + sctns->par.infpval1 |= PAR_PCNT_VLD; + + sctns->par.infpabcp = scale_cap(lpar_inf.cp.lpar_cap); + sctns->par.infpabif = scale_cap(lpar_inf.ifl.lpar_cap); + sctns->par.infpval1 |= PAR_ABS_VLD; + + /* + * Everything below needs global performance data to be + * meaningful. + */ + if (!(ti_hdr->flags & DIAG204_LPAR_PHYS_FLG)) { + sctns->hdr.infhflg1 |= HDR_PERF_UNAV; + goto out; + } + + fill_diag_mac(sctns, phys_block, diag224_buf); + + if (lpar_inf.cp.lpar_weight) { + sctns->par.infpwbcp = sctns->mac.infmscps * 0x10000 * + lpar_inf.cp.lpar_weight / lpar_inf.cp.all_weight; + } + + if (lpar_inf.ifl.lpar_weight) { + sctns->par.infpwbif = sctns->mac.infmsifl * 0x10000 * + lpar_inf.ifl.lpar_weight / lpar_inf.ifl.all_weight; + } + sctns->par.infpval1 |= PAR_WGHT_VLD; + +out: + kfree(diag224_buf); + vfree(diag204_buf); +} + +static int sthyi(u64 vaddr) +{ + register u64 code asm("0") = 0; + register u64 addr asm("2") = vaddr; + int cc; + + asm volatile( + ".insn rre,0xB2560000,%[code],%[addr]\n" + "ipm %[cc]\n" + "srl %[cc],28\n" + : [cc] "=d" (cc) + : [code] "d" (code), [addr] "a" (addr) + : "memory", "cc"); + return cc; +} + +int handle_sthyi(struct kvm_vcpu *vcpu) +{ + int reg1, reg2, r = 0; + u64 code, addr, cc = 0; + struct sthyi_sctns *sctns = NULL; + + /* + * STHYI requires extensive locking in the higher hypervisors + * and is very computational/memory expensive. Therefore we + * ratelimit the executions per VM. + */ + if (!__ratelimit(&vcpu->kvm->arch.sthyi_limit)) { + kvm_s390_retry_instr(vcpu); + return 0; + } + + kvm_s390_get_regs_rre(vcpu, ®1, ®2); + code = vcpu->run->s.regs.gprs[reg1]; + addr = vcpu->run->s.regs.gprs[reg2]; + + vcpu->stat.instruction_sthyi++; + VCPU_EVENT(vcpu, 3, "STHYI: fc: %llu addr: 0x%016llx", code, addr); + trace_kvm_s390_handle_sthyi(vcpu, code, addr); + + if (reg1 == reg2 || reg1 & 1 || reg2 & 1 || addr & ~PAGE_MASK) + return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); + + if (code & 0xffff) { + cc = 3; + goto out; + } + + /* + * If the page has not yet been faulted in, we want to do that + * now and not after all the expensive calculations. + */ + r = write_guest(vcpu, addr, reg2, &cc, 1); + if (r) + return kvm_s390_inject_prog_cond(vcpu, r); + + sctns = (void *)get_zeroed_page(GFP_KERNEL); + if (!sctns) + return -ENOMEM; + + /* + * If we are a guest, we don't want to emulate an emulated + * instruction. We ask the hypervisor to provide the data. + */ + if (test_facility(74)) { + cc = sthyi((u64)sctns); + goto out; + } + + fill_hdr(sctns); + fill_stsi(sctns); + fill_diag(sctns); + +out: + if (!cc) { + r = write_guest(vcpu, addr, reg2, sctns, PAGE_SIZE); + if (r) { + free_page((unsigned long)sctns); + return kvm_s390_inject_prog_cond(vcpu, r); + } + } + + free_page((unsigned long)sctns); + vcpu->run->s.regs.gprs[reg2 + 1] = cc ? 4 : 0; + kvm_s390_set_psw_cc(vcpu, cc); + return r; +} diff --git a/arch/s390/kvm/trace.h b/arch/s390/kvm/trace.h index 916834d7a73a..4fc9d4e5be89 100644 --- a/arch/s390/kvm/trace.h +++ b/arch/s390/kvm/trace.h @@ -41,7 +41,7 @@ TRACE_EVENT(kvm_s390_skey_related_inst, TP_fast_assign( VCPU_ASSIGN_COMMON ), - VCPU_TP_PRINTK("%s", "first instruction related to skeys on vcpu") + VCPU_TP_PRINTK("%s", "storage key related instruction") ); TRACE_EVENT(kvm_s390_major_guest_pfault, @@ -185,8 +185,10 @@ TRACE_EVENT(kvm_s390_intercept_prog, __entry->code = code; ), - VCPU_TP_PRINTK("intercepted program interruption %04x", - __entry->code) + VCPU_TP_PRINTK("intercepted program interruption %04x (%s)", + __entry->code, + __print_symbolic(__entry->code, + icpt_prog_codes)) ); /* @@ -412,6 +414,47 @@ TRACE_EVENT(kvm_s390_handle_stsi, __entry->addr) ); +TRACE_EVENT(kvm_s390_handle_operexc, + TP_PROTO(VCPU_PROTO_COMMON, __u16 ipa, __u32 ipb), + TP_ARGS(VCPU_ARGS_COMMON, ipa, ipb), + + TP_STRUCT__entry( + VCPU_FIELD_COMMON + __field(__u64, instruction) + ), + + TP_fast_assign( + VCPU_ASSIGN_COMMON + __entry->instruction = ((__u64)ipa << 48) | + ((__u64)ipb << 16); + ), + + VCPU_TP_PRINTK("operation exception on instruction %016llx (%s)", + __entry->instruction, + __print_symbolic(icpt_insn_decoder(__entry->instruction), + icpt_insn_codes)) + ); + +TRACE_EVENT(kvm_s390_handle_sthyi, + TP_PROTO(VCPU_PROTO_COMMON, u64 code, u64 addr), + TP_ARGS(VCPU_ARGS_COMMON, code, addr), + + TP_STRUCT__entry( + VCPU_FIELD_COMMON + __field(u64, code) + __field(u64, addr) + ), + + TP_fast_assign( + VCPU_ASSIGN_COMMON + __entry->code = code; + __entry->addr = addr; + ), + + VCPU_TP_PRINTK("STHYI fc: %llu addr: %016llx", + __entry->code, __entry->addr) + ); + #endif /* _TRACE_KVM_H */ /* This part must be outside protection */ diff --git a/arch/s390/kvm/vsie.c b/arch/s390/kvm/vsie.c new file mode 100644 index 000000000000..d8673e243f13 --- /dev/null +++ b/arch/s390/kvm/vsie.c @@ -0,0 +1,1091 @@ +/* + * kvm nested virtualization support for s390x + * + * Copyright IBM Corp. 2016 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License (version 2 only) + * as published by the Free Software Foundation. + * + * Author(s): David Hildenbrand <dahi@linux.vnet.ibm.com> + */ +#include <linux/vmalloc.h> +#include <linux/kvm_host.h> +#include <linux/bug.h> +#include <linux/list.h> +#include <linux/bitmap.h> +#include <asm/gmap.h> +#include <asm/mmu_context.h> +#include <asm/sclp.h> +#include <asm/nmi.h> +#include <asm/dis.h> +#include "kvm-s390.h" +#include "gaccess.h" + +struct vsie_page { + struct kvm_s390_sie_block scb_s; /* 0x0000 */ + /* the pinned originial scb */ + struct kvm_s390_sie_block *scb_o; /* 0x0200 */ + /* the shadow gmap in use by the vsie_page */ + struct gmap *gmap; /* 0x0208 */ + /* address of the last reported fault to guest2 */ + unsigned long fault_addr; /* 0x0210 */ + __u8 reserved[0x0700 - 0x0218]; /* 0x0218 */ + struct kvm_s390_crypto_cb crycb; /* 0x0700 */ + __u8 fac[S390_ARCH_FAC_LIST_SIZE_BYTE]; /* 0x0800 */ +} __packed; + +/* trigger a validity icpt for the given scb */ +static int set_validity_icpt(struct kvm_s390_sie_block *scb, + __u16 reason_code) +{ + scb->ipa = 0x1000; + scb->ipb = ((__u32) reason_code) << 16; + scb->icptcode = ICPT_VALIDITY; + return 1; +} + +/* mark the prefix as unmapped, this will block the VSIE */ +static void prefix_unmapped(struct vsie_page *vsie_page) +{ + atomic_or(PROG_REQUEST, &vsie_page->scb_s.prog20); +} + +/* mark the prefix as unmapped and wait until the VSIE has been left */ +static void prefix_unmapped_sync(struct vsie_page *vsie_page) +{ + prefix_unmapped(vsie_page); + if (vsie_page->scb_s.prog0c & PROG_IN_SIE) + atomic_or(CPUSTAT_STOP_INT, &vsie_page->scb_s.cpuflags); + while (vsie_page->scb_s.prog0c & PROG_IN_SIE) + cpu_relax(); +} + +/* mark the prefix as mapped, this will allow the VSIE to run */ +static void prefix_mapped(struct vsie_page *vsie_page) +{ + atomic_andnot(PROG_REQUEST, &vsie_page->scb_s.prog20); +} + +/* test if the prefix is mapped into the gmap shadow */ +static int prefix_is_mapped(struct vsie_page *vsie_page) +{ + return !(atomic_read(&vsie_page->scb_s.prog20) & PROG_REQUEST); +} + +/* copy the updated intervention request bits into the shadow scb */ +static void update_intervention_requests(struct vsie_page *vsie_page) +{ + const int bits = CPUSTAT_STOP_INT | CPUSTAT_IO_INT | CPUSTAT_EXT_INT; + int cpuflags; + + cpuflags = atomic_read(&vsie_page->scb_o->cpuflags); + atomic_andnot(bits, &vsie_page->scb_s.cpuflags); + atomic_or(cpuflags & bits, &vsie_page->scb_s.cpuflags); +} + +/* shadow (filter and validate) the cpuflags */ +static int prepare_cpuflags(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) +{ + struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s; + struct kvm_s390_sie_block *scb_o = vsie_page->scb_o; + int newflags, cpuflags = atomic_read(&scb_o->cpuflags); + + /* we don't allow ESA/390 guests */ + if (!(cpuflags & CPUSTAT_ZARCH)) + return set_validity_icpt(scb_s, 0x0001U); + + if (cpuflags & (CPUSTAT_RRF | CPUSTAT_MCDS)) + return set_validity_icpt(scb_s, 0x0001U); + else if (cpuflags & (CPUSTAT_SLSV | CPUSTAT_SLSR)) + return set_validity_icpt(scb_s, 0x0007U); + + /* intervention requests will be set later */ + newflags = CPUSTAT_ZARCH; + if (cpuflags & CPUSTAT_GED && test_kvm_facility(vcpu->kvm, 8)) + newflags |= CPUSTAT_GED; + if (cpuflags & CPUSTAT_GED2 && test_kvm_facility(vcpu->kvm, 78)) { + if (cpuflags & CPUSTAT_GED) + return set_validity_icpt(scb_s, 0x0001U); + newflags |= CPUSTAT_GED2; + } + if (test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_GPERE)) + newflags |= cpuflags & CPUSTAT_P; + if (test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_GSLS)) + newflags |= cpuflags & CPUSTAT_SM; + if (test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_IBS)) + newflags |= cpuflags & CPUSTAT_IBS; + + atomic_set(&scb_s->cpuflags, newflags); + return 0; +} + +/* + * Create a shadow copy of the crycb block and setup key wrapping, if + * requested for guest 3 and enabled for guest 2. + * + * We only accept format-1 (no AP in g2), but convert it into format-2 + * There is nothing to do for format-0. + * + * Returns: - 0 if shadowed or nothing to do + * - > 0 if control has to be given to guest 2 + */ +static int shadow_crycb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) +{ + struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s; + struct kvm_s390_sie_block *scb_o = vsie_page->scb_o; + u32 crycb_addr = scb_o->crycbd & 0x7ffffff8U; + unsigned long *b1, *b2; + u8 ecb3_flags; + + scb_s->crycbd = 0; + if (!(scb_o->crycbd & vcpu->arch.sie_block->crycbd & CRYCB_FORMAT1)) + return 0; + /* format-1 is supported with message-security-assist extension 3 */ + if (!test_kvm_facility(vcpu->kvm, 76)) + return 0; + /* we may only allow it if enabled for guest 2 */ + ecb3_flags = scb_o->ecb3 & vcpu->arch.sie_block->ecb3 & + (ECB3_AES | ECB3_DEA); + if (!ecb3_flags) + return 0; + + if ((crycb_addr & PAGE_MASK) != ((crycb_addr + 128) & PAGE_MASK)) + return set_validity_icpt(scb_s, 0x003CU); + else if (!crycb_addr) + return set_validity_icpt(scb_s, 0x0039U); + + /* copy only the wrapping keys */ + if (read_guest_real(vcpu, crycb_addr + 72, &vsie_page->crycb, 56)) + return set_validity_icpt(scb_s, 0x0035U); + + scb_s->ecb3 |= ecb3_flags; + scb_s->crycbd = ((__u32)(__u64) &vsie_page->crycb) | CRYCB_FORMAT1 | + CRYCB_FORMAT2; + + /* xor both blocks in one run */ + b1 = (unsigned long *) vsie_page->crycb.dea_wrapping_key_mask; + b2 = (unsigned long *) + vcpu->kvm->arch.crypto.crycb->dea_wrapping_key_mask; + /* as 56%8 == 0, bitmap_xor won't overwrite any data */ + bitmap_xor(b1, b1, b2, BITS_PER_BYTE * 56); + return 0; +} + +/* shadow (round up/down) the ibc to avoid validity icpt */ +static void prepare_ibc(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) +{ + struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s; + struct kvm_s390_sie_block *scb_o = vsie_page->scb_o; + __u64 min_ibc = (sclp.ibc >> 16) & 0x0fffU; + + scb_s->ibc = 0; + /* ibc installed in g2 and requested for g3 */ + if (vcpu->kvm->arch.model.ibc && (scb_o->ibc & 0x0fffU)) { + scb_s->ibc = scb_o->ibc & 0x0fffU; + /* takte care of the minimum ibc level of the machine */ + if (scb_s->ibc < min_ibc) + scb_s->ibc = min_ibc; + /* take care of the maximum ibc level set for the guest */ + if (scb_s->ibc > vcpu->kvm->arch.model.ibc) + scb_s->ibc = vcpu->kvm->arch.model.ibc; + } +} + +/* unshadow the scb, copying parameters back to the real scb */ +static void unshadow_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) +{ + struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s; + struct kvm_s390_sie_block *scb_o = vsie_page->scb_o; + + /* interception */ + scb_o->icptcode = scb_s->icptcode; + scb_o->icptstatus = scb_s->icptstatus; + scb_o->ipa = scb_s->ipa; + scb_o->ipb = scb_s->ipb; + scb_o->gbea = scb_s->gbea; + + /* timer */ + scb_o->cputm = scb_s->cputm; + scb_o->ckc = scb_s->ckc; + scb_o->todpr = scb_s->todpr; + + /* guest state */ + scb_o->gpsw = scb_s->gpsw; + scb_o->gg14 = scb_s->gg14; + scb_o->gg15 = scb_s->gg15; + memcpy(scb_o->gcr, scb_s->gcr, 128); + scb_o->pp = scb_s->pp; + + /* interrupt intercept */ + switch (scb_s->icptcode) { + case ICPT_PROGI: + case ICPT_INSTPROGI: + case ICPT_EXTINT: + memcpy((void *)((u64)scb_o + 0xc0), + (void *)((u64)scb_s + 0xc0), 0xf0 - 0xc0); + break; + case ICPT_PARTEXEC: + /* MVPG only */ + memcpy((void *)((u64)scb_o + 0xc0), + (void *)((u64)scb_s + 0xc0), 0xd0 - 0xc0); + break; + } + + if (scb_s->ihcpu != 0xffffU) + scb_o->ihcpu = scb_s->ihcpu; +} + +/* + * Setup the shadow scb by copying and checking the relevant parts of the g2 + * provided scb. + * + * Returns: - 0 if the scb has been shadowed + * - > 0 if control has to be given to guest 2 + */ +static int shadow_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) +{ + struct kvm_s390_sie_block *scb_o = vsie_page->scb_o; + struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s; + bool had_tx = scb_s->ecb & 0x10U; + unsigned long new_mso = 0; + int rc; + + /* make sure we don't have any leftovers when reusing the scb */ + scb_s->icptcode = 0; + scb_s->eca = 0; + scb_s->ecb = 0; + scb_s->ecb2 = 0; + scb_s->ecb3 = 0; + scb_s->ecd = 0; + scb_s->fac = 0; + + rc = prepare_cpuflags(vcpu, vsie_page); + if (rc) + goto out; + + /* timer */ + scb_s->cputm = scb_o->cputm; + scb_s->ckc = scb_o->ckc; + scb_s->todpr = scb_o->todpr; + scb_s->epoch = scb_o->epoch; + + /* guest state */ + scb_s->gpsw = scb_o->gpsw; + scb_s->gg14 = scb_o->gg14; + scb_s->gg15 = scb_o->gg15; + memcpy(scb_s->gcr, scb_o->gcr, 128); + scb_s->pp = scb_o->pp; + + /* interception / execution handling */ + scb_s->gbea = scb_o->gbea; + scb_s->lctl = scb_o->lctl; + scb_s->svcc = scb_o->svcc; + scb_s->ictl = scb_o->ictl; + /* + * SKEY handling functions can't deal with false setting of PTE invalid + * bits. Therefore we cannot provide interpretation and would later + * have to provide own emulation handlers. + */ + scb_s->ictl |= ICTL_ISKE | ICTL_SSKE | ICTL_RRBE; + scb_s->icpua = scb_o->icpua; + + if (!(atomic_read(&scb_s->cpuflags) & CPUSTAT_SM)) + new_mso = scb_o->mso & 0xfffffffffff00000UL; + /* if the hva of the prefix changes, we have to remap the prefix */ + if (scb_s->mso != new_mso || scb_s->prefix != scb_o->prefix) + prefix_unmapped(vsie_page); + /* SIE will do mso/msl validity and exception checks for us */ + scb_s->msl = scb_o->msl & 0xfffffffffff00000UL; + scb_s->mso = new_mso; + scb_s->prefix = scb_o->prefix; + + /* We have to definetly flush the tlb if this scb never ran */ + if (scb_s->ihcpu != 0xffffU) + scb_s->ihcpu = scb_o->ihcpu; + + /* MVPG and Protection Exception Interpretation are always available */ + scb_s->eca |= scb_o->eca & 0x01002000U; + /* Host-protection-interruption introduced with ESOP */ + if (test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_ESOP)) + scb_s->ecb |= scb_o->ecb & 0x02U; + /* transactional execution */ + if (test_kvm_facility(vcpu->kvm, 73)) { + /* remap the prefix is tx is toggled on */ + if ((scb_o->ecb & 0x10U) && !had_tx) + prefix_unmapped(vsie_page); + scb_s->ecb |= scb_o->ecb & 0x10U; + } + /* SIMD */ + if (test_kvm_facility(vcpu->kvm, 129)) { + scb_s->eca |= scb_o->eca & 0x00020000U; + scb_s->ecd |= scb_o->ecd & 0x20000000U; + } + /* Run-time-Instrumentation */ + if (test_kvm_facility(vcpu->kvm, 64)) + scb_s->ecb3 |= scb_o->ecb3 & 0x01U; + if (test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_SIIF)) + scb_s->eca |= scb_o->eca & 0x00000001U; + if (test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_IB)) + scb_s->eca |= scb_o->eca & 0x40000000U; + if (test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_CEI)) + scb_s->eca |= scb_o->eca & 0x80000000U; + + prepare_ibc(vcpu, vsie_page); + rc = shadow_crycb(vcpu, vsie_page); +out: + if (rc) + unshadow_scb(vcpu, vsie_page); + return rc; +} + +void kvm_s390_vsie_gmap_notifier(struct gmap *gmap, unsigned long start, + unsigned long end) +{ + struct kvm *kvm = gmap->private; + struct vsie_page *cur; + unsigned long prefix; + struct page *page; + int i; + + if (!gmap_is_shadow(gmap)) + return; + if (start >= 1UL << 31) + /* We are only interested in prefix pages */ + return; + + /* + * Only new shadow blocks are added to the list during runtime, + * therefore we can safely reference them all the time. + */ + for (i = 0; i < kvm->arch.vsie.page_count; i++) { + page = READ_ONCE(kvm->arch.vsie.pages[i]); + if (!page) + continue; + cur = page_to_virt(page); + if (READ_ONCE(cur->gmap) != gmap) + continue; + prefix = cur->scb_s.prefix << GUEST_PREFIX_SHIFT; + /* with mso/msl, the prefix lies at an offset */ + prefix += cur->scb_s.mso; + if (prefix <= end && start <= prefix + 2 * PAGE_SIZE - 1) + prefix_unmapped_sync(cur); + } +} + +/* + * Map the first prefix page and if tx is enabled also the second prefix page. + * + * The prefix will be protected, a gmap notifier will inform about unmaps. + * The shadow scb must not be executed until the prefix is remapped, this is + * guaranteed by properly handling PROG_REQUEST. + * + * Returns: - 0 on if successfully mapped or already mapped + * - > 0 if control has to be given to guest 2 + * - -EAGAIN if the caller can retry immediately + * - -ENOMEM if out of memory + */ +static int map_prefix(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) +{ + struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s; + u64 prefix = scb_s->prefix << GUEST_PREFIX_SHIFT; + int rc; + + if (prefix_is_mapped(vsie_page)) + return 0; + + /* mark it as mapped so we can catch any concurrent unmappers */ + prefix_mapped(vsie_page); + + /* with mso/msl, the prefix lies at offset *mso* */ + prefix += scb_s->mso; + + rc = kvm_s390_shadow_fault(vcpu, vsie_page->gmap, prefix); + if (!rc && (scb_s->ecb & 0x10U)) + rc = kvm_s390_shadow_fault(vcpu, vsie_page->gmap, + prefix + PAGE_SIZE); + /* + * We don't have to mprotect, we will be called for all unshadows. + * SIE will detect if protection applies and trigger a validity. + */ + if (rc) + prefix_unmapped(vsie_page); + if (rc > 0 || rc == -EFAULT) + rc = set_validity_icpt(scb_s, 0x0037U); + return rc; +} + +/* + * Pin the guest page given by gpa and set hpa to the pinned host address. + * Will always be pinned writable. + * + * Returns: - 0 on success + * - -EINVAL if the gpa is not valid guest storage + * - -ENOMEM if out of memory + */ +static int pin_guest_page(struct kvm *kvm, gpa_t gpa, hpa_t *hpa) +{ + struct page *page; + hva_t hva; + int rc; + + hva = gfn_to_hva(kvm, gpa_to_gfn(gpa)); + if (kvm_is_error_hva(hva)) + return -EINVAL; + rc = get_user_pages_fast(hva, 1, 1, &page); + if (rc < 0) + return rc; + else if (rc != 1) + return -ENOMEM; + *hpa = (hpa_t) page_to_virt(page) + (gpa & ~PAGE_MASK); + return 0; +} + +/* Unpins a page previously pinned via pin_guest_page, marking it as dirty. */ +static void unpin_guest_page(struct kvm *kvm, gpa_t gpa, hpa_t hpa) +{ + struct page *page; + + page = virt_to_page(hpa); + set_page_dirty_lock(page); + put_page(page); + /* mark the page always as dirty for migration */ + mark_page_dirty(kvm, gpa_to_gfn(gpa)); +} + +/* unpin all blocks previously pinned by pin_blocks(), marking them dirty */ +static void unpin_blocks(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) +{ + struct kvm_s390_sie_block *scb_o = vsie_page->scb_o; + struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s; + hpa_t hpa; + gpa_t gpa; + + hpa = (u64) scb_s->scaoh << 32 | scb_s->scaol; + if (hpa) { + gpa = scb_o->scaol & ~0xfUL; + if (test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_64BSCAO)) + gpa |= (u64) scb_o->scaoh << 32; + unpin_guest_page(vcpu->kvm, gpa, hpa); + scb_s->scaol = 0; + scb_s->scaoh = 0; + } + + hpa = scb_s->itdba; + if (hpa) { + gpa = scb_o->itdba & ~0xffUL; + unpin_guest_page(vcpu->kvm, gpa, hpa); + scb_s->itdba = 0; + } + + hpa = scb_s->gvrd; + if (hpa) { + gpa = scb_o->gvrd & ~0x1ffUL; + unpin_guest_page(vcpu->kvm, gpa, hpa); + scb_s->gvrd = 0; + } + + hpa = scb_s->riccbd; + if (hpa) { + gpa = scb_o->riccbd & ~0x3fUL; + unpin_guest_page(vcpu->kvm, gpa, hpa); + scb_s->riccbd = 0; + } +} + +/* + * Instead of shadowing some blocks, we can simply forward them because the + * addresses in the scb are 64 bit long. + * + * This works as long as the data lies in one page. If blocks ever exceed one + * page, we have to fall back to shadowing. + * + * As we reuse the sca, the vcpu pointers contained in it are invalid. We must + * therefore not enable any facilities that access these pointers (e.g. SIGPIF). + * + * Returns: - 0 if all blocks were pinned. + * - > 0 if control has to be given to guest 2 + * - -ENOMEM if out of memory + */ +static int pin_blocks(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) +{ + struct kvm_s390_sie_block *scb_o = vsie_page->scb_o; + struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s; + hpa_t hpa; + gpa_t gpa; + int rc = 0; + + gpa = scb_o->scaol & ~0xfUL; + if (test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_64BSCAO)) + gpa |= (u64) scb_o->scaoh << 32; + if (gpa) { + if (!(gpa & ~0x1fffUL)) + rc = set_validity_icpt(scb_s, 0x0038U); + else if ((gpa & ~0x1fffUL) == kvm_s390_get_prefix(vcpu)) + rc = set_validity_icpt(scb_s, 0x0011U); + else if ((gpa & PAGE_MASK) != + ((gpa + sizeof(struct bsca_block) - 1) & PAGE_MASK)) + rc = set_validity_icpt(scb_s, 0x003bU); + if (!rc) { + rc = pin_guest_page(vcpu->kvm, gpa, &hpa); + if (rc == -EINVAL) + rc = set_validity_icpt(scb_s, 0x0034U); + } + if (rc) + goto unpin; + scb_s->scaoh = (u32)((u64)hpa >> 32); + scb_s->scaol = (u32)(u64)hpa; + } + + gpa = scb_o->itdba & ~0xffUL; + if (gpa && (scb_s->ecb & 0x10U)) { + if (!(gpa & ~0x1fffU)) { + rc = set_validity_icpt(scb_s, 0x0080U); + goto unpin; + } + /* 256 bytes cannot cross page boundaries */ + rc = pin_guest_page(vcpu->kvm, gpa, &hpa); + if (rc == -EINVAL) + rc = set_validity_icpt(scb_s, 0x0080U); + if (rc) + goto unpin; + scb_s->itdba = hpa; + } + + gpa = scb_o->gvrd & ~0x1ffUL; + if (gpa && (scb_s->eca & 0x00020000U) && + !(scb_s->ecd & 0x20000000U)) { + if (!(gpa & ~0x1fffUL)) { + rc = set_validity_icpt(scb_s, 0x1310U); + goto unpin; + } + /* + * 512 bytes vector registers cannot cross page boundaries + * if this block gets bigger, we have to shadow it. + */ + rc = pin_guest_page(vcpu->kvm, gpa, &hpa); + if (rc == -EINVAL) + rc = set_validity_icpt(scb_s, 0x1310U); + if (rc) + goto unpin; + scb_s->gvrd = hpa; + } + + gpa = scb_o->riccbd & ~0x3fUL; + if (gpa && (scb_s->ecb3 & 0x01U)) { + if (!(gpa & ~0x1fffUL)) { + rc = set_validity_icpt(scb_s, 0x0043U); + goto unpin; + } + /* 64 bytes cannot cross page boundaries */ + rc = pin_guest_page(vcpu->kvm, gpa, &hpa); + if (rc == -EINVAL) + rc = set_validity_icpt(scb_s, 0x0043U); + /* Validity 0x0044 will be checked by SIE */ + if (rc) + goto unpin; + scb_s->riccbd = hpa; + } + return 0; +unpin: + unpin_blocks(vcpu, vsie_page); + return rc; +} + +/* unpin the scb provided by guest 2, marking it as dirty */ +static void unpin_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page, + gpa_t gpa) +{ + hpa_t hpa = (hpa_t) vsie_page->scb_o; + + if (hpa) + unpin_guest_page(vcpu->kvm, gpa, hpa); + vsie_page->scb_o = NULL; +} + +/* + * Pin the scb at gpa provided by guest 2 at vsie_page->scb_o. + * + * Returns: - 0 if the scb was pinned. + * - > 0 if control has to be given to guest 2 + * - -ENOMEM if out of memory + */ +static int pin_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page, + gpa_t gpa) +{ + hpa_t hpa; + int rc; + + rc = pin_guest_page(vcpu->kvm, gpa, &hpa); + if (rc == -EINVAL) { + rc = kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); + if (!rc) + rc = 1; + } + if (!rc) + vsie_page->scb_o = (struct kvm_s390_sie_block *) hpa; + return rc; +} + +/* + * Inject a fault into guest 2. + * + * Returns: - > 0 if control has to be given to guest 2 + * < 0 if an error occurred during injection. + */ +static int inject_fault(struct kvm_vcpu *vcpu, __u16 code, __u64 vaddr, + bool write_flag) +{ + struct kvm_s390_pgm_info pgm = { + .code = code, + .trans_exc_code = + /* 0-51: virtual address */ + (vaddr & 0xfffffffffffff000UL) | + /* 52-53: store / fetch */ + (((unsigned int) !write_flag) + 1) << 10, + /* 62-63: asce id (alway primary == 0) */ + .exc_access_id = 0, /* always primary */ + .op_access_id = 0, /* not MVPG */ + }; + int rc; + + if (code == PGM_PROTECTION) + pgm.trans_exc_code |= 0x4UL; + + rc = kvm_s390_inject_prog_irq(vcpu, &pgm); + return rc ? rc : 1; +} + +/* + * Handle a fault during vsie execution on a gmap shadow. + * + * Returns: - 0 if the fault was resolved + * - > 0 if control has to be given to guest 2 + * - < 0 if an error occurred + */ +static int handle_fault(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) +{ + int rc; + + if (current->thread.gmap_int_code == PGM_PROTECTION) + /* we can directly forward all protection exceptions */ + return inject_fault(vcpu, PGM_PROTECTION, + current->thread.gmap_addr, 1); + + rc = kvm_s390_shadow_fault(vcpu, vsie_page->gmap, + current->thread.gmap_addr); + if (rc > 0) { + rc = inject_fault(vcpu, rc, + current->thread.gmap_addr, + current->thread.gmap_write_flag); + if (rc >= 0) + vsie_page->fault_addr = current->thread.gmap_addr; + } + return rc; +} + +/* + * Retry the previous fault that required guest 2 intervention. This avoids + * one superfluous SIE re-entry and direct exit. + * + * Will ignore any errors. The next SIE fault will do proper fault handling. + */ +static void handle_last_fault(struct kvm_vcpu *vcpu, + struct vsie_page *vsie_page) +{ + if (vsie_page->fault_addr) + kvm_s390_shadow_fault(vcpu, vsie_page->gmap, + vsie_page->fault_addr); + vsie_page->fault_addr = 0; +} + +static inline void clear_vsie_icpt(struct vsie_page *vsie_page) +{ + vsie_page->scb_s.icptcode = 0; +} + +/* rewind the psw and clear the vsie icpt, so we can retry execution */ +static void retry_vsie_icpt(struct vsie_page *vsie_page) +{ + struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s; + int ilen = insn_length(scb_s->ipa >> 8); + + /* take care of EXECUTE instructions */ + if (scb_s->icptstatus & 1) { + ilen = (scb_s->icptstatus >> 4) & 0x6; + if (!ilen) + ilen = 4; + } + scb_s->gpsw.addr = __rewind_psw(scb_s->gpsw, ilen); + clear_vsie_icpt(vsie_page); +} + +/* + * Try to shadow + enable the guest 2 provided facility list. + * Retry instruction execution if enabled for and provided by guest 2. + * + * Returns: - 0 if handled (retry or guest 2 icpt) + * - > 0 if control has to be given to guest 2 + */ +static int handle_stfle(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) +{ + struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s; + __u32 fac = vsie_page->scb_o->fac & 0x7ffffff8U; + + if (fac && test_kvm_facility(vcpu->kvm, 7)) { + retry_vsie_icpt(vsie_page); + if (read_guest_real(vcpu, fac, &vsie_page->fac, + sizeof(vsie_page->fac))) + return set_validity_icpt(scb_s, 0x1090U); + scb_s->fac = (__u32)(__u64) &vsie_page->fac; + } + return 0; +} + +/* + * Run the vsie on a shadow scb and a shadow gmap, without any further + * sanity checks, handling SIE faults. + * + * Returns: - 0 everything went fine + * - > 0 if control has to be given to guest 2 + * - < 0 if an error occurred + */ +static int do_vsie_run(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) +{ + struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s; + struct kvm_s390_sie_block *scb_o = vsie_page->scb_o; + int rc; + + handle_last_fault(vcpu, vsie_page); + + if (need_resched()) + schedule(); + if (test_cpu_flag(CIF_MCCK_PENDING)) + s390_handle_mcck(); + + srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); + local_irq_disable(); + guest_enter_irqoff(); + local_irq_enable(); + + rc = sie64a(scb_s, vcpu->run->s.regs.gprs); + + local_irq_disable(); + guest_exit_irqoff(); + local_irq_enable(); + vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); + + if (rc > 0) + rc = 0; /* we could still have an icpt */ + else if (rc == -EFAULT) + return handle_fault(vcpu, vsie_page); + + switch (scb_s->icptcode) { + case ICPT_INST: + if (scb_s->ipa == 0xb2b0) + rc = handle_stfle(vcpu, vsie_page); + break; + case ICPT_STOP: + /* stop not requested by g2 - must have been a kick */ + if (!(atomic_read(&scb_o->cpuflags) & CPUSTAT_STOP_INT)) + clear_vsie_icpt(vsie_page); + break; + case ICPT_VALIDITY: + if ((scb_s->ipa & 0xf000) != 0xf000) + scb_s->ipa += 0x1000; + break; + } + return rc; +} + +static void release_gmap_shadow(struct vsie_page *vsie_page) +{ + if (vsie_page->gmap) + gmap_put(vsie_page->gmap); + WRITE_ONCE(vsie_page->gmap, NULL); + prefix_unmapped(vsie_page); +} + +static int acquire_gmap_shadow(struct kvm_vcpu *vcpu, + struct vsie_page *vsie_page) +{ + unsigned long asce; + union ctlreg0 cr0; + struct gmap *gmap; + int edat; + + asce = vcpu->arch.sie_block->gcr[1]; + cr0.val = vcpu->arch.sie_block->gcr[0]; + edat = cr0.edat && test_kvm_facility(vcpu->kvm, 8); + edat += edat && test_kvm_facility(vcpu->kvm, 78); + + /* + * ASCE or EDAT could have changed since last icpt, or the gmap + * we're holding has been unshadowed. If the gmap is still valid, + * we can safely reuse it. + */ + if (vsie_page->gmap && gmap_shadow_valid(vsie_page->gmap, asce, edat)) + return 0; + + /* release the old shadow - if any, and mark the prefix as unmapped */ + release_gmap_shadow(vsie_page); + gmap = gmap_shadow(vcpu->arch.gmap, asce, edat); + if (IS_ERR(gmap)) + return PTR_ERR(gmap); + gmap->private = vcpu->kvm; + WRITE_ONCE(vsie_page->gmap, gmap); + return 0; +} + +/* + * Register the shadow scb at the VCPU, e.g. for kicking out of vsie. + */ +static void register_shadow_scb(struct kvm_vcpu *vcpu, + struct vsie_page *vsie_page) +{ + struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s; + + WRITE_ONCE(vcpu->arch.vsie_block, &vsie_page->scb_s); + /* + * External calls have to lead to a kick of the vcpu and + * therefore the vsie -> Simulate Wait state. + */ + atomic_or(CPUSTAT_WAIT, &vcpu->arch.sie_block->cpuflags); + /* + * We have to adjust the g3 epoch by the g2 epoch. The epoch will + * automatically be adjusted on tod clock changes via kvm_sync_clock. + */ + preempt_disable(); + scb_s->epoch += vcpu->kvm->arch.epoch; + preempt_enable(); +} + +/* + * Unregister a shadow scb from a VCPU. + */ +static void unregister_shadow_scb(struct kvm_vcpu *vcpu) +{ + atomic_andnot(CPUSTAT_WAIT, &vcpu->arch.sie_block->cpuflags); + WRITE_ONCE(vcpu->arch.vsie_block, NULL); +} + +/* + * Run the vsie on a shadowed scb, managing the gmap shadow, handling + * prefix pages and faults. + * + * Returns: - 0 if no errors occurred + * - > 0 if control has to be given to guest 2 + * - -ENOMEM if out of memory + */ +static int vsie_run(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) +{ + struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s; + int rc = 0; + + while (1) { + rc = acquire_gmap_shadow(vcpu, vsie_page); + if (!rc) + rc = map_prefix(vcpu, vsie_page); + if (!rc) { + gmap_enable(vsie_page->gmap); + update_intervention_requests(vsie_page); + rc = do_vsie_run(vcpu, vsie_page); + gmap_enable(vcpu->arch.gmap); + } + atomic_andnot(PROG_BLOCK_SIE, &scb_s->prog20); + + if (rc == -EAGAIN) + rc = 0; + if (rc || scb_s->icptcode || signal_pending(current) || + kvm_s390_vcpu_has_irq(vcpu, 0)) + break; + }; + + if (rc == -EFAULT) { + /* + * Addressing exceptions are always presentes as intercepts. + * As addressing exceptions are suppressing and our guest 3 PSW + * points at the responsible instruction, we have to + * forward the PSW and set the ilc. If we can't read guest 3 + * instruction, we can use an arbitrary ilc. Let's always use + * ilen = 4 for now, so we can avoid reading in guest 3 virtual + * memory. (we could also fake the shadow so the hardware + * handles it). + */ + scb_s->icptcode = ICPT_PROGI; + scb_s->iprcc = PGM_ADDRESSING; + scb_s->pgmilc = 4; + scb_s->gpsw.addr = __rewind_psw(scb_s->gpsw, 4); + } + return rc; +} + +/* + * Get or create a vsie page for a scb address. + * + * Returns: - address of a vsie page (cached or new one) + * - NULL if the same scb address is already used by another VCPU + * - ERR_PTR(-ENOMEM) if out of memory + */ +static struct vsie_page *get_vsie_page(struct kvm *kvm, unsigned long addr) +{ + struct vsie_page *vsie_page; + struct page *page; + int nr_vcpus; + + rcu_read_lock(); + page = radix_tree_lookup(&kvm->arch.vsie.addr_to_page, addr >> 9); + rcu_read_unlock(); + if (page) { + if (page_ref_inc_return(page) == 2) + return page_to_virt(page); + page_ref_dec(page); + } + + /* + * We want at least #online_vcpus shadows, so every VCPU can execute + * the VSIE in parallel. + */ + nr_vcpus = atomic_read(&kvm->online_vcpus); + + mutex_lock(&kvm->arch.vsie.mutex); + if (kvm->arch.vsie.page_count < nr_vcpus) { + page = alloc_page(GFP_KERNEL | __GFP_ZERO | GFP_DMA); + if (!page) { + mutex_unlock(&kvm->arch.vsie.mutex); + return ERR_PTR(-ENOMEM); + } + page_ref_inc(page); + kvm->arch.vsie.pages[kvm->arch.vsie.page_count] = page; + kvm->arch.vsie.page_count++; + } else { + /* reuse an existing entry that belongs to nobody */ + while (true) { + page = kvm->arch.vsie.pages[kvm->arch.vsie.next]; + if (page_ref_inc_return(page) == 2) + break; + page_ref_dec(page); + kvm->arch.vsie.next++; + kvm->arch.vsie.next %= nr_vcpus; + } + radix_tree_delete(&kvm->arch.vsie.addr_to_page, page->index >> 9); + } + page->index = addr; + /* double use of the same address */ + if (radix_tree_insert(&kvm->arch.vsie.addr_to_page, addr >> 9, page)) { + page_ref_dec(page); + mutex_unlock(&kvm->arch.vsie.mutex); + return NULL; + } + mutex_unlock(&kvm->arch.vsie.mutex); + + vsie_page = page_to_virt(page); + memset(&vsie_page->scb_s, 0, sizeof(struct kvm_s390_sie_block)); + release_gmap_shadow(vsie_page); + vsie_page->fault_addr = 0; + vsie_page->scb_s.ihcpu = 0xffffU; + return vsie_page; +} + +/* put a vsie page acquired via get_vsie_page */ +static void put_vsie_page(struct kvm *kvm, struct vsie_page *vsie_page) +{ + struct page *page = pfn_to_page(__pa(vsie_page) >> PAGE_SHIFT); + + page_ref_dec(page); +} + +int kvm_s390_handle_vsie(struct kvm_vcpu *vcpu) +{ + struct vsie_page *vsie_page; + unsigned long scb_addr; + int rc; + + vcpu->stat.instruction_sie++; + if (!test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_SIEF2)) + return -EOPNOTSUPP; + if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE) + return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP); + + BUILD_BUG_ON(sizeof(struct vsie_page) != 4096); + scb_addr = kvm_s390_get_base_disp_s(vcpu, NULL); + + /* 512 byte alignment */ + if (unlikely(scb_addr & 0x1ffUL)) + return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); + + if (signal_pending(current) || kvm_s390_vcpu_has_irq(vcpu, 0)) + return 0; + + vsie_page = get_vsie_page(vcpu->kvm, scb_addr); + if (IS_ERR(vsie_page)) + return PTR_ERR(vsie_page); + else if (!vsie_page) + /* double use of sie control block - simply do nothing */ + return 0; + + rc = pin_scb(vcpu, vsie_page, scb_addr); + if (rc) + goto out_put; + rc = shadow_scb(vcpu, vsie_page); + if (rc) + goto out_unpin_scb; + rc = pin_blocks(vcpu, vsie_page); + if (rc) + goto out_unshadow; + register_shadow_scb(vcpu, vsie_page); + rc = vsie_run(vcpu, vsie_page); + unregister_shadow_scb(vcpu); + unpin_blocks(vcpu, vsie_page); +out_unshadow: + unshadow_scb(vcpu, vsie_page); +out_unpin_scb: + unpin_scb(vcpu, vsie_page, scb_addr); +out_put: + put_vsie_page(vcpu->kvm, vsie_page); + + return rc < 0 ? rc : 0; +} + +/* Init the vsie data structures. To be called when a vm is initialized. */ +void kvm_s390_vsie_init(struct kvm *kvm) +{ + mutex_init(&kvm->arch.vsie.mutex); + INIT_RADIX_TREE(&kvm->arch.vsie.addr_to_page, GFP_KERNEL); +} + +/* Destroy the vsie data structures. To be called when a vm is destroyed. */ +void kvm_s390_vsie_destroy(struct kvm *kvm) +{ + struct vsie_page *vsie_page; + struct page *page; + int i; + + mutex_lock(&kvm->arch.vsie.mutex); + for (i = 0; i < kvm->arch.vsie.page_count; i++) { + page = kvm->arch.vsie.pages[i]; + kvm->arch.vsie.pages[i] = NULL; + vsie_page = page_to_virt(page); + release_gmap_shadow(vsie_page); + /* free the radix tree entry */ + radix_tree_delete(&kvm->arch.vsie.addr_to_page, page->index >> 9); + __free_page(page); + } + kvm->arch.vsie.page_count = 0; + mutex_unlock(&kvm->arch.vsie.mutex); +} + +void kvm_s390_vsie_kick(struct kvm_vcpu *vcpu) +{ + struct kvm_s390_sie_block *scb = READ_ONCE(vcpu->arch.vsie_block); + + /* + * Even if the VCPU lets go of the shadow sie block reference, it is + * still valid in the cache. So we can safely kick it. + */ + if (scb) { + atomic_or(PROG_BLOCK_SIE, &scb->prog20); + if (scb->prog0c & PROG_IN_SIE) + atomic_or(CPUSTAT_STOP_INT, &scb->cpuflags); + } +} diff --git a/arch/s390/lib/string.c b/arch/s390/lib/string.c index b647d5ff0ad9..48352bffbc92 100644 --- a/arch/s390/lib/string.c +++ b/arch/s390/lib/string.c @@ -236,6 +236,24 @@ char * strrchr(const char * s, int c) } EXPORT_SYMBOL(strrchr); +static inline int clcle(const char *s1, unsigned long l1, + const char *s2, unsigned long l2) +{ + register unsigned long r2 asm("2") = (unsigned long) s1; + register unsigned long r3 asm("3") = (unsigned long) l1; + register unsigned long r4 asm("4") = (unsigned long) s2; + register unsigned long r5 asm("5") = (unsigned long) l2; + int cc; + + asm volatile ("0: clcle %1,%3,0\n" + " jo 0b\n" + " ipm %0\n" + " srl %0,28" + : "=&d" (cc), "+a" (r2), "+a" (r3), + "+a" (r4), "+a" (r5) : : "cc"); + return cc; +} + /** * strstr - Find the first substring in a %NUL terminated string * @s1: The string to be searched @@ -250,18 +268,9 @@ char * strstr(const char * s1,const char * s2) return (char *) s1; l1 = __strend(s1) - s1; while (l1-- >= l2) { - register unsigned long r2 asm("2") = (unsigned long) s1; - register unsigned long r3 asm("3") = (unsigned long) l2; - register unsigned long r4 asm("4") = (unsigned long) s2; - register unsigned long r5 asm("5") = (unsigned long) l2; int cc; - asm volatile ("0: clcle %1,%3,0\n" - " jo 0b\n" - " ipm %0\n" - " srl %0,28" - : "=&d" (cc), "+a" (r2), "+a" (r3), - "+a" (r4), "+a" (r5) : : "cc" ); + cc = clcle(s1, l2, s2, l2); if (!cc) return (char *) s1; s1++; @@ -302,20 +311,11 @@ EXPORT_SYMBOL(memchr); */ int memcmp(const void *cs, const void *ct, size_t n) { - register unsigned long r2 asm("2") = (unsigned long) cs; - register unsigned long r3 asm("3") = (unsigned long) n; - register unsigned long r4 asm("4") = (unsigned long) ct; - register unsigned long r5 asm("5") = (unsigned long) n; int ret; - asm volatile ("0: clcle %1,%3,0\n" - " jo 0b\n" - " ipm %0\n" - " srl %0,28" - : "=&d" (ret), "+a" (r2), "+a" (r3), "+a" (r4), "+a" (r5) - : : "cc" ); + ret = clcle(cs, n, ct, n); if (ret) - ret = *(char *) r2 - *(char *) r4; + ret = ret == 1 ? -1 : 1; return ret; } EXPORT_SYMBOL(memcmp); diff --git a/arch/s390/lib/uaccess.c b/arch/s390/lib/uaccess.c index ae4de559e3a0..f481fcde067b 100644 --- a/arch/s390/lib/uaccess.c +++ b/arch/s390/lib/uaccess.c @@ -49,7 +49,7 @@ static inline unsigned long copy_from_user_mvcos(void *x, const void __user *ptr " jnm 5b\n" " ex %4,0(%3)\n" " j 8f\n" - "7:slgr %0,%0\n" + "7: slgr %0,%0\n" "8:\n" EX_TABLE(0b,2b) EX_TABLE(3b,4b) EX_TABLE(9b,2b) EX_TABLE(10b,4b) : "+a" (size), "+a" (ptr), "+a" (x), "+a" (tmp1), "=a" (tmp2) @@ -93,7 +93,7 @@ static inline unsigned long copy_from_user_mvcp(void *x, const void __user *ptr, " jnm 6b\n" " ex %4,0(%3)\n" " j 9f\n" - "8:slgr %0,%0\n" + "8: slgr %0,%0\n" "9: sacf 768\n" EX_TABLE(0b,3b) EX_TABLE(2b,3b) EX_TABLE(4b,5b) EX_TABLE(10b,3b) EX_TABLE(11b,3b) EX_TABLE(12b,5b) @@ -104,6 +104,7 @@ static inline unsigned long copy_from_user_mvcp(void *x, const void __user *ptr, unsigned long __copy_from_user(void *to, const void __user *from, unsigned long n) { + check_object_size(to, n, false); if (static_branch_likely(&have_mvcos)) return copy_from_user_mvcos(to, from, n); return copy_from_user_mvcp(to, from, n); @@ -177,6 +178,7 @@ static inline unsigned long copy_to_user_mvcs(void __user *ptr, const void *x, unsigned long __copy_to_user(void __user *to, const void *from, unsigned long n) { + check_object_size(from, n, true); if (static_branch_likely(&have_mvcos)) return copy_to_user_mvcos(to, from, n); return copy_to_user_mvcs(to, from, n); @@ -266,7 +268,7 @@ static inline unsigned long clear_user_mvcos(void __user *to, unsigned long size "3: .insn ss,0xc80000000000,0(%3,%1),0(%4),0\n" " slgr %0,%3\n" " j 5f\n" - "4:slgr %0,%0\n" + "4: slgr %0,%0\n" "5:\n" EX_TABLE(0b,2b) EX_TABLE(3b,5b) : "+a" (size), "+a" (to), "+a" (tmp1), "=a" (tmp2) diff --git a/arch/s390/mm/dump_pagetables.c b/arch/s390/mm/dump_pagetables.c index 8556d6be9b54..861880df12c7 100644 --- a/arch/s390/mm/dump_pagetables.c +++ b/arch/s390/mm/dump_pagetables.c @@ -157,7 +157,7 @@ static void walk_pud_level(struct seq_file *m, struct pg_state *st, pud = pud_offset(pgd, addr); if (!pud_none(*pud)) if (pud_large(*pud)) { - prot = pud_val(*pud) & _REGION3_ENTRY_RO; + prot = pud_val(*pud) & _REGION_ENTRY_PROTECT; note_page(m, st, prot, 2); } else walk_pmd_level(m, st, pud, addr); diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c index cce577feab1e..661d9fe63c43 100644 --- a/arch/s390/mm/fault.c +++ b/arch/s390/mm/fault.c @@ -24,7 +24,7 @@ #include <linux/kdebug.h> #include <linux/init.h> #include <linux/console.h> -#include <linux/module.h> +#include <linux/extable.h> #include <linux/hardirq.h> #include <linux/kprobes.h> #include <linux/uaccess.h> @@ -250,6 +250,7 @@ static noinline void do_sigsegv(struct pt_regs *regs, int si_code) report_user_fault(regs, SIGSEGV, 1); si.si_signo = SIGSEGV; + si.si_errno = 0; si.si_code = si_code; si.si_addr = (void __user *)(regs->int_parm_long & __FAIL_ADDR_MASK); force_sig_info(SIGSEGV, &si, current); @@ -417,6 +418,8 @@ static inline int do_exception(struct pt_regs *regs, int access) (struct gmap *) S390_lowcore.gmap : NULL; if (gmap) { current->thread.gmap_addr = address; + current->thread.gmap_write_flag = !!(flags & FAULT_FLAG_WRITE); + current->thread.gmap_int_code = regs->int_code & 0xffff; address = __gmap_translate(gmap, address); if (address == -EFAULT) { fault = VM_FAULT_BADMAP; @@ -455,7 +458,7 @@ retry: * make sure we exit gracefully rather than endlessly redo * the fault. */ - fault = handle_mm_fault(mm, vma, address, flags); + fault = handle_mm_fault(vma, address, flags); /* No reason to continue if interrupted by SIGKILL. */ if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current)) { fault = VM_FAULT_SIGNAL; @@ -623,7 +626,7 @@ void pfault_fini(void) diag_stat_inc(DIAG_STAT_X258); asm volatile( " diag %0,0,0x258\n" - "0:\n" + "0: nopr %%r7\n" EX_TABLE(0b,0b) : : "a" (&refbk), "m" (refbk) : "cc"); } @@ -631,6 +634,29 @@ void pfault_fini(void) static DEFINE_SPINLOCK(pfault_lock); static LIST_HEAD(pfault_list); +#define PF_COMPLETE 0x0080 + +/* + * The mechanism of our pfault code: if Linux is running as guest, runs a user + * space process and the user space process accesses a page that the host has + * paged out we get a pfault interrupt. + * + * This allows us, within the guest, to schedule a different process. Without + * this mechanism the host would have to suspend the whole virtual cpu until + * the page has been paged in. + * + * So when we get such an interrupt then we set the state of the current task + * to uninterruptible and also set the need_resched flag. Both happens within + * interrupt context(!). If we later on want to return to user space we + * recognize the need_resched flag and then call schedule(). It's not very + * obvious how this works... + * + * Of course we have a lot of additional fun with the completion interrupt (-> + * host signals that a page of a process has been paged in and the process can + * continue to run). This interrupt can arrive on any cpu and, since we have + * virtual cpus, actually appear before the interrupt that signals that a page + * is missing. + */ static void pfault_interrupt(struct ext_code ext_code, unsigned int param32, unsigned long param64) { @@ -639,10 +665,9 @@ static void pfault_interrupt(struct ext_code ext_code, pid_t pid; /* - * Get the external interruption subcode & pfault - * initial/completion signal bit. VM stores this - * in the 'cpu address' field associated with the - * external interrupt. + * Get the external interruption subcode & pfault initial/completion + * signal bit. VM stores this in the 'cpu address' field associated + * with the external interrupt. */ subcode = ext_code.subcode; if ((subcode & 0xff00) != __SUBCODE_MASK) @@ -658,7 +683,7 @@ static void pfault_interrupt(struct ext_code ext_code, if (!tsk) return; spin_lock(&pfault_lock); - if (subcode & 0x0080) { + if (subcode & PF_COMPLETE) { /* signal bit is set -> a page has been swapped in by VM */ if (tsk->thread.pfault_wait == 1) { /* Initial interrupt was faster than the completion @@ -687,8 +712,7 @@ static void pfault_interrupt(struct ext_code ext_code, goto out; if (tsk->thread.pfault_wait == 1) { /* Already on the list with a reference: put to sleep */ - __set_task_state(tsk, TASK_UNINTERRUPTIBLE); - set_tsk_need_resched(tsk); + goto block; } else if (tsk->thread.pfault_wait == -1) { /* Completion interrupt was faster than the initial * interrupt (pfault_wait == -1). Set pfault_wait @@ -703,7 +727,11 @@ static void pfault_interrupt(struct ext_code ext_code, get_task_struct(tsk); tsk->thread.pfault_wait = 1; list_add(&tsk->thread.list, &pfault_list); - __set_task_state(tsk, TASK_UNINTERRUPTIBLE); +block: + /* Since this must be a userspace fault, there + * is no kernel task state to trample. Rely on the + * return to userspace schedule() to block. */ + __set_current_state(TASK_UNINTERRUPTIBLE); set_tsk_need_resched(tsk); } } @@ -712,28 +740,21 @@ out: put_task_struct(tsk); } -static int pfault_cpu_notify(struct notifier_block *self, unsigned long action, - void *hcpu) +static int pfault_cpu_dead(unsigned int cpu) { struct thread_struct *thread, *next; struct task_struct *tsk; - switch (action & ~CPU_TASKS_FROZEN) { - case CPU_DEAD: - spin_lock_irq(&pfault_lock); - list_for_each_entry_safe(thread, next, &pfault_list, list) { - thread->pfault_wait = 0; - list_del(&thread->list); - tsk = container_of(thread, struct task_struct, thread); - wake_up_process(tsk); - put_task_struct(tsk); - } - spin_unlock_irq(&pfault_lock); - break; - default: - break; + spin_lock_irq(&pfault_lock); + list_for_each_entry_safe(thread, next, &pfault_list, list) { + thread->pfault_wait = 0; + list_del(&thread->list); + tsk = container_of(thread, struct task_struct, thread); + wake_up_process(tsk); + put_task_struct(tsk); } - return NOTIFY_OK; + spin_unlock_irq(&pfault_lock); + return 0; } static int __init pfault_irq_init(void) @@ -747,7 +768,8 @@ static int __init pfault_irq_init(void) if (rc) goto out_pfault; irq_subclass_register(IRQ_SUBCLASS_SERVICE_SIGNAL); - hotcpu_notifier(pfault_cpu_notify, 0); + cpuhp_setup_state_nocalls(CPUHP_S390_PFAULT_DEAD, "s390/pfault:dead", + NULL, pfault_cpu_dead); return 0; out_pfault: diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c index cace818d86eb..3ba622702ce4 100644 --- a/arch/s390/mm/gmap.c +++ b/arch/s390/mm/gmap.c @@ -20,14 +20,16 @@ #include <asm/gmap.h> #include <asm/tlb.h> +#define GMAP_SHADOW_FAKE_TABLE 1ULL + /** - * gmap_alloc - allocate a guest address space + * gmap_alloc - allocate and initialize a guest address space * @mm: pointer to the parent mm_struct * @limit: maximum address of the gmap address space * * Returns a guest address space structure. */ -struct gmap *gmap_alloc(struct mm_struct *mm, unsigned long limit) +static struct gmap *gmap_alloc(unsigned long limit) { struct gmap *gmap; struct page *page; @@ -55,10 +57,14 @@ struct gmap *gmap_alloc(struct mm_struct *mm, unsigned long limit) if (!gmap) goto out; INIT_LIST_HEAD(&gmap->crst_list); + INIT_LIST_HEAD(&gmap->children); + INIT_LIST_HEAD(&gmap->pt_list); INIT_RADIX_TREE(&gmap->guest_to_host, GFP_KERNEL); INIT_RADIX_TREE(&gmap->host_to_guest, GFP_ATOMIC); + INIT_RADIX_TREE(&gmap->host_to_rmap, GFP_ATOMIC); spin_lock_init(&gmap->guest_table_lock); - gmap->mm = mm; + spin_lock_init(&gmap->shadow_lock); + atomic_set(&gmap->ref_count, 1); page = alloc_pages(GFP_KERNEL, 2); if (!page) goto out_free; @@ -70,9 +76,6 @@ struct gmap *gmap_alloc(struct mm_struct *mm, unsigned long limit) gmap->asce = atype | _ASCE_TABLE_LENGTH | _ASCE_USER_BITS | __pa(table); gmap->asce_end = limit; - down_write(&mm->mmap_sem); - list_add(&gmap->list, &mm->context.gmap_list); - up_write(&mm->mmap_sem); return gmap; out_free: @@ -80,12 +83,39 @@ out_free: out: return NULL; } -EXPORT_SYMBOL_GPL(gmap_alloc); + +/** + * gmap_create - create a guest address space + * @mm: pointer to the parent mm_struct + * @limit: maximum size of the gmap address space + * + * Returns a guest address space structure. + */ +struct gmap *gmap_create(struct mm_struct *mm, unsigned long limit) +{ + struct gmap *gmap; + unsigned long gmap_asce; + + gmap = gmap_alloc(limit); + if (!gmap) + return NULL; + gmap->mm = mm; + spin_lock(&mm->context.gmap_lock); + list_add_rcu(&gmap->list, &mm->context.gmap_list); + if (list_is_singular(&mm->context.gmap_list)) + gmap_asce = gmap->asce; + else + gmap_asce = -1UL; + WRITE_ONCE(mm->context.gmap_asce, gmap_asce); + spin_unlock(&mm->context.gmap_lock); + return gmap; +} +EXPORT_SYMBOL_GPL(gmap_create); static void gmap_flush_tlb(struct gmap *gmap) { if (MACHINE_HAS_IDTE) - __tlb_flush_asce(gmap->mm, gmap->asce); + __tlb_flush_idte(gmap->asce); else __tlb_flush_global(); } @@ -114,31 +144,126 @@ static void gmap_radix_tree_free(struct radix_tree_root *root) } while (nr > 0); } +static void gmap_rmap_radix_tree_free(struct radix_tree_root *root) +{ + struct gmap_rmap *rmap, *rnext, *head; + struct radix_tree_iter iter; + unsigned long indices[16]; + unsigned long index; + void **slot; + int i, nr; + + /* A radix tree is freed by deleting all of its entries */ + index = 0; + do { + nr = 0; + radix_tree_for_each_slot(slot, root, &iter, index) { + indices[nr] = iter.index; + if (++nr == 16) + break; + } + for (i = 0; i < nr; i++) { + index = indices[i]; + head = radix_tree_delete(root, index); + gmap_for_each_rmap_safe(rmap, rnext, head) + kfree(rmap); + } + } while (nr > 0); +} + /** * gmap_free - free a guest address space * @gmap: pointer to the guest address space structure + * + * No locks required. There are no references to this gmap anymore. */ -void gmap_free(struct gmap *gmap) +static void gmap_free(struct gmap *gmap) { struct page *page, *next; - /* Flush tlb. */ - if (MACHINE_HAS_IDTE) - __tlb_flush_asce(gmap->mm, gmap->asce); - else - __tlb_flush_global(); - + /* Flush tlb of all gmaps (if not already done for shadows) */ + if (!(gmap_is_shadow(gmap) && gmap->removed)) + gmap_flush_tlb(gmap); /* Free all segment & region tables. */ list_for_each_entry_safe(page, next, &gmap->crst_list, lru) __free_pages(page, 2); gmap_radix_tree_free(&gmap->guest_to_host); gmap_radix_tree_free(&gmap->host_to_guest); - down_write(&gmap->mm->mmap_sem); - list_del(&gmap->list); - up_write(&gmap->mm->mmap_sem); + + /* Free additional data for a shadow gmap */ + if (gmap_is_shadow(gmap)) { + /* Free all page tables. */ + list_for_each_entry_safe(page, next, &gmap->pt_list, lru) + page_table_free_pgste(page); + gmap_rmap_radix_tree_free(&gmap->host_to_rmap); + /* Release reference to the parent */ + gmap_put(gmap->parent); + } + kfree(gmap); } -EXPORT_SYMBOL_GPL(gmap_free); + +/** + * gmap_get - increase reference counter for guest address space + * @gmap: pointer to the guest address space structure + * + * Returns the gmap pointer + */ +struct gmap *gmap_get(struct gmap *gmap) +{ + atomic_inc(&gmap->ref_count); + return gmap; +} +EXPORT_SYMBOL_GPL(gmap_get); + +/** + * gmap_put - decrease reference counter for guest address space + * @gmap: pointer to the guest address space structure + * + * If the reference counter reaches zero the guest address space is freed. + */ +void gmap_put(struct gmap *gmap) +{ + if (atomic_dec_return(&gmap->ref_count) == 0) + gmap_free(gmap); +} +EXPORT_SYMBOL_GPL(gmap_put); + +/** + * gmap_remove - remove a guest address space but do not free it yet + * @gmap: pointer to the guest address space structure + */ +void gmap_remove(struct gmap *gmap) +{ + struct gmap *sg, *next; + unsigned long gmap_asce; + + /* Remove all shadow gmaps linked to this gmap */ + if (!list_empty(&gmap->children)) { + spin_lock(&gmap->shadow_lock); + list_for_each_entry_safe(sg, next, &gmap->children, list) { + list_del(&sg->list); + gmap_put(sg); + } + spin_unlock(&gmap->shadow_lock); + } + /* Remove gmap from the pre-mm list */ + spin_lock(&gmap->mm->context.gmap_lock); + list_del_rcu(&gmap->list); + if (list_empty(&gmap->mm->context.gmap_list)) + gmap_asce = 0; + else if (list_is_singular(&gmap->mm->context.gmap_list)) + gmap_asce = list_first_entry(&gmap->mm->context.gmap_list, + struct gmap, list)->asce; + else + gmap_asce = -1UL; + WRITE_ONCE(gmap->mm->context.gmap_asce, gmap_asce); + spin_unlock(&gmap->mm->context.gmap_lock); + synchronize_rcu(); + /* Put reference */ + gmap_put(gmap); +} +EXPORT_SYMBOL_GPL(gmap_remove); /** * gmap_enable - switch primary space to the guest address space @@ -160,6 +285,17 @@ void gmap_disable(struct gmap *gmap) } EXPORT_SYMBOL_GPL(gmap_disable); +/** + * gmap_get_enabled - get a pointer to the currently enabled gmap + * + * Returns a pointer to the currently enabled gmap. 0 if none is enabled. + */ +struct gmap *gmap_get_enabled(void) +{ + return (struct gmap *) S390_lowcore.gmap; +} +EXPORT_SYMBOL_GPL(gmap_get_enabled); + /* * gmap_alloc_table is assumed to be called with mmap_sem held */ @@ -175,7 +311,7 @@ static int gmap_alloc_table(struct gmap *gmap, unsigned long *table, return -ENOMEM; new = (unsigned long *) page_to_phys(page); crst_table_init(new, init); - spin_lock(&gmap->mm->page_table_lock); + spin_lock(&gmap->guest_table_lock); if (*table & _REGION_ENTRY_INVALID) { list_add(&page->lru, &gmap->crst_list); *table = (unsigned long) new | _REGION_ENTRY_LENGTH | @@ -183,7 +319,7 @@ static int gmap_alloc_table(struct gmap *gmap, unsigned long *table, page->index = gaddr; page = NULL; } - spin_unlock(&gmap->mm->page_table_lock); + spin_unlock(&gmap->guest_table_lock); if (page) __free_pages(page, 2); return 0; @@ -219,6 +355,7 @@ static int __gmap_unlink_by_vmaddr(struct gmap *gmap, unsigned long vmaddr) unsigned long *entry; int flush = 0; + BUG_ON(gmap_is_shadow(gmap)); spin_lock(&gmap->guest_table_lock); entry = radix_tree_delete(&gmap->host_to_guest, vmaddr >> PMD_SHIFT); if (entry) { @@ -258,6 +395,7 @@ int gmap_unmap_segment(struct gmap *gmap, unsigned long to, unsigned long len) unsigned long off; int flush; + BUG_ON(gmap_is_shadow(gmap)); if ((to | len) & (PMD_SIZE - 1)) return -EINVAL; if (len == 0 || to + len < to) @@ -289,6 +427,7 @@ int gmap_map_segment(struct gmap *gmap, unsigned long from, unsigned long off; int flush; + BUG_ON(gmap_is_shadow(gmap)); if ((from | to | len) & (PMD_SIZE - 1)) return -EINVAL; if (len == 0 || from + len < from || to + len < to || @@ -326,6 +465,8 @@ EXPORT_SYMBOL_GPL(gmap_map_segment); * This function does not establish potentially missing page table entries. * The mmap_sem of the mm that belongs to the address space must be held * when this function gets called. + * + * Note: Can also be called for shadow gmaps. */ unsigned long __gmap_translate(struct gmap *gmap, unsigned long gaddr) { @@ -333,6 +474,7 @@ unsigned long __gmap_translate(struct gmap *gmap, unsigned long gaddr) vmaddr = (unsigned long) radix_tree_lookup(&gmap->guest_to_host, gaddr >> PMD_SHIFT); + /* Note: guest_to_host is empty for a shadow gmap */ return vmaddr ? (vmaddr | (gaddr & ~PMD_MASK)) : -EFAULT; } EXPORT_SYMBOL_GPL(__gmap_translate); @@ -369,11 +511,13 @@ void gmap_unlink(struct mm_struct *mm, unsigned long *table, struct gmap *gmap; int flush; - list_for_each_entry(gmap, &mm->context.gmap_list, list) { + rcu_read_lock(); + list_for_each_entry_rcu(gmap, &mm->context.gmap_list, list) { flush = __gmap_unlink_by_vmaddr(gmap, vmaddr); if (flush) gmap_flush_tlb(gmap); } + rcu_read_unlock(); } /** @@ -397,6 +541,7 @@ int __gmap_link(struct gmap *gmap, unsigned long gaddr, unsigned long vmaddr) pmd_t *pmd; int rc; + BUG_ON(gmap_is_shadow(gmap)); /* Create higher level tables in the gmap page table */ table = gmap->table; if ((gmap->asce & _ASCE_TYPE_MASK) >= _ASCE_TYPE_REGION1) { @@ -430,6 +575,9 @@ int __gmap_link(struct gmap *gmap, unsigned long gaddr, unsigned long vmaddr) VM_BUG_ON(pgd_none(*pgd)); pud = pud_offset(pgd, vmaddr); VM_BUG_ON(pud_none(*pud)); + /* large puds cannot yet be handled */ + if (pud_large(*pud)) + return -EFAULT; pmd = pmd_offset(pud, vmaddr); VM_BUG_ON(pmd_none(*pmd)); /* large pmds cannot yet be handled */ @@ -549,116 +697,1412 @@ static LIST_HEAD(gmap_notifier_list); static DEFINE_SPINLOCK(gmap_notifier_lock); /** - * gmap_register_ipte_notifier - register a pte invalidation callback + * gmap_register_pte_notifier - register a pte invalidation callback * @nb: pointer to the gmap notifier block */ -void gmap_register_ipte_notifier(struct gmap_notifier *nb) +void gmap_register_pte_notifier(struct gmap_notifier *nb) { spin_lock(&gmap_notifier_lock); - list_add(&nb->list, &gmap_notifier_list); + list_add_rcu(&nb->list, &gmap_notifier_list); spin_unlock(&gmap_notifier_lock); } -EXPORT_SYMBOL_GPL(gmap_register_ipte_notifier); +EXPORT_SYMBOL_GPL(gmap_register_pte_notifier); /** - * gmap_unregister_ipte_notifier - remove a pte invalidation callback + * gmap_unregister_pte_notifier - remove a pte invalidation callback * @nb: pointer to the gmap notifier block */ -void gmap_unregister_ipte_notifier(struct gmap_notifier *nb) +void gmap_unregister_pte_notifier(struct gmap_notifier *nb) { spin_lock(&gmap_notifier_lock); - list_del_init(&nb->list); + list_del_rcu(&nb->list); spin_unlock(&gmap_notifier_lock); + synchronize_rcu(); +} +EXPORT_SYMBOL_GPL(gmap_unregister_pte_notifier); + +/** + * gmap_call_notifier - call all registered invalidation callbacks + * @gmap: pointer to guest mapping meta data structure + * @start: start virtual address in the guest address space + * @end: end virtual address in the guest address space + */ +static void gmap_call_notifier(struct gmap *gmap, unsigned long start, + unsigned long end) +{ + struct gmap_notifier *nb; + + list_for_each_entry(nb, &gmap_notifier_list, list) + nb->notifier_call(gmap, start, end); +} + +/** + * gmap_table_walk - walk the gmap page tables + * @gmap: pointer to guest mapping meta data structure + * @gaddr: virtual address in the guest address space + * @level: page table level to stop at + * + * Returns a table entry pointer for the given guest address and @level + * @level=0 : returns a pointer to a page table table entry (or NULL) + * @level=1 : returns a pointer to a segment table entry (or NULL) + * @level=2 : returns a pointer to a region-3 table entry (or NULL) + * @level=3 : returns a pointer to a region-2 table entry (or NULL) + * @level=4 : returns a pointer to a region-1 table entry (or NULL) + * + * Returns NULL if the gmap page tables could not be walked to the + * requested level. + * + * Note: Can also be called for shadow gmaps. + */ +static inline unsigned long *gmap_table_walk(struct gmap *gmap, + unsigned long gaddr, int level) +{ + unsigned long *table; + + if ((gmap->asce & _ASCE_TYPE_MASK) + 4 < (level * 4)) + return NULL; + if (gmap_is_shadow(gmap) && gmap->removed) + return NULL; + if (gaddr & (-1UL << (31 + ((gmap->asce & _ASCE_TYPE_MASK) >> 2)*11))) + return NULL; + table = gmap->table; + switch (gmap->asce & _ASCE_TYPE_MASK) { + case _ASCE_TYPE_REGION1: + table += (gaddr >> 53) & 0x7ff; + if (level == 4) + break; + if (*table & _REGION_ENTRY_INVALID) + return NULL; + table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN); + /* Fallthrough */ + case _ASCE_TYPE_REGION2: + table += (gaddr >> 42) & 0x7ff; + if (level == 3) + break; + if (*table & _REGION_ENTRY_INVALID) + return NULL; + table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN); + /* Fallthrough */ + case _ASCE_TYPE_REGION3: + table += (gaddr >> 31) & 0x7ff; + if (level == 2) + break; + if (*table & _REGION_ENTRY_INVALID) + return NULL; + table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN); + /* Fallthrough */ + case _ASCE_TYPE_SEGMENT: + table += (gaddr >> 20) & 0x7ff; + if (level == 1) + break; + if (*table & _REGION_ENTRY_INVALID) + return NULL; + table = (unsigned long *)(*table & _SEGMENT_ENTRY_ORIGIN); + table += (gaddr >> 12) & 0xff; + } + return table; +} + +/** + * gmap_pte_op_walk - walk the gmap page table, get the page table lock + * and return the pte pointer + * @gmap: pointer to guest mapping meta data structure + * @gaddr: virtual address in the guest address space + * @ptl: pointer to the spinlock pointer + * + * Returns a pointer to the locked pte for a guest address, or NULL + * + * Note: Can also be called for shadow gmaps. + */ +static pte_t *gmap_pte_op_walk(struct gmap *gmap, unsigned long gaddr, + spinlock_t **ptl) +{ + unsigned long *table; + + if (gmap_is_shadow(gmap)) + spin_lock(&gmap->guest_table_lock); + /* Walk the gmap page table, lock and get pte pointer */ + table = gmap_table_walk(gmap, gaddr, 1); /* get segment pointer */ + if (!table || *table & _SEGMENT_ENTRY_INVALID) { + if (gmap_is_shadow(gmap)) + spin_unlock(&gmap->guest_table_lock); + return NULL; + } + if (gmap_is_shadow(gmap)) { + *ptl = &gmap->guest_table_lock; + return pte_offset_map((pmd_t *) table, gaddr); + } + return pte_alloc_map_lock(gmap->mm, (pmd_t *) table, gaddr, ptl); } -EXPORT_SYMBOL_GPL(gmap_unregister_ipte_notifier); /** - * gmap_ipte_notify - mark a range of ptes for invalidation notification + * gmap_pte_op_fixup - force a page in and connect the gmap page table + * @gmap: pointer to guest mapping meta data structure + * @gaddr: virtual address in the guest address space + * @vmaddr: address in the host process address space + * @prot: indicates access rights: PROT_NONE, PROT_READ or PROT_WRITE + * + * Returns 0 if the caller can retry __gmap_translate (might fail again), + * -ENOMEM if out of memory and -EFAULT if anything goes wrong while fixing + * up or connecting the gmap page table. + */ +static int gmap_pte_op_fixup(struct gmap *gmap, unsigned long gaddr, + unsigned long vmaddr, int prot) +{ + struct mm_struct *mm = gmap->mm; + unsigned int fault_flags; + bool unlocked = false; + + BUG_ON(gmap_is_shadow(gmap)); + fault_flags = (prot == PROT_WRITE) ? FAULT_FLAG_WRITE : 0; + if (fixup_user_fault(current, mm, vmaddr, fault_flags, &unlocked)) + return -EFAULT; + if (unlocked) + /* lost mmap_sem, caller has to retry __gmap_translate */ + return 0; + /* Connect the page tables */ + return __gmap_link(gmap, gaddr, vmaddr); +} + +/** + * gmap_pte_op_end - release the page table lock + * @ptl: pointer to the spinlock pointer + */ +static void gmap_pte_op_end(spinlock_t *ptl) +{ + spin_unlock(ptl); +} + +/* + * gmap_protect_range - remove access rights to memory and set pgste bits * @gmap: pointer to guest mapping meta data structure * @gaddr: virtual address in the guest address space * @len: size of area + * @prot: indicates access rights: PROT_NONE, PROT_READ or PROT_WRITE + * @bits: pgste notification bits to set + * + * Returns 0 if successfully protected, -ENOMEM if out of memory and + * -EFAULT if gaddr is invalid (or mapping for shadows is missing). * - * Returns 0 if for each page in the given range a gmap mapping exists and - * the invalidation notification could be set. If the gmap mapping is missing - * for one or more pages -EFAULT is returned. If no memory could be allocated - * -ENOMEM is returned. This function establishes missing page table entries. + * Called with sg->mm->mmap_sem in read. + * + * Note: Can also be called for shadow gmaps. */ -int gmap_ipte_notify(struct gmap *gmap, unsigned long gaddr, unsigned long len) +static int gmap_protect_range(struct gmap *gmap, unsigned long gaddr, + unsigned long len, int prot, unsigned long bits) { - unsigned long addr; + unsigned long vmaddr; spinlock_t *ptl; pte_t *ptep; - bool unlocked; - int rc = 0; + int rc; + + while (len) { + rc = -EAGAIN; + ptep = gmap_pte_op_walk(gmap, gaddr, &ptl); + if (ptep) { + rc = ptep_force_prot(gmap->mm, gaddr, ptep, prot, bits); + gmap_pte_op_end(ptl); + } + if (rc) { + vmaddr = __gmap_translate(gmap, gaddr); + if (IS_ERR_VALUE(vmaddr)) + return vmaddr; + rc = gmap_pte_op_fixup(gmap, gaddr, vmaddr, prot); + if (rc) + return rc; + continue; + } + gaddr += PAGE_SIZE; + len -= PAGE_SIZE; + } + return 0; +} - if ((gaddr & ~PAGE_MASK) || (len & ~PAGE_MASK)) +/** + * gmap_mprotect_notify - change access rights for a range of ptes and + * call the notifier if any pte changes again + * @gmap: pointer to guest mapping meta data structure + * @gaddr: virtual address in the guest address space + * @len: size of area + * @prot: indicates access rights: PROT_NONE, PROT_READ or PROT_WRITE + * + * Returns 0 if for each page in the given range a gmap mapping exists, + * the new access rights could be set and the notifier could be armed. + * If the gmap mapping is missing for one or more pages -EFAULT is + * returned. If no memory could be allocated -ENOMEM is returned. + * This function establishes missing page table entries. + */ +int gmap_mprotect_notify(struct gmap *gmap, unsigned long gaddr, + unsigned long len, int prot) +{ + int rc; + + if ((gaddr & ~PAGE_MASK) || (len & ~PAGE_MASK) || gmap_is_shadow(gmap)) + return -EINVAL; + if (!MACHINE_HAS_ESOP && prot == PROT_READ) return -EINVAL; down_read(&gmap->mm->mmap_sem); - while (len) { - unlocked = false; - /* Convert gmap address and connect the page tables */ - addr = __gmap_translate(gmap, gaddr); - if (IS_ERR_VALUE(addr)) { - rc = addr; + rc = gmap_protect_range(gmap, gaddr, len, prot, PGSTE_IN_BIT); + up_read(&gmap->mm->mmap_sem); + return rc; +} +EXPORT_SYMBOL_GPL(gmap_mprotect_notify); + +/** + * gmap_read_table - get an unsigned long value from a guest page table using + * absolute addressing, without marking the page referenced. + * @gmap: pointer to guest mapping meta data structure + * @gaddr: virtual address in the guest address space + * @val: pointer to the unsigned long value to return + * + * Returns 0 if the value was read, -ENOMEM if out of memory and -EFAULT + * if reading using the virtual address failed. + * + * Called with gmap->mm->mmap_sem in read. + */ +int gmap_read_table(struct gmap *gmap, unsigned long gaddr, unsigned long *val) +{ + unsigned long address, vmaddr; + spinlock_t *ptl; + pte_t *ptep, pte; + int rc; + + while (1) { + rc = -EAGAIN; + ptep = gmap_pte_op_walk(gmap, gaddr, &ptl); + if (ptep) { + pte = *ptep; + if (pte_present(pte) && (pte_val(pte) & _PAGE_READ)) { + address = pte_val(pte) & PAGE_MASK; + address += gaddr & ~PAGE_MASK; + *val = *(unsigned long *) address; + pte_val(*ptep) |= _PAGE_YOUNG; + /* Do *NOT* clear the _PAGE_INVALID bit! */ + rc = 0; + } + gmap_pte_op_end(ptl); + } + if (!rc) + break; + vmaddr = __gmap_translate(gmap, gaddr); + if (IS_ERR_VALUE(vmaddr)) { + rc = vmaddr; break; } - /* Get the page mapped */ - if (fixup_user_fault(current, gmap->mm, addr, FAULT_FLAG_WRITE, - &unlocked)) { - rc = -EFAULT; + rc = gmap_pte_op_fixup(gmap, gaddr, vmaddr, PROT_READ); + if (rc) break; + } + return rc; +} +EXPORT_SYMBOL_GPL(gmap_read_table); + +/** + * gmap_insert_rmap - add a rmap to the host_to_rmap radix tree + * @sg: pointer to the shadow guest address space structure + * @vmaddr: vm address associated with the rmap + * @rmap: pointer to the rmap structure + * + * Called with the sg->guest_table_lock + */ +static inline void gmap_insert_rmap(struct gmap *sg, unsigned long vmaddr, + struct gmap_rmap *rmap) +{ + void **slot; + + BUG_ON(!gmap_is_shadow(sg)); + slot = radix_tree_lookup_slot(&sg->host_to_rmap, vmaddr >> PAGE_SHIFT); + if (slot) { + rmap->next = radix_tree_deref_slot_protected(slot, + &sg->guest_table_lock); + radix_tree_replace_slot(slot, rmap); + } else { + rmap->next = NULL; + radix_tree_insert(&sg->host_to_rmap, vmaddr >> PAGE_SHIFT, + rmap); + } +} + +/** + * gmap_protect_rmap - modify access rights to memory and create an rmap + * @sg: pointer to the shadow guest address space structure + * @raddr: rmap address in the shadow gmap + * @paddr: address in the parent guest address space + * @len: length of the memory area to protect + * @prot: indicates access rights: none, read-only or read-write + * + * Returns 0 if successfully protected and the rmap was created, -ENOMEM + * if out of memory and -EFAULT if paddr is invalid. + */ +static int gmap_protect_rmap(struct gmap *sg, unsigned long raddr, + unsigned long paddr, unsigned long len, int prot) +{ + struct gmap *parent; + struct gmap_rmap *rmap; + unsigned long vmaddr; + spinlock_t *ptl; + pte_t *ptep; + int rc; + + BUG_ON(!gmap_is_shadow(sg)); + parent = sg->parent; + while (len) { + vmaddr = __gmap_translate(parent, paddr); + if (IS_ERR_VALUE(vmaddr)) + return vmaddr; + rmap = kzalloc(sizeof(*rmap), GFP_KERNEL); + if (!rmap) + return -ENOMEM; + rmap->raddr = raddr; + rc = radix_tree_preload(GFP_KERNEL); + if (rc) { + kfree(rmap); + return rc; } - /* While trying to map mmap_sem got unlocked. Let us retry */ - if (unlocked) + rc = -EAGAIN; + ptep = gmap_pte_op_walk(parent, paddr, &ptl); + if (ptep) { + spin_lock(&sg->guest_table_lock); + rc = ptep_force_prot(parent->mm, paddr, ptep, prot, + PGSTE_VSIE_BIT); + if (!rc) + gmap_insert_rmap(sg, vmaddr, rmap); + spin_unlock(&sg->guest_table_lock); + gmap_pte_op_end(ptl); + } + radix_tree_preload_end(); + if (rc) { + kfree(rmap); + rc = gmap_pte_op_fixup(parent, paddr, vmaddr, prot); + if (rc) + return rc; continue; - rc = __gmap_link(gmap, gaddr, addr); + } + paddr += PAGE_SIZE; + len -= PAGE_SIZE; + } + return 0; +} + +#define _SHADOW_RMAP_MASK 0x7 +#define _SHADOW_RMAP_REGION1 0x5 +#define _SHADOW_RMAP_REGION2 0x4 +#define _SHADOW_RMAP_REGION3 0x3 +#define _SHADOW_RMAP_SEGMENT 0x2 +#define _SHADOW_RMAP_PGTABLE 0x1 + +/** + * gmap_idte_one - invalidate a single region or segment table entry + * @asce: region or segment table *origin* + table-type bits + * @vaddr: virtual address to identify the table entry to flush + * + * The invalid bit of a single region or segment table entry is set + * and the associated TLB entries depending on the entry are flushed. + * The table-type of the @asce identifies the portion of the @vaddr + * that is used as the invalidation index. + */ +static inline void gmap_idte_one(unsigned long asce, unsigned long vaddr) +{ + asm volatile( + " .insn rrf,0xb98e0000,%0,%1,0,0" + : : "a" (asce), "a" (vaddr) : "cc", "memory"); +} + +/** + * gmap_unshadow_page - remove a page from a shadow page table + * @sg: pointer to the shadow guest address space structure + * @raddr: rmap address in the shadow guest address space + * + * Called with the sg->guest_table_lock + */ +static void gmap_unshadow_page(struct gmap *sg, unsigned long raddr) +{ + unsigned long *table; + + BUG_ON(!gmap_is_shadow(sg)); + table = gmap_table_walk(sg, raddr, 0); /* get page table pointer */ + if (!table || *table & _PAGE_INVALID) + return; + gmap_call_notifier(sg, raddr, raddr + (1UL << 12) - 1); + ptep_unshadow_pte(sg->mm, raddr, (pte_t *) table); +} + +/** + * __gmap_unshadow_pgt - remove all entries from a shadow page table + * @sg: pointer to the shadow guest address space structure + * @raddr: rmap address in the shadow guest address space + * @pgt: pointer to the start of a shadow page table + * + * Called with the sg->guest_table_lock + */ +static void __gmap_unshadow_pgt(struct gmap *sg, unsigned long raddr, + unsigned long *pgt) +{ + int i; + + BUG_ON(!gmap_is_shadow(sg)); + for (i = 0; i < 256; i++, raddr += 1UL << 12) + pgt[i] = _PAGE_INVALID; +} + +/** + * gmap_unshadow_pgt - remove a shadow page table from a segment entry + * @sg: pointer to the shadow guest address space structure + * @raddr: address in the shadow guest address space + * + * Called with the sg->guest_table_lock + */ +static void gmap_unshadow_pgt(struct gmap *sg, unsigned long raddr) +{ + unsigned long sto, *ste, *pgt; + struct page *page; + + BUG_ON(!gmap_is_shadow(sg)); + ste = gmap_table_walk(sg, raddr, 1); /* get segment pointer */ + if (!ste || !(*ste & _SEGMENT_ENTRY_ORIGIN)) + return; + gmap_call_notifier(sg, raddr, raddr + (1UL << 20) - 1); + sto = (unsigned long) (ste - ((raddr >> 20) & 0x7ff)); + gmap_idte_one(sto | _ASCE_TYPE_SEGMENT, raddr); + pgt = (unsigned long *)(*ste & _SEGMENT_ENTRY_ORIGIN); + *ste = _SEGMENT_ENTRY_EMPTY; + __gmap_unshadow_pgt(sg, raddr, pgt); + /* Free page table */ + page = pfn_to_page(__pa(pgt) >> PAGE_SHIFT); + list_del(&page->lru); + page_table_free_pgste(page); +} + +/** + * __gmap_unshadow_sgt - remove all entries from a shadow segment table + * @sg: pointer to the shadow guest address space structure + * @raddr: rmap address in the shadow guest address space + * @sgt: pointer to the start of a shadow segment table + * + * Called with the sg->guest_table_lock + */ +static void __gmap_unshadow_sgt(struct gmap *sg, unsigned long raddr, + unsigned long *sgt) +{ + unsigned long asce, *pgt; + struct page *page; + int i; + + BUG_ON(!gmap_is_shadow(sg)); + asce = (unsigned long) sgt | _ASCE_TYPE_SEGMENT; + for (i = 0; i < 2048; i++, raddr += 1UL << 20) { + if (!(sgt[i] & _SEGMENT_ENTRY_ORIGIN)) + continue; + pgt = (unsigned long *)(sgt[i] & _REGION_ENTRY_ORIGIN); + sgt[i] = _SEGMENT_ENTRY_EMPTY; + __gmap_unshadow_pgt(sg, raddr, pgt); + /* Free page table */ + page = pfn_to_page(__pa(pgt) >> PAGE_SHIFT); + list_del(&page->lru); + page_table_free_pgste(page); + } +} + +/** + * gmap_unshadow_sgt - remove a shadow segment table from a region-3 entry + * @sg: pointer to the shadow guest address space structure + * @raddr: rmap address in the shadow guest address space + * + * Called with the shadow->guest_table_lock + */ +static void gmap_unshadow_sgt(struct gmap *sg, unsigned long raddr) +{ + unsigned long r3o, *r3e, *sgt; + struct page *page; + + BUG_ON(!gmap_is_shadow(sg)); + r3e = gmap_table_walk(sg, raddr, 2); /* get region-3 pointer */ + if (!r3e || !(*r3e & _REGION_ENTRY_ORIGIN)) + return; + gmap_call_notifier(sg, raddr, raddr + (1UL << 31) - 1); + r3o = (unsigned long) (r3e - ((raddr >> 31) & 0x7ff)); + gmap_idte_one(r3o | _ASCE_TYPE_REGION3, raddr); + sgt = (unsigned long *)(*r3e & _REGION_ENTRY_ORIGIN); + *r3e = _REGION3_ENTRY_EMPTY; + __gmap_unshadow_sgt(sg, raddr, sgt); + /* Free segment table */ + page = pfn_to_page(__pa(sgt) >> PAGE_SHIFT); + list_del(&page->lru); + __free_pages(page, 2); +} + +/** + * __gmap_unshadow_r3t - remove all entries from a shadow region-3 table + * @sg: pointer to the shadow guest address space structure + * @raddr: address in the shadow guest address space + * @r3t: pointer to the start of a shadow region-3 table + * + * Called with the sg->guest_table_lock + */ +static void __gmap_unshadow_r3t(struct gmap *sg, unsigned long raddr, + unsigned long *r3t) +{ + unsigned long asce, *sgt; + struct page *page; + int i; + + BUG_ON(!gmap_is_shadow(sg)); + asce = (unsigned long) r3t | _ASCE_TYPE_REGION3; + for (i = 0; i < 2048; i++, raddr += 1UL << 31) { + if (!(r3t[i] & _REGION_ENTRY_ORIGIN)) + continue; + sgt = (unsigned long *)(r3t[i] & _REGION_ENTRY_ORIGIN); + r3t[i] = _REGION3_ENTRY_EMPTY; + __gmap_unshadow_sgt(sg, raddr, sgt); + /* Free segment table */ + page = pfn_to_page(__pa(sgt) >> PAGE_SHIFT); + list_del(&page->lru); + __free_pages(page, 2); + } +} + +/** + * gmap_unshadow_r3t - remove a shadow region-3 table from a region-2 entry + * @sg: pointer to the shadow guest address space structure + * @raddr: rmap address in the shadow guest address space + * + * Called with the sg->guest_table_lock + */ +static void gmap_unshadow_r3t(struct gmap *sg, unsigned long raddr) +{ + unsigned long r2o, *r2e, *r3t; + struct page *page; + + BUG_ON(!gmap_is_shadow(sg)); + r2e = gmap_table_walk(sg, raddr, 3); /* get region-2 pointer */ + if (!r2e || !(*r2e & _REGION_ENTRY_ORIGIN)) + return; + gmap_call_notifier(sg, raddr, raddr + (1UL << 42) - 1); + r2o = (unsigned long) (r2e - ((raddr >> 42) & 0x7ff)); + gmap_idte_one(r2o | _ASCE_TYPE_REGION2, raddr); + r3t = (unsigned long *)(*r2e & _REGION_ENTRY_ORIGIN); + *r2e = _REGION2_ENTRY_EMPTY; + __gmap_unshadow_r3t(sg, raddr, r3t); + /* Free region 3 table */ + page = pfn_to_page(__pa(r3t) >> PAGE_SHIFT); + list_del(&page->lru); + __free_pages(page, 2); +} + +/** + * __gmap_unshadow_r2t - remove all entries from a shadow region-2 table + * @sg: pointer to the shadow guest address space structure + * @raddr: rmap address in the shadow guest address space + * @r2t: pointer to the start of a shadow region-2 table + * + * Called with the sg->guest_table_lock + */ +static void __gmap_unshadow_r2t(struct gmap *sg, unsigned long raddr, + unsigned long *r2t) +{ + unsigned long asce, *r3t; + struct page *page; + int i; + + BUG_ON(!gmap_is_shadow(sg)); + asce = (unsigned long) r2t | _ASCE_TYPE_REGION2; + for (i = 0; i < 2048; i++, raddr += 1UL << 42) { + if (!(r2t[i] & _REGION_ENTRY_ORIGIN)) + continue; + r3t = (unsigned long *)(r2t[i] & _REGION_ENTRY_ORIGIN); + r2t[i] = _REGION2_ENTRY_EMPTY; + __gmap_unshadow_r3t(sg, raddr, r3t); + /* Free region 3 table */ + page = pfn_to_page(__pa(r3t) >> PAGE_SHIFT); + list_del(&page->lru); + __free_pages(page, 2); + } +} + +/** + * gmap_unshadow_r2t - remove a shadow region-2 table from a region-1 entry + * @sg: pointer to the shadow guest address space structure + * @raddr: rmap address in the shadow guest address space + * + * Called with the sg->guest_table_lock + */ +static void gmap_unshadow_r2t(struct gmap *sg, unsigned long raddr) +{ + unsigned long r1o, *r1e, *r2t; + struct page *page; + + BUG_ON(!gmap_is_shadow(sg)); + r1e = gmap_table_walk(sg, raddr, 4); /* get region-1 pointer */ + if (!r1e || !(*r1e & _REGION_ENTRY_ORIGIN)) + return; + gmap_call_notifier(sg, raddr, raddr + (1UL << 53) - 1); + r1o = (unsigned long) (r1e - ((raddr >> 53) & 0x7ff)); + gmap_idte_one(r1o | _ASCE_TYPE_REGION1, raddr); + r2t = (unsigned long *)(*r1e & _REGION_ENTRY_ORIGIN); + *r1e = _REGION1_ENTRY_EMPTY; + __gmap_unshadow_r2t(sg, raddr, r2t); + /* Free region 2 table */ + page = pfn_to_page(__pa(r2t) >> PAGE_SHIFT); + list_del(&page->lru); + __free_pages(page, 2); +} + +/** + * __gmap_unshadow_r1t - remove all entries from a shadow region-1 table + * @sg: pointer to the shadow guest address space structure + * @raddr: rmap address in the shadow guest address space + * @r1t: pointer to the start of a shadow region-1 table + * + * Called with the shadow->guest_table_lock + */ +static void __gmap_unshadow_r1t(struct gmap *sg, unsigned long raddr, + unsigned long *r1t) +{ + unsigned long asce, *r2t; + struct page *page; + int i; + + BUG_ON(!gmap_is_shadow(sg)); + asce = (unsigned long) r1t | _ASCE_TYPE_REGION1; + for (i = 0; i < 2048; i++, raddr += 1UL << 53) { + if (!(r1t[i] & _REGION_ENTRY_ORIGIN)) + continue; + r2t = (unsigned long *)(r1t[i] & _REGION_ENTRY_ORIGIN); + __gmap_unshadow_r2t(sg, raddr, r2t); + /* Clear entry and flush translation r1t -> r2t */ + gmap_idte_one(asce, raddr); + r1t[i] = _REGION1_ENTRY_EMPTY; + /* Free region 2 table */ + page = pfn_to_page(__pa(r2t) >> PAGE_SHIFT); + list_del(&page->lru); + __free_pages(page, 2); + } +} + +/** + * gmap_unshadow - remove a shadow page table completely + * @sg: pointer to the shadow guest address space structure + * + * Called with sg->guest_table_lock + */ +static void gmap_unshadow(struct gmap *sg) +{ + unsigned long *table; + + BUG_ON(!gmap_is_shadow(sg)); + if (sg->removed) + return; + sg->removed = 1; + gmap_call_notifier(sg, 0, -1UL); + gmap_flush_tlb(sg); + table = (unsigned long *)(sg->asce & _ASCE_ORIGIN); + switch (sg->asce & _ASCE_TYPE_MASK) { + case _ASCE_TYPE_REGION1: + __gmap_unshadow_r1t(sg, 0, table); + break; + case _ASCE_TYPE_REGION2: + __gmap_unshadow_r2t(sg, 0, table); + break; + case _ASCE_TYPE_REGION3: + __gmap_unshadow_r3t(sg, 0, table); + break; + case _ASCE_TYPE_SEGMENT: + __gmap_unshadow_sgt(sg, 0, table); + break; + } +} + +/** + * gmap_find_shadow - find a specific asce in the list of shadow tables + * @parent: pointer to the parent gmap + * @asce: ASCE for which the shadow table is created + * @edat_level: edat level to be used for the shadow translation + * + * Returns the pointer to a gmap if a shadow table with the given asce is + * already available, ERR_PTR(-EAGAIN) if another one is just being created, + * otherwise NULL + */ +static struct gmap *gmap_find_shadow(struct gmap *parent, unsigned long asce, + int edat_level) +{ + struct gmap *sg; + + list_for_each_entry(sg, &parent->children, list) { + if (sg->orig_asce != asce || sg->edat_level != edat_level || + sg->removed) + continue; + if (!sg->initialized) + return ERR_PTR(-EAGAIN); + atomic_inc(&sg->ref_count); + return sg; + } + return NULL; +} + +/** + * gmap_shadow_valid - check if a shadow guest address space matches the + * given properties and is still valid + * @sg: pointer to the shadow guest address space structure + * @asce: ASCE for which the shadow table is requested + * @edat_level: edat level to be used for the shadow translation + * + * Returns 1 if the gmap shadow is still valid and matches the given + * properties, the caller can continue using it. Returns 0 otherwise, the + * caller has to request a new shadow gmap in this case. + * + */ +int gmap_shadow_valid(struct gmap *sg, unsigned long asce, int edat_level) +{ + if (sg->removed) + return 0; + return sg->orig_asce == asce && sg->edat_level == edat_level; +} +EXPORT_SYMBOL_GPL(gmap_shadow_valid); + +/** + * gmap_shadow - create/find a shadow guest address space + * @parent: pointer to the parent gmap + * @asce: ASCE for which the shadow table is created + * @edat_level: edat level to be used for the shadow translation + * + * The pages of the top level page table referred by the asce parameter + * will be set to read-only and marked in the PGSTEs of the kvm process. + * The shadow table will be removed automatically on any change to the + * PTE mapping for the source table. + * + * Returns a guest address space structure, ERR_PTR(-ENOMEM) if out of memory, + * ERR_PTR(-EAGAIN) if the caller has to retry and ERR_PTR(-EFAULT) if the + * parent gmap table could not be protected. + */ +struct gmap *gmap_shadow(struct gmap *parent, unsigned long asce, + int edat_level) +{ + struct gmap *sg, *new; + unsigned long limit; + int rc; + + BUG_ON(gmap_is_shadow(parent)); + spin_lock(&parent->shadow_lock); + sg = gmap_find_shadow(parent, asce, edat_level); + spin_unlock(&parent->shadow_lock); + if (sg) + return sg; + /* Create a new shadow gmap */ + limit = -1UL >> (33 - (((asce & _ASCE_TYPE_MASK) >> 2) * 11)); + if (asce & _ASCE_REAL_SPACE) + limit = -1UL; + new = gmap_alloc(limit); + if (!new) + return ERR_PTR(-ENOMEM); + new->mm = parent->mm; + new->parent = gmap_get(parent); + new->orig_asce = asce; + new->edat_level = edat_level; + new->initialized = false; + spin_lock(&parent->shadow_lock); + /* Recheck if another CPU created the same shadow */ + sg = gmap_find_shadow(parent, asce, edat_level); + if (sg) { + spin_unlock(&parent->shadow_lock); + gmap_free(new); + return sg; + } + if (asce & _ASCE_REAL_SPACE) { + /* only allow one real-space gmap shadow */ + list_for_each_entry(sg, &parent->children, list) { + if (sg->orig_asce & _ASCE_REAL_SPACE) { + spin_lock(&sg->guest_table_lock); + gmap_unshadow(sg); + spin_unlock(&sg->guest_table_lock); + list_del(&sg->list); + gmap_put(sg); + break; + } + } + } + atomic_set(&new->ref_count, 2); + list_add(&new->list, &parent->children); + if (asce & _ASCE_REAL_SPACE) { + /* nothing to protect, return right away */ + new->initialized = true; + spin_unlock(&parent->shadow_lock); + return new; + } + spin_unlock(&parent->shadow_lock); + /* protect after insertion, so it will get properly invalidated */ + down_read(&parent->mm->mmap_sem); + rc = gmap_protect_range(parent, asce & _ASCE_ORIGIN, + ((asce & _ASCE_TABLE_LENGTH) + 1) * 4096, + PROT_READ, PGSTE_VSIE_BIT); + up_read(&parent->mm->mmap_sem); + spin_lock(&parent->shadow_lock); + new->initialized = true; + if (rc) { + list_del(&new->list); + gmap_free(new); + new = ERR_PTR(rc); + } + spin_unlock(&parent->shadow_lock); + return new; +} +EXPORT_SYMBOL_GPL(gmap_shadow); + +/** + * gmap_shadow_r2t - create an empty shadow region 2 table + * @sg: pointer to the shadow guest address space structure + * @saddr: faulting address in the shadow gmap + * @r2t: parent gmap address of the region 2 table to get shadowed + * @fake: r2t references contiguous guest memory block, not a r2t + * + * The r2t parameter specifies the address of the source table. The + * four pages of the source table are made read-only in the parent gmap + * address space. A write to the source table area @r2t will automatically + * remove the shadow r2 table and all of its decendents. + * + * Returns 0 if successfully shadowed or already shadowed, -EAGAIN if the + * shadow table structure is incomplete, -ENOMEM if out of memory and + * -EFAULT if an address in the parent gmap could not be resolved. + * + * Called with sg->mm->mmap_sem in read. + */ +int gmap_shadow_r2t(struct gmap *sg, unsigned long saddr, unsigned long r2t, + int fake) +{ + unsigned long raddr, origin, offset, len; + unsigned long *s_r2t, *table; + struct page *page; + int rc; + + BUG_ON(!gmap_is_shadow(sg)); + /* Allocate a shadow region second table */ + page = alloc_pages(GFP_KERNEL, 2); + if (!page) + return -ENOMEM; + page->index = r2t & _REGION_ENTRY_ORIGIN; + if (fake) + page->index |= GMAP_SHADOW_FAKE_TABLE; + s_r2t = (unsigned long *) page_to_phys(page); + /* Install shadow region second table */ + spin_lock(&sg->guest_table_lock); + table = gmap_table_walk(sg, saddr, 4); /* get region-1 pointer */ + if (!table) { + rc = -EAGAIN; /* Race with unshadow */ + goto out_free; + } + if (!(*table & _REGION_ENTRY_INVALID)) { + rc = 0; /* Already established */ + goto out_free; + } else if (*table & _REGION_ENTRY_ORIGIN) { + rc = -EAGAIN; /* Race with shadow */ + goto out_free; + } + crst_table_init(s_r2t, _REGION2_ENTRY_EMPTY); + /* mark as invalid as long as the parent table is not protected */ + *table = (unsigned long) s_r2t | _REGION_ENTRY_LENGTH | + _REGION_ENTRY_TYPE_R1 | _REGION_ENTRY_INVALID; + if (sg->edat_level >= 1) + *table |= (r2t & _REGION_ENTRY_PROTECT); + list_add(&page->lru, &sg->crst_list); + if (fake) { + /* nothing to protect for fake tables */ + *table &= ~_REGION_ENTRY_INVALID; + spin_unlock(&sg->guest_table_lock); + return 0; + } + spin_unlock(&sg->guest_table_lock); + /* Make r2t read-only in parent gmap page table */ + raddr = (saddr & 0xffe0000000000000UL) | _SHADOW_RMAP_REGION1; + origin = r2t & _REGION_ENTRY_ORIGIN; + offset = ((r2t & _REGION_ENTRY_OFFSET) >> 6) * 4096; + len = ((r2t & _REGION_ENTRY_LENGTH) + 1) * 4096 - offset; + rc = gmap_protect_rmap(sg, raddr, origin + offset, len, PROT_READ); + spin_lock(&sg->guest_table_lock); + if (!rc) { + table = gmap_table_walk(sg, saddr, 4); + if (!table || (*table & _REGION_ENTRY_ORIGIN) != + (unsigned long) s_r2t) + rc = -EAGAIN; /* Race with unshadow */ + else + *table &= ~_REGION_ENTRY_INVALID; + } else { + gmap_unshadow_r2t(sg, raddr); + } + spin_unlock(&sg->guest_table_lock); + return rc; +out_free: + spin_unlock(&sg->guest_table_lock); + __free_pages(page, 2); + return rc; +} +EXPORT_SYMBOL_GPL(gmap_shadow_r2t); + +/** + * gmap_shadow_r3t - create a shadow region 3 table + * @sg: pointer to the shadow guest address space structure + * @saddr: faulting address in the shadow gmap + * @r3t: parent gmap address of the region 3 table to get shadowed + * @fake: r3t references contiguous guest memory block, not a r3t + * + * Returns 0 if successfully shadowed or already shadowed, -EAGAIN if the + * shadow table structure is incomplete, -ENOMEM if out of memory and + * -EFAULT if an address in the parent gmap could not be resolved. + * + * Called with sg->mm->mmap_sem in read. + */ +int gmap_shadow_r3t(struct gmap *sg, unsigned long saddr, unsigned long r3t, + int fake) +{ + unsigned long raddr, origin, offset, len; + unsigned long *s_r3t, *table; + struct page *page; + int rc; + + BUG_ON(!gmap_is_shadow(sg)); + /* Allocate a shadow region second table */ + page = alloc_pages(GFP_KERNEL, 2); + if (!page) + return -ENOMEM; + page->index = r3t & _REGION_ENTRY_ORIGIN; + if (fake) + page->index |= GMAP_SHADOW_FAKE_TABLE; + s_r3t = (unsigned long *) page_to_phys(page); + /* Install shadow region second table */ + spin_lock(&sg->guest_table_lock); + table = gmap_table_walk(sg, saddr, 3); /* get region-2 pointer */ + if (!table) { + rc = -EAGAIN; /* Race with unshadow */ + goto out_free; + } + if (!(*table & _REGION_ENTRY_INVALID)) { + rc = 0; /* Already established */ + goto out_free; + } else if (*table & _REGION_ENTRY_ORIGIN) { + rc = -EAGAIN; /* Race with shadow */ + } + crst_table_init(s_r3t, _REGION3_ENTRY_EMPTY); + /* mark as invalid as long as the parent table is not protected */ + *table = (unsigned long) s_r3t | _REGION_ENTRY_LENGTH | + _REGION_ENTRY_TYPE_R2 | _REGION_ENTRY_INVALID; + if (sg->edat_level >= 1) + *table |= (r3t & _REGION_ENTRY_PROTECT); + list_add(&page->lru, &sg->crst_list); + if (fake) { + /* nothing to protect for fake tables */ + *table &= ~_REGION_ENTRY_INVALID; + spin_unlock(&sg->guest_table_lock); + return 0; + } + spin_unlock(&sg->guest_table_lock); + /* Make r3t read-only in parent gmap page table */ + raddr = (saddr & 0xfffffc0000000000UL) | _SHADOW_RMAP_REGION2; + origin = r3t & _REGION_ENTRY_ORIGIN; + offset = ((r3t & _REGION_ENTRY_OFFSET) >> 6) * 4096; + len = ((r3t & _REGION_ENTRY_LENGTH) + 1) * 4096 - offset; + rc = gmap_protect_rmap(sg, raddr, origin + offset, len, PROT_READ); + spin_lock(&sg->guest_table_lock); + if (!rc) { + table = gmap_table_walk(sg, saddr, 3); + if (!table || (*table & _REGION_ENTRY_ORIGIN) != + (unsigned long) s_r3t) + rc = -EAGAIN; /* Race with unshadow */ + else + *table &= ~_REGION_ENTRY_INVALID; + } else { + gmap_unshadow_r3t(sg, raddr); + } + spin_unlock(&sg->guest_table_lock); + return rc; +out_free: + spin_unlock(&sg->guest_table_lock); + __free_pages(page, 2); + return rc; +} +EXPORT_SYMBOL_GPL(gmap_shadow_r3t); + +/** + * gmap_shadow_sgt - create a shadow segment table + * @sg: pointer to the shadow guest address space structure + * @saddr: faulting address in the shadow gmap + * @sgt: parent gmap address of the segment table to get shadowed + * @fake: sgt references contiguous guest memory block, not a sgt + * + * Returns: 0 if successfully shadowed or already shadowed, -EAGAIN if the + * shadow table structure is incomplete, -ENOMEM if out of memory and + * -EFAULT if an address in the parent gmap could not be resolved. + * + * Called with sg->mm->mmap_sem in read. + */ +int gmap_shadow_sgt(struct gmap *sg, unsigned long saddr, unsigned long sgt, + int fake) +{ + unsigned long raddr, origin, offset, len; + unsigned long *s_sgt, *table; + struct page *page; + int rc; + + BUG_ON(!gmap_is_shadow(sg) || (sgt & _REGION3_ENTRY_LARGE)); + /* Allocate a shadow segment table */ + page = alloc_pages(GFP_KERNEL, 2); + if (!page) + return -ENOMEM; + page->index = sgt & _REGION_ENTRY_ORIGIN; + if (fake) + page->index |= GMAP_SHADOW_FAKE_TABLE; + s_sgt = (unsigned long *) page_to_phys(page); + /* Install shadow region second table */ + spin_lock(&sg->guest_table_lock); + table = gmap_table_walk(sg, saddr, 2); /* get region-3 pointer */ + if (!table) { + rc = -EAGAIN; /* Race with unshadow */ + goto out_free; + } + if (!(*table & _REGION_ENTRY_INVALID)) { + rc = 0; /* Already established */ + goto out_free; + } else if (*table & _REGION_ENTRY_ORIGIN) { + rc = -EAGAIN; /* Race with shadow */ + goto out_free; + } + crst_table_init(s_sgt, _SEGMENT_ENTRY_EMPTY); + /* mark as invalid as long as the parent table is not protected */ + *table = (unsigned long) s_sgt | _REGION_ENTRY_LENGTH | + _REGION_ENTRY_TYPE_R3 | _REGION_ENTRY_INVALID; + if (sg->edat_level >= 1) + *table |= sgt & _REGION_ENTRY_PROTECT; + list_add(&page->lru, &sg->crst_list); + if (fake) { + /* nothing to protect for fake tables */ + *table &= ~_REGION_ENTRY_INVALID; + spin_unlock(&sg->guest_table_lock); + return 0; + } + spin_unlock(&sg->guest_table_lock); + /* Make sgt read-only in parent gmap page table */ + raddr = (saddr & 0xffffffff80000000UL) | _SHADOW_RMAP_REGION3; + origin = sgt & _REGION_ENTRY_ORIGIN; + offset = ((sgt & _REGION_ENTRY_OFFSET) >> 6) * 4096; + len = ((sgt & _REGION_ENTRY_LENGTH) + 1) * 4096 - offset; + rc = gmap_protect_rmap(sg, raddr, origin + offset, len, PROT_READ); + spin_lock(&sg->guest_table_lock); + if (!rc) { + table = gmap_table_walk(sg, saddr, 2); + if (!table || (*table & _REGION_ENTRY_ORIGIN) != + (unsigned long) s_sgt) + rc = -EAGAIN; /* Race with unshadow */ + else + *table &= ~_REGION_ENTRY_INVALID; + } else { + gmap_unshadow_sgt(sg, raddr); + } + spin_unlock(&sg->guest_table_lock); + return rc; +out_free: + spin_unlock(&sg->guest_table_lock); + __free_pages(page, 2); + return rc; +} +EXPORT_SYMBOL_GPL(gmap_shadow_sgt); + +/** + * gmap_shadow_lookup_pgtable - find a shadow page table + * @sg: pointer to the shadow guest address space structure + * @saddr: the address in the shadow aguest address space + * @pgt: parent gmap address of the page table to get shadowed + * @dat_protection: if the pgtable is marked as protected by dat + * @fake: pgt references contiguous guest memory block, not a pgtable + * + * Returns 0 if the shadow page table was found and -EAGAIN if the page + * table was not found. + * + * Called with sg->mm->mmap_sem in read. + */ +int gmap_shadow_pgt_lookup(struct gmap *sg, unsigned long saddr, + unsigned long *pgt, int *dat_protection, + int *fake) +{ + unsigned long *table; + struct page *page; + int rc; + + BUG_ON(!gmap_is_shadow(sg)); + spin_lock(&sg->guest_table_lock); + table = gmap_table_walk(sg, saddr, 1); /* get segment pointer */ + if (table && !(*table & _SEGMENT_ENTRY_INVALID)) { + /* Shadow page tables are full pages (pte+pgste) */ + page = pfn_to_page(*table >> PAGE_SHIFT); + *pgt = page->index & ~GMAP_SHADOW_FAKE_TABLE; + *dat_protection = !!(*table & _SEGMENT_ENTRY_PROTECT); + *fake = !!(page->index & GMAP_SHADOW_FAKE_TABLE); + rc = 0; + } else { + rc = -EAGAIN; + } + spin_unlock(&sg->guest_table_lock); + return rc; + +} +EXPORT_SYMBOL_GPL(gmap_shadow_pgt_lookup); + +/** + * gmap_shadow_pgt - instantiate a shadow page table + * @sg: pointer to the shadow guest address space structure + * @saddr: faulting address in the shadow gmap + * @pgt: parent gmap address of the page table to get shadowed + * @fake: pgt references contiguous guest memory block, not a pgtable + * + * Returns 0 if successfully shadowed or already shadowed, -EAGAIN if the + * shadow table structure is incomplete, -ENOMEM if out of memory, + * -EFAULT if an address in the parent gmap could not be resolved and + * + * Called with gmap->mm->mmap_sem in read + */ +int gmap_shadow_pgt(struct gmap *sg, unsigned long saddr, unsigned long pgt, + int fake) +{ + unsigned long raddr, origin; + unsigned long *s_pgt, *table; + struct page *page; + int rc; + + BUG_ON(!gmap_is_shadow(sg) || (pgt & _SEGMENT_ENTRY_LARGE)); + /* Allocate a shadow page table */ + page = page_table_alloc_pgste(sg->mm); + if (!page) + return -ENOMEM; + page->index = pgt & _SEGMENT_ENTRY_ORIGIN; + if (fake) + page->index |= GMAP_SHADOW_FAKE_TABLE; + s_pgt = (unsigned long *) page_to_phys(page); + /* Install shadow page table */ + spin_lock(&sg->guest_table_lock); + table = gmap_table_walk(sg, saddr, 1); /* get segment pointer */ + if (!table) { + rc = -EAGAIN; /* Race with unshadow */ + goto out_free; + } + if (!(*table & _SEGMENT_ENTRY_INVALID)) { + rc = 0; /* Already established */ + goto out_free; + } else if (*table & _SEGMENT_ENTRY_ORIGIN) { + rc = -EAGAIN; /* Race with shadow */ + goto out_free; + } + /* mark as invalid as long as the parent table is not protected */ + *table = (unsigned long) s_pgt | _SEGMENT_ENTRY | + (pgt & _SEGMENT_ENTRY_PROTECT) | _SEGMENT_ENTRY_INVALID; + list_add(&page->lru, &sg->pt_list); + if (fake) { + /* nothing to protect for fake tables */ + *table &= ~_SEGMENT_ENTRY_INVALID; + spin_unlock(&sg->guest_table_lock); + return 0; + } + spin_unlock(&sg->guest_table_lock); + /* Make pgt read-only in parent gmap page table (not the pgste) */ + raddr = (saddr & 0xfffffffffff00000UL) | _SHADOW_RMAP_SEGMENT; + origin = pgt & _SEGMENT_ENTRY_ORIGIN & PAGE_MASK; + rc = gmap_protect_rmap(sg, raddr, origin, PAGE_SIZE, PROT_READ); + spin_lock(&sg->guest_table_lock); + if (!rc) { + table = gmap_table_walk(sg, saddr, 1); + if (!table || (*table & _SEGMENT_ENTRY_ORIGIN) != + (unsigned long) s_pgt) + rc = -EAGAIN; /* Race with unshadow */ + else + *table &= ~_SEGMENT_ENTRY_INVALID; + } else { + gmap_unshadow_pgt(sg, raddr); + } + spin_unlock(&sg->guest_table_lock); + return rc; +out_free: + spin_unlock(&sg->guest_table_lock); + page_table_free_pgste(page); + return rc; + +} +EXPORT_SYMBOL_GPL(gmap_shadow_pgt); + +/** + * gmap_shadow_page - create a shadow page mapping + * @sg: pointer to the shadow guest address space structure + * @saddr: faulting address in the shadow gmap + * @pte: pte in parent gmap address space to get shadowed + * + * Returns 0 if successfully shadowed or already shadowed, -EAGAIN if the + * shadow table structure is incomplete, -ENOMEM if out of memory and + * -EFAULT if an address in the parent gmap could not be resolved. + * + * Called with sg->mm->mmap_sem in read. + */ +int gmap_shadow_page(struct gmap *sg, unsigned long saddr, pte_t pte) +{ + struct gmap *parent; + struct gmap_rmap *rmap; + unsigned long vmaddr, paddr; + spinlock_t *ptl; + pte_t *sptep, *tptep; + int prot; + int rc; + + BUG_ON(!gmap_is_shadow(sg)); + parent = sg->parent; + prot = (pte_val(pte) & _PAGE_PROTECT) ? PROT_READ : PROT_WRITE; + + rmap = kzalloc(sizeof(*rmap), GFP_KERNEL); + if (!rmap) + return -ENOMEM; + rmap->raddr = (saddr & PAGE_MASK) | _SHADOW_RMAP_PGTABLE; + + while (1) { + paddr = pte_val(pte) & PAGE_MASK; + vmaddr = __gmap_translate(parent, paddr); + if (IS_ERR_VALUE(vmaddr)) { + rc = vmaddr; + break; + } + rc = radix_tree_preload(GFP_KERNEL); if (rc) break; - /* Walk the process page table, lock and get pte pointer */ - ptep = get_locked_pte(gmap->mm, addr, &ptl); - VM_BUG_ON(!ptep); - /* Set notification bit in the pgste of the pte */ - if ((pte_val(*ptep) & (_PAGE_INVALID | _PAGE_PROTECT)) == 0) { - ptep_set_notify(gmap->mm, addr, ptep); - gaddr += PAGE_SIZE; - len -= PAGE_SIZE; + rc = -EAGAIN; + sptep = gmap_pte_op_walk(parent, paddr, &ptl); + if (sptep) { + spin_lock(&sg->guest_table_lock); + /* Get page table pointer */ + tptep = (pte_t *) gmap_table_walk(sg, saddr, 0); + if (!tptep) { + spin_unlock(&sg->guest_table_lock); + gmap_pte_op_end(ptl); + radix_tree_preload_end(); + break; + } + rc = ptep_shadow_pte(sg->mm, saddr, sptep, tptep, pte); + if (rc > 0) { + /* Success and a new mapping */ + gmap_insert_rmap(sg, vmaddr, rmap); + rmap = NULL; + rc = 0; + } + gmap_pte_op_end(ptl); + spin_unlock(&sg->guest_table_lock); } - pte_unmap_unlock(ptep, ptl); + radix_tree_preload_end(); + if (!rc) + break; + rc = gmap_pte_op_fixup(parent, paddr, vmaddr, prot); + if (rc) + break; } - up_read(&gmap->mm->mmap_sem); + kfree(rmap); return rc; } -EXPORT_SYMBOL_GPL(gmap_ipte_notify); +EXPORT_SYMBOL_GPL(gmap_shadow_page); + +/** + * gmap_shadow_notify - handle notifications for shadow gmap + * + * Called with sg->parent->shadow_lock. + */ +static void gmap_shadow_notify(struct gmap *sg, unsigned long vmaddr, + unsigned long offset, pte_t *pte) +{ + struct gmap_rmap *rmap, *rnext, *head; + unsigned long gaddr, start, end, bits, raddr; + unsigned long *table; + + BUG_ON(!gmap_is_shadow(sg)); + spin_lock(&sg->parent->guest_table_lock); + table = radix_tree_lookup(&sg->parent->host_to_guest, + vmaddr >> PMD_SHIFT); + gaddr = table ? __gmap_segment_gaddr(table) + offset : 0; + spin_unlock(&sg->parent->guest_table_lock); + if (!table) + return; + + spin_lock(&sg->guest_table_lock); + if (sg->removed) { + spin_unlock(&sg->guest_table_lock); + return; + } + /* Check for top level table */ + start = sg->orig_asce & _ASCE_ORIGIN; + end = start + ((sg->orig_asce & _ASCE_TABLE_LENGTH) + 1) * 4096; + if (!(sg->orig_asce & _ASCE_REAL_SPACE) && gaddr >= start && + gaddr < end) { + /* The complete shadow table has to go */ + gmap_unshadow(sg); + spin_unlock(&sg->guest_table_lock); + list_del(&sg->list); + gmap_put(sg); + return; + } + /* Remove the page table tree from on specific entry */ + head = radix_tree_delete(&sg->host_to_rmap, vmaddr >> 12); + gmap_for_each_rmap_safe(rmap, rnext, head) { + bits = rmap->raddr & _SHADOW_RMAP_MASK; + raddr = rmap->raddr ^ bits; + switch (bits) { + case _SHADOW_RMAP_REGION1: + gmap_unshadow_r2t(sg, raddr); + break; + case _SHADOW_RMAP_REGION2: + gmap_unshadow_r3t(sg, raddr); + break; + case _SHADOW_RMAP_REGION3: + gmap_unshadow_sgt(sg, raddr); + break; + case _SHADOW_RMAP_SEGMENT: + gmap_unshadow_pgt(sg, raddr); + break; + case _SHADOW_RMAP_PGTABLE: + gmap_unshadow_page(sg, raddr); + break; + } + kfree(rmap); + } + spin_unlock(&sg->guest_table_lock); +} /** * ptep_notify - call all invalidation callbacks for a specific pte. * @mm: pointer to the process mm_struct * @addr: virtual address in the process address space * @pte: pointer to the page table entry + * @bits: bits from the pgste that caused the notify call * * This function is assumed to be called with the page table lock held * for the pte to notify. */ -void ptep_notify(struct mm_struct *mm, unsigned long vmaddr, pte_t *pte) +void ptep_notify(struct mm_struct *mm, unsigned long vmaddr, + pte_t *pte, unsigned long bits) { unsigned long offset, gaddr; unsigned long *table; - struct gmap_notifier *nb; - struct gmap *gmap; + struct gmap *gmap, *sg, *next; offset = ((unsigned long) pte) & (255 * sizeof(pte_t)); offset = offset * (4096 / sizeof(pte_t)); - spin_lock(&gmap_notifier_lock); - list_for_each_entry(gmap, &mm->context.gmap_list, list) { + rcu_read_lock(); + list_for_each_entry_rcu(gmap, &mm->context.gmap_list, list) { + if (!list_empty(&gmap->children) && (bits & PGSTE_VSIE_BIT)) { + spin_lock(&gmap->shadow_lock); + list_for_each_entry_safe(sg, next, + &gmap->children, list) + gmap_shadow_notify(sg, vmaddr, offset, pte); + spin_unlock(&gmap->shadow_lock); + } + if (!(bits & PGSTE_IN_BIT)) + continue; + spin_lock(&gmap->guest_table_lock); table = radix_tree_lookup(&gmap->host_to_guest, vmaddr >> PMD_SHIFT); - if (!table) - continue; - gaddr = __gmap_segment_gaddr(table) + offset; - list_for_each_entry(nb, &gmap_notifier_list, list) - nb->notifier_call(gmap, gaddr); + if (table) + gaddr = __gmap_segment_gaddr(table) + offset; + spin_unlock(&gmap->guest_table_lock); + if (table) + gmap_call_notifier(gmap, gaddr, gaddr + PAGE_SIZE - 1); } - spin_unlock(&gmap_notifier_lock); + rcu_read_unlock(); } EXPORT_SYMBOL_GPL(ptep_notify); diff --git a/arch/s390/mm/gup.c b/arch/s390/mm/gup.c index a8a6765f1a51..adb0c34bf431 100644 --- a/arch/s390/mm/gup.c +++ b/arch/s390/mm/gup.c @@ -128,6 +128,44 @@ static inline int gup_pmd_range(pud_t *pudp, pud_t pud, unsigned long addr, return 1; } +static int gup_huge_pud(pud_t *pudp, pud_t pud, unsigned long addr, + unsigned long end, int write, struct page **pages, int *nr) +{ + struct page *head, *page; + unsigned long mask; + int refs; + + mask = (write ? _REGION_ENTRY_PROTECT : 0) | _REGION_ENTRY_INVALID; + if ((pud_val(pud) & mask) != 0) + return 0; + VM_BUG_ON(!pfn_valid(pud_pfn(pud))); + + refs = 0; + head = pud_page(pud); + page = head + ((addr & ~PUD_MASK) >> PAGE_SHIFT); + do { + VM_BUG_ON_PAGE(compound_head(page) != head, page); + pages[*nr] = page; + (*nr)++; + page++; + refs++; + } while (addr += PAGE_SIZE, addr != end); + + if (!page_cache_add_speculative(head, refs)) { + *nr -= refs; + return 0; + } + + if (unlikely(pud_val(pud) != pud_val(*pudp))) { + *nr -= refs; + while (refs--) + put_page(head); + return 0; + } + + return 1; +} + static inline int gup_pud_range(pgd_t *pgdp, pgd_t pgd, unsigned long addr, unsigned long end, int write, struct page **pages, int *nr) { @@ -144,7 +182,12 @@ static inline int gup_pud_range(pgd_t *pgdp, pgd_t pgd, unsigned long addr, next = pud_addr_end(addr, end); if (pud_none(pud)) return 0; - if (!gup_pmd_range(pudp, pud, addr, next, write, pages, nr)) + if (unlikely(pud_large(pud))) { + if (!gup_huge_pud(pudp, pud, addr, next, write, pages, + nr)) + return 0; + } else if (!gup_pmd_range(pudp, pud, addr, next, write, pages, + nr)) return 0; } while (pudp++, addr = next, addr != end); diff --git a/arch/s390/mm/hugetlbpage.c b/arch/s390/mm/hugetlbpage.c index 1b5e8983f4f3..cd404aa3931c 100644 --- a/arch/s390/mm/hugetlbpage.c +++ b/arch/s390/mm/hugetlbpage.c @@ -1,19 +1,28 @@ /* * IBM System z Huge TLB Page Support for Kernel. * - * Copyright IBM Corp. 2007 + * Copyright IBM Corp. 2007,2016 * Author(s): Gerald Schaefer <gerald.schaefer@de.ibm.com> */ +#define KMSG_COMPONENT "hugetlb" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + #include <linux/mm.h> #include <linux/hugetlb.h> -static inline pmd_t __pte_to_pmd(pte_t pte) +/* + * If the bit selected by single-bit bitmask "a" is set within "x", move + * it to the position indicated by single-bit bitmask "b". + */ +#define move_set_bit(x, a, b) (((x) & (a)) >> ilog2(a) << ilog2(b)) + +static inline unsigned long __pte_to_rste(pte_t pte) { - pmd_t pmd; + unsigned long rste; /* - * Convert encoding pte bits pmd bits + * Convert encoding pte bits pmd / pud bits * lIR.uswrdy.p dy..R...I...wr * empty 010.000000.0 -> 00..0...1...00 * prot-none, clean, old 111.000000.1 -> 00..1...1...00 @@ -33,25 +42,40 @@ static inline pmd_t __pte_to_pmd(pte_t pte) * u unused, l large */ if (pte_present(pte)) { - pmd_val(pmd) = pte_val(pte) & PAGE_MASK; - pmd_val(pmd) |= (pte_val(pte) & _PAGE_READ) >> 4; - pmd_val(pmd) |= (pte_val(pte) & _PAGE_WRITE) >> 4; - pmd_val(pmd) |= (pte_val(pte) & _PAGE_INVALID) >> 5; - pmd_val(pmd) |= (pte_val(pte) & _PAGE_PROTECT); - pmd_val(pmd) |= (pte_val(pte) & _PAGE_DIRTY) << 10; - pmd_val(pmd) |= (pte_val(pte) & _PAGE_YOUNG) << 10; - pmd_val(pmd) |= (pte_val(pte) & _PAGE_SOFT_DIRTY) << 13; + rste = pte_val(pte) & PAGE_MASK; + rste |= move_set_bit(pte_val(pte), _PAGE_READ, + _SEGMENT_ENTRY_READ); + rste |= move_set_bit(pte_val(pte), _PAGE_WRITE, + _SEGMENT_ENTRY_WRITE); + rste |= move_set_bit(pte_val(pte), _PAGE_INVALID, + _SEGMENT_ENTRY_INVALID); + rste |= move_set_bit(pte_val(pte), _PAGE_PROTECT, + _SEGMENT_ENTRY_PROTECT); + rste |= move_set_bit(pte_val(pte), _PAGE_DIRTY, + _SEGMENT_ENTRY_DIRTY); + rste |= move_set_bit(pte_val(pte), _PAGE_YOUNG, + _SEGMENT_ENTRY_YOUNG); +#ifdef CONFIG_MEM_SOFT_DIRTY + rste |= move_set_bit(pte_val(pte), _PAGE_SOFT_DIRTY, + _SEGMENT_ENTRY_SOFT_DIRTY); +#endif } else - pmd_val(pmd) = _SEGMENT_ENTRY_INVALID; - return pmd; + rste = _SEGMENT_ENTRY_INVALID; + return rste; } -static inline pte_t __pmd_to_pte(pmd_t pmd) +static inline pte_t __rste_to_pte(unsigned long rste) { + int present; pte_t pte; + if ((rste & _REGION_ENTRY_TYPE_MASK) == _REGION_ENTRY_TYPE_R3) + present = pud_present(__pud(rste)); + else + present = pmd_present(__pmd(rste)); + /* - * Convert encoding pmd bits pte bits + * Convert encoding pmd / pud bits pte bits * dy..R...I...wr lIR.uswrdy.p * empty 00..0...1...00 -> 010.000000.0 * prot-none, clean, old 00..1...1...00 -> 111.000000.1 @@ -70,16 +94,25 @@ static inline pte_t __pmd_to_pte(pmd_t pmd) * SW-bits: p present, y young, d dirty, r read, w write, s special, * u unused, l large */ - if (pmd_present(pmd)) { - pte_val(pte) = pmd_val(pmd) & _SEGMENT_ENTRY_ORIGIN_LARGE; + if (present) { + pte_val(pte) = rste & _SEGMENT_ENTRY_ORIGIN_LARGE; pte_val(pte) |= _PAGE_LARGE | _PAGE_PRESENT; - pte_val(pte) |= (pmd_val(pmd) & _SEGMENT_ENTRY_READ) << 4; - pte_val(pte) |= (pmd_val(pmd) & _SEGMENT_ENTRY_WRITE) << 4; - pte_val(pte) |= (pmd_val(pmd) & _SEGMENT_ENTRY_INVALID) << 5; - pte_val(pte) |= (pmd_val(pmd) & _SEGMENT_ENTRY_PROTECT); - pte_val(pte) |= (pmd_val(pmd) & _SEGMENT_ENTRY_DIRTY) >> 10; - pte_val(pte) |= (pmd_val(pmd) & _SEGMENT_ENTRY_YOUNG) >> 10; - pte_val(pte) |= (pmd_val(pmd) & _SEGMENT_ENTRY_SOFT_DIRTY) >> 13; + pte_val(pte) |= move_set_bit(rste, _SEGMENT_ENTRY_READ, + _PAGE_READ); + pte_val(pte) |= move_set_bit(rste, _SEGMENT_ENTRY_WRITE, + _PAGE_WRITE); + pte_val(pte) |= move_set_bit(rste, _SEGMENT_ENTRY_INVALID, + _PAGE_INVALID); + pte_val(pte) |= move_set_bit(rste, _SEGMENT_ENTRY_PROTECT, + _PAGE_PROTECT); + pte_val(pte) |= move_set_bit(rste, _SEGMENT_ENTRY_DIRTY, + _PAGE_DIRTY); + pte_val(pte) |= move_set_bit(rste, _SEGMENT_ENTRY_YOUNG, + _PAGE_YOUNG); +#ifdef CONFIG_MEM_SOFT_DIRTY + pte_val(pte) |= move_set_bit(rste, _SEGMENT_ENTRY_SOFT_DIRTY, + _PAGE_DIRTY); +#endif } else pte_val(pte) = _PAGE_INVALID; return pte; @@ -88,27 +121,33 @@ static inline pte_t __pmd_to_pte(pmd_t pmd) void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte) { - pmd_t pmd = __pte_to_pmd(pte); - - pmd_val(pmd) |= _SEGMENT_ENTRY_LARGE; - *(pmd_t *) ptep = pmd; + unsigned long rste = __pte_to_rste(pte); + + /* Set correct table type for 2G hugepages */ + if ((pte_val(*ptep) & _REGION_ENTRY_TYPE_MASK) == _REGION_ENTRY_TYPE_R3) + rste |= _REGION_ENTRY_TYPE_R3 | _REGION3_ENTRY_LARGE; + else + rste |= _SEGMENT_ENTRY_LARGE; + pte_val(*ptep) = rste; } pte_t huge_ptep_get(pte_t *ptep) { - pmd_t pmd = *(pmd_t *) ptep; - - return __pmd_to_pte(pmd); + return __rste_to_pte(pte_val(*ptep)); } pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { + pte_t pte = huge_ptep_get(ptep); pmd_t *pmdp = (pmd_t *) ptep; - pmd_t old; + pud_t *pudp = (pud_t *) ptep; - old = pmdp_xchg_direct(mm, addr, pmdp, __pmd(_SEGMENT_ENTRY_EMPTY)); - return __pmd_to_pte(old); + if ((pte_val(*ptep) & _REGION_ENTRY_TYPE_MASK) == _REGION_ENTRY_TYPE_R3) + pudp_xchg_direct(mm, addr, pudp, __pud(_REGION3_ENTRY_EMPTY)); + else + pmdp_xchg_direct(mm, addr, pmdp, __pmd(_SEGMENT_ENTRY_EMPTY)); + return pte; } pte_t *huge_pte_alloc(struct mm_struct *mm, @@ -120,8 +159,12 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, pgdp = pgd_offset(mm, addr); pudp = pud_alloc(mm, pgdp, addr); - if (pudp) - pmdp = pmd_alloc(mm, pudp, addr); + if (pudp) { + if (sz == PUD_SIZE) + return (pte_t *) pudp; + else if (sz == PMD_SIZE) + pmdp = pmd_alloc(mm, pudp, addr); + } return (pte_t *) pmdp; } @@ -134,8 +177,11 @@ pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr) pgdp = pgd_offset(mm, addr); if (pgd_present(*pgdp)) { pudp = pud_offset(pgdp, addr); - if (pud_present(*pudp)) + if (pud_present(*pudp)) { + if (pud_large(*pudp)) + return (pte_t *) pudp; pmdp = pmd_offset(pudp, addr); + } } return (pte_t *) pmdp; } @@ -147,5 +193,34 @@ int pmd_huge(pmd_t pmd) int pud_huge(pud_t pud) { - return 0; + return pud_large(pud); +} + +struct page * +follow_huge_pud(struct mm_struct *mm, unsigned long address, + pud_t *pud, int flags) +{ + if (flags & FOLL_GET) + return NULL; + + return pud_page(*pud) + ((address & ~PUD_MASK) >> PAGE_SHIFT); +} + +static __init int setup_hugepagesz(char *opt) +{ + unsigned long size; + char *string = opt; + + size = memparse(opt, &opt); + if (MACHINE_HAS_EDAT1 && size == PMD_SIZE) { + hugetlb_add_hstate(PMD_SHIFT - PAGE_SHIFT); + } else if (MACHINE_HAS_EDAT2 && size == PUD_SIZE) { + hugetlb_add_hstate(PUD_SHIFT - PAGE_SHIFT); + } else { + pr_err("hugepagesz= specifies an unsupported page size %s\n", + string); + return 0; + } + return 1; } +__setup("hugepagesz=", setup_hugepagesz); diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c index 2489b2e917c8..f56a39bd8ba6 100644 --- a/arch/s390/mm/init.c +++ b/arch/s390/mm/init.c @@ -40,7 +40,7 @@ #include <asm/ctl_reg.h> #include <asm/sclp.h> -pgd_t swapper_pg_dir[PTRS_PER_PGD] __attribute__((__aligned__(PAGE_SIZE))); +pgd_t swapper_pg_dir[PTRS_PER_PGD] __section(.bss..swapper_pg_dir); unsigned long empty_zero_page, zero_page_mask; EXPORT_SYMBOL(empty_zero_page); @@ -111,17 +111,16 @@ void __init paging_init(void) void mark_rodata_ro(void) { - /* Text and rodata are already protected. Nothing to do here. */ - pr_info("Write protecting the kernel read-only data: %luk\n", - ((unsigned long)&_eshared - (unsigned long)&_stext) >> 10); + unsigned long size = __end_ro_after_init - __start_ro_after_init; + + set_memory_ro((unsigned long)__start_ro_after_init, size >> PAGE_SHIFT); + pr_info("Write protected read-only-after-init data: %luk\n", size >> 10); } void __init mem_init(void) { - if (MACHINE_HAS_TLB_LC) - cpumask_set_cpu(0, &init_mm.context.cpu_attach_mask); + cpumask_set_cpu(0, &init_mm.context.cpu_attach_mask); cpumask_set_cpu(0, mm_cpumask(&init_mm)); - atomic_set(&init_mm.context.attach_count, 1); set_max_mapnr(max_low_pfn); high_memory = (void *) __va(max_low_pfn * PAGE_SIZE); diff --git a/arch/s390/mm/mmap.c b/arch/s390/mm/mmap.c index 89cf09e5f168..eb9df2822da1 100644 --- a/arch/s390/mm/mmap.c +++ b/arch/s390/mm/mmap.c @@ -22,6 +22,7 @@ * Started by Ingo Molnar <mingo@elte.hu> */ +#include <linux/elf-randomize.h> #include <linux/personality.h> #include <linux/mm.h> #include <linux/mman.h> diff --git a/arch/s390/mm/page-states.c b/arch/s390/mm/page-states.c index a90d45e9dfb0..3330ea124eec 100644 --- a/arch/s390/mm/page-states.c +++ b/arch/s390/mm/page-states.c @@ -34,20 +34,25 @@ static int __init cmma(char *str) } __setup("cmma=", cmma); -void __init cmma_init(void) +static inline int cmma_test_essa(void) { register unsigned long tmp asm("0") = 0; register int rc asm("1") = -EOPNOTSUPP; - if (!cmma_flag) - return; asm volatile( " .insn rrf,0xb9ab0000,%1,%1,0,0\n" "0: la %0,0\n" "1:\n" EX_TABLE(0b,1b) : "+&d" (rc), "+&d" (tmp)); - if (rc) + return rc; +} + +void __init cmma_init(void) +{ + if (!cmma_flag) + return; + if (cmma_test_essa()) cmma_flag = 0; } diff --git a/arch/s390/mm/pageattr.c b/arch/s390/mm/pageattr.c index f2a5c29a97e9..44f150312a16 100644 --- a/arch/s390/mm/pageattr.c +++ b/arch/s390/mm/pageattr.c @@ -10,7 +10,6 @@ #include <asm/pgtable.h> #include <asm/page.h> -#if PAGE_DEFAULT_KEY static inline unsigned long sske_frame(unsigned long addr, unsigned char skey) { asm volatile(".insn rrf,0xb22b0000,%[skey],%[addr],9,0" @@ -22,6 +21,8 @@ void __storage_key_init_range(unsigned long start, unsigned long end) { unsigned long boundary, size; + if (!PAGE_DEFAULT_KEY) + return; while (start < end) { if (MACHINE_HAS_EDAT1) { /* set storage keys for a 1MB frame */ @@ -38,56 +39,256 @@ void __storage_key_init_range(unsigned long start, unsigned long end) start += PAGE_SIZE; } } -#endif -static pte_t *walk_page_table(unsigned long addr) +#ifdef CONFIG_PROC_FS +atomic_long_t direct_pages_count[PG_DIRECT_MAP_MAX]; + +void arch_report_meminfo(struct seq_file *m) { - pgd_t *pgdp; - pud_t *pudp; + seq_printf(m, "DirectMap4k: %8lu kB\n", + atomic_long_read(&direct_pages_count[PG_DIRECT_MAP_4K]) << 2); + seq_printf(m, "DirectMap1M: %8lu kB\n", + atomic_long_read(&direct_pages_count[PG_DIRECT_MAP_1M]) << 10); + seq_printf(m, "DirectMap2G: %8lu kB\n", + atomic_long_read(&direct_pages_count[PG_DIRECT_MAP_2G]) << 21); +} +#endif /* CONFIG_PROC_FS */ + +static void pgt_set(unsigned long *old, unsigned long new, unsigned long addr, + unsigned long dtt) +{ + unsigned long table, mask; + + mask = 0; + if (MACHINE_HAS_EDAT2) { + switch (dtt) { + case CRDTE_DTT_REGION3: + mask = ~(PTRS_PER_PUD * sizeof(pud_t) - 1); + break; + case CRDTE_DTT_SEGMENT: + mask = ~(PTRS_PER_PMD * sizeof(pmd_t) - 1); + break; + case CRDTE_DTT_PAGE: + mask = ~(PTRS_PER_PTE * sizeof(pte_t) - 1); + break; + } + table = (unsigned long)old & mask; + crdte(*old, new, table, dtt, addr, S390_lowcore.kernel_asce); + } else if (MACHINE_HAS_IDTE) { + cspg(old, *old, new); + } else { + csp((unsigned int *)old + 1, *old, new); + } +} + +struct cpa { + unsigned int set_ro : 1; + unsigned int clear_ro : 1; +}; + +static int walk_pte_level(pmd_t *pmdp, unsigned long addr, unsigned long end, + struct cpa cpa) +{ + pte_t *ptep, new; + + ptep = pte_offset(pmdp, addr); + do { + if (pte_none(*ptep)) + return -EINVAL; + if (cpa.set_ro) + new = pte_wrprotect(*ptep); + else if (cpa.clear_ro) + new = pte_mkwrite(pte_mkdirty(*ptep)); + pgt_set((unsigned long *)ptep, pte_val(new), addr, CRDTE_DTT_PAGE); + ptep++; + addr += PAGE_SIZE; + cond_resched(); + } while (addr < end); + return 0; +} + +static int split_pmd_page(pmd_t *pmdp, unsigned long addr) +{ + unsigned long pte_addr, prot; + pte_t *pt_dir, *ptep; + pmd_t new; + int i, ro; + + pt_dir = vmem_pte_alloc(); + if (!pt_dir) + return -ENOMEM; + pte_addr = pmd_pfn(*pmdp) << PAGE_SHIFT; + ro = !!(pmd_val(*pmdp) & _SEGMENT_ENTRY_PROTECT); + prot = pgprot_val(ro ? PAGE_KERNEL_RO : PAGE_KERNEL); + ptep = pt_dir; + for (i = 0; i < PTRS_PER_PTE; i++) { + pte_val(*ptep) = pte_addr | prot; + pte_addr += PAGE_SIZE; + ptep++; + } + pmd_val(new) = __pa(pt_dir) | _SEGMENT_ENTRY; + pgt_set((unsigned long *)pmdp, pmd_val(new), addr, CRDTE_DTT_SEGMENT); + update_page_count(PG_DIRECT_MAP_4K, PTRS_PER_PTE); + update_page_count(PG_DIRECT_MAP_1M, -1); + return 0; +} + +static void modify_pmd_page(pmd_t *pmdp, unsigned long addr, struct cpa cpa) +{ + pmd_t new; + + if (cpa.set_ro) + new = pmd_wrprotect(*pmdp); + else if (cpa.clear_ro) + new = pmd_mkwrite(pmd_mkdirty(*pmdp)); + pgt_set((unsigned long *)pmdp, pmd_val(new), addr, CRDTE_DTT_SEGMENT); +} + +static int walk_pmd_level(pud_t *pudp, unsigned long addr, unsigned long end, + struct cpa cpa) +{ + unsigned long next; pmd_t *pmdp; - pte_t *ptep; + int rc = 0; - pgdp = pgd_offset_k(addr); - if (pgd_none(*pgdp)) - return NULL; - pudp = pud_offset(pgdp, addr); - if (pud_none(*pudp) || pud_large(*pudp)) - return NULL; pmdp = pmd_offset(pudp, addr); - if (pmd_none(*pmdp) || pmd_large(*pmdp)) - return NULL; - ptep = pte_offset_kernel(pmdp, addr); - if (pte_none(*ptep)) - return NULL; - return ptep; + do { + if (pmd_none(*pmdp)) + return -EINVAL; + next = pmd_addr_end(addr, end); + if (pmd_large(*pmdp)) { + if (addr & ~PMD_MASK || addr + PMD_SIZE > next) { + rc = split_pmd_page(pmdp, addr); + if (rc) + return rc; + continue; + } + modify_pmd_page(pmdp, addr, cpa); + } else { + rc = walk_pte_level(pmdp, addr, next, cpa); + if (rc) + return rc; + } + pmdp++; + addr = next; + cond_resched(); + } while (addr < end); + return rc; } -static void change_page_attr(unsigned long addr, int numpages, - pte_t (*set) (pte_t)) +static int split_pud_page(pud_t *pudp, unsigned long addr) { - pte_t *ptep; - int i; + unsigned long pmd_addr, prot; + pmd_t *pm_dir, *pmdp; + pud_t new; + int i, ro; - for (i = 0; i < numpages; i++) { - ptep = walk_page_table(addr); - if (WARN_ON_ONCE(!ptep)) - break; - *ptep = set(*ptep); - addr += PAGE_SIZE; + pm_dir = vmem_pmd_alloc(); + if (!pm_dir) + return -ENOMEM; + pmd_addr = pud_pfn(*pudp) << PAGE_SHIFT; + ro = !!(pud_val(*pudp) & _REGION_ENTRY_PROTECT); + prot = pgprot_val(ro ? SEGMENT_KERNEL_RO : SEGMENT_KERNEL); + pmdp = pm_dir; + for (i = 0; i < PTRS_PER_PMD; i++) { + pmd_val(*pmdp) = pmd_addr | prot; + pmd_addr += PMD_SIZE; + pmdp++; } - __tlb_flush_kernel(); + pud_val(new) = __pa(pm_dir) | _REGION3_ENTRY; + pgt_set((unsigned long *)pudp, pud_val(new), addr, CRDTE_DTT_REGION3); + update_page_count(PG_DIRECT_MAP_1M, PTRS_PER_PMD); + update_page_count(PG_DIRECT_MAP_2G, -1); + return 0; +} + +static void modify_pud_page(pud_t *pudp, unsigned long addr, struct cpa cpa) +{ + pud_t new; + + if (cpa.set_ro) + new = pud_wrprotect(*pudp); + else if (cpa.clear_ro) + new = pud_mkwrite(pud_mkdirty(*pudp)); + pgt_set((unsigned long *)pudp, pud_val(new), addr, CRDTE_DTT_REGION3); +} + +static int walk_pud_level(pgd_t *pgd, unsigned long addr, unsigned long end, + struct cpa cpa) +{ + unsigned long next; + pud_t *pudp; + int rc = 0; + + pudp = pud_offset(pgd, addr); + do { + if (pud_none(*pudp)) + return -EINVAL; + next = pud_addr_end(addr, end); + if (pud_large(*pudp)) { + if (addr & ~PUD_MASK || addr + PUD_SIZE > next) { + rc = split_pud_page(pudp, addr); + if (rc) + break; + continue; + } + modify_pud_page(pudp, addr, cpa); + } else { + rc = walk_pmd_level(pudp, addr, next, cpa); + } + pudp++; + addr = next; + cond_resched(); + } while (addr < end && !rc); + return rc; +} + +static DEFINE_MUTEX(cpa_mutex); + +static int change_page_attr(unsigned long addr, unsigned long end, + struct cpa cpa) +{ + unsigned long next; + int rc = -EINVAL; + pgd_t *pgdp; + + if (addr == end) + return 0; + if (end >= MODULES_END) + return -EINVAL; + mutex_lock(&cpa_mutex); + pgdp = pgd_offset_k(addr); + do { + if (pgd_none(*pgdp)) + break; + next = pgd_addr_end(addr, end); + rc = walk_pud_level(pgdp, addr, next, cpa); + if (rc) + break; + cond_resched(); + } while (pgdp++, addr = next, addr < end && !rc); + mutex_unlock(&cpa_mutex); + return rc; } int set_memory_ro(unsigned long addr, int numpages) { - change_page_attr(addr, numpages, pte_wrprotect); - return 0; + struct cpa cpa = { + .set_ro = 1, + }; + + addr &= PAGE_MASK; + return change_page_attr(addr, addr + numpages * PAGE_SIZE, cpa); } int set_memory_rw(unsigned long addr, int numpages) { - change_page_attr(addr, numpages, pte_mkwrite); - return 0; + struct cpa cpa = { + .clear_ro = 1, + }; + + addr &= PAGE_MASK; + return change_page_attr(addr, addr + numpages * PAGE_SIZE, cpa); } /* not possible */ @@ -108,11 +309,11 @@ static void ipte_range(pte_t *pte, unsigned long address, int nr) int i; if (test_facility(13)) { - __ptep_ipte_range(address, nr - 1, pte); + __ptep_ipte_range(address, nr - 1, pte, IPTE_GLOBAL); return; } for (i = 0; i < nr; i++) { - __ptep_ipte(address, pte); + __ptep_ipte(address, pte, IPTE_GLOBAL); address += PAGE_SIZE; pte++; } @@ -138,7 +339,7 @@ void __kernel_map_pages(struct page *page, int numpages, int enable) nr = min(numpages - i, nr); if (enable) { for (j = 0; j < nr; j++) { - pte_val(*pte) = __pa(address); + pte_val(*pte) = address | pgprot_val(PAGE_KERNEL); address += PAGE_SIZE; pte++; } diff --git a/arch/s390/mm/pgalloc.c b/arch/s390/mm/pgalloc.c index e8b5962ac12a..995f78532cc2 100644 --- a/arch/s390/mm/pgalloc.c +++ b/arch/s390/mm/pgalloc.c @@ -137,6 +137,29 @@ static inline unsigned int atomic_xor_bits(atomic_t *v, unsigned int bits) return new; } +#ifdef CONFIG_PGSTE + +struct page *page_table_alloc_pgste(struct mm_struct *mm) +{ + struct page *page; + unsigned long *table; + + page = alloc_page(GFP_KERNEL|__GFP_REPEAT); + if (page) { + table = (unsigned long *) page_to_phys(page); + clear_table(table, _PAGE_INVALID, PAGE_SIZE/2); + clear_table(table + PTRS_PER_PTE, 0, PAGE_SIZE/2); + } + return page; +} + +void page_table_free_pgste(struct page *page) +{ + __free_page(page); +} + +#endif /* CONFIG_PGSTE */ + /* * page table entry allocation/free routines. */ @@ -149,7 +172,7 @@ unsigned long *page_table_alloc(struct mm_struct *mm) /* Try to get a fragment of a 4K page as a 2K page table */ if (!mm_alloc_pgste(mm)) { table = NULL; - spin_lock_bh(&mm->context.list_lock); + spin_lock_bh(&mm->context.pgtable_lock); if (!list_empty(&mm->context.pgtable_list)) { page = list_first_entry(&mm->context.pgtable_list, struct page, lru); @@ -164,12 +187,12 @@ unsigned long *page_table_alloc(struct mm_struct *mm) list_del(&page->lru); } } - spin_unlock_bh(&mm->context.list_lock); + spin_unlock_bh(&mm->context.pgtable_lock); if (table) return table; } /* Allocate a fresh page */ - page = alloc_page(GFP_KERNEL|__GFP_REPEAT); + page = alloc_page(GFP_KERNEL); if (!page) return NULL; if (!pgtable_page_ctor(page)) { @@ -187,9 +210,9 @@ unsigned long *page_table_alloc(struct mm_struct *mm) /* Return the first 2K fragment of the page */ atomic_set(&page->_mapcount, 1); clear_table(table, _PAGE_INVALID, PAGE_SIZE); - spin_lock_bh(&mm->context.list_lock); + spin_lock_bh(&mm->context.pgtable_lock); list_add(&page->lru, &mm->context.pgtable_list); - spin_unlock_bh(&mm->context.list_lock); + spin_unlock_bh(&mm->context.pgtable_lock); } return table; } @@ -203,13 +226,13 @@ void page_table_free(struct mm_struct *mm, unsigned long *table) if (!mm_alloc_pgste(mm)) { /* Free 2K page table fragment of a 4K page */ bit = (__pa(table) & ~PAGE_MASK)/(PTRS_PER_PTE*sizeof(pte_t)); - spin_lock_bh(&mm->context.list_lock); + spin_lock_bh(&mm->context.pgtable_lock); mask = atomic_xor_bits(&page->_mapcount, 1U << bit); if (mask & 3) list_add(&page->lru, &mm->context.pgtable_list); else list_del(&page->lru); - spin_unlock_bh(&mm->context.list_lock); + spin_unlock_bh(&mm->context.pgtable_lock); if (mask != 0) return; } @@ -235,13 +258,13 @@ void page_table_free_rcu(struct mmu_gather *tlb, unsigned long *table, return; } bit = (__pa(table) & ~PAGE_MASK) / (PTRS_PER_PTE*sizeof(pte_t)); - spin_lock_bh(&mm->context.list_lock); + spin_lock_bh(&mm->context.pgtable_lock); mask = atomic_xor_bits(&page->_mapcount, 0x11U << bit); if (mask & 3) list_add_tail(&page->lru, &mm->context.pgtable_list); else list_del(&page->lru); - spin_unlock_bh(&mm->context.list_lock); + spin_unlock_bh(&mm->context.pgtable_lock); table = (unsigned long *) (__pa(table) | (1U << bit)); tlb_remove_table(tlb, table); } diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c index 4324b87f9398..7a1897c51c54 100644 --- a/arch/s390/mm/pgtable.c +++ b/arch/s390/mm/pgtable.c @@ -27,40 +27,37 @@ static inline pte_t ptep_flush_direct(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { - int active, count; pte_t old; old = *ptep; if (unlikely(pte_val(old) & _PAGE_INVALID)) return old; - active = (mm == current->active_mm) ? 1 : 0; - count = atomic_add_return(0x10000, &mm->context.attach_count); - if (MACHINE_HAS_TLB_LC && (count & 0xffff) <= active && + atomic_inc(&mm->context.flush_count); + if (MACHINE_HAS_TLB_LC && cpumask_equal(mm_cpumask(mm), cpumask_of(smp_processor_id()))) - __ptep_ipte_local(addr, ptep); + __ptep_ipte(addr, ptep, IPTE_LOCAL); else - __ptep_ipte(addr, ptep); - atomic_sub(0x10000, &mm->context.attach_count); + __ptep_ipte(addr, ptep, IPTE_GLOBAL); + atomic_dec(&mm->context.flush_count); return old; } static inline pte_t ptep_flush_lazy(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { - int active, count; pte_t old; old = *ptep; if (unlikely(pte_val(old) & _PAGE_INVALID)) return old; - active = (mm == current->active_mm) ? 1 : 0; - count = atomic_add_return(0x10000, &mm->context.attach_count); - if ((count & 0xffff) <= active) { + atomic_inc(&mm->context.flush_count); + if (cpumask_equal(&mm->context.cpu_attach_mask, + cpumask_of(smp_processor_id()))) { pte_val(*ptep) |= _PAGE_INVALID; mm->context.flush_mm = 1; } else - __ptep_ipte(addr, ptep); - atomic_sub(0x10000, &mm->context.attach_count); + __ptep_ipte(addr, ptep, IPTE_GLOBAL); + atomic_dec(&mm->context.flush_count); return old; } @@ -70,7 +67,6 @@ static inline pgste_t pgste_get_lock(pte_t *ptep) #ifdef CONFIG_PGSTE unsigned long old; - preempt_disable(); asm( " lg %0,%2\n" "0: lgr %1,%0\n" @@ -93,7 +89,6 @@ static inline void pgste_set_unlock(pte_t *ptep, pgste_t pgste) : "=Q" (ptep[PTRS_PER_PTE]) : "d" (pgste_val(pgste)), "Q" (ptep[PTRS_PER_PTE]) : "cc", "memory"); - preempt_enable(); #endif } @@ -179,14 +174,17 @@ static inline pgste_t pgste_set_pte(pte_t *ptep, pgste_t pgste, pte_t entry) return pgste; } -static inline pgste_t pgste_ipte_notify(struct mm_struct *mm, - unsigned long addr, - pte_t *ptep, pgste_t pgste) +static inline pgste_t pgste_pte_notify(struct mm_struct *mm, + unsigned long addr, + pte_t *ptep, pgste_t pgste) { #ifdef CONFIG_PGSTE - if (pgste_val(pgste) & PGSTE_IN_BIT) { - pgste_val(pgste) &= ~PGSTE_IN_BIT; - ptep_notify(mm, addr, ptep); + unsigned long bits; + + bits = pgste_val(pgste) & (PGSTE_IN_BIT | PGSTE_VSIE_BIT); + if (bits) { + pgste_val(pgste) ^= bits; + ptep_notify(mm, addr, ptep, bits); } #endif return pgste; @@ -199,7 +197,7 @@ static inline pgste_t ptep_xchg_start(struct mm_struct *mm, if (mm_has_pgste(mm)) { pgste = pgste_get_lock(ptep); - pgste = pgste_ipte_notify(mm, addr, ptep, pgste); + pgste = pgste_pte_notify(mm, addr, ptep, pgste); } return pgste; } @@ -230,9 +228,11 @@ pte_t ptep_xchg_direct(struct mm_struct *mm, unsigned long addr, pgste_t pgste; pte_t old; + preempt_disable(); pgste = ptep_xchg_start(mm, addr, ptep); old = ptep_flush_direct(mm, addr, ptep); ptep_xchg_commit(mm, addr, ptep, pgste, old, new); + preempt_enable(); return old; } EXPORT_SYMBOL(ptep_xchg_direct); @@ -243,9 +243,11 @@ pte_t ptep_xchg_lazy(struct mm_struct *mm, unsigned long addr, pgste_t pgste; pte_t old; + preempt_disable(); pgste = ptep_xchg_start(mm, addr, ptep); old = ptep_flush_lazy(mm, addr, ptep); ptep_xchg_commit(mm, addr, ptep, pgste, old, new); + preempt_enable(); return old; } EXPORT_SYMBOL(ptep_xchg_lazy); @@ -256,6 +258,7 @@ pte_t ptep_modify_prot_start(struct mm_struct *mm, unsigned long addr, pgste_t pgste; pte_t old; + preempt_disable(); pgste = ptep_xchg_start(mm, addr, ptep); old = ptep_flush_lazy(mm, addr, ptep); if (mm_has_pgste(mm)) { @@ -279,13 +282,13 @@ void ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr, } else { *ptep = pte; } + preempt_enable(); } EXPORT_SYMBOL(ptep_modify_prot_commit); static inline pmd_t pmdp_flush_direct(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp) { - int active, count; pmd_t old; old = *pmdp; @@ -295,36 +298,34 @@ static inline pmd_t pmdp_flush_direct(struct mm_struct *mm, __pmdp_csp(pmdp); return old; } - active = (mm == current->active_mm) ? 1 : 0; - count = atomic_add_return(0x10000, &mm->context.attach_count); - if (MACHINE_HAS_TLB_LC && (count & 0xffff) <= active && + atomic_inc(&mm->context.flush_count); + if (MACHINE_HAS_TLB_LC && cpumask_equal(mm_cpumask(mm), cpumask_of(smp_processor_id()))) - __pmdp_idte_local(addr, pmdp); + __pmdp_idte(addr, pmdp, IDTE_LOCAL); else - __pmdp_idte(addr, pmdp); - atomic_sub(0x10000, &mm->context.attach_count); + __pmdp_idte(addr, pmdp, IDTE_GLOBAL); + atomic_dec(&mm->context.flush_count); return old; } static inline pmd_t pmdp_flush_lazy(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp) { - int active, count; pmd_t old; old = *pmdp; if (pmd_val(old) & _SEGMENT_ENTRY_INVALID) return old; - active = (mm == current->active_mm) ? 1 : 0; - count = atomic_add_return(0x10000, &mm->context.attach_count); - if ((count & 0xffff) <= active) { + atomic_inc(&mm->context.flush_count); + if (cpumask_equal(&mm->context.cpu_attach_mask, + cpumask_of(smp_processor_id()))) { pmd_val(*pmdp) |= _SEGMENT_ENTRY_INVALID; mm->context.flush_mm = 1; } else if (MACHINE_HAS_IDTE) - __pmdp_idte(addr, pmdp); + __pmdp_idte(addr, pmdp, IDTE_GLOBAL); else __pmdp_csp(pmdp); - atomic_sub(0x10000, &mm->context.attach_count); + atomic_dec(&mm->context.flush_count); return old; } @@ -333,8 +334,10 @@ pmd_t pmdp_xchg_direct(struct mm_struct *mm, unsigned long addr, { pmd_t old; + preempt_disable(); old = pmdp_flush_direct(mm, addr, pmdp); *pmdp = new; + preempt_enable(); return old; } EXPORT_SYMBOL(pmdp_xchg_direct); @@ -344,12 +347,53 @@ pmd_t pmdp_xchg_lazy(struct mm_struct *mm, unsigned long addr, { pmd_t old; + preempt_disable(); old = pmdp_flush_lazy(mm, addr, pmdp); *pmdp = new; + preempt_enable(); return old; } EXPORT_SYMBOL(pmdp_xchg_lazy); +static inline pud_t pudp_flush_direct(struct mm_struct *mm, + unsigned long addr, pud_t *pudp) +{ + pud_t old; + + old = *pudp; + if (pud_val(old) & _REGION_ENTRY_INVALID) + return old; + if (!MACHINE_HAS_IDTE) { + /* + * Invalid bit position is the same for pmd and pud, so we can + * re-use _pmd_csp() here + */ + __pmdp_csp((pmd_t *) pudp); + return old; + } + atomic_inc(&mm->context.flush_count); + if (MACHINE_HAS_TLB_LC && + cpumask_equal(mm_cpumask(mm), cpumask_of(smp_processor_id()))) + __pudp_idte(addr, pudp, IDTE_LOCAL); + else + __pudp_idte(addr, pudp, IDTE_GLOBAL); + atomic_dec(&mm->context.flush_count); + return old; +} + +pud_t pudp_xchg_direct(struct mm_struct *mm, unsigned long addr, + pud_t *pudp, pud_t new) +{ + pud_t old; + + preempt_disable(); + old = pudp_flush_direct(mm, addr, pudp); + *pudp = new; + preempt_enable(); + return old; +} +EXPORT_SYMBOL(pudp_xchg_direct); + #ifdef CONFIG_TRANSPARENT_HUGEPAGE void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp, pgtable_t pgtable) @@ -398,20 +442,108 @@ void ptep_set_pte_at(struct mm_struct *mm, unsigned long addr, pgste_t pgste; /* the mm_has_pgste() check is done in set_pte_at() */ + preempt_disable(); pgste = pgste_get_lock(ptep); pgste_val(pgste) &= ~_PGSTE_GPS_ZERO; pgste_set_key(ptep, pgste, entry, mm); pgste = pgste_set_pte(ptep, pgste, entry); pgste_set_unlock(ptep, pgste); + preempt_enable(); } void ptep_set_notify(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { pgste_t pgste; + preempt_disable(); pgste = pgste_get_lock(ptep); pgste_val(pgste) |= PGSTE_IN_BIT; pgste_set_unlock(ptep, pgste); + preempt_enable(); +} + +/** + * ptep_force_prot - change access rights of a locked pte + * @mm: pointer to the process mm_struct + * @addr: virtual address in the guest address space + * @ptep: pointer to the page table entry + * @prot: indicates guest access rights: PROT_NONE, PROT_READ or PROT_WRITE + * @bit: pgste bit to set (e.g. for notification) + * + * Returns 0 if the access rights were changed and -EAGAIN if the current + * and requested access rights are incompatible. + */ +int ptep_force_prot(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, int prot, unsigned long bit) +{ + pte_t entry; + pgste_t pgste; + int pte_i, pte_p; + + pgste = pgste_get_lock(ptep); + entry = *ptep; + /* Check pte entry after all locks have been acquired */ + pte_i = pte_val(entry) & _PAGE_INVALID; + pte_p = pte_val(entry) & _PAGE_PROTECT; + if ((pte_i && (prot != PROT_NONE)) || + (pte_p && (prot & PROT_WRITE))) { + pgste_set_unlock(ptep, pgste); + return -EAGAIN; + } + /* Change access rights and set pgste bit */ + if (prot == PROT_NONE && !pte_i) { + ptep_flush_direct(mm, addr, ptep); + pgste = pgste_update_all(entry, pgste, mm); + pte_val(entry) |= _PAGE_INVALID; + } + if (prot == PROT_READ && !pte_p) { + ptep_flush_direct(mm, addr, ptep); + pte_val(entry) &= ~_PAGE_INVALID; + pte_val(entry) |= _PAGE_PROTECT; + } + pgste_val(pgste) |= bit; + pgste = pgste_set_pte(ptep, pgste, entry); + pgste_set_unlock(ptep, pgste); + return 0; +} + +int ptep_shadow_pte(struct mm_struct *mm, unsigned long saddr, + pte_t *sptep, pte_t *tptep, pte_t pte) +{ + pgste_t spgste, tpgste; + pte_t spte, tpte; + int rc = -EAGAIN; + + if (!(pte_val(*tptep) & _PAGE_INVALID)) + return 0; /* already shadowed */ + spgste = pgste_get_lock(sptep); + spte = *sptep; + if (!(pte_val(spte) & _PAGE_INVALID) && + !((pte_val(spte) & _PAGE_PROTECT) && + !(pte_val(pte) & _PAGE_PROTECT))) { + pgste_val(spgste) |= PGSTE_VSIE_BIT; + tpgste = pgste_get_lock(tptep); + pte_val(tpte) = (pte_val(spte) & PAGE_MASK) | + (pte_val(pte) & _PAGE_PROTECT); + /* don't touch the storage key - it belongs to parent pgste */ + tpgste = pgste_set_pte(tptep, tpgste, tpte); + pgste_set_unlock(tptep, tpgste); + rc = 1; + } + pgste_set_unlock(sptep, spgste); + return rc; +} + +void ptep_unshadow_pte(struct mm_struct *mm, unsigned long saddr, pte_t *ptep) +{ + pgste_t pgste; + + pgste = pgste_get_lock(ptep); + /* notifier is called by the caller */ + ptep_flush_direct(mm, saddr, ptep); + /* don't touch the storage key - it belongs to parent pgste */ + pgste = pgste_set_pte(ptep, pgste, __pte(_PAGE_INVALID)); + pgste_set_unlock(ptep, pgste); } static void ptep_zap_swap_entry(struct mm_struct *mm, swp_entry_t entry) @@ -434,10 +566,11 @@ void ptep_zap_unused(struct mm_struct *mm, unsigned long addr, pte_t pte; /* Zap unused and logically-zero pages */ + preempt_disable(); pgste = pgste_get_lock(ptep); pgstev = pgste_val(pgste); pte = *ptep; - if (pte_swap(pte) && + if (!reset && pte_swap(pte) && ((pgstev & _PGSTE_GPS_USAGE_MASK) == _PGSTE_GPS_USAGE_UNUSED || (pgstev & _PGSTE_GPS_ZERO))) { ptep_zap_swap_entry(mm, pte_to_swp_entry(pte)); @@ -446,6 +579,7 @@ void ptep_zap_unused(struct mm_struct *mm, unsigned long addr, if (reset) pgste_val(pgste) &= ~_PGSTE_GPS_USAGE_MASK; pgste_set_unlock(ptep, pgste); + preempt_enable(); } void ptep_zap_key(struct mm_struct *mm, unsigned long addr, pte_t *ptep) @@ -454,6 +588,7 @@ void ptep_zap_key(struct mm_struct *mm, unsigned long addr, pte_t *ptep) pgste_t pgste; /* Clear storage key */ + preempt_disable(); pgste = pgste_get_lock(ptep); pgste_val(pgste) &= ~(PGSTE_ACC_BITS | PGSTE_FP_BIT | PGSTE_GR_BIT | PGSTE_GC_BIT); @@ -461,6 +596,7 @@ void ptep_zap_key(struct mm_struct *mm, unsigned long addr, pte_t *ptep) if (!(ptev & _PAGE_INVALID) && (ptev & _PAGE_WRITE)) page_set_storage_key(ptev & PAGE_MASK, PAGE_DEFAULT_KEY, 1); pgste_set_unlock(ptep, pgste); + preempt_enable(); } /* @@ -483,8 +619,8 @@ bool test_and_clear_guest_dirty(struct mm_struct *mm, unsigned long addr) pgste_val(pgste) &= ~PGSTE_UC_BIT; pte = *ptep; if (dirty && (pte_val(pte) & _PAGE_PRESENT)) { - pgste = pgste_ipte_notify(mm, addr, ptep, pgste); - __ptep_ipte(addr, ptep); + pgste = pgste_pte_notify(mm, addr, ptep, pgste); + __ptep_ipte(addr, ptep, IPTE_GLOBAL); if (MACHINE_HAS_ESOP || !(pte_val(pte) & _PAGE_WRITE)) pte_val(pte) |= _PAGE_PROTECT; else @@ -506,12 +642,9 @@ int set_guest_storage_key(struct mm_struct *mm, unsigned long addr, pgste_t old, new; pte_t *ptep; - down_read(&mm->mmap_sem); ptep = get_locked_pte(mm, addr, &ptl); - if (unlikely(!ptep)) { - up_read(&mm->mmap_sem); + if (unlikely(!ptep)) return -EFAULT; - } new = old = pgste_get_lock(ptep); pgste_val(new) &= ~(PGSTE_GR_BIT | PGSTE_GC_BIT | @@ -538,45 +671,100 @@ int set_guest_storage_key(struct mm_struct *mm, unsigned long addr, pgste_set_unlock(ptep, new); pte_unmap_unlock(ptep, ptl); - up_read(&mm->mmap_sem); return 0; } EXPORT_SYMBOL(set_guest_storage_key); -unsigned char get_guest_storage_key(struct mm_struct *mm, unsigned long addr) +/** + * Conditionally set a guest storage key (handling csske). + * oldkey will be updated when either mr or mc is set and a pointer is given. + * + * Returns 0 if a guests storage key update wasn't necessary, 1 if the guest + * storage key was updated and -EFAULT on access errors. + */ +int cond_set_guest_storage_key(struct mm_struct *mm, unsigned long addr, + unsigned char key, unsigned char *oldkey, + bool nq, bool mr, bool mc) +{ + unsigned char tmp, mask = _PAGE_ACC_BITS | _PAGE_FP_BIT; + int rc; + + /* we can drop the pgste lock between getting and setting the key */ + if (mr | mc) { + rc = get_guest_storage_key(current->mm, addr, &tmp); + if (rc) + return rc; + if (oldkey) + *oldkey = tmp; + if (!mr) + mask |= _PAGE_REFERENCED; + if (!mc) + mask |= _PAGE_CHANGED; + if (!((tmp ^ key) & mask)) + return 0; + } + rc = set_guest_storage_key(current->mm, addr, key, nq); + return rc < 0 ? rc : 1; +} +EXPORT_SYMBOL(cond_set_guest_storage_key); + +/** + * Reset a guest reference bit (rrbe), returning the reference and changed bit. + * + * Returns < 0 in case of error, otherwise the cc to be reported to the guest. + */ +int reset_guest_reference_bit(struct mm_struct *mm, unsigned long addr) { - unsigned char key; spinlock_t *ptl; - pgste_t pgste; + pgste_t old, new; pte_t *ptep; + int cc = 0; - down_read(&mm->mmap_sem); ptep = get_locked_pte(mm, addr, &ptl); - if (unlikely(!ptep)) { - up_read(&mm->mmap_sem); + if (unlikely(!ptep)) return -EFAULT; - } - pgste = pgste_get_lock(ptep); - if (pte_val(*ptep) & _PAGE_INVALID) { - key = (pgste_val(pgste) & PGSTE_ACC_BITS) >> 56; - key |= (pgste_val(pgste) & PGSTE_FP_BIT) >> 56; - key |= (pgste_val(pgste) & PGSTE_GR_BIT) >> 48; - key |= (pgste_val(pgste) & PGSTE_GC_BIT) >> 48; - } else { - key = page_get_storage_key(pte_val(*ptep) & PAGE_MASK); + new = old = pgste_get_lock(ptep); + /* Reset guest reference bit only */ + pgste_val(new) &= ~PGSTE_GR_BIT; - /* Reflect guest's logical view, not physical */ - if (pgste_val(pgste) & PGSTE_GR_BIT) - key |= _PAGE_REFERENCED; - if (pgste_val(pgste) & PGSTE_GC_BIT) - key |= _PAGE_CHANGED; + if (!(pte_val(*ptep) & _PAGE_INVALID)) { + cc = page_reset_referenced(pte_val(*ptep) & PAGE_MASK); + /* Merge real referenced bit into host-set */ + pgste_val(new) |= ((unsigned long) cc << 53) & PGSTE_HR_BIT; } + /* Reflect guest's logical view, not physical */ + cc |= (pgste_val(old) & (PGSTE_GR_BIT | PGSTE_GC_BIT)) >> 49; + /* Changing the guest storage key is considered a change of the page */ + if ((pgste_val(new) ^ pgste_val(old)) & PGSTE_GR_BIT) + pgste_val(new) |= PGSTE_UC_BIT; + + pgste_set_unlock(ptep, new); + pte_unmap_unlock(ptep, ptl); + return 0; +} +EXPORT_SYMBOL(reset_guest_reference_bit); + +int get_guest_storage_key(struct mm_struct *mm, unsigned long addr, + unsigned char *key) +{ + spinlock_t *ptl; + pgste_t pgste; + pte_t *ptep; + ptep = get_locked_pte(mm, addr, &ptl); + if (unlikely(!ptep)) + return -EFAULT; + + pgste = pgste_get_lock(ptep); + *key = (pgste_val(pgste) & (PGSTE_ACC_BITS | PGSTE_FP_BIT)) >> 56; + if (!(pte_val(*ptep) & _PAGE_INVALID)) + *key = page_get_storage_key(pte_val(*ptep) & PAGE_MASK); + /* Reflect guest's logical view, not physical */ + *key |= (pgste_val(pgste) & (PGSTE_GR_BIT | PGSTE_GC_BIT)) >> 48; pgste_set_unlock(ptep, pgste); pte_unmap_unlock(ptep, ptl); - up_read(&mm->mmap_sem); - return key; + return 0; } EXPORT_SYMBOL(get_guest_storage_key); #endif diff --git a/arch/s390/mm/vmem.c b/arch/s390/mm/vmem.c index d27fccbad7c1..1848292766ef 100644 --- a/arch/s390/mm/vmem.c +++ b/arch/s390/mm/vmem.c @@ -11,6 +11,7 @@ #include <linux/hugetlb.h> #include <linux/slab.h> #include <linux/memblock.h> +#include <asm/cacheflush.h> #include <asm/pgalloc.h> #include <asm/pgtable.h> #include <asm/setup.h> @@ -29,9 +30,11 @@ static LIST_HEAD(mem_segs); static void __ref *vmem_alloc_pages(unsigned int order) { + unsigned long size = PAGE_SIZE << order; + if (slab_is_available()) return (void *)__get_free_pages(GFP_KERNEL, order); - return alloc_bootmem_pages((1 << order) * PAGE_SIZE); + return alloc_bootmem_align(size, size); } static inline pud_t *vmem_pud_alloc(void) @@ -45,7 +48,7 @@ static inline pud_t *vmem_pud_alloc(void) return pud; } -static inline pmd_t *vmem_pmd_alloc(void) +pmd_t *vmem_pmd_alloc(void) { pmd_t *pmd = NULL; @@ -56,7 +59,7 @@ static inline pmd_t *vmem_pmd_alloc(void) return pmd; } -static pte_t __ref *vmem_pte_alloc(unsigned long address) +pte_t __ref *vmem_pte_alloc(void) { pte_t *pte; @@ -75,8 +78,9 @@ static pte_t __ref *vmem_pte_alloc(unsigned long address) /* * Add a physical memory range to the 1:1 mapping. */ -static int vmem_add_mem(unsigned long start, unsigned long size, int ro) +static int vmem_add_mem(unsigned long start, unsigned long size) { + unsigned long pages4k, pages1m, pages2g; unsigned long end = start + size; unsigned long address = start; pgd_t *pg_dir; @@ -85,6 +89,7 @@ static int vmem_add_mem(unsigned long start, unsigned long size, int ro) pte_t *pt_dir; int ret = -ENOMEM; + pages4k = pages1m = pages2g = 0; while (address < end) { pg_dir = pgd_offset_k(address); if (pgd_none(*pg_dir)) { @@ -97,10 +102,9 @@ static int vmem_add_mem(unsigned long start, unsigned long size, int ro) if (MACHINE_HAS_EDAT2 && pud_none(*pu_dir) && address && !(address & ~PUD_MASK) && (address + PUD_SIZE <= end) && !debug_pagealloc_enabled()) { - pud_val(*pu_dir) = __pa(address) | - _REGION_ENTRY_TYPE_R3 | _REGION3_ENTRY_LARGE | - (ro ? _REGION_ENTRY_PROTECT : 0); + pud_val(*pu_dir) = address | pgprot_val(REGION3_KERNEL); address += PUD_SIZE; + pages2g++; continue; } if (pud_none(*pu_dir)) { @@ -113,27 +117,28 @@ static int vmem_add_mem(unsigned long start, unsigned long size, int ro) if (MACHINE_HAS_EDAT1 && pmd_none(*pm_dir) && address && !(address & ~PMD_MASK) && (address + PMD_SIZE <= end) && !debug_pagealloc_enabled()) { - pmd_val(*pm_dir) = __pa(address) | - _SEGMENT_ENTRY | _SEGMENT_ENTRY_LARGE | - _SEGMENT_ENTRY_YOUNG | - (ro ? _SEGMENT_ENTRY_PROTECT : 0); + pmd_val(*pm_dir) = address | pgprot_val(SEGMENT_KERNEL); address += PMD_SIZE; + pages1m++; continue; } if (pmd_none(*pm_dir)) { - pt_dir = vmem_pte_alloc(address); + pt_dir = vmem_pte_alloc(); if (!pt_dir) goto out; pmd_populate(&init_mm, pm_dir, pt_dir); } pt_dir = pte_offset_kernel(pm_dir, address); - pte_val(*pt_dir) = __pa(address) | - pgprot_val(ro ? PAGE_KERNEL_RO : PAGE_KERNEL); + pte_val(*pt_dir) = address | pgprot_val(PAGE_KERNEL); address += PAGE_SIZE; + pages4k++; } ret = 0; out: + update_page_count(PG_DIRECT_MAP_4K, pages4k); + update_page_count(PG_DIRECT_MAP_1M, pages1m); + update_page_count(PG_DIRECT_MAP_2G, pages2g); return ret; } @@ -143,15 +148,15 @@ out: */ static void vmem_remove_range(unsigned long start, unsigned long size) { + unsigned long pages4k, pages1m, pages2g; unsigned long end = start + size; unsigned long address = start; pgd_t *pg_dir; pud_t *pu_dir; pmd_t *pm_dir; pte_t *pt_dir; - pte_t pte; - pte_val(pte) = _PAGE_INVALID; + pages4k = pages1m = pages2g = 0; while (address < end) { pg_dir = pgd_offset_k(address); if (pgd_none(*pg_dir)) { @@ -166,6 +171,7 @@ static void vmem_remove_range(unsigned long start, unsigned long size) if (pud_large(*pu_dir)) { pud_clear(pu_dir); address += PUD_SIZE; + pages2g++; continue; } pm_dir = pmd_offset(pu_dir, address); @@ -176,13 +182,18 @@ static void vmem_remove_range(unsigned long start, unsigned long size) if (pmd_large(*pm_dir)) { pmd_clear(pm_dir); address += PMD_SIZE; + pages1m++; continue; } pt_dir = pte_offset_kernel(pm_dir, address); - *pt_dir = pte; + pte_clear(&init_mm, address, pt_dir); address += PAGE_SIZE; + pages4k++; } flush_tlb_kernel_range(start, end); + update_page_count(PG_DIRECT_MAP_4K, -pages4k); + update_page_count(PG_DIRECT_MAP_1M, -pages1m); + update_page_count(PG_DIRECT_MAP_2G, -pages2g); } /* @@ -233,7 +244,7 @@ int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node) address = (address + PMD_SIZE) & PMD_MASK; continue; } - pt_dir = vmem_pte_alloc(address); + pt_dir = vmem_pte_alloc(); if (!pt_dir) goto out; pmd_populate(&init_mm, pm_dir, pt_dir); @@ -341,7 +352,7 @@ int vmem_add_mapping(unsigned long start, unsigned long size) if (ret) goto out_free; - ret = vmem_add_mem(start, size, 0); + ret = vmem_add_mem(start, size); if (ret) goto out_remove; goto out; @@ -362,31 +373,13 @@ out: */ void __init vmem_map_init(void) { - unsigned long ro_start, ro_end; + unsigned long size = _eshared - _stext; struct memblock_region *reg; - phys_addr_t start, end; - ro_start = PFN_ALIGN((unsigned long)&_stext); - ro_end = (unsigned long)&_eshared & PAGE_MASK; - for_each_memblock(memory, reg) { - start = reg->base; - end = reg->base + reg->size - 1; - if (start >= ro_end || end <= ro_start) - vmem_add_mem(start, end - start, 0); - else if (start >= ro_start && end <= ro_end) - vmem_add_mem(start, end - start, 1); - else if (start >= ro_start) { - vmem_add_mem(start, ro_end - start, 1); - vmem_add_mem(ro_end, end - ro_end, 0); - } else if (end < ro_end) { - vmem_add_mem(start, ro_start - start, 0); - vmem_add_mem(ro_start, end - ro_start, 1); - } else { - vmem_add_mem(start, ro_start - start, 0); - vmem_add_mem(ro_start, ro_end - ro_start, 1); - vmem_add_mem(ro_end, end - ro_end, 0); - } - } + for_each_memblock(memory, reg) + vmem_add_mem(reg->base, reg->size); + set_memory_ro((unsigned long)_stext, size >> PAGE_SHIFT); + pr_info("Write protected kernel read-only data: %luk\n", size >> 10); } /* diff --git a/arch/s390/net/bpf_jit.h b/arch/s390/net/bpf_jit.h index f010c93a88b1..fda605dbc1b4 100644 --- a/arch/s390/net/bpf_jit.h +++ b/arch/s390/net/bpf_jit.h @@ -37,7 +37,7 @@ extern u8 sk_load_word[], sk_load_half[], sk_load_byte[]; * | | | * +---------------+ | * | 8 byte skbp | | - * R15+170 -> +---------------+ | + * R15+176 -> +---------------+ | * | 8 byte hlen | | * R15+168 -> +---------------+ | * | 4 byte align | | @@ -58,7 +58,7 @@ extern u8 sk_load_word[], sk_load_half[], sk_load_byte[]; #define STK_OFF (STK_SPACE - STK_160_UNUSED) #define STK_OFF_TMP 160 /* Offset of tmp buffer on stack */ #define STK_OFF_HLEN 168 /* Offset of SKB header length on stack */ -#define STK_OFF_SKBP 170 /* Offset of SKB pointer on stack */ +#define STK_OFF_SKBP 176 /* Offset of SKB pointer on stack */ #define STK_OFF_R6 (160 - 11 * 8) /* Offset of r6 on stack */ #define STK_OFF_TCCNT (160 - 12 * 8) /* Offset of tail_call_cnt on stack */ diff --git a/arch/s390/net/bpf_jit_comp.c b/arch/s390/net/bpf_jit_comp.c index 3c0bfc1f2694..bee281f3163d 100644 --- a/arch/s390/net/bpf_jit_comp.c +++ b/arch/s390/net/bpf_jit_comp.c @@ -45,7 +45,7 @@ struct bpf_jit { int labels[1]; /* Labels for local jumps */ }; -#define BPF_SIZE_MAX 0x7ffff /* Max size for program (20 bit signed displ) */ +#define BPF_SIZE_MAX 0xffff /* Max size for program (16 bit branches) */ #define SEEN_SKB 1 /* skb access */ #define SEEN_MEM 2 /* use mem[] for temporary storage */ @@ -54,16 +54,17 @@ struct bpf_jit { #define SEEN_FUNC 16 /* calls C functions */ #define SEEN_TAIL_CALL 32 /* code uses tail calls */ #define SEEN_SKB_CHANGE 64 /* code changes skb data */ +#define SEEN_REG_AX 128 /* code uses constant blinding */ #define SEEN_STACK (SEEN_FUNC | SEEN_MEM | SEEN_SKB) /* * s390 registers */ -#define REG_W0 (__MAX_BPF_REG+0) /* Work register 1 (even) */ -#define REG_W1 (__MAX_BPF_REG+1) /* Work register 2 (odd) */ -#define REG_SKB_DATA (__MAX_BPF_REG+2) /* SKB data register */ -#define REG_L (__MAX_BPF_REG+3) /* Literal pool register */ -#define REG_15 (__MAX_BPF_REG+4) /* Register 15 */ +#define REG_W0 (MAX_BPF_JIT_REG + 0) /* Work register 1 (even) */ +#define REG_W1 (MAX_BPF_JIT_REG + 1) /* Work register 2 (odd) */ +#define REG_SKB_DATA (MAX_BPF_JIT_REG + 2) /* SKB data register */ +#define REG_L (MAX_BPF_JIT_REG + 3) /* Literal pool register */ +#define REG_15 (MAX_BPF_JIT_REG + 4) /* Register 15 */ #define REG_0 REG_W0 /* Register 0 */ #define REG_1 REG_W1 /* Register 1 */ #define REG_2 BPF_REG_1 /* Register 2 */ @@ -88,6 +89,8 @@ static const int reg2hex[] = { [BPF_REG_9] = 10, /* BPF stack pointer */ [BPF_REG_FP] = 13, + /* Register for blinding (shared with REG_SKB_DATA) */ + [BPF_REG_AX] = 12, /* SKB data pointer */ [REG_SKB_DATA] = 12, /* Work registers for s390x backend */ @@ -385,7 +388,7 @@ static void save_restore_regs(struct bpf_jit *jit, int op) /* * For SKB access %b1 contains the SKB pointer. For "bpf_jit.S" * we store the SKB header length on the stack and the SKB data - * pointer in REG_SKB_DATA. + * pointer in REG_SKB_DATA if BPF_REG_AX is not used. */ static void emit_load_skb_data_hlen(struct bpf_jit *jit) { @@ -397,9 +400,10 @@ static void emit_load_skb_data_hlen(struct bpf_jit *jit) offsetof(struct sk_buff, data_len)); /* stg %w1,ST_OFF_HLEN(%r0,%r15) */ EMIT6_DISP_LH(0xe3000000, 0x0024, REG_W1, REG_0, REG_15, STK_OFF_HLEN); - /* lg %skb_data,data_off(%b1) */ - EMIT6_DISP_LH(0xe3000000, 0x0004, REG_SKB_DATA, REG_0, - BPF_REG_1, offsetof(struct sk_buff, data)); + if (!(jit->seen & SEEN_REG_AX)) + /* lg %skb_data,data_off(%b1) */ + EMIT6_DISP_LH(0xe3000000, 0x0004, REG_SKB_DATA, REG_0, + BPF_REG_1, offsetof(struct sk_buff, data)); } /* @@ -446,7 +450,7 @@ static void bpf_jit_prologue(struct bpf_jit *jit) emit_load_skb_data_hlen(jit); if (jit->seen & SEEN_SKB_CHANGE) /* stg %b1,ST_OFF_SKBP(%r0,%r15) */ - EMIT6_DISP_LH(0xe3000000, 0x0024, REG_W1, REG_0, REG_15, + EMIT6_DISP_LH(0xe3000000, 0x0024, BPF_REG_1, REG_0, REG_15, STK_OFF_SKBP); } @@ -487,6 +491,8 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, int i s32 imm = insn->imm; s16 off = insn->off; + if (dst_reg == BPF_REG_AX || src_reg == BPF_REG_AX) + jit->seen |= SEEN_REG_AX; switch (insn->code) { /* * BPF_MOV @@ -1188,7 +1194,7 @@ call_fn: /* * Implicit input: * BPF_REG_6 (R7) : skb pointer - * REG_SKB_DATA (R12): skb data pointer + * REG_SKB_DATA (R12): skb data pointer (if no BPF_REG_AX) * * Calculated input: * BPF_REG_2 (R3) : offset of byte(s) to fetch in skb @@ -1209,6 +1215,11 @@ call_fn: /* agfr %b2,%src (%src is s32 here) */ EMIT4(0xb9180000, BPF_REG_2, src_reg); + /* Reload REG_SKB_DATA if BPF_REG_AX is used */ + if (jit->seen & SEEN_REG_AX) + /* lg %skb_data,data_off(%b6) */ + EMIT6_DISP_LH(0xe3000000, 0x0004, REG_SKB_DATA, REG_0, + BPF_REG_6, offsetof(struct sk_buff, data)); /* basr %b5,%w1 (%b5 is call saved) */ EMIT2(0x0d00, BPF_REG_5, REG_W1); @@ -1262,37 +1273,62 @@ void bpf_jit_compile(struct bpf_prog *fp) /* * Compile eBPF program "fp" */ -void bpf_int_jit_compile(struct bpf_prog *fp) +struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *fp) { + struct bpf_prog *tmp, *orig_fp = fp; struct bpf_binary_header *header; + bool tmp_blinded = false; struct bpf_jit jit; int pass; if (!bpf_jit_enable) - return; + return orig_fp; + + tmp = bpf_jit_blind_constants(fp); + /* + * If blinding was requested and we failed during blinding, + * we must fall back to the interpreter. + */ + if (IS_ERR(tmp)) + return orig_fp; + if (tmp != fp) { + tmp_blinded = true; + fp = tmp; + } + memset(&jit, 0, sizeof(jit)); jit.addrs = kcalloc(fp->len + 1, sizeof(*jit.addrs), GFP_KERNEL); - if (jit.addrs == NULL) - return; + if (jit.addrs == NULL) { + fp = orig_fp; + goto out; + } /* * Three initial passes: * - 1/2: Determine clobbered registers * - 3: Calculate program size and addrs arrray */ for (pass = 1; pass <= 3; pass++) { - if (bpf_jit_prog(&jit, fp)) + if (bpf_jit_prog(&jit, fp)) { + fp = orig_fp; goto free_addrs; + } } /* * Final pass: Allocate and generate program */ - if (jit.size >= BPF_SIZE_MAX) + if (jit.size >= BPF_SIZE_MAX) { + fp = orig_fp; goto free_addrs; + } header = bpf_jit_binary_alloc(jit.size, &jit.prg_buf, 2, jit_fill_hole); - if (!header) + if (!header) { + fp = orig_fp; goto free_addrs; - if (bpf_jit_prog(&jit, fp)) + } + if (bpf_jit_prog(&jit, fp)) { + fp = orig_fp; goto free_addrs; + } if (bpf_jit_enable > 1) { bpf_jit_dump(fp->len, jit.size, pass, jit.prg_buf); if (jit.prg_buf) @@ -1305,6 +1341,11 @@ void bpf_int_jit_compile(struct bpf_prog *fp) } free_addrs: kfree(jit.addrs); +out: + if (tmp_blinded) + bpf_jit_prog_release_other(fp, fp == orig_fp ? + tmp : orig_fp); + return fp; } /* diff --git a/arch/s390/numa/mode_emu.c b/arch/s390/numa/mode_emu.c index 828d0695d0d4..37e0bb835516 100644 --- a/arch/s390/numa/mode_emu.c +++ b/arch/s390/numa/mode_emu.c @@ -34,7 +34,8 @@ #define DIST_CORE 1 #define DIST_MC 2 #define DIST_BOOK 3 -#define DIST_MAX 4 +#define DIST_DRAWER 4 +#define DIST_MAX 5 /* Node distance reported to common code */ #define EMU_NODE_DIST 10 @@ -43,7 +44,7 @@ #define NODE_ID_FREE -1 /* Different levels of toptree */ -enum toptree_level {CORE, MC, BOOK, NODE, TOPOLOGY}; +enum toptree_level {CORE, MC, BOOK, DRAWER, NODE, TOPOLOGY}; /* The two toptree IDs */ enum {TOPTREE_ID_PHYS, TOPTREE_ID_NUMA}; @@ -114,6 +115,14 @@ static int cores_free(struct toptree *tree) */ static struct toptree *core_node(struct toptree *core) { + return core->parent->parent->parent->parent; +} + +/* + * Return drawer of core + */ +static struct toptree *core_drawer(struct toptree *core) +{ return core->parent->parent->parent; } @@ -138,6 +147,8 @@ static struct toptree *core_mc(struct toptree *core) */ static int dist_core_to_core(struct toptree *core1, struct toptree *core2) { + if (core_drawer(core1)->id != core_drawer(core2)->id) + return DIST_DRAWER; if (core_book(core1)->id != core_book(core2)->id) return DIST_BOOK; if (core_mc(core1)->id != core_mc(core2)->id) @@ -262,6 +273,8 @@ static void toptree_to_numa_first(struct toptree *numa, struct toptree *phys) struct toptree *core; /* Always try to move perfectly fitting structures first */ + move_level_to_numa(numa, phys, DRAWER, true); + move_level_to_numa(numa, phys, DRAWER, false); move_level_to_numa(numa, phys, BOOK, true); move_level_to_numa(numa, phys, BOOK, false); move_level_to_numa(numa, phys, MC, true); @@ -335,7 +348,7 @@ static struct toptree *toptree_to_numa(struct toptree *phys) */ static struct toptree *toptree_from_topology(void) { - struct toptree *phys, *node, *book, *mc, *core; + struct toptree *phys, *node, *drawer, *book, *mc, *core; struct cpu_topology_s390 *top; int cpu; @@ -344,10 +357,11 @@ static struct toptree *toptree_from_topology(void) for_each_online_cpu(cpu) { top = &per_cpu(cpu_topology, cpu); node = toptree_get_child(phys, 0); - book = toptree_get_child(node, top->book_id); + drawer = toptree_get_child(node, top->drawer_id); + book = toptree_get_child(drawer, top->book_id); mc = toptree_get_child(book, top->socket_id); core = toptree_get_child(mc, top->core_id); - if (!book || !mc || !core) + if (!drawer || !book || !mc || !core) panic("NUMA emulation could not allocate memory"); cpumask_set_cpu(cpu, &core->mask); toptree_update_mask(mc); @@ -368,6 +382,7 @@ static void topology_add_core(struct toptree *core) cpumask_copy(&top->thread_mask, &core->mask); cpumask_copy(&top->core_mask, &core_mc(core)->mask); cpumask_copy(&top->book_mask, &core_book(core)->mask); + cpumask_copy(&top->drawer_mask, &core_drawer(core)->mask); cpumask_set_cpu(cpu, &node_to_cpumask_map[core_node(core)->id]); top->node_id = core_node(core)->id; } @@ -467,8 +482,12 @@ static int emu_setup_nodes_adjust(int nodes) */ static void emu_setup(void) { + int nid; + emu_size = emu_setup_size_adjust(emu_size); emu_nodes = emu_setup_nodes_adjust(emu_nodes); + for (nid = 0; nid < emu_nodes; nid++) + node_set(nid, node_possible_map); pr_info("Creating %d nodes with memory stripe size %ld MB\n", emu_nodes, emu_size >> 20); } diff --git a/arch/s390/numa/numa.c b/arch/s390/numa/numa.c index 2794845061c6..f576f1073378 100644 --- a/arch/s390/numa/numa.c +++ b/arch/s390/numa/numa.c @@ -26,8 +26,14 @@ EXPORT_SYMBOL(node_data); cpumask_t node_to_cpumask_map[MAX_NUMNODES]; EXPORT_SYMBOL(node_to_cpumask_map); +static void plain_setup(void) +{ + node_set(0, node_possible_map); +} + const struct numa_mode numa_mode_plain = { .name = "plain", + .setup = plain_setup, }; static const struct numa_mode *mode = &numa_mode_plain; @@ -126,13 +132,13 @@ static void __init numa_setup_memory(void) void __init numa_setup(void) { pr_info("NUMA mode: %s\n", mode->name); + nodes_clear(node_possible_map); if (mode->setup) mode->setup(); numa_setup_memory(); memblock_dump_all(); } - /* * numa_init_early() - Initialization initcall * diff --git a/arch/s390/oprofile/Makefile b/arch/s390/oprofile/Makefile index 496e4a7ee00e..e9dd41b0b8d3 100644 --- a/arch/s390/oprofile/Makefile +++ b/arch/s390/oprofile/Makefile @@ -7,4 +7,3 @@ DRIVER_OBJS = $(addprefix ../../../drivers/oprofile/, \ timer_int.o ) oprofile-y := $(DRIVER_OBJS) init.o -oprofile-y += hwsampler.o diff --git a/arch/s390/oprofile/hwsampler.c b/arch/s390/oprofile/hwsampler.c deleted file mode 100644 index ff9b4eb34589..000000000000 --- a/arch/s390/oprofile/hwsampler.c +++ /dev/null @@ -1,1178 +0,0 @@ -/* - * Copyright IBM Corp. 2010 - * Author: Heinz Graalfs <graalfs@de.ibm.com> - */ - -#include <linux/kernel_stat.h> -#include <linux/kernel.h> -#include <linux/module.h> -#include <linux/smp.h> -#include <linux/errno.h> -#include <linux/workqueue.h> -#include <linux/interrupt.h> -#include <linux/notifier.h> -#include <linux/cpu.h> -#include <linux/semaphore.h> -#include <linux/oom.h> -#include <linux/oprofile.h> - -#include <asm/facility.h> -#include <asm/cpu_mf.h> -#include <asm/irq.h> - -#include "hwsampler.h" -#include "op_counter.h" - -#define MAX_NUM_SDB 511 -#define MIN_NUM_SDB 1 - -DECLARE_PER_CPU(struct hws_cpu_buffer, sampler_cpu_buffer); - -struct hws_execute_parms { - void *buffer; - signed int rc; -}; - -DEFINE_PER_CPU(struct hws_cpu_buffer, sampler_cpu_buffer); -EXPORT_PER_CPU_SYMBOL(sampler_cpu_buffer); - -static DEFINE_MUTEX(hws_sem); -static DEFINE_MUTEX(hws_sem_oom); - -static unsigned char hws_flush_all; -static unsigned int hws_oom; -static unsigned int hws_alert; -static struct workqueue_struct *hws_wq; - -static unsigned int hws_state; -enum { - HWS_INIT = 1, - HWS_DEALLOCATED, - HWS_STOPPED, - HWS_STARTED, - HWS_STOPPING }; - -/* set to 1 if called by kernel during memory allocation */ -static unsigned char oom_killer_was_active; -/* size of SDBT and SDB as of allocate API */ -static unsigned long num_sdbt = 100; -static unsigned long num_sdb = 511; -/* sampling interval (machine cycles) */ -static unsigned long interval; - -static unsigned long min_sampler_rate; -static unsigned long max_sampler_rate; - -static void execute_qsi(void *parms) -{ - struct hws_execute_parms *ep = parms; - - ep->rc = qsi(ep->buffer); -} - -static void execute_ssctl(void *parms) -{ - struct hws_execute_parms *ep = parms; - - ep->rc = lsctl(ep->buffer); -} - -static int smp_ctl_ssctl_stop(int cpu) -{ - int rc; - struct hws_execute_parms ep; - struct hws_cpu_buffer *cb; - - cb = &per_cpu(sampler_cpu_buffer, cpu); - - cb->ssctl.es = 0; - cb->ssctl.cs = 0; - - ep.buffer = &cb->ssctl; - smp_call_function_single(cpu, execute_ssctl, &ep, 1); - rc = ep.rc; - if (rc) { - printk(KERN_ERR "hwsampler: CPU %d CPUMF SSCTL failed.\n", cpu); - dump_stack(); - } - - ep.buffer = &cb->qsi; - smp_call_function_single(cpu, execute_qsi, &ep, 1); - - if (cb->qsi.es || cb->qsi.cs) { - printk(KERN_EMERG "CPUMF sampling did not stop properly.\n"); - dump_stack(); - } - - return rc; -} - -static int smp_ctl_ssctl_deactivate(int cpu) -{ - int rc; - struct hws_execute_parms ep; - struct hws_cpu_buffer *cb; - - cb = &per_cpu(sampler_cpu_buffer, cpu); - - cb->ssctl.es = 1; - cb->ssctl.cs = 0; - - ep.buffer = &cb->ssctl; - smp_call_function_single(cpu, execute_ssctl, &ep, 1); - rc = ep.rc; - if (rc) - printk(KERN_ERR "hwsampler: CPU %d CPUMF SSCTL failed.\n", cpu); - - ep.buffer = &cb->qsi; - smp_call_function_single(cpu, execute_qsi, &ep, 1); - - if (cb->qsi.cs) - printk(KERN_EMERG "CPUMF sampling was not set inactive.\n"); - - return rc; -} - -static int smp_ctl_ssctl_enable_activate(int cpu, unsigned long interval) -{ - int rc; - struct hws_execute_parms ep; - struct hws_cpu_buffer *cb; - - cb = &per_cpu(sampler_cpu_buffer, cpu); - - cb->ssctl.h = 1; - cb->ssctl.tear = cb->first_sdbt; - cb->ssctl.dear = *(unsigned long *) cb->first_sdbt; - cb->ssctl.interval = interval; - cb->ssctl.es = 1; - cb->ssctl.cs = 1; - - ep.buffer = &cb->ssctl; - smp_call_function_single(cpu, execute_ssctl, &ep, 1); - rc = ep.rc; - if (rc) - printk(KERN_ERR "hwsampler: CPU %d CPUMF SSCTL failed.\n", cpu); - - ep.buffer = &cb->qsi; - smp_call_function_single(cpu, execute_qsi, &ep, 1); - if (ep.rc) - printk(KERN_ERR "hwsampler: CPU %d CPUMF QSI failed.\n", cpu); - - return rc; -} - -static int smp_ctl_qsi(int cpu) -{ - struct hws_execute_parms ep; - struct hws_cpu_buffer *cb; - - cb = &per_cpu(sampler_cpu_buffer, cpu); - - ep.buffer = &cb->qsi; - smp_call_function_single(cpu, execute_qsi, &ep, 1); - - return ep.rc; -} - -static void hws_ext_handler(struct ext_code ext_code, - unsigned int param32, unsigned long param64) -{ - struct hws_cpu_buffer *cb = this_cpu_ptr(&sampler_cpu_buffer); - - if (!(param32 & CPU_MF_INT_SF_MASK)) - return; - - if (!hws_alert) - return; - - inc_irq_stat(IRQEXT_CMS); - atomic_xchg(&cb->ext_params, atomic_read(&cb->ext_params) | param32); - - if (hws_wq) - queue_work(hws_wq, &cb->worker); -} - -static void worker(struct work_struct *work); - -static void add_samples_to_oprofile(unsigned cpu, unsigned long *, - unsigned long *dear); - -static void init_all_cpu_buffers(void) -{ - int cpu; - struct hws_cpu_buffer *cb; - - for_each_online_cpu(cpu) { - cb = &per_cpu(sampler_cpu_buffer, cpu); - memset(cb, 0, sizeof(struct hws_cpu_buffer)); - } -} - -static void prepare_cpu_buffers(void) -{ - struct hws_cpu_buffer *cb; - int cpu; - - for_each_online_cpu(cpu) { - cb = &per_cpu(sampler_cpu_buffer, cpu); - atomic_set(&cb->ext_params, 0); - cb->worker_entry = 0; - cb->sample_overflow = 0; - cb->req_alert = 0; - cb->incorrect_sdbt_entry = 0; - cb->invalid_entry_address = 0; - cb->loss_of_sample_data = 0; - cb->sample_auth_change_alert = 0; - cb->finish = 0; - cb->oom = 0; - cb->stop_mode = 0; - } -} - -/* - * allocate_sdbt() - allocate sampler memory - * @cpu: the cpu for which sampler memory is allocated - * - * A 4K page is allocated for each requested SDBT. - * A maximum of 511 4K pages are allocated for the SDBs in each of the SDBTs. - * Set ALERT_REQ mask in each SDBs trailer. - * Returns zero if successful, <0 otherwise. - */ -static int allocate_sdbt(int cpu) -{ - int j, k, rc; - unsigned long *sdbt; - unsigned long sdb; - unsigned long *tail; - unsigned long *trailer; - struct hws_cpu_buffer *cb; - - cb = &per_cpu(sampler_cpu_buffer, cpu); - - if (cb->first_sdbt) - return -EINVAL; - - sdbt = NULL; - tail = sdbt; - - for (j = 0; j < num_sdbt; j++) { - sdbt = (unsigned long *)get_zeroed_page(GFP_KERNEL); - - mutex_lock(&hws_sem_oom); - /* OOM killer might have been activated */ - barrier(); - if (oom_killer_was_active || !sdbt) { - if (sdbt) - free_page((unsigned long)sdbt); - - goto allocate_sdbt_error; - } - if (cb->first_sdbt == 0) - cb->first_sdbt = (unsigned long)sdbt; - - /* link current page to tail of chain */ - if (tail) - *tail = (unsigned long)(void *)sdbt + 1; - - mutex_unlock(&hws_sem_oom); - - for (k = 0; k < num_sdb; k++) { - /* get and set SDB page */ - sdb = get_zeroed_page(GFP_KERNEL); - - mutex_lock(&hws_sem_oom); - /* OOM killer might have been activated */ - barrier(); - if (oom_killer_was_active || !sdb) { - if (sdb) - free_page(sdb); - - goto allocate_sdbt_error; - } - *sdbt = sdb; - trailer = trailer_entry_ptr(*sdbt); - *trailer = SDB_TE_ALERT_REQ_MASK; - sdbt++; - mutex_unlock(&hws_sem_oom); - } - tail = sdbt; - } - mutex_lock(&hws_sem_oom); - if (oom_killer_was_active) - goto allocate_sdbt_error; - - rc = 0; - if (tail) - *tail = (unsigned long) - ((void *)cb->first_sdbt) + 1; - -allocate_sdbt_exit: - mutex_unlock(&hws_sem_oom); - return rc; - -allocate_sdbt_error: - rc = -ENOMEM; - goto allocate_sdbt_exit; -} - -/* - * deallocate_sdbt() - deallocate all sampler memory - * - * For each online CPU all SDBT trees are deallocated. - * Returns the number of freed pages. - */ -static int deallocate_sdbt(void) -{ - int cpu; - int counter; - - counter = 0; - - for_each_online_cpu(cpu) { - unsigned long start; - unsigned long sdbt; - unsigned long *curr; - struct hws_cpu_buffer *cb; - - cb = &per_cpu(sampler_cpu_buffer, cpu); - - if (!cb->first_sdbt) - continue; - - sdbt = cb->first_sdbt; - curr = (unsigned long *) sdbt; - start = sdbt; - - /* we'll free the SDBT after all SDBs are processed... */ - while (1) { - if (!*curr || !sdbt) - break; - - /* watch for link entry reset if found */ - if (is_link_entry(curr)) { - curr = get_next_sdbt(curr); - if (sdbt) - free_page(sdbt); - - /* we are done if we reach the start */ - if ((unsigned long) curr == start) - break; - else - sdbt = (unsigned long) curr; - } else { - /* process SDB pointer */ - if (*curr) { - free_page(*curr); - curr++; - } - } - counter++; - } - cb->first_sdbt = 0; - } - return counter; -} - -static int start_sampling(int cpu) -{ - int rc; - struct hws_cpu_buffer *cb; - - cb = &per_cpu(sampler_cpu_buffer, cpu); - rc = smp_ctl_ssctl_enable_activate(cpu, interval); - if (rc) { - printk(KERN_INFO "hwsampler: CPU %d ssctl failed.\n", cpu); - goto start_exit; - } - - rc = -EINVAL; - if (!cb->qsi.es) { - printk(KERN_INFO "hwsampler: CPU %d ssctl not enabled.\n", cpu); - goto start_exit; - } - - if (!cb->qsi.cs) { - printk(KERN_INFO "hwsampler: CPU %d ssctl not active.\n", cpu); - goto start_exit; - } - - printk(KERN_INFO - "hwsampler: CPU %d, CPUMF Sampling started, interval %lu.\n", - cpu, interval); - - rc = 0; - -start_exit: - return rc; -} - -static int stop_sampling(int cpu) -{ - unsigned long v; - int rc; - struct hws_cpu_buffer *cb; - - rc = smp_ctl_qsi(cpu); - WARN_ON(rc); - - cb = &per_cpu(sampler_cpu_buffer, cpu); - if (!rc && !cb->qsi.es) - printk(KERN_INFO "hwsampler: CPU %d, already stopped.\n", cpu); - - rc = smp_ctl_ssctl_stop(cpu); - if (rc) { - printk(KERN_INFO "hwsampler: CPU %d, ssctl stop error %d.\n", - cpu, rc); - goto stop_exit; - } - - printk(KERN_INFO "hwsampler: CPU %d, CPUMF Sampling stopped.\n", cpu); - -stop_exit: - v = cb->req_alert; - if (v) - printk(KERN_ERR "hwsampler: CPU %d CPUMF Request alert," - " count=%lu.\n", cpu, v); - - v = cb->loss_of_sample_data; - if (v) - printk(KERN_ERR "hwsampler: CPU %d CPUMF Loss of sample data," - " count=%lu.\n", cpu, v); - - v = cb->invalid_entry_address; - if (v) - printk(KERN_ERR "hwsampler: CPU %d CPUMF Invalid entry address," - " count=%lu.\n", cpu, v); - - v = cb->incorrect_sdbt_entry; - if (v) - printk(KERN_ERR - "hwsampler: CPU %d CPUMF Incorrect SDBT address," - " count=%lu.\n", cpu, v); - - v = cb->sample_auth_change_alert; - if (v) - printk(KERN_ERR - "hwsampler: CPU %d CPUMF Sample authorization change," - " count=%lu.\n", cpu, v); - - return rc; -} - -static int check_hardware_prerequisites(void) -{ - if (!test_facility(68)) - return -EOPNOTSUPP; - return 0; -} -/* - * hws_oom_callback() - the OOM callback function - * - * In case the callback is invoked during memory allocation for the - * hw sampler, all obtained memory is deallocated and a flag is set - * so main sampler memory allocation can exit with a failure code. - * In case the callback is invoked during sampling the hw sampler - * is deactivated for all CPUs. - */ -static int hws_oom_callback(struct notifier_block *nfb, - unsigned long dummy, void *parm) -{ - unsigned long *freed; - int cpu; - struct hws_cpu_buffer *cb; - - freed = parm; - - mutex_lock(&hws_sem_oom); - - if (hws_state == HWS_DEALLOCATED) { - /* during memory allocation */ - if (oom_killer_was_active == 0) { - oom_killer_was_active = 1; - *freed += deallocate_sdbt(); - } - } else { - int i; - cpu = get_cpu(); - cb = &per_cpu(sampler_cpu_buffer, cpu); - - if (!cb->oom) { - for_each_online_cpu(i) { - smp_ctl_ssctl_deactivate(i); - cb->oom = 1; - } - cb->finish = 1; - - printk(KERN_INFO - "hwsampler: CPU %d, OOM notify during CPUMF Sampling.\n", - cpu); - } - } - - mutex_unlock(&hws_sem_oom); - - return NOTIFY_OK; -} - -static struct notifier_block hws_oom_notifier = { - .notifier_call = hws_oom_callback -}; - -static int hws_cpu_callback(struct notifier_block *nfb, - unsigned long action, void *hcpu) -{ - /* We do not have sampler space available for all possible CPUs. - All CPUs should be online when hw sampling is activated. */ - return (hws_state <= HWS_DEALLOCATED) ? NOTIFY_OK : NOTIFY_BAD; -} - -static struct notifier_block hws_cpu_notifier = { - .notifier_call = hws_cpu_callback -}; - -/** - * hwsampler_deactivate() - set hardware sampling temporarily inactive - * @cpu: specifies the CPU to be set inactive. - * - * Returns 0 on success, !0 on failure. - */ -int hwsampler_deactivate(unsigned int cpu) -{ - /* - * Deactivate hw sampling temporarily and flush the buffer - * by pushing all the pending samples to oprofile buffer. - * - * This function can be called under one of the following conditions: - * Memory unmap, task is exiting. - */ - int rc; - struct hws_cpu_buffer *cb; - - rc = 0; - mutex_lock(&hws_sem); - - cb = &per_cpu(sampler_cpu_buffer, cpu); - if (hws_state == HWS_STARTED) { - rc = smp_ctl_qsi(cpu); - WARN_ON(rc); - if (cb->qsi.cs) { - rc = smp_ctl_ssctl_deactivate(cpu); - if (rc) { - printk(KERN_INFO - "hwsampler: CPU %d, CPUMF Deactivation failed.\n", cpu); - cb->finish = 1; - hws_state = HWS_STOPPING; - } else { - hws_flush_all = 1; - /* Add work to queue to read pending samples.*/ - queue_work_on(cpu, hws_wq, &cb->worker); - } - } - } - mutex_unlock(&hws_sem); - - if (hws_wq) - flush_workqueue(hws_wq); - - return rc; -} - -/** - * hwsampler_activate() - activate/resume hardware sampling which was deactivated - * @cpu: specifies the CPU to be set active. - * - * Returns 0 on success, !0 on failure. - */ -int hwsampler_activate(unsigned int cpu) -{ - /* - * Re-activate hw sampling. This should be called in pair with - * hwsampler_deactivate(). - */ - int rc; - struct hws_cpu_buffer *cb; - - rc = 0; - mutex_lock(&hws_sem); - - cb = &per_cpu(sampler_cpu_buffer, cpu); - if (hws_state == HWS_STARTED) { - rc = smp_ctl_qsi(cpu); - WARN_ON(rc); - if (!cb->qsi.cs) { - hws_flush_all = 0; - rc = smp_ctl_ssctl_enable_activate(cpu, interval); - if (rc) { - printk(KERN_ERR - "CPU %d, CPUMF activate sampling failed.\n", - cpu); - } - } - } - - mutex_unlock(&hws_sem); - - return rc; -} - -static int check_qsi_on_setup(void) -{ - int rc; - unsigned int cpu; - struct hws_cpu_buffer *cb; - - for_each_online_cpu(cpu) { - cb = &per_cpu(sampler_cpu_buffer, cpu); - rc = smp_ctl_qsi(cpu); - WARN_ON(rc); - if (rc) - return -EOPNOTSUPP; - - if (!cb->qsi.as) { - printk(KERN_INFO "hwsampler: CPUMF sampling is not authorized.\n"); - return -EINVAL; - } - - if (cb->qsi.es) { - printk(KERN_WARNING "hwsampler: CPUMF is still enabled.\n"); - rc = smp_ctl_ssctl_stop(cpu); - if (rc) - return -EINVAL; - - printk(KERN_INFO - "CPU %d, CPUMF Sampling stopped now.\n", cpu); - } - } - return 0; -} - -static int check_qsi_on_start(void) -{ - unsigned int cpu; - int rc; - struct hws_cpu_buffer *cb; - - for_each_online_cpu(cpu) { - cb = &per_cpu(sampler_cpu_buffer, cpu); - rc = smp_ctl_qsi(cpu); - WARN_ON(rc); - - if (!cb->qsi.as) - return -EINVAL; - - if (cb->qsi.es) - return -EINVAL; - - if (cb->qsi.cs) - return -EINVAL; - } - return 0; -} - -static void worker_on_start(unsigned int cpu) -{ - struct hws_cpu_buffer *cb; - - cb = &per_cpu(sampler_cpu_buffer, cpu); - cb->worker_entry = cb->first_sdbt; -} - -static int worker_check_error(unsigned int cpu, int ext_params) -{ - int rc; - unsigned long *sdbt; - struct hws_cpu_buffer *cb; - - rc = 0; - cb = &per_cpu(sampler_cpu_buffer, cpu); - sdbt = (unsigned long *) cb->worker_entry; - - if (!sdbt || !*sdbt) - return -EINVAL; - - if (ext_params & CPU_MF_INT_SF_PRA) - cb->req_alert++; - - if (ext_params & CPU_MF_INT_SF_LSDA) - cb->loss_of_sample_data++; - - if (ext_params & CPU_MF_INT_SF_IAE) { - cb->invalid_entry_address++; - rc = -EINVAL; - } - - if (ext_params & CPU_MF_INT_SF_ISE) { - cb->incorrect_sdbt_entry++; - rc = -EINVAL; - } - - if (ext_params & CPU_MF_INT_SF_SACA) { - cb->sample_auth_change_alert++; - rc = -EINVAL; - } - - return rc; -} - -static void worker_on_finish(unsigned int cpu) -{ - int rc, i; - struct hws_cpu_buffer *cb; - - cb = &per_cpu(sampler_cpu_buffer, cpu); - - if (cb->finish) { - rc = smp_ctl_qsi(cpu); - WARN_ON(rc); - if (cb->qsi.es) { - printk(KERN_INFO - "hwsampler: CPU %d, CPUMF Stop/Deactivate sampling.\n", - cpu); - rc = smp_ctl_ssctl_stop(cpu); - if (rc) - printk(KERN_INFO - "hwsampler: CPU %d, CPUMF Deactivation failed.\n", - cpu); - - for_each_online_cpu(i) { - if (i == cpu) - continue; - if (!cb->finish) { - cb->finish = 1; - queue_work_on(i, hws_wq, - &cb->worker); - } - } - } - } -} - -static void worker_on_interrupt(unsigned int cpu) -{ - unsigned long *sdbt; - unsigned char done; - struct hws_cpu_buffer *cb; - - cb = &per_cpu(sampler_cpu_buffer, cpu); - - sdbt = (unsigned long *) cb->worker_entry; - - done = 0; - /* do not proceed if stop was entered, - * forget the buffers not yet processed */ - while (!done && !cb->stop_mode) { - unsigned long *trailer; - struct hws_trailer_entry *te; - unsigned long *dear = 0; - - trailer = trailer_entry_ptr(*sdbt); - /* leave loop if no more work to do */ - if (!(*trailer & SDB_TE_BUFFER_FULL_MASK)) { - done = 1; - if (!hws_flush_all) - continue; - } - - te = (struct hws_trailer_entry *)trailer; - cb->sample_overflow += te->overflow; - - add_samples_to_oprofile(cpu, sdbt, dear); - - /* reset trailer */ - xchg((unsigned char *) te, 0x40); - - /* advance to next sdb slot in current sdbt */ - sdbt++; - /* in case link bit is set use address w/o link bit */ - if (is_link_entry(sdbt)) - sdbt = get_next_sdbt(sdbt); - - cb->worker_entry = (unsigned long)sdbt; - } -} - -static void add_samples_to_oprofile(unsigned int cpu, unsigned long *sdbt, - unsigned long *dear) -{ - struct hws_basic_entry *sample_data_ptr; - unsigned long *trailer; - - trailer = trailer_entry_ptr(*sdbt); - if (dear) { - if (dear > trailer) - return; - trailer = dear; - } - - sample_data_ptr = (struct hws_basic_entry *)(*sdbt); - - while ((unsigned long *)sample_data_ptr < trailer) { - struct pt_regs *regs = NULL; - struct task_struct *tsk = NULL; - - /* - * Check sampling mode, 1 indicates basic (=customer) sampling - * mode. - */ - if (sample_data_ptr->def != 1) { - /* sample slot is not yet written */ - break; - } else { - /* make sure we don't use it twice, - * the next time the sampler will set it again */ - sample_data_ptr->def = 0; - } - - /* Get pt_regs. */ - if (sample_data_ptr->P == 1) { - /* userspace sample */ - unsigned int pid = sample_data_ptr->prim_asn; - if (!counter_config.user) - goto skip_sample; - rcu_read_lock(); - tsk = pid_task(find_vpid(pid), PIDTYPE_PID); - if (tsk) - regs = task_pt_regs(tsk); - rcu_read_unlock(); - } else { - /* kernelspace sample */ - if (!counter_config.kernel) - goto skip_sample; - regs = task_pt_regs(current); - } - - mutex_lock(&hws_sem); - oprofile_add_ext_hw_sample(sample_data_ptr->ia, regs, 0, - !sample_data_ptr->P, tsk); - mutex_unlock(&hws_sem); - skip_sample: - sample_data_ptr++; - } -} - -static void worker(struct work_struct *work) -{ - unsigned int cpu; - int ext_params; - struct hws_cpu_buffer *cb; - - cb = container_of(work, struct hws_cpu_buffer, worker); - cpu = smp_processor_id(); - ext_params = atomic_xchg(&cb->ext_params, 0); - - if (!cb->worker_entry) - worker_on_start(cpu); - - if (worker_check_error(cpu, ext_params)) - return; - - if (!cb->finish) - worker_on_interrupt(cpu); - - if (cb->finish) - worker_on_finish(cpu); -} - -/** - * hwsampler_allocate() - allocate memory for the hardware sampler - * @sdbt: number of SDBTs per online CPU (must be > 0) - * @sdb: number of SDBs per SDBT (minimum 1, maximum 511) - * - * Returns 0 on success, !0 on failure. - */ -int hwsampler_allocate(unsigned long sdbt, unsigned long sdb) -{ - int cpu, rc; - mutex_lock(&hws_sem); - - rc = -EINVAL; - if (hws_state != HWS_DEALLOCATED) - goto allocate_exit; - - if (sdbt < 1) - goto allocate_exit; - - if (sdb > MAX_NUM_SDB || sdb < MIN_NUM_SDB) - goto allocate_exit; - - num_sdbt = sdbt; - num_sdb = sdb; - - oom_killer_was_active = 0; - register_oom_notifier(&hws_oom_notifier); - - for_each_online_cpu(cpu) { - if (allocate_sdbt(cpu)) { - unregister_oom_notifier(&hws_oom_notifier); - goto allocate_error; - } - } - unregister_oom_notifier(&hws_oom_notifier); - if (oom_killer_was_active) - goto allocate_error; - - hws_state = HWS_STOPPED; - rc = 0; - -allocate_exit: - mutex_unlock(&hws_sem); - return rc; - -allocate_error: - rc = -ENOMEM; - printk(KERN_ERR "hwsampler: CPUMF Memory allocation failed.\n"); - goto allocate_exit; -} - -/** - * hwsampler_deallocate() - deallocate hardware sampler memory - * - * Returns 0 on success, !0 on failure. - */ -int hwsampler_deallocate(void) -{ - int rc; - - mutex_lock(&hws_sem); - - rc = -EINVAL; - if (hws_state != HWS_STOPPED) - goto deallocate_exit; - - irq_subclass_unregister(IRQ_SUBCLASS_MEASUREMENT_ALERT); - hws_alert = 0; - deallocate_sdbt(); - - hws_state = HWS_DEALLOCATED; - rc = 0; - -deallocate_exit: - mutex_unlock(&hws_sem); - - return rc; -} - -unsigned long hwsampler_query_min_interval(void) -{ - return min_sampler_rate; -} - -unsigned long hwsampler_query_max_interval(void) -{ - return max_sampler_rate; -} - -unsigned long hwsampler_get_sample_overflow_count(unsigned int cpu) -{ - struct hws_cpu_buffer *cb; - - cb = &per_cpu(sampler_cpu_buffer, cpu); - - return cb->sample_overflow; -} - -int hwsampler_setup(void) -{ - int rc; - int cpu; - struct hws_cpu_buffer *cb; - - mutex_lock(&hws_sem); - - rc = -EINVAL; - if (hws_state) - goto setup_exit; - - hws_state = HWS_INIT; - - init_all_cpu_buffers(); - - rc = check_hardware_prerequisites(); - if (rc) - goto setup_exit; - - rc = check_qsi_on_setup(); - if (rc) - goto setup_exit; - - rc = -EINVAL; - hws_wq = create_workqueue("hwsampler"); - if (!hws_wq) - goto setup_exit; - - register_cpu_notifier(&hws_cpu_notifier); - - for_each_online_cpu(cpu) { - cb = &per_cpu(sampler_cpu_buffer, cpu); - INIT_WORK(&cb->worker, worker); - rc = smp_ctl_qsi(cpu); - WARN_ON(rc); - if (min_sampler_rate != cb->qsi.min_sampl_rate) { - if (min_sampler_rate) { - printk(KERN_WARNING - "hwsampler: different min sampler rate values.\n"); - if (min_sampler_rate < cb->qsi.min_sampl_rate) - min_sampler_rate = - cb->qsi.min_sampl_rate; - } else - min_sampler_rate = cb->qsi.min_sampl_rate; - } - if (max_sampler_rate != cb->qsi.max_sampl_rate) { - if (max_sampler_rate) { - printk(KERN_WARNING - "hwsampler: different max sampler rate values.\n"); - if (max_sampler_rate > cb->qsi.max_sampl_rate) - max_sampler_rate = - cb->qsi.max_sampl_rate; - } else - max_sampler_rate = cb->qsi.max_sampl_rate; - } - } - register_external_irq(EXT_IRQ_MEASURE_ALERT, hws_ext_handler); - - hws_state = HWS_DEALLOCATED; - rc = 0; - -setup_exit: - mutex_unlock(&hws_sem); - return rc; -} - -int hwsampler_shutdown(void) -{ - int rc; - - mutex_lock(&hws_sem); - - rc = -EINVAL; - if (hws_state == HWS_DEALLOCATED || hws_state == HWS_STOPPED) { - mutex_unlock(&hws_sem); - - if (hws_wq) - flush_workqueue(hws_wq); - - mutex_lock(&hws_sem); - - if (hws_state == HWS_STOPPED) { - irq_subclass_unregister(IRQ_SUBCLASS_MEASUREMENT_ALERT); - hws_alert = 0; - deallocate_sdbt(); - } - if (hws_wq) { - destroy_workqueue(hws_wq); - hws_wq = NULL; - } - - unregister_external_irq(EXT_IRQ_MEASURE_ALERT, hws_ext_handler); - hws_state = HWS_INIT; - rc = 0; - } - mutex_unlock(&hws_sem); - - unregister_cpu_notifier(&hws_cpu_notifier); - - return rc; -} - -/** - * hwsampler_start_all() - start hardware sampling on all online CPUs - * @rate: specifies the used interval when samples are taken - * - * Returns 0 on success, !0 on failure. - */ -int hwsampler_start_all(unsigned long rate) -{ - int rc, cpu; - - mutex_lock(&hws_sem); - - hws_oom = 0; - - rc = -EINVAL; - if (hws_state != HWS_STOPPED) - goto start_all_exit; - - interval = rate; - - /* fail if rate is not valid */ - if (interval < min_sampler_rate || interval > max_sampler_rate) - goto start_all_exit; - - rc = check_qsi_on_start(); - if (rc) - goto start_all_exit; - - prepare_cpu_buffers(); - - for_each_online_cpu(cpu) { - rc = start_sampling(cpu); - if (rc) - break; - } - if (rc) { - for_each_online_cpu(cpu) { - stop_sampling(cpu); - } - goto start_all_exit; - } - hws_state = HWS_STARTED; - rc = 0; - -start_all_exit: - mutex_unlock(&hws_sem); - - if (rc) - return rc; - - register_oom_notifier(&hws_oom_notifier); - hws_oom = 1; - hws_flush_all = 0; - /* now let them in, 1407 CPUMF external interrupts */ - hws_alert = 1; - irq_subclass_register(IRQ_SUBCLASS_MEASUREMENT_ALERT); - - return 0; -} - -/** - * hwsampler_stop_all() - stop hardware sampling on all online CPUs - * - * Returns 0 on success, !0 on failure. - */ -int hwsampler_stop_all(void) -{ - int tmp_rc, rc, cpu; - struct hws_cpu_buffer *cb; - - mutex_lock(&hws_sem); - - rc = 0; - if (hws_state == HWS_INIT) { - mutex_unlock(&hws_sem); - return 0; - } - hws_state = HWS_STOPPING; - mutex_unlock(&hws_sem); - - for_each_online_cpu(cpu) { - cb = &per_cpu(sampler_cpu_buffer, cpu); - cb->stop_mode = 1; - tmp_rc = stop_sampling(cpu); - if (tmp_rc) - rc = tmp_rc; - } - - if (hws_wq) - flush_workqueue(hws_wq); - - mutex_lock(&hws_sem); - if (hws_oom) { - unregister_oom_notifier(&hws_oom_notifier); - hws_oom = 0; - } - hws_state = HWS_STOPPED; - mutex_unlock(&hws_sem); - - return rc; -} diff --git a/arch/s390/oprofile/hwsampler.h b/arch/s390/oprofile/hwsampler.h deleted file mode 100644 index a483d06f2fa7..000000000000 --- a/arch/s390/oprofile/hwsampler.h +++ /dev/null @@ -1,63 +0,0 @@ -/* - * CPUMF HW sampler functions and internal structures - * - * Copyright IBM Corp. 2010 - * Author(s): Heinz Graalfs <graalfs@de.ibm.com> - */ - -#ifndef HWSAMPLER_H_ -#define HWSAMPLER_H_ - -#include <linux/workqueue.h> -#include <asm/cpu_mf.h> - -struct hws_ssctl_request_block /* SET SAMPLING CONTROLS req block */ -{ /* bytes 0 - 7 Bit(s) */ - unsigned int s:1; /* 0: maximum buffer indicator */ - unsigned int h:1; /* 1: part. level reserved for VM use*/ - unsigned long b2_53:52; /* 2-53: zeros */ - unsigned int es:1; /* 54: sampling enable control */ - unsigned int b55_61:7; /* 55-61: - zeros */ - unsigned int cs:1; /* 62: sampling activation control */ - unsigned int b63:1; /* 63: zero */ - unsigned long interval; /* 8-15: sampling interval */ - unsigned long tear; /* 16-23: TEAR contents */ - unsigned long dear; /* 24-31: DEAR contents */ - /* 32-63: */ - unsigned long rsvrd1; /* reserved */ - unsigned long rsvrd2; /* reserved */ - unsigned long rsvrd3; /* reserved */ - unsigned long rsvrd4; /* reserved */ -}; - -struct hws_cpu_buffer { - unsigned long first_sdbt; /* @ of 1st SDB-Table for this CP*/ - unsigned long worker_entry; - unsigned long sample_overflow; /* taken from SDB ... */ - struct hws_qsi_info_block qsi; - struct hws_ssctl_request_block ssctl; - struct work_struct worker; - atomic_t ext_params; - unsigned long req_alert; - unsigned long loss_of_sample_data; - unsigned long invalid_entry_address; - unsigned long incorrect_sdbt_entry; - unsigned long sample_auth_change_alert; - unsigned int finish:1; - unsigned int oom:1; - unsigned int stop_mode:1; -}; - -int hwsampler_setup(void); -int hwsampler_shutdown(void); -int hwsampler_allocate(unsigned long sdbt, unsigned long sdb); -int hwsampler_deallocate(void); -unsigned long hwsampler_query_min_interval(void); -unsigned long hwsampler_query_max_interval(void); -int hwsampler_start_all(unsigned long interval); -int hwsampler_stop_all(void); -int hwsampler_deactivate(unsigned int cpu); -int hwsampler_activate(unsigned int cpu); -unsigned long hwsampler_get_sample_overflow_count(unsigned int cpu); - -#endif /*HWSAMPLER_H_*/ diff --git a/arch/s390/oprofile/init.c b/arch/s390/oprofile/init.c index 791935a65800..16f4c3960b87 100644 --- a/arch/s390/oprofile/init.c +++ b/arch/s390/oprofile/init.c @@ -10,488 +10,8 @@ */ #include <linux/oprofile.h> -#include <linux/perf_event.h> #include <linux/init.h> -#include <linux/errno.h> -#include <linux/fs.h> -#include <linux/module.h> #include <asm/processor.h> -#include <asm/perf_event.h> - -#include "../../../drivers/oprofile/oprof.h" - -#include "hwsampler.h" -#include "op_counter.h" - -#define DEFAULT_INTERVAL 4127518 - -#define DEFAULT_SDBT_BLOCKS 1 -#define DEFAULT_SDB_BLOCKS 511 - -static unsigned long oprofile_hw_interval = DEFAULT_INTERVAL; -static unsigned long oprofile_min_interval; -static unsigned long oprofile_max_interval; - -static unsigned long oprofile_sdbt_blocks = DEFAULT_SDBT_BLOCKS; -static unsigned long oprofile_sdb_blocks = DEFAULT_SDB_BLOCKS; - -static int hwsampler_enabled; -static int hwsampler_running; /* start_mutex must be held to change */ -static int hwsampler_available; - -static struct oprofile_operations timer_ops; - -struct op_counter_config counter_config; - -enum __force_cpu_type { - reserved = 0, /* do not force */ - timer, -}; -static int force_cpu_type; - -static int set_cpu_type(const char *str, struct kernel_param *kp) -{ - if (!strcmp(str, "timer")) { - force_cpu_type = timer; - printk(KERN_INFO "oprofile: forcing timer to be returned " - "as cpu type\n"); - } else { - force_cpu_type = 0; - } - - return 0; -} -module_param_call(cpu_type, set_cpu_type, NULL, NULL, 0); -MODULE_PARM_DESC(cpu_type, "Force legacy basic mode sampling" - "(report cpu_type \"timer\""); - -static int __oprofile_hwsampler_start(void) -{ - int retval; - - retval = hwsampler_allocate(oprofile_sdbt_blocks, oprofile_sdb_blocks); - if (retval) - return retval; - - retval = hwsampler_start_all(oprofile_hw_interval); - if (retval) - hwsampler_deallocate(); - - return retval; -} - -static int oprofile_hwsampler_start(void) -{ - int retval; - - hwsampler_running = hwsampler_enabled; - - if (!hwsampler_running) - return timer_ops.start(); - - retval = perf_reserve_sampling(); - if (retval) - return retval; - - retval = __oprofile_hwsampler_start(); - if (retval) - perf_release_sampling(); - - return retval; -} - -static void oprofile_hwsampler_stop(void) -{ - if (!hwsampler_running) { - timer_ops.stop(); - return; - } - - hwsampler_stop_all(); - hwsampler_deallocate(); - perf_release_sampling(); - return; -} - -/* - * File ops used for: - * /dev/oprofile/0/enabled - * /dev/oprofile/hwsampling/hwsampler (cpu_type = timer) - */ - -static ssize_t hwsampler_read(struct file *file, char __user *buf, - size_t count, loff_t *offset) -{ - return oprofilefs_ulong_to_user(hwsampler_enabled, buf, count, offset); -} - -static ssize_t hwsampler_write(struct file *file, char const __user *buf, - size_t count, loff_t *offset) -{ - unsigned long val; - int retval; - - if (*offset) - return -EINVAL; - - retval = oprofilefs_ulong_from_user(&val, buf, count); - if (retval <= 0) - return retval; - - if (val != 0 && val != 1) - return -EINVAL; - - if (oprofile_started) - /* - * save to do without locking as we set - * hwsampler_running in start() when start_mutex is - * held - */ - return -EBUSY; - - hwsampler_enabled = val; - - return count; -} - -static const struct file_operations hwsampler_fops = { - .read = hwsampler_read, - .write = hwsampler_write, -}; - -/* - * File ops used for: - * /dev/oprofile/0/count - * /dev/oprofile/hwsampling/hw_interval (cpu_type = timer) - * - * Make sure that the value is within the hardware range. - */ - -static ssize_t hw_interval_read(struct file *file, char __user *buf, - size_t count, loff_t *offset) -{ - return oprofilefs_ulong_to_user(oprofile_hw_interval, buf, - count, offset); -} - -static ssize_t hw_interval_write(struct file *file, char const __user *buf, - size_t count, loff_t *offset) -{ - unsigned long val; - int retval; - - if (*offset) - return -EINVAL; - retval = oprofilefs_ulong_from_user(&val, buf, count); - if (retval <= 0) - return retval; - if (val < oprofile_min_interval) - oprofile_hw_interval = oprofile_min_interval; - else if (val > oprofile_max_interval) - oprofile_hw_interval = oprofile_max_interval; - else - oprofile_hw_interval = val; - - return count; -} - -static const struct file_operations hw_interval_fops = { - .read = hw_interval_read, - .write = hw_interval_write, -}; - -/* - * File ops used for: - * /dev/oprofile/0/event - * Only a single event with number 0 is supported with this counter. - * - * /dev/oprofile/0/unit_mask - * This is a dummy file needed by the user space tools. - * No value other than 0 is accepted or returned. - */ - -static ssize_t hwsampler_zero_read(struct file *file, char __user *buf, - size_t count, loff_t *offset) -{ - return oprofilefs_ulong_to_user(0, buf, count, offset); -} - -static ssize_t hwsampler_zero_write(struct file *file, char const __user *buf, - size_t count, loff_t *offset) -{ - unsigned long val; - int retval; - - if (*offset) - return -EINVAL; - - retval = oprofilefs_ulong_from_user(&val, buf, count); - if (retval <= 0) - return retval; - if (val != 0) - return -EINVAL; - return count; -} - -static const struct file_operations zero_fops = { - .read = hwsampler_zero_read, - .write = hwsampler_zero_write, -}; - -/* /dev/oprofile/0/kernel file ops. */ - -static ssize_t hwsampler_kernel_read(struct file *file, char __user *buf, - size_t count, loff_t *offset) -{ - return oprofilefs_ulong_to_user(counter_config.kernel, - buf, count, offset); -} - -static ssize_t hwsampler_kernel_write(struct file *file, char const __user *buf, - size_t count, loff_t *offset) -{ - unsigned long val; - int retval; - - if (*offset) - return -EINVAL; - - retval = oprofilefs_ulong_from_user(&val, buf, count); - if (retval <= 0) - return retval; - - if (val != 0 && val != 1) - return -EINVAL; - - counter_config.kernel = val; - - return count; -} - -static const struct file_operations kernel_fops = { - .read = hwsampler_kernel_read, - .write = hwsampler_kernel_write, -}; - -/* /dev/oprofile/0/user file ops. */ - -static ssize_t hwsampler_user_read(struct file *file, char __user *buf, - size_t count, loff_t *offset) -{ - return oprofilefs_ulong_to_user(counter_config.user, - buf, count, offset); -} - -static ssize_t hwsampler_user_write(struct file *file, char const __user *buf, - size_t count, loff_t *offset) -{ - unsigned long val; - int retval; - - if (*offset) - return -EINVAL; - - retval = oprofilefs_ulong_from_user(&val, buf, count); - if (retval <= 0) - return retval; - - if (val != 0 && val != 1) - return -EINVAL; - - counter_config.user = val; - - return count; -} - -static const struct file_operations user_fops = { - .read = hwsampler_user_read, - .write = hwsampler_user_write, -}; - - -/* - * File ops used for: /dev/oprofile/timer/enabled - * The value always has to be the inverted value of hwsampler_enabled. So - * no separate variable is created. That way we do not need locking. - */ - -static ssize_t timer_enabled_read(struct file *file, char __user *buf, - size_t count, loff_t *offset) -{ - return oprofilefs_ulong_to_user(!hwsampler_enabled, buf, count, offset); -} - -static ssize_t timer_enabled_write(struct file *file, char const __user *buf, - size_t count, loff_t *offset) -{ - unsigned long val; - int retval; - - if (*offset) - return -EINVAL; - - retval = oprofilefs_ulong_from_user(&val, buf, count); - if (retval <= 0) - return retval; - - if (val != 0 && val != 1) - return -EINVAL; - - /* Timer cannot be disabled without having hardware sampling. */ - if (val == 0 && !hwsampler_available) - return -EINVAL; - - if (oprofile_started) - /* - * save to do without locking as we set - * hwsampler_running in start() when start_mutex is - * held - */ - return -EBUSY; - - hwsampler_enabled = !val; - - return count; -} - -static const struct file_operations timer_enabled_fops = { - .read = timer_enabled_read, - .write = timer_enabled_write, -}; - - -static int oprofile_create_hwsampling_files(struct dentry *root) -{ - struct dentry *dir; - - dir = oprofilefs_mkdir(root, "timer"); - if (!dir) - return -EINVAL; - - oprofilefs_create_file(dir, "enabled", &timer_enabled_fops); - - if (!hwsampler_available) - return 0; - - /* reinitialize default values */ - hwsampler_enabled = 1; - counter_config.kernel = 1; - counter_config.user = 1; - - if (!force_cpu_type) { - /* - * Create the counter file system. A single virtual - * counter is created which can be used to - * enable/disable hardware sampling dynamically from - * user space. The user space will configure a single - * counter with a single event. The value of 'event' - * and 'unit_mask' are not evaluated by the kernel code - * and can only be set to 0. - */ - - dir = oprofilefs_mkdir(root, "0"); - if (!dir) - return -EINVAL; - - oprofilefs_create_file(dir, "enabled", &hwsampler_fops); - oprofilefs_create_file(dir, "event", &zero_fops); - oprofilefs_create_file(dir, "count", &hw_interval_fops); - oprofilefs_create_file(dir, "unit_mask", &zero_fops); - oprofilefs_create_file(dir, "kernel", &kernel_fops); - oprofilefs_create_file(dir, "user", &user_fops); - oprofilefs_create_ulong(dir, "hw_sdbt_blocks", - &oprofile_sdbt_blocks); - - } else { - /* - * Hardware sampling can be used but the cpu_type is - * forced to timer in order to deal with legacy user - * space tools. The /dev/oprofile/hwsampling fs is - * provided in that case. - */ - dir = oprofilefs_mkdir(root, "hwsampling"); - if (!dir) - return -EINVAL; - - oprofilefs_create_file(dir, "hwsampler", - &hwsampler_fops); - oprofilefs_create_file(dir, "hw_interval", - &hw_interval_fops); - oprofilefs_create_ro_ulong(dir, "hw_min_interval", - &oprofile_min_interval); - oprofilefs_create_ro_ulong(dir, "hw_max_interval", - &oprofile_max_interval); - oprofilefs_create_ulong(dir, "hw_sdbt_blocks", - &oprofile_sdbt_blocks); - } - return 0; -} - -static int oprofile_hwsampler_init(struct oprofile_operations *ops) -{ - /* - * Initialize the timer mode infrastructure as well in order - * to be able to switch back dynamically. oprofile_timer_init - * is not supposed to fail. - */ - if (oprofile_timer_init(ops)) - BUG(); - - memcpy(&timer_ops, ops, sizeof(timer_ops)); - ops->create_files = oprofile_create_hwsampling_files; - - /* - * If the user space tools do not support newer cpu types, - * the force_cpu_type module parameter - * can be used to always return \"timer\" as cpu type. - */ - if (force_cpu_type != timer) { - struct cpuid id; - - get_cpu_id (&id); - - switch (id.machine) { - case 0x2097: case 0x2098: ops->cpu_type = "s390/z10"; break; - case 0x2817: case 0x2818: ops->cpu_type = "s390/z196"; break; - case 0x2827: case 0x2828: ops->cpu_type = "s390/zEC12"; break; - case 0x2964: case 0x2965: ops->cpu_type = "s390/z13"; break; - default: return -ENODEV; - } - } - - if (hwsampler_setup()) - return -ENODEV; - - /* - * Query the range for the sampling interval from the - * hardware. - */ - oprofile_min_interval = hwsampler_query_min_interval(); - if (oprofile_min_interval == 0) - return -ENODEV; - oprofile_max_interval = hwsampler_query_max_interval(); - if (oprofile_max_interval == 0) - return -ENODEV; - - /* The initial value should be sane */ - if (oprofile_hw_interval < oprofile_min_interval) - oprofile_hw_interval = oprofile_min_interval; - if (oprofile_hw_interval > oprofile_max_interval) - oprofile_hw_interval = oprofile_max_interval; - - printk(KERN_INFO "oprofile: System z hardware sampling " - "facility found.\n"); - - ops->start = oprofile_hwsampler_start; - ops->stop = oprofile_hwsampler_stop; - - return 0; -} - -static void oprofile_hwsampler_exit(void) -{ - hwsampler_shutdown(); -} static int __s390_backtrace(void *data, unsigned long address) { @@ -514,18 +34,9 @@ static void s390_backtrace(struct pt_regs *regs, unsigned int depth) int __init oprofile_arch_init(struct oprofile_operations *ops) { ops->backtrace = s390_backtrace; - - /* - * -ENODEV is not reported to the caller. The module itself - * will use the timer mode sampling as fallback and this is - * always available. - */ - hwsampler_available = oprofile_hwsampler_init(ops) == 0; - return 0; } void oprofile_arch_exit(void) { - oprofile_hwsampler_exit(); } diff --git a/arch/s390/oprofile/op_counter.h b/arch/s390/oprofile/op_counter.h deleted file mode 100644 index 61b2531eef17..000000000000 --- a/arch/s390/oprofile/op_counter.h +++ /dev/null @@ -1,21 +0,0 @@ -/* - * Copyright IBM Corp. 2011 - * Author(s): Andreas Krebbel (krebbel@linux.vnet.ibm.com) - * - * @remark Copyright 2011 OProfile authors - */ - -#ifndef OP_COUNTER_H -#define OP_COUNTER_H - -struct op_counter_config { - /* `enabled' maps to the hwsampler_file variable. */ - /* `count' maps to the oprofile_hw_interval variable. */ - /* `event' and `unit_mask' are unused. */ - unsigned long kernel; - unsigned long user; -}; - -extern struct op_counter_config counter_config; - -#endif /* OP_COUNTER_H */ diff --git a/arch/s390/pci/pci.c b/arch/s390/pci/pci.c index 871af75c69c2..15ffc19c8c0c 100644 --- a/arch/s390/pci/pci.c +++ b/arch/s390/pci/pci.c @@ -854,6 +854,15 @@ void zpci_stop_device(struct zpci_dev *zdev) } EXPORT_SYMBOL_GPL(zpci_stop_device); +int zpci_report_error(struct pci_dev *pdev, + struct zpci_report_error_header *report) +{ + struct zpci_dev *zdev = to_zpci(pdev); + + return sclp_pci_report(report, zdev->fh, zdev->fid); +} +EXPORT_SYMBOL(zpci_report_error); + static inline int barsize(u8 size) { return (size) ? (1 << size) >> 10 : 0; diff --git a/arch/s390/pci/pci_debug.c b/arch/s390/pci/pci_debug.c index c555de3d12d6..38993b156924 100644 --- a/arch/s390/pci/pci_debug.c +++ b/arch/s390/pci/pci_debug.c @@ -1,5 +1,5 @@ /* - * Copyright IBM Corp. 2012 + * Copyright IBM Corp. 2012,2015 * * Author(s): * Jan Glauber <jang@linux.vnet.ibm.com> @@ -23,22 +23,45 @@ EXPORT_SYMBOL_GPL(pci_debug_msg_id); debug_info_t *pci_debug_err_id; EXPORT_SYMBOL_GPL(pci_debug_err_id); -static char *pci_perf_names[] = { - /* hardware counters */ +static char *pci_common_names[] = { "Load operations", "Store operations", "Store block operations", "Refresh operations", +}; + +static char *pci_fmt0_names[] = { "DMA read bytes", "DMA write bytes", }; +static char *pci_fmt1_names[] = { + "Received bytes", + "Received packets", + "Transmitted bytes", + "Transmitted packets", +}; + +static char *pci_fmt2_names[] = { + "Consumed work units", + "Maximum work units", +}; + static char *pci_sw_names[] = { "Allocated pages", "Mapped pages", "Unmapped pages", }; +static void pci_fmb_show(struct seq_file *m, char *name[], int length, + u64 *data) +{ + int i; + + for (i = 0; i < length; i++, data++) + seq_printf(m, "%26s:\t%llu\n", name[i], *data); +} + static void pci_sw_counter_show(struct seq_file *m) { struct zpci_dev *zdev = m->private; @@ -53,8 +76,6 @@ static void pci_sw_counter_show(struct seq_file *m) static int pci_perf_show(struct seq_file *m, void *v) { struct zpci_dev *zdev = m->private; - u64 *stat; - int i; if (!zdev) return 0; @@ -72,15 +93,27 @@ static int pci_perf_show(struct seq_file *m, void *v) seq_printf(m, "Samples: %u\n", zdev->fmb->samples); seq_printf(m, "Last update TOD: %Lx\n", zdev->fmb->last_update); - /* hardware counters */ - stat = (u64 *) &zdev->fmb->ld_ops; - for (i = 0; i < 4; i++) - seq_printf(m, "%26s:\t%llu\n", - pci_perf_names[i], *(stat + i)); - if (zdev->fmb->dma_valid) - for (i = 4; i < 6; i++) - seq_printf(m, "%26s:\t%llu\n", - pci_perf_names[i], *(stat + i)); + pci_fmb_show(m, pci_common_names, ARRAY_SIZE(pci_common_names), + &zdev->fmb->ld_ops); + + switch (zdev->fmb->format) { + case 0: + if (!(zdev->fmb->fmt_ind & ZPCI_FMB_DMA_COUNTER_VALID)) + break; + pci_fmb_show(m, pci_fmt0_names, ARRAY_SIZE(pci_fmt0_names), + &zdev->fmb->fmt0.dma_rbytes); + break; + case 1: + pci_fmb_show(m, pci_fmt1_names, ARRAY_SIZE(pci_fmt1_names), + &zdev->fmb->fmt1.rx_bytes); + break; + case 2: + pci_fmb_show(m, pci_fmt2_names, ARRAY_SIZE(pci_fmt2_names), + &zdev->fmb->fmt2.consumed_work_units); + break; + default: + seq_puts(m, "Unknown format\n"); + } pci_sw_counter_show(m); mutex_unlock(&zdev->lock); diff --git a/arch/s390/pci/pci_dma.c b/arch/s390/pci/pci_dma.c index 1ea8c07eab84..7350c8bc13a2 100644 --- a/arch/s390/pci/pci_dma.c +++ b/arch/s390/pci/pci_dma.c @@ -129,12 +129,11 @@ void dma_update_cpu_trans(unsigned long *entry, void *page_addr, int flags) entry_clr_protected(entry); } -static int dma_update_trans(struct zpci_dev *zdev, unsigned long pa, - dma_addr_t dma_addr, size_t size, int flags) +static int __dma_update_trans(struct zpci_dev *zdev, unsigned long pa, + dma_addr_t dma_addr, size_t size, int flags) { unsigned int nr_pages = PAGE_ALIGN(size) >> PAGE_SHIFT; u8 *page_addr = (u8 *) (pa & PAGE_MASK); - dma_addr_t start_dma_addr = dma_addr; unsigned long irq_flags; unsigned long *entry; int i, rc = 0; @@ -145,7 +144,7 @@ static int dma_update_trans(struct zpci_dev *zdev, unsigned long pa, spin_lock_irqsave(&zdev->dma_table_lock, irq_flags); if (!zdev->dma_table) { rc = -EINVAL; - goto no_refresh; + goto out_unlock; } for (i = 0; i < nr_pages; i++) { @@ -159,20 +158,6 @@ static int dma_update_trans(struct zpci_dev *zdev, unsigned long pa, dma_addr += PAGE_SIZE; } - /* - * With zdev->tlb_refresh == 0, rpcit is not required to establish new - * translations when previously invalid translation-table entries are - * validated. With lazy unmap, it also is skipped for previously valid - * entries, but a global rpcit is then required before any address can - * be re-used, i.e. after each iommu bitmap wrap-around. - */ - if (!zdev->tlb_refresh && - (!s390_iommu_strict || - ((flags & ZPCI_PTE_VALID_MASK) == ZPCI_PTE_VALID))) - goto no_refresh; - - rc = zpci_refresh_trans((u64) zdev->fh << 32, start_dma_addr, - nr_pages * PAGE_SIZE); undo_cpu_trans: if (rc && ((flags & ZPCI_PTE_VALID_MASK) == ZPCI_PTE_VALID)) { flags = ZPCI_PTE_INVALID; @@ -185,12 +170,46 @@ undo_cpu_trans: dma_update_cpu_trans(entry, page_addr, flags); } } - -no_refresh: +out_unlock: spin_unlock_irqrestore(&zdev->dma_table_lock, irq_flags); return rc; } +static int __dma_purge_tlb(struct zpci_dev *zdev, dma_addr_t dma_addr, + size_t size, int flags) +{ + /* + * With zdev->tlb_refresh == 0, rpcit is not required to establish new + * translations when previously invalid translation-table entries are + * validated. With lazy unmap, it also is skipped for previously valid + * entries, but a global rpcit is then required before any address can + * be re-used, i.e. after each iommu bitmap wrap-around. + */ + if (!zdev->tlb_refresh && + (!s390_iommu_strict || + ((flags & ZPCI_PTE_VALID_MASK) == ZPCI_PTE_VALID))) + return 0; + + return zpci_refresh_trans((u64) zdev->fh << 32, dma_addr, + PAGE_ALIGN(size)); +} + +static int dma_update_trans(struct zpci_dev *zdev, unsigned long pa, + dma_addr_t dma_addr, size_t size, int flags) +{ + int rc; + + rc = __dma_update_trans(zdev, pa, dma_addr, size, flags); + if (rc) + return rc; + + rc = __dma_purge_tlb(zdev, dma_addr, size, flags); + if (rc && ((flags & ZPCI_PTE_VALID_MASK) == ZPCI_PTE_VALID)) + __dma_update_trans(zdev, pa, dma_addr, size, ZPCI_PTE_INVALID); + + return rc; +} + void dma_free_seg_table(unsigned long entry) { unsigned long *sto = get_rt_sto(entry); @@ -226,48 +245,58 @@ static unsigned long __dma_alloc_iommu(struct device *dev, boundary_size = ALIGN(dma_get_seg_boundary(dev) + 1, PAGE_SIZE) >> PAGE_SHIFT; return iommu_area_alloc(zdev->iommu_bitmap, zdev->iommu_pages, - start, size, 0, boundary_size, 0); + start, size, zdev->start_dma >> PAGE_SHIFT, + boundary_size, 0); } -static unsigned long dma_alloc_iommu(struct device *dev, int size) +static dma_addr_t dma_alloc_address(struct device *dev, int size) { struct zpci_dev *zdev = to_zpci(to_pci_dev(dev)); unsigned long offset, flags; - int wrap = 0; spin_lock_irqsave(&zdev->iommu_bitmap_lock, flags); offset = __dma_alloc_iommu(dev, zdev->next_bit, size); if (offset == -1) { + if (!zdev->tlb_refresh && !s390_iommu_strict) { + /* global flush before DMA addresses are reused */ + if (zpci_refresh_global(zdev)) + goto out_error; + + bitmap_andnot(zdev->iommu_bitmap, zdev->iommu_bitmap, + zdev->lazy_bitmap, zdev->iommu_pages); + bitmap_zero(zdev->lazy_bitmap, zdev->iommu_pages); + } /* wrap-around */ offset = __dma_alloc_iommu(dev, 0, size); - wrap = 1; + if (offset == -1) + goto out_error; } + zdev->next_bit = offset + size; + spin_unlock_irqrestore(&zdev->iommu_bitmap_lock, flags); - if (offset != -1) { - zdev->next_bit = offset + size; - if (!zdev->tlb_refresh && !s390_iommu_strict && wrap) - /* global flush after wrap-around with lazy unmap */ - zpci_refresh_global(zdev); - } + return zdev->start_dma + offset * PAGE_SIZE; + +out_error: spin_unlock_irqrestore(&zdev->iommu_bitmap_lock, flags); - return offset; + return DMA_ERROR_CODE; } -static void dma_free_iommu(struct device *dev, unsigned long offset, int size) +static void dma_free_address(struct device *dev, dma_addr_t dma_addr, int size) { struct zpci_dev *zdev = to_zpci(to_pci_dev(dev)); - unsigned long flags; + unsigned long flags, offset; + + offset = (dma_addr - zdev->start_dma) >> PAGE_SHIFT; spin_lock_irqsave(&zdev->iommu_bitmap_lock, flags); if (!zdev->iommu_bitmap) goto out; - bitmap_clear(zdev->iommu_bitmap, offset, size); - /* - * Lazy flush for unmap: need to move next_bit to avoid address re-use - * until wrap-around. - */ - if (!s390_iommu_strict && offset >= zdev->next_bit) - zdev->next_bit = offset + size; + + if (zdev->tlb_refresh || s390_iommu_strict) + bitmap_clear(zdev->iommu_bitmap, offset, size); + else + bitmap_set(zdev->lazy_bitmap, offset, size); + out: spin_unlock_irqrestore(&zdev->iommu_bitmap_lock, flags); } @@ -285,19 +314,19 @@ static inline void zpci_err_dma(unsigned long rc, unsigned long addr) static dma_addr_t s390_dma_map_pages(struct device *dev, struct page *page, unsigned long offset, size_t size, enum dma_data_direction direction, - struct dma_attrs *attrs) + unsigned long attrs) { struct zpci_dev *zdev = to_zpci(to_pci_dev(dev)); - unsigned long nr_pages, iommu_page_index; unsigned long pa = page_to_phys(page) + offset; int flags = ZPCI_PTE_VALID; + unsigned long nr_pages; dma_addr_t dma_addr; int ret; /* This rounds up number of pages based on size and offset */ nr_pages = iommu_num_pages(pa, size, PAGE_SIZE); - iommu_page_index = dma_alloc_iommu(dev, nr_pages); - if (iommu_page_index == -1) { + dma_addr = dma_alloc_address(dev, nr_pages); + if (dma_addr == DMA_ERROR_CODE) { ret = -ENOSPC; goto out_err; } @@ -305,12 +334,6 @@ static dma_addr_t s390_dma_map_pages(struct device *dev, struct page *page, /* Use rounded up size */ size = nr_pages * PAGE_SIZE; - dma_addr = zdev->start_dma + iommu_page_index * PAGE_SIZE; - if (dma_addr + size > zdev->end_dma) { - ret = -ERANGE; - goto out_free; - } - if (direction == DMA_NONE || direction == DMA_TO_DEVICE) flags |= ZPCI_TABLE_PROTECTED; @@ -322,7 +345,7 @@ static dma_addr_t s390_dma_map_pages(struct device *dev, struct page *page, return dma_addr + (offset & ~PAGE_MASK); out_free: - dma_free_iommu(dev, iommu_page_index, nr_pages); + dma_free_address(dev, dma_addr, nr_pages); out_err: zpci_err("map error:\n"); zpci_err_dma(ret, pa); @@ -331,10 +354,9 @@ out_err: static void s390_dma_unmap_pages(struct device *dev, dma_addr_t dma_addr, size_t size, enum dma_data_direction direction, - struct dma_attrs *attrs) + unsigned long attrs) { struct zpci_dev *zdev = to_zpci(to_pci_dev(dev)); - unsigned long iommu_page_index; int npages, ret; npages = iommu_num_pages(dma_addr, size, PAGE_SIZE); @@ -348,13 +370,12 @@ static void s390_dma_unmap_pages(struct device *dev, dma_addr_t dma_addr, } atomic64_add(npages, &zdev->unmapped_pages); - iommu_page_index = (dma_addr - zdev->start_dma) >> PAGE_SHIFT; - dma_free_iommu(dev, iommu_page_index, npages); + dma_free_address(dev, dma_addr, npages); } static void *s390_dma_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle, gfp_t flag, - struct dma_attrs *attrs) + unsigned long attrs) { struct zpci_dev *zdev = to_zpci(to_pci_dev(dev)); struct page *page; @@ -369,7 +390,7 @@ static void *s390_dma_alloc(struct device *dev, size_t size, pa = page_to_phys(page); memset((void *) pa, 0, size); - map = s390_dma_map_pages(dev, page, 0, size, DMA_BIDIRECTIONAL, NULL); + map = s390_dma_map_pages(dev, page, 0, size, DMA_BIDIRECTIONAL, 0); if (dma_mapping_error(dev, map)) { free_pages(pa, get_order(size)); return NULL; @@ -383,58 +404,121 @@ static void *s390_dma_alloc(struct device *dev, size_t size, static void s390_dma_free(struct device *dev, size_t size, void *pa, dma_addr_t dma_handle, - struct dma_attrs *attrs) + unsigned long attrs) { struct zpci_dev *zdev = to_zpci(to_pci_dev(dev)); size = PAGE_ALIGN(size); atomic64_sub(size / PAGE_SIZE, &zdev->allocated_pages); - s390_dma_unmap_pages(dev, dma_handle, size, DMA_BIDIRECTIONAL, NULL); + s390_dma_unmap_pages(dev, dma_handle, size, DMA_BIDIRECTIONAL, 0); free_pages((unsigned long) pa, get_order(size)); } -static int s390_dma_map_sg(struct device *dev, struct scatterlist *sg, - int nr_elements, enum dma_data_direction dir, - struct dma_attrs *attrs) +/* Map a segment into a contiguous dma address area */ +static int __s390_dma_map_sg(struct device *dev, struct scatterlist *sg, + size_t size, dma_addr_t *handle, + enum dma_data_direction dir) { - int mapped_elements = 0; + struct zpci_dev *zdev = to_zpci(to_pci_dev(dev)); + dma_addr_t dma_addr_base, dma_addr; + int flags = ZPCI_PTE_VALID; struct scatterlist *s; - int i; + unsigned long pa; + int ret; - for_each_sg(sg, s, nr_elements, i) { - struct page *page = sg_page(s); - s->dma_address = s390_dma_map_pages(dev, page, s->offset, - s->length, dir, NULL); - if (!dma_mapping_error(dev, s->dma_address)) { - s->dma_length = s->length; - mapped_elements++; - } else + size = PAGE_ALIGN(size); + dma_addr_base = dma_alloc_address(dev, size >> PAGE_SHIFT); + if (dma_addr_base == DMA_ERROR_CODE) + return -ENOMEM; + + dma_addr = dma_addr_base; + if (dir == DMA_NONE || dir == DMA_TO_DEVICE) + flags |= ZPCI_TABLE_PROTECTED; + + for (s = sg; dma_addr < dma_addr_base + size; s = sg_next(s)) { + pa = page_to_phys(sg_page(s)) + s->offset; + ret = __dma_update_trans(zdev, pa, dma_addr, s->length, flags); + if (ret) goto unmap; + + dma_addr += s->length; } -out: - return mapped_elements; + ret = __dma_purge_tlb(zdev, dma_addr_base, size, flags); + if (ret) + goto unmap; + + *handle = dma_addr_base; + atomic64_add(size >> PAGE_SHIFT, &zdev->mapped_pages); + + return ret; unmap: - for_each_sg(sg, s, mapped_elements, i) { - if (s->dma_address) - s390_dma_unmap_pages(dev, s->dma_address, s->dma_length, - dir, NULL); - s->dma_address = 0; + dma_update_trans(zdev, 0, dma_addr_base, dma_addr - dma_addr_base, + ZPCI_PTE_INVALID); + dma_free_address(dev, dma_addr_base, size >> PAGE_SHIFT); + zpci_err("map error:\n"); + zpci_err_dma(ret, pa); + return ret; +} + +static int s390_dma_map_sg(struct device *dev, struct scatterlist *sg, + int nr_elements, enum dma_data_direction dir, + unsigned long attrs) +{ + struct scatterlist *s = sg, *start = sg, *dma = sg; + unsigned int max = dma_get_max_seg_size(dev); + unsigned int size = s->offset + s->length; + unsigned int offset = s->offset; + int count = 0, i; + + for (i = 1; i < nr_elements; i++) { + s = sg_next(s); + + s->dma_address = DMA_ERROR_CODE; s->dma_length = 0; + + if (s->offset || (size & ~PAGE_MASK) || + size + s->length > max) { + if (__s390_dma_map_sg(dev, start, size, + &dma->dma_address, dir)) + goto unmap; + + dma->dma_address += offset; + dma->dma_length = size - offset; + + size = offset = s->offset; + start = s; + dma = sg_next(dma); + count++; + } + size += s->length; } - mapped_elements = 0; - goto out; + if (__s390_dma_map_sg(dev, start, size, &dma->dma_address, dir)) + goto unmap; + + dma->dma_address += offset; + dma->dma_length = size - offset; + + return count + 1; +unmap: + for_each_sg(sg, s, count, i) + s390_dma_unmap_pages(dev, sg_dma_address(s), sg_dma_len(s), + dir, attrs); + + return 0; } static void s390_dma_unmap_sg(struct device *dev, struct scatterlist *sg, int nr_elements, enum dma_data_direction dir, - struct dma_attrs *attrs) + unsigned long attrs) { struct scatterlist *s; int i; for_each_sg(sg, s, nr_elements, i) { - s390_dma_unmap_pages(dev, s->dma_address, s->dma_length, dir, NULL); + if (s->dma_length) + s390_dma_unmap_pages(dev, s->dma_address, s->dma_length, + dir, attrs); s->dma_address = 0; s->dma_length = 0; } @@ -469,6 +553,7 @@ int zpci_dma_init_device(struct zpci_dev *zdev) * Also set zdev->end_dma to the actual end address of the usable * range, instead of the theoretical maximum as reported by hardware. */ + zdev->start_dma = PAGE_ALIGN(zdev->start_dma); zdev->iommu_size = min3((u64) high_memory, ZPCI_TABLE_SIZE_RT - zdev->start_dma, zdev->end_dma - zdev->start_dma + 1); @@ -479,7 +564,14 @@ int zpci_dma_init_device(struct zpci_dev *zdev) rc = -ENOMEM; goto free_dma_table; } + if (!zdev->tlb_refresh && !s390_iommu_strict) { + zdev->lazy_bitmap = vzalloc(zdev->iommu_pages / 8); + if (!zdev->lazy_bitmap) { + rc = -ENOMEM; + goto free_bitmap; + } + } rc = zpci_register_ioat(zdev, 0, zdev->start_dma, zdev->end_dma, (u64) zdev->dma_table); if (rc) @@ -489,6 +581,8 @@ int zpci_dma_init_device(struct zpci_dev *zdev) free_bitmap: vfree(zdev->iommu_bitmap); zdev->iommu_bitmap = NULL; + vfree(zdev->lazy_bitmap); + zdev->lazy_bitmap = NULL; free_dma_table: dma_free_cpu_table(zdev->dma_table); zdev->dma_table = NULL; @@ -510,6 +604,9 @@ void zpci_dma_exit_device(struct zpci_dev *zdev) zdev->dma_table = NULL; vfree(zdev->iommu_bitmap); zdev->iommu_bitmap = NULL; + vfree(zdev->lazy_bitmap); + zdev->lazy_bitmap = NULL; + zdev->next_bit = 0; } diff --git a/arch/s390/pci/pci_event.c b/arch/s390/pci/pci_event.c index fb2a9a560fdc..c2b27ad8e94d 100644 --- a/arch/s390/pci/pci_event.c +++ b/arch/s390/pci/pci_event.c @@ -145,8 +145,7 @@ static void __zpci_event_availability(struct zpci_ccdf_avail *ccdf) default: break; } - if (pdev) - pci_dev_put(pdev); + pci_dev_put(pdev); } void zpci_event_availability(void *data) diff --git a/arch/s390/pci/pci_insn.c b/arch/s390/pci/pci_insn.c index 10ca15dcab11..fa8d7d4b9751 100644 --- a/arch/s390/pci/pci_insn.c +++ b/arch/s390/pci/pci_insn.c @@ -99,7 +99,7 @@ void zpci_set_irq_ctrl(u16 ctl, char *unused, u8 isc) } /* PCI Load */ -static inline int __pcilg(u64 *data, u64 req, u64 offset, u8 *status) +static inline int ____pcilg(u64 *data, u64 req, u64 offset, u8 *status) { register u64 __req asm("2") = req; register u64 __offset asm("3") = offset; @@ -116,6 +116,16 @@ static inline int __pcilg(u64 *data, u64 req, u64 offset, u8 *status) : "d" (__offset) : "cc"); *status = __req >> 24 & 0xff; + *data = __data; + return cc; +} + +static inline int __pcilg(u64 *data, u64 req, u64 offset, u8 *status) +{ + u64 __data; + int cc; + + cc = ____pcilg(&__data, req, offset, status); if (!cc) *data = __data; diff --git a/arch/s390/pci/pci_sysfs.c b/arch/s390/pci/pci_sysfs.c index f37a5808883d..ed484dc84d14 100644 --- a/arch/s390/pci/pci_sysfs.c +++ b/arch/s390/pci/pci_sysfs.c @@ -12,6 +12,8 @@ #include <linux/stat.h> #include <linux/pci.h> +#include <asm/sclp.h> + #define zpci_attr(name, fmt, member) \ static ssize_t name##_show(struct device *dev, \ struct device_attribute *attr, char *buf) \ @@ -77,8 +79,29 @@ static ssize_t util_string_read(struct file *filp, struct kobject *kobj, sizeof(zdev->util_str)); } static BIN_ATTR_RO(util_string, CLP_UTIL_STR_LEN); + +static ssize_t report_error_write(struct file *filp, struct kobject *kobj, + struct bin_attribute *attr, char *buf, + loff_t off, size_t count) +{ + struct zpci_report_error_header *report = (void *) buf; + struct device *dev = kobj_to_dev(kobj); + struct pci_dev *pdev = to_pci_dev(dev); + struct zpci_dev *zdev = to_zpci(pdev); + int ret; + + if (off || (count < sizeof(*report))) + return -EINVAL; + + ret = sclp_pci_report(report, zdev->fh, zdev->fid); + + return ret ? ret : count; +} +static BIN_ATTR(report_error, S_IWUSR, NULL, report_error_write, PAGE_SIZE); + static struct bin_attribute *zpci_bin_attrs[] = { &bin_attr_util_string, + &bin_attr_report_error, NULL, }; diff --git a/arch/s390/tools/gen_facilities.c b/arch/s390/tools/gen_facilities.c index e2660d27889b..fe4e6c910dd7 100644 --- a/arch/s390/tools/gen_facilities.c +++ b/arch/s390/tools/gen_facilities.c @@ -39,7 +39,6 @@ static void print_facility_list(struct facility_def *def) printf("#define %s ", def->name); for (i = 0; i <= high; i++) printf("_AC(0x%016llx,UL)%c", array[i], i < high ? ',' : '\n'); - printf("#define %s_DWORDS %d\n", def->name, high + 1); free(array); } |