diff options
Diffstat (limited to 'arch/arm64')
154 files changed, 7699 insertions, 982 deletions
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 969ef880d234..111742126897 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -52,6 +52,7 @@ config ARM64 select GENERIC_TIME_VSYSCALL select HANDLE_DOMAIN_IRQ select HARDIRQS_SW_RESEND + select HAVE_ACPI_APEI if (ACPI && EFI) select HAVE_ALIGNED_STRUCT_PAGE if SLUB select HAVE_ARCH_AUDITSYSCALL select HAVE_ARCH_BITREVERSE @@ -109,6 +110,7 @@ config ARM64 select POWER_SUPPLY select SPARSE_IRQ select SYSCTL_EXCEPTION_TRACE + select THREAD_INFO_IN_TASK help ARM 64-bit (AArch64) Linux support. @@ -238,6 +240,9 @@ config PGTABLE_LEVELS default 3 if ARM64_16K_PAGES && ARM64_VA_BITS_47 default 4 if !ARM64_64K_PAGES && ARM64_VA_BITS_48 +config ARCH_SUPPORTS_UPROBES + def_bool y + source "init/Kconfig" source "kernel/Kconfig.freezer" @@ -790,6 +795,14 @@ config SETEND_EMULATION If unsure, say Y endif +config ARM64_SW_TTBR0_PAN + bool "Emulate Privileged Access Never using TTBR0_EL1 switching" + help + Enabling this option prevents the kernel from accessing + user-space memory directly by pointing TTBR0_EL1 to a reserved + zeroed area and reserved ASID. The user access routines + restore the valid TTBR0_EL1 temporarily. + menu "ARMv8.1 architectural features" config ARM64_HW_AFDBM diff --git a/arch/arm64/Kconfig.debug b/arch/arm64/Kconfig.debug index b661fe742615..d1ebd46872fd 100644 --- a/arch/arm64/Kconfig.debug +++ b/arch/arm64/Kconfig.debug @@ -2,9 +2,13 @@ menu "Kernel hacking" source "lib/Kconfig.debug" -config ARM64_PTDUMP +config ARM64_PTDUMP_CORE + def_bool n + +config ARM64_PTDUMP_DEBUGFS bool "Export kernel pagetable layout to userspace via debugfs" depends on DEBUG_KERNEL + select ARM64_PTDUMP_CORE select DEBUG_FS help Say Y here if you want to show the kernel pagetable layout in a @@ -38,6 +42,35 @@ config ARM64_RANDOMIZE_TEXT_OFFSET of TEXT_OFFSET and platforms must not require a specific value. +config DEBUG_WX + bool "Warn on W+X mappings at boot" + select ARM64_PTDUMP_CORE + ---help--- + Generate a warning if any W+X mappings are found at boot. + + This is useful for discovering cases where the kernel is leaving + W+X mappings after applying NX, as such mappings are a security risk. + This check also includes UXN, which should be set on all kernel + mappings. + + Look for a message in dmesg output like this: + + arm64/mm: Checked W+X mappings: passed, no W+X pages found. + + or like this, if the check failed: + + arm64/mm: Checked W+X mappings: FAILED, <N> W+X pages found. + + Note that even if the check fails, your kernel is possibly + still fine, as W+X mappings are not a security hole in + themselves, what they do is that they make the exploitation + of other unfixed kernel bugs easier. + + There is no runtime or memory usage effect of this option + once the kernel has booted up - it's a one time check. + + If in doubt, say "Y". + config DEBUG_SET_MODULE_RONX bool "Set loadable kernel module data as NX and text as RO" depends on MODULES diff --git a/arch/arm64/Kconfig.platforms b/arch/arm64/Kconfig.platforms index e9fc22645b7f..6419a572f6ba 100644 --- a/arch/arm64/Kconfig.platforms +++ b/arch/arm64/Kconfig.platforms @@ -191,6 +191,7 @@ config ARCH_THUNDER config ARCH_UNIPHIER bool "Socionext UniPhier SoC Family" + select ARCH_HAS_RESET_CONTROLLER select PINCTRL help This enables support for Socionext UniPhier SoC family. diff --git a/arch/arm64/Makefile b/arch/arm64/Makefile index 3635b8662724..b9a4a934ca05 100644 --- a/arch/arm64/Makefile +++ b/arch/arm64/Makefile @@ -37,10 +37,16 @@ $(warning LSE atomics not supported by binutils) endif endif -KBUILD_CFLAGS += -mgeneral-regs-only $(lseinstr) +brokengasinst := $(call as-instr,1:\n.inst 0\n.rept . - 1b\n\nnop\n.endr\n,,-DCONFIG_BROKEN_GAS_INST=1) + +ifneq ($(brokengasinst),) +$(warning Detected assembler with broken .inst; disassembly will be unreliable) +endif + +KBUILD_CFLAGS += -mgeneral-regs-only $(lseinstr) $(brokengasinst) KBUILD_CFLAGS += -fno-asynchronous-unwind-tables KBUILD_CFLAGS += $(call cc-option, -mpc-relative-literal-loads) -KBUILD_AFLAGS += $(lseinstr) +KBUILD_AFLAGS += $(lseinstr) $(brokengasinst) ifeq ($(CONFIG_CPU_BIG_ENDIAN), y) KBUILD_CPPFLAGS += -mbig-endian diff --git a/arch/arm64/boot/dts/arm/juno-base.dtsi b/arch/arm64/boot/dts/arm/juno-base.dtsi index 334271a25f70..7d3a2acc6a55 100644 --- a/arch/arm64/boot/dts/arm/juno-base.dtsi +++ b/arch/arm64/boot/dts/arm/juno-base.dtsi @@ -393,7 +393,7 @@ #address-cells = <3>; #size-cells = <2>; dma-coherent; - ranges = <0x01000000 0x00 0x5f800000 0x00 0x5f800000 0x0 0x00800000>, + ranges = <0x01000000 0x00 0x00000000 0x00 0x5f800000 0x0 0x00800000>, <0x02000000 0x00 0x50000000 0x00 0x50000000 0x0 0x08000000>, <0x42000000 0x40 0x00000000 0x40 0x00000000 0x1 0x00000000>; #interrupt-cells = <1>; diff --git a/arch/arm64/boot/dts/arm/juno-r1.dts b/arch/arm64/boot/dts/arm/juno-r1.dts index 123a58b29cbd..f0b857d6d73c 100644 --- a/arch/arm64/boot/dts/arm/juno-r1.dts +++ b/arch/arm64/boot/dts/arm/juno-r1.dts @@ -76,7 +76,7 @@ compatible = "arm,idle-state"; arm,psci-suspend-param = <0x1010000>; local-timer-stop; - entry-latency-us = <300>; + entry-latency-us = <400>; exit-latency-us = <1200>; min-residency-us = <2500>; }; diff --git a/arch/arm64/boot/dts/arm/juno-r2.dts b/arch/arm64/boot/dts/arm/juno-r2.dts index 007be826efce..26aaa6a7670f 100644 --- a/arch/arm64/boot/dts/arm/juno-r2.dts +++ b/arch/arm64/boot/dts/arm/juno-r2.dts @@ -76,7 +76,7 @@ compatible = "arm,idle-state"; arm,psci-suspend-param = <0x1010000>; local-timer-stop; - entry-latency-us = <300>; + entry-latency-us = <400>; exit-latency-us = <1200>; min-residency-us = <2500>; }; diff --git a/arch/arm64/boot/dts/arm/juno.dts b/arch/arm64/boot/dts/arm/juno.dts index a7270eff6939..6e154d948a80 100644 --- a/arch/arm64/boot/dts/arm/juno.dts +++ b/arch/arm64/boot/dts/arm/juno.dts @@ -76,7 +76,7 @@ compatible = "arm,idle-state"; arm,psci-suspend-param = <0x1010000>; local-timer-stop; - entry-latency-us = <300>; + entry-latency-us = <400>; exit-latency-us = <1200>; min-residency-us = <2500>; }; diff --git a/arch/arm64/boot/dts/broadcom/ns2-svk.dts b/arch/arm64/boot/dts/broadcom/ns2-svk.dts index 2d7872a36b91..c4d544244b19 100644 --- a/arch/arm64/boot/dts/broadcom/ns2-svk.dts +++ b/arch/arm64/boot/dts/broadcom/ns2-svk.dts @@ -56,6 +56,10 @@ }; }; +&enet { + status = "ok"; +}; + &pci_phy0 { status = "ok"; }; @@ -164,6 +168,8 @@ nand-ecc-mode = "hw"; nand-ecc-strength = <8>; nand-ecc-step-size = <512>; + nand-bus-width = <16>; + brcm,nand-oob-sector-size = <16>; #address-cells = <1>; #size-cells = <1>; }; @@ -172,6 +178,7 @@ &mdio_mux_iproc { mdio@10 { gphy0: eth-phy@10 { + enet-phy-lane-swap; reg = <0x10>; }; }; diff --git a/arch/arm64/boot/dts/broadcom/ns2.dtsi b/arch/arm64/boot/dts/broadcom/ns2.dtsi index d95dc408629a..773ed593da4d 100644 --- a/arch/arm64/boot/dts/broadcom/ns2.dtsi +++ b/arch/arm64/boot/dts/broadcom/ns2.dtsi @@ -191,6 +191,18 @@ #include "ns2-clock.dtsi" + enet: ethernet@61000000 { + compatible = "brcm,ns2-amac"; + reg = <0x61000000 0x1000>, + <0x61090000 0x1000>, + <0x61030000 0x100>; + reg-names = "amac_base", "idm_base", "nicpm_base"; + interrupts = <GIC_SPI 341 IRQ_TYPE_LEVEL_HIGH>; + phy-handle = <&gphy0>; + phy-mode = "rgmii"; + status = "disabled"; + }; + dma0: dma@61360000 { compatible = "arm,pl330", "arm,primecell"; reg = <0x61360000 0x1000>; diff --git a/arch/arm64/boot/dts/freescale/fsl-ls1043a.dtsi b/arch/arm64/boot/dts/freescale/fsl-ls1043a.dtsi index 220ac7057d12..97d331ec2500 100644 --- a/arch/arm64/boot/dts/freescale/fsl-ls1043a.dtsi +++ b/arch/arm64/boot/dts/freescale/fsl-ls1043a.dtsi @@ -123,6 +123,7 @@ <1 14 0xf08>, /* Physical Non-Secure PPI */ <1 11 0xf08>, /* Virtual PPI */ <1 10 0xf08>; /* Hypervisor PPI */ + fsl,erratum-a008585; }; pmu { diff --git a/arch/arm64/boot/dts/freescale/fsl-ls2080a.dtsi b/arch/arm64/boot/dts/freescale/fsl-ls2080a.dtsi index 337da90bd7da..d058e56db72d 100644 --- a/arch/arm64/boot/dts/freescale/fsl-ls2080a.dtsi +++ b/arch/arm64/boot/dts/freescale/fsl-ls2080a.dtsi @@ -195,6 +195,7 @@ <1 14 4>, /* Physical Non-Secure PPI, active-low */ <1 11 4>, /* Virtual PPI, active-low */ <1 10 4>; /* Hypervisor PPI, active-low */ + fsl,erratum-a008585; }; pmu { @@ -215,6 +216,12 @@ clocks = <&sysclk>; }; + dcfg: dcfg@1e00000 { + compatible = "fsl,ls2080a-dcfg", "syscon"; + reg = <0x0 0x1e00000 0x0 0x10000>; + little-endian; + }; + serial0: serial@21c0500 { compatible = "fsl,ns16550", "ns16550a"; reg = <0x0 0x21c0500 0x0 0x100>; diff --git a/arch/arm64/boot/dts/hisilicon/hi6220.dtsi b/arch/arm64/boot/dts/hisilicon/hi6220.dtsi index 17839db585d5..e0ea60382087 100644 --- a/arch/arm64/boot/dts/hisilicon/hi6220.dtsi +++ b/arch/arm64/boot/dts/hisilicon/hi6220.dtsi @@ -747,7 +747,6 @@ clocks = <&sys_ctrl HI6220_USBOTG_HCLK>; clock-names = "otg"; dr_mode = "otg"; - g-use-dma; g-rx-fifo-size = <512>; g-np-tx-fifo-size = <128>; g-tx-fifo-size = <128 128 128 128 128 128>; diff --git a/arch/arm64/boot/dts/marvell/armada-3720-db.dts b/arch/arm64/boot/dts/marvell/armada-3720-db.dts index 1372e9a6aaa4..a59d36cd6caf 100644 --- a/arch/arm64/boot/dts/marvell/armada-3720-db.dts +++ b/arch/arm64/boot/dts/marvell/armada-3720-db.dts @@ -81,3 +81,26 @@ &pcie0 { status = "okay"; }; + +&mdio { + status = "okay"; + phy0: ethernet-phy@0 { + reg = <0>; + }; + + phy1: ethernet-phy@1 { + reg = <1>; + }; +}; + +ð0 { + phy-mode = "rgmii-id"; + phy = <&phy0>; + status = "okay"; +}; + +ð1 { + phy-mode = "sgmii"; + phy = <&phy1>; + status = "okay"; +}; diff --git a/arch/arm64/boot/dts/marvell/armada-37xx.dtsi b/arch/arm64/boot/dts/marvell/armada-37xx.dtsi index c4762538ec01..3b8eb45bdc76 100644 --- a/arch/arm64/boot/dts/marvell/armada-37xx.dtsi +++ b/arch/arm64/boot/dts/marvell/armada-37xx.dtsi @@ -105,7 +105,7 @@ status = "disabled"; }; - nb_perih_clk: nb-periph-clk@13000{ + nb_periph_clk: nb-periph-clk@13000 { compatible = "marvell,armada-3700-periph-clock-nb"; reg = <0x13000 0x100>; clocks = <&tbg 0>, <&tbg 1>, <&tbg 2>, @@ -113,7 +113,7 @@ #clock-cells = <1>; }; - sb_perih_clk: sb-periph-clk@18000{ + sb_periph_clk: sb-periph-clk@18000 { compatible = "marvell,armada-3700-periph-clock-sb"; reg = <0x18000 0x100>; clocks = <&tbg 0>, <&tbg 1>, <&tbg 2>, @@ -140,6 +140,29 @@ }; }; + eth0: ethernet@30000 { + compatible = "marvell,armada-3700-neta"; + reg = <0x30000 0x4000>; + interrupts = <GIC_SPI 42 IRQ_TYPE_LEVEL_HIGH>; + clocks = <&sb_periph_clk 8>; + status = "disabled"; + }; + + mdio: mdio@32004 { + #address-cells = <1>; + #size-cells = <0>; + compatible = "marvell,orion-mdio"; + reg = <0x32004 0x4>; + }; + + eth1: ethernet@40000 { + compatible = "marvell,armada-3700-neta"; + reg = <0x40000 0x4000>; + interrupts = <GIC_SPI 45 IRQ_TYPE_LEVEL_HIGH>; + clocks = <&sb_periph_clk 7>; + status = "disabled"; + }; + usb3: usb@58000 { compatible = "marvell,armada3700-xhci", "generic-xhci"; diff --git a/arch/arm64/boot/dts/marvell/armada-cp110-master.dtsi b/arch/arm64/boot/dts/marvell/armada-cp110-master.dtsi index e5e3ed678b6f..93ec8fef82a1 100644 --- a/arch/arm64/boot/dts/marvell/armada-cp110-master.dtsi +++ b/arch/arm64/boot/dts/marvell/armada-cp110-master.dtsi @@ -131,7 +131,7 @@ #address-cells = <0x1>; #size-cells = <0x0>; cell-index = <1>; - clocks = <&cpm_syscon0 0 3>; + clocks = <&cpm_syscon0 1 21>; status = "disabled"; }; @@ -164,6 +164,14 @@ clocks = <&cpm_syscon0 1 21>; status = "disabled"; }; + + cpm_trng: trng@760000 { + compatible = "marvell,armada-8k-rng", "inside-secure,safexcel-eip76"; + reg = <0x760000 0x7d>; + interrupts = <GIC_SPI 59 IRQ_TYPE_LEVEL_HIGH>; + clocks = <&cpm_syscon0 1 25>; + status = "okay"; + }; }; cpm_pcie0: pcie@f2600000 { diff --git a/arch/arm64/boot/dts/marvell/armada-cp110-slave.dtsi b/arch/arm64/boot/dts/marvell/armada-cp110-slave.dtsi index 842fb333285c..ee8db0556791 100644 --- a/arch/arm64/boot/dts/marvell/armada-cp110-slave.dtsi +++ b/arch/arm64/boot/dts/marvell/armada-cp110-slave.dtsi @@ -130,8 +130,8 @@ reg = <0x700600 0x50>; #address-cells = <0x1>; #size-cells = <0x0>; - cell-index = <1>; - clocks = <&cps_syscon0 0 3>; + cell-index = <3>; + clocks = <&cps_syscon0 1 21>; status = "disabled"; }; @@ -140,7 +140,7 @@ reg = <0x700680 0x50>; #address-cells = <1>; #size-cells = <0>; - cell-index = <2>; + cell-index = <4>; clocks = <&cps_syscon0 1 21>; status = "disabled"; }; @@ -164,6 +164,14 @@ clocks = <&cps_syscon0 1 21>; status = "disabled"; }; + + cps_trng: trng@760000 { + compatible = "marvell,armada-8k-rng", "inside-secure,safexcel-eip76"; + reg = <0x760000 0x7d>; + interrupts = <GIC_SPI 312 IRQ_TYPE_LEVEL_HIGH>; + clocks = <&cps_syscon0 1 25>; + status = "okay"; + }; }; cps_pcie0: pcie@f4600000 { diff --git a/arch/arm64/boot/dts/mediatek/mt8173-evb.dts b/arch/arm64/boot/dts/mediatek/mt8173-evb.dts index 2a7f731c7759..0ecaad4333a7 100644 --- a/arch/arm64/boot/dts/mediatek/mt8173-evb.dts +++ b/arch/arm64/boot/dts/mediatek/mt8173-evb.dts @@ -34,15 +34,6 @@ chosen { }; - usb_p1_vbus: regulator@0 { - compatible = "regulator-fixed"; - regulator-name = "usb_vbus"; - regulator-min-microvolt = <5000000>; - regulator-max-microvolt = <5000000>; - gpio = <&pio 130 GPIO_ACTIVE_HIGH>; - enable-active-high; - }; - connector { compatible = "hdmi-connector"; label = "hdmi"; @@ -54,6 +45,29 @@ }; }; }; + + extcon_usb: extcon_iddig { + compatible = "linux,extcon-usb-gpio"; + id-gpio = <&pio 16 GPIO_ACTIVE_HIGH>; + }; + + usb_p1_vbus: regulator@0 { + compatible = "regulator-fixed"; + regulator-name = "usb_vbus"; + regulator-min-microvolt = <5000000>; + regulator-max-microvolt = <5000000>; + gpio = <&pio 130 GPIO_ACTIVE_HIGH>; + enable-active-high; + }; + + usb_p0_vbus: regulator@1 { + compatible = "regulator-fixed"; + regulator-name = "vbus"; + regulator-min-microvolt = <5000000>; + regulator-max-microvolt = <5000000>; + gpio = <&pio 9 GPIO_ACTIVE_HIGH>; + enable-active-high; + }; }; &cec { @@ -243,6 +257,20 @@ bias-pull-down = <MTK_PUPD_SET_R1R0_10>; }; }; + + usb_id_pins_float: usb_iddig_pull_up { + pins_iddig { + pinmux = <MT8173_PIN_16_IDDIG__FUNC_IDDIG>; + bias-pull-up; + }; + }; + + usb_id_pins_ground: usb_iddig_pull_down { + pins_iddig { + pinmux = <MT8173_PIN_16_IDDIG__FUNC_IDDIG>; + bias-pull-down; + }; + }; }; &pwm0 { @@ -469,12 +497,25 @@ status = "okay"; }; +&ssusb { + vusb33-supply = <&mt6397_vusb_reg>; + vbus-supply = <&usb_p0_vbus>; + extcon = <&extcon_usb>; + dr_mode = "otg"; + mediatek,enable-wakeup; + pinctrl-names = "default", "id_float", "id_ground"; + pinctrl-0 = <&usb_id_pins_float>; + pinctrl-1 = <&usb_id_pins_float>; + pinctrl-2 = <&usb_id_pins_ground>; + status = "okay"; +}; + &uart0 { status = "okay"; }; -&usb30 { +&usb_host { vusb33-supply = <&mt6397_vusb_reg>; vbus-supply = <&usb_p1_vbus>; - mediatek,wakeup-src = <1>; + status = "okay"; }; diff --git a/arch/arm64/boot/dts/mediatek/mt8173.dtsi b/arch/arm64/boot/dts/mediatek/mt8173.dtsi index 1c71e256601d..c2d588ca59b7 100644 --- a/arch/arm64/boot/dts/mediatek/mt8173.dtsi +++ b/arch/arm64/boot/dts/mediatek/mt8173.dtsi @@ -707,11 +707,14 @@ status = "disabled"; }; - usb30: usb@11270000 { - compatible = "mediatek,mt8173-xhci"; - reg = <0 0x11270000 0 0x1000>, + ssusb: usb@11271000 { + compatible = "mediatek,mt8173-mtu3"; + reg = <0 0x11271000 0 0x3000>, <0 0x11280700 0 0x0100>; - interrupts = <GIC_SPI 115 IRQ_TYPE_LEVEL_LOW>; + reg-names = "mac", "ippc"; + interrupts = <GIC_SPI 64 IRQ_TYPE_LEVEL_LOW>; + phys = <&phy_port0 PHY_TYPE_USB3>, + <&phy_port1 PHY_TYPE_USB2>; power-domains = <&scpsys MT8173_POWER_DOMAIN_USB>; clocks = <&topckgen CLK_TOP_USB30_SEL>, <&pericfg CLK_PERI_USB0>, @@ -719,10 +722,22 @@ clock-names = "sys_ck", "wakeup_deb_p0", "wakeup_deb_p1"; - phys = <&phy_port0 PHY_TYPE_USB3>, - <&phy_port1 PHY_TYPE_USB2>; mediatek,syscon-wakeup = <&pericfg>; - status = "okay"; + #address-cells = <2>; + #size-cells = <2>; + ranges; + status = "disabled"; + + usb_host: xhci@11270000 { + compatible = "mediatek,mt8173-xhci"; + reg = <0 0x11270000 0 0x1000>; + reg-names = "mac"; + interrupts = <GIC_SPI 115 IRQ_TYPE_LEVEL_LOW>; + power-domains = <&scpsys MT8173_POWER_DOMAIN_USB>; + clocks = <&topckgen CLK_TOP_USB30_SEL>; + clock-names = "sys_ck"; + status = "disabled"; + }; }; u3phy: usb-phy@11290000 { diff --git a/arch/arm64/boot/dts/nvidia/tegra210-p2180.dtsi b/arch/arm64/boot/dts/nvidia/tegra210-p2180.dtsi index 5fda583351d7..906fb836d241 100644 --- a/arch/arm64/boot/dts/nvidia/tegra210-p2180.dtsi +++ b/arch/arm64/boot/dts/nvidia/tegra210-p2180.dtsi @@ -21,6 +21,10 @@ reg = <0x0 0x80000000 0x1 0x0>; }; + gpu@57000000 { + vdd-supply = <&vdd_gpu>; + }; + /* debug port */ serial@70006000 { status = "okay"; @@ -291,4 +295,18 @@ clock-frequency = <32768>; }; }; + + regulators { + vdd_gpu: regulator@100 { + compatible = "pwm-regulator"; + reg = <100>; + pwms = <&pwm 1 4880>; + regulator-name = "VDD_GPU"; + regulator-min-microvolt = <710000>; + regulator-max-microvolt = <1320000>; + enable-gpios = <&pmic 6 GPIO_ACTIVE_HIGH>; + regulator-ramp-delay = <80>; + regulator-enable-ramp-delay = <1000>; + }; + }; }; diff --git a/arch/arm64/boot/dts/nvidia/tegra210-p2371-2180.dts b/arch/arm64/boot/dts/nvidia/tegra210-p2371-2180.dts index 983775e637a4..4c1ea7a08d43 100644 --- a/arch/arm64/boot/dts/nvidia/tegra210-p2371-2180.dts +++ b/arch/arm64/boot/dts/nvidia/tegra210-p2371-2180.dts @@ -7,6 +7,32 @@ model = "NVIDIA Jetson TX1 Developer Kit"; compatible = "nvidia,p2371-2180", "nvidia,tegra210"; + pcie-controller@01003000 { + status = "okay"; + + avdd-pll-uerefe-supply = <&avdd_1v05_pll>; + hvddio-pex-supply = <&vdd_1v8>; + dvddio-pex-supply = <&vdd_pex_1v05>; + dvdd-pex-pll-supply = <&vdd_pex_1v05>; + hvdd-pex-pll-e-supply = <&vdd_1v8>; + vddio-pex-ctl-supply = <&vdd_1v8>; + + pci@1,0 { + phys = <&{/padctl@7009f000/pads/pcie/lanes/pcie-0}>, + <&{/padctl@7009f000/pads/pcie/lanes/pcie-1}>, + <&{/padctl@7009f000/pads/pcie/lanes/pcie-2}>, + <&{/padctl@7009f000/pads/pcie/lanes/pcie-3}>; + phy-names = "pcie-0", "pcie-1", "pcie-2", "pcie-3"; + status = "okay"; + }; + + pci@2,0 { + phys = <&{/padctl@7009f000/pads/pcie/lanes/pcie-4}>; + phy-names = "pcie-0"; + status = "okay"; + }; + }; + host1x@50000000 { dsi@54300000 { status = "okay"; diff --git a/arch/arm64/boot/dts/nvidia/tegra210-smaug.dts b/arch/arm64/boot/dts/nvidia/tegra210-smaug.dts index c2becb603e11..7703227f5d1a 100644 --- a/arch/arm64/boot/dts/nvidia/tegra210-smaug.dts +++ b/arch/arm64/boot/dts/nvidia/tegra210-smaug.dts @@ -11,7 +11,8 @@ compatible = "google,smaug-rev8", "google,smaug-rev7", "google,smaug-rev6", "google,smaug-rev5", "google,smaug-rev4", "google,smaug-rev3", - "google,smaug-rev1", "google,smaug", "nvidia,tegra210"; + "google,smaug-rev2", "google,smaug-rev1", + "google,smaug", "nvidia,tegra210"; aliases { serial0 = &uarta; diff --git a/arch/arm64/boot/dts/nvidia/tegra210.dtsi b/arch/arm64/boot/dts/nvidia/tegra210.dtsi index 46045fe719da..2f832df29da8 100644 --- a/arch/arm64/boot/dts/nvidia/tegra210.dtsi +++ b/arch/arm64/boot/dts/nvidia/tegra210.dtsi @@ -11,6 +11,69 @@ #address-cells = <2>; #size-cells = <2>; + pcie-controller@01003000 { + compatible = "nvidia,tegra210-pcie"; + device_type = "pci"; + reg = <0x0 0x01003000 0x0 0x00000800 /* PADS registers */ + 0x0 0x01003800 0x0 0x00000800 /* AFI registers */ + 0x0 0x02000000 0x0 0x10000000>; /* configuration space */ + reg-names = "pads", "afi", "cs"; + interrupts = <GIC_SPI 98 IRQ_TYPE_LEVEL_HIGH>, /* controller interrupt */ + <GIC_SPI 99 IRQ_TYPE_LEVEL_HIGH>; /* MSI interrupt */ + interrupt-names = "intr", "msi"; + + #interrupt-cells = <1>; + interrupt-map-mask = <0 0 0 0>; + interrupt-map = <0 0 0 0 &gic GIC_SPI 98 IRQ_TYPE_LEVEL_HIGH>; + + bus-range = <0x00 0xff>; + #address-cells = <3>; + #size-cells = <2>; + + ranges = <0x82000000 0 0x01000000 0x0 0x01000000 0 0x00001000 /* port 0 configuration space */ + 0x82000000 0 0x01001000 0x0 0x01001000 0 0x00001000 /* port 1 configuration space */ + 0x81000000 0 0x0 0x0 0x12000000 0 0x00010000 /* downstream I/O (64 KiB) */ + 0x82000000 0 0x13000000 0x0 0x13000000 0 0x0d000000 /* non-prefetchable memory (208 MiB) */ + 0xc2000000 0 0x20000000 0x0 0x20000000 0 0x20000000>; /* prefetchable memory (512 MiB) */ + + clocks = <&tegra_car TEGRA210_CLK_PCIE>, + <&tegra_car TEGRA210_CLK_AFI>, + <&tegra_car TEGRA210_CLK_PLL_E>, + <&tegra_car TEGRA210_CLK_CML0>; + clock-names = "pex", "afi", "pll_e", "cml"; + resets = <&tegra_car 70>, + <&tegra_car 72>, + <&tegra_car 74>; + reset-names = "pex", "afi", "pcie_x"; + status = "disabled"; + + pci@1,0 { + device_type = "pci"; + assigned-addresses = <0x82000800 0 0x01000000 0 0x1000>; + reg = <0x000800 0 0 0 0>; + status = "disabled"; + + #address-cells = <3>; + #size-cells = <2>; + ranges; + + nvidia,num-lanes = <4>; + }; + + pci@2,0 { + device_type = "pci"; + assigned-addresses = <0x82001000 0 0x01001000 0 0x1000>; + reg = <0x001000 0 0 0 0>; + status = "disabled"; + + #address-cells = <3>; + #size-cells = <2>; + ranges; + + nvidia,num-lanes = <1>; + }; + }; + host1x@50000000 { compatible = "nvidia,tegra210-host1x", "simple-bus"; reg = <0x0 0x50000000 0x0 0x00034000>; diff --git a/arch/arm64/boot/dts/renesas/r8a7795.dtsi b/arch/arm64/boot/dts/renesas/r8a7795.dtsi index 8c15040f2540..625dda713548 100644 --- a/arch/arm64/boot/dts/renesas/r8a7795.dtsi +++ b/arch/arm64/boot/dts/renesas/r8a7795.dtsi @@ -321,6 +321,11 @@ #power-domain-cells = <0>; }; + rst: reset-controller@e6160000 { + compatible = "renesas,r8a7795-rst"; + reg = <0 0xe6160000 0 0x0200>; + }; + sysc: system-controller@e6180000 { compatible = "renesas,r8a7795-sysc"; reg = <0 0xe6180000 0 0x0400>; diff --git a/arch/arm64/boot/dts/renesas/r8a7796.dtsi b/arch/arm64/boot/dts/renesas/r8a7796.dtsi index 9217da983525..75c8c55a8248 100644 --- a/arch/arm64/boot/dts/renesas/r8a7796.dtsi +++ b/arch/arm64/boot/dts/renesas/r8a7796.dtsi @@ -233,6 +233,11 @@ #power-domain-cells = <0>; }; + rst: reset-controller@e6160000 { + compatible = "renesas,r8a7796-rst"; + reg = <0 0xe6160000 0 0x0200>; + }; + sysc: system-controller@e6180000 { compatible = "renesas,r8a7796-sysc"; reg = <0 0xe6180000 0 0x0400>; diff --git a/arch/arm64/boot/dts/rockchip/rk3368-geekbox.dts b/arch/arm64/boot/dts/rockchip/rk3368-geekbox.dts index 46cdddfcea6c..e5eeca2c2456 100644 --- a/arch/arm64/boot/dts/rockchip/rk3368-geekbox.dts +++ b/arch/arm64/boot/dts/rockchip/rk3368-geekbox.dts @@ -116,7 +116,6 @@ cap-mmc-highspeed; clock-frequency = <150000000>; disable-wp; - keep-power-in-suspend; non-removable; num-slots = <1>; vmmc-supply = <&vcc_io>; @@ -258,8 +257,6 @@ }; vcc_sd: SWITCH_REG1 { - regulator-always-on; - regulator-boot-on; regulator-name = "vcc_sd"; }; diff --git a/arch/arm64/boot/dts/rockchip/rk3368-orion-r68-meta.dts b/arch/arm64/boot/dts/rockchip/rk3368-orion-r68-meta.dts index 5797933ef80e..ea0a8eceefd4 100644 --- a/arch/arm64/boot/dts/rockchip/rk3368-orion-r68-meta.dts +++ b/arch/arm64/boot/dts/rockchip/rk3368-orion-r68-meta.dts @@ -152,8 +152,6 @@ gpio = <&gpio3 11 GPIO_ACTIVE_LOW>; regulator-min-microvolt = <1800000>; regulator-max-microvolt = <3300000>; - regulator-always-on; - regulator-boot-on; vin-supply = <&vcc_io>; }; @@ -201,7 +199,6 @@ bus-width = <8>; cap-mmc-highspeed; disable-wp; - keep-power-in-suspend; mmc-pwrseq = <&emmc_pwrseq>; mmc-hs200-1_2v; mmc-hs200-1_8v; @@ -350,7 +347,6 @@ clock-freq-min-max = <400000 50000000>; cap-sd-highspeed; card-detect-delay = <200>; - keep-power-in-suspend; num-slots = <1>; pinctrl-names = "default"; pinctrl-0 = <&sdmmc_clk &sdmmc_cmd &sdmmc_cd &sdmmc_bus4>; diff --git a/arch/arm64/boot/dts/rockchip/rk3368.dtsi b/arch/arm64/boot/dts/rockchip/rk3368.dtsi index 0fcb2147c9f9..df231c4df5a5 100644 --- a/arch/arm64/boot/dts/rockchip/rk3368.dtsi +++ b/arch/arm64/boot/dts/rockchip/rk3368.dtsi @@ -537,7 +537,6 @@ g-np-tx-fifo-size = <16>; g-rx-fifo-size = <275>; g-tx-fifo-size = <256 128 128 64 64 32>; - g-use-dma; status = "disabled"; }; diff --git a/arch/arm64/boot/dts/rockchip/rk3399.dtsi b/arch/arm64/boot/dts/rockchip/rk3399.dtsi index b65c193dc64e..1e24e455700b 100644 --- a/arch/arm64/boot/dts/rockchip/rk3399.dtsi +++ b/arch/arm64/boot/dts/rockchip/rk3399.dtsi @@ -174,6 +174,7 @@ <GIC_PPI 14 IRQ_TYPE_LEVEL_LOW 0>, <GIC_PPI 11 IRQ_TYPE_LEVEL_LOW 0>, <GIC_PPI 10 IRQ_TYPE_LEVEL_LOW 0>; + arm,no-tick-in-suspend; }; xin24m: xin24m { @@ -300,8 +301,11 @@ ranges = <0x83000000 0x0 0xfa000000 0x0 0xfa000000 0x0 0x600000 0x81000000 0x0 0xfa600000 0x0 0xfa600000 0x0 0x100000>; resets = <&cru SRST_PCIE_CORE>, <&cru SRST_PCIE_MGMT>, - <&cru SRST_PCIE_MGMT_STICKY>, <&cru SRST_PCIE_PIPE>; - reset-names = "core", "mgmt", "mgmt-sticky", "pipe"; + <&cru SRST_PCIE_MGMT_STICKY>, <&cru SRST_PCIE_PIPE>, + <&cru SRST_PCIE_PM>, <&cru SRST_P_PCIE>, + <&cru SRST_A_PCIE>; + reset-names = "core", "mgmt", "mgmt-sticky", "pipe", + "pm", "pclk", "aclk"; status = "disabled"; pcie0_intc: interrupt-controller { diff --git a/arch/arm64/boot/dts/socionext/uniphier-ld20.dtsi b/arch/arm64/boot/dts/socionext/uniphier-ld20.dtsi index 08fd7cf7769c..56a1b2e92cf3 100644 --- a/arch/arm64/boot/dts/socionext/uniphier-ld20.dtsi +++ b/arch/arm64/boot/dts/socionext/uniphier-ld20.dtsi @@ -257,18 +257,18 @@ reg = <0x59801000 0x400>; }; - mioctrl@59810000 { - compatible = "socionext,uniphier-mioctrl", + sdctrl@59810000 { + compatible = "socionext,uniphier-ld20-sdctrl", "simple-mfd", "syscon"; reg = <0x59810000 0x800>; - mio_clk: clock { - compatible = "socionext,uniphier-ld20-mio-clock"; + sd_clk: clock { + compatible = "socionext,uniphier-ld20-sd-clock"; #clock-cells = <1>; }; - mio_rst: reset { - compatible = "socionext,uniphier-ld20-mio-reset"; + sd_rst: reset { + compatible = "socionext,uniphier-ld20-sd-reset"; #reset-cells = <1>; }; }; diff --git a/arch/arm64/boot/dts/zte/zx296718.dtsi b/arch/arm64/boot/dts/zte/zx296718.dtsi index a223066f24ce..dd2cd73c774c 100644 --- a/arch/arm64/boot/dts/zte/zx296718.dtsi +++ b/arch/arm64/boot/dts/zte/zx296718.dtsi @@ -239,16 +239,9 @@ compatible = "arm,gic-v3"; #interrupt-cells = <3>; #address-cells = <0>; - #redistributor-regions = <6>; - redistributor-stride = <0x0 0x40000>; interrupt-controller; reg = <0x02a00000 0x10000>, - <0x02b00000 0x20000>, - <0x02b20000 0x20000>, - <0x02b40000 0x20000>, - <0x02b60000 0x20000>, - <0x02b80000 0x20000>, - <0x02ba0000 0x20000>; + <0x02b00000 0xc0000>; interrupts = <GIC_PPI 9 IRQ_TYPE_LEVEL_HIGH>; }; diff --git a/arch/arm64/configs/defconfig b/arch/arm64/configs/defconfig index 0bba32a91e08..869dded0f09f 100644 --- a/arch/arm64/configs/defconfig +++ b/arch/arm64/configs/defconfig @@ -270,6 +270,7 @@ CONFIG_GPIO_DWAPB=y CONFIG_GPIO_PL061=y CONFIG_GPIO_RCAR=y CONFIG_GPIO_XGENE=y +CONFIG_GPIO_XGENE_SB=y CONFIG_GPIO_PCA953X=y CONFIG_GPIO_PCA953X_IRQ=y CONFIG_GPIO_MAX77620=y diff --git a/arch/arm64/crypto/.gitignore b/arch/arm64/crypto/.gitignore new file mode 100644 index 000000000000..879df8781ed5 --- /dev/null +++ b/arch/arm64/crypto/.gitignore @@ -0,0 +1,2 @@ +sha256-core.S +sha512-core.S diff --git a/arch/arm64/crypto/Kconfig b/arch/arm64/crypto/Kconfig index 2cf32e9887e1..450a85df041a 100644 --- a/arch/arm64/crypto/Kconfig +++ b/arch/arm64/crypto/Kconfig @@ -8,6 +8,14 @@ menuconfig ARM64_CRYPTO if ARM64_CRYPTO +config CRYPTO_SHA256_ARM64 + tristate "SHA-224/SHA-256 digest algorithm for arm64" + select CRYPTO_HASH + +config CRYPTO_SHA512_ARM64 + tristate "SHA-384/SHA-512 digest algorithm for arm64" + select CRYPTO_HASH + config CRYPTO_SHA1_ARM64_CE tristate "SHA-1 digest algorithm (ARMv8 Crypto Extensions)" depends on ARM64 && KERNEL_MODE_NEON @@ -23,6 +31,16 @@ config CRYPTO_GHASH_ARM64_CE depends on ARM64 && KERNEL_MODE_NEON select CRYPTO_HASH +config CRYPTO_CRCT10DIF_ARM64_CE + tristate "CRCT10DIF digest algorithm using PMULL instructions" + depends on KERNEL_MODE_NEON && CRC_T10DIF + select CRYPTO_HASH + +config CRYPTO_CRC32_ARM64_CE + tristate "CRC32 and CRC32C digest algorithms using PMULL instructions" + depends on KERNEL_MODE_NEON && CRC32 + select CRYPTO_HASH + config CRYPTO_AES_ARM64_CE tristate "AES core cipher using ARMv8 Crypto Extensions" depends on ARM64 && KERNEL_MODE_NEON @@ -40,17 +58,18 @@ config CRYPTO_AES_ARM64_CE_BLK depends on ARM64 && KERNEL_MODE_NEON select CRYPTO_BLKCIPHER select CRYPTO_AES_ARM64_CE - select CRYPTO_ABLK_HELPER + select CRYPTO_SIMD config CRYPTO_AES_ARM64_NEON_BLK tristate "AES in ECB/CBC/CTR/XTS modes using NEON instructions" depends on ARM64 && KERNEL_MODE_NEON select CRYPTO_BLKCIPHER select CRYPTO_AES - select CRYPTO_ABLK_HELPER + select CRYPTO_SIMD config CRYPTO_CRC32_ARM64 tristate "CRC32 and CRC32C using optional ARMv8 instructions" depends on ARM64 select CRYPTO_HASH + endif diff --git a/arch/arm64/crypto/Makefile b/arch/arm64/crypto/Makefile index abb79b3cfcfe..aa8888d7b744 100644 --- a/arch/arm64/crypto/Makefile +++ b/arch/arm64/crypto/Makefile @@ -17,6 +17,12 @@ sha2-ce-y := sha2-ce-glue.o sha2-ce-core.o obj-$(CONFIG_CRYPTO_GHASH_ARM64_CE) += ghash-ce.o ghash-ce-y := ghash-ce-glue.o ghash-ce-core.o +obj-$(CONFIG_CRYPTO_CRCT10DIF_ARM64_CE) += crct10dif-ce.o +crct10dif-ce-y := crct10dif-ce-core.o crct10dif-ce-glue.o + +obj-$(CONFIG_CRYPTO_CRC32_ARM64_CE) += crc32-ce.o +crc32-ce-y:= crc32-ce-core.o crc32-ce-glue.o + obj-$(CONFIG_CRYPTO_AES_ARM64_CE) += aes-ce-cipher.o CFLAGS_aes-ce-cipher.o += -march=armv8-a+crypto @@ -29,6 +35,12 @@ aes-ce-blk-y := aes-glue-ce.o aes-ce.o obj-$(CONFIG_CRYPTO_AES_ARM64_NEON_BLK) += aes-neon-blk.o aes-neon-blk-y := aes-glue-neon.o aes-neon.o +obj-$(CONFIG_CRYPTO_SHA256_ARM64) += sha256-arm64.o +sha256-arm64-y := sha256-glue.o sha256-core.o + +obj-$(CONFIG_CRYPTO_SHA512_ARM64) += sha512-arm64.o +sha512-arm64-y := sha512-glue.o sha512-core.o + AFLAGS_aes-ce.o := -DINTERLEAVE=4 AFLAGS_aes-neon.o := -DINTERLEAVE=4 @@ -40,3 +52,14 @@ CFLAGS_crc32-arm64.o := -mcpu=generic+crc $(obj)/aes-glue-%.o: $(src)/aes-glue.c FORCE $(call if_changed_rule,cc_o_c) + +quiet_cmd_perlasm = PERLASM $@ + cmd_perlasm = $(PERL) $(<) void $(@) + +$(src)/sha256-core.S_shipped: $(src)/sha512-armv8.pl + $(call cmd,perlasm) + +$(src)/sha512-core.S_shipped: $(src)/sha512-armv8.pl + $(call cmd,perlasm) + +.PRECIOUS: $(obj)/sha256-core.S $(obj)/sha512-core.S diff --git a/arch/arm64/crypto/aes-ce-ccm-core.S b/arch/arm64/crypto/aes-ce-ccm-core.S index a2a7fbcacc14..3363560c79b7 100644 --- a/arch/arm64/crypto/aes-ce-ccm-core.S +++ b/arch/arm64/crypto/aes-ce-ccm-core.S @@ -9,6 +9,7 @@ */ #include <linux/linkage.h> +#include <asm/assembler.h> .text .arch armv8-a+crypto @@ -19,7 +20,7 @@ */ ENTRY(ce_aes_ccm_auth_data) ldr w8, [x3] /* leftover from prev round? */ - ld1 {v0.2d}, [x0] /* load mac */ + ld1 {v0.16b}, [x0] /* load mac */ cbz w8, 1f sub w8, w8, #16 eor v1.16b, v1.16b, v1.16b @@ -31,7 +32,7 @@ ENTRY(ce_aes_ccm_auth_data) beq 8f /* out of input? */ cbnz w8, 0b eor v0.16b, v0.16b, v1.16b -1: ld1 {v3.2d}, [x4] /* load first round key */ +1: ld1 {v3.16b}, [x4] /* load first round key */ prfm pldl1strm, [x1] cmp w5, #12 /* which key size? */ add x6, x4, #16 @@ -41,17 +42,17 @@ ENTRY(ce_aes_ccm_auth_data) mov v5.16b, v3.16b b 4f 2: mov v4.16b, v3.16b - ld1 {v5.2d}, [x6], #16 /* load 2nd round key */ + ld1 {v5.16b}, [x6], #16 /* load 2nd round key */ 3: aese v0.16b, v4.16b aesmc v0.16b, v0.16b -4: ld1 {v3.2d}, [x6], #16 /* load next round key */ +4: ld1 {v3.16b}, [x6], #16 /* load next round key */ aese v0.16b, v5.16b aesmc v0.16b, v0.16b -5: ld1 {v4.2d}, [x6], #16 /* load next round key */ +5: ld1 {v4.16b}, [x6], #16 /* load next round key */ subs w7, w7, #3 aese v0.16b, v3.16b aesmc v0.16b, v0.16b - ld1 {v5.2d}, [x6], #16 /* load next round key */ + ld1 {v5.16b}, [x6], #16 /* load next round key */ bpl 3b aese v0.16b, v4.16b subs w2, w2, #16 /* last data? */ @@ -60,7 +61,7 @@ ENTRY(ce_aes_ccm_auth_data) ld1 {v1.16b}, [x1], #16 /* load next input block */ eor v0.16b, v0.16b, v1.16b /* xor with mac */ bne 1b -6: st1 {v0.2d}, [x0] /* store mac */ +6: st1 {v0.16b}, [x0] /* store mac */ beq 10f adds w2, w2, #16 beq 10f @@ -79,7 +80,7 @@ ENTRY(ce_aes_ccm_auth_data) adds w7, w7, #1 bne 9b eor v0.16b, v0.16b, v1.16b - st1 {v0.2d}, [x0] + st1 {v0.16b}, [x0] 10: str w8, [x3] ret ENDPROC(ce_aes_ccm_auth_data) @@ -89,27 +90,27 @@ ENDPROC(ce_aes_ccm_auth_data) * u32 rounds); */ ENTRY(ce_aes_ccm_final) - ld1 {v3.2d}, [x2], #16 /* load first round key */ - ld1 {v0.2d}, [x0] /* load mac */ + ld1 {v3.16b}, [x2], #16 /* load first round key */ + ld1 {v0.16b}, [x0] /* load mac */ cmp w3, #12 /* which key size? */ sub w3, w3, #2 /* modified # of rounds */ - ld1 {v1.2d}, [x1] /* load 1st ctriv */ + ld1 {v1.16b}, [x1] /* load 1st ctriv */ bmi 0f bne 3f mov v5.16b, v3.16b b 2f 0: mov v4.16b, v3.16b -1: ld1 {v5.2d}, [x2], #16 /* load next round key */ +1: ld1 {v5.16b}, [x2], #16 /* load next round key */ aese v0.16b, v4.16b aesmc v0.16b, v0.16b aese v1.16b, v4.16b aesmc v1.16b, v1.16b -2: ld1 {v3.2d}, [x2], #16 /* load next round key */ +2: ld1 {v3.16b}, [x2], #16 /* load next round key */ aese v0.16b, v5.16b aesmc v0.16b, v0.16b aese v1.16b, v5.16b aesmc v1.16b, v1.16b -3: ld1 {v4.2d}, [x2], #16 /* load next round key */ +3: ld1 {v4.16b}, [x2], #16 /* load next round key */ subs w3, w3, #3 aese v0.16b, v3.16b aesmc v0.16b, v0.16b @@ -120,47 +121,47 @@ ENTRY(ce_aes_ccm_final) aese v1.16b, v4.16b /* final round key cancels out */ eor v0.16b, v0.16b, v1.16b /* en-/decrypt the mac */ - st1 {v0.2d}, [x0] /* store result */ + st1 {v0.16b}, [x0] /* store result */ ret ENDPROC(ce_aes_ccm_final) .macro aes_ccm_do_crypt,enc ldr x8, [x6, #8] /* load lower ctr */ - ld1 {v0.2d}, [x5] /* load mac */ - rev x8, x8 /* keep swabbed ctr in reg */ + ld1 {v0.16b}, [x5] /* load mac */ +CPU_LE( rev x8, x8 ) /* keep swabbed ctr in reg */ 0: /* outer loop */ - ld1 {v1.1d}, [x6] /* load upper ctr */ + ld1 {v1.8b}, [x6] /* load upper ctr */ prfm pldl1strm, [x1] add x8, x8, #1 rev x9, x8 cmp w4, #12 /* which key size? */ sub w7, w4, #2 /* get modified # of rounds */ ins v1.d[1], x9 /* no carry in lower ctr */ - ld1 {v3.2d}, [x3] /* load first round key */ + ld1 {v3.16b}, [x3] /* load first round key */ add x10, x3, #16 bmi 1f bne 4f mov v5.16b, v3.16b b 3f 1: mov v4.16b, v3.16b - ld1 {v5.2d}, [x10], #16 /* load 2nd round key */ + ld1 {v5.16b}, [x10], #16 /* load 2nd round key */ 2: /* inner loop: 3 rounds, 2x interleaved */ aese v0.16b, v4.16b aesmc v0.16b, v0.16b aese v1.16b, v4.16b aesmc v1.16b, v1.16b -3: ld1 {v3.2d}, [x10], #16 /* load next round key */ +3: ld1 {v3.16b}, [x10], #16 /* load next round key */ aese v0.16b, v5.16b aesmc v0.16b, v0.16b aese v1.16b, v5.16b aesmc v1.16b, v1.16b -4: ld1 {v4.2d}, [x10], #16 /* load next round key */ +4: ld1 {v4.16b}, [x10], #16 /* load next round key */ subs w7, w7, #3 aese v0.16b, v3.16b aesmc v0.16b, v0.16b aese v1.16b, v3.16b aesmc v1.16b, v1.16b - ld1 {v5.2d}, [x10], #16 /* load next round key */ + ld1 {v5.16b}, [x10], #16 /* load next round key */ bpl 2b aese v0.16b, v4.16b aese v1.16b, v4.16b @@ -177,14 +178,14 @@ ENDPROC(ce_aes_ccm_final) eor v0.16b, v0.16b, v2.16b /* xor mac with pt ^ rk[last] */ st1 {v1.16b}, [x0], #16 /* write output block */ bne 0b - rev x8, x8 - st1 {v0.2d}, [x5] /* store mac */ +CPU_LE( rev x8, x8 ) + st1 {v0.16b}, [x5] /* store mac */ str x8, [x6, #8] /* store lsb end of ctr (BE) */ 5: ret 6: eor v0.16b, v0.16b, v5.16b /* final round mac */ eor v1.16b, v1.16b, v5.16b /* final round enc */ - st1 {v0.2d}, [x5] /* store mac */ + st1 {v0.16b}, [x5] /* store mac */ add w2, w2, #16 /* process partial tail block */ 7: ldrb w9, [x1], #1 /* get 1 byte of input */ umov w6, v1.b[0] /* get top crypted ctr byte */ diff --git a/arch/arm64/crypto/aes-ce-ccm-glue.c b/arch/arm64/crypto/aes-ce-ccm-glue.c index f4bf2f2a014c..cc5515dac74a 100644 --- a/arch/arm64/crypto/aes-ce-ccm-glue.c +++ b/arch/arm64/crypto/aes-ce-ccm-glue.c @@ -11,9 +11,9 @@ #include <asm/neon.h> #include <asm/unaligned.h> #include <crypto/aes.h> -#include <crypto/algapi.h> #include <crypto/scatterwalk.h> #include <crypto/internal/aead.h> +#include <crypto/internal/skcipher.h> #include <linux/module.h> #include "aes-ce-setkey.h" @@ -149,12 +149,7 @@ static int ccm_encrypt(struct aead_request *req) { struct crypto_aead *aead = crypto_aead_reqtfm(req); struct crypto_aes_ctx *ctx = crypto_aead_ctx(aead); - struct blkcipher_desc desc = { .info = req->iv }; - struct blkcipher_walk walk; - struct scatterlist srcbuf[2]; - struct scatterlist dstbuf[2]; - struct scatterlist *src; - struct scatterlist *dst; + struct skcipher_walk walk; u8 __aligned(8) mac[AES_BLOCK_SIZE]; u8 buf[AES_BLOCK_SIZE]; u32 len = req->cryptlen; @@ -172,27 +167,19 @@ static int ccm_encrypt(struct aead_request *req) /* preserve the original iv for the final round */ memcpy(buf, req->iv, AES_BLOCK_SIZE); - src = scatterwalk_ffwd(srcbuf, req->src, req->assoclen); - dst = src; - if (req->src != req->dst) - dst = scatterwalk_ffwd(dstbuf, req->dst, req->assoclen); - - blkcipher_walk_init(&walk, dst, src, len); - err = blkcipher_aead_walk_virt_block(&desc, &walk, aead, - AES_BLOCK_SIZE); + err = skcipher_walk_aead_encrypt(&walk, req, true); while (walk.nbytes) { u32 tail = walk.nbytes % AES_BLOCK_SIZE; - if (walk.nbytes == len) + if (walk.nbytes == walk.total) tail = 0; ce_aes_ccm_encrypt(walk.dst.virt.addr, walk.src.virt.addr, walk.nbytes - tail, ctx->key_enc, num_rounds(ctx), mac, walk.iv); - len -= walk.nbytes - tail; - err = blkcipher_walk_done(&desc, &walk, tail); + err = skcipher_walk_done(&walk, tail); } if (!err) ce_aes_ccm_final(mac, buf, ctx->key_enc, num_rounds(ctx)); @@ -203,7 +190,7 @@ static int ccm_encrypt(struct aead_request *req) return err; /* copy authtag to end of dst */ - scatterwalk_map_and_copy(mac, dst, req->cryptlen, + scatterwalk_map_and_copy(mac, req->dst, req->assoclen + req->cryptlen, crypto_aead_authsize(aead), 1); return 0; @@ -214,12 +201,7 @@ static int ccm_decrypt(struct aead_request *req) struct crypto_aead *aead = crypto_aead_reqtfm(req); struct crypto_aes_ctx *ctx = crypto_aead_ctx(aead); unsigned int authsize = crypto_aead_authsize(aead); - struct blkcipher_desc desc = { .info = req->iv }; - struct blkcipher_walk walk; - struct scatterlist srcbuf[2]; - struct scatterlist dstbuf[2]; - struct scatterlist *src; - struct scatterlist *dst; + struct skcipher_walk walk; u8 __aligned(8) mac[AES_BLOCK_SIZE]; u8 buf[AES_BLOCK_SIZE]; u32 len = req->cryptlen - authsize; @@ -237,27 +219,19 @@ static int ccm_decrypt(struct aead_request *req) /* preserve the original iv for the final round */ memcpy(buf, req->iv, AES_BLOCK_SIZE); - src = scatterwalk_ffwd(srcbuf, req->src, req->assoclen); - dst = src; - if (req->src != req->dst) - dst = scatterwalk_ffwd(dstbuf, req->dst, req->assoclen); - - blkcipher_walk_init(&walk, dst, src, len); - err = blkcipher_aead_walk_virt_block(&desc, &walk, aead, - AES_BLOCK_SIZE); + err = skcipher_walk_aead_decrypt(&walk, req, true); while (walk.nbytes) { u32 tail = walk.nbytes % AES_BLOCK_SIZE; - if (walk.nbytes == len) + if (walk.nbytes == walk.total) tail = 0; ce_aes_ccm_decrypt(walk.dst.virt.addr, walk.src.virt.addr, walk.nbytes - tail, ctx->key_enc, num_rounds(ctx), mac, walk.iv); - len -= walk.nbytes - tail; - err = blkcipher_walk_done(&desc, &walk, tail); + err = skcipher_walk_done(&walk, tail); } if (!err) ce_aes_ccm_final(mac, buf, ctx->key_enc, num_rounds(ctx)); @@ -268,7 +242,8 @@ static int ccm_decrypt(struct aead_request *req) return err; /* compare calculated auth tag with the stored one */ - scatterwalk_map_and_copy(buf, src, req->cryptlen - authsize, + scatterwalk_map_and_copy(buf, req->src, + req->assoclen + req->cryptlen - authsize, authsize, 0); if (crypto_memneq(mac, buf, authsize)) @@ -287,6 +262,7 @@ static struct aead_alg ccm_aes_alg = { .cra_module = THIS_MODULE, }, .ivsize = AES_BLOCK_SIZE, + .chunksize = AES_BLOCK_SIZE, .maxauthsize = AES_BLOCK_SIZE, .setkey = ccm_setkey, .setauthsize = ccm_setauthsize, diff --git a/arch/arm64/crypto/aes-ce-cipher.c b/arch/arm64/crypto/aes-ce-cipher.c index f7bd9bf0bbb3..50d9fe11d0c8 100644 --- a/arch/arm64/crypto/aes-ce-cipher.c +++ b/arch/arm64/crypto/aes-ce-cipher.c @@ -47,24 +47,24 @@ static void aes_cipher_encrypt(struct crypto_tfm *tfm, u8 dst[], u8 const src[]) kernel_neon_begin_partial(4); __asm__(" ld1 {v0.16b}, %[in] ;" - " ld1 {v1.2d}, [%[key]], #16 ;" + " ld1 {v1.16b}, [%[key]], #16 ;" " cmp %w[rounds], #10 ;" " bmi 0f ;" " bne 3f ;" " mov v3.16b, v1.16b ;" " b 2f ;" "0: mov v2.16b, v1.16b ;" - " ld1 {v3.2d}, [%[key]], #16 ;" + " ld1 {v3.16b}, [%[key]], #16 ;" "1: aese v0.16b, v2.16b ;" " aesmc v0.16b, v0.16b ;" - "2: ld1 {v1.2d}, [%[key]], #16 ;" + "2: ld1 {v1.16b}, [%[key]], #16 ;" " aese v0.16b, v3.16b ;" " aesmc v0.16b, v0.16b ;" - "3: ld1 {v2.2d}, [%[key]], #16 ;" + "3: ld1 {v2.16b}, [%[key]], #16 ;" " subs %w[rounds], %w[rounds], #3 ;" " aese v0.16b, v1.16b ;" " aesmc v0.16b, v0.16b ;" - " ld1 {v3.2d}, [%[key]], #16 ;" + " ld1 {v3.16b}, [%[key]], #16 ;" " bpl 1b ;" " aese v0.16b, v2.16b ;" " eor v0.16b, v0.16b, v3.16b ;" @@ -92,24 +92,24 @@ static void aes_cipher_decrypt(struct crypto_tfm *tfm, u8 dst[], u8 const src[]) kernel_neon_begin_partial(4); __asm__(" ld1 {v0.16b}, %[in] ;" - " ld1 {v1.2d}, [%[key]], #16 ;" + " ld1 {v1.16b}, [%[key]], #16 ;" " cmp %w[rounds], #10 ;" " bmi 0f ;" " bne 3f ;" " mov v3.16b, v1.16b ;" " b 2f ;" "0: mov v2.16b, v1.16b ;" - " ld1 {v3.2d}, [%[key]], #16 ;" + " ld1 {v3.16b}, [%[key]], #16 ;" "1: aesd v0.16b, v2.16b ;" " aesimc v0.16b, v0.16b ;" - "2: ld1 {v1.2d}, [%[key]], #16 ;" + "2: ld1 {v1.16b}, [%[key]], #16 ;" " aesd v0.16b, v3.16b ;" " aesimc v0.16b, v0.16b ;" - "3: ld1 {v2.2d}, [%[key]], #16 ;" + "3: ld1 {v2.16b}, [%[key]], #16 ;" " subs %w[rounds], %w[rounds], #3 ;" " aesd v0.16b, v1.16b ;" " aesimc v0.16b, v0.16b ;" - " ld1 {v3.2d}, [%[key]], #16 ;" + " ld1 {v3.16b}, [%[key]], #16 ;" " bpl 1b ;" " aesd v0.16b, v2.16b ;" " eor v0.16b, v0.16b, v3.16b ;" @@ -173,7 +173,12 @@ int ce_aes_expandkey(struct crypto_aes_ctx *ctx, const u8 *in_key, u32 *rki = ctx->key_enc + (i * kwords); u32 *rko = rki + kwords; +#ifndef CONFIG_CPU_BIG_ENDIAN rko[0] = ror32(aes_sub(rki[kwords - 1]), 8) ^ rcon[i] ^ rki[0]; +#else + rko[0] = rol32(aes_sub(rki[kwords - 1]), 8) ^ (rcon[i] << 24) ^ + rki[0]; +#endif rko[1] = rko[0] ^ rki[1]; rko[2] = rko[1] ^ rki[2]; rko[3] = rko[2] ^ rki[3]; diff --git a/arch/arm64/crypto/aes-ce.S b/arch/arm64/crypto/aes-ce.S index 78f3cfe92c08..b46093d567e5 100644 --- a/arch/arm64/crypto/aes-ce.S +++ b/arch/arm64/crypto/aes-ce.S @@ -10,6 +10,7 @@ */ #include <linux/linkage.h> +#include <asm/assembler.h> #define AES_ENTRY(func) ENTRY(ce_ ## func) #define AES_ENDPROC(func) ENDPROC(ce_ ## func) diff --git a/arch/arm64/crypto/aes-glue.c b/arch/arm64/crypto/aes-glue.c index 6b2aa0fd6cd0..4e3f8adb1793 100644 --- a/arch/arm64/crypto/aes-glue.c +++ b/arch/arm64/crypto/aes-glue.c @@ -11,8 +11,8 @@ #include <asm/neon.h> #include <asm/hwcap.h> #include <crypto/aes.h> -#include <crypto/ablk_helper.h> -#include <crypto/algapi.h> +#include <crypto/internal/simd.h> +#include <crypto/internal/skcipher.h> #include <linux/module.h> #include <linux/cpufeature.h> #include <crypto/xts.h> @@ -80,13 +80,19 @@ struct crypto_aes_xts_ctx { struct crypto_aes_ctx __aligned(8) key2; }; -static int xts_set_key(struct crypto_tfm *tfm, const u8 *in_key, +static int skcipher_aes_setkey(struct crypto_skcipher *tfm, const u8 *in_key, + unsigned int key_len) +{ + return aes_setkey(crypto_skcipher_tfm(tfm), in_key, key_len); +} + +static int xts_set_key(struct crypto_skcipher *tfm, const u8 *in_key, unsigned int key_len) { - struct crypto_aes_xts_ctx *ctx = crypto_tfm_ctx(tfm); + struct crypto_aes_xts_ctx *ctx = crypto_skcipher_ctx(tfm); int ret; - ret = xts_check_key(tfm, in_key, key_len); + ret = xts_verify_key(tfm, in_key, key_len); if (ret) return ret; @@ -97,111 +103,101 @@ static int xts_set_key(struct crypto_tfm *tfm, const u8 *in_key, if (!ret) return 0; - tfm->crt_flags |= CRYPTO_TFM_RES_BAD_KEY_LEN; + crypto_skcipher_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN); return -EINVAL; } -static int ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, - struct scatterlist *src, unsigned int nbytes) +static int ecb_encrypt(struct skcipher_request *req) { - struct crypto_aes_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct crypto_aes_ctx *ctx = crypto_skcipher_ctx(tfm); int err, first, rounds = 6 + ctx->key_length / 4; - struct blkcipher_walk walk; + struct skcipher_walk walk; unsigned int blocks; - desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; - blkcipher_walk_init(&walk, dst, src, nbytes); - err = blkcipher_walk_virt(desc, &walk); + err = skcipher_walk_virt(&walk, req, true); kernel_neon_begin(); for (first = 1; (blocks = (walk.nbytes / AES_BLOCK_SIZE)); first = 0) { aes_ecb_encrypt(walk.dst.virt.addr, walk.src.virt.addr, (u8 *)ctx->key_enc, rounds, blocks, first); - err = blkcipher_walk_done(desc, &walk, walk.nbytes % AES_BLOCK_SIZE); + err = skcipher_walk_done(&walk, walk.nbytes % AES_BLOCK_SIZE); } kernel_neon_end(); return err; } -static int ecb_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, - struct scatterlist *src, unsigned int nbytes) +static int ecb_decrypt(struct skcipher_request *req) { - struct crypto_aes_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct crypto_aes_ctx *ctx = crypto_skcipher_ctx(tfm); int err, first, rounds = 6 + ctx->key_length / 4; - struct blkcipher_walk walk; + struct skcipher_walk walk; unsigned int blocks; - desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; - blkcipher_walk_init(&walk, dst, src, nbytes); - err = blkcipher_walk_virt(desc, &walk); + err = skcipher_walk_virt(&walk, req, true); kernel_neon_begin(); for (first = 1; (blocks = (walk.nbytes / AES_BLOCK_SIZE)); first = 0) { aes_ecb_decrypt(walk.dst.virt.addr, walk.src.virt.addr, (u8 *)ctx->key_dec, rounds, blocks, first); - err = blkcipher_walk_done(desc, &walk, walk.nbytes % AES_BLOCK_SIZE); + err = skcipher_walk_done(&walk, walk.nbytes % AES_BLOCK_SIZE); } kernel_neon_end(); return err; } -static int cbc_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, - struct scatterlist *src, unsigned int nbytes) +static int cbc_encrypt(struct skcipher_request *req) { - struct crypto_aes_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct crypto_aes_ctx *ctx = crypto_skcipher_ctx(tfm); int err, first, rounds = 6 + ctx->key_length / 4; - struct blkcipher_walk walk; + struct skcipher_walk walk; unsigned int blocks; - desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; - blkcipher_walk_init(&walk, dst, src, nbytes); - err = blkcipher_walk_virt(desc, &walk); + err = skcipher_walk_virt(&walk, req, true); kernel_neon_begin(); for (first = 1; (blocks = (walk.nbytes / AES_BLOCK_SIZE)); first = 0) { aes_cbc_encrypt(walk.dst.virt.addr, walk.src.virt.addr, (u8 *)ctx->key_enc, rounds, blocks, walk.iv, first); - err = blkcipher_walk_done(desc, &walk, walk.nbytes % AES_BLOCK_SIZE); + err = skcipher_walk_done(&walk, walk.nbytes % AES_BLOCK_SIZE); } kernel_neon_end(); return err; } -static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, - struct scatterlist *src, unsigned int nbytes) +static int cbc_decrypt(struct skcipher_request *req) { - struct crypto_aes_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct crypto_aes_ctx *ctx = crypto_skcipher_ctx(tfm); int err, first, rounds = 6 + ctx->key_length / 4; - struct blkcipher_walk walk; + struct skcipher_walk walk; unsigned int blocks; - desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; - blkcipher_walk_init(&walk, dst, src, nbytes); - err = blkcipher_walk_virt(desc, &walk); + err = skcipher_walk_virt(&walk, req, true); kernel_neon_begin(); for (first = 1; (blocks = (walk.nbytes / AES_BLOCK_SIZE)); first = 0) { aes_cbc_decrypt(walk.dst.virt.addr, walk.src.virt.addr, (u8 *)ctx->key_dec, rounds, blocks, walk.iv, first); - err = blkcipher_walk_done(desc, &walk, walk.nbytes % AES_BLOCK_SIZE); + err = skcipher_walk_done(&walk, walk.nbytes % AES_BLOCK_SIZE); } kernel_neon_end(); return err; } -static int ctr_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, - struct scatterlist *src, unsigned int nbytes) +static int ctr_encrypt(struct skcipher_request *req) { - struct crypto_aes_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct crypto_aes_ctx *ctx = crypto_skcipher_ctx(tfm); int err, first, rounds = 6 + ctx->key_length / 4; - struct blkcipher_walk walk; + struct skcipher_walk walk; int blocks; - desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; - blkcipher_walk_init(&walk, dst, src, nbytes); - err = blkcipher_walk_virt_block(desc, &walk, AES_BLOCK_SIZE); + err = skcipher_walk_virt(&walk, req, true); first = 1; kernel_neon_begin(); @@ -209,17 +205,14 @@ static int ctr_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, aes_ctr_encrypt(walk.dst.virt.addr, walk.src.virt.addr, (u8 *)ctx->key_enc, rounds, blocks, walk.iv, first); + err = skcipher_walk_done(&walk, walk.nbytes % AES_BLOCK_SIZE); first = 0; - nbytes -= blocks * AES_BLOCK_SIZE; - if (nbytes && nbytes == walk.nbytes % AES_BLOCK_SIZE) - break; - err = blkcipher_walk_done(desc, &walk, - walk.nbytes % AES_BLOCK_SIZE); } - if (walk.nbytes % AES_BLOCK_SIZE) { - u8 *tdst = walk.dst.virt.addr + blocks * AES_BLOCK_SIZE; - u8 *tsrc = walk.src.virt.addr + blocks * AES_BLOCK_SIZE; + if (walk.nbytes) { u8 __aligned(8) tail[AES_BLOCK_SIZE]; + unsigned int nbytes = walk.nbytes; + u8 *tdst = walk.dst.virt.addr; + u8 *tsrc = walk.src.virt.addr; /* * Minimum alignment is 8 bytes, so if nbytes is <= 8, we need @@ -230,227 +223,169 @@ static int ctr_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, aes_ctr_encrypt(tail, tsrc, (u8 *)ctx->key_enc, rounds, blocks, walk.iv, first); memcpy(tdst, tail, nbytes); - err = blkcipher_walk_done(desc, &walk, 0); + err = skcipher_walk_done(&walk, 0); } kernel_neon_end(); return err; } -static int xts_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, - struct scatterlist *src, unsigned int nbytes) +static int xts_encrypt(struct skcipher_request *req) { - struct crypto_aes_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct crypto_aes_xts_ctx *ctx = crypto_skcipher_ctx(tfm); int err, first, rounds = 6 + ctx->key1.key_length / 4; - struct blkcipher_walk walk; + struct skcipher_walk walk; unsigned int blocks; - desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; - blkcipher_walk_init(&walk, dst, src, nbytes); - err = blkcipher_walk_virt(desc, &walk); + err = skcipher_walk_virt(&walk, req, true); kernel_neon_begin(); for (first = 1; (blocks = (walk.nbytes / AES_BLOCK_SIZE)); first = 0) { aes_xts_encrypt(walk.dst.virt.addr, walk.src.virt.addr, (u8 *)ctx->key1.key_enc, rounds, blocks, (u8 *)ctx->key2.key_enc, walk.iv, first); - err = blkcipher_walk_done(desc, &walk, walk.nbytes % AES_BLOCK_SIZE); + err = skcipher_walk_done(&walk, walk.nbytes % AES_BLOCK_SIZE); } kernel_neon_end(); return err; } -static int xts_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, - struct scatterlist *src, unsigned int nbytes) +static int xts_decrypt(struct skcipher_request *req) { - struct crypto_aes_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct crypto_aes_xts_ctx *ctx = crypto_skcipher_ctx(tfm); int err, first, rounds = 6 + ctx->key1.key_length / 4; - struct blkcipher_walk walk; + struct skcipher_walk walk; unsigned int blocks; - desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; - blkcipher_walk_init(&walk, dst, src, nbytes); - err = blkcipher_walk_virt(desc, &walk); + err = skcipher_walk_virt(&walk, req, true); kernel_neon_begin(); for (first = 1; (blocks = (walk.nbytes / AES_BLOCK_SIZE)); first = 0) { aes_xts_decrypt(walk.dst.virt.addr, walk.src.virt.addr, (u8 *)ctx->key1.key_dec, rounds, blocks, (u8 *)ctx->key2.key_enc, walk.iv, first); - err = blkcipher_walk_done(desc, &walk, walk.nbytes % AES_BLOCK_SIZE); + err = skcipher_walk_done(&walk, walk.nbytes % AES_BLOCK_SIZE); } kernel_neon_end(); return err; } -static struct crypto_alg aes_algs[] = { { - .cra_name = "__ecb-aes-" MODE, - .cra_driver_name = "__driver-ecb-aes-" MODE, - .cra_priority = 0, - .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER | - CRYPTO_ALG_INTERNAL, - .cra_blocksize = AES_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct crypto_aes_ctx), - .cra_alignmask = 7, - .cra_type = &crypto_blkcipher_type, - .cra_module = THIS_MODULE, - .cra_blkcipher = { - .min_keysize = AES_MIN_KEY_SIZE, - .max_keysize = AES_MAX_KEY_SIZE, - .ivsize = 0, - .setkey = aes_setkey, - .encrypt = ecb_encrypt, - .decrypt = ecb_decrypt, +static struct skcipher_alg aes_algs[] = { { + .base = { + .cra_name = "__ecb(aes)", + .cra_driver_name = "__ecb-aes-" MODE, + .cra_priority = PRIO, + .cra_flags = CRYPTO_ALG_INTERNAL, + .cra_blocksize = AES_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct crypto_aes_ctx), + .cra_alignmask = 7, + .cra_module = THIS_MODULE, }, + .min_keysize = AES_MIN_KEY_SIZE, + .max_keysize = AES_MAX_KEY_SIZE, + .setkey = skcipher_aes_setkey, + .encrypt = ecb_encrypt, + .decrypt = ecb_decrypt, }, { - .cra_name = "__cbc-aes-" MODE, - .cra_driver_name = "__driver-cbc-aes-" MODE, - .cra_priority = 0, - .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER | - CRYPTO_ALG_INTERNAL, - .cra_blocksize = AES_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct crypto_aes_ctx), - .cra_alignmask = 7, - .cra_type = &crypto_blkcipher_type, - .cra_module = THIS_MODULE, - .cra_blkcipher = { - .min_keysize = AES_MIN_KEY_SIZE, - .max_keysize = AES_MAX_KEY_SIZE, - .ivsize = AES_BLOCK_SIZE, - .setkey = aes_setkey, - .encrypt = cbc_encrypt, - .decrypt = cbc_decrypt, + .base = { + .cra_name = "__cbc(aes)", + .cra_driver_name = "__cbc-aes-" MODE, + .cra_priority = PRIO, + .cra_flags = CRYPTO_ALG_INTERNAL, + .cra_blocksize = AES_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct crypto_aes_ctx), + .cra_alignmask = 7, + .cra_module = THIS_MODULE, }, + .min_keysize = AES_MIN_KEY_SIZE, + .max_keysize = AES_MAX_KEY_SIZE, + .ivsize = AES_BLOCK_SIZE, + .setkey = skcipher_aes_setkey, + .encrypt = cbc_encrypt, + .decrypt = cbc_decrypt, }, { - .cra_name = "__ctr-aes-" MODE, - .cra_driver_name = "__driver-ctr-aes-" MODE, - .cra_priority = 0, - .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER | - CRYPTO_ALG_INTERNAL, - .cra_blocksize = 1, - .cra_ctxsize = sizeof(struct crypto_aes_ctx), - .cra_alignmask = 7, - .cra_type = &crypto_blkcipher_type, - .cra_module = THIS_MODULE, - .cra_blkcipher = { - .min_keysize = AES_MIN_KEY_SIZE, - .max_keysize = AES_MAX_KEY_SIZE, - .ivsize = AES_BLOCK_SIZE, - .setkey = aes_setkey, - .encrypt = ctr_encrypt, - .decrypt = ctr_encrypt, + .base = { + .cra_name = "__ctr(aes)", + .cra_driver_name = "__ctr-aes-" MODE, + .cra_priority = PRIO, + .cra_flags = CRYPTO_ALG_INTERNAL, + .cra_blocksize = 1, + .cra_ctxsize = sizeof(struct crypto_aes_ctx), + .cra_alignmask = 7, + .cra_module = THIS_MODULE, }, + .min_keysize = AES_MIN_KEY_SIZE, + .max_keysize = AES_MAX_KEY_SIZE, + .ivsize = AES_BLOCK_SIZE, + .chunksize = AES_BLOCK_SIZE, + .setkey = skcipher_aes_setkey, + .encrypt = ctr_encrypt, + .decrypt = ctr_encrypt, }, { - .cra_name = "__xts-aes-" MODE, - .cra_driver_name = "__driver-xts-aes-" MODE, - .cra_priority = 0, - .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER | - CRYPTO_ALG_INTERNAL, - .cra_blocksize = AES_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct crypto_aes_xts_ctx), - .cra_alignmask = 7, - .cra_type = &crypto_blkcipher_type, - .cra_module = THIS_MODULE, - .cra_blkcipher = { - .min_keysize = 2 * AES_MIN_KEY_SIZE, - .max_keysize = 2 * AES_MAX_KEY_SIZE, - .ivsize = AES_BLOCK_SIZE, - .setkey = xts_set_key, - .encrypt = xts_encrypt, - .decrypt = xts_decrypt, + .base = { + .cra_name = "__xts(aes)", + .cra_driver_name = "__xts-aes-" MODE, + .cra_priority = PRIO, + .cra_flags = CRYPTO_ALG_INTERNAL, + .cra_blocksize = AES_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct crypto_aes_xts_ctx), + .cra_alignmask = 7, + .cra_module = THIS_MODULE, }, -}, { - .cra_name = "ecb(aes)", - .cra_driver_name = "ecb-aes-" MODE, - .cra_priority = PRIO, - .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER|CRYPTO_ALG_ASYNC, - .cra_blocksize = AES_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct async_helper_ctx), - .cra_alignmask = 7, - .cra_type = &crypto_ablkcipher_type, - .cra_module = THIS_MODULE, - .cra_init = ablk_init, - .cra_exit = ablk_exit, - .cra_ablkcipher = { - .min_keysize = AES_MIN_KEY_SIZE, - .max_keysize = AES_MAX_KEY_SIZE, - .ivsize = 0, - .setkey = ablk_set_key, - .encrypt = ablk_encrypt, - .decrypt = ablk_decrypt, - } -}, { - .cra_name = "cbc(aes)", - .cra_driver_name = "cbc-aes-" MODE, - .cra_priority = PRIO, - .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER|CRYPTO_ALG_ASYNC, - .cra_blocksize = AES_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct async_helper_ctx), - .cra_alignmask = 7, - .cra_type = &crypto_ablkcipher_type, - .cra_module = THIS_MODULE, - .cra_init = ablk_init, - .cra_exit = ablk_exit, - .cra_ablkcipher = { - .min_keysize = AES_MIN_KEY_SIZE, - .max_keysize = AES_MAX_KEY_SIZE, - .ivsize = AES_BLOCK_SIZE, - .setkey = ablk_set_key, - .encrypt = ablk_encrypt, - .decrypt = ablk_decrypt, - } -}, { - .cra_name = "ctr(aes)", - .cra_driver_name = "ctr-aes-" MODE, - .cra_priority = PRIO, - .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER|CRYPTO_ALG_ASYNC, - .cra_blocksize = 1, - .cra_ctxsize = sizeof(struct async_helper_ctx), - .cra_alignmask = 7, - .cra_type = &crypto_ablkcipher_type, - .cra_module = THIS_MODULE, - .cra_init = ablk_init, - .cra_exit = ablk_exit, - .cra_ablkcipher = { - .min_keysize = AES_MIN_KEY_SIZE, - .max_keysize = AES_MAX_KEY_SIZE, - .ivsize = AES_BLOCK_SIZE, - .setkey = ablk_set_key, - .encrypt = ablk_encrypt, - .decrypt = ablk_decrypt, - } -}, { - .cra_name = "xts(aes)", - .cra_driver_name = "xts-aes-" MODE, - .cra_priority = PRIO, - .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER|CRYPTO_ALG_ASYNC, - .cra_blocksize = AES_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct async_helper_ctx), - .cra_alignmask = 7, - .cra_type = &crypto_ablkcipher_type, - .cra_module = THIS_MODULE, - .cra_init = ablk_init, - .cra_exit = ablk_exit, - .cra_ablkcipher = { - .min_keysize = 2 * AES_MIN_KEY_SIZE, - .max_keysize = 2 * AES_MAX_KEY_SIZE, - .ivsize = AES_BLOCK_SIZE, - .setkey = ablk_set_key, - .encrypt = ablk_encrypt, - .decrypt = ablk_decrypt, - } + .min_keysize = 2 * AES_MIN_KEY_SIZE, + .max_keysize = 2 * AES_MAX_KEY_SIZE, + .ivsize = AES_BLOCK_SIZE, + .setkey = xts_set_key, + .encrypt = xts_encrypt, + .decrypt = xts_decrypt, } }; -static int __init aes_init(void) +static struct simd_skcipher_alg *aes_simd_algs[ARRAY_SIZE(aes_algs)]; + +static void aes_exit(void) { - return crypto_register_algs(aes_algs, ARRAY_SIZE(aes_algs)); + int i; + + for (i = 0; i < ARRAY_SIZE(aes_simd_algs) && aes_simd_algs[i]; i++) + simd_skcipher_free(aes_simd_algs[i]); + + crypto_unregister_skciphers(aes_algs, ARRAY_SIZE(aes_algs)); } -static void __exit aes_exit(void) +static int __init aes_init(void) { - crypto_unregister_algs(aes_algs, ARRAY_SIZE(aes_algs)); + struct simd_skcipher_alg *simd; + const char *basename; + const char *algname; + const char *drvname; + int err; + int i; + + err = crypto_register_skciphers(aes_algs, ARRAY_SIZE(aes_algs)); + if (err) + return err; + + for (i = 0; i < ARRAY_SIZE(aes_algs); i++) { + algname = aes_algs[i].base.cra_name + 2; + drvname = aes_algs[i].base.cra_driver_name + 2; + basename = aes_algs[i].base.cra_driver_name; + simd = simd_skcipher_create_compat(algname, drvname, basename); + err = PTR_ERR(simd); + if (IS_ERR(simd)) + goto unregister_simds; + + aes_simd_algs[i] = simd; + } + + return 0; + +unregister_simds: + aes_exit(); + return err; } #ifdef USE_V8_CRYPTO_EXTENSIONS diff --git a/arch/arm64/crypto/aes-modes.S b/arch/arm64/crypto/aes-modes.S index f6e372c528eb..c53dbeae79f2 100644 --- a/arch/arm64/crypto/aes-modes.S +++ b/arch/arm64/crypto/aes-modes.S @@ -386,7 +386,8 @@ AES_ENDPROC(aes_ctr_encrypt) .endm .Lxts_mul_x: - .word 1, 0, 0x87, 0 +CPU_LE( .quad 1, 0x87 ) +CPU_BE( .quad 0x87, 1 ) AES_ENTRY(aes_xts_encrypt) FRAME_PUSH diff --git a/arch/arm64/crypto/aes-neon.S b/arch/arm64/crypto/aes-neon.S index b93170e1cc93..85f07ead7c5c 100644 --- a/arch/arm64/crypto/aes-neon.S +++ b/arch/arm64/crypto/aes-neon.S @@ -9,6 +9,7 @@ */ #include <linux/linkage.h> +#include <asm/assembler.h> #define AES_ENTRY(func) ENTRY(neon_ ## func) #define AES_ENDPROC(func) ENDPROC(neon_ ## func) @@ -83,13 +84,13 @@ .endm .macro do_block, enc, in, rounds, rk, rkp, i - ld1 {v15.16b}, [\rk] + ld1 {v15.4s}, [\rk] add \rkp, \rk, #16 mov \i, \rounds 1111: eor \in\().16b, \in\().16b, v15.16b /* ^round key */ tbl \in\().16b, {\in\().16b}, v13.16b /* ShiftRows */ sub_bytes \in - ld1 {v15.16b}, [\rkp], #16 + ld1 {v15.4s}, [\rkp], #16 subs \i, \i, #1 beq 2222f .if \enc == 1 @@ -229,7 +230,7 @@ .endm .macro do_block_2x, enc, in0, in1 rounds, rk, rkp, i - ld1 {v15.16b}, [\rk] + ld1 {v15.4s}, [\rk] add \rkp, \rk, #16 mov \i, \rounds 1111: eor \in0\().16b, \in0\().16b, v15.16b /* ^round key */ @@ -237,7 +238,7 @@ sub_bytes_2x \in0, \in1 tbl \in0\().16b, {\in0\().16b}, v13.16b /* ShiftRows */ tbl \in1\().16b, {\in1\().16b}, v13.16b /* ShiftRows */ - ld1 {v15.16b}, [\rkp], #16 + ld1 {v15.4s}, [\rkp], #16 subs \i, \i, #1 beq 2222f .if \enc == 1 @@ -254,7 +255,7 @@ .endm .macro do_block_4x, enc, in0, in1, in2, in3, rounds, rk, rkp, i - ld1 {v15.16b}, [\rk] + ld1 {v15.4s}, [\rk] add \rkp, \rk, #16 mov \i, \rounds 1111: eor \in0\().16b, \in0\().16b, v15.16b /* ^round key */ @@ -266,7 +267,7 @@ tbl \in1\().16b, {\in1\().16b}, v13.16b /* ShiftRows */ tbl \in2\().16b, {\in2\().16b}, v13.16b /* ShiftRows */ tbl \in3\().16b, {\in3\().16b}, v13.16b /* ShiftRows */ - ld1 {v15.16b}, [\rkp], #16 + ld1 {v15.4s}, [\rkp], #16 subs \i, \i, #1 beq 2222f .if \enc == 1 @@ -306,12 +307,16 @@ .text .align 4 .LForward_ShiftRows: - .byte 0x0, 0x5, 0xa, 0xf, 0x4, 0x9, 0xe, 0x3 - .byte 0x8, 0xd, 0x2, 0x7, 0xc, 0x1, 0x6, 0xb +CPU_LE( .byte 0x0, 0x5, 0xa, 0xf, 0x4, 0x9, 0xe, 0x3 ) +CPU_LE( .byte 0x8, 0xd, 0x2, 0x7, 0xc, 0x1, 0x6, 0xb ) +CPU_BE( .byte 0xb, 0x6, 0x1, 0xc, 0x7, 0x2, 0xd, 0x8 ) +CPU_BE( .byte 0x3, 0xe, 0x9, 0x4, 0xf, 0xa, 0x5, 0x0 ) .LReverse_ShiftRows: - .byte 0x0, 0xd, 0xa, 0x7, 0x4, 0x1, 0xe, 0xb - .byte 0x8, 0x5, 0x2, 0xf, 0xc, 0x9, 0x6, 0x3 +CPU_LE( .byte 0x0, 0xd, 0xa, 0x7, 0x4, 0x1, 0xe, 0xb ) +CPU_LE( .byte 0x8, 0x5, 0x2, 0xf, 0xc, 0x9, 0x6, 0x3 ) +CPU_BE( .byte 0x3, 0x6, 0x9, 0xc, 0xf, 0x2, 0x5, 0x8 ) +CPU_BE( .byte 0xb, 0xe, 0x1, 0x4, 0x7, 0xa, 0xd, 0x0 ) .LForward_Sbox: .byte 0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5 diff --git a/arch/arm64/crypto/crc32-ce-core.S b/arch/arm64/crypto/crc32-ce-core.S new file mode 100644 index 000000000000..18f5a8442276 --- /dev/null +++ b/arch/arm64/crypto/crc32-ce-core.S @@ -0,0 +1,266 @@ +/* + * Accelerated CRC32(C) using arm64 CRC, NEON and Crypto Extensions instructions + * + * Copyright (C) 2016 Linaro Ltd <ard.biesheuvel@linaro.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +/* GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see http://www.gnu.org/licenses + * + * Please visit http://www.xyratex.com/contact if you need additional + * information or have any questions. + * + * GPL HEADER END + */ + +/* + * Copyright 2012 Xyratex Technology Limited + * + * Using hardware provided PCLMULQDQ instruction to accelerate the CRC32 + * calculation. + * CRC32 polynomial:0x04c11db7(BE)/0xEDB88320(LE) + * PCLMULQDQ is a new instruction in Intel SSE4.2, the reference can be found + * at: + * http://www.intel.com/products/processor/manuals/ + * Intel(R) 64 and IA-32 Architectures Software Developer's Manual + * Volume 2B: Instruction Set Reference, N-Z + * + * Authors: Gregory Prestas <Gregory_Prestas@us.xyratex.com> + * Alexander Boyko <Alexander_Boyko@xyratex.com> + */ + +#include <linux/linkage.h> +#include <asm/assembler.h> + + .text + .align 6 + .cpu generic+crypto+crc + +.Lcrc32_constants: + /* + * [x4*128+32 mod P(x) << 32)]' << 1 = 0x154442bd4 + * #define CONSTANT_R1 0x154442bd4LL + * + * [(x4*128-32 mod P(x) << 32)]' << 1 = 0x1c6e41596 + * #define CONSTANT_R2 0x1c6e41596LL + */ + .octa 0x00000001c6e415960000000154442bd4 + + /* + * [(x128+32 mod P(x) << 32)]' << 1 = 0x1751997d0 + * #define CONSTANT_R3 0x1751997d0LL + * + * [(x128-32 mod P(x) << 32)]' << 1 = 0x0ccaa009e + * #define CONSTANT_R4 0x0ccaa009eLL + */ + .octa 0x00000000ccaa009e00000001751997d0 + + /* + * [(x64 mod P(x) << 32)]' << 1 = 0x163cd6124 + * #define CONSTANT_R5 0x163cd6124LL + */ + .quad 0x0000000163cd6124 + .quad 0x00000000FFFFFFFF + + /* + * #define CRCPOLY_TRUE_LE_FULL 0x1DB710641LL + * + * Barrett Reduction constant (u64`) = u` = (x**64 / P(x))` + * = 0x1F7011641LL + * #define CONSTANT_RU 0x1F7011641LL + */ + .octa 0x00000001F701164100000001DB710641 + +.Lcrc32c_constants: + .octa 0x000000009e4addf800000000740eef02 + .octa 0x000000014cd00bd600000000f20c0dfe + .quad 0x00000000dd45aab8 + .quad 0x00000000FFFFFFFF + .octa 0x00000000dea713f10000000105ec76f0 + + vCONSTANT .req v0 + dCONSTANT .req d0 + qCONSTANT .req q0 + + BUF .req x0 + LEN .req x1 + CRC .req x2 + + vzr .req v9 + + /** + * Calculate crc32 + * BUF - buffer + * LEN - sizeof buffer (multiple of 16 bytes), LEN should be > 63 + * CRC - initial crc32 + * return %eax crc32 + * uint crc32_pmull_le(unsigned char const *buffer, + * size_t len, uint crc32) + */ +ENTRY(crc32_pmull_le) + adr x3, .Lcrc32_constants + b 0f + +ENTRY(crc32c_pmull_le) + adr x3, .Lcrc32c_constants + +0: bic LEN, LEN, #15 + ld1 {v1.16b-v4.16b}, [BUF], #0x40 + movi vzr.16b, #0 + fmov dCONSTANT, CRC + eor v1.16b, v1.16b, vCONSTANT.16b + sub LEN, LEN, #0x40 + cmp LEN, #0x40 + b.lt less_64 + + ldr qCONSTANT, [x3] + +loop_64: /* 64 bytes Full cache line folding */ + sub LEN, LEN, #0x40 + + pmull2 v5.1q, v1.2d, vCONSTANT.2d + pmull2 v6.1q, v2.2d, vCONSTANT.2d + pmull2 v7.1q, v3.2d, vCONSTANT.2d + pmull2 v8.1q, v4.2d, vCONSTANT.2d + + pmull v1.1q, v1.1d, vCONSTANT.1d + pmull v2.1q, v2.1d, vCONSTANT.1d + pmull v3.1q, v3.1d, vCONSTANT.1d + pmull v4.1q, v4.1d, vCONSTANT.1d + + eor v1.16b, v1.16b, v5.16b + ld1 {v5.16b}, [BUF], #0x10 + eor v2.16b, v2.16b, v6.16b + ld1 {v6.16b}, [BUF], #0x10 + eor v3.16b, v3.16b, v7.16b + ld1 {v7.16b}, [BUF], #0x10 + eor v4.16b, v4.16b, v8.16b + ld1 {v8.16b}, [BUF], #0x10 + + eor v1.16b, v1.16b, v5.16b + eor v2.16b, v2.16b, v6.16b + eor v3.16b, v3.16b, v7.16b + eor v4.16b, v4.16b, v8.16b + + cmp LEN, #0x40 + b.ge loop_64 + +less_64: /* Folding cache line into 128bit */ + ldr qCONSTANT, [x3, #16] + + pmull2 v5.1q, v1.2d, vCONSTANT.2d + pmull v1.1q, v1.1d, vCONSTANT.1d + eor v1.16b, v1.16b, v5.16b + eor v1.16b, v1.16b, v2.16b + + pmull2 v5.1q, v1.2d, vCONSTANT.2d + pmull v1.1q, v1.1d, vCONSTANT.1d + eor v1.16b, v1.16b, v5.16b + eor v1.16b, v1.16b, v3.16b + + pmull2 v5.1q, v1.2d, vCONSTANT.2d + pmull v1.1q, v1.1d, vCONSTANT.1d + eor v1.16b, v1.16b, v5.16b + eor v1.16b, v1.16b, v4.16b + + cbz LEN, fold_64 + +loop_16: /* Folding rest buffer into 128bit */ + subs LEN, LEN, #0x10 + + ld1 {v2.16b}, [BUF], #0x10 + pmull2 v5.1q, v1.2d, vCONSTANT.2d + pmull v1.1q, v1.1d, vCONSTANT.1d + eor v1.16b, v1.16b, v5.16b + eor v1.16b, v1.16b, v2.16b + + b.ne loop_16 + +fold_64: + /* perform the last 64 bit fold, also adds 32 zeroes + * to the input stream */ + ext v2.16b, v1.16b, v1.16b, #8 + pmull2 v2.1q, v2.2d, vCONSTANT.2d + ext v1.16b, v1.16b, vzr.16b, #8 + eor v1.16b, v1.16b, v2.16b + + /* final 32-bit fold */ + ldr dCONSTANT, [x3, #32] + ldr d3, [x3, #40] + + ext v2.16b, v1.16b, vzr.16b, #4 + and v1.16b, v1.16b, v3.16b + pmull v1.1q, v1.1d, vCONSTANT.1d + eor v1.16b, v1.16b, v2.16b + + /* Finish up with the bit-reversed barrett reduction 64 ==> 32 bits */ + ldr qCONSTANT, [x3, #48] + + and v2.16b, v1.16b, v3.16b + ext v2.16b, vzr.16b, v2.16b, #8 + pmull2 v2.1q, v2.2d, vCONSTANT.2d + and v2.16b, v2.16b, v3.16b + pmull v2.1q, v2.1d, vCONSTANT.1d + eor v1.16b, v1.16b, v2.16b + mov w0, v1.s[1] + + ret +ENDPROC(crc32_pmull_le) +ENDPROC(crc32c_pmull_le) + + .macro __crc32, c +0: subs x2, x2, #16 + b.mi 8f + ldp x3, x4, [x1], #16 +CPU_BE( rev x3, x3 ) +CPU_BE( rev x4, x4 ) + crc32\c\()x w0, w0, x3 + crc32\c\()x w0, w0, x4 + b.ne 0b + ret + +8: tbz x2, #3, 4f + ldr x3, [x1], #8 +CPU_BE( rev x3, x3 ) + crc32\c\()x w0, w0, x3 +4: tbz x2, #2, 2f + ldr w3, [x1], #4 +CPU_BE( rev w3, w3 ) + crc32\c\()w w0, w0, w3 +2: tbz x2, #1, 1f + ldrh w3, [x1], #2 +CPU_BE( rev16 w3, w3 ) + crc32\c\()h w0, w0, w3 +1: tbz x2, #0, 0f + ldrb w3, [x1] + crc32\c\()b w0, w0, w3 +0: ret + .endm + + .align 5 +ENTRY(crc32_armv8_le) + __crc32 +ENDPROC(crc32_armv8_le) + + .align 5 +ENTRY(crc32c_armv8_le) + __crc32 c +ENDPROC(crc32c_armv8_le) diff --git a/arch/arm64/crypto/crc32-ce-glue.c b/arch/arm64/crypto/crc32-ce-glue.c new file mode 100644 index 000000000000..8594127d5e01 --- /dev/null +++ b/arch/arm64/crypto/crc32-ce-glue.c @@ -0,0 +1,212 @@ +/* + * Accelerated CRC32(C) using arm64 NEON and Crypto Extensions instructions + * + * Copyright (C) 2016 Linaro Ltd <ard.biesheuvel@linaro.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/cpufeature.h> +#include <linux/crc32.h> +#include <linux/init.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/string.h> + +#include <crypto/internal/hash.h> + +#include <asm/hwcap.h> +#include <asm/neon.h> +#include <asm/unaligned.h> + +#define PMULL_MIN_LEN 64L /* minimum size of buffer + * for crc32_pmull_le_16 */ +#define SCALE_F 16L /* size of NEON register */ + +asmlinkage u32 crc32_pmull_le(const u8 buf[], u64 len, u32 init_crc); +asmlinkage u32 crc32_armv8_le(u32 init_crc, const u8 buf[], size_t len); + +asmlinkage u32 crc32c_pmull_le(const u8 buf[], u64 len, u32 init_crc); +asmlinkage u32 crc32c_armv8_le(u32 init_crc, const u8 buf[], size_t len); + +static u32 (*fallback_crc32)(u32 init_crc, const u8 buf[], size_t len); +static u32 (*fallback_crc32c)(u32 init_crc, const u8 buf[], size_t len); + +static int crc32_pmull_cra_init(struct crypto_tfm *tfm) +{ + u32 *key = crypto_tfm_ctx(tfm); + + *key = 0; + return 0; +} + +static int crc32c_pmull_cra_init(struct crypto_tfm *tfm) +{ + u32 *key = crypto_tfm_ctx(tfm); + + *key = ~0; + return 0; +} + +static int crc32_pmull_setkey(struct crypto_shash *hash, const u8 *key, + unsigned int keylen) +{ + u32 *mctx = crypto_shash_ctx(hash); + + if (keylen != sizeof(u32)) { + crypto_shash_set_flags(hash, CRYPTO_TFM_RES_BAD_KEY_LEN); + return -EINVAL; + } + *mctx = le32_to_cpup((__le32 *)key); + return 0; +} + +static int crc32_pmull_init(struct shash_desc *desc) +{ + u32 *mctx = crypto_shash_ctx(desc->tfm); + u32 *crc = shash_desc_ctx(desc); + + *crc = *mctx; + return 0; +} + +static int crc32_pmull_update(struct shash_desc *desc, const u8 *data, + unsigned int length) +{ + u32 *crc = shash_desc_ctx(desc); + unsigned int l; + + if ((u64)data % SCALE_F) { + l = min_t(u32, length, SCALE_F - ((u64)data % SCALE_F)); + + *crc = fallback_crc32(*crc, data, l); + + data += l; + length -= l; + } + + if (length >= PMULL_MIN_LEN) { + l = round_down(length, SCALE_F); + + kernel_neon_begin_partial(10); + *crc = crc32_pmull_le(data, l, *crc); + kernel_neon_end(); + + data += l; + length -= l; + } + + if (length > 0) + *crc = fallback_crc32(*crc, data, length); + + return 0; +} + +static int crc32c_pmull_update(struct shash_desc *desc, const u8 *data, + unsigned int length) +{ + u32 *crc = shash_desc_ctx(desc); + unsigned int l; + + if ((u64)data % SCALE_F) { + l = min_t(u32, length, SCALE_F - ((u64)data % SCALE_F)); + + *crc = fallback_crc32c(*crc, data, l); + + data += l; + length -= l; + } + + if (length >= PMULL_MIN_LEN) { + l = round_down(length, SCALE_F); + + kernel_neon_begin_partial(10); + *crc = crc32c_pmull_le(data, l, *crc); + kernel_neon_end(); + + data += l; + length -= l; + } + + if (length > 0) { + *crc = fallback_crc32c(*crc, data, length); + } + + return 0; +} + +static int crc32_pmull_final(struct shash_desc *desc, u8 *out) +{ + u32 *crc = shash_desc_ctx(desc); + + put_unaligned_le32(*crc, out); + return 0; +} + +static int crc32c_pmull_final(struct shash_desc *desc, u8 *out) +{ + u32 *crc = shash_desc_ctx(desc); + + put_unaligned_le32(~*crc, out); + return 0; +} + +static struct shash_alg crc32_pmull_algs[] = { { + .setkey = crc32_pmull_setkey, + .init = crc32_pmull_init, + .update = crc32_pmull_update, + .final = crc32_pmull_final, + .descsize = sizeof(u32), + .digestsize = sizeof(u32), + + .base.cra_ctxsize = sizeof(u32), + .base.cra_init = crc32_pmull_cra_init, + .base.cra_name = "crc32", + .base.cra_driver_name = "crc32-arm64-ce", + .base.cra_priority = 200, + .base.cra_blocksize = 1, + .base.cra_module = THIS_MODULE, +}, { + .setkey = crc32_pmull_setkey, + .init = crc32_pmull_init, + .update = crc32c_pmull_update, + .final = crc32c_pmull_final, + .descsize = sizeof(u32), + .digestsize = sizeof(u32), + + .base.cra_ctxsize = sizeof(u32), + .base.cra_init = crc32c_pmull_cra_init, + .base.cra_name = "crc32c", + .base.cra_driver_name = "crc32c-arm64-ce", + .base.cra_priority = 200, + .base.cra_blocksize = 1, + .base.cra_module = THIS_MODULE, +} }; + +static int __init crc32_pmull_mod_init(void) +{ + if (elf_hwcap & HWCAP_CRC32) { + fallback_crc32 = crc32_armv8_le; + fallback_crc32c = crc32c_armv8_le; + } else { + fallback_crc32 = crc32_le; + fallback_crc32c = __crc32c_le; + } + + return crypto_register_shashes(crc32_pmull_algs, + ARRAY_SIZE(crc32_pmull_algs)); +} + +static void __exit crc32_pmull_mod_exit(void) +{ + crypto_unregister_shashes(crc32_pmull_algs, + ARRAY_SIZE(crc32_pmull_algs)); +} + +module_cpu_feature_match(PMULL, crc32_pmull_mod_init); +module_exit(crc32_pmull_mod_exit); + +MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>"); +MODULE_LICENSE("GPL v2"); diff --git a/arch/arm64/crypto/crct10dif-ce-core.S b/arch/arm64/crypto/crct10dif-ce-core.S new file mode 100644 index 000000000000..d5b5a8c038c8 --- /dev/null +++ b/arch/arm64/crypto/crct10dif-ce-core.S @@ -0,0 +1,392 @@ +// +// Accelerated CRC-T10DIF using arm64 NEON and Crypto Extensions instructions +// +// Copyright (C) 2016 Linaro Ltd <ard.biesheuvel@linaro.org> +// +// This program is free software; you can redistribute it and/or modify +// it under the terms of the GNU General Public License version 2 as +// published by the Free Software Foundation. +// + +// +// Implement fast CRC-T10DIF computation with SSE and PCLMULQDQ instructions +// +// Copyright (c) 2013, Intel Corporation +// +// Authors: +// Erdinc Ozturk <erdinc.ozturk@intel.com> +// Vinodh Gopal <vinodh.gopal@intel.com> +// James Guilford <james.guilford@intel.com> +// Tim Chen <tim.c.chen@linux.intel.com> +// +// This software is available to you under a choice of one of two +// licenses. You may choose to be licensed under the terms of the GNU +// General Public License (GPL) Version 2, available from the file +// COPYING in the main directory of this source tree, or the +// OpenIB.org BSD license below: +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the +// distribution. +// +// * Neither the name of the Intel Corporation nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// +// THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION ""AS IS"" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Function API: +// UINT16 crc_t10dif_pcl( +// UINT16 init_crc, //initial CRC value, 16 bits +// const unsigned char *buf, //buffer pointer to calculate CRC on +// UINT64 len //buffer length in bytes (64-bit data) +// ); +// +// Reference paper titled "Fast CRC Computation for Generic +// Polynomials Using PCLMULQDQ Instruction" +// URL: http://www.intel.com/content/dam/www/public/us/en/documents +// /white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf +// +// + +#include <linux/linkage.h> +#include <asm/assembler.h> + + .text + .cpu generic+crypto + + arg1_low32 .req w0 + arg2 .req x1 + arg3 .req x2 + + vzr .req v13 + +ENTRY(crc_t10dif_pmull) + movi vzr.16b, #0 // init zero register + + // adjust the 16-bit initial_crc value, scale it to 32 bits + lsl arg1_low32, arg1_low32, #16 + + // check if smaller than 256 + cmp arg3, #256 + + // for sizes less than 128, we can't fold 64B at a time... + b.lt _less_than_128 + + // load the initial crc value + // crc value does not need to be byte-reflected, but it needs + // to be moved to the high part of the register. + // because data will be byte-reflected and will align with + // initial crc at correct place. + movi v10.16b, #0 + mov v10.s[3], arg1_low32 // initial crc + + // receive the initial 64B data, xor the initial crc value + ldp q0, q1, [arg2] + ldp q2, q3, [arg2, #0x20] + ldp q4, q5, [arg2, #0x40] + ldp q6, q7, [arg2, #0x60] + add arg2, arg2, #0x80 + +CPU_LE( rev64 v0.16b, v0.16b ) +CPU_LE( rev64 v1.16b, v1.16b ) +CPU_LE( rev64 v2.16b, v2.16b ) +CPU_LE( rev64 v3.16b, v3.16b ) +CPU_LE( rev64 v4.16b, v4.16b ) +CPU_LE( rev64 v5.16b, v5.16b ) +CPU_LE( rev64 v6.16b, v6.16b ) +CPU_LE( rev64 v7.16b, v7.16b ) + +CPU_LE( ext v0.16b, v0.16b, v0.16b, #8 ) +CPU_LE( ext v1.16b, v1.16b, v1.16b, #8 ) +CPU_LE( ext v2.16b, v2.16b, v2.16b, #8 ) +CPU_LE( ext v3.16b, v3.16b, v3.16b, #8 ) +CPU_LE( ext v4.16b, v4.16b, v4.16b, #8 ) +CPU_LE( ext v5.16b, v5.16b, v5.16b, #8 ) +CPU_LE( ext v6.16b, v6.16b, v6.16b, #8 ) +CPU_LE( ext v7.16b, v7.16b, v7.16b, #8 ) + + // XOR the initial_crc value + eor v0.16b, v0.16b, v10.16b + + ldr q10, rk3 // xmm10 has rk3 and rk4 + // type of pmull instruction + // will determine which constant to use + + // + // we subtract 256 instead of 128 to save one instruction from the loop + // + sub arg3, arg3, #256 + + // at this section of the code, there is 64*x+y (0<=y<64) bytes of + // buffer. The _fold_64_B_loop will fold 64B at a time + // until we have 64+y Bytes of buffer + + + // fold 64B at a time. This section of the code folds 4 vector + // registers in parallel +_fold_64_B_loop: + + .macro fold64, reg1, reg2 + ldp q11, q12, [arg2], #0x20 + + pmull2 v8.1q, \reg1\().2d, v10.2d + pmull \reg1\().1q, \reg1\().1d, v10.1d + +CPU_LE( rev64 v11.16b, v11.16b ) +CPU_LE( rev64 v12.16b, v12.16b ) + + pmull2 v9.1q, \reg2\().2d, v10.2d + pmull \reg2\().1q, \reg2\().1d, v10.1d + +CPU_LE( ext v11.16b, v11.16b, v11.16b, #8 ) +CPU_LE( ext v12.16b, v12.16b, v12.16b, #8 ) + + eor \reg1\().16b, \reg1\().16b, v8.16b + eor \reg2\().16b, \reg2\().16b, v9.16b + eor \reg1\().16b, \reg1\().16b, v11.16b + eor \reg2\().16b, \reg2\().16b, v12.16b + .endm + + fold64 v0, v1 + fold64 v2, v3 + fold64 v4, v5 + fold64 v6, v7 + + subs arg3, arg3, #128 + + // check if there is another 64B in the buffer to be able to fold + b.ge _fold_64_B_loop + + // at this point, the buffer pointer is pointing at the last y Bytes + // of the buffer the 64B of folded data is in 4 of the vector + // registers: v0, v1, v2, v3 + + // fold the 8 vector registers to 1 vector register with different + // constants + + ldr q10, rk9 + + .macro fold16, reg, rk + pmull v8.1q, \reg\().1d, v10.1d + pmull2 \reg\().1q, \reg\().2d, v10.2d + .ifnb \rk + ldr q10, \rk + .endif + eor v7.16b, v7.16b, v8.16b + eor v7.16b, v7.16b, \reg\().16b + .endm + + fold16 v0, rk11 + fold16 v1, rk13 + fold16 v2, rk15 + fold16 v3, rk17 + fold16 v4, rk19 + fold16 v5, rk1 + fold16 v6 + + // instead of 64, we add 48 to the loop counter to save 1 instruction + // from the loop instead of a cmp instruction, we use the negative + // flag with the jl instruction + adds arg3, arg3, #(128-16) + b.lt _final_reduction_for_128 + + // now we have 16+y bytes left to reduce. 16 Bytes is in register v7 + // and the rest is in memory. We can fold 16 bytes at a time if y>=16 + // continue folding 16B at a time + +_16B_reduction_loop: + pmull v8.1q, v7.1d, v10.1d + pmull2 v7.1q, v7.2d, v10.2d + eor v7.16b, v7.16b, v8.16b + + ldr q0, [arg2], #16 +CPU_LE( rev64 v0.16b, v0.16b ) +CPU_LE( ext v0.16b, v0.16b, v0.16b, #8 ) + eor v7.16b, v7.16b, v0.16b + subs arg3, arg3, #16 + + // instead of a cmp instruction, we utilize the flags with the + // jge instruction equivalent of: cmp arg3, 16-16 + // check if there is any more 16B in the buffer to be able to fold + b.ge _16B_reduction_loop + + // now we have 16+z bytes left to reduce, where 0<= z < 16. + // first, we reduce the data in the xmm7 register + +_final_reduction_for_128: + // check if any more data to fold. If not, compute the CRC of + // the final 128 bits + adds arg3, arg3, #16 + b.eq _128_done + + // here we are getting data that is less than 16 bytes. + // since we know that there was data before the pointer, we can + // offset the input pointer before the actual point, to receive + // exactly 16 bytes. after that the registers need to be adjusted. +_get_last_two_regs: + add arg2, arg2, arg3 + ldr q1, [arg2, #-16] +CPU_LE( rev64 v1.16b, v1.16b ) +CPU_LE( ext v1.16b, v1.16b, v1.16b, #8 ) + + // get rid of the extra data that was loaded before + // load the shift constant + adr x4, tbl_shf_table + 16 + sub x4, x4, arg3 + ld1 {v0.16b}, [x4] + + // shift v2 to the left by arg3 bytes + tbl v2.16b, {v7.16b}, v0.16b + + // shift v7 to the right by 16-arg3 bytes + movi v9.16b, #0x80 + eor v0.16b, v0.16b, v9.16b + tbl v7.16b, {v7.16b}, v0.16b + + // blend + sshr v0.16b, v0.16b, #7 // convert to 8-bit mask + bsl v0.16b, v2.16b, v1.16b + + // fold 16 Bytes + pmull v8.1q, v7.1d, v10.1d + pmull2 v7.1q, v7.2d, v10.2d + eor v7.16b, v7.16b, v8.16b + eor v7.16b, v7.16b, v0.16b + +_128_done: + // compute crc of a 128-bit value + ldr q10, rk5 // rk5 and rk6 in xmm10 + + // 64b fold + ext v0.16b, vzr.16b, v7.16b, #8 + mov v7.d[0], v7.d[1] + pmull v7.1q, v7.1d, v10.1d + eor v7.16b, v7.16b, v0.16b + + // 32b fold + ext v0.16b, v7.16b, vzr.16b, #4 + mov v7.s[3], vzr.s[0] + pmull2 v0.1q, v0.2d, v10.2d + eor v7.16b, v7.16b, v0.16b + + // barrett reduction +_barrett: + ldr q10, rk7 + mov v0.d[0], v7.d[1] + + pmull v0.1q, v0.1d, v10.1d + ext v0.16b, vzr.16b, v0.16b, #12 + pmull2 v0.1q, v0.2d, v10.2d + ext v0.16b, vzr.16b, v0.16b, #12 + eor v7.16b, v7.16b, v0.16b + mov w0, v7.s[1] + +_cleanup: + // scale the result back to 16 bits + lsr x0, x0, #16 + ret + +_less_than_128: + cbz arg3, _cleanup + + movi v0.16b, #0 + mov v0.s[3], arg1_low32 // get the initial crc value + + ldr q7, [arg2], #0x10 +CPU_LE( rev64 v7.16b, v7.16b ) +CPU_LE( ext v7.16b, v7.16b, v7.16b, #8 ) + eor v7.16b, v7.16b, v0.16b // xor the initial crc value + + cmp arg3, #16 + b.eq _128_done // exactly 16 left + b.lt _less_than_16_left + + ldr q10, rk1 // rk1 and rk2 in xmm10 + + // update the counter. subtract 32 instead of 16 to save one + // instruction from the loop + subs arg3, arg3, #32 + b.ge _16B_reduction_loop + + add arg3, arg3, #16 + b _get_last_two_regs + +_less_than_16_left: + // shl r9, 4 + adr x0, tbl_shf_table + 16 + sub x0, x0, arg3 + ld1 {v0.16b}, [x0] + movi v9.16b, #0x80 + eor v0.16b, v0.16b, v9.16b + tbl v7.16b, {v7.16b}, v0.16b + b _128_done +ENDPROC(crc_t10dif_pmull) + +// precomputed constants +// these constants are precomputed from the poly: +// 0x8bb70000 (0x8bb7 scaled to 32 bits) + .align 4 +// Q = 0x18BB70000 +// rk1 = 2^(32*3) mod Q << 32 +// rk2 = 2^(32*5) mod Q << 32 +// rk3 = 2^(32*15) mod Q << 32 +// rk4 = 2^(32*17) mod Q << 32 +// rk5 = 2^(32*3) mod Q << 32 +// rk6 = 2^(32*2) mod Q << 32 +// rk7 = floor(2^64/Q) +// rk8 = Q + +rk1: .octa 0x06df0000000000002d56000000000000 +rk3: .octa 0x7cf50000000000009d9d000000000000 +rk5: .octa 0x13680000000000002d56000000000000 +rk7: .octa 0x000000018bb7000000000001f65a57f8 +rk9: .octa 0xbfd6000000000000ceae000000000000 +rk11: .octa 0x713c0000000000001e16000000000000 +rk13: .octa 0x80a6000000000000f7f9000000000000 +rk15: .octa 0xe658000000000000044c000000000000 +rk17: .octa 0xa497000000000000ad18000000000000 +rk19: .octa 0xe7b50000000000006ee3000000000000 + +tbl_shf_table: +// use these values for shift constants for the tbl/tbx instruction +// different alignments result in values as shown: +// DDQ 0x008f8e8d8c8b8a898887868584838281 # shl 15 (16-1) / shr1 +// DDQ 0x01008f8e8d8c8b8a8988878685848382 # shl 14 (16-3) / shr2 +// DDQ 0x0201008f8e8d8c8b8a89888786858483 # shl 13 (16-4) / shr3 +// DDQ 0x030201008f8e8d8c8b8a898887868584 # shl 12 (16-4) / shr4 +// DDQ 0x04030201008f8e8d8c8b8a8988878685 # shl 11 (16-5) / shr5 +// DDQ 0x0504030201008f8e8d8c8b8a89888786 # shl 10 (16-6) / shr6 +// DDQ 0x060504030201008f8e8d8c8b8a898887 # shl 9 (16-7) / shr7 +// DDQ 0x07060504030201008f8e8d8c8b8a8988 # shl 8 (16-8) / shr8 +// DDQ 0x0807060504030201008f8e8d8c8b8a89 # shl 7 (16-9) / shr9 +// DDQ 0x090807060504030201008f8e8d8c8b8a # shl 6 (16-10) / shr10 +// DDQ 0x0a090807060504030201008f8e8d8c8b # shl 5 (16-11) / shr11 +// DDQ 0x0b0a090807060504030201008f8e8d8c # shl 4 (16-12) / shr12 +// DDQ 0x0c0b0a090807060504030201008f8e8d # shl 3 (16-13) / shr13 +// DDQ 0x0d0c0b0a090807060504030201008f8e # shl 2 (16-14) / shr14 +// DDQ 0x0e0d0c0b0a090807060504030201008f # shl 1 (16-15) / shr15 + + .byte 0x0, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87 + .byte 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f + .byte 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7 + .byte 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe , 0x0 diff --git a/arch/arm64/crypto/crct10dif-ce-glue.c b/arch/arm64/crypto/crct10dif-ce-glue.c new file mode 100644 index 000000000000..60cb590c2590 --- /dev/null +++ b/arch/arm64/crypto/crct10dif-ce-glue.c @@ -0,0 +1,95 @@ +/* + * Accelerated CRC-T10DIF using arm64 NEON and Crypto Extensions instructions + * + * Copyright (C) 2016 Linaro Ltd <ard.biesheuvel@linaro.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/cpufeature.h> +#include <linux/crc-t10dif.h> +#include <linux/init.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/string.h> + +#include <crypto/internal/hash.h> + +#include <asm/neon.h> + +#define CRC_T10DIF_PMULL_CHUNK_SIZE 16U + +asmlinkage u16 crc_t10dif_pmull(u16 init_crc, const u8 buf[], u64 len); + +static int crct10dif_init(struct shash_desc *desc) +{ + u16 *crc = shash_desc_ctx(desc); + + *crc = 0; + return 0; +} + +static int crct10dif_update(struct shash_desc *desc, const u8 *data, + unsigned int length) +{ + u16 *crc = shash_desc_ctx(desc); + unsigned int l; + + if (unlikely((u64)data % CRC_T10DIF_PMULL_CHUNK_SIZE)) { + l = min_t(u32, length, CRC_T10DIF_PMULL_CHUNK_SIZE - + ((u64)data % CRC_T10DIF_PMULL_CHUNK_SIZE)); + + *crc = crc_t10dif_generic(*crc, data, l); + + length -= l; + data += l; + } + + if (length > 0) { + kernel_neon_begin_partial(14); + *crc = crc_t10dif_pmull(*crc, data, length); + kernel_neon_end(); + } + + return 0; +} + +static int crct10dif_final(struct shash_desc *desc, u8 *out) +{ + u16 *crc = shash_desc_ctx(desc); + + *(u16 *)out = *crc; + return 0; +} + +static struct shash_alg crc_t10dif_alg = { + .digestsize = CRC_T10DIF_DIGEST_SIZE, + .init = crct10dif_init, + .update = crct10dif_update, + .final = crct10dif_final, + .descsize = CRC_T10DIF_DIGEST_SIZE, + + .base.cra_name = "crct10dif", + .base.cra_driver_name = "crct10dif-arm64-ce", + .base.cra_priority = 200, + .base.cra_blocksize = CRC_T10DIF_BLOCK_SIZE, + .base.cra_module = THIS_MODULE, +}; + +static int __init crc_t10dif_mod_init(void) +{ + return crypto_register_shash(&crc_t10dif_alg); +} + +static void __exit crc_t10dif_mod_exit(void) +{ + crypto_unregister_shash(&crc_t10dif_alg); +} + +module_cpu_feature_match(PMULL, crc_t10dif_mod_init); +module_exit(crc_t10dif_mod_exit); + +MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>"); +MODULE_LICENSE("GPL v2"); diff --git a/arch/arm64/crypto/ghash-ce-core.S b/arch/arm64/crypto/ghash-ce-core.S index dc457015884e..f0bb9f0b524f 100644 --- a/arch/arm64/crypto/ghash-ce-core.S +++ b/arch/arm64/crypto/ghash-ce-core.S @@ -29,8 +29,8 @@ * struct ghash_key const *k, const char *head) */ ENTRY(pmull_ghash_update) - ld1 {SHASH.16b}, [x3] - ld1 {XL.16b}, [x1] + ld1 {SHASH.2d}, [x3] + ld1 {XL.2d}, [x1] movi MASK.16b, #0xe1 ext SHASH2.16b, SHASH.16b, SHASH.16b, #8 shl MASK.2d, MASK.2d, #57 @@ -74,6 +74,6 @@ CPU_LE( rev64 T1.16b, T1.16b ) cbnz w0, 0b - st1 {XL.16b}, [x1] + st1 {XL.2d}, [x1] ret ENDPROC(pmull_ghash_update) diff --git a/arch/arm64/crypto/sha1-ce-core.S b/arch/arm64/crypto/sha1-ce-core.S index 033aae6d732a..c98e7e849f06 100644 --- a/arch/arm64/crypto/sha1-ce-core.S +++ b/arch/arm64/crypto/sha1-ce-core.S @@ -78,7 +78,7 @@ ENTRY(sha1_ce_transform) ld1r {k3.4s}, [x6] /* load state */ - ldr dga, [x0] + ld1 {dgav.4s}, [x0] ldr dgb, [x0, #16] /* load sha1_ce_state::finalize */ @@ -144,7 +144,7 @@ CPU_LE( rev32 v11.16b, v11.16b ) b 1b /* store new state */ -3: str dga, [x0] +3: st1 {dgav.4s}, [x0] str dgb, [x0, #16] ret ENDPROC(sha1_ce_transform) diff --git a/arch/arm64/crypto/sha2-ce-core.S b/arch/arm64/crypto/sha2-ce-core.S index 5df9d9d470ad..01cfee066837 100644 --- a/arch/arm64/crypto/sha2-ce-core.S +++ b/arch/arm64/crypto/sha2-ce-core.S @@ -85,7 +85,7 @@ ENTRY(sha2_ce_transform) ld1 {v12.4s-v15.4s}, [x8] /* load state */ - ldp dga, dgb, [x0] + ld1 {dgav.4s, dgbv.4s}, [x0] /* load sha256_ce_state::finalize */ ldr w4, [x0, #:lo12:sha256_ce_offsetof_finalize] @@ -148,6 +148,6 @@ CPU_LE( rev32 v19.16b, v19.16b ) b 1b /* store new state */ -3: stp dga, dgb, [x0] +3: st1 {dgav.4s, dgbv.4s}, [x0] ret ENDPROC(sha2_ce_transform) diff --git a/arch/arm64/crypto/sha256-core.S_shipped b/arch/arm64/crypto/sha256-core.S_shipped new file mode 100644 index 000000000000..3ce82cc860bc --- /dev/null +++ b/arch/arm64/crypto/sha256-core.S_shipped @@ -0,0 +1,2061 @@ +// Copyright 2014-2016 The OpenSSL Project Authors. All Rights Reserved. +// +// Licensed under the OpenSSL license (the "License"). You may not use +// this file except in compliance with the License. You can obtain a copy +// in the file LICENSE in the source distribution or at +// https://www.openssl.org/source/license.html + +// ==================================================================== +// Written by Andy Polyakov <appro@openssl.org> for the OpenSSL +// project. The module is, however, dual licensed under OpenSSL and +// CRYPTOGAMS licenses depending on where you obtain it. For further +// details see http://www.openssl.org/~appro/cryptogams/. +// +// Permission to use under GPLv2 terms is granted. +// ==================================================================== +// +// SHA256/512 for ARMv8. +// +// Performance in cycles per processed byte and improvement coefficient +// over code generated with "default" compiler: +// +// SHA256-hw SHA256(*) SHA512 +// Apple A7 1.97 10.5 (+33%) 6.73 (-1%(**)) +// Cortex-A53 2.38 15.5 (+115%) 10.0 (+150%(***)) +// Cortex-A57 2.31 11.6 (+86%) 7.51 (+260%(***)) +// Denver 2.01 10.5 (+26%) 6.70 (+8%) +// X-Gene 20.0 (+100%) 12.8 (+300%(***)) +// Mongoose 2.36 13.0 (+50%) 8.36 (+33%) +// +// (*) Software SHA256 results are of lesser relevance, presented +// mostly for informational purposes. +// (**) The result is a trade-off: it's possible to improve it by +// 10% (or by 1 cycle per round), but at the cost of 20% loss +// on Cortex-A53 (or by 4 cycles per round). +// (***) Super-impressive coefficients over gcc-generated code are +// indication of some compiler "pathology", most notably code +// generated with -mgeneral-regs-only is significanty faster +// and the gap is only 40-90%. +// +// October 2016. +// +// Originally it was reckoned that it makes no sense to implement NEON +// version of SHA256 for 64-bit processors. This is because performance +// improvement on most wide-spread Cortex-A5x processors was observed +// to be marginal, same on Cortex-A53 and ~10% on A57. But then it was +// observed that 32-bit NEON SHA256 performs significantly better than +// 64-bit scalar version on *some* of the more recent processors. As +// result 64-bit NEON version of SHA256 was added to provide best +// all-round performance. For example it executes ~30% faster on X-Gene +// and Mongoose. [For reference, NEON version of SHA512 is bound to +// deliver much less improvement, likely *negative* on Cortex-A5x. +// Which is why NEON support is limited to SHA256.] + +#ifndef __KERNEL__ +# include "arm_arch.h" +#endif + +.text + +.extern OPENSSL_armcap_P +.globl sha256_block_data_order +.type sha256_block_data_order,%function +.align 6 +sha256_block_data_order: +#ifndef __KERNEL__ +# ifdef __ILP32__ + ldrsw x16,.LOPENSSL_armcap_P +# else + ldr x16,.LOPENSSL_armcap_P +# endif + adr x17,.LOPENSSL_armcap_P + add x16,x16,x17 + ldr w16,[x16] + tst w16,#ARMV8_SHA256 + b.ne .Lv8_entry + tst w16,#ARMV7_NEON + b.ne .Lneon_entry +#endif + stp x29,x30,[sp,#-128]! + add x29,sp,#0 + + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + sub sp,sp,#4*4 + + ldp w20,w21,[x0] // load context + ldp w22,w23,[x0,#2*4] + ldp w24,w25,[x0,#4*4] + add x2,x1,x2,lsl#6 // end of input + ldp w26,w27,[x0,#6*4] + adr x30,.LK256 + stp x0,x2,[x29,#96] + +.Loop: + ldp w3,w4,[x1],#2*4 + ldr w19,[x30],#4 // *K++ + eor w28,w21,w22 // magic seed + str x1,[x29,#112] +#ifndef __AARCH64EB__ + rev w3,w3 // 0 +#endif + ror w16,w24,#6 + add w27,w27,w19 // h+=K[i] + eor w6,w24,w24,ror#14 + and w17,w25,w24 + bic w19,w26,w24 + add w27,w27,w3 // h+=X[i] + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w20,w21 // a^b, b^c in next round + eor w16,w16,w6,ror#11 // Sigma1(e) + ror w6,w20,#2 + add w27,w27,w17 // h+=Ch(e,f,g) + eor w17,w20,w20,ror#9 + add w27,w27,w16 // h+=Sigma1(e) + and w28,w28,w19 // (b^c)&=(a^b) + add w23,w23,w27 // d+=h + eor w28,w28,w21 // Maj(a,b,c) + eor w17,w6,w17,ror#13 // Sigma0(a) + add w27,w27,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + //add w27,w27,w17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev w4,w4 // 1 +#endif + ldp w5,w6,[x1],#2*4 + add w27,w27,w17 // h+=Sigma0(a) + ror w16,w23,#6 + add w26,w26,w28 // h+=K[i] + eor w7,w23,w23,ror#14 + and w17,w24,w23 + bic w28,w25,w23 + add w26,w26,w4 // h+=X[i] + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w27,w20 // a^b, b^c in next round + eor w16,w16,w7,ror#11 // Sigma1(e) + ror w7,w27,#2 + add w26,w26,w17 // h+=Ch(e,f,g) + eor w17,w27,w27,ror#9 + add w26,w26,w16 // h+=Sigma1(e) + and w19,w19,w28 // (b^c)&=(a^b) + add w22,w22,w26 // d+=h + eor w19,w19,w20 // Maj(a,b,c) + eor w17,w7,w17,ror#13 // Sigma0(a) + add w26,w26,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + //add w26,w26,w17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev w5,w5 // 2 +#endif + add w26,w26,w17 // h+=Sigma0(a) + ror w16,w22,#6 + add w25,w25,w19 // h+=K[i] + eor w8,w22,w22,ror#14 + and w17,w23,w22 + bic w19,w24,w22 + add w25,w25,w5 // h+=X[i] + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w26,w27 // a^b, b^c in next round + eor w16,w16,w8,ror#11 // Sigma1(e) + ror w8,w26,#2 + add w25,w25,w17 // h+=Ch(e,f,g) + eor w17,w26,w26,ror#9 + add w25,w25,w16 // h+=Sigma1(e) + and w28,w28,w19 // (b^c)&=(a^b) + add w21,w21,w25 // d+=h + eor w28,w28,w27 // Maj(a,b,c) + eor w17,w8,w17,ror#13 // Sigma0(a) + add w25,w25,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + //add w25,w25,w17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev w6,w6 // 3 +#endif + ldp w7,w8,[x1],#2*4 + add w25,w25,w17 // h+=Sigma0(a) + ror w16,w21,#6 + add w24,w24,w28 // h+=K[i] + eor w9,w21,w21,ror#14 + and w17,w22,w21 + bic w28,w23,w21 + add w24,w24,w6 // h+=X[i] + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w25,w26 // a^b, b^c in next round + eor w16,w16,w9,ror#11 // Sigma1(e) + ror w9,w25,#2 + add w24,w24,w17 // h+=Ch(e,f,g) + eor w17,w25,w25,ror#9 + add w24,w24,w16 // h+=Sigma1(e) + and w19,w19,w28 // (b^c)&=(a^b) + add w20,w20,w24 // d+=h + eor w19,w19,w26 // Maj(a,b,c) + eor w17,w9,w17,ror#13 // Sigma0(a) + add w24,w24,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + //add w24,w24,w17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev w7,w7 // 4 +#endif + add w24,w24,w17 // h+=Sigma0(a) + ror w16,w20,#6 + add w23,w23,w19 // h+=K[i] + eor w10,w20,w20,ror#14 + and w17,w21,w20 + bic w19,w22,w20 + add w23,w23,w7 // h+=X[i] + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w24,w25 // a^b, b^c in next round + eor w16,w16,w10,ror#11 // Sigma1(e) + ror w10,w24,#2 + add w23,w23,w17 // h+=Ch(e,f,g) + eor w17,w24,w24,ror#9 + add w23,w23,w16 // h+=Sigma1(e) + and w28,w28,w19 // (b^c)&=(a^b) + add w27,w27,w23 // d+=h + eor w28,w28,w25 // Maj(a,b,c) + eor w17,w10,w17,ror#13 // Sigma0(a) + add w23,w23,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + //add w23,w23,w17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev w8,w8 // 5 +#endif + ldp w9,w10,[x1],#2*4 + add w23,w23,w17 // h+=Sigma0(a) + ror w16,w27,#6 + add w22,w22,w28 // h+=K[i] + eor w11,w27,w27,ror#14 + and w17,w20,w27 + bic w28,w21,w27 + add w22,w22,w8 // h+=X[i] + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w23,w24 // a^b, b^c in next round + eor w16,w16,w11,ror#11 // Sigma1(e) + ror w11,w23,#2 + add w22,w22,w17 // h+=Ch(e,f,g) + eor w17,w23,w23,ror#9 + add w22,w22,w16 // h+=Sigma1(e) + and w19,w19,w28 // (b^c)&=(a^b) + add w26,w26,w22 // d+=h + eor w19,w19,w24 // Maj(a,b,c) + eor w17,w11,w17,ror#13 // Sigma0(a) + add w22,w22,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + //add w22,w22,w17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev w9,w9 // 6 +#endif + add w22,w22,w17 // h+=Sigma0(a) + ror w16,w26,#6 + add w21,w21,w19 // h+=K[i] + eor w12,w26,w26,ror#14 + and w17,w27,w26 + bic w19,w20,w26 + add w21,w21,w9 // h+=X[i] + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w22,w23 // a^b, b^c in next round + eor w16,w16,w12,ror#11 // Sigma1(e) + ror w12,w22,#2 + add w21,w21,w17 // h+=Ch(e,f,g) + eor w17,w22,w22,ror#9 + add w21,w21,w16 // h+=Sigma1(e) + and w28,w28,w19 // (b^c)&=(a^b) + add w25,w25,w21 // d+=h + eor w28,w28,w23 // Maj(a,b,c) + eor w17,w12,w17,ror#13 // Sigma0(a) + add w21,w21,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + //add w21,w21,w17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev w10,w10 // 7 +#endif + ldp w11,w12,[x1],#2*4 + add w21,w21,w17 // h+=Sigma0(a) + ror w16,w25,#6 + add w20,w20,w28 // h+=K[i] + eor w13,w25,w25,ror#14 + and w17,w26,w25 + bic w28,w27,w25 + add w20,w20,w10 // h+=X[i] + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w21,w22 // a^b, b^c in next round + eor w16,w16,w13,ror#11 // Sigma1(e) + ror w13,w21,#2 + add w20,w20,w17 // h+=Ch(e,f,g) + eor w17,w21,w21,ror#9 + add w20,w20,w16 // h+=Sigma1(e) + and w19,w19,w28 // (b^c)&=(a^b) + add w24,w24,w20 // d+=h + eor w19,w19,w22 // Maj(a,b,c) + eor w17,w13,w17,ror#13 // Sigma0(a) + add w20,w20,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + //add w20,w20,w17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev w11,w11 // 8 +#endif + add w20,w20,w17 // h+=Sigma0(a) + ror w16,w24,#6 + add w27,w27,w19 // h+=K[i] + eor w14,w24,w24,ror#14 + and w17,w25,w24 + bic w19,w26,w24 + add w27,w27,w11 // h+=X[i] + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w20,w21 // a^b, b^c in next round + eor w16,w16,w14,ror#11 // Sigma1(e) + ror w14,w20,#2 + add w27,w27,w17 // h+=Ch(e,f,g) + eor w17,w20,w20,ror#9 + add w27,w27,w16 // h+=Sigma1(e) + and w28,w28,w19 // (b^c)&=(a^b) + add w23,w23,w27 // d+=h + eor w28,w28,w21 // Maj(a,b,c) + eor w17,w14,w17,ror#13 // Sigma0(a) + add w27,w27,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + //add w27,w27,w17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev w12,w12 // 9 +#endif + ldp w13,w14,[x1],#2*4 + add w27,w27,w17 // h+=Sigma0(a) + ror w16,w23,#6 + add w26,w26,w28 // h+=K[i] + eor w15,w23,w23,ror#14 + and w17,w24,w23 + bic w28,w25,w23 + add w26,w26,w12 // h+=X[i] + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w27,w20 // a^b, b^c in next round + eor w16,w16,w15,ror#11 // Sigma1(e) + ror w15,w27,#2 + add w26,w26,w17 // h+=Ch(e,f,g) + eor w17,w27,w27,ror#9 + add w26,w26,w16 // h+=Sigma1(e) + and w19,w19,w28 // (b^c)&=(a^b) + add w22,w22,w26 // d+=h + eor w19,w19,w20 // Maj(a,b,c) + eor w17,w15,w17,ror#13 // Sigma0(a) + add w26,w26,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + //add w26,w26,w17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev w13,w13 // 10 +#endif + add w26,w26,w17 // h+=Sigma0(a) + ror w16,w22,#6 + add w25,w25,w19 // h+=K[i] + eor w0,w22,w22,ror#14 + and w17,w23,w22 + bic w19,w24,w22 + add w25,w25,w13 // h+=X[i] + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w26,w27 // a^b, b^c in next round + eor w16,w16,w0,ror#11 // Sigma1(e) + ror w0,w26,#2 + add w25,w25,w17 // h+=Ch(e,f,g) + eor w17,w26,w26,ror#9 + add w25,w25,w16 // h+=Sigma1(e) + and w28,w28,w19 // (b^c)&=(a^b) + add w21,w21,w25 // d+=h + eor w28,w28,w27 // Maj(a,b,c) + eor w17,w0,w17,ror#13 // Sigma0(a) + add w25,w25,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + //add w25,w25,w17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev w14,w14 // 11 +#endif + ldp w15,w0,[x1],#2*4 + add w25,w25,w17 // h+=Sigma0(a) + str w6,[sp,#12] + ror w16,w21,#6 + add w24,w24,w28 // h+=K[i] + eor w6,w21,w21,ror#14 + and w17,w22,w21 + bic w28,w23,w21 + add w24,w24,w14 // h+=X[i] + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w25,w26 // a^b, b^c in next round + eor w16,w16,w6,ror#11 // Sigma1(e) + ror w6,w25,#2 + add w24,w24,w17 // h+=Ch(e,f,g) + eor w17,w25,w25,ror#9 + add w24,w24,w16 // h+=Sigma1(e) + and w19,w19,w28 // (b^c)&=(a^b) + add w20,w20,w24 // d+=h + eor w19,w19,w26 // Maj(a,b,c) + eor w17,w6,w17,ror#13 // Sigma0(a) + add w24,w24,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + //add w24,w24,w17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev w15,w15 // 12 +#endif + add w24,w24,w17 // h+=Sigma0(a) + str w7,[sp,#0] + ror w16,w20,#6 + add w23,w23,w19 // h+=K[i] + eor w7,w20,w20,ror#14 + and w17,w21,w20 + bic w19,w22,w20 + add w23,w23,w15 // h+=X[i] + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w24,w25 // a^b, b^c in next round + eor w16,w16,w7,ror#11 // Sigma1(e) + ror w7,w24,#2 + add w23,w23,w17 // h+=Ch(e,f,g) + eor w17,w24,w24,ror#9 + add w23,w23,w16 // h+=Sigma1(e) + and w28,w28,w19 // (b^c)&=(a^b) + add w27,w27,w23 // d+=h + eor w28,w28,w25 // Maj(a,b,c) + eor w17,w7,w17,ror#13 // Sigma0(a) + add w23,w23,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + //add w23,w23,w17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev w0,w0 // 13 +#endif + ldp w1,w2,[x1] + add w23,w23,w17 // h+=Sigma0(a) + str w8,[sp,#4] + ror w16,w27,#6 + add w22,w22,w28 // h+=K[i] + eor w8,w27,w27,ror#14 + and w17,w20,w27 + bic w28,w21,w27 + add w22,w22,w0 // h+=X[i] + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w23,w24 // a^b, b^c in next round + eor w16,w16,w8,ror#11 // Sigma1(e) + ror w8,w23,#2 + add w22,w22,w17 // h+=Ch(e,f,g) + eor w17,w23,w23,ror#9 + add w22,w22,w16 // h+=Sigma1(e) + and w19,w19,w28 // (b^c)&=(a^b) + add w26,w26,w22 // d+=h + eor w19,w19,w24 // Maj(a,b,c) + eor w17,w8,w17,ror#13 // Sigma0(a) + add w22,w22,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + //add w22,w22,w17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev w1,w1 // 14 +#endif + ldr w6,[sp,#12] + add w22,w22,w17 // h+=Sigma0(a) + str w9,[sp,#8] + ror w16,w26,#6 + add w21,w21,w19 // h+=K[i] + eor w9,w26,w26,ror#14 + and w17,w27,w26 + bic w19,w20,w26 + add w21,w21,w1 // h+=X[i] + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w22,w23 // a^b, b^c in next round + eor w16,w16,w9,ror#11 // Sigma1(e) + ror w9,w22,#2 + add w21,w21,w17 // h+=Ch(e,f,g) + eor w17,w22,w22,ror#9 + add w21,w21,w16 // h+=Sigma1(e) + and w28,w28,w19 // (b^c)&=(a^b) + add w25,w25,w21 // d+=h + eor w28,w28,w23 // Maj(a,b,c) + eor w17,w9,w17,ror#13 // Sigma0(a) + add w21,w21,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + //add w21,w21,w17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev w2,w2 // 15 +#endif + ldr w7,[sp,#0] + add w21,w21,w17 // h+=Sigma0(a) + str w10,[sp,#12] + ror w16,w25,#6 + add w20,w20,w28 // h+=K[i] + ror w9,w4,#7 + and w17,w26,w25 + ror w8,w1,#17 + bic w28,w27,w25 + ror w10,w21,#2 + add w20,w20,w2 // h+=X[i] + eor w16,w16,w25,ror#11 + eor w9,w9,w4,ror#18 + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w21,w22 // a^b, b^c in next round + eor w16,w16,w25,ror#25 // Sigma1(e) + eor w10,w10,w21,ror#13 + add w20,w20,w17 // h+=Ch(e,f,g) + and w19,w19,w28 // (b^c)&=(a^b) + eor w8,w8,w1,ror#19 + eor w9,w9,w4,lsr#3 // sigma0(X[i+1]) + add w20,w20,w16 // h+=Sigma1(e) + eor w19,w19,w22 // Maj(a,b,c) + eor w17,w10,w21,ror#22 // Sigma0(a) + eor w8,w8,w1,lsr#10 // sigma1(X[i+14]) + add w3,w3,w12 + add w24,w24,w20 // d+=h + add w20,w20,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + add w3,w3,w9 + add w20,w20,w17 // h+=Sigma0(a) + add w3,w3,w8 +.Loop_16_xx: + ldr w8,[sp,#4] + str w11,[sp,#0] + ror w16,w24,#6 + add w27,w27,w19 // h+=K[i] + ror w10,w5,#7 + and w17,w25,w24 + ror w9,w2,#17 + bic w19,w26,w24 + ror w11,w20,#2 + add w27,w27,w3 // h+=X[i] + eor w16,w16,w24,ror#11 + eor w10,w10,w5,ror#18 + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w20,w21 // a^b, b^c in next round + eor w16,w16,w24,ror#25 // Sigma1(e) + eor w11,w11,w20,ror#13 + add w27,w27,w17 // h+=Ch(e,f,g) + and w28,w28,w19 // (b^c)&=(a^b) + eor w9,w9,w2,ror#19 + eor w10,w10,w5,lsr#3 // sigma0(X[i+1]) + add w27,w27,w16 // h+=Sigma1(e) + eor w28,w28,w21 // Maj(a,b,c) + eor w17,w11,w20,ror#22 // Sigma0(a) + eor w9,w9,w2,lsr#10 // sigma1(X[i+14]) + add w4,w4,w13 + add w23,w23,w27 // d+=h + add w27,w27,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + add w4,w4,w10 + add w27,w27,w17 // h+=Sigma0(a) + add w4,w4,w9 + ldr w9,[sp,#8] + str w12,[sp,#4] + ror w16,w23,#6 + add w26,w26,w28 // h+=K[i] + ror w11,w6,#7 + and w17,w24,w23 + ror w10,w3,#17 + bic w28,w25,w23 + ror w12,w27,#2 + add w26,w26,w4 // h+=X[i] + eor w16,w16,w23,ror#11 + eor w11,w11,w6,ror#18 + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w27,w20 // a^b, b^c in next round + eor w16,w16,w23,ror#25 // Sigma1(e) + eor w12,w12,w27,ror#13 + add w26,w26,w17 // h+=Ch(e,f,g) + and w19,w19,w28 // (b^c)&=(a^b) + eor w10,w10,w3,ror#19 + eor w11,w11,w6,lsr#3 // sigma0(X[i+1]) + add w26,w26,w16 // h+=Sigma1(e) + eor w19,w19,w20 // Maj(a,b,c) + eor w17,w12,w27,ror#22 // Sigma0(a) + eor w10,w10,w3,lsr#10 // sigma1(X[i+14]) + add w5,w5,w14 + add w22,w22,w26 // d+=h + add w26,w26,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + add w5,w5,w11 + add w26,w26,w17 // h+=Sigma0(a) + add w5,w5,w10 + ldr w10,[sp,#12] + str w13,[sp,#8] + ror w16,w22,#6 + add w25,w25,w19 // h+=K[i] + ror w12,w7,#7 + and w17,w23,w22 + ror w11,w4,#17 + bic w19,w24,w22 + ror w13,w26,#2 + add w25,w25,w5 // h+=X[i] + eor w16,w16,w22,ror#11 + eor w12,w12,w7,ror#18 + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w26,w27 // a^b, b^c in next round + eor w16,w16,w22,ror#25 // Sigma1(e) + eor w13,w13,w26,ror#13 + add w25,w25,w17 // h+=Ch(e,f,g) + and w28,w28,w19 // (b^c)&=(a^b) + eor w11,w11,w4,ror#19 + eor w12,w12,w7,lsr#3 // sigma0(X[i+1]) + add w25,w25,w16 // h+=Sigma1(e) + eor w28,w28,w27 // Maj(a,b,c) + eor w17,w13,w26,ror#22 // Sigma0(a) + eor w11,w11,w4,lsr#10 // sigma1(X[i+14]) + add w6,w6,w15 + add w21,w21,w25 // d+=h + add w25,w25,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + add w6,w6,w12 + add w25,w25,w17 // h+=Sigma0(a) + add w6,w6,w11 + ldr w11,[sp,#0] + str w14,[sp,#12] + ror w16,w21,#6 + add w24,w24,w28 // h+=K[i] + ror w13,w8,#7 + and w17,w22,w21 + ror w12,w5,#17 + bic w28,w23,w21 + ror w14,w25,#2 + add w24,w24,w6 // h+=X[i] + eor w16,w16,w21,ror#11 + eor w13,w13,w8,ror#18 + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w25,w26 // a^b, b^c in next round + eor w16,w16,w21,ror#25 // Sigma1(e) + eor w14,w14,w25,ror#13 + add w24,w24,w17 // h+=Ch(e,f,g) + and w19,w19,w28 // (b^c)&=(a^b) + eor w12,w12,w5,ror#19 + eor w13,w13,w8,lsr#3 // sigma0(X[i+1]) + add w24,w24,w16 // h+=Sigma1(e) + eor w19,w19,w26 // Maj(a,b,c) + eor w17,w14,w25,ror#22 // Sigma0(a) + eor w12,w12,w5,lsr#10 // sigma1(X[i+14]) + add w7,w7,w0 + add w20,w20,w24 // d+=h + add w24,w24,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + add w7,w7,w13 + add w24,w24,w17 // h+=Sigma0(a) + add w7,w7,w12 + ldr w12,[sp,#4] + str w15,[sp,#0] + ror w16,w20,#6 + add w23,w23,w19 // h+=K[i] + ror w14,w9,#7 + and w17,w21,w20 + ror w13,w6,#17 + bic w19,w22,w20 + ror w15,w24,#2 + add w23,w23,w7 // h+=X[i] + eor w16,w16,w20,ror#11 + eor w14,w14,w9,ror#18 + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w24,w25 // a^b, b^c in next round + eor w16,w16,w20,ror#25 // Sigma1(e) + eor w15,w15,w24,ror#13 + add w23,w23,w17 // h+=Ch(e,f,g) + and w28,w28,w19 // (b^c)&=(a^b) + eor w13,w13,w6,ror#19 + eor w14,w14,w9,lsr#3 // sigma0(X[i+1]) + add w23,w23,w16 // h+=Sigma1(e) + eor w28,w28,w25 // Maj(a,b,c) + eor w17,w15,w24,ror#22 // Sigma0(a) + eor w13,w13,w6,lsr#10 // sigma1(X[i+14]) + add w8,w8,w1 + add w27,w27,w23 // d+=h + add w23,w23,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + add w8,w8,w14 + add w23,w23,w17 // h+=Sigma0(a) + add w8,w8,w13 + ldr w13,[sp,#8] + str w0,[sp,#4] + ror w16,w27,#6 + add w22,w22,w28 // h+=K[i] + ror w15,w10,#7 + and w17,w20,w27 + ror w14,w7,#17 + bic w28,w21,w27 + ror w0,w23,#2 + add w22,w22,w8 // h+=X[i] + eor w16,w16,w27,ror#11 + eor w15,w15,w10,ror#18 + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w23,w24 // a^b, b^c in next round + eor w16,w16,w27,ror#25 // Sigma1(e) + eor w0,w0,w23,ror#13 + add w22,w22,w17 // h+=Ch(e,f,g) + and w19,w19,w28 // (b^c)&=(a^b) + eor w14,w14,w7,ror#19 + eor w15,w15,w10,lsr#3 // sigma0(X[i+1]) + add w22,w22,w16 // h+=Sigma1(e) + eor w19,w19,w24 // Maj(a,b,c) + eor w17,w0,w23,ror#22 // Sigma0(a) + eor w14,w14,w7,lsr#10 // sigma1(X[i+14]) + add w9,w9,w2 + add w26,w26,w22 // d+=h + add w22,w22,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + add w9,w9,w15 + add w22,w22,w17 // h+=Sigma0(a) + add w9,w9,w14 + ldr w14,[sp,#12] + str w1,[sp,#8] + ror w16,w26,#6 + add w21,w21,w19 // h+=K[i] + ror w0,w11,#7 + and w17,w27,w26 + ror w15,w8,#17 + bic w19,w20,w26 + ror w1,w22,#2 + add w21,w21,w9 // h+=X[i] + eor w16,w16,w26,ror#11 + eor w0,w0,w11,ror#18 + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w22,w23 // a^b, b^c in next round + eor w16,w16,w26,ror#25 // Sigma1(e) + eor w1,w1,w22,ror#13 + add w21,w21,w17 // h+=Ch(e,f,g) + and w28,w28,w19 // (b^c)&=(a^b) + eor w15,w15,w8,ror#19 + eor w0,w0,w11,lsr#3 // sigma0(X[i+1]) + add w21,w21,w16 // h+=Sigma1(e) + eor w28,w28,w23 // Maj(a,b,c) + eor w17,w1,w22,ror#22 // Sigma0(a) + eor w15,w15,w8,lsr#10 // sigma1(X[i+14]) + add w10,w10,w3 + add w25,w25,w21 // d+=h + add w21,w21,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + add w10,w10,w0 + add w21,w21,w17 // h+=Sigma0(a) + add w10,w10,w15 + ldr w15,[sp,#0] + str w2,[sp,#12] + ror w16,w25,#6 + add w20,w20,w28 // h+=K[i] + ror w1,w12,#7 + and w17,w26,w25 + ror w0,w9,#17 + bic w28,w27,w25 + ror w2,w21,#2 + add w20,w20,w10 // h+=X[i] + eor w16,w16,w25,ror#11 + eor w1,w1,w12,ror#18 + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w21,w22 // a^b, b^c in next round + eor w16,w16,w25,ror#25 // Sigma1(e) + eor w2,w2,w21,ror#13 + add w20,w20,w17 // h+=Ch(e,f,g) + and w19,w19,w28 // (b^c)&=(a^b) + eor w0,w0,w9,ror#19 + eor w1,w1,w12,lsr#3 // sigma0(X[i+1]) + add w20,w20,w16 // h+=Sigma1(e) + eor w19,w19,w22 // Maj(a,b,c) + eor w17,w2,w21,ror#22 // Sigma0(a) + eor w0,w0,w9,lsr#10 // sigma1(X[i+14]) + add w11,w11,w4 + add w24,w24,w20 // d+=h + add w20,w20,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + add w11,w11,w1 + add w20,w20,w17 // h+=Sigma0(a) + add w11,w11,w0 + ldr w0,[sp,#4] + str w3,[sp,#0] + ror w16,w24,#6 + add w27,w27,w19 // h+=K[i] + ror w2,w13,#7 + and w17,w25,w24 + ror w1,w10,#17 + bic w19,w26,w24 + ror w3,w20,#2 + add w27,w27,w11 // h+=X[i] + eor w16,w16,w24,ror#11 + eor w2,w2,w13,ror#18 + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w20,w21 // a^b, b^c in next round + eor w16,w16,w24,ror#25 // Sigma1(e) + eor w3,w3,w20,ror#13 + add w27,w27,w17 // h+=Ch(e,f,g) + and w28,w28,w19 // (b^c)&=(a^b) + eor w1,w1,w10,ror#19 + eor w2,w2,w13,lsr#3 // sigma0(X[i+1]) + add w27,w27,w16 // h+=Sigma1(e) + eor w28,w28,w21 // Maj(a,b,c) + eor w17,w3,w20,ror#22 // Sigma0(a) + eor w1,w1,w10,lsr#10 // sigma1(X[i+14]) + add w12,w12,w5 + add w23,w23,w27 // d+=h + add w27,w27,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + add w12,w12,w2 + add w27,w27,w17 // h+=Sigma0(a) + add w12,w12,w1 + ldr w1,[sp,#8] + str w4,[sp,#4] + ror w16,w23,#6 + add w26,w26,w28 // h+=K[i] + ror w3,w14,#7 + and w17,w24,w23 + ror w2,w11,#17 + bic w28,w25,w23 + ror w4,w27,#2 + add w26,w26,w12 // h+=X[i] + eor w16,w16,w23,ror#11 + eor w3,w3,w14,ror#18 + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w27,w20 // a^b, b^c in next round + eor w16,w16,w23,ror#25 // Sigma1(e) + eor w4,w4,w27,ror#13 + add w26,w26,w17 // h+=Ch(e,f,g) + and w19,w19,w28 // (b^c)&=(a^b) + eor w2,w2,w11,ror#19 + eor w3,w3,w14,lsr#3 // sigma0(X[i+1]) + add w26,w26,w16 // h+=Sigma1(e) + eor w19,w19,w20 // Maj(a,b,c) + eor w17,w4,w27,ror#22 // Sigma0(a) + eor w2,w2,w11,lsr#10 // sigma1(X[i+14]) + add w13,w13,w6 + add w22,w22,w26 // d+=h + add w26,w26,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + add w13,w13,w3 + add w26,w26,w17 // h+=Sigma0(a) + add w13,w13,w2 + ldr w2,[sp,#12] + str w5,[sp,#8] + ror w16,w22,#6 + add w25,w25,w19 // h+=K[i] + ror w4,w15,#7 + and w17,w23,w22 + ror w3,w12,#17 + bic w19,w24,w22 + ror w5,w26,#2 + add w25,w25,w13 // h+=X[i] + eor w16,w16,w22,ror#11 + eor w4,w4,w15,ror#18 + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w26,w27 // a^b, b^c in next round + eor w16,w16,w22,ror#25 // Sigma1(e) + eor w5,w5,w26,ror#13 + add w25,w25,w17 // h+=Ch(e,f,g) + and w28,w28,w19 // (b^c)&=(a^b) + eor w3,w3,w12,ror#19 + eor w4,w4,w15,lsr#3 // sigma0(X[i+1]) + add w25,w25,w16 // h+=Sigma1(e) + eor w28,w28,w27 // Maj(a,b,c) + eor w17,w5,w26,ror#22 // Sigma0(a) + eor w3,w3,w12,lsr#10 // sigma1(X[i+14]) + add w14,w14,w7 + add w21,w21,w25 // d+=h + add w25,w25,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + add w14,w14,w4 + add w25,w25,w17 // h+=Sigma0(a) + add w14,w14,w3 + ldr w3,[sp,#0] + str w6,[sp,#12] + ror w16,w21,#6 + add w24,w24,w28 // h+=K[i] + ror w5,w0,#7 + and w17,w22,w21 + ror w4,w13,#17 + bic w28,w23,w21 + ror w6,w25,#2 + add w24,w24,w14 // h+=X[i] + eor w16,w16,w21,ror#11 + eor w5,w5,w0,ror#18 + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w25,w26 // a^b, b^c in next round + eor w16,w16,w21,ror#25 // Sigma1(e) + eor w6,w6,w25,ror#13 + add w24,w24,w17 // h+=Ch(e,f,g) + and w19,w19,w28 // (b^c)&=(a^b) + eor w4,w4,w13,ror#19 + eor w5,w5,w0,lsr#3 // sigma0(X[i+1]) + add w24,w24,w16 // h+=Sigma1(e) + eor w19,w19,w26 // Maj(a,b,c) + eor w17,w6,w25,ror#22 // Sigma0(a) + eor w4,w4,w13,lsr#10 // sigma1(X[i+14]) + add w15,w15,w8 + add w20,w20,w24 // d+=h + add w24,w24,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + add w15,w15,w5 + add w24,w24,w17 // h+=Sigma0(a) + add w15,w15,w4 + ldr w4,[sp,#4] + str w7,[sp,#0] + ror w16,w20,#6 + add w23,w23,w19 // h+=K[i] + ror w6,w1,#7 + and w17,w21,w20 + ror w5,w14,#17 + bic w19,w22,w20 + ror w7,w24,#2 + add w23,w23,w15 // h+=X[i] + eor w16,w16,w20,ror#11 + eor w6,w6,w1,ror#18 + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w24,w25 // a^b, b^c in next round + eor w16,w16,w20,ror#25 // Sigma1(e) + eor w7,w7,w24,ror#13 + add w23,w23,w17 // h+=Ch(e,f,g) + and w28,w28,w19 // (b^c)&=(a^b) + eor w5,w5,w14,ror#19 + eor w6,w6,w1,lsr#3 // sigma0(X[i+1]) + add w23,w23,w16 // h+=Sigma1(e) + eor w28,w28,w25 // Maj(a,b,c) + eor w17,w7,w24,ror#22 // Sigma0(a) + eor w5,w5,w14,lsr#10 // sigma1(X[i+14]) + add w0,w0,w9 + add w27,w27,w23 // d+=h + add w23,w23,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + add w0,w0,w6 + add w23,w23,w17 // h+=Sigma0(a) + add w0,w0,w5 + ldr w5,[sp,#8] + str w8,[sp,#4] + ror w16,w27,#6 + add w22,w22,w28 // h+=K[i] + ror w7,w2,#7 + and w17,w20,w27 + ror w6,w15,#17 + bic w28,w21,w27 + ror w8,w23,#2 + add w22,w22,w0 // h+=X[i] + eor w16,w16,w27,ror#11 + eor w7,w7,w2,ror#18 + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w23,w24 // a^b, b^c in next round + eor w16,w16,w27,ror#25 // Sigma1(e) + eor w8,w8,w23,ror#13 + add w22,w22,w17 // h+=Ch(e,f,g) + and w19,w19,w28 // (b^c)&=(a^b) + eor w6,w6,w15,ror#19 + eor w7,w7,w2,lsr#3 // sigma0(X[i+1]) + add w22,w22,w16 // h+=Sigma1(e) + eor w19,w19,w24 // Maj(a,b,c) + eor w17,w8,w23,ror#22 // Sigma0(a) + eor w6,w6,w15,lsr#10 // sigma1(X[i+14]) + add w1,w1,w10 + add w26,w26,w22 // d+=h + add w22,w22,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + add w1,w1,w7 + add w22,w22,w17 // h+=Sigma0(a) + add w1,w1,w6 + ldr w6,[sp,#12] + str w9,[sp,#8] + ror w16,w26,#6 + add w21,w21,w19 // h+=K[i] + ror w8,w3,#7 + and w17,w27,w26 + ror w7,w0,#17 + bic w19,w20,w26 + ror w9,w22,#2 + add w21,w21,w1 // h+=X[i] + eor w16,w16,w26,ror#11 + eor w8,w8,w3,ror#18 + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w22,w23 // a^b, b^c in next round + eor w16,w16,w26,ror#25 // Sigma1(e) + eor w9,w9,w22,ror#13 + add w21,w21,w17 // h+=Ch(e,f,g) + and w28,w28,w19 // (b^c)&=(a^b) + eor w7,w7,w0,ror#19 + eor w8,w8,w3,lsr#3 // sigma0(X[i+1]) + add w21,w21,w16 // h+=Sigma1(e) + eor w28,w28,w23 // Maj(a,b,c) + eor w17,w9,w22,ror#22 // Sigma0(a) + eor w7,w7,w0,lsr#10 // sigma1(X[i+14]) + add w2,w2,w11 + add w25,w25,w21 // d+=h + add w21,w21,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + add w2,w2,w8 + add w21,w21,w17 // h+=Sigma0(a) + add w2,w2,w7 + ldr w7,[sp,#0] + str w10,[sp,#12] + ror w16,w25,#6 + add w20,w20,w28 // h+=K[i] + ror w9,w4,#7 + and w17,w26,w25 + ror w8,w1,#17 + bic w28,w27,w25 + ror w10,w21,#2 + add w20,w20,w2 // h+=X[i] + eor w16,w16,w25,ror#11 + eor w9,w9,w4,ror#18 + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w21,w22 // a^b, b^c in next round + eor w16,w16,w25,ror#25 // Sigma1(e) + eor w10,w10,w21,ror#13 + add w20,w20,w17 // h+=Ch(e,f,g) + and w19,w19,w28 // (b^c)&=(a^b) + eor w8,w8,w1,ror#19 + eor w9,w9,w4,lsr#3 // sigma0(X[i+1]) + add w20,w20,w16 // h+=Sigma1(e) + eor w19,w19,w22 // Maj(a,b,c) + eor w17,w10,w21,ror#22 // Sigma0(a) + eor w8,w8,w1,lsr#10 // sigma1(X[i+14]) + add w3,w3,w12 + add w24,w24,w20 // d+=h + add w20,w20,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + add w3,w3,w9 + add w20,w20,w17 // h+=Sigma0(a) + add w3,w3,w8 + cbnz w19,.Loop_16_xx + + ldp x0,x2,[x29,#96] + ldr x1,[x29,#112] + sub x30,x30,#260 // rewind + + ldp w3,w4,[x0] + ldp w5,w6,[x0,#2*4] + add x1,x1,#14*4 // advance input pointer + ldp w7,w8,[x0,#4*4] + add w20,w20,w3 + ldp w9,w10,[x0,#6*4] + add w21,w21,w4 + add w22,w22,w5 + add w23,w23,w6 + stp w20,w21,[x0] + add w24,w24,w7 + add w25,w25,w8 + stp w22,w23,[x0,#2*4] + add w26,w26,w9 + add w27,w27,w10 + cmp x1,x2 + stp w24,w25,[x0,#4*4] + stp w26,w27,[x0,#6*4] + b.ne .Loop + + ldp x19,x20,[x29,#16] + add sp,sp,#4*4 + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldp x29,x30,[sp],#128 + ret +.size sha256_block_data_order,.-sha256_block_data_order + +.align 6 +.type .LK256,%object +.LK256: + .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 + .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 + .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 + .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 + .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc + .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da + .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 + .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 + .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 + .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 + .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 + .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 + .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 + .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 + .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 + .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 + .long 0 //terminator +.size .LK256,.-.LK256 +#ifndef __KERNEL__ +.align 3 +.LOPENSSL_armcap_P: +# ifdef __ILP32__ + .long OPENSSL_armcap_P-. +# else + .quad OPENSSL_armcap_P-. +# endif +#endif +.asciz "SHA256 block transform for ARMv8, CRYPTOGAMS by <appro@openssl.org>" +.align 2 +#ifndef __KERNEL__ +.type sha256_block_armv8,%function +.align 6 +sha256_block_armv8: +.Lv8_entry: + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + + ld1 {v0.4s,v1.4s},[x0] + adr x3,.LK256 + +.Loop_hw: + ld1 {v4.16b-v7.16b},[x1],#64 + sub x2,x2,#1 + ld1 {v16.4s},[x3],#16 + rev32 v4.16b,v4.16b + rev32 v5.16b,v5.16b + rev32 v6.16b,v6.16b + rev32 v7.16b,v7.16b + orr v18.16b,v0.16b,v0.16b // offload + orr v19.16b,v1.16b,v1.16b + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v4.4s + .inst 0x5e2828a4 //sha256su0 v4.16b,v5.16b + orr v2.16b,v0.16b,v0.16b + .inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s + .inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s + .inst 0x5e0760c4 //sha256su1 v4.16b,v6.16b,v7.16b + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v5.4s + .inst 0x5e2828c5 //sha256su0 v5.16b,v6.16b + orr v2.16b,v0.16b,v0.16b + .inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s + .inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s + .inst 0x5e0460e5 //sha256su1 v5.16b,v7.16b,v4.16b + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v6.4s + .inst 0x5e2828e6 //sha256su0 v6.16b,v7.16b + orr v2.16b,v0.16b,v0.16b + .inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s + .inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s + .inst 0x5e056086 //sha256su1 v6.16b,v4.16b,v5.16b + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v7.4s + .inst 0x5e282887 //sha256su0 v7.16b,v4.16b + orr v2.16b,v0.16b,v0.16b + .inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s + .inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s + .inst 0x5e0660a7 //sha256su1 v7.16b,v5.16b,v6.16b + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v4.4s + .inst 0x5e2828a4 //sha256su0 v4.16b,v5.16b + orr v2.16b,v0.16b,v0.16b + .inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s + .inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s + .inst 0x5e0760c4 //sha256su1 v4.16b,v6.16b,v7.16b + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v5.4s + .inst 0x5e2828c5 //sha256su0 v5.16b,v6.16b + orr v2.16b,v0.16b,v0.16b + .inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s + .inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s + .inst 0x5e0460e5 //sha256su1 v5.16b,v7.16b,v4.16b + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v6.4s + .inst 0x5e2828e6 //sha256su0 v6.16b,v7.16b + orr v2.16b,v0.16b,v0.16b + .inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s + .inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s + .inst 0x5e056086 //sha256su1 v6.16b,v4.16b,v5.16b + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v7.4s + .inst 0x5e282887 //sha256su0 v7.16b,v4.16b + orr v2.16b,v0.16b,v0.16b + .inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s + .inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s + .inst 0x5e0660a7 //sha256su1 v7.16b,v5.16b,v6.16b + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v4.4s + .inst 0x5e2828a4 //sha256su0 v4.16b,v5.16b + orr v2.16b,v0.16b,v0.16b + .inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s + .inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s + .inst 0x5e0760c4 //sha256su1 v4.16b,v6.16b,v7.16b + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v5.4s + .inst 0x5e2828c5 //sha256su0 v5.16b,v6.16b + orr v2.16b,v0.16b,v0.16b + .inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s + .inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s + .inst 0x5e0460e5 //sha256su1 v5.16b,v7.16b,v4.16b + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v6.4s + .inst 0x5e2828e6 //sha256su0 v6.16b,v7.16b + orr v2.16b,v0.16b,v0.16b + .inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s + .inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s + .inst 0x5e056086 //sha256su1 v6.16b,v4.16b,v5.16b + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v7.4s + .inst 0x5e282887 //sha256su0 v7.16b,v4.16b + orr v2.16b,v0.16b,v0.16b + .inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s + .inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s + .inst 0x5e0660a7 //sha256su1 v7.16b,v5.16b,v6.16b + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v4.4s + orr v2.16b,v0.16b,v0.16b + .inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s + .inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s + + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v5.4s + orr v2.16b,v0.16b,v0.16b + .inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s + .inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s + + ld1 {v17.4s},[x3] + add v16.4s,v16.4s,v6.4s + sub x3,x3,#64*4-16 // rewind + orr v2.16b,v0.16b,v0.16b + .inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s + .inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s + + add v17.4s,v17.4s,v7.4s + orr v2.16b,v0.16b,v0.16b + .inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s + .inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s + + add v0.4s,v0.4s,v18.4s + add v1.4s,v1.4s,v19.4s + + cbnz x2,.Loop_hw + + st1 {v0.4s,v1.4s},[x0] + + ldr x29,[sp],#16 + ret +.size sha256_block_armv8,.-sha256_block_armv8 +#endif +#ifdef __KERNEL__ +.globl sha256_block_neon +#endif +.type sha256_block_neon,%function +.align 4 +sha256_block_neon: +.Lneon_entry: + stp x29, x30, [sp, #-16]! + mov x29, sp + sub sp,sp,#16*4 + + adr x16,.LK256 + add x2,x1,x2,lsl#6 // len to point at the end of inp + + ld1 {v0.16b},[x1], #16 + ld1 {v1.16b},[x1], #16 + ld1 {v2.16b},[x1], #16 + ld1 {v3.16b},[x1], #16 + ld1 {v4.4s},[x16], #16 + ld1 {v5.4s},[x16], #16 + ld1 {v6.4s},[x16], #16 + ld1 {v7.4s},[x16], #16 + rev32 v0.16b,v0.16b // yes, even on + rev32 v1.16b,v1.16b // big-endian + rev32 v2.16b,v2.16b + rev32 v3.16b,v3.16b + mov x17,sp + add v4.4s,v4.4s,v0.4s + add v5.4s,v5.4s,v1.4s + add v6.4s,v6.4s,v2.4s + st1 {v4.4s-v5.4s},[x17], #32 + add v7.4s,v7.4s,v3.4s + st1 {v6.4s-v7.4s},[x17] + sub x17,x17,#32 + + ldp w3,w4,[x0] + ldp w5,w6,[x0,#8] + ldp w7,w8,[x0,#16] + ldp w9,w10,[x0,#24] + ldr w12,[sp,#0] + mov w13,wzr + eor w14,w4,w5 + mov w15,wzr + b .L_00_48 + +.align 4 +.L_00_48: + ext v4.16b,v0.16b,v1.16b,#4 + add w10,w10,w12 + add w3,w3,w15 + and w12,w8,w7 + bic w15,w9,w7 + ext v7.16b,v2.16b,v3.16b,#4 + eor w11,w7,w7,ror#5 + add w3,w3,w13 + mov d19,v3.d[1] + orr w12,w12,w15 + eor w11,w11,w7,ror#19 + ushr v6.4s,v4.4s,#7 + eor w15,w3,w3,ror#11 + ushr v5.4s,v4.4s,#3 + add w10,w10,w12 + add v0.4s,v0.4s,v7.4s + ror w11,w11,#6 + sli v6.4s,v4.4s,#25 + eor w13,w3,w4 + eor w15,w15,w3,ror#20 + ushr v7.4s,v4.4s,#18 + add w10,w10,w11 + ldr w12,[sp,#4] + and w14,w14,w13 + eor v5.16b,v5.16b,v6.16b + ror w15,w15,#2 + add w6,w6,w10 + sli v7.4s,v4.4s,#14 + eor w14,w14,w4 + ushr v16.4s,v19.4s,#17 + add w9,w9,w12 + add w10,w10,w15 + and w12,w7,w6 + eor v5.16b,v5.16b,v7.16b + bic w15,w8,w6 + eor w11,w6,w6,ror#5 + sli v16.4s,v19.4s,#15 + add w10,w10,w14 + orr w12,w12,w15 + ushr v17.4s,v19.4s,#10 + eor w11,w11,w6,ror#19 + eor w15,w10,w10,ror#11 + ushr v7.4s,v19.4s,#19 + add w9,w9,w12 + ror w11,w11,#6 + add v0.4s,v0.4s,v5.4s + eor w14,w10,w3 + eor w15,w15,w10,ror#20 + sli v7.4s,v19.4s,#13 + add w9,w9,w11 + ldr w12,[sp,#8] + and w13,w13,w14 + eor v17.16b,v17.16b,v16.16b + ror w15,w15,#2 + add w5,w5,w9 + eor w13,w13,w3 + eor v17.16b,v17.16b,v7.16b + add w8,w8,w12 + add w9,w9,w15 + and w12,w6,w5 + add v0.4s,v0.4s,v17.4s + bic w15,w7,w5 + eor w11,w5,w5,ror#5 + add w9,w9,w13 + ushr v18.4s,v0.4s,#17 + orr w12,w12,w15 + ushr v19.4s,v0.4s,#10 + eor w11,w11,w5,ror#19 + eor w15,w9,w9,ror#11 + sli v18.4s,v0.4s,#15 + add w8,w8,w12 + ushr v17.4s,v0.4s,#19 + ror w11,w11,#6 + eor w13,w9,w10 + eor v19.16b,v19.16b,v18.16b + eor w15,w15,w9,ror#20 + add w8,w8,w11 + sli v17.4s,v0.4s,#13 + ldr w12,[sp,#12] + and w14,w14,w13 + ror w15,w15,#2 + ld1 {v4.4s},[x16], #16 + add w4,w4,w8 + eor v19.16b,v19.16b,v17.16b + eor w14,w14,w10 + eor v17.16b,v17.16b,v17.16b + add w7,w7,w12 + add w8,w8,w15 + and w12,w5,w4 + mov v17.d[1],v19.d[0] + bic w15,w6,w4 + eor w11,w4,w4,ror#5 + add w8,w8,w14 + add v0.4s,v0.4s,v17.4s + orr w12,w12,w15 + eor w11,w11,w4,ror#19 + eor w15,w8,w8,ror#11 + add v4.4s,v4.4s,v0.4s + add w7,w7,w12 + ror w11,w11,#6 + eor w14,w8,w9 + eor w15,w15,w8,ror#20 + add w7,w7,w11 + ldr w12,[sp,#16] + and w13,w13,w14 + ror w15,w15,#2 + add w3,w3,w7 + eor w13,w13,w9 + st1 {v4.4s},[x17], #16 + ext v4.16b,v1.16b,v2.16b,#4 + add w6,w6,w12 + add w7,w7,w15 + and w12,w4,w3 + bic w15,w5,w3 + ext v7.16b,v3.16b,v0.16b,#4 + eor w11,w3,w3,ror#5 + add w7,w7,w13 + mov d19,v0.d[1] + orr w12,w12,w15 + eor w11,w11,w3,ror#19 + ushr v6.4s,v4.4s,#7 + eor w15,w7,w7,ror#11 + ushr v5.4s,v4.4s,#3 + add w6,w6,w12 + add v1.4s,v1.4s,v7.4s + ror w11,w11,#6 + sli v6.4s,v4.4s,#25 + eor w13,w7,w8 + eor w15,w15,w7,ror#20 + ushr v7.4s,v4.4s,#18 + add w6,w6,w11 + ldr w12,[sp,#20] + and w14,w14,w13 + eor v5.16b,v5.16b,v6.16b + ror w15,w15,#2 + add w10,w10,w6 + sli v7.4s,v4.4s,#14 + eor w14,w14,w8 + ushr v16.4s,v19.4s,#17 + add w5,w5,w12 + add w6,w6,w15 + and w12,w3,w10 + eor v5.16b,v5.16b,v7.16b + bic w15,w4,w10 + eor w11,w10,w10,ror#5 + sli v16.4s,v19.4s,#15 + add w6,w6,w14 + orr w12,w12,w15 + ushr v17.4s,v19.4s,#10 + eor w11,w11,w10,ror#19 + eor w15,w6,w6,ror#11 + ushr v7.4s,v19.4s,#19 + add w5,w5,w12 + ror w11,w11,#6 + add v1.4s,v1.4s,v5.4s + eor w14,w6,w7 + eor w15,w15,w6,ror#20 + sli v7.4s,v19.4s,#13 + add w5,w5,w11 + ldr w12,[sp,#24] + and w13,w13,w14 + eor v17.16b,v17.16b,v16.16b + ror w15,w15,#2 + add w9,w9,w5 + eor w13,w13,w7 + eor v17.16b,v17.16b,v7.16b + add w4,w4,w12 + add w5,w5,w15 + and w12,w10,w9 + add v1.4s,v1.4s,v17.4s + bic w15,w3,w9 + eor w11,w9,w9,ror#5 + add w5,w5,w13 + ushr v18.4s,v1.4s,#17 + orr w12,w12,w15 + ushr v19.4s,v1.4s,#10 + eor w11,w11,w9,ror#19 + eor w15,w5,w5,ror#11 + sli v18.4s,v1.4s,#15 + add w4,w4,w12 + ushr v17.4s,v1.4s,#19 + ror w11,w11,#6 + eor w13,w5,w6 + eor v19.16b,v19.16b,v18.16b + eor w15,w15,w5,ror#20 + add w4,w4,w11 + sli v17.4s,v1.4s,#13 + ldr w12,[sp,#28] + and w14,w14,w13 + ror w15,w15,#2 + ld1 {v4.4s},[x16], #16 + add w8,w8,w4 + eor v19.16b,v19.16b,v17.16b + eor w14,w14,w6 + eor v17.16b,v17.16b,v17.16b + add w3,w3,w12 + add w4,w4,w15 + and w12,w9,w8 + mov v17.d[1],v19.d[0] + bic w15,w10,w8 + eor w11,w8,w8,ror#5 + add w4,w4,w14 + add v1.4s,v1.4s,v17.4s + orr w12,w12,w15 + eor w11,w11,w8,ror#19 + eor w15,w4,w4,ror#11 + add v4.4s,v4.4s,v1.4s + add w3,w3,w12 + ror w11,w11,#6 + eor w14,w4,w5 + eor w15,w15,w4,ror#20 + add w3,w3,w11 + ldr w12,[sp,#32] + and w13,w13,w14 + ror w15,w15,#2 + add w7,w7,w3 + eor w13,w13,w5 + st1 {v4.4s},[x17], #16 + ext v4.16b,v2.16b,v3.16b,#4 + add w10,w10,w12 + add w3,w3,w15 + and w12,w8,w7 + bic w15,w9,w7 + ext v7.16b,v0.16b,v1.16b,#4 + eor w11,w7,w7,ror#5 + add w3,w3,w13 + mov d19,v1.d[1] + orr w12,w12,w15 + eor w11,w11,w7,ror#19 + ushr v6.4s,v4.4s,#7 + eor w15,w3,w3,ror#11 + ushr v5.4s,v4.4s,#3 + add w10,w10,w12 + add v2.4s,v2.4s,v7.4s + ror w11,w11,#6 + sli v6.4s,v4.4s,#25 + eor w13,w3,w4 + eor w15,w15,w3,ror#20 + ushr v7.4s,v4.4s,#18 + add w10,w10,w11 + ldr w12,[sp,#36] + and w14,w14,w13 + eor v5.16b,v5.16b,v6.16b + ror w15,w15,#2 + add w6,w6,w10 + sli v7.4s,v4.4s,#14 + eor w14,w14,w4 + ushr v16.4s,v19.4s,#17 + add w9,w9,w12 + add w10,w10,w15 + and w12,w7,w6 + eor v5.16b,v5.16b,v7.16b + bic w15,w8,w6 + eor w11,w6,w6,ror#5 + sli v16.4s,v19.4s,#15 + add w10,w10,w14 + orr w12,w12,w15 + ushr v17.4s,v19.4s,#10 + eor w11,w11,w6,ror#19 + eor w15,w10,w10,ror#11 + ushr v7.4s,v19.4s,#19 + add w9,w9,w12 + ror w11,w11,#6 + add v2.4s,v2.4s,v5.4s + eor w14,w10,w3 + eor w15,w15,w10,ror#20 + sli v7.4s,v19.4s,#13 + add w9,w9,w11 + ldr w12,[sp,#40] + and w13,w13,w14 + eor v17.16b,v17.16b,v16.16b + ror w15,w15,#2 + add w5,w5,w9 + eor w13,w13,w3 + eor v17.16b,v17.16b,v7.16b + add w8,w8,w12 + add w9,w9,w15 + and w12,w6,w5 + add v2.4s,v2.4s,v17.4s + bic w15,w7,w5 + eor w11,w5,w5,ror#5 + add w9,w9,w13 + ushr v18.4s,v2.4s,#17 + orr w12,w12,w15 + ushr v19.4s,v2.4s,#10 + eor w11,w11,w5,ror#19 + eor w15,w9,w9,ror#11 + sli v18.4s,v2.4s,#15 + add w8,w8,w12 + ushr v17.4s,v2.4s,#19 + ror w11,w11,#6 + eor w13,w9,w10 + eor v19.16b,v19.16b,v18.16b + eor w15,w15,w9,ror#20 + add w8,w8,w11 + sli v17.4s,v2.4s,#13 + ldr w12,[sp,#44] + and w14,w14,w13 + ror w15,w15,#2 + ld1 {v4.4s},[x16], #16 + add w4,w4,w8 + eor v19.16b,v19.16b,v17.16b + eor w14,w14,w10 + eor v17.16b,v17.16b,v17.16b + add w7,w7,w12 + add w8,w8,w15 + and w12,w5,w4 + mov v17.d[1],v19.d[0] + bic w15,w6,w4 + eor w11,w4,w4,ror#5 + add w8,w8,w14 + add v2.4s,v2.4s,v17.4s + orr w12,w12,w15 + eor w11,w11,w4,ror#19 + eor w15,w8,w8,ror#11 + add v4.4s,v4.4s,v2.4s + add w7,w7,w12 + ror w11,w11,#6 + eor w14,w8,w9 + eor w15,w15,w8,ror#20 + add w7,w7,w11 + ldr w12,[sp,#48] + and w13,w13,w14 + ror w15,w15,#2 + add w3,w3,w7 + eor w13,w13,w9 + st1 {v4.4s},[x17], #16 + ext v4.16b,v3.16b,v0.16b,#4 + add w6,w6,w12 + add w7,w7,w15 + and w12,w4,w3 + bic w15,w5,w3 + ext v7.16b,v1.16b,v2.16b,#4 + eor w11,w3,w3,ror#5 + add w7,w7,w13 + mov d19,v2.d[1] + orr w12,w12,w15 + eor w11,w11,w3,ror#19 + ushr v6.4s,v4.4s,#7 + eor w15,w7,w7,ror#11 + ushr v5.4s,v4.4s,#3 + add w6,w6,w12 + add v3.4s,v3.4s,v7.4s + ror w11,w11,#6 + sli v6.4s,v4.4s,#25 + eor w13,w7,w8 + eor w15,w15,w7,ror#20 + ushr v7.4s,v4.4s,#18 + add w6,w6,w11 + ldr w12,[sp,#52] + and w14,w14,w13 + eor v5.16b,v5.16b,v6.16b + ror w15,w15,#2 + add w10,w10,w6 + sli v7.4s,v4.4s,#14 + eor w14,w14,w8 + ushr v16.4s,v19.4s,#17 + add w5,w5,w12 + add w6,w6,w15 + and w12,w3,w10 + eor v5.16b,v5.16b,v7.16b + bic w15,w4,w10 + eor w11,w10,w10,ror#5 + sli v16.4s,v19.4s,#15 + add w6,w6,w14 + orr w12,w12,w15 + ushr v17.4s,v19.4s,#10 + eor w11,w11,w10,ror#19 + eor w15,w6,w6,ror#11 + ushr v7.4s,v19.4s,#19 + add w5,w5,w12 + ror w11,w11,#6 + add v3.4s,v3.4s,v5.4s + eor w14,w6,w7 + eor w15,w15,w6,ror#20 + sli v7.4s,v19.4s,#13 + add w5,w5,w11 + ldr w12,[sp,#56] + and w13,w13,w14 + eor v17.16b,v17.16b,v16.16b + ror w15,w15,#2 + add w9,w9,w5 + eor w13,w13,w7 + eor v17.16b,v17.16b,v7.16b + add w4,w4,w12 + add w5,w5,w15 + and w12,w10,w9 + add v3.4s,v3.4s,v17.4s + bic w15,w3,w9 + eor w11,w9,w9,ror#5 + add w5,w5,w13 + ushr v18.4s,v3.4s,#17 + orr w12,w12,w15 + ushr v19.4s,v3.4s,#10 + eor w11,w11,w9,ror#19 + eor w15,w5,w5,ror#11 + sli v18.4s,v3.4s,#15 + add w4,w4,w12 + ushr v17.4s,v3.4s,#19 + ror w11,w11,#6 + eor w13,w5,w6 + eor v19.16b,v19.16b,v18.16b + eor w15,w15,w5,ror#20 + add w4,w4,w11 + sli v17.4s,v3.4s,#13 + ldr w12,[sp,#60] + and w14,w14,w13 + ror w15,w15,#2 + ld1 {v4.4s},[x16], #16 + add w8,w8,w4 + eor v19.16b,v19.16b,v17.16b + eor w14,w14,w6 + eor v17.16b,v17.16b,v17.16b + add w3,w3,w12 + add w4,w4,w15 + and w12,w9,w8 + mov v17.d[1],v19.d[0] + bic w15,w10,w8 + eor w11,w8,w8,ror#5 + add w4,w4,w14 + add v3.4s,v3.4s,v17.4s + orr w12,w12,w15 + eor w11,w11,w8,ror#19 + eor w15,w4,w4,ror#11 + add v4.4s,v4.4s,v3.4s + add w3,w3,w12 + ror w11,w11,#6 + eor w14,w4,w5 + eor w15,w15,w4,ror#20 + add w3,w3,w11 + ldr w12,[x16] + and w13,w13,w14 + ror w15,w15,#2 + add w7,w7,w3 + eor w13,w13,w5 + st1 {v4.4s},[x17], #16 + cmp w12,#0 // check for K256 terminator + ldr w12,[sp,#0] + sub x17,x17,#64 + bne .L_00_48 + + sub x16,x16,#256 // rewind x16 + cmp x1,x2 + mov x17, #64 + csel x17, x17, xzr, eq + sub x1,x1,x17 // avoid SEGV + mov x17,sp + add w10,w10,w12 + add w3,w3,w15 + and w12,w8,w7 + ld1 {v0.16b},[x1],#16 + bic w15,w9,w7 + eor w11,w7,w7,ror#5 + ld1 {v4.4s},[x16],#16 + add w3,w3,w13 + orr w12,w12,w15 + eor w11,w11,w7,ror#19 + eor w15,w3,w3,ror#11 + rev32 v0.16b,v0.16b + add w10,w10,w12 + ror w11,w11,#6 + eor w13,w3,w4 + eor w15,w15,w3,ror#20 + add v4.4s,v4.4s,v0.4s + add w10,w10,w11 + ldr w12,[sp,#4] + and w14,w14,w13 + ror w15,w15,#2 + add w6,w6,w10 + eor w14,w14,w4 + add w9,w9,w12 + add w10,w10,w15 + and w12,w7,w6 + bic w15,w8,w6 + eor w11,w6,w6,ror#5 + add w10,w10,w14 + orr w12,w12,w15 + eor w11,w11,w6,ror#19 + eor w15,w10,w10,ror#11 + add w9,w9,w12 + ror w11,w11,#6 + eor w14,w10,w3 + eor w15,w15,w10,ror#20 + add w9,w9,w11 + ldr w12,[sp,#8] + and w13,w13,w14 + ror w15,w15,#2 + add w5,w5,w9 + eor w13,w13,w3 + add w8,w8,w12 + add w9,w9,w15 + and w12,w6,w5 + bic w15,w7,w5 + eor w11,w5,w5,ror#5 + add w9,w9,w13 + orr w12,w12,w15 + eor w11,w11,w5,ror#19 + eor w15,w9,w9,ror#11 + add w8,w8,w12 + ror w11,w11,#6 + eor w13,w9,w10 + eor w15,w15,w9,ror#20 + add w8,w8,w11 + ldr w12,[sp,#12] + and w14,w14,w13 + ror w15,w15,#2 + add w4,w4,w8 + eor w14,w14,w10 + add w7,w7,w12 + add w8,w8,w15 + and w12,w5,w4 + bic w15,w6,w4 + eor w11,w4,w4,ror#5 + add w8,w8,w14 + orr w12,w12,w15 + eor w11,w11,w4,ror#19 + eor w15,w8,w8,ror#11 + add w7,w7,w12 + ror w11,w11,#6 + eor w14,w8,w9 + eor w15,w15,w8,ror#20 + add w7,w7,w11 + ldr w12,[sp,#16] + and w13,w13,w14 + ror w15,w15,#2 + add w3,w3,w7 + eor w13,w13,w9 + st1 {v4.4s},[x17], #16 + add w6,w6,w12 + add w7,w7,w15 + and w12,w4,w3 + ld1 {v1.16b},[x1],#16 + bic w15,w5,w3 + eor w11,w3,w3,ror#5 + ld1 {v4.4s},[x16],#16 + add w7,w7,w13 + orr w12,w12,w15 + eor w11,w11,w3,ror#19 + eor w15,w7,w7,ror#11 + rev32 v1.16b,v1.16b + add w6,w6,w12 + ror w11,w11,#6 + eor w13,w7,w8 + eor w15,w15,w7,ror#20 + add v4.4s,v4.4s,v1.4s + add w6,w6,w11 + ldr w12,[sp,#20] + and w14,w14,w13 + ror w15,w15,#2 + add w10,w10,w6 + eor w14,w14,w8 + add w5,w5,w12 + add w6,w6,w15 + and w12,w3,w10 + bic w15,w4,w10 + eor w11,w10,w10,ror#5 + add w6,w6,w14 + orr w12,w12,w15 + eor w11,w11,w10,ror#19 + eor w15,w6,w6,ror#11 + add w5,w5,w12 + ror w11,w11,#6 + eor w14,w6,w7 + eor w15,w15,w6,ror#20 + add w5,w5,w11 + ldr w12,[sp,#24] + and w13,w13,w14 + ror w15,w15,#2 + add w9,w9,w5 + eor w13,w13,w7 + add w4,w4,w12 + add w5,w5,w15 + and w12,w10,w9 + bic w15,w3,w9 + eor w11,w9,w9,ror#5 + add w5,w5,w13 + orr w12,w12,w15 + eor w11,w11,w9,ror#19 + eor w15,w5,w5,ror#11 + add w4,w4,w12 + ror w11,w11,#6 + eor w13,w5,w6 + eor w15,w15,w5,ror#20 + add w4,w4,w11 + ldr w12,[sp,#28] + and w14,w14,w13 + ror w15,w15,#2 + add w8,w8,w4 + eor w14,w14,w6 + add w3,w3,w12 + add w4,w4,w15 + and w12,w9,w8 + bic w15,w10,w8 + eor w11,w8,w8,ror#5 + add w4,w4,w14 + orr w12,w12,w15 + eor w11,w11,w8,ror#19 + eor w15,w4,w4,ror#11 + add w3,w3,w12 + ror w11,w11,#6 + eor w14,w4,w5 + eor w15,w15,w4,ror#20 + add w3,w3,w11 + ldr w12,[sp,#32] + and w13,w13,w14 + ror w15,w15,#2 + add w7,w7,w3 + eor w13,w13,w5 + st1 {v4.4s},[x17], #16 + add w10,w10,w12 + add w3,w3,w15 + and w12,w8,w7 + ld1 {v2.16b},[x1],#16 + bic w15,w9,w7 + eor w11,w7,w7,ror#5 + ld1 {v4.4s},[x16],#16 + add w3,w3,w13 + orr w12,w12,w15 + eor w11,w11,w7,ror#19 + eor w15,w3,w3,ror#11 + rev32 v2.16b,v2.16b + add w10,w10,w12 + ror w11,w11,#6 + eor w13,w3,w4 + eor w15,w15,w3,ror#20 + add v4.4s,v4.4s,v2.4s + add w10,w10,w11 + ldr w12,[sp,#36] + and w14,w14,w13 + ror w15,w15,#2 + add w6,w6,w10 + eor w14,w14,w4 + add w9,w9,w12 + add w10,w10,w15 + and w12,w7,w6 + bic w15,w8,w6 + eor w11,w6,w6,ror#5 + add w10,w10,w14 + orr w12,w12,w15 + eor w11,w11,w6,ror#19 + eor w15,w10,w10,ror#11 + add w9,w9,w12 + ror w11,w11,#6 + eor w14,w10,w3 + eor w15,w15,w10,ror#20 + add w9,w9,w11 + ldr w12,[sp,#40] + and w13,w13,w14 + ror w15,w15,#2 + add w5,w5,w9 + eor w13,w13,w3 + add w8,w8,w12 + add w9,w9,w15 + and w12,w6,w5 + bic w15,w7,w5 + eor w11,w5,w5,ror#5 + add w9,w9,w13 + orr w12,w12,w15 + eor w11,w11,w5,ror#19 + eor w15,w9,w9,ror#11 + add w8,w8,w12 + ror w11,w11,#6 + eor w13,w9,w10 + eor w15,w15,w9,ror#20 + add w8,w8,w11 + ldr w12,[sp,#44] + and w14,w14,w13 + ror w15,w15,#2 + add w4,w4,w8 + eor w14,w14,w10 + add w7,w7,w12 + add w8,w8,w15 + and w12,w5,w4 + bic w15,w6,w4 + eor w11,w4,w4,ror#5 + add w8,w8,w14 + orr w12,w12,w15 + eor w11,w11,w4,ror#19 + eor w15,w8,w8,ror#11 + add w7,w7,w12 + ror w11,w11,#6 + eor w14,w8,w9 + eor w15,w15,w8,ror#20 + add w7,w7,w11 + ldr w12,[sp,#48] + and w13,w13,w14 + ror w15,w15,#2 + add w3,w3,w7 + eor w13,w13,w9 + st1 {v4.4s},[x17], #16 + add w6,w6,w12 + add w7,w7,w15 + and w12,w4,w3 + ld1 {v3.16b},[x1],#16 + bic w15,w5,w3 + eor w11,w3,w3,ror#5 + ld1 {v4.4s},[x16],#16 + add w7,w7,w13 + orr w12,w12,w15 + eor w11,w11,w3,ror#19 + eor w15,w7,w7,ror#11 + rev32 v3.16b,v3.16b + add w6,w6,w12 + ror w11,w11,#6 + eor w13,w7,w8 + eor w15,w15,w7,ror#20 + add v4.4s,v4.4s,v3.4s + add w6,w6,w11 + ldr w12,[sp,#52] + and w14,w14,w13 + ror w15,w15,#2 + add w10,w10,w6 + eor w14,w14,w8 + add w5,w5,w12 + add w6,w6,w15 + and w12,w3,w10 + bic w15,w4,w10 + eor w11,w10,w10,ror#5 + add w6,w6,w14 + orr w12,w12,w15 + eor w11,w11,w10,ror#19 + eor w15,w6,w6,ror#11 + add w5,w5,w12 + ror w11,w11,#6 + eor w14,w6,w7 + eor w15,w15,w6,ror#20 + add w5,w5,w11 + ldr w12,[sp,#56] + and w13,w13,w14 + ror w15,w15,#2 + add w9,w9,w5 + eor w13,w13,w7 + add w4,w4,w12 + add w5,w5,w15 + and w12,w10,w9 + bic w15,w3,w9 + eor w11,w9,w9,ror#5 + add w5,w5,w13 + orr w12,w12,w15 + eor w11,w11,w9,ror#19 + eor w15,w5,w5,ror#11 + add w4,w4,w12 + ror w11,w11,#6 + eor w13,w5,w6 + eor w15,w15,w5,ror#20 + add w4,w4,w11 + ldr w12,[sp,#60] + and w14,w14,w13 + ror w15,w15,#2 + add w8,w8,w4 + eor w14,w14,w6 + add w3,w3,w12 + add w4,w4,w15 + and w12,w9,w8 + bic w15,w10,w8 + eor w11,w8,w8,ror#5 + add w4,w4,w14 + orr w12,w12,w15 + eor w11,w11,w8,ror#19 + eor w15,w4,w4,ror#11 + add w3,w3,w12 + ror w11,w11,#6 + eor w14,w4,w5 + eor w15,w15,w4,ror#20 + add w3,w3,w11 + and w13,w13,w14 + ror w15,w15,#2 + add w7,w7,w3 + eor w13,w13,w5 + st1 {v4.4s},[x17], #16 + add w3,w3,w15 // h+=Sigma0(a) from the past + ldp w11,w12,[x0,#0] + add w3,w3,w13 // h+=Maj(a,b,c) from the past + ldp w13,w14,[x0,#8] + add w3,w3,w11 // accumulate + add w4,w4,w12 + ldp w11,w12,[x0,#16] + add w5,w5,w13 + add w6,w6,w14 + ldp w13,w14,[x0,#24] + add w7,w7,w11 + add w8,w8,w12 + ldr w12,[sp,#0] + stp w3,w4,[x0,#0] + add w9,w9,w13 + mov w13,wzr + stp w5,w6,[x0,#8] + add w10,w10,w14 + stp w7,w8,[x0,#16] + eor w14,w4,w5 + stp w9,w10,[x0,#24] + mov w15,wzr + mov x17,sp + b.ne .L_00_48 + + ldr x29,[x29] + add sp,sp,#16*4+16 + ret +.size sha256_block_neon,.-sha256_block_neon +#ifndef __KERNEL__ +.comm OPENSSL_armcap_P,4,4 +#endif diff --git a/arch/arm64/crypto/sha256-glue.c b/arch/arm64/crypto/sha256-glue.c new file mode 100644 index 000000000000..a2226f841960 --- /dev/null +++ b/arch/arm64/crypto/sha256-glue.c @@ -0,0 +1,185 @@ +/* + * Linux/arm64 port of the OpenSSL SHA256 implementation for AArch64 + * + * Copyright (c) 2016 Linaro Ltd. <ard.biesheuvel@linaro.org> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. + * + */ + +#include <asm/hwcap.h> +#include <asm/neon.h> +#include <asm/simd.h> +#include <crypto/internal/hash.h> +#include <crypto/sha.h> +#include <crypto/sha256_base.h> +#include <linux/cryptohash.h> +#include <linux/types.h> +#include <linux/string.h> + +MODULE_DESCRIPTION("SHA-224/SHA-256 secure hash for arm64"); +MODULE_AUTHOR("Andy Polyakov <appro@openssl.org>"); +MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>"); +MODULE_LICENSE("GPL v2"); +MODULE_ALIAS_CRYPTO("sha224"); +MODULE_ALIAS_CRYPTO("sha256"); + +asmlinkage void sha256_block_data_order(u32 *digest, const void *data, + unsigned int num_blks); + +asmlinkage void sha256_block_neon(u32 *digest, const void *data, + unsigned int num_blks); + +static int sha256_update(struct shash_desc *desc, const u8 *data, + unsigned int len) +{ + return sha256_base_do_update(desc, data, len, + (sha256_block_fn *)sha256_block_data_order); +} + +static int sha256_finup(struct shash_desc *desc, const u8 *data, + unsigned int len, u8 *out) +{ + if (len) + sha256_base_do_update(desc, data, len, + (sha256_block_fn *)sha256_block_data_order); + sha256_base_do_finalize(desc, + (sha256_block_fn *)sha256_block_data_order); + + return sha256_base_finish(desc, out); +} + +static int sha256_final(struct shash_desc *desc, u8 *out) +{ + return sha256_finup(desc, NULL, 0, out); +} + +static struct shash_alg algs[] = { { + .digestsize = SHA256_DIGEST_SIZE, + .init = sha256_base_init, + .update = sha256_update, + .final = sha256_final, + .finup = sha256_finup, + .descsize = sizeof(struct sha256_state), + .base.cra_name = "sha256", + .base.cra_driver_name = "sha256-arm64", + .base.cra_priority = 100, + .base.cra_flags = CRYPTO_ALG_TYPE_SHASH, + .base.cra_blocksize = SHA256_BLOCK_SIZE, + .base.cra_module = THIS_MODULE, +}, { + .digestsize = SHA224_DIGEST_SIZE, + .init = sha224_base_init, + .update = sha256_update, + .final = sha256_final, + .finup = sha256_finup, + .descsize = sizeof(struct sha256_state), + .base.cra_name = "sha224", + .base.cra_driver_name = "sha224-arm64", + .base.cra_priority = 100, + .base.cra_flags = CRYPTO_ALG_TYPE_SHASH, + .base.cra_blocksize = SHA224_BLOCK_SIZE, + .base.cra_module = THIS_MODULE, +} }; + +static int sha256_update_neon(struct shash_desc *desc, const u8 *data, + unsigned int len) +{ + /* + * Stacking and unstacking a substantial slice of the NEON register + * file may significantly affect performance for small updates when + * executing in interrupt context, so fall back to the scalar code + * in that case. + */ + if (!may_use_simd()) + return sha256_base_do_update(desc, data, len, + (sha256_block_fn *)sha256_block_data_order); + + kernel_neon_begin(); + sha256_base_do_update(desc, data, len, + (sha256_block_fn *)sha256_block_neon); + kernel_neon_end(); + + return 0; +} + +static int sha256_finup_neon(struct shash_desc *desc, const u8 *data, + unsigned int len, u8 *out) +{ + if (!may_use_simd()) { + if (len) + sha256_base_do_update(desc, data, len, + (sha256_block_fn *)sha256_block_data_order); + sha256_base_do_finalize(desc, + (sha256_block_fn *)sha256_block_data_order); + } else { + kernel_neon_begin(); + if (len) + sha256_base_do_update(desc, data, len, + (sha256_block_fn *)sha256_block_neon); + sha256_base_do_finalize(desc, + (sha256_block_fn *)sha256_block_neon); + kernel_neon_end(); + } + return sha256_base_finish(desc, out); +} + +static int sha256_final_neon(struct shash_desc *desc, u8 *out) +{ + return sha256_finup_neon(desc, NULL, 0, out); +} + +static struct shash_alg neon_algs[] = { { + .digestsize = SHA256_DIGEST_SIZE, + .init = sha256_base_init, + .update = sha256_update_neon, + .final = sha256_final_neon, + .finup = sha256_finup_neon, + .descsize = sizeof(struct sha256_state), + .base.cra_name = "sha256", + .base.cra_driver_name = "sha256-arm64-neon", + .base.cra_priority = 150, + .base.cra_flags = CRYPTO_ALG_TYPE_SHASH, + .base.cra_blocksize = SHA256_BLOCK_SIZE, + .base.cra_module = THIS_MODULE, +}, { + .digestsize = SHA224_DIGEST_SIZE, + .init = sha224_base_init, + .update = sha256_update_neon, + .final = sha256_final_neon, + .finup = sha256_finup_neon, + .descsize = sizeof(struct sha256_state), + .base.cra_name = "sha224", + .base.cra_driver_name = "sha224-arm64-neon", + .base.cra_priority = 150, + .base.cra_flags = CRYPTO_ALG_TYPE_SHASH, + .base.cra_blocksize = SHA224_BLOCK_SIZE, + .base.cra_module = THIS_MODULE, +} }; + +static int __init sha256_mod_init(void) +{ + int ret = crypto_register_shashes(algs, ARRAY_SIZE(algs)); + if (ret) + return ret; + + if (elf_hwcap & HWCAP_ASIMD) { + ret = crypto_register_shashes(neon_algs, ARRAY_SIZE(neon_algs)); + if (ret) + crypto_unregister_shashes(algs, ARRAY_SIZE(algs)); + } + return ret; +} + +static void __exit sha256_mod_fini(void) +{ + if (elf_hwcap & HWCAP_ASIMD) + crypto_unregister_shashes(neon_algs, ARRAY_SIZE(neon_algs)); + crypto_unregister_shashes(algs, ARRAY_SIZE(algs)); +} + +module_init(sha256_mod_init); +module_exit(sha256_mod_fini); diff --git a/arch/arm64/crypto/sha512-armv8.pl b/arch/arm64/crypto/sha512-armv8.pl new file mode 100644 index 000000000000..c55efb308544 --- /dev/null +++ b/arch/arm64/crypto/sha512-armv8.pl @@ -0,0 +1,778 @@ +#! /usr/bin/env perl +# Copyright 2014-2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + +# ==================================================================== +# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# +# Permission to use under GPLv2 terms is granted. +# ==================================================================== +# +# SHA256/512 for ARMv8. +# +# Performance in cycles per processed byte and improvement coefficient +# over code generated with "default" compiler: +# +# SHA256-hw SHA256(*) SHA512 +# Apple A7 1.97 10.5 (+33%) 6.73 (-1%(**)) +# Cortex-A53 2.38 15.5 (+115%) 10.0 (+150%(***)) +# Cortex-A57 2.31 11.6 (+86%) 7.51 (+260%(***)) +# Denver 2.01 10.5 (+26%) 6.70 (+8%) +# X-Gene 20.0 (+100%) 12.8 (+300%(***)) +# Mongoose 2.36 13.0 (+50%) 8.36 (+33%) +# +# (*) Software SHA256 results are of lesser relevance, presented +# mostly for informational purposes. +# (**) The result is a trade-off: it's possible to improve it by +# 10% (or by 1 cycle per round), but at the cost of 20% loss +# on Cortex-A53 (or by 4 cycles per round). +# (***) Super-impressive coefficients over gcc-generated code are +# indication of some compiler "pathology", most notably code +# generated with -mgeneral-regs-only is significanty faster +# and the gap is only 40-90%. +# +# October 2016. +# +# Originally it was reckoned that it makes no sense to implement NEON +# version of SHA256 for 64-bit processors. This is because performance +# improvement on most wide-spread Cortex-A5x processors was observed +# to be marginal, same on Cortex-A53 and ~10% on A57. But then it was +# observed that 32-bit NEON SHA256 performs significantly better than +# 64-bit scalar version on *some* of the more recent processors. As +# result 64-bit NEON version of SHA256 was added to provide best +# all-round performance. For example it executes ~30% faster on X-Gene +# and Mongoose. [For reference, NEON version of SHA512 is bound to +# deliver much less improvement, likely *negative* on Cortex-A5x. +# Which is why NEON support is limited to SHA256.] + +$output=pop; +$flavour=pop; + +if ($flavour && $flavour ne "void") { + $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; + ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or + ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or + die "can't locate arm-xlate.pl"; + + open OUT,"| \"$^X\" $xlate $flavour $output"; + *STDOUT=*OUT; +} else { + open STDOUT,">$output"; +} + +if ($output =~ /512/) { + $BITS=512; + $SZ=8; + @Sigma0=(28,34,39); + @Sigma1=(14,18,41); + @sigma0=(1, 8, 7); + @sigma1=(19,61, 6); + $rounds=80; + $reg_t="x"; +} else { + $BITS=256; + $SZ=4; + @Sigma0=( 2,13,22); + @Sigma1=( 6,11,25); + @sigma0=( 7,18, 3); + @sigma1=(17,19,10); + $rounds=64; + $reg_t="w"; +} + +$func="sha${BITS}_block_data_order"; + +($ctx,$inp,$num,$Ktbl)=map("x$_",(0..2,30)); + +@X=map("$reg_t$_",(3..15,0..2)); +@V=($A,$B,$C,$D,$E,$F,$G,$H)=map("$reg_t$_",(20..27)); +($t0,$t1,$t2,$t3)=map("$reg_t$_",(16,17,19,28)); + +sub BODY_00_xx { +my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_; +my $j=($i+1)&15; +my ($T0,$T1,$T2)=(@X[($i-8)&15],@X[($i-9)&15],@X[($i-10)&15]); + $T0=@X[$i+3] if ($i<11); + +$code.=<<___ if ($i<16); +#ifndef __AARCH64EB__ + rev @X[$i],@X[$i] // $i +#endif +___ +$code.=<<___ if ($i<13 && ($i&1)); + ldp @X[$i+1],@X[$i+2],[$inp],#2*$SZ +___ +$code.=<<___ if ($i==13); + ldp @X[14],@X[15],[$inp] +___ +$code.=<<___ if ($i>=14); + ldr @X[($i-11)&15],[sp,#`$SZ*(($i-11)%4)`] +___ +$code.=<<___ if ($i>0 && $i<16); + add $a,$a,$t1 // h+=Sigma0(a) +___ +$code.=<<___ if ($i>=11); + str @X[($i-8)&15],[sp,#`$SZ*(($i-8)%4)`] +___ +# While ARMv8 specifies merged rotate-n-logical operation such as +# 'eor x,y,z,ror#n', it was found to negatively affect performance +# on Apple A7. The reason seems to be that it requires even 'y' to +# be available earlier. This means that such merged instruction is +# not necessarily best choice on critical path... On the other hand +# Cortex-A5x handles merged instructions much better than disjoint +# rotate and logical... See (**) footnote above. +$code.=<<___ if ($i<15); + ror $t0,$e,#$Sigma1[0] + add $h,$h,$t2 // h+=K[i] + eor $T0,$e,$e,ror#`$Sigma1[2]-$Sigma1[1]` + and $t1,$f,$e + bic $t2,$g,$e + add $h,$h,@X[$i&15] // h+=X[i] + orr $t1,$t1,$t2 // Ch(e,f,g) + eor $t2,$a,$b // a^b, b^c in next round + eor $t0,$t0,$T0,ror#$Sigma1[1] // Sigma1(e) + ror $T0,$a,#$Sigma0[0] + add $h,$h,$t1 // h+=Ch(e,f,g) + eor $t1,$a,$a,ror#`$Sigma0[2]-$Sigma0[1]` + add $h,$h,$t0 // h+=Sigma1(e) + and $t3,$t3,$t2 // (b^c)&=(a^b) + add $d,$d,$h // d+=h + eor $t3,$t3,$b // Maj(a,b,c) + eor $t1,$T0,$t1,ror#$Sigma0[1] // Sigma0(a) + add $h,$h,$t3 // h+=Maj(a,b,c) + ldr $t3,[$Ktbl],#$SZ // *K++, $t2 in next round + //add $h,$h,$t1 // h+=Sigma0(a) +___ +$code.=<<___ if ($i>=15); + ror $t0,$e,#$Sigma1[0] + add $h,$h,$t2 // h+=K[i] + ror $T1,@X[($j+1)&15],#$sigma0[0] + and $t1,$f,$e + ror $T2,@X[($j+14)&15],#$sigma1[0] + bic $t2,$g,$e + ror $T0,$a,#$Sigma0[0] + add $h,$h,@X[$i&15] // h+=X[i] + eor $t0,$t0,$e,ror#$Sigma1[1] + eor $T1,$T1,@X[($j+1)&15],ror#$sigma0[1] + orr $t1,$t1,$t2 // Ch(e,f,g) + eor $t2,$a,$b // a^b, b^c in next round + eor $t0,$t0,$e,ror#$Sigma1[2] // Sigma1(e) + eor $T0,$T0,$a,ror#$Sigma0[1] + add $h,$h,$t1 // h+=Ch(e,f,g) + and $t3,$t3,$t2 // (b^c)&=(a^b) + eor $T2,$T2,@X[($j+14)&15],ror#$sigma1[1] + eor $T1,$T1,@X[($j+1)&15],lsr#$sigma0[2] // sigma0(X[i+1]) + add $h,$h,$t0 // h+=Sigma1(e) + eor $t3,$t3,$b // Maj(a,b,c) + eor $t1,$T0,$a,ror#$Sigma0[2] // Sigma0(a) + eor $T2,$T2,@X[($j+14)&15],lsr#$sigma1[2] // sigma1(X[i+14]) + add @X[$j],@X[$j],@X[($j+9)&15] + add $d,$d,$h // d+=h + add $h,$h,$t3 // h+=Maj(a,b,c) + ldr $t3,[$Ktbl],#$SZ // *K++, $t2 in next round + add @X[$j],@X[$j],$T1 + add $h,$h,$t1 // h+=Sigma0(a) + add @X[$j],@X[$j],$T2 +___ + ($t2,$t3)=($t3,$t2); +} + +$code.=<<___; +#ifndef __KERNEL__ +# include "arm_arch.h" +#endif + +.text + +.extern OPENSSL_armcap_P +.globl $func +.type $func,%function +.align 6 +$func: +___ +$code.=<<___ if ($SZ==4); +#ifndef __KERNEL__ +# ifdef __ILP32__ + ldrsw x16,.LOPENSSL_armcap_P +# else + ldr x16,.LOPENSSL_armcap_P +# endif + adr x17,.LOPENSSL_armcap_P + add x16,x16,x17 + ldr w16,[x16] + tst w16,#ARMV8_SHA256 + b.ne .Lv8_entry + tst w16,#ARMV7_NEON + b.ne .Lneon_entry +#endif +___ +$code.=<<___; + stp x29,x30,[sp,#-128]! + add x29,sp,#0 + + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + sub sp,sp,#4*$SZ + + ldp $A,$B,[$ctx] // load context + ldp $C,$D,[$ctx,#2*$SZ] + ldp $E,$F,[$ctx,#4*$SZ] + add $num,$inp,$num,lsl#`log(16*$SZ)/log(2)` // end of input + ldp $G,$H,[$ctx,#6*$SZ] + adr $Ktbl,.LK$BITS + stp $ctx,$num,[x29,#96] + +.Loop: + ldp @X[0],@X[1],[$inp],#2*$SZ + ldr $t2,[$Ktbl],#$SZ // *K++ + eor $t3,$B,$C // magic seed + str $inp,[x29,#112] +___ +for ($i=0;$i<16;$i++) { &BODY_00_xx($i,@V); unshift(@V,pop(@V)); } +$code.=".Loop_16_xx:\n"; +for (;$i<32;$i++) { &BODY_00_xx($i,@V); unshift(@V,pop(@V)); } +$code.=<<___; + cbnz $t2,.Loop_16_xx + + ldp $ctx,$num,[x29,#96] + ldr $inp,[x29,#112] + sub $Ktbl,$Ktbl,#`$SZ*($rounds+1)` // rewind + + ldp @X[0],@X[1],[$ctx] + ldp @X[2],@X[3],[$ctx,#2*$SZ] + add $inp,$inp,#14*$SZ // advance input pointer + ldp @X[4],@X[5],[$ctx,#4*$SZ] + add $A,$A,@X[0] + ldp @X[6],@X[7],[$ctx,#6*$SZ] + add $B,$B,@X[1] + add $C,$C,@X[2] + add $D,$D,@X[3] + stp $A,$B,[$ctx] + add $E,$E,@X[4] + add $F,$F,@X[5] + stp $C,$D,[$ctx,#2*$SZ] + add $G,$G,@X[6] + add $H,$H,@X[7] + cmp $inp,$num + stp $E,$F,[$ctx,#4*$SZ] + stp $G,$H,[$ctx,#6*$SZ] + b.ne .Loop + + ldp x19,x20,[x29,#16] + add sp,sp,#4*$SZ + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldp x29,x30,[sp],#128 + ret +.size $func,.-$func + +.align 6 +.type .LK$BITS,%object +.LK$BITS: +___ +$code.=<<___ if ($SZ==8); + .quad 0x428a2f98d728ae22,0x7137449123ef65cd + .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc + .quad 0x3956c25bf348b538,0x59f111f1b605d019 + .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118 + .quad 0xd807aa98a3030242,0x12835b0145706fbe + .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2 + .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1 + .quad 0x9bdc06a725c71235,0xc19bf174cf692694 + .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3 + .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65 + .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483 + .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5 + .quad 0x983e5152ee66dfab,0xa831c66d2db43210 + .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4 + .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725 + .quad 0x06ca6351e003826f,0x142929670a0e6e70 + .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926 + .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df + .quad 0x650a73548baf63de,0x766a0abb3c77b2a8 + .quad 0x81c2c92e47edaee6,0x92722c851482353b + .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001 + .quad 0xc24b8b70d0f89791,0xc76c51a30654be30 + .quad 0xd192e819d6ef5218,0xd69906245565a910 + .quad 0xf40e35855771202a,0x106aa07032bbd1b8 + .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53 + .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8 + .quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb + .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3 + .quad 0x748f82ee5defb2fc,0x78a5636f43172f60 + .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec + .quad 0x90befffa23631e28,0xa4506cebde82bde9 + .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b + .quad 0xca273eceea26619c,0xd186b8c721c0c207 + .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178 + .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6 + .quad 0x113f9804bef90dae,0x1b710b35131c471b + .quad 0x28db77f523047d84,0x32caab7b40c72493 + .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c + .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a + .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817 + .quad 0 // terminator +___ +$code.=<<___ if ($SZ==4); + .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 + .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 + .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 + .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 + .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc + .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da + .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 + .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 + .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 + .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 + .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 + .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 + .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 + .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 + .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 + .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 + .long 0 //terminator +___ +$code.=<<___; +.size .LK$BITS,.-.LK$BITS +#ifndef __KERNEL__ +.align 3 +.LOPENSSL_armcap_P: +# ifdef __ILP32__ + .long OPENSSL_armcap_P-. +# else + .quad OPENSSL_armcap_P-. +# endif +#endif +.asciz "SHA$BITS block transform for ARMv8, CRYPTOGAMS by <appro\@openssl.org>" +.align 2 +___ + +if ($SZ==4) { +my $Ktbl="x3"; + +my ($ABCD,$EFGH,$abcd)=map("v$_.16b",(0..2)); +my @MSG=map("v$_.16b",(4..7)); +my ($W0,$W1)=("v16.4s","v17.4s"); +my ($ABCD_SAVE,$EFGH_SAVE)=("v18.16b","v19.16b"); + +$code.=<<___; +#ifndef __KERNEL__ +.type sha256_block_armv8,%function +.align 6 +sha256_block_armv8: +.Lv8_entry: + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + + ld1.32 {$ABCD,$EFGH},[$ctx] + adr $Ktbl,.LK256 + +.Loop_hw: + ld1 {@MSG[0]-@MSG[3]},[$inp],#64 + sub $num,$num,#1 + ld1.32 {$W0},[$Ktbl],#16 + rev32 @MSG[0],@MSG[0] + rev32 @MSG[1],@MSG[1] + rev32 @MSG[2],@MSG[2] + rev32 @MSG[3],@MSG[3] + orr $ABCD_SAVE,$ABCD,$ABCD // offload + orr $EFGH_SAVE,$EFGH,$EFGH +___ +for($i=0;$i<12;$i++) { +$code.=<<___; + ld1.32 {$W1},[$Ktbl],#16 + add.i32 $W0,$W0,@MSG[0] + sha256su0 @MSG[0],@MSG[1] + orr $abcd,$ABCD,$ABCD + sha256h $ABCD,$EFGH,$W0 + sha256h2 $EFGH,$abcd,$W0 + sha256su1 @MSG[0],@MSG[2],@MSG[3] +___ + ($W0,$W1)=($W1,$W0); push(@MSG,shift(@MSG)); +} +$code.=<<___; + ld1.32 {$W1},[$Ktbl],#16 + add.i32 $W0,$W0,@MSG[0] + orr $abcd,$ABCD,$ABCD + sha256h $ABCD,$EFGH,$W0 + sha256h2 $EFGH,$abcd,$W0 + + ld1.32 {$W0},[$Ktbl],#16 + add.i32 $W1,$W1,@MSG[1] + orr $abcd,$ABCD,$ABCD + sha256h $ABCD,$EFGH,$W1 + sha256h2 $EFGH,$abcd,$W1 + + ld1.32 {$W1},[$Ktbl] + add.i32 $W0,$W0,@MSG[2] + sub $Ktbl,$Ktbl,#$rounds*$SZ-16 // rewind + orr $abcd,$ABCD,$ABCD + sha256h $ABCD,$EFGH,$W0 + sha256h2 $EFGH,$abcd,$W0 + + add.i32 $W1,$W1,@MSG[3] + orr $abcd,$ABCD,$ABCD + sha256h $ABCD,$EFGH,$W1 + sha256h2 $EFGH,$abcd,$W1 + + add.i32 $ABCD,$ABCD,$ABCD_SAVE + add.i32 $EFGH,$EFGH,$EFGH_SAVE + + cbnz $num,.Loop_hw + + st1.32 {$ABCD,$EFGH},[$ctx] + + ldr x29,[sp],#16 + ret +.size sha256_block_armv8,.-sha256_block_armv8 +#endif +___ +} + +if ($SZ==4) { ######################################### NEON stuff # +# You'll surely note a lot of similarities with sha256-armv4 module, +# and of course it's not a coincidence. sha256-armv4 was used as +# initial template, but was adapted for ARMv8 instruction set and +# extensively re-tuned for all-round performance. + +my @V = ($A,$B,$C,$D,$E,$F,$G,$H) = map("w$_",(3..10)); +my ($t0,$t1,$t2,$t3,$t4) = map("w$_",(11..15)); +my $Ktbl="x16"; +my $Xfer="x17"; +my @X = map("q$_",(0..3)); +my ($T0,$T1,$T2,$T3,$T4,$T5,$T6,$T7) = map("q$_",(4..7,16..19)); +my $j=0; + +sub AUTOLOAD() # thunk [simplified] x86-style perlasm +{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./; + my $arg = pop; + $arg = "#$arg" if ($arg*1 eq $arg); + $code .= "\t$opcode\t".join(',',@_,$arg)."\n"; +} + +sub Dscalar { shift =~ m|[qv]([0-9]+)|?"d$1":""; } +sub Dlo { shift =~ m|[qv]([0-9]+)|?"v$1.d[0]":""; } +sub Dhi { shift =~ m|[qv]([0-9]+)|?"v$1.d[1]":""; } + +sub Xupdate() +{ use integer; + my $body = shift; + my @insns = (&$body,&$body,&$body,&$body); + my ($a,$b,$c,$d,$e,$f,$g,$h); + + &ext_8 ($T0,@X[0],@X[1],4); # X[1..4] + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &ext_8 ($T3,@X[2],@X[3],4); # X[9..12] + eval(shift(@insns)); + eval(shift(@insns)); + &mov (&Dscalar($T7),&Dhi(@X[3])); # X[14..15] + eval(shift(@insns)); + eval(shift(@insns)); + &ushr_32 ($T2,$T0,$sigma0[0]); + eval(shift(@insns)); + &ushr_32 ($T1,$T0,$sigma0[2]); + eval(shift(@insns)); + &add_32 (@X[0],@X[0],$T3); # X[0..3] += X[9..12] + eval(shift(@insns)); + &sli_32 ($T2,$T0,32-$sigma0[0]); + eval(shift(@insns)); + eval(shift(@insns)); + &ushr_32 ($T3,$T0,$sigma0[1]); + eval(shift(@insns)); + eval(shift(@insns)); + &eor_8 ($T1,$T1,$T2); + eval(shift(@insns)); + eval(shift(@insns)); + &sli_32 ($T3,$T0,32-$sigma0[1]); + eval(shift(@insns)); + eval(shift(@insns)); + &ushr_32 ($T4,$T7,$sigma1[0]); + eval(shift(@insns)); + eval(shift(@insns)); + &eor_8 ($T1,$T1,$T3); # sigma0(X[1..4]) + eval(shift(@insns)); + eval(shift(@insns)); + &sli_32 ($T4,$T7,32-$sigma1[0]); + eval(shift(@insns)); + eval(shift(@insns)); + &ushr_32 ($T5,$T7,$sigma1[2]); + eval(shift(@insns)); + eval(shift(@insns)); + &ushr_32 ($T3,$T7,$sigma1[1]); + eval(shift(@insns)); + eval(shift(@insns)); + &add_32 (@X[0],@X[0],$T1); # X[0..3] += sigma0(X[1..4]) + eval(shift(@insns)); + eval(shift(@insns)); + &sli_u32 ($T3,$T7,32-$sigma1[1]); + eval(shift(@insns)); + eval(shift(@insns)); + &eor_8 ($T5,$T5,$T4); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &eor_8 ($T5,$T5,$T3); # sigma1(X[14..15]) + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &add_32 (@X[0],@X[0],$T5); # X[0..1] += sigma1(X[14..15]) + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &ushr_32 ($T6,@X[0],$sigma1[0]); + eval(shift(@insns)); + &ushr_32 ($T7,@X[0],$sigma1[2]); + eval(shift(@insns)); + eval(shift(@insns)); + &sli_32 ($T6,@X[0],32-$sigma1[0]); + eval(shift(@insns)); + &ushr_32 ($T5,@X[0],$sigma1[1]); + eval(shift(@insns)); + eval(shift(@insns)); + &eor_8 ($T7,$T7,$T6); + eval(shift(@insns)); + eval(shift(@insns)); + &sli_32 ($T5,@X[0],32-$sigma1[1]); + eval(shift(@insns)); + eval(shift(@insns)); + &ld1_32 ("{$T0}","[$Ktbl], #16"); + eval(shift(@insns)); + &eor_8 ($T7,$T7,$T5); # sigma1(X[16..17]) + eval(shift(@insns)); + eval(shift(@insns)); + &eor_8 ($T5,$T5,$T5); + eval(shift(@insns)); + eval(shift(@insns)); + &mov (&Dhi($T5), &Dlo($T7)); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &add_32 (@X[0],@X[0],$T5); # X[2..3] += sigma1(X[16..17]) + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &add_32 ($T0,$T0,@X[0]); + while($#insns>=1) { eval(shift(@insns)); } + &st1_32 ("{$T0}","[$Xfer], #16"); + eval(shift(@insns)); + + push(@X,shift(@X)); # "rotate" X[] +} + +sub Xpreload() +{ use integer; + my $body = shift; + my @insns = (&$body,&$body,&$body,&$body); + my ($a,$b,$c,$d,$e,$f,$g,$h); + + eval(shift(@insns)); + eval(shift(@insns)); + &ld1_8 ("{@X[0]}","[$inp],#16"); + eval(shift(@insns)); + eval(shift(@insns)); + &ld1_32 ("{$T0}","[$Ktbl],#16"); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &rev32 (@X[0],@X[0]); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &add_32 ($T0,$T0,@X[0]); + foreach (@insns) { eval; } # remaining instructions + &st1_32 ("{$T0}","[$Xfer], #16"); + + push(@X,shift(@X)); # "rotate" X[] +} + +sub body_00_15 () { + ( + '($a,$b,$c,$d,$e,$f,$g,$h)=@V;'. + '&add ($h,$h,$t1)', # h+=X[i]+K[i] + '&add ($a,$a,$t4);'. # h+=Sigma0(a) from the past + '&and ($t1,$f,$e)', + '&bic ($t4,$g,$e)', + '&eor ($t0,$e,$e,"ror#".($Sigma1[1]-$Sigma1[0]))', + '&add ($a,$a,$t2)', # h+=Maj(a,b,c) from the past + '&orr ($t1,$t1,$t4)', # Ch(e,f,g) + '&eor ($t0,$t0,$e,"ror#".($Sigma1[2]-$Sigma1[0]))', # Sigma1(e) + '&eor ($t4,$a,$a,"ror#".($Sigma0[1]-$Sigma0[0]))', + '&add ($h,$h,$t1)', # h+=Ch(e,f,g) + '&ror ($t0,$t0,"#$Sigma1[0]")', + '&eor ($t2,$a,$b)', # a^b, b^c in next round + '&eor ($t4,$t4,$a,"ror#".($Sigma0[2]-$Sigma0[0]))', # Sigma0(a) + '&add ($h,$h,$t0)', # h+=Sigma1(e) + '&ldr ($t1,sprintf "[sp,#%d]",4*(($j+1)&15)) if (($j&15)!=15);'. + '&ldr ($t1,"[$Ktbl]") if ($j==15);'. + '&and ($t3,$t3,$t2)', # (b^c)&=(a^b) + '&ror ($t4,$t4,"#$Sigma0[0]")', + '&add ($d,$d,$h)', # d+=h + '&eor ($t3,$t3,$b)', # Maj(a,b,c) + '$j++; unshift(@V,pop(@V)); ($t2,$t3)=($t3,$t2);' + ) +} + +$code.=<<___; +#ifdef __KERNEL__ +.globl sha256_block_neon +#endif +.type sha256_block_neon,%function +.align 4 +sha256_block_neon: +.Lneon_entry: + stp x29, x30, [sp, #-16]! + mov x29, sp + sub sp,sp,#16*4 + + adr $Ktbl,.LK256 + add $num,$inp,$num,lsl#6 // len to point at the end of inp + + ld1.8 {@X[0]},[$inp], #16 + ld1.8 {@X[1]},[$inp], #16 + ld1.8 {@X[2]},[$inp], #16 + ld1.8 {@X[3]},[$inp], #16 + ld1.32 {$T0},[$Ktbl], #16 + ld1.32 {$T1},[$Ktbl], #16 + ld1.32 {$T2},[$Ktbl], #16 + ld1.32 {$T3},[$Ktbl], #16 + rev32 @X[0],@X[0] // yes, even on + rev32 @X[1],@X[1] // big-endian + rev32 @X[2],@X[2] + rev32 @X[3],@X[3] + mov $Xfer,sp + add.32 $T0,$T0,@X[0] + add.32 $T1,$T1,@X[1] + add.32 $T2,$T2,@X[2] + st1.32 {$T0-$T1},[$Xfer], #32 + add.32 $T3,$T3,@X[3] + st1.32 {$T2-$T3},[$Xfer] + sub $Xfer,$Xfer,#32 + + ldp $A,$B,[$ctx] + ldp $C,$D,[$ctx,#8] + ldp $E,$F,[$ctx,#16] + ldp $G,$H,[$ctx,#24] + ldr $t1,[sp,#0] + mov $t2,wzr + eor $t3,$B,$C + mov $t4,wzr + b .L_00_48 + +.align 4 +.L_00_48: +___ + &Xupdate(\&body_00_15); + &Xupdate(\&body_00_15); + &Xupdate(\&body_00_15); + &Xupdate(\&body_00_15); +$code.=<<___; + cmp $t1,#0 // check for K256 terminator + ldr $t1,[sp,#0] + sub $Xfer,$Xfer,#64 + bne .L_00_48 + + sub $Ktbl,$Ktbl,#256 // rewind $Ktbl + cmp $inp,$num + mov $Xfer, #64 + csel $Xfer, $Xfer, xzr, eq + sub $inp,$inp,$Xfer // avoid SEGV + mov $Xfer,sp +___ + &Xpreload(\&body_00_15); + &Xpreload(\&body_00_15); + &Xpreload(\&body_00_15); + &Xpreload(\&body_00_15); +$code.=<<___; + add $A,$A,$t4 // h+=Sigma0(a) from the past + ldp $t0,$t1,[$ctx,#0] + add $A,$A,$t2 // h+=Maj(a,b,c) from the past + ldp $t2,$t3,[$ctx,#8] + add $A,$A,$t0 // accumulate + add $B,$B,$t1 + ldp $t0,$t1,[$ctx,#16] + add $C,$C,$t2 + add $D,$D,$t3 + ldp $t2,$t3,[$ctx,#24] + add $E,$E,$t0 + add $F,$F,$t1 + ldr $t1,[sp,#0] + stp $A,$B,[$ctx,#0] + add $G,$G,$t2 + mov $t2,wzr + stp $C,$D,[$ctx,#8] + add $H,$H,$t3 + stp $E,$F,[$ctx,#16] + eor $t3,$B,$C + stp $G,$H,[$ctx,#24] + mov $t4,wzr + mov $Xfer,sp + b.ne .L_00_48 + + ldr x29,[x29] + add sp,sp,#16*4+16 + ret +.size sha256_block_neon,.-sha256_block_neon +___ +} + +$code.=<<___; +#ifndef __KERNEL__ +.comm OPENSSL_armcap_P,4,4 +#endif +___ + +{ my %opcode = ( + "sha256h" => 0x5e004000, "sha256h2" => 0x5e005000, + "sha256su0" => 0x5e282800, "sha256su1" => 0x5e006000 ); + + sub unsha256 { + my ($mnemonic,$arg)=@_; + + $arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)[^,]*(?:,\s*[qv]([0-9]+))?/o + && + sprintf ".inst\t0x%08x\t//%s %s", + $opcode{$mnemonic}|$1|($2<<5)|($3<<16), + $mnemonic,$arg; + } +} + +open SELF,$0; +while(<SELF>) { + next if (/^#!/); + last if (!s/^#/\/\// and !/^$/); + print; +} +close SELF; + +foreach(split("\n",$code)) { + + s/\`([^\`]*)\`/eval($1)/ge; + + s/\b(sha256\w+)\s+([qv].*)/unsha256($1,$2)/ge; + + s/\bq([0-9]+)\b/v$1.16b/g; # old->new registers + + s/\.[ui]?8(\s)/$1/; + s/\.\w?32\b// and s/\.16b/\.4s/g; + m/(ld|st)1[^\[]+\[0\]/ and s/\.4s/\.s/g; + + print $_,"\n"; +} + +close STDOUT; diff --git a/arch/arm64/crypto/sha512-core.S_shipped b/arch/arm64/crypto/sha512-core.S_shipped new file mode 100644 index 000000000000..bd0f59f06c9d --- /dev/null +++ b/arch/arm64/crypto/sha512-core.S_shipped @@ -0,0 +1,1085 @@ +// Copyright 2014-2016 The OpenSSL Project Authors. All Rights Reserved. +// +// Licensed under the OpenSSL license (the "License"). You may not use +// this file except in compliance with the License. You can obtain a copy +// in the file LICENSE in the source distribution or at +// https://www.openssl.org/source/license.html + +// ==================================================================== +// Written by Andy Polyakov <appro@openssl.org> for the OpenSSL +// project. The module is, however, dual licensed under OpenSSL and +// CRYPTOGAMS licenses depending on where you obtain it. For further +// details see http://www.openssl.org/~appro/cryptogams/. +// +// Permission to use under GPLv2 terms is granted. +// ==================================================================== +// +// SHA256/512 for ARMv8. +// +// Performance in cycles per processed byte and improvement coefficient +// over code generated with "default" compiler: +// +// SHA256-hw SHA256(*) SHA512 +// Apple A7 1.97 10.5 (+33%) 6.73 (-1%(**)) +// Cortex-A53 2.38 15.5 (+115%) 10.0 (+150%(***)) +// Cortex-A57 2.31 11.6 (+86%) 7.51 (+260%(***)) +// Denver 2.01 10.5 (+26%) 6.70 (+8%) +// X-Gene 20.0 (+100%) 12.8 (+300%(***)) +// Mongoose 2.36 13.0 (+50%) 8.36 (+33%) +// +// (*) Software SHA256 results are of lesser relevance, presented +// mostly for informational purposes. +// (**) The result is a trade-off: it's possible to improve it by +// 10% (or by 1 cycle per round), but at the cost of 20% loss +// on Cortex-A53 (or by 4 cycles per round). +// (***) Super-impressive coefficients over gcc-generated code are +// indication of some compiler "pathology", most notably code +// generated with -mgeneral-regs-only is significanty faster +// and the gap is only 40-90%. +// +// October 2016. +// +// Originally it was reckoned that it makes no sense to implement NEON +// version of SHA256 for 64-bit processors. This is because performance +// improvement on most wide-spread Cortex-A5x processors was observed +// to be marginal, same on Cortex-A53 and ~10% on A57. But then it was +// observed that 32-bit NEON SHA256 performs significantly better than +// 64-bit scalar version on *some* of the more recent processors. As +// result 64-bit NEON version of SHA256 was added to provide best +// all-round performance. For example it executes ~30% faster on X-Gene +// and Mongoose. [For reference, NEON version of SHA512 is bound to +// deliver much less improvement, likely *negative* on Cortex-A5x. +// Which is why NEON support is limited to SHA256.] + +#ifndef __KERNEL__ +# include "arm_arch.h" +#endif + +.text + +.extern OPENSSL_armcap_P +.globl sha512_block_data_order +.type sha512_block_data_order,%function +.align 6 +sha512_block_data_order: + stp x29,x30,[sp,#-128]! + add x29,sp,#0 + + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + sub sp,sp,#4*8 + + ldp x20,x21,[x0] // load context + ldp x22,x23,[x0,#2*8] + ldp x24,x25,[x0,#4*8] + add x2,x1,x2,lsl#7 // end of input + ldp x26,x27,[x0,#6*8] + adr x30,.LK512 + stp x0,x2,[x29,#96] + +.Loop: + ldp x3,x4,[x1],#2*8 + ldr x19,[x30],#8 // *K++ + eor x28,x21,x22 // magic seed + str x1,[x29,#112] +#ifndef __AARCH64EB__ + rev x3,x3 // 0 +#endif + ror x16,x24,#14 + add x27,x27,x19 // h+=K[i] + eor x6,x24,x24,ror#23 + and x17,x25,x24 + bic x19,x26,x24 + add x27,x27,x3 // h+=X[i] + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x20,x21 // a^b, b^c in next round + eor x16,x16,x6,ror#18 // Sigma1(e) + ror x6,x20,#28 + add x27,x27,x17 // h+=Ch(e,f,g) + eor x17,x20,x20,ror#5 + add x27,x27,x16 // h+=Sigma1(e) + and x28,x28,x19 // (b^c)&=(a^b) + add x23,x23,x27 // d+=h + eor x28,x28,x21 // Maj(a,b,c) + eor x17,x6,x17,ror#34 // Sigma0(a) + add x27,x27,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + //add x27,x27,x17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev x4,x4 // 1 +#endif + ldp x5,x6,[x1],#2*8 + add x27,x27,x17 // h+=Sigma0(a) + ror x16,x23,#14 + add x26,x26,x28 // h+=K[i] + eor x7,x23,x23,ror#23 + and x17,x24,x23 + bic x28,x25,x23 + add x26,x26,x4 // h+=X[i] + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x27,x20 // a^b, b^c in next round + eor x16,x16,x7,ror#18 // Sigma1(e) + ror x7,x27,#28 + add x26,x26,x17 // h+=Ch(e,f,g) + eor x17,x27,x27,ror#5 + add x26,x26,x16 // h+=Sigma1(e) + and x19,x19,x28 // (b^c)&=(a^b) + add x22,x22,x26 // d+=h + eor x19,x19,x20 // Maj(a,b,c) + eor x17,x7,x17,ror#34 // Sigma0(a) + add x26,x26,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + //add x26,x26,x17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev x5,x5 // 2 +#endif + add x26,x26,x17 // h+=Sigma0(a) + ror x16,x22,#14 + add x25,x25,x19 // h+=K[i] + eor x8,x22,x22,ror#23 + and x17,x23,x22 + bic x19,x24,x22 + add x25,x25,x5 // h+=X[i] + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x26,x27 // a^b, b^c in next round + eor x16,x16,x8,ror#18 // Sigma1(e) + ror x8,x26,#28 + add x25,x25,x17 // h+=Ch(e,f,g) + eor x17,x26,x26,ror#5 + add x25,x25,x16 // h+=Sigma1(e) + and x28,x28,x19 // (b^c)&=(a^b) + add x21,x21,x25 // d+=h + eor x28,x28,x27 // Maj(a,b,c) + eor x17,x8,x17,ror#34 // Sigma0(a) + add x25,x25,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + //add x25,x25,x17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev x6,x6 // 3 +#endif + ldp x7,x8,[x1],#2*8 + add x25,x25,x17 // h+=Sigma0(a) + ror x16,x21,#14 + add x24,x24,x28 // h+=K[i] + eor x9,x21,x21,ror#23 + and x17,x22,x21 + bic x28,x23,x21 + add x24,x24,x6 // h+=X[i] + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x25,x26 // a^b, b^c in next round + eor x16,x16,x9,ror#18 // Sigma1(e) + ror x9,x25,#28 + add x24,x24,x17 // h+=Ch(e,f,g) + eor x17,x25,x25,ror#5 + add x24,x24,x16 // h+=Sigma1(e) + and x19,x19,x28 // (b^c)&=(a^b) + add x20,x20,x24 // d+=h + eor x19,x19,x26 // Maj(a,b,c) + eor x17,x9,x17,ror#34 // Sigma0(a) + add x24,x24,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + //add x24,x24,x17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev x7,x7 // 4 +#endif + add x24,x24,x17 // h+=Sigma0(a) + ror x16,x20,#14 + add x23,x23,x19 // h+=K[i] + eor x10,x20,x20,ror#23 + and x17,x21,x20 + bic x19,x22,x20 + add x23,x23,x7 // h+=X[i] + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x24,x25 // a^b, b^c in next round + eor x16,x16,x10,ror#18 // Sigma1(e) + ror x10,x24,#28 + add x23,x23,x17 // h+=Ch(e,f,g) + eor x17,x24,x24,ror#5 + add x23,x23,x16 // h+=Sigma1(e) + and x28,x28,x19 // (b^c)&=(a^b) + add x27,x27,x23 // d+=h + eor x28,x28,x25 // Maj(a,b,c) + eor x17,x10,x17,ror#34 // Sigma0(a) + add x23,x23,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + //add x23,x23,x17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev x8,x8 // 5 +#endif + ldp x9,x10,[x1],#2*8 + add x23,x23,x17 // h+=Sigma0(a) + ror x16,x27,#14 + add x22,x22,x28 // h+=K[i] + eor x11,x27,x27,ror#23 + and x17,x20,x27 + bic x28,x21,x27 + add x22,x22,x8 // h+=X[i] + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x23,x24 // a^b, b^c in next round + eor x16,x16,x11,ror#18 // Sigma1(e) + ror x11,x23,#28 + add x22,x22,x17 // h+=Ch(e,f,g) + eor x17,x23,x23,ror#5 + add x22,x22,x16 // h+=Sigma1(e) + and x19,x19,x28 // (b^c)&=(a^b) + add x26,x26,x22 // d+=h + eor x19,x19,x24 // Maj(a,b,c) + eor x17,x11,x17,ror#34 // Sigma0(a) + add x22,x22,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + //add x22,x22,x17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev x9,x9 // 6 +#endif + add x22,x22,x17 // h+=Sigma0(a) + ror x16,x26,#14 + add x21,x21,x19 // h+=K[i] + eor x12,x26,x26,ror#23 + and x17,x27,x26 + bic x19,x20,x26 + add x21,x21,x9 // h+=X[i] + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x22,x23 // a^b, b^c in next round + eor x16,x16,x12,ror#18 // Sigma1(e) + ror x12,x22,#28 + add x21,x21,x17 // h+=Ch(e,f,g) + eor x17,x22,x22,ror#5 + add x21,x21,x16 // h+=Sigma1(e) + and x28,x28,x19 // (b^c)&=(a^b) + add x25,x25,x21 // d+=h + eor x28,x28,x23 // Maj(a,b,c) + eor x17,x12,x17,ror#34 // Sigma0(a) + add x21,x21,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + //add x21,x21,x17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev x10,x10 // 7 +#endif + ldp x11,x12,[x1],#2*8 + add x21,x21,x17 // h+=Sigma0(a) + ror x16,x25,#14 + add x20,x20,x28 // h+=K[i] + eor x13,x25,x25,ror#23 + and x17,x26,x25 + bic x28,x27,x25 + add x20,x20,x10 // h+=X[i] + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x21,x22 // a^b, b^c in next round + eor x16,x16,x13,ror#18 // Sigma1(e) + ror x13,x21,#28 + add x20,x20,x17 // h+=Ch(e,f,g) + eor x17,x21,x21,ror#5 + add x20,x20,x16 // h+=Sigma1(e) + and x19,x19,x28 // (b^c)&=(a^b) + add x24,x24,x20 // d+=h + eor x19,x19,x22 // Maj(a,b,c) + eor x17,x13,x17,ror#34 // Sigma0(a) + add x20,x20,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + //add x20,x20,x17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev x11,x11 // 8 +#endif + add x20,x20,x17 // h+=Sigma0(a) + ror x16,x24,#14 + add x27,x27,x19 // h+=K[i] + eor x14,x24,x24,ror#23 + and x17,x25,x24 + bic x19,x26,x24 + add x27,x27,x11 // h+=X[i] + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x20,x21 // a^b, b^c in next round + eor x16,x16,x14,ror#18 // Sigma1(e) + ror x14,x20,#28 + add x27,x27,x17 // h+=Ch(e,f,g) + eor x17,x20,x20,ror#5 + add x27,x27,x16 // h+=Sigma1(e) + and x28,x28,x19 // (b^c)&=(a^b) + add x23,x23,x27 // d+=h + eor x28,x28,x21 // Maj(a,b,c) + eor x17,x14,x17,ror#34 // Sigma0(a) + add x27,x27,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + //add x27,x27,x17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev x12,x12 // 9 +#endif + ldp x13,x14,[x1],#2*8 + add x27,x27,x17 // h+=Sigma0(a) + ror x16,x23,#14 + add x26,x26,x28 // h+=K[i] + eor x15,x23,x23,ror#23 + and x17,x24,x23 + bic x28,x25,x23 + add x26,x26,x12 // h+=X[i] + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x27,x20 // a^b, b^c in next round + eor x16,x16,x15,ror#18 // Sigma1(e) + ror x15,x27,#28 + add x26,x26,x17 // h+=Ch(e,f,g) + eor x17,x27,x27,ror#5 + add x26,x26,x16 // h+=Sigma1(e) + and x19,x19,x28 // (b^c)&=(a^b) + add x22,x22,x26 // d+=h + eor x19,x19,x20 // Maj(a,b,c) + eor x17,x15,x17,ror#34 // Sigma0(a) + add x26,x26,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + //add x26,x26,x17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev x13,x13 // 10 +#endif + add x26,x26,x17 // h+=Sigma0(a) + ror x16,x22,#14 + add x25,x25,x19 // h+=K[i] + eor x0,x22,x22,ror#23 + and x17,x23,x22 + bic x19,x24,x22 + add x25,x25,x13 // h+=X[i] + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x26,x27 // a^b, b^c in next round + eor x16,x16,x0,ror#18 // Sigma1(e) + ror x0,x26,#28 + add x25,x25,x17 // h+=Ch(e,f,g) + eor x17,x26,x26,ror#5 + add x25,x25,x16 // h+=Sigma1(e) + and x28,x28,x19 // (b^c)&=(a^b) + add x21,x21,x25 // d+=h + eor x28,x28,x27 // Maj(a,b,c) + eor x17,x0,x17,ror#34 // Sigma0(a) + add x25,x25,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + //add x25,x25,x17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev x14,x14 // 11 +#endif + ldp x15,x0,[x1],#2*8 + add x25,x25,x17 // h+=Sigma0(a) + str x6,[sp,#24] + ror x16,x21,#14 + add x24,x24,x28 // h+=K[i] + eor x6,x21,x21,ror#23 + and x17,x22,x21 + bic x28,x23,x21 + add x24,x24,x14 // h+=X[i] + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x25,x26 // a^b, b^c in next round + eor x16,x16,x6,ror#18 // Sigma1(e) + ror x6,x25,#28 + add x24,x24,x17 // h+=Ch(e,f,g) + eor x17,x25,x25,ror#5 + add x24,x24,x16 // h+=Sigma1(e) + and x19,x19,x28 // (b^c)&=(a^b) + add x20,x20,x24 // d+=h + eor x19,x19,x26 // Maj(a,b,c) + eor x17,x6,x17,ror#34 // Sigma0(a) + add x24,x24,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + //add x24,x24,x17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev x15,x15 // 12 +#endif + add x24,x24,x17 // h+=Sigma0(a) + str x7,[sp,#0] + ror x16,x20,#14 + add x23,x23,x19 // h+=K[i] + eor x7,x20,x20,ror#23 + and x17,x21,x20 + bic x19,x22,x20 + add x23,x23,x15 // h+=X[i] + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x24,x25 // a^b, b^c in next round + eor x16,x16,x7,ror#18 // Sigma1(e) + ror x7,x24,#28 + add x23,x23,x17 // h+=Ch(e,f,g) + eor x17,x24,x24,ror#5 + add x23,x23,x16 // h+=Sigma1(e) + and x28,x28,x19 // (b^c)&=(a^b) + add x27,x27,x23 // d+=h + eor x28,x28,x25 // Maj(a,b,c) + eor x17,x7,x17,ror#34 // Sigma0(a) + add x23,x23,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + //add x23,x23,x17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev x0,x0 // 13 +#endif + ldp x1,x2,[x1] + add x23,x23,x17 // h+=Sigma0(a) + str x8,[sp,#8] + ror x16,x27,#14 + add x22,x22,x28 // h+=K[i] + eor x8,x27,x27,ror#23 + and x17,x20,x27 + bic x28,x21,x27 + add x22,x22,x0 // h+=X[i] + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x23,x24 // a^b, b^c in next round + eor x16,x16,x8,ror#18 // Sigma1(e) + ror x8,x23,#28 + add x22,x22,x17 // h+=Ch(e,f,g) + eor x17,x23,x23,ror#5 + add x22,x22,x16 // h+=Sigma1(e) + and x19,x19,x28 // (b^c)&=(a^b) + add x26,x26,x22 // d+=h + eor x19,x19,x24 // Maj(a,b,c) + eor x17,x8,x17,ror#34 // Sigma0(a) + add x22,x22,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + //add x22,x22,x17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev x1,x1 // 14 +#endif + ldr x6,[sp,#24] + add x22,x22,x17 // h+=Sigma0(a) + str x9,[sp,#16] + ror x16,x26,#14 + add x21,x21,x19 // h+=K[i] + eor x9,x26,x26,ror#23 + and x17,x27,x26 + bic x19,x20,x26 + add x21,x21,x1 // h+=X[i] + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x22,x23 // a^b, b^c in next round + eor x16,x16,x9,ror#18 // Sigma1(e) + ror x9,x22,#28 + add x21,x21,x17 // h+=Ch(e,f,g) + eor x17,x22,x22,ror#5 + add x21,x21,x16 // h+=Sigma1(e) + and x28,x28,x19 // (b^c)&=(a^b) + add x25,x25,x21 // d+=h + eor x28,x28,x23 // Maj(a,b,c) + eor x17,x9,x17,ror#34 // Sigma0(a) + add x21,x21,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + //add x21,x21,x17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev x2,x2 // 15 +#endif + ldr x7,[sp,#0] + add x21,x21,x17 // h+=Sigma0(a) + str x10,[sp,#24] + ror x16,x25,#14 + add x20,x20,x28 // h+=K[i] + ror x9,x4,#1 + and x17,x26,x25 + ror x8,x1,#19 + bic x28,x27,x25 + ror x10,x21,#28 + add x20,x20,x2 // h+=X[i] + eor x16,x16,x25,ror#18 + eor x9,x9,x4,ror#8 + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x21,x22 // a^b, b^c in next round + eor x16,x16,x25,ror#41 // Sigma1(e) + eor x10,x10,x21,ror#34 + add x20,x20,x17 // h+=Ch(e,f,g) + and x19,x19,x28 // (b^c)&=(a^b) + eor x8,x8,x1,ror#61 + eor x9,x9,x4,lsr#7 // sigma0(X[i+1]) + add x20,x20,x16 // h+=Sigma1(e) + eor x19,x19,x22 // Maj(a,b,c) + eor x17,x10,x21,ror#39 // Sigma0(a) + eor x8,x8,x1,lsr#6 // sigma1(X[i+14]) + add x3,x3,x12 + add x24,x24,x20 // d+=h + add x20,x20,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + add x3,x3,x9 + add x20,x20,x17 // h+=Sigma0(a) + add x3,x3,x8 +.Loop_16_xx: + ldr x8,[sp,#8] + str x11,[sp,#0] + ror x16,x24,#14 + add x27,x27,x19 // h+=K[i] + ror x10,x5,#1 + and x17,x25,x24 + ror x9,x2,#19 + bic x19,x26,x24 + ror x11,x20,#28 + add x27,x27,x3 // h+=X[i] + eor x16,x16,x24,ror#18 + eor x10,x10,x5,ror#8 + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x20,x21 // a^b, b^c in next round + eor x16,x16,x24,ror#41 // Sigma1(e) + eor x11,x11,x20,ror#34 + add x27,x27,x17 // h+=Ch(e,f,g) + and x28,x28,x19 // (b^c)&=(a^b) + eor x9,x9,x2,ror#61 + eor x10,x10,x5,lsr#7 // sigma0(X[i+1]) + add x27,x27,x16 // h+=Sigma1(e) + eor x28,x28,x21 // Maj(a,b,c) + eor x17,x11,x20,ror#39 // Sigma0(a) + eor x9,x9,x2,lsr#6 // sigma1(X[i+14]) + add x4,x4,x13 + add x23,x23,x27 // d+=h + add x27,x27,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + add x4,x4,x10 + add x27,x27,x17 // h+=Sigma0(a) + add x4,x4,x9 + ldr x9,[sp,#16] + str x12,[sp,#8] + ror x16,x23,#14 + add x26,x26,x28 // h+=K[i] + ror x11,x6,#1 + and x17,x24,x23 + ror x10,x3,#19 + bic x28,x25,x23 + ror x12,x27,#28 + add x26,x26,x4 // h+=X[i] + eor x16,x16,x23,ror#18 + eor x11,x11,x6,ror#8 + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x27,x20 // a^b, b^c in next round + eor x16,x16,x23,ror#41 // Sigma1(e) + eor x12,x12,x27,ror#34 + add x26,x26,x17 // h+=Ch(e,f,g) + and x19,x19,x28 // (b^c)&=(a^b) + eor x10,x10,x3,ror#61 + eor x11,x11,x6,lsr#7 // sigma0(X[i+1]) + add x26,x26,x16 // h+=Sigma1(e) + eor x19,x19,x20 // Maj(a,b,c) + eor x17,x12,x27,ror#39 // Sigma0(a) + eor x10,x10,x3,lsr#6 // sigma1(X[i+14]) + add x5,x5,x14 + add x22,x22,x26 // d+=h + add x26,x26,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + add x5,x5,x11 + add x26,x26,x17 // h+=Sigma0(a) + add x5,x5,x10 + ldr x10,[sp,#24] + str x13,[sp,#16] + ror x16,x22,#14 + add x25,x25,x19 // h+=K[i] + ror x12,x7,#1 + and x17,x23,x22 + ror x11,x4,#19 + bic x19,x24,x22 + ror x13,x26,#28 + add x25,x25,x5 // h+=X[i] + eor x16,x16,x22,ror#18 + eor x12,x12,x7,ror#8 + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x26,x27 // a^b, b^c in next round + eor x16,x16,x22,ror#41 // Sigma1(e) + eor x13,x13,x26,ror#34 + add x25,x25,x17 // h+=Ch(e,f,g) + and x28,x28,x19 // (b^c)&=(a^b) + eor x11,x11,x4,ror#61 + eor x12,x12,x7,lsr#7 // sigma0(X[i+1]) + add x25,x25,x16 // h+=Sigma1(e) + eor x28,x28,x27 // Maj(a,b,c) + eor x17,x13,x26,ror#39 // Sigma0(a) + eor x11,x11,x4,lsr#6 // sigma1(X[i+14]) + add x6,x6,x15 + add x21,x21,x25 // d+=h + add x25,x25,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + add x6,x6,x12 + add x25,x25,x17 // h+=Sigma0(a) + add x6,x6,x11 + ldr x11,[sp,#0] + str x14,[sp,#24] + ror x16,x21,#14 + add x24,x24,x28 // h+=K[i] + ror x13,x8,#1 + and x17,x22,x21 + ror x12,x5,#19 + bic x28,x23,x21 + ror x14,x25,#28 + add x24,x24,x6 // h+=X[i] + eor x16,x16,x21,ror#18 + eor x13,x13,x8,ror#8 + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x25,x26 // a^b, b^c in next round + eor x16,x16,x21,ror#41 // Sigma1(e) + eor x14,x14,x25,ror#34 + add x24,x24,x17 // h+=Ch(e,f,g) + and x19,x19,x28 // (b^c)&=(a^b) + eor x12,x12,x5,ror#61 + eor x13,x13,x8,lsr#7 // sigma0(X[i+1]) + add x24,x24,x16 // h+=Sigma1(e) + eor x19,x19,x26 // Maj(a,b,c) + eor x17,x14,x25,ror#39 // Sigma0(a) + eor x12,x12,x5,lsr#6 // sigma1(X[i+14]) + add x7,x7,x0 + add x20,x20,x24 // d+=h + add x24,x24,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + add x7,x7,x13 + add x24,x24,x17 // h+=Sigma0(a) + add x7,x7,x12 + ldr x12,[sp,#8] + str x15,[sp,#0] + ror x16,x20,#14 + add x23,x23,x19 // h+=K[i] + ror x14,x9,#1 + and x17,x21,x20 + ror x13,x6,#19 + bic x19,x22,x20 + ror x15,x24,#28 + add x23,x23,x7 // h+=X[i] + eor x16,x16,x20,ror#18 + eor x14,x14,x9,ror#8 + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x24,x25 // a^b, b^c in next round + eor x16,x16,x20,ror#41 // Sigma1(e) + eor x15,x15,x24,ror#34 + add x23,x23,x17 // h+=Ch(e,f,g) + and x28,x28,x19 // (b^c)&=(a^b) + eor x13,x13,x6,ror#61 + eor x14,x14,x9,lsr#7 // sigma0(X[i+1]) + add x23,x23,x16 // h+=Sigma1(e) + eor x28,x28,x25 // Maj(a,b,c) + eor x17,x15,x24,ror#39 // Sigma0(a) + eor x13,x13,x6,lsr#6 // sigma1(X[i+14]) + add x8,x8,x1 + add x27,x27,x23 // d+=h + add x23,x23,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + add x8,x8,x14 + add x23,x23,x17 // h+=Sigma0(a) + add x8,x8,x13 + ldr x13,[sp,#16] + str x0,[sp,#8] + ror x16,x27,#14 + add x22,x22,x28 // h+=K[i] + ror x15,x10,#1 + and x17,x20,x27 + ror x14,x7,#19 + bic x28,x21,x27 + ror x0,x23,#28 + add x22,x22,x8 // h+=X[i] + eor x16,x16,x27,ror#18 + eor x15,x15,x10,ror#8 + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x23,x24 // a^b, b^c in next round + eor x16,x16,x27,ror#41 // Sigma1(e) + eor x0,x0,x23,ror#34 + add x22,x22,x17 // h+=Ch(e,f,g) + and x19,x19,x28 // (b^c)&=(a^b) + eor x14,x14,x7,ror#61 + eor x15,x15,x10,lsr#7 // sigma0(X[i+1]) + add x22,x22,x16 // h+=Sigma1(e) + eor x19,x19,x24 // Maj(a,b,c) + eor x17,x0,x23,ror#39 // Sigma0(a) + eor x14,x14,x7,lsr#6 // sigma1(X[i+14]) + add x9,x9,x2 + add x26,x26,x22 // d+=h + add x22,x22,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + add x9,x9,x15 + add x22,x22,x17 // h+=Sigma0(a) + add x9,x9,x14 + ldr x14,[sp,#24] + str x1,[sp,#16] + ror x16,x26,#14 + add x21,x21,x19 // h+=K[i] + ror x0,x11,#1 + and x17,x27,x26 + ror x15,x8,#19 + bic x19,x20,x26 + ror x1,x22,#28 + add x21,x21,x9 // h+=X[i] + eor x16,x16,x26,ror#18 + eor x0,x0,x11,ror#8 + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x22,x23 // a^b, b^c in next round + eor x16,x16,x26,ror#41 // Sigma1(e) + eor x1,x1,x22,ror#34 + add x21,x21,x17 // h+=Ch(e,f,g) + and x28,x28,x19 // (b^c)&=(a^b) + eor x15,x15,x8,ror#61 + eor x0,x0,x11,lsr#7 // sigma0(X[i+1]) + add x21,x21,x16 // h+=Sigma1(e) + eor x28,x28,x23 // Maj(a,b,c) + eor x17,x1,x22,ror#39 // Sigma0(a) + eor x15,x15,x8,lsr#6 // sigma1(X[i+14]) + add x10,x10,x3 + add x25,x25,x21 // d+=h + add x21,x21,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + add x10,x10,x0 + add x21,x21,x17 // h+=Sigma0(a) + add x10,x10,x15 + ldr x15,[sp,#0] + str x2,[sp,#24] + ror x16,x25,#14 + add x20,x20,x28 // h+=K[i] + ror x1,x12,#1 + and x17,x26,x25 + ror x0,x9,#19 + bic x28,x27,x25 + ror x2,x21,#28 + add x20,x20,x10 // h+=X[i] + eor x16,x16,x25,ror#18 + eor x1,x1,x12,ror#8 + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x21,x22 // a^b, b^c in next round + eor x16,x16,x25,ror#41 // Sigma1(e) + eor x2,x2,x21,ror#34 + add x20,x20,x17 // h+=Ch(e,f,g) + and x19,x19,x28 // (b^c)&=(a^b) + eor x0,x0,x9,ror#61 + eor x1,x1,x12,lsr#7 // sigma0(X[i+1]) + add x20,x20,x16 // h+=Sigma1(e) + eor x19,x19,x22 // Maj(a,b,c) + eor x17,x2,x21,ror#39 // Sigma0(a) + eor x0,x0,x9,lsr#6 // sigma1(X[i+14]) + add x11,x11,x4 + add x24,x24,x20 // d+=h + add x20,x20,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + add x11,x11,x1 + add x20,x20,x17 // h+=Sigma0(a) + add x11,x11,x0 + ldr x0,[sp,#8] + str x3,[sp,#0] + ror x16,x24,#14 + add x27,x27,x19 // h+=K[i] + ror x2,x13,#1 + and x17,x25,x24 + ror x1,x10,#19 + bic x19,x26,x24 + ror x3,x20,#28 + add x27,x27,x11 // h+=X[i] + eor x16,x16,x24,ror#18 + eor x2,x2,x13,ror#8 + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x20,x21 // a^b, b^c in next round + eor x16,x16,x24,ror#41 // Sigma1(e) + eor x3,x3,x20,ror#34 + add x27,x27,x17 // h+=Ch(e,f,g) + and x28,x28,x19 // (b^c)&=(a^b) + eor x1,x1,x10,ror#61 + eor x2,x2,x13,lsr#7 // sigma0(X[i+1]) + add x27,x27,x16 // h+=Sigma1(e) + eor x28,x28,x21 // Maj(a,b,c) + eor x17,x3,x20,ror#39 // Sigma0(a) + eor x1,x1,x10,lsr#6 // sigma1(X[i+14]) + add x12,x12,x5 + add x23,x23,x27 // d+=h + add x27,x27,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + add x12,x12,x2 + add x27,x27,x17 // h+=Sigma0(a) + add x12,x12,x1 + ldr x1,[sp,#16] + str x4,[sp,#8] + ror x16,x23,#14 + add x26,x26,x28 // h+=K[i] + ror x3,x14,#1 + and x17,x24,x23 + ror x2,x11,#19 + bic x28,x25,x23 + ror x4,x27,#28 + add x26,x26,x12 // h+=X[i] + eor x16,x16,x23,ror#18 + eor x3,x3,x14,ror#8 + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x27,x20 // a^b, b^c in next round + eor x16,x16,x23,ror#41 // Sigma1(e) + eor x4,x4,x27,ror#34 + add x26,x26,x17 // h+=Ch(e,f,g) + and x19,x19,x28 // (b^c)&=(a^b) + eor x2,x2,x11,ror#61 + eor x3,x3,x14,lsr#7 // sigma0(X[i+1]) + add x26,x26,x16 // h+=Sigma1(e) + eor x19,x19,x20 // Maj(a,b,c) + eor x17,x4,x27,ror#39 // Sigma0(a) + eor x2,x2,x11,lsr#6 // sigma1(X[i+14]) + add x13,x13,x6 + add x22,x22,x26 // d+=h + add x26,x26,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + add x13,x13,x3 + add x26,x26,x17 // h+=Sigma0(a) + add x13,x13,x2 + ldr x2,[sp,#24] + str x5,[sp,#16] + ror x16,x22,#14 + add x25,x25,x19 // h+=K[i] + ror x4,x15,#1 + and x17,x23,x22 + ror x3,x12,#19 + bic x19,x24,x22 + ror x5,x26,#28 + add x25,x25,x13 // h+=X[i] + eor x16,x16,x22,ror#18 + eor x4,x4,x15,ror#8 + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x26,x27 // a^b, b^c in next round + eor x16,x16,x22,ror#41 // Sigma1(e) + eor x5,x5,x26,ror#34 + add x25,x25,x17 // h+=Ch(e,f,g) + and x28,x28,x19 // (b^c)&=(a^b) + eor x3,x3,x12,ror#61 + eor x4,x4,x15,lsr#7 // sigma0(X[i+1]) + add x25,x25,x16 // h+=Sigma1(e) + eor x28,x28,x27 // Maj(a,b,c) + eor x17,x5,x26,ror#39 // Sigma0(a) + eor x3,x3,x12,lsr#6 // sigma1(X[i+14]) + add x14,x14,x7 + add x21,x21,x25 // d+=h + add x25,x25,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + add x14,x14,x4 + add x25,x25,x17 // h+=Sigma0(a) + add x14,x14,x3 + ldr x3,[sp,#0] + str x6,[sp,#24] + ror x16,x21,#14 + add x24,x24,x28 // h+=K[i] + ror x5,x0,#1 + and x17,x22,x21 + ror x4,x13,#19 + bic x28,x23,x21 + ror x6,x25,#28 + add x24,x24,x14 // h+=X[i] + eor x16,x16,x21,ror#18 + eor x5,x5,x0,ror#8 + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x25,x26 // a^b, b^c in next round + eor x16,x16,x21,ror#41 // Sigma1(e) + eor x6,x6,x25,ror#34 + add x24,x24,x17 // h+=Ch(e,f,g) + and x19,x19,x28 // (b^c)&=(a^b) + eor x4,x4,x13,ror#61 + eor x5,x5,x0,lsr#7 // sigma0(X[i+1]) + add x24,x24,x16 // h+=Sigma1(e) + eor x19,x19,x26 // Maj(a,b,c) + eor x17,x6,x25,ror#39 // Sigma0(a) + eor x4,x4,x13,lsr#6 // sigma1(X[i+14]) + add x15,x15,x8 + add x20,x20,x24 // d+=h + add x24,x24,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + add x15,x15,x5 + add x24,x24,x17 // h+=Sigma0(a) + add x15,x15,x4 + ldr x4,[sp,#8] + str x7,[sp,#0] + ror x16,x20,#14 + add x23,x23,x19 // h+=K[i] + ror x6,x1,#1 + and x17,x21,x20 + ror x5,x14,#19 + bic x19,x22,x20 + ror x7,x24,#28 + add x23,x23,x15 // h+=X[i] + eor x16,x16,x20,ror#18 + eor x6,x6,x1,ror#8 + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x24,x25 // a^b, b^c in next round + eor x16,x16,x20,ror#41 // Sigma1(e) + eor x7,x7,x24,ror#34 + add x23,x23,x17 // h+=Ch(e,f,g) + and x28,x28,x19 // (b^c)&=(a^b) + eor x5,x5,x14,ror#61 + eor x6,x6,x1,lsr#7 // sigma0(X[i+1]) + add x23,x23,x16 // h+=Sigma1(e) + eor x28,x28,x25 // Maj(a,b,c) + eor x17,x7,x24,ror#39 // Sigma0(a) + eor x5,x5,x14,lsr#6 // sigma1(X[i+14]) + add x0,x0,x9 + add x27,x27,x23 // d+=h + add x23,x23,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + add x0,x0,x6 + add x23,x23,x17 // h+=Sigma0(a) + add x0,x0,x5 + ldr x5,[sp,#16] + str x8,[sp,#8] + ror x16,x27,#14 + add x22,x22,x28 // h+=K[i] + ror x7,x2,#1 + and x17,x20,x27 + ror x6,x15,#19 + bic x28,x21,x27 + ror x8,x23,#28 + add x22,x22,x0 // h+=X[i] + eor x16,x16,x27,ror#18 + eor x7,x7,x2,ror#8 + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x23,x24 // a^b, b^c in next round + eor x16,x16,x27,ror#41 // Sigma1(e) + eor x8,x8,x23,ror#34 + add x22,x22,x17 // h+=Ch(e,f,g) + and x19,x19,x28 // (b^c)&=(a^b) + eor x6,x6,x15,ror#61 + eor x7,x7,x2,lsr#7 // sigma0(X[i+1]) + add x22,x22,x16 // h+=Sigma1(e) + eor x19,x19,x24 // Maj(a,b,c) + eor x17,x8,x23,ror#39 // Sigma0(a) + eor x6,x6,x15,lsr#6 // sigma1(X[i+14]) + add x1,x1,x10 + add x26,x26,x22 // d+=h + add x22,x22,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + add x1,x1,x7 + add x22,x22,x17 // h+=Sigma0(a) + add x1,x1,x6 + ldr x6,[sp,#24] + str x9,[sp,#16] + ror x16,x26,#14 + add x21,x21,x19 // h+=K[i] + ror x8,x3,#1 + and x17,x27,x26 + ror x7,x0,#19 + bic x19,x20,x26 + ror x9,x22,#28 + add x21,x21,x1 // h+=X[i] + eor x16,x16,x26,ror#18 + eor x8,x8,x3,ror#8 + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x22,x23 // a^b, b^c in next round + eor x16,x16,x26,ror#41 // Sigma1(e) + eor x9,x9,x22,ror#34 + add x21,x21,x17 // h+=Ch(e,f,g) + and x28,x28,x19 // (b^c)&=(a^b) + eor x7,x7,x0,ror#61 + eor x8,x8,x3,lsr#7 // sigma0(X[i+1]) + add x21,x21,x16 // h+=Sigma1(e) + eor x28,x28,x23 // Maj(a,b,c) + eor x17,x9,x22,ror#39 // Sigma0(a) + eor x7,x7,x0,lsr#6 // sigma1(X[i+14]) + add x2,x2,x11 + add x25,x25,x21 // d+=h + add x21,x21,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + add x2,x2,x8 + add x21,x21,x17 // h+=Sigma0(a) + add x2,x2,x7 + ldr x7,[sp,#0] + str x10,[sp,#24] + ror x16,x25,#14 + add x20,x20,x28 // h+=K[i] + ror x9,x4,#1 + and x17,x26,x25 + ror x8,x1,#19 + bic x28,x27,x25 + ror x10,x21,#28 + add x20,x20,x2 // h+=X[i] + eor x16,x16,x25,ror#18 + eor x9,x9,x4,ror#8 + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x21,x22 // a^b, b^c in next round + eor x16,x16,x25,ror#41 // Sigma1(e) + eor x10,x10,x21,ror#34 + add x20,x20,x17 // h+=Ch(e,f,g) + and x19,x19,x28 // (b^c)&=(a^b) + eor x8,x8,x1,ror#61 + eor x9,x9,x4,lsr#7 // sigma0(X[i+1]) + add x20,x20,x16 // h+=Sigma1(e) + eor x19,x19,x22 // Maj(a,b,c) + eor x17,x10,x21,ror#39 // Sigma0(a) + eor x8,x8,x1,lsr#6 // sigma1(X[i+14]) + add x3,x3,x12 + add x24,x24,x20 // d+=h + add x20,x20,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + add x3,x3,x9 + add x20,x20,x17 // h+=Sigma0(a) + add x3,x3,x8 + cbnz x19,.Loop_16_xx + + ldp x0,x2,[x29,#96] + ldr x1,[x29,#112] + sub x30,x30,#648 // rewind + + ldp x3,x4,[x0] + ldp x5,x6,[x0,#2*8] + add x1,x1,#14*8 // advance input pointer + ldp x7,x8,[x0,#4*8] + add x20,x20,x3 + ldp x9,x10,[x0,#6*8] + add x21,x21,x4 + add x22,x22,x5 + add x23,x23,x6 + stp x20,x21,[x0] + add x24,x24,x7 + add x25,x25,x8 + stp x22,x23,[x0,#2*8] + add x26,x26,x9 + add x27,x27,x10 + cmp x1,x2 + stp x24,x25,[x0,#4*8] + stp x26,x27,[x0,#6*8] + b.ne .Loop + + ldp x19,x20,[x29,#16] + add sp,sp,#4*8 + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldp x29,x30,[sp],#128 + ret +.size sha512_block_data_order,.-sha512_block_data_order + +.align 6 +.type .LK512,%object +.LK512: + .quad 0x428a2f98d728ae22,0x7137449123ef65cd + .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc + .quad 0x3956c25bf348b538,0x59f111f1b605d019 + .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118 + .quad 0xd807aa98a3030242,0x12835b0145706fbe + .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2 + .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1 + .quad 0x9bdc06a725c71235,0xc19bf174cf692694 + .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3 + .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65 + .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483 + .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5 + .quad 0x983e5152ee66dfab,0xa831c66d2db43210 + .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4 + .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725 + .quad 0x06ca6351e003826f,0x142929670a0e6e70 + .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926 + .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df + .quad 0x650a73548baf63de,0x766a0abb3c77b2a8 + .quad 0x81c2c92e47edaee6,0x92722c851482353b + .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001 + .quad 0xc24b8b70d0f89791,0xc76c51a30654be30 + .quad 0xd192e819d6ef5218,0xd69906245565a910 + .quad 0xf40e35855771202a,0x106aa07032bbd1b8 + .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53 + .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8 + .quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb + .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3 + .quad 0x748f82ee5defb2fc,0x78a5636f43172f60 + .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec + .quad 0x90befffa23631e28,0xa4506cebde82bde9 + .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b + .quad 0xca273eceea26619c,0xd186b8c721c0c207 + .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178 + .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6 + .quad 0x113f9804bef90dae,0x1b710b35131c471b + .quad 0x28db77f523047d84,0x32caab7b40c72493 + .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c + .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a + .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817 + .quad 0 // terminator +.size .LK512,.-.LK512 +#ifndef __KERNEL__ +.align 3 +.LOPENSSL_armcap_P: +# ifdef __ILP32__ + .long OPENSSL_armcap_P-. +# else + .quad OPENSSL_armcap_P-. +# endif +#endif +.asciz "SHA512 block transform for ARMv8, CRYPTOGAMS by <appro@openssl.org>" +.align 2 +#ifndef __KERNEL__ +.comm OPENSSL_armcap_P,4,4 +#endif diff --git a/arch/arm64/crypto/sha512-glue.c b/arch/arm64/crypto/sha512-glue.c new file mode 100644 index 000000000000..aff35c9992a4 --- /dev/null +++ b/arch/arm64/crypto/sha512-glue.c @@ -0,0 +1,94 @@ +/* + * Linux/arm64 port of the OpenSSL SHA512 implementation for AArch64 + * + * Copyright (c) 2016 Linaro Ltd. <ard.biesheuvel@linaro.org> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. + * + */ + +#include <crypto/internal/hash.h> +#include <linux/cryptohash.h> +#include <linux/types.h> +#include <linux/string.h> +#include <crypto/sha.h> +#include <crypto/sha512_base.h> +#include <asm/neon.h> + +MODULE_DESCRIPTION("SHA-384/SHA-512 secure hash for arm64"); +MODULE_AUTHOR("Andy Polyakov <appro@openssl.org>"); +MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>"); +MODULE_LICENSE("GPL v2"); +MODULE_ALIAS_CRYPTO("sha384"); +MODULE_ALIAS_CRYPTO("sha512"); + +asmlinkage void sha512_block_data_order(u32 *digest, const void *data, + unsigned int num_blks); + +static int sha512_update(struct shash_desc *desc, const u8 *data, + unsigned int len) +{ + return sha512_base_do_update(desc, data, len, + (sha512_block_fn *)sha512_block_data_order); +} + +static int sha512_finup(struct shash_desc *desc, const u8 *data, + unsigned int len, u8 *out) +{ + if (len) + sha512_base_do_update(desc, data, len, + (sha512_block_fn *)sha512_block_data_order); + sha512_base_do_finalize(desc, + (sha512_block_fn *)sha512_block_data_order); + + return sha512_base_finish(desc, out); +} + +static int sha512_final(struct shash_desc *desc, u8 *out) +{ + return sha512_finup(desc, NULL, 0, out); +} + +static struct shash_alg algs[] = { { + .digestsize = SHA512_DIGEST_SIZE, + .init = sha512_base_init, + .update = sha512_update, + .final = sha512_final, + .finup = sha512_finup, + .descsize = sizeof(struct sha512_state), + .base.cra_name = "sha512", + .base.cra_driver_name = "sha512-arm64", + .base.cra_priority = 150, + .base.cra_flags = CRYPTO_ALG_TYPE_SHASH, + .base.cra_blocksize = SHA512_BLOCK_SIZE, + .base.cra_module = THIS_MODULE, +}, { + .digestsize = SHA384_DIGEST_SIZE, + .init = sha384_base_init, + .update = sha512_update, + .final = sha512_final, + .finup = sha512_finup, + .descsize = sizeof(struct sha512_state), + .base.cra_name = "sha384", + .base.cra_driver_name = "sha384-arm64", + .base.cra_priority = 150, + .base.cra_flags = CRYPTO_ALG_TYPE_SHASH, + .base.cra_blocksize = SHA384_BLOCK_SIZE, + .base.cra_module = THIS_MODULE, +} }; + +static int __init sha512_mod_init(void) +{ + return crypto_register_shashes(algs, ARRAY_SIZE(algs)); +} + +static void __exit sha512_mod_fini(void) +{ + crypto_unregister_shashes(algs, ARRAY_SIZE(algs)); +} + +module_init(sha512_mod_init); +module_exit(sha512_mod_fini); diff --git a/arch/arm64/include/asm/Kbuild b/arch/arm64/include/asm/Kbuild index 44e1d7f10add..8365a84c2640 100644 --- a/arch/arm64/include/asm/Kbuild +++ b/arch/arm64/include/asm/Kbuild @@ -1,7 +1,6 @@ generic-y += bugs.h generic-y += clkdev.h generic-y += cputime.h -generic-y += current.h generic-y += delay.h generic-y += div64.h generic-y += dma.h @@ -24,7 +23,6 @@ generic-y += mm-arch-hooks.h generic-y += mman.h generic-y += msgbuf.h generic-y += msi.h -generic-y += mutex.h generic-y += poll.h generic-y += preempt.h generic-y += resource.h diff --git a/arch/arm64/include/asm/acpi.h b/arch/arm64/include/asm/acpi.h index e517088d635f..d0de0e032bc2 100644 --- a/arch/arm64/include/asm/acpi.h +++ b/arch/arm64/include/asm/acpi.h @@ -17,6 +17,7 @@ #include <asm/cputype.h> #include <asm/smp_plat.h> +#include <asm/tlbflush.h> /* Macros for consistency checks of the GICC subtable of MADT */ #define ACPI_MADT_GICC_LENGTH \ @@ -114,8 +115,28 @@ static inline const char *acpi_get_enable_method(int cpu) } #ifdef CONFIG_ACPI_APEI +/* + * acpi_disable_cmcff is used in drivers/acpi/apei/hest.c for disabling + * IA-32 Architecture Corrected Machine Check (CMC) Firmware-First mode + * with a kernel command line parameter "acpi=nocmcoff". But we don't + * have this IA-32 specific feature on ARM64, this definition is only + * for compatibility. + */ +#define acpi_disable_cmcff 1 pgprot_t arch_apei_get_mem_attribute(phys_addr_t addr); -#endif + +/* + * Despite its name, this function must still broadcast the TLB + * invalidation in order to ensure other CPUs don't end up with junk + * entries as a result of speculation. Unusually, its also called in + * IRQ context (ghes_iounmap_irq) so if we ever need to use IPIs for + * TLB broadcasting, then we're in trouble here. + */ +static inline void arch_apei_flush_tlb_one(unsigned long addr) +{ + flush_tlb_kernel_range(addr, addr + PAGE_SIZE); +} +#endif /* CONFIG_ACPI_APEI */ #ifdef CONFIG_ACPI_NUMA int arm64_acpi_numa_init(void); diff --git a/arch/arm64/include/asm/alternative.h b/arch/arm64/include/asm/alternative.h index 39feb85a6931..6e1cb8c5af4d 100644 --- a/arch/arm64/include/asm/alternative.h +++ b/arch/arm64/include/asm/alternative.h @@ -1,7 +1,7 @@ #ifndef __ASM_ALTERNATIVE_H #define __ASM_ALTERNATIVE_H -#include <asm/cpufeature.h> +#include <asm/cpucaps.h> #include <asm/insn.h> #ifndef __ASSEMBLY__ diff --git a/arch/arm64/include/asm/arch_gicv3.h b/arch/arm64/include/asm/arch_gicv3.h index f8ae6d6e4767..f37e3a21f6e7 100644 --- a/arch/arm64/include/asm/arch_gicv3.h +++ b/arch/arm64/include/asm/arch_gicv3.h @@ -79,19 +79,10 @@ #include <linux/stringify.h> #include <asm/barrier.h> +#include <asm/cacheflush.h> -#define read_gicreg(r) \ - ({ \ - u64 reg; \ - asm volatile("mrs_s %0, " __stringify(r) : "=r" (reg)); \ - reg; \ - }) - -#define write_gicreg(v,r) \ - do { \ - u64 __val = (v); \ - asm volatile("msr_s " __stringify(r) ", %0" : : "r" (__val));\ - } while (0) +#define read_gicreg read_sysreg_s +#define write_gicreg write_sysreg_s /* * Low-level accessors @@ -102,13 +93,13 @@ static inline void gic_write_eoir(u32 irq) { - asm volatile("msr_s " __stringify(ICC_EOIR1_EL1) ", %0" : : "r" ((u64)irq)); + write_sysreg_s(irq, ICC_EOIR1_EL1); isb(); } static inline void gic_write_dir(u32 irq) { - asm volatile("msr_s " __stringify(ICC_DIR_EL1) ", %0" : : "r" ((u64)irq)); + write_sysreg_s(irq, ICC_DIR_EL1); isb(); } @@ -116,7 +107,7 @@ static inline u64 gic_read_iar_common(void) { u64 irqstat; - asm volatile("mrs_s %0, " __stringify(ICC_IAR1_EL1) : "=r" (irqstat)); + irqstat = read_sysreg_s(ICC_IAR1_EL1); dsb(sy); return irqstat; } @@ -132,12 +123,9 @@ static inline u64 gic_read_iar_cavium_thunderx(void) { u64 irqstat; - asm volatile( - "nop;nop;nop;nop\n\t" - "nop;nop;nop;nop\n\t" - "mrs_s %0, " __stringify(ICC_IAR1_EL1) "\n\t" - "nop;nop;nop;nop" - : "=r" (irqstat)); + nops(8); + irqstat = read_sysreg_s(ICC_IAR1_EL1); + nops(4); mb(); return irqstat; @@ -145,37 +133,34 @@ static inline u64 gic_read_iar_cavium_thunderx(void) static inline void gic_write_pmr(u32 val) { - asm volatile("msr_s " __stringify(ICC_PMR_EL1) ", %0" : : "r" ((u64)val)); + write_sysreg_s(val, ICC_PMR_EL1); } static inline void gic_write_ctlr(u32 val) { - asm volatile("msr_s " __stringify(ICC_CTLR_EL1) ", %0" : : "r" ((u64)val)); + write_sysreg_s(val, ICC_CTLR_EL1); isb(); } static inline void gic_write_grpen1(u32 val) { - asm volatile("msr_s " __stringify(ICC_GRPEN1_EL1) ", %0" : : "r" ((u64)val)); + write_sysreg_s(val, ICC_GRPEN1_EL1); isb(); } static inline void gic_write_sgi1r(u64 val) { - asm volatile("msr_s " __stringify(ICC_SGI1R_EL1) ", %0" : : "r" (val)); + write_sysreg_s(val, ICC_SGI1R_EL1); } static inline u32 gic_read_sre(void) { - u64 val; - - asm volatile("mrs_s %0, " __stringify(ICC_SRE_EL1) : "=r" (val)); - return val; + return read_sysreg_s(ICC_SRE_EL1); } static inline void gic_write_sre(u32 val) { - asm volatile("msr_s " __stringify(ICC_SRE_EL1) ", %0" : : "r" ((u64)val)); + write_sysreg_s(val, ICC_SRE_EL1); isb(); } @@ -187,5 +172,21 @@ static inline void gic_write_bpr1(u32 val) #define gic_read_typer(c) readq_relaxed(c) #define gic_write_irouter(v, c) writeq_relaxed(v, c) +#define gic_flush_dcache_to_poc(a,l) __flush_dcache_area((a), (l)) + +#define gits_read_baser(c) readq_relaxed(c) +#define gits_write_baser(v, c) writeq_relaxed(v, c) + +#define gits_read_cbaser(c) readq_relaxed(c) +#define gits_write_cbaser(v, c) writeq_relaxed(v, c) + +#define gits_write_cwriter(v, c) writeq_relaxed(v, c) + +#define gicr_read_propbaser(c) readq_relaxed(c) +#define gicr_write_propbaser(v, c) writeq_relaxed(v, c) + +#define gicr_write_pendbaser(v, c) writeq_relaxed(v, c) +#define gicr_read_pendbaser(c) readq_relaxed(c) + #endif /* __ASSEMBLY__ */ #endif /* __ASM_ARCH_GICV3_H */ diff --git a/arch/arm64/include/asm/assembler.h b/arch/arm64/include/asm/assembler.h index 28bfe6132eb6..446f6c46d4b1 100644 --- a/arch/arm64/include/asm/assembler.h +++ b/arch/arm64/include/asm/assembler.h @@ -41,6 +41,15 @@ msr daifclr, #2 .endm + .macro save_and_disable_irq, flags + mrs \flags, daif + msr daifset, #2 + .endm + + .macro restore_irq, flags + msr daif, \flags + .endm + /* * Enable and disable debug exceptions. */ @@ -202,14 +211,25 @@ lr .req x30 // link register .endm /* + * @dst: Result of per_cpu(sym, smp_processor_id()) * @sym: The name of the per-cpu variable - * @reg: Result of per_cpu(sym, smp_processor_id()) * @tmp: scratch register */ - .macro this_cpu_ptr, sym, reg, tmp - adr_l \reg, \sym + .macro adr_this_cpu, dst, sym, tmp + adr_l \dst, \sym mrs \tmp, tpidr_el1 - add \reg, \reg, \tmp + add \dst, \dst, \tmp + .endm + + /* + * @dst: Result of READ_ONCE(per_cpu(sym, smp_processor_id())) + * @sym: The name of the per-cpu variable + * @tmp: scratch register + */ + .macro ldr_this_cpu dst, sym, tmp + adr_l \dst, \sym + mrs \tmp, tpidr_el1 + ldr \dst, [\dst, \tmp] .endm /* @@ -395,4 +415,24 @@ alternative_endif movk \reg, :abs_g0_nc:\val .endm +/* + * Return the current thread_info. + */ + .macro get_thread_info, rd + mrs \rd, sp_el0 + .endm + +/* + * Errata workaround post TTBR0_EL1 update. + */ + .macro post_ttbr0_update_workaround +#ifdef CONFIG_CAVIUM_ERRATUM_27456 +alternative_if ARM64_WORKAROUND_CAVIUM_27456 + ic iallu + dsb nsh + isb +alternative_else_nop_endif +#endif + .endm + #endif /* __ASM_ASSEMBLER_H */ diff --git a/arch/arm64/include/asm/cacheflush.h b/arch/arm64/include/asm/cacheflush.h index 2e5fb976a572..5a2a6ee65f65 100644 --- a/arch/arm64/include/asm/cacheflush.h +++ b/arch/arm64/include/asm/cacheflush.h @@ -65,12 +65,12 @@ * - kaddr - page address * - size - region size */ -extern void flush_cache_range(struct vm_area_struct *vma, unsigned long start, unsigned long end); extern void flush_icache_range(unsigned long start, unsigned long end); extern void __flush_dcache_area(void *addr, size_t len); extern void __clean_dcache_area_poc(void *addr, size_t len); extern void __clean_dcache_area_pou(void *addr, size_t len); extern long __flush_cache_user_range(unsigned long start, unsigned long end); +extern void sync_icache_aliases(void *kaddr, unsigned long len); static inline void flush_cache_mm(struct mm_struct *mm) { @@ -81,6 +81,11 @@ static inline void flush_cache_page(struct vm_area_struct *vma, { } +static inline void flush_cache_range(struct vm_area_struct *vma, + unsigned long start, unsigned long end) +{ +} + /* * Cache maintenance functions used by the DMA API. No to be used directly. */ diff --git a/arch/arm64/include/asm/cpucaps.h b/arch/arm64/include/asm/cpucaps.h new file mode 100644 index 000000000000..4174f09678c4 --- /dev/null +++ b/arch/arm64/include/asm/cpucaps.h @@ -0,0 +1,41 @@ +/* + * arch/arm64/include/asm/cpucaps.h + * + * Copyright (C) 2016 ARM Ltd. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + */ +#ifndef __ASM_CPUCAPS_H +#define __ASM_CPUCAPS_H + +#define ARM64_WORKAROUND_CLEAN_CACHE 0 +#define ARM64_WORKAROUND_DEVICE_LOAD_ACQUIRE 1 +#define ARM64_WORKAROUND_845719 2 +#define ARM64_HAS_SYSREG_GIC_CPUIF 3 +#define ARM64_HAS_PAN 4 +#define ARM64_HAS_LSE_ATOMICS 5 +#define ARM64_WORKAROUND_CAVIUM_23154 6 +#define ARM64_WORKAROUND_834220 7 +#define ARM64_HAS_NO_HW_PREFETCH 8 +#define ARM64_HAS_UAO 9 +#define ARM64_ALT_PAN_NOT_UAO 10 +#define ARM64_HAS_VIRT_HOST_EXTN 11 +#define ARM64_WORKAROUND_CAVIUM_27456 12 +#define ARM64_HAS_32BIT_EL0 13 +#define ARM64_HYP_OFFSET_LOW 14 +#define ARM64_MISMATCHED_CACHE_LINE_SIZE 15 +#define ARM64_HAS_NO_FPSIMD 16 + +#define ARM64_NCAPS 17 + +#endif /* __ASM_CPUCAPS_H */ diff --git a/arch/arm64/include/asm/cpufeature.h b/arch/arm64/include/asm/cpufeature.h index a27c3245ba21..b4989df48670 100644 --- a/arch/arm64/include/asm/cpufeature.h +++ b/arch/arm64/include/asm/cpufeature.h @@ -9,8 +9,7 @@ #ifndef __ASM_CPUFEATURE_H #define __ASM_CPUFEATURE_H -#include <linux/jump_label.h> - +#include <asm/cpucaps.h> #include <asm/hwcap.h> #include <asm/sysreg.h> @@ -24,27 +23,10 @@ #define MAX_CPU_FEATURES (8 * sizeof(elf_hwcap)) #define cpu_feature(x) ilog2(HWCAP_ ## x) -#define ARM64_WORKAROUND_CLEAN_CACHE 0 -#define ARM64_WORKAROUND_DEVICE_LOAD_ACQUIRE 1 -#define ARM64_WORKAROUND_845719 2 -#define ARM64_HAS_SYSREG_GIC_CPUIF 3 -#define ARM64_HAS_PAN 4 -#define ARM64_HAS_LSE_ATOMICS 5 -#define ARM64_WORKAROUND_CAVIUM_23154 6 -#define ARM64_WORKAROUND_834220 7 -#define ARM64_HAS_NO_HW_PREFETCH 8 -#define ARM64_HAS_UAO 9 -#define ARM64_ALT_PAN_NOT_UAO 10 -#define ARM64_HAS_VIRT_HOST_EXTN 11 -#define ARM64_WORKAROUND_CAVIUM_27456 12 -#define ARM64_HAS_32BIT_EL0 13 -#define ARM64_HYP_OFFSET_LOW 14 -#define ARM64_MISMATCHED_CACHE_LINE_SIZE 15 - -#define ARM64_NCAPS 16 - #ifndef __ASSEMBLY__ +#include <linux/bug.h> +#include <linux/jump_label.h> #include <linux/kernel.h> /* CPU feature register tracking */ @@ -122,14 +104,19 @@ static inline bool cpu_have_feature(unsigned int num) return elf_hwcap & (1UL << num); } +/* System capability check for constant caps */ +static inline bool cpus_have_const_cap(int num) +{ + if (num >= ARM64_NCAPS) + return false; + return static_branch_unlikely(&cpu_hwcap_keys[num]); +} + static inline bool cpus_have_cap(unsigned int num) { if (num >= ARM64_NCAPS) return false; - if (__builtin_constant_p(num)) - return static_branch_unlikely(&cpu_hwcap_keys[num]); - else - return test_bit(num, cpu_hwcaps); + return test_bit(num, cpu_hwcaps); } static inline void cpus_set_cap(unsigned int num) @@ -218,7 +205,7 @@ static inline bool cpu_supports_mixed_endian_el0(void) static inline bool system_supports_32bit_el0(void) { - return cpus_have_cap(ARM64_HAS_32BIT_EL0); + return cpus_have_const_cap(ARM64_HAS_32BIT_EL0); } static inline bool system_supports_mixed_endian_el0(void) @@ -226,6 +213,17 @@ static inline bool system_supports_mixed_endian_el0(void) return id_aa64mmfr0_mixed_endian_el0(read_system_reg(SYS_ID_AA64MMFR0_EL1)); } +static inline bool system_supports_fpsimd(void) +{ + return !cpus_have_const_cap(ARM64_HAS_NO_FPSIMD); +} + +static inline bool system_uses_ttbr0_pan(void) +{ + return IS_ENABLED(CONFIG_ARM64_SW_TTBR0_PAN) && + !cpus_have_cap(ARM64_HAS_PAN); +} + #endif /* __ASSEMBLY__ */ #endif diff --git a/arch/arm64/include/asm/current.h b/arch/arm64/include/asm/current.h new file mode 100644 index 000000000000..f2bcbe2d9889 --- /dev/null +++ b/arch/arm64/include/asm/current.h @@ -0,0 +1,22 @@ +#ifndef __ASM_CURRENT_H +#define __ASM_CURRENT_H + +#include <linux/compiler.h> + +#include <asm/sysreg.h> + +#ifndef __ASSEMBLY__ + +struct task_struct; + +static __always_inline struct task_struct *get_current(void) +{ + return (struct task_struct *)read_sysreg(sp_el0); +} + +#define current get_current() + +#endif /* __ASSEMBLY__ */ + +#endif /* __ASM_CURRENT_H */ + diff --git a/arch/arm64/include/asm/debug-monitors.h b/arch/arm64/include/asm/debug-monitors.h index b71420a12f26..a44cf5225429 100644 --- a/arch/arm64/include/asm/debug-monitors.h +++ b/arch/arm64/include/asm/debug-monitors.h @@ -68,6 +68,9 @@ #define BRK64_ESR_MASK 0xFFFF #define BRK64_ESR_KPROBES 0x0004 #define BRK64_OPCODE_KPROBES (AARCH64_BREAK_MON | (BRK64_ESR_KPROBES << 5)) +/* uprobes BRK opcodes with ESR encoding */ +#define BRK64_ESR_UPROBES 0x0005 +#define BRK64_OPCODE_UPROBES (AARCH64_BREAK_MON | (BRK64_ESR_UPROBES << 5)) /* AArch32 */ #define DBG_ESR_EVT_BKPT 0x4 diff --git a/arch/arm64/include/asm/efi.h b/arch/arm64/include/asm/efi.h index a9e54aad15ef..0b6b1633017f 100644 --- a/arch/arm64/include/asm/efi.h +++ b/arch/arm64/include/asm/efi.h @@ -1,6 +1,7 @@ #ifndef _ASM_EFI_H #define _ASM_EFI_H +#include <asm/cpufeature.h> #include <asm/io.h> #include <asm/mmu_context.h> #include <asm/neon.h> @@ -51,6 +52,9 @@ int efi_set_mapping_permissions(struct mm_struct *mm, efi_memory_desc_t *md); #define __efi_call_early(f, ...) f(__VA_ARGS__) #define efi_is_64bit() (true) +#define efi_call_proto(protocol, f, instance, ...) \ + ((protocol##_t *)instance)->f(instance, ##__VA_ARGS__) + #define alloc_screen_info(x...) &screen_info #define free_screen_info(x...) @@ -75,7 +79,30 @@ static inline void efifb_setup_from_dmi(struct screen_info *si, const char *opt) static inline void efi_set_pgd(struct mm_struct *mm) { - switch_mm(NULL, mm, NULL); + __switch_mm(mm); + + if (system_uses_ttbr0_pan()) { + if (mm != current->active_mm) { + /* + * Update the current thread's saved ttbr0 since it is + * restored as part of a return from exception. Set + * the hardware TTBR0_EL1 using cpu_switch_mm() + * directly to enable potential errata workarounds. + */ + update_saved_ttbr0(current, mm); + cpu_switch_mm(mm->pgd, mm); + } else { + /* + * Defer the switch to the current thread's TTBR0_EL1 + * until uaccess_enable(). Restore the current + * thread's saved ttbr0 corresponding to its active_mm + * (if different from init_mm). + */ + cpu_set_reserved_ttbr0(); + if (current->active_mm != &init_mm) + update_saved_ttbr0(current, current->active_mm); + } + } } void efi_virtmap_load(void); diff --git a/arch/arm64/include/asm/elf.h b/arch/arm64/include/asm/elf.h index a55384f4a5d7..5d1700425efe 100644 --- a/arch/arm64/include/asm/elf.h +++ b/arch/arm64/include/asm/elf.h @@ -138,7 +138,11 @@ typedef struct user_fpsimd_state elf_fpregset_t; */ #define ELF_PLAT_INIT(_r, load_addr) (_r)->regs[0] = 0 -#define SET_PERSONALITY(ex) clear_thread_flag(TIF_32BIT); +#define SET_PERSONALITY(ex) \ +({ \ + clear_bit(TIF_32BIT, ¤t->mm->context.flags); \ + clear_thread_flag(TIF_32BIT); \ +}) /* update AT_VECTOR_SIZE_ARCH if the number of NEW_AUX_ENT entries changes */ #define ARCH_DLINFO \ @@ -183,7 +187,11 @@ typedef compat_elf_greg_t compat_elf_gregset_t[COMPAT_ELF_NGREG]; ((x)->e_flags & EF_ARM_EABI_MASK)) #define compat_start_thread compat_start_thread -#define COMPAT_SET_PERSONALITY(ex) set_thread_flag(TIF_32BIT); +#define COMPAT_SET_PERSONALITY(ex) \ +({ \ + set_bit(TIF_32BIT, ¤t->mm->context.flags); \ + set_thread_flag(TIF_32BIT); \ + }) #define COMPAT_ARCH_DLINFO extern int aarch32_setup_vectors_page(struct linux_binprm *bprm, int uses_interp); diff --git a/arch/arm64/include/asm/futex.h b/arch/arm64/include/asm/futex.h index f2585cdd32c2..85c4a8981d47 100644 --- a/arch/arm64/include/asm/futex.h +++ b/arch/arm64/include/asm/futex.h @@ -21,15 +21,12 @@ #include <linux/futex.h> #include <linux/uaccess.h> -#include <asm/alternative.h> -#include <asm/cpufeature.h> #include <asm/errno.h> -#include <asm/sysreg.h> #define __futex_atomic_op(insn, ret, oldval, uaddr, tmp, oparg) \ +do { \ + uaccess_enable(); \ asm volatile( \ - ALTERNATIVE("nop", SET_PSTATE_PAN(0), ARM64_HAS_PAN, \ - CONFIG_ARM64_PAN) \ " prfm pstl1strm, %2\n" \ "1: ldxr %w1, %2\n" \ insn "\n" \ @@ -44,11 +41,11 @@ " .popsection\n" \ _ASM_EXTABLE(1b, 4b) \ _ASM_EXTABLE(2b, 4b) \ - ALTERNATIVE("nop", SET_PSTATE_PAN(1), ARM64_HAS_PAN, \ - CONFIG_ARM64_PAN) \ : "=&r" (ret), "=&r" (oldval), "+Q" (*uaddr), "=&r" (tmp) \ : "r" (oparg), "Ir" (-EFAULT) \ - : "memory") + : "memory"); \ + uaccess_disable(); \ +} while (0) static inline int futex_atomic_op_inuser (int encoded_op, u32 __user *uaddr) @@ -118,8 +115,8 @@ futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr, if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32))) return -EFAULT; + uaccess_enable(); asm volatile("// futex_atomic_cmpxchg_inatomic\n" -ALTERNATIVE("nop", SET_PSTATE_PAN(0), ARM64_HAS_PAN, CONFIG_ARM64_PAN) " prfm pstl1strm, %2\n" "1: ldxr %w1, %2\n" " sub %w3, %w1, %w4\n" @@ -134,10 +131,10 @@ ALTERNATIVE("nop", SET_PSTATE_PAN(0), ARM64_HAS_PAN, CONFIG_ARM64_PAN) " .popsection\n" _ASM_EXTABLE(1b, 4b) _ASM_EXTABLE(2b, 4b) -ALTERNATIVE("nop", SET_PSTATE_PAN(1), ARM64_HAS_PAN, CONFIG_ARM64_PAN) : "+r" (ret), "=&r" (val), "+Q" (*uaddr), "=&r" (tmp) : "r" (oldval), "r" (newval), "Ir" (-EFAULT) : "memory"); + uaccess_disable(); *uval = val; return ret; diff --git a/arch/arm64/include/asm/hw_breakpoint.h b/arch/arm64/include/asm/hw_breakpoint.h index 9510ace570e2..b6b167ac082b 100644 --- a/arch/arm64/include/asm/hw_breakpoint.h +++ b/arch/arm64/include/asm/hw_breakpoint.h @@ -77,7 +77,11 @@ static inline void decode_ctrl_reg(u32 reg, /* Lengths */ #define ARM_BREAKPOINT_LEN_1 0x1 #define ARM_BREAKPOINT_LEN_2 0x3 +#define ARM_BREAKPOINT_LEN_3 0x7 #define ARM_BREAKPOINT_LEN_4 0xf +#define ARM_BREAKPOINT_LEN_5 0x1f +#define ARM_BREAKPOINT_LEN_6 0x3f +#define ARM_BREAKPOINT_LEN_7 0x7f #define ARM_BREAKPOINT_LEN_8 0xff /* Kernel stepping */ @@ -119,7 +123,7 @@ struct perf_event; struct pmu; extern int arch_bp_generic_fields(struct arch_hw_breakpoint_ctrl ctrl, - int *gen_len, int *gen_type); + int *gen_len, int *gen_type, int *offset); extern int arch_check_bp_in_kernelspace(struct perf_event *bp); extern int arch_validate_hwbkpt_settings(struct perf_event *bp); extern int hw_breakpoint_exceptions_notify(struct notifier_block *unused, diff --git a/arch/arm64/include/asm/io.h b/arch/arm64/include/asm/io.h index 0bba427bb4c2..0c00c87bb9dd 100644 --- a/arch/arm64/include/asm/io.h +++ b/arch/arm64/include/asm/io.h @@ -22,7 +22,6 @@ #ifdef __KERNEL__ #include <linux/types.h> -#include <linux/blk_types.h> #include <asm/byteorder.h> #include <asm/barrier.h> diff --git a/arch/arm64/include/asm/kernel-pgtable.h b/arch/arm64/include/asm/kernel-pgtable.h index 7e51d1b57c0c..7803343e5881 100644 --- a/arch/arm64/include/asm/kernel-pgtable.h +++ b/arch/arm64/include/asm/kernel-pgtable.h @@ -19,6 +19,7 @@ #ifndef __ASM_KERNEL_PGTABLE_H #define __ASM_KERNEL_PGTABLE_H +#include <asm/pgtable.h> #include <asm/sparsemem.h> /* @@ -54,6 +55,12 @@ #define SWAPPER_DIR_SIZE (SWAPPER_PGTABLE_LEVELS * PAGE_SIZE) #define IDMAP_DIR_SIZE (IDMAP_PGTABLE_LEVELS * PAGE_SIZE) +#ifdef CONFIG_ARM64_SW_TTBR0_PAN +#define RESERVED_TTBR0_SIZE (PAGE_SIZE) +#else +#define RESERVED_TTBR0_SIZE (0) +#endif + /* Initial memory map size */ #if ARM64_SWAPPER_USES_SECTION_MAPS #define SWAPPER_BLOCK_SHIFT SECTION_SHIFT diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h index 18f746551bf6..ec3553eb9349 100644 --- a/arch/arm64/include/asm/kvm_asm.h +++ b/arch/arm64/include/asm/kvm_asm.h @@ -54,6 +54,7 @@ extern char __kvm_hyp_vector[]; extern void __kvm_flush_vm_context(void); extern void __kvm_tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa); extern void __kvm_tlb_flush_vmid(struct kvm *kvm); +extern void __kvm_tlb_flush_local_vmid(struct kvm_vcpu *vcpu); extern int __kvm_vcpu_run(struct kvm_vcpu *vcpu); diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index bd94e6766759..e5050388e062 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -62,6 +62,9 @@ struct kvm_arch { /* VTTBR value associated with above pgd and vmid */ u64 vttbr; + /* The last vcpu id that ran on each physical CPU */ + int __percpu *last_vcpu_ran; + /* The maximum number of vCPUs depends on the used GIC model */ int max_vcpus; diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h index a79b969c26fc..6f72fe8b0e3e 100644 --- a/arch/arm64/include/asm/kvm_mmu.h +++ b/arch/arm64/include/asm/kvm_mmu.h @@ -128,7 +128,7 @@ static inline unsigned long __kern_hyp_va(unsigned long v) return v; } -#define kern_hyp_va(v) (typeof(v))(__kern_hyp_va((unsigned long)(v))) +#define kern_hyp_va(v) ((typeof(v))(__kern_hyp_va((unsigned long)(v)))) /* * We currently only support a 40bit IPA. diff --git a/arch/arm64/include/asm/lse.h b/arch/arm64/include/asm/lse.h index 23acc00be32d..fc756e22c84c 100644 --- a/arch/arm64/include/asm/lse.h +++ b/arch/arm64/include/asm/lse.h @@ -5,7 +5,6 @@ #include <linux/stringify.h> #include <asm/alternative.h> -#include <asm/cpufeature.h> #ifdef __ASSEMBLER__ diff --git a/arch/arm64/include/asm/memory.h b/arch/arm64/include/asm/memory.h index ba62df8c6e35..b71086d25195 100644 --- a/arch/arm64/include/asm/memory.h +++ b/arch/arm64/include/asm/memory.h @@ -217,7 +217,7 @@ static inline void *phys_to_virt(phys_addr_t x) #define _virt_addr_valid(kaddr) pfn_valid(__pa(kaddr) >> PAGE_SHIFT) #else #define __virt_to_pgoff(kaddr) (((u64)(kaddr) & ~PAGE_OFFSET) / PAGE_SIZE * sizeof(struct page)) -#define __page_to_voff(kaddr) (((u64)(page) & ~VMEMMAP_START) * PAGE_SIZE / sizeof(struct page)) +#define __page_to_voff(page) (((u64)(page) & ~VMEMMAP_START) * PAGE_SIZE / sizeof(struct page)) #define page_to_virt(page) ((void *)((__page_to_voff(page)) | PAGE_OFFSET)) #define virt_to_page(vaddr) ((struct page *)((__virt_to_pgoff(vaddr)) | VMEMMAP_START)) diff --git a/arch/arm64/include/asm/mmu.h b/arch/arm64/include/asm/mmu.h index 8d9fce037b2f..47619411f0ff 100644 --- a/arch/arm64/include/asm/mmu.h +++ b/arch/arm64/include/asm/mmu.h @@ -19,6 +19,7 @@ typedef struct { atomic64_t id; void *vdso; + unsigned long flags; } mm_context_t; /* @@ -34,7 +35,7 @@ extern void __iomem *early_io_map(phys_addr_t phys, unsigned long virt); extern void init_mem_pgprot(void); extern void create_pgd_mapping(struct mm_struct *mm, phys_addr_t phys, unsigned long virt, phys_addr_t size, - pgprot_t prot, bool allow_block_mappings); + pgprot_t prot, bool page_mappings_only); extern void *fixmap_remap_fdt(phys_addr_t dt_phys); #endif diff --git a/arch/arm64/include/asm/mmu_context.h b/arch/arm64/include/asm/mmu_context.h index a50185375f09..0363fe80455c 100644 --- a/arch/arm64/include/asm/mmu_context.h +++ b/arch/arm64/include/asm/mmu_context.h @@ -23,6 +23,7 @@ #include <linux/sched.h> #include <asm/cacheflush.h> +#include <asm/cpufeature.h> #include <asm/proc-fns.h> #include <asm-generic/mm_hooks.h> #include <asm/cputype.h> @@ -103,7 +104,7 @@ static inline void cpu_uninstall_idmap(void) local_flush_tlb_all(); cpu_set_default_tcr_t0sz(); - if (mm != &init_mm) + if (mm != &init_mm && !system_uses_ttbr0_pan()) cpu_switch_mm(mm->pgd, mm); } @@ -163,20 +164,26 @@ enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk) { } -/* - * This is the actual mm switch as far as the scheduler - * is concerned. No registers are touched. We avoid - * calling the CPU specific function when the mm hasn't - * actually changed. - */ -static inline void -switch_mm(struct mm_struct *prev, struct mm_struct *next, - struct task_struct *tsk) +#ifdef CONFIG_ARM64_SW_TTBR0_PAN +static inline void update_saved_ttbr0(struct task_struct *tsk, + struct mm_struct *mm) { - unsigned int cpu = smp_processor_id(); + if (system_uses_ttbr0_pan()) { + BUG_ON(mm->pgd == swapper_pg_dir); + task_thread_info(tsk)->ttbr0 = + virt_to_phys(mm->pgd) | ASID(mm) << 48; + } +} +#else +static inline void update_saved_ttbr0(struct task_struct *tsk, + struct mm_struct *mm) +{ +} +#endif - if (prev == next) - return; +static inline void __switch_mm(struct mm_struct *next) +{ + unsigned int cpu = smp_processor_id(); /* * init_mm.pgd does not contain any user mappings and it is always @@ -190,8 +197,26 @@ switch_mm(struct mm_struct *prev, struct mm_struct *next, check_and_switch_context(next, cpu); } +static inline void +switch_mm(struct mm_struct *prev, struct mm_struct *next, + struct task_struct *tsk) +{ + if (prev != next) + __switch_mm(next); + + /* + * Update the saved TTBR0_EL1 of the scheduled-in task as the previous + * value may have not been initialised yet (activate_mm caller) or the + * ASID has changed since the last run (following the context switch + * of another thread of the same process). Avoid setting the reserved + * TTBR0_EL1 to swapper_pg_dir (init_mm; e.g. via idle_task_exit). + */ + if (next != &init_mm) + update_saved_ttbr0(tsk, next); +} + #define deactivate_mm(tsk,mm) do { } while (0) -#define activate_mm(prev,next) switch_mm(prev, next, NULL) +#define activate_mm(prev,next) switch_mm(prev, next, current) void verify_cpu_asid_bits(void); diff --git a/arch/arm64/include/asm/neon.h b/arch/arm64/include/asm/neon.h index 13ce4cc18e26..ad4cdc966c0f 100644 --- a/arch/arm64/include/asm/neon.h +++ b/arch/arm64/include/asm/neon.h @@ -9,8 +9,9 @@ */ #include <linux/types.h> +#include <asm/fpsimd.h> -#define cpu_has_neon() (1) +#define cpu_has_neon() system_supports_fpsimd() #define kernel_neon_begin() kernel_neon_begin_partial(32) diff --git a/arch/arm64/include/asm/opcodes.h b/arch/arm64/include/asm/opcodes.h deleted file mode 100644 index 123f45d92cd1..000000000000 --- a/arch/arm64/include/asm/opcodes.h +++ /dev/null @@ -1,5 +0,0 @@ -#ifdef CONFIG_CPU_BIG_ENDIAN -#define CONFIG_CPU_ENDIAN_BE8 CONFIG_CPU_BIG_ENDIAN -#endif - -#include <../../arm/include/asm/opcodes.h> diff --git a/arch/arm64/include/asm/percpu.h b/arch/arm64/include/asm/percpu.h index 5394c8405e66..3bd498e4de4c 100644 --- a/arch/arm64/include/asm/percpu.h +++ b/arch/arm64/include/asm/percpu.h @@ -16,6 +16,8 @@ #ifndef __ASM_PERCPU_H #define __ASM_PERCPU_H +#include <asm/stack_pointer.h> + static inline void set_my_cpu_offset(unsigned long off) { asm volatile("msr tpidr_el1, %0" :: "r" (off) : "memory"); @@ -101,16 +103,16 @@ static inline unsigned long __percpu_read(void *ptr, int size) switch (size) { case 1: - ret = ACCESS_ONCE(*(u8 *)ptr); + ret = READ_ONCE(*(u8 *)ptr); break; case 2: - ret = ACCESS_ONCE(*(u16 *)ptr); + ret = READ_ONCE(*(u16 *)ptr); break; case 4: - ret = ACCESS_ONCE(*(u32 *)ptr); + ret = READ_ONCE(*(u32 *)ptr); break; case 8: - ret = ACCESS_ONCE(*(u64 *)ptr); + ret = READ_ONCE(*(u64 *)ptr); break; default: BUILD_BUG(); @@ -123,16 +125,16 @@ static inline void __percpu_write(void *ptr, unsigned long val, int size) { switch (size) { case 1: - ACCESS_ONCE(*(u8 *)ptr) = (u8)val; + WRITE_ONCE(*(u8 *)ptr, (u8)val); break; case 2: - ACCESS_ONCE(*(u16 *)ptr) = (u16)val; + WRITE_ONCE(*(u16 *)ptr, (u16)val); break; case 4: - ACCESS_ONCE(*(u32 *)ptr) = (u32)val; + WRITE_ONCE(*(u32 *)ptr, (u32)val); break; case 8: - ACCESS_ONCE(*(u64 *)ptr) = (u64)val; + WRITE_ONCE(*(u64 *)ptr, (u64)val); break; default: BUILD_BUG(); diff --git a/arch/arm64/include/asm/perf_event.h b/arch/arm64/include/asm/perf_event.h index 2065f46fa740..8d5cbec17d80 100644 --- a/arch/arm64/include/asm/perf_event.h +++ b/arch/arm64/include/asm/perf_event.h @@ -17,6 +17,8 @@ #ifndef __ASM_PERF_EVENT_H #define __ASM_PERF_EVENT_H +#include <asm/stack_pointer.h> + #define ARMV8_PMU_MAX_COUNTERS 32 #define ARMV8_PMU_COUNTER_MASK (ARMV8_PMU_MAX_COUNTERS - 1) @@ -46,7 +48,15 @@ #define ARMV8_PMU_EVTYPE_MASK 0xc800ffff /* Mask for writable bits */ #define ARMV8_PMU_EVTYPE_EVENT 0xffff /* Mask for EVENT bits */ -#define ARMV8_PMU_EVTYPE_EVENT_SW_INCR 0 /* Software increment event */ +/* + * PMUv3 event types: required events + */ +#define ARMV8_PMUV3_PERFCTR_SW_INCR 0x00 +#define ARMV8_PMUV3_PERFCTR_L1D_CACHE_REFILL 0x03 +#define ARMV8_PMUV3_PERFCTR_L1D_CACHE 0x04 +#define ARMV8_PMUV3_PERFCTR_BR_MIS_PRED 0x10 +#define ARMV8_PMUV3_PERFCTR_CPU_CYCLES 0x11 +#define ARMV8_PMUV3_PERFCTR_BR_PRED 0x12 /* * Event filters for PMUv3 diff --git a/arch/arm64/include/asm/probes.h b/arch/arm64/include/asm/probes.h index 5af574d632fa..6a5b28904c33 100644 --- a/arch/arm64/include/asm/probes.h +++ b/arch/arm64/include/asm/probes.h @@ -15,21 +15,22 @@ #ifndef _ARM_PROBES_H #define _ARM_PROBES_H -#include <asm/opcodes.h> - -struct kprobe; -struct arch_specific_insn; - -typedef u32 kprobe_opcode_t; -typedef void (kprobes_handler_t) (u32 opcode, long addr, struct pt_regs *); +typedef u32 probe_opcode_t; +typedef void (probes_handler_t) (u32 opcode, long addr, struct pt_regs *); /* architecture specific copy of original instruction */ -struct arch_specific_insn { - kprobe_opcode_t *insn; +struct arch_probe_insn { + probe_opcode_t *insn; pstate_check_t *pstate_cc; - kprobes_handler_t *handler; + probes_handler_t *handler; /* restore address after step xol */ unsigned long restore; }; +#ifdef CONFIG_KPROBES +typedef u32 kprobe_opcode_t; +struct arch_specific_insn { + struct arch_probe_insn api; +}; +#endif #endif diff --git a/arch/arm64/include/asm/processor.h b/arch/arm64/include/asm/processor.h index 60e34824e18c..747c65a616ed 100644 --- a/arch/arm64/include/asm/processor.h +++ b/arch/arm64/include/asm/processor.h @@ -149,8 +149,6 @@ static inline void cpu_relax(void) asm volatile("yield" ::: "memory"); } -#define cpu_relax_lowlatency() cpu_relax() - /* Thread switching */ extern struct task_struct *cpu_switch_to(struct task_struct *prev, struct task_struct *next); diff --git a/arch/arm64/include/asm/ptdump.h b/arch/arm64/include/asm/ptdump.h index 07b8ed037dee..6afd8476c60c 100644 --- a/arch/arm64/include/asm/ptdump.h +++ b/arch/arm64/include/asm/ptdump.h @@ -16,9 +16,10 @@ #ifndef __ASM_PTDUMP_H #define __ASM_PTDUMP_H -#ifdef CONFIG_ARM64_PTDUMP +#ifdef CONFIG_ARM64_PTDUMP_CORE #include <linux/mm_types.h> +#include <linux/seq_file.h> struct addr_marker { unsigned long start_address; @@ -29,16 +30,25 @@ struct ptdump_info { struct mm_struct *mm; const struct addr_marker *markers; unsigned long base_addr; - unsigned long max_addr; }; -int ptdump_register(struct ptdump_info *info, const char *name); - +void ptdump_walk_pgd(struct seq_file *s, struct ptdump_info *info); +#ifdef CONFIG_ARM64_PTDUMP_DEBUGFS +int ptdump_debugfs_register(struct ptdump_info *info, const char *name); #else -static inline int ptdump_register(struct ptdump_info *info, const char *name) +static inline int ptdump_debugfs_register(struct ptdump_info *info, + const char *name) { return 0; } -#endif /* CONFIG_ARM64_PTDUMP */ +#endif +void ptdump_check_wx(void); +#endif /* CONFIG_ARM64_PTDUMP_CORE */ + +#ifdef CONFIG_DEBUG_WX +#define debug_checkwx() ptdump_check_wx() +#else +#define debug_checkwx() do { } while (0) +#endif #endif /* __ASM_PTDUMP_H */ diff --git a/arch/arm64/include/asm/ptrace.h b/arch/arm64/include/asm/ptrace.h index ada08b5b036d..513daf050e84 100644 --- a/arch/arm64/include/asm/ptrace.h +++ b/arch/arm64/include/asm/ptrace.h @@ -217,6 +217,14 @@ int valid_user_regs(struct user_pt_regs *regs, struct task_struct *task); #include <asm-generic/ptrace.h> +#define procedure_link_pointer(regs) ((regs)->regs[30]) + +static inline void procedure_link_pointer_set(struct pt_regs *regs, + unsigned long val) +{ + procedure_link_pointer(regs) = val; +} + #undef profile_pc extern unsigned long profile_pc(struct pt_regs *regs); diff --git a/arch/arm64/include/asm/smp.h b/arch/arm64/include/asm/smp.h index 022644704a93..d050d720a1b4 100644 --- a/arch/arm64/include/asm/smp.h +++ b/arch/arm64/include/asm/smp.h @@ -29,11 +29,22 @@ #ifndef __ASSEMBLY__ +#include <asm/percpu.h> + #include <linux/threads.h> #include <linux/cpumask.h> #include <linux/thread_info.h> -#define raw_smp_processor_id() (current_thread_info()->cpu) +DECLARE_PER_CPU_READ_MOSTLY(int, cpu_number); + +/* + * We don't use this_cpu_read(cpu_number) as that has implicit writes to + * preempt_count, and associated (compiler) barriers, that we'd like to avoid + * the expense of. If we're preemptible, the value can be stale at use anyway. + * And we can't use this_cpu_ptr() either, as that winds up recursing back + * here under CONFIG_DEBUG_PREEMPT=y. + */ +#define raw_smp_processor_id() (*raw_cpu_ptr(&cpu_number)) struct seq_file; @@ -73,6 +84,7 @@ asmlinkage void secondary_start_kernel(void); */ struct secondary_data { void *stack; + struct task_struct *task; long status; }; diff --git a/arch/arm64/include/asm/stack_pointer.h b/arch/arm64/include/asm/stack_pointer.h new file mode 100644 index 000000000000..ffcdf742cddf --- /dev/null +++ b/arch/arm64/include/asm/stack_pointer.h @@ -0,0 +1,9 @@ +#ifndef __ASM_STACK_POINTER_H +#define __ASM_STACK_POINTER_H + +/* + * how to get the current stack pointer from C + */ +register unsigned long current_stack_pointer asm ("sp"); + +#endif /* __ASM_STACK_POINTER_H */ diff --git a/arch/arm64/include/asm/suspend.h b/arch/arm64/include/asm/suspend.h index b8a313fd7a09..de5600f40adf 100644 --- a/arch/arm64/include/asm/suspend.h +++ b/arch/arm64/include/asm/suspend.h @@ -1,7 +1,7 @@ #ifndef __ASM_SUSPEND_H #define __ASM_SUSPEND_H -#define NR_CTX_REGS 10 +#define NR_CTX_REGS 12 #define NR_CALLEE_SAVED_REGS 12 /* diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h index 6c80b3699cb8..98ae03f8eedd 100644 --- a/arch/arm64/include/asm/sysreg.h +++ b/arch/arm64/include/asm/sysreg.h @@ -22,8 +22,6 @@ #include <linux/stringify.h> -#include <asm/opcodes.h> - /* * ARMv8 ARM reserves the following encoding for system registers: * (Ref: ARMv8 ARM, Section: "System instruction class encoding overview", @@ -37,6 +35,33 @@ #define sys_reg(op0, op1, crn, crm, op2) \ ((((op0)&3)<<19)|((op1)<<16)|((crn)<<12)|((crm)<<8)|((op2)<<5)) +#ifndef CONFIG_BROKEN_GAS_INST + +#ifdef __ASSEMBLY__ +#define __emit_inst(x) .inst (x) +#else +#define __emit_inst(x) ".inst " __stringify((x)) "\n\t" +#endif + +#else /* CONFIG_BROKEN_GAS_INST */ + +#ifndef CONFIG_CPU_BIG_ENDIAN +#define __INSTR_BSWAP(x) (x) +#else /* CONFIG_CPU_BIG_ENDIAN */ +#define __INSTR_BSWAP(x) ((((x) << 24) & 0xff000000) | \ + (((x) << 8) & 0x00ff0000) | \ + (((x) >> 8) & 0x0000ff00) | \ + (((x) >> 24) & 0x000000ff)) +#endif /* CONFIG_CPU_BIG_ENDIAN */ + +#ifdef __ASSEMBLY__ +#define __emit_inst(x) .long __INSTR_BSWAP(x) +#else /* __ASSEMBLY__ */ +#define __emit_inst(x) ".long " __stringify(__INSTR_BSWAP(x)) "\n\t" +#endif /* __ASSEMBLY__ */ + +#endif /* CONFIG_BROKEN_GAS_INST */ + #define SYS_MIDR_EL1 sys_reg(3, 0, 0, 0, 0) #define SYS_MPIDR_EL1 sys_reg(3, 0, 0, 0, 5) #define SYS_REVIDR_EL1 sys_reg(3, 0, 0, 0, 6) @@ -81,10 +106,10 @@ #define REG_PSTATE_PAN_IMM sys_reg(0, 0, 4, 0, 4) #define REG_PSTATE_UAO_IMM sys_reg(0, 0, 4, 0, 3) -#define SET_PSTATE_PAN(x) __inst_arm(0xd5000000 | REG_PSTATE_PAN_IMM |\ - (!!x)<<8 | 0x1f) -#define SET_PSTATE_UAO(x) __inst_arm(0xd5000000 | REG_PSTATE_UAO_IMM |\ - (!!x)<<8 | 0x1f) +#define SET_PSTATE_PAN(x) __emit_inst(0xd5000000 | REG_PSTATE_PAN_IMM | \ + (!!x)<<8 | 0x1f) +#define SET_PSTATE_UAO(x) __emit_inst(0xd5000000 | REG_PSTATE_UAO_IMM | \ + (!!x)<<8 | 0x1f) /* Common SCTLR_ELx flags. */ #define SCTLR_ELx_EE (1 << 25) @@ -228,11 +253,11 @@ .equ .L__reg_num_xzr, 31 .macro mrs_s, rt, sreg - .inst 0xd5200000|(\sreg)|(.L__reg_num_\rt) + __emit_inst(0xd5200000|(\sreg)|(.L__reg_num_\rt)) .endm .macro msr_s, sreg, rt - .inst 0xd5000000|(\sreg)|(.L__reg_num_\rt) + __emit_inst(0xd5000000|(\sreg)|(.L__reg_num_\rt)) .endm #else @@ -246,11 +271,11 @@ asm( " .equ .L__reg_num_xzr, 31\n" "\n" " .macro mrs_s, rt, sreg\n" -" .inst 0xd5200000|(\\sreg)|(.L__reg_num_\\rt)\n" + __emit_inst(0xd5200000|(\\sreg)|(.L__reg_num_\\rt)) " .endm\n" "\n" " .macro msr_s, sreg, rt\n" -" .inst 0xd5000000|(\\sreg)|(.L__reg_num_\\rt)\n" + __emit_inst(0xd5000000|(\\sreg)|(.L__reg_num_\\rt)) " .endm\n" ); diff --git a/arch/arm64/include/asm/thread_info.h b/arch/arm64/include/asm/thread_info.h index e9ea5a6bd449..46c3b93cf865 100644 --- a/arch/arm64/include/asm/thread_info.h +++ b/arch/arm64/include/asm/thread_info.h @@ -36,58 +36,31 @@ struct task_struct; +#include <asm/stack_pointer.h> #include <asm/types.h> typedef unsigned long mm_segment_t; /* * low level task data that entry.S needs immediate access to. - * __switch_to() assumes cpu_context follows immediately after cpu_domain. */ struct thread_info { unsigned long flags; /* low level flags */ mm_segment_t addr_limit; /* address limit */ - struct task_struct *task; /* main task structure */ +#ifdef CONFIG_ARM64_SW_TTBR0_PAN + u64 ttbr0; /* saved TTBR0_EL1 */ +#endif int preempt_count; /* 0 => preemptable, <0 => bug */ - int cpu; /* cpu */ }; #define INIT_THREAD_INFO(tsk) \ { \ - .task = &tsk, \ - .flags = 0, \ .preempt_count = INIT_PREEMPT_COUNT, \ .addr_limit = KERNEL_DS, \ } -#define init_thread_info (init_thread_union.thread_info) #define init_stack (init_thread_union.stack) -/* - * how to get the current stack pointer from C - */ -register unsigned long current_stack_pointer asm ("sp"); - -/* - * how to get the thread information struct from C - */ -static inline struct thread_info *current_thread_info(void) __attribute_const__; - -/* - * struct thread_info can be accessed directly via sp_el0. - * - * We don't use read_sysreg() as we want the compiler to cache the value where - * possible. - */ -static inline struct thread_info *current_thread_info(void) -{ - unsigned long sp_el0; - - asm ("mrs %0, sp_el0" : "=r" (sp_el0)); - - return (struct thread_info *)sp_el0; -} - #define thread_saved_pc(tsk) \ ((unsigned long)(tsk->thread.cpu_context.pc)) #define thread_saved_sp(tsk) \ @@ -112,6 +85,7 @@ static inline struct thread_info *current_thread_info(void) #define TIF_NEED_RESCHED 1 #define TIF_NOTIFY_RESUME 2 /* callback before returning to user */ #define TIF_FOREIGN_FPSTATE 3 /* CPU's FP state is not current's */ +#define TIF_UPROBE 4 /* uprobe breakpoint or singlestep */ #define TIF_NOHZ 7 #define TIF_SYSCALL_TRACE 8 #define TIF_SYSCALL_AUDIT 9 @@ -132,10 +106,12 @@ static inline struct thread_info *current_thread_info(void) #define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT) #define _TIF_SYSCALL_TRACEPOINT (1 << TIF_SYSCALL_TRACEPOINT) #define _TIF_SECCOMP (1 << TIF_SECCOMP) +#define _TIF_UPROBE (1 << TIF_UPROBE) #define _TIF_32BIT (1 << TIF_32BIT) #define _TIF_WORK_MASK (_TIF_NEED_RESCHED | _TIF_SIGPENDING | \ - _TIF_NOTIFY_RESUME | _TIF_FOREIGN_FPSTATE) + _TIF_NOTIFY_RESUME | _TIF_FOREIGN_FPSTATE | \ + _TIF_UPROBE) #define _TIF_SYSCALL_WORK (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | \ _TIF_SYSCALL_TRACEPOINT | _TIF_SECCOMP | \ diff --git a/arch/arm64/include/asm/uaccess.h b/arch/arm64/include/asm/uaccess.h index 55d0adbf6509..d26750ca6e06 100644 --- a/arch/arm64/include/asm/uaccess.h +++ b/arch/arm64/include/asm/uaccess.h @@ -18,6 +18,12 @@ #ifndef __ASM_UACCESS_H #define __ASM_UACCESS_H +#include <asm/alternative.h> +#include <asm/kernel-pgtable.h> +#include <asm/sysreg.h> + +#ifndef __ASSEMBLY__ + /* * User space memory access functions */ @@ -26,10 +32,8 @@ #include <linux/string.h> #include <linux/thread_info.h> -#include <asm/alternative.h> #include <asm/cpufeature.h> #include <asm/ptrace.h> -#include <asm/sysreg.h> #include <asm/errno.h> #include <asm/memory.h> #include <asm/compiler.h> @@ -120,6 +124,99 @@ static inline void set_fs(mm_segment_t fs) " .popsection\n" /* + * User access enabling/disabling. + */ +#ifdef CONFIG_ARM64_SW_TTBR0_PAN +static inline void __uaccess_ttbr0_disable(void) +{ + unsigned long ttbr; + + /* reserved_ttbr0 placed at the end of swapper_pg_dir */ + ttbr = read_sysreg(ttbr1_el1) + SWAPPER_DIR_SIZE; + write_sysreg(ttbr, ttbr0_el1); + isb(); +} + +static inline void __uaccess_ttbr0_enable(void) +{ + unsigned long flags; + + /* + * Disable interrupts to avoid preemption between reading the 'ttbr0' + * variable and the MSR. A context switch could trigger an ASID + * roll-over and an update of 'ttbr0'. + */ + local_irq_save(flags); + write_sysreg(current_thread_info()->ttbr0, ttbr0_el1); + isb(); + local_irq_restore(flags); +} + +static inline bool uaccess_ttbr0_disable(void) +{ + if (!system_uses_ttbr0_pan()) + return false; + __uaccess_ttbr0_disable(); + return true; +} + +static inline bool uaccess_ttbr0_enable(void) +{ + if (!system_uses_ttbr0_pan()) + return false; + __uaccess_ttbr0_enable(); + return true; +} +#else +static inline bool uaccess_ttbr0_disable(void) +{ + return false; +} + +static inline bool uaccess_ttbr0_enable(void) +{ + return false; +} +#endif + +#define __uaccess_disable(alt) \ +do { \ + if (!uaccess_ttbr0_disable()) \ + asm(ALTERNATIVE("nop", SET_PSTATE_PAN(1), alt, \ + CONFIG_ARM64_PAN)); \ +} while (0) + +#define __uaccess_enable(alt) \ +do { \ + if (!uaccess_ttbr0_enable()) \ + asm(ALTERNATIVE("nop", SET_PSTATE_PAN(0), alt, \ + CONFIG_ARM64_PAN)); \ +} while (0) + +static inline void uaccess_disable(void) +{ + __uaccess_disable(ARM64_HAS_PAN); +} + +static inline void uaccess_enable(void) +{ + __uaccess_enable(ARM64_HAS_PAN); +} + +/* + * These functions are no-ops when UAO is present. + */ +static inline void uaccess_disable_not_uao(void) +{ + __uaccess_disable(ARM64_ALT_PAN_NOT_UAO); +} + +static inline void uaccess_enable_not_uao(void) +{ + __uaccess_enable(ARM64_ALT_PAN_NOT_UAO); +} + +/* * The "__xxx" versions of the user access functions do not verify the address * space - it must have been done previously with a separate "access_ok()" * call. @@ -146,8 +243,7 @@ static inline void set_fs(mm_segment_t fs) do { \ unsigned long __gu_val; \ __chk_user_ptr(ptr); \ - asm(ALTERNATIVE("nop", SET_PSTATE_PAN(0), ARM64_ALT_PAN_NOT_UAO,\ - CONFIG_ARM64_PAN)); \ + uaccess_enable_not_uao(); \ switch (sizeof(*(ptr))) { \ case 1: \ __get_user_asm("ldrb", "ldtrb", "%w", __gu_val, (ptr), \ @@ -168,9 +264,8 @@ do { \ default: \ BUILD_BUG(); \ } \ + uaccess_disable_not_uao(); \ (x) = (__force __typeof__(*(ptr)))__gu_val; \ - asm(ALTERNATIVE("nop", SET_PSTATE_PAN(1), ARM64_ALT_PAN_NOT_UAO,\ - CONFIG_ARM64_PAN)); \ } while (0) #define __get_user(x, ptr) \ @@ -215,8 +310,7 @@ do { \ do { \ __typeof__(*(ptr)) __pu_val = (x); \ __chk_user_ptr(ptr); \ - asm(ALTERNATIVE("nop", SET_PSTATE_PAN(0), ARM64_ALT_PAN_NOT_UAO,\ - CONFIG_ARM64_PAN)); \ + uaccess_enable_not_uao(); \ switch (sizeof(*(ptr))) { \ case 1: \ __put_user_asm("strb", "sttrb", "%w", __pu_val, (ptr), \ @@ -237,8 +331,7 @@ do { \ default: \ BUILD_BUG(); \ } \ - asm(ALTERNATIVE("nop", SET_PSTATE_PAN(1), ARM64_ALT_PAN_NOT_UAO,\ - CONFIG_ARM64_PAN)); \ + uaccess_disable_not_uao(); \ } while (0) #define __put_user(x, ptr) \ @@ -331,4 +424,66 @@ extern long strncpy_from_user(char *dest, const char __user *src, long count); extern __must_check long strlen_user(const char __user *str); extern __must_check long strnlen_user(const char __user *str, long n); +#else /* __ASSEMBLY__ */ + +#include <asm/assembler.h> + +/* + * User access enabling/disabling macros. + */ +#ifdef CONFIG_ARM64_SW_TTBR0_PAN + .macro __uaccess_ttbr0_disable, tmp1 + mrs \tmp1, ttbr1_el1 // swapper_pg_dir + add \tmp1, \tmp1, #SWAPPER_DIR_SIZE // reserved_ttbr0 at the end of swapper_pg_dir + msr ttbr0_el1, \tmp1 // set reserved TTBR0_EL1 + isb + .endm + + .macro __uaccess_ttbr0_enable, tmp1 + get_thread_info \tmp1 + ldr \tmp1, [\tmp1, #TSK_TI_TTBR0] // load saved TTBR0_EL1 + msr ttbr0_el1, \tmp1 // set the non-PAN TTBR0_EL1 + isb + .endm + + .macro uaccess_ttbr0_disable, tmp1 +alternative_if_not ARM64_HAS_PAN + __uaccess_ttbr0_disable \tmp1 +alternative_else_nop_endif + .endm + + .macro uaccess_ttbr0_enable, tmp1, tmp2 +alternative_if_not ARM64_HAS_PAN + save_and_disable_irq \tmp2 // avoid preemption + __uaccess_ttbr0_enable \tmp1 + restore_irq \tmp2 +alternative_else_nop_endif + .endm +#else + .macro uaccess_ttbr0_disable, tmp1 + .endm + + .macro uaccess_ttbr0_enable, tmp1, tmp2 + .endm +#endif + +/* + * These macros are no-ops when UAO is present. + */ + .macro uaccess_disable_not_uao, tmp1 + uaccess_ttbr0_disable \tmp1 +alternative_if ARM64_ALT_PAN_NOT_UAO + SET_PSTATE_PAN(1) +alternative_else_nop_endif + .endm + + .macro uaccess_enable_not_uao, tmp1, tmp2 + uaccess_ttbr0_enable \tmp1, \tmp2 +alternative_if ARM64_ALT_PAN_NOT_UAO + SET_PSTATE_PAN(0) +alternative_else_nop_endif + .endm + +#endif /* __ASSEMBLY__ */ + #endif /* __ASM_UACCESS_H */ diff --git a/arch/arm64/include/asm/uprobes.h b/arch/arm64/include/asm/uprobes.h new file mode 100644 index 000000000000..8d004073d0e8 --- /dev/null +++ b/arch/arm64/include/asm/uprobes.h @@ -0,0 +1,36 @@ +/* + * Copyright (C) 2014-2016 Pratyush Anand <panand@redhat.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#ifndef _ASM_UPROBES_H +#define _ASM_UPROBES_H + +#include <asm/debug-monitors.h> +#include <asm/insn.h> +#include <asm/probes.h> + +#define MAX_UINSN_BYTES AARCH64_INSN_SIZE + +#define UPROBE_SWBP_INSN BRK64_OPCODE_UPROBES +#define UPROBE_SWBP_INSN_SIZE AARCH64_INSN_SIZE +#define UPROBE_XOL_SLOT_BYTES MAX_UINSN_BYTES + +typedef u32 uprobe_opcode_t; + +struct arch_uprobe_task { +}; + +struct arch_uprobe { + union { + u8 insn[MAX_UINSN_BYTES]; + u8 ixol[MAX_UINSN_BYTES]; + }; + struct arch_probe_insn api; + bool simulate; +}; + +#endif diff --git a/arch/arm64/include/asm/xen/hypercall.h b/arch/arm64/include/asm/xen/hypercall.h index 74b0c423ff5b..3522cbaed316 100644 --- a/arch/arm64/include/asm/xen/hypercall.h +++ b/arch/arm64/include/asm/xen/hypercall.h @@ -1 +1 @@ -#include <../../arm/include/asm/xen/hypercall.h> +#include <xen/arm/hypercall.h> diff --git a/arch/arm64/include/asm/xen/hypervisor.h b/arch/arm64/include/asm/xen/hypervisor.h index f263da8e8769..d6e7709d0688 100644 --- a/arch/arm64/include/asm/xen/hypervisor.h +++ b/arch/arm64/include/asm/xen/hypervisor.h @@ -1 +1 @@ -#include <../../arm/include/asm/xen/hypervisor.h> +#include <xen/arm/hypervisor.h> diff --git a/arch/arm64/include/asm/xen/interface.h b/arch/arm64/include/asm/xen/interface.h index 44457aebeed4..88c0d75da190 100644 --- a/arch/arm64/include/asm/xen/interface.h +++ b/arch/arm64/include/asm/xen/interface.h @@ -1 +1 @@ -#include <../../arm/include/asm/xen/interface.h> +#include <xen/arm/interface.h> diff --git a/arch/arm64/include/asm/xen/page-coherent.h b/arch/arm64/include/asm/xen/page-coherent.h index 2052102b4e02..b3ef061d8b74 100644 --- a/arch/arm64/include/asm/xen/page-coherent.h +++ b/arch/arm64/include/asm/xen/page-coherent.h @@ -1 +1 @@ -#include <../../arm/include/asm/xen/page-coherent.h> +#include <xen/arm/page-coherent.h> diff --git a/arch/arm64/include/asm/xen/page.h b/arch/arm64/include/asm/xen/page.h index bed87ec36780..31bbc803cecb 100644 --- a/arch/arm64/include/asm/xen/page.h +++ b/arch/arm64/include/asm/xen/page.h @@ -1 +1 @@ -#include <../../arm/include/asm/xen/page.h> +#include <xen/arm/page.h> diff --git a/arch/arm64/kernel/armv8_deprecated.c b/arch/arm64/kernel/armv8_deprecated.c index b0988bb1bf64..04de188a36c9 100644 --- a/arch/arm64/kernel/armv8_deprecated.c +++ b/arch/arm64/kernel/armv8_deprecated.c @@ -14,10 +14,8 @@ #include <linux/slab.h> #include <linux/sysctl.h> -#include <asm/alternative.h> #include <asm/cpufeature.h> #include <asm/insn.h> -#include <asm/opcodes.h> #include <asm/sysreg.h> #include <asm/system_misc.h> #include <asm/traps.h> @@ -285,10 +283,10 @@ static void __init register_insn_emulation_sysctl(struct ctl_table *table) #define __SWP_LL_SC_LOOPS 4 #define __user_swpX_asm(data, addr, res, temp, temp2, B) \ +do { \ + uaccess_enable(); \ __asm__ __volatile__( \ " mov %w3, %w7\n" \ - ALTERNATIVE("nop", SET_PSTATE_PAN(0), ARM64_HAS_PAN, \ - CONFIG_ARM64_PAN) \ "0: ldxr"B" %w2, [%4]\n" \ "1: stxr"B" %w0, %w1, [%4]\n" \ " cbz %w0, 2f\n" \ @@ -306,12 +304,12 @@ static void __init register_insn_emulation_sysctl(struct ctl_table *table) " .popsection" \ _ASM_EXTABLE(0b, 4b) \ _ASM_EXTABLE(1b, 4b) \ - ALTERNATIVE("nop", SET_PSTATE_PAN(1), ARM64_HAS_PAN, \ - CONFIG_ARM64_PAN) \ : "=&r" (res), "+r" (data), "=&r" (temp), "=&r" (temp2) \ : "r" (addr), "i" (-EAGAIN), "i" (-EFAULT), \ "i" (__SWP_LL_SC_LOOPS) \ - : "memory") + : "memory"); \ + uaccess_disable(); \ +} while (0) #define __user_swp_asm(data, addr, res, temp, temp2) \ __user_swpX_asm(data, addr, res, temp, temp2, "") @@ -352,6 +350,10 @@ static int emulate_swpX(unsigned int address, unsigned int *data, return res; } +#define ARM_OPCODE_CONDTEST_FAIL 0 +#define ARM_OPCODE_CONDTEST_PASS 1 +#define ARM_OPCODE_CONDTEST_UNCOND 2 + #define ARM_OPCODE_CONDITION_UNCOND 0xf static unsigned int __kprobes aarch32_check_condition(u32 opcode, u32 psr) diff --git a/arch/arm64/kernel/asm-offsets.c b/arch/arm64/kernel/asm-offsets.c index 4a2f0f0fef32..bc049afc73a7 100644 --- a/arch/arm64/kernel/asm-offsets.c +++ b/arch/arm64/kernel/asm-offsets.c @@ -36,11 +36,13 @@ int main(void) { DEFINE(TSK_ACTIVE_MM, offsetof(struct task_struct, active_mm)); BLANK(); - DEFINE(TI_FLAGS, offsetof(struct thread_info, flags)); - DEFINE(TI_PREEMPT, offsetof(struct thread_info, preempt_count)); - DEFINE(TI_ADDR_LIMIT, offsetof(struct thread_info, addr_limit)); - DEFINE(TI_TASK, offsetof(struct thread_info, task)); - DEFINE(TI_CPU, offsetof(struct thread_info, cpu)); + DEFINE(TSK_TI_FLAGS, offsetof(struct task_struct, thread_info.flags)); + DEFINE(TSK_TI_PREEMPT, offsetof(struct task_struct, thread_info.preempt_count)); + DEFINE(TSK_TI_ADDR_LIMIT, offsetof(struct task_struct, thread_info.addr_limit)); +#ifdef CONFIG_ARM64_SW_TTBR0_PAN + DEFINE(TSK_TI_TTBR0, offsetof(struct task_struct, thread_info.ttbr0)); +#endif + DEFINE(TSK_STACK, offsetof(struct task_struct, stack)); BLANK(); DEFINE(THREAD_CPU_CONTEXT, offsetof(struct task_struct, thread.cpu_context)); BLANK(); @@ -123,6 +125,7 @@ int main(void) DEFINE(TZ_DSTTIME, offsetof(struct timezone, tz_dsttime)); BLANK(); DEFINE(CPU_BOOT_STACK, offsetof(struct secondary_data, stack)); + DEFINE(CPU_BOOT_TASK, offsetof(struct secondary_data, task)); BLANK(); #ifdef CONFIG_KVM_ARM_HOST DEFINE(VCPU_CONTEXT, offsetof(struct kvm_vcpu, arch.ctxt)); diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index c02504ea304b..fdf8f045929f 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -47,6 +47,7 @@ unsigned int compat_elf_hwcap2 __read_mostly; #endif DECLARE_BITMAP(cpu_hwcaps, ARM64_NCAPS); +EXPORT_SYMBOL(cpu_hwcaps); DEFINE_STATIC_KEY_ARRAY_FALSE(cpu_hwcap_keys, ARM64_NCAPS); EXPORT_SYMBOL(cpu_hwcap_keys); @@ -746,6 +747,14 @@ static bool hyp_offset_low(const struct arm64_cpu_capabilities *entry, return idmap_addr > GENMASK(VA_BITS - 2, 0) && !is_kernel_in_hyp_mode(); } +static bool has_no_fpsimd(const struct arm64_cpu_capabilities *entry, int __unused) +{ + u64 pfr0 = read_system_reg(SYS_ID_AA64PFR0_EL1); + + return cpuid_feature_extract_signed_field(pfr0, + ID_AA64PFR0_FP_SHIFT) < 0; +} + static const struct arm64_cpu_capabilities arm64_features[] = { { .desc = "GIC system register CPU interface", @@ -829,6 +838,13 @@ static const struct arm64_cpu_capabilities arm64_features[] = { .def_scope = SCOPE_SYSTEM, .matches = hyp_offset_low, }, + { + /* FP/SIMD is not implemented */ + .capability = ARM64_HAS_NO_FPSIMD, + .def_scope = SCOPE_SYSTEM, + .min_field_value = 0, + .matches = has_no_fpsimd, + }, {}, }; @@ -1102,5 +1118,5 @@ void __init setup_cpu_features(void) static bool __maybe_unused cpufeature_pan_not_uao(const struct arm64_cpu_capabilities *entry, int __unused) { - return (cpus_have_cap(ARM64_HAS_PAN) && !cpus_have_cap(ARM64_HAS_UAO)); + return (cpus_have_const_cap(ARM64_HAS_PAN) && !cpus_have_const_cap(ARM64_HAS_UAO)); } diff --git a/arch/arm64/kernel/cpuinfo.c b/arch/arm64/kernel/cpuinfo.c index b3d5b3e8fbcb..7b7be71e87bf 100644 --- a/arch/arm64/kernel/cpuinfo.c +++ b/arch/arm64/kernel/cpuinfo.c @@ -227,7 +227,7 @@ static struct attribute_group cpuregs_attr_group = { .name = "identification" }; -static int cpuid_add_regs(int cpu) +static int cpuid_cpu_online(unsigned int cpu) { int rc; struct device *dev; @@ -248,7 +248,7 @@ out: return rc; } -static int cpuid_remove_regs(int cpu) +static int cpuid_cpu_offline(unsigned int cpu) { struct device *dev; struct cpuinfo_arm64 *info = &per_cpu(cpu_data, cpu); @@ -264,40 +264,22 @@ static int cpuid_remove_regs(int cpu) return 0; } -static int cpuid_callback(struct notifier_block *nb, - unsigned long action, void *hcpu) -{ - int rc = 0; - unsigned long cpu = (unsigned long)hcpu; - - switch (action & ~CPU_TASKS_FROZEN) { - case CPU_ONLINE: - rc = cpuid_add_regs(cpu); - break; - case CPU_DEAD: - rc = cpuid_remove_regs(cpu); - break; - } - - return notifier_from_errno(rc); -} - static int __init cpuinfo_regs_init(void) { - int cpu; - - cpu_notifier_register_begin(); + int cpu, ret; for_each_possible_cpu(cpu) { struct cpuinfo_arm64 *info = &per_cpu(cpu_data, cpu); kobject_init(&info->kobj, &cpuregs_kobj_type); - if (cpu_online(cpu)) - cpuid_add_regs(cpu); } - __hotcpu_notifier(cpuid_callback, 0); - cpu_notifier_register_done(); + ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "arm64/cpuinfo:online", + cpuid_cpu_online, cpuid_cpu_offline); + if (ret < 0) { + pr_err("cpuinfo: failed to register hotplug callbacks.\n"); + return ret; + } return 0; } static void cpuinfo_detect_icache_policy(struct cpuinfo_arm64 *info) diff --git a/arch/arm64/kernel/debug-monitors.c b/arch/arm64/kernel/debug-monitors.c index 73ae90ef434c..605df76f0a06 100644 --- a/arch/arm64/kernel/debug-monitors.c +++ b/arch/arm64/kernel/debug-monitors.c @@ -226,6 +226,8 @@ static void send_user_sigtrap(int si_code) static int single_step_handler(unsigned long addr, unsigned int esr, struct pt_regs *regs) { + bool handler_found = false; + /* * If we are stepping a pending breakpoint, call the hw_breakpoint * handler first. @@ -233,7 +235,14 @@ static int single_step_handler(unsigned long addr, unsigned int esr, if (!reinstall_suspended_bps(regs)) return 0; - if (user_mode(regs)) { +#ifdef CONFIG_KPROBES + if (kprobe_single_step_handler(regs, esr) == DBG_HOOK_HANDLED) + handler_found = true; +#endif + if (!handler_found && call_step_hook(regs, esr) == DBG_HOOK_HANDLED) + handler_found = true; + + if (!handler_found && user_mode(regs)) { send_user_sigtrap(TRAP_TRACE); /* @@ -243,15 +252,8 @@ static int single_step_handler(unsigned long addr, unsigned int esr, * to the active-not-pending state). */ user_rewind_single_step(current); - } else { -#ifdef CONFIG_KPROBES - if (kprobe_single_step_handler(regs, esr) == DBG_HOOK_HANDLED) - return 0; -#endif - if (call_step_hook(regs, esr) == DBG_HOOK_HANDLED) - return 0; - - pr_warning("Unexpected kernel single-step exception at EL1\n"); + } else if (!handler_found) { + pr_warn("Unexpected kernel single-step exception at EL1\n"); /* * Re-enable stepping since we know that we will be * returning to regs. @@ -304,16 +306,20 @@ NOKPROBE_SYMBOL(call_break_hook); static int brk_handler(unsigned long addr, unsigned int esr, struct pt_regs *regs) { - if (user_mode(regs)) { - send_user_sigtrap(TRAP_BRKPT); - } + bool handler_found = false; + #ifdef CONFIG_KPROBES - else if ((esr & BRK64_ESR_MASK) == BRK64_ESR_KPROBES) { - if (kprobe_breakpoint_handler(regs, esr) != DBG_HOOK_HANDLED) - return -EFAULT; + if ((esr & BRK64_ESR_MASK) == BRK64_ESR_KPROBES) { + if (kprobe_breakpoint_handler(regs, esr) == DBG_HOOK_HANDLED) + handler_found = true; } #endif - else if (call_break_hook(regs, esr) != DBG_HOOK_HANDLED) { + if (!handler_found && call_break_hook(regs, esr) == DBG_HOOK_HANDLED) + handler_found = true; + + if (!handler_found && user_mode(regs)) { + send_user_sigtrap(TRAP_BRKPT); + } else if (!handler_found) { pr_warn("Unexpected kernel BRK exception at EL1\n"); return -EFAULT; } diff --git a/arch/arm64/kernel/efi.c b/arch/arm64/kernel/efi.c index ba9bee389fd5..5d17f377d905 100644 --- a/arch/arm64/kernel/efi.c +++ b/arch/arm64/kernel/efi.c @@ -62,8 +62,8 @@ struct screen_info screen_info __section(.data); int __init efi_create_mapping(struct mm_struct *mm, efi_memory_desc_t *md) { pteval_t prot_val = create_mapping_protection(md); - bool allow_block_mappings = (md->type != EFI_RUNTIME_SERVICES_CODE && - md->type != EFI_RUNTIME_SERVICES_DATA); + bool page_mappings_only = (md->type == EFI_RUNTIME_SERVICES_CODE || + md->type == EFI_RUNTIME_SERVICES_DATA); if (!PAGE_ALIGNED(md->phys_addr) || !PAGE_ALIGNED(md->num_pages << EFI_PAGE_SHIFT)) { @@ -76,12 +76,12 @@ int __init efi_create_mapping(struct mm_struct *mm, efi_memory_desc_t *md) * from the MMU routines. So avoid block mappings altogether in * that case. */ - allow_block_mappings = false; + page_mappings_only = true; } create_pgd_mapping(mm, md->phys_addr, md->virt_addr, md->num_pages << EFI_PAGE_SHIFT, - __pgprot(prot_val | PTE_NG), allow_block_mappings); + __pgprot(prot_val | PTE_NG), page_mappings_only); return 0; } diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index 223d54a4d66b..4f0d76339414 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -29,7 +29,9 @@ #include <asm/esr.h> #include <asm/irq.h> #include <asm/memory.h> +#include <asm/ptrace.h> #include <asm/thread_info.h> +#include <asm/uaccess.h> #include <asm/unistd.h> /* @@ -90,9 +92,8 @@ .if \el == 0 mrs x21, sp_el0 - mov tsk, sp - and tsk, tsk, #~(THREAD_SIZE - 1) // Ensure MDSCR_EL1.SS is clear, - ldr x19, [tsk, #TI_FLAGS] // since we can unmask debug + ldr_this_cpu tsk, __entry_task, x20 // Ensure MDSCR_EL1.SS is clear, + ldr x19, [tsk, #TSK_TI_FLAGS] // since we can unmask debug disable_step_tsk x19, x20 // exceptions when scheduling. mov x29, xzr // fp pointed to user-space @@ -100,15 +101,41 @@ add x21, sp, #S_FRAME_SIZE get_thread_info tsk /* Save the task's original addr_limit and set USER_DS (TASK_SIZE_64) */ - ldr x20, [tsk, #TI_ADDR_LIMIT] + ldr x20, [tsk, #TSK_TI_ADDR_LIMIT] str x20, [sp, #S_ORIG_ADDR_LIMIT] mov x20, #TASK_SIZE_64 - str x20, [tsk, #TI_ADDR_LIMIT] + str x20, [tsk, #TSK_TI_ADDR_LIMIT] /* No need to reset PSTATE.UAO, hardware's already set it to 0 for us */ .endif /* \el == 0 */ mrs x22, elr_el1 mrs x23, spsr_el1 stp lr, x21, [sp, #S_LR] + +#ifdef CONFIG_ARM64_SW_TTBR0_PAN + /* + * Set the TTBR0 PAN bit in SPSR. When the exception is taken from + * EL0, there is no need to check the state of TTBR0_EL1 since + * accesses are always enabled. + * Note that the meaning of this bit differs from the ARMv8.1 PAN + * feature as all TTBR0_EL1 accesses are disabled, not just those to + * user mappings. + */ +alternative_if ARM64_HAS_PAN + b 1f // skip TTBR0 PAN +alternative_else_nop_endif + + .if \el != 0 + mrs x21, ttbr0_el1 + tst x21, #0xffff << 48 // Check for the reserved ASID + orr x23, x23, #PSR_PAN_BIT // Set the emulated PAN in the saved SPSR + b.eq 1f // TTBR0 access already disabled + and x23, x23, #~PSR_PAN_BIT // Clear the emulated PAN in the saved SPSR + .endif + + __uaccess_ttbr0_disable x21 +1: +#endif + stp x22, x23, [sp, #S_PC] /* @@ -139,7 +166,7 @@ .if \el != 0 /* Restore the task's original addr_limit. */ ldr x20, [sp, #S_ORIG_ADDR_LIMIT] - str x20, [tsk, #TI_ADDR_LIMIT] + str x20, [tsk, #TSK_TI_ADDR_LIMIT] /* No need to restore UAO, it will be restored from SPSR_EL1 */ .endif @@ -147,6 +174,40 @@ ldp x21, x22, [sp, #S_PC] // load ELR, SPSR .if \el == 0 ct_user_enter + .endif + +#ifdef CONFIG_ARM64_SW_TTBR0_PAN + /* + * Restore access to TTBR0_EL1. If returning to EL0, no need for SPSR + * PAN bit checking. + */ +alternative_if ARM64_HAS_PAN + b 2f // skip TTBR0 PAN +alternative_else_nop_endif + + .if \el != 0 + tbnz x22, #22, 1f // Skip re-enabling TTBR0 access if the PSR_PAN_BIT is set + .endif + + __uaccess_ttbr0_enable x0 + + .if \el == 0 + /* + * Enable errata workarounds only if returning to user. The only + * workaround currently required for TTBR0_EL1 changes are for the + * Cavium erratum 27456 (broadcast TLBI instructions may cause I-cache + * corruption). + */ + post_ttbr0_update_workaround + .endif +1: + .if \el != 0 + and x22, x22, #~PSR_PAN_BIT // ARMv8.0 CPUs do not understand this bit + .endif +2: +#endif + + .if \el == 0 ldr x23, [sp, #S_SP] // load return stack pointer msr sp_el0, x23 #ifdef CONFIG_ARM64_ERRATUM_845719 @@ -162,6 +223,7 @@ alternative_if ARM64_WORKAROUND_845719 alternative_else_nop_endif #endif .endif + msr elr_el1, x21 // set up the return data msr spsr_el1, x22 ldp x0, x1, [sp, #16 * 0] @@ -184,23 +246,20 @@ alternative_else_nop_endif eret // return to kernel .endm - .macro get_thread_info, rd - mrs \rd, sp_el0 - .endm - .macro irq_stack_entry mov x19, sp // preserve the original sp /* - * Compare sp with the current thread_info, if the top - * ~(THREAD_SIZE - 1) bits match, we are on a task stack, and - * should switch to the irq stack. + * Compare sp with the base of the task stack. + * If the top ~(THREAD_SIZE - 1) bits match, we are on a task stack, + * and should switch to the irq stack. */ - and x25, x19, #~(THREAD_SIZE - 1) - cmp x25, tsk - b.ne 9998f + ldr x25, [tsk, TSK_STACK] + eor x25, x25, x19 + and x25, x25, #~(THREAD_SIZE - 1) + cbnz x25, 9998f - this_cpu_ptr irq_stack, x25, x26 + adr_this_cpu x25, irq_stack, x26 mov x26, #IRQ_STACK_START_SP add x26, x25, x26 @@ -427,9 +486,9 @@ el1_irq: irq_handler #ifdef CONFIG_PREEMPT - ldr w24, [tsk, #TI_PREEMPT] // get preempt count + ldr w24, [tsk, #TSK_TI_PREEMPT] // get preempt count cbnz w24, 1f // preempt count != 0 - ldr x0, [tsk, #TI_FLAGS] // get flags + ldr x0, [tsk, #TSK_TI_FLAGS] // get flags tbz x0, #TIF_NEED_RESCHED, 1f // needs rescheduling? bl el1_preempt 1: @@ -444,7 +503,7 @@ ENDPROC(el1_irq) el1_preempt: mov x24, lr 1: bl preempt_schedule_irq // irq en/disable is done inside - ldr x0, [tsk, #TI_FLAGS] // get new tasks TI_FLAGS + ldr x0, [tsk, #TSK_TI_FLAGS] // get new tasks TI_FLAGS tbnz x0, #TIF_NEED_RESCHED, 1b // needs rescheduling? ret x24 #endif @@ -674,8 +733,7 @@ ENTRY(cpu_switch_to) ldp x29, x9, [x8], #16 ldr lr, [x8] mov sp, x9 - and x9, x9, #~(THREAD_SIZE - 1) - msr sp_el0, x9 + msr sp_el0, x1 ret ENDPROC(cpu_switch_to) @@ -686,7 +744,7 @@ ENDPROC(cpu_switch_to) ret_fast_syscall: disable_irq // disable interrupts str x0, [sp, #S_X0] // returned x0 - ldr x1, [tsk, #TI_FLAGS] // re-check for syscall tracing + ldr x1, [tsk, #TSK_TI_FLAGS] // re-check for syscall tracing and x2, x1, #_TIF_SYSCALL_WORK cbnz x2, ret_fast_syscall_trace and x2, x1, #_TIF_WORK_MASK @@ -706,14 +764,14 @@ work_pending: #ifdef CONFIG_TRACE_IRQFLAGS bl trace_hardirqs_on // enabled while in userspace #endif - ldr x1, [tsk, #TI_FLAGS] // re-check for single-step + ldr x1, [tsk, #TSK_TI_FLAGS] // re-check for single-step b finish_ret_to_user /* * "slow" syscall return path. */ ret_to_user: disable_irq // disable interrupts - ldr x1, [tsk, #TI_FLAGS] + ldr x1, [tsk, #TSK_TI_FLAGS] and x2, x1, #_TIF_WORK_MASK cbnz x2, work_pending finish_ret_to_user: @@ -746,7 +804,7 @@ el0_svc_naked: // compat entry point enable_dbg_and_irq ct_user_exit 1 - ldr x16, [tsk, #TI_FLAGS] // check for syscall hooks + ldr x16, [tsk, #TSK_TI_FLAGS] // check for syscall hooks tst x16, #_TIF_SYSCALL_WORK b.ne __sys_trace cmp scno, sc_nr // check upper syscall limit diff --git a/arch/arm64/kernel/fpsimd.c b/arch/arm64/kernel/fpsimd.c index 394c61db5566..b883f1f75216 100644 --- a/arch/arm64/kernel/fpsimd.c +++ b/arch/arm64/kernel/fpsimd.c @@ -127,6 +127,8 @@ void do_fpsimd_exc(unsigned int esr, struct pt_regs *regs) void fpsimd_thread_switch(struct task_struct *next) { + if (!system_supports_fpsimd()) + return; /* * Save the current FPSIMD state to memory, but only if whatever is in * the registers is in fact the most recent userland FPSIMD state of @@ -157,6 +159,8 @@ void fpsimd_thread_switch(struct task_struct *next) void fpsimd_flush_thread(void) { + if (!system_supports_fpsimd()) + return; memset(¤t->thread.fpsimd_state, 0, sizeof(struct fpsimd_state)); fpsimd_flush_task_state(current); set_thread_flag(TIF_FOREIGN_FPSTATE); @@ -168,6 +172,8 @@ void fpsimd_flush_thread(void) */ void fpsimd_preserve_current_state(void) { + if (!system_supports_fpsimd()) + return; preempt_disable(); if (!test_thread_flag(TIF_FOREIGN_FPSTATE)) fpsimd_save_state(¤t->thread.fpsimd_state); @@ -181,6 +187,8 @@ void fpsimd_preserve_current_state(void) */ void fpsimd_restore_current_state(void) { + if (!system_supports_fpsimd()) + return; preempt_disable(); if (test_and_clear_thread_flag(TIF_FOREIGN_FPSTATE)) { struct fpsimd_state *st = ¤t->thread.fpsimd_state; @@ -199,6 +207,8 @@ void fpsimd_restore_current_state(void) */ void fpsimd_update_current_state(struct fpsimd_state *state) { + if (!system_supports_fpsimd()) + return; preempt_disable(); fpsimd_load_state(state); if (test_and_clear_thread_flag(TIF_FOREIGN_FPSTATE)) { @@ -228,6 +238,8 @@ static DEFINE_PER_CPU(struct fpsimd_partial_state, softirq_fpsimdstate); */ void kernel_neon_begin_partial(u32 num_regs) { + if (WARN_ON(!system_supports_fpsimd())) + return; if (in_interrupt()) { struct fpsimd_partial_state *s = this_cpu_ptr( in_irq() ? &hardirq_fpsimdstate : &softirq_fpsimdstate); @@ -252,6 +264,8 @@ EXPORT_SYMBOL(kernel_neon_begin_partial); void kernel_neon_end(void) { + if (!system_supports_fpsimd()) + return; if (in_interrupt()) { struct fpsimd_partial_state *s = this_cpu_ptr( in_irq() ? &hardirq_fpsimdstate : &softirq_fpsimdstate); diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S index 332e33193ccf..4b1abac3485a 100644 --- a/arch/arm64/kernel/head.S +++ b/arch/arm64/kernel/head.S @@ -326,14 +326,14 @@ __create_page_tables: * dirty cache lines being evicted. */ adrp x0, idmap_pg_dir - adrp x1, swapper_pg_dir + SWAPPER_DIR_SIZE + adrp x1, swapper_pg_dir + SWAPPER_DIR_SIZE + RESERVED_TTBR0_SIZE bl __inval_cache_range /* * Clear the idmap and swapper page tables. */ adrp x0, idmap_pg_dir - adrp x6, swapper_pg_dir + SWAPPER_DIR_SIZE + adrp x6, swapper_pg_dir + SWAPPER_DIR_SIZE + RESERVED_TTBR0_SIZE 1: stp xzr, xzr, [x0], #16 stp xzr, xzr, [x0], #16 stp xzr, xzr, [x0], #16 @@ -412,7 +412,7 @@ __create_page_tables: * tables again to remove any speculatively loaded cache lines. */ adrp x0, idmap_pg_dir - adrp x1, swapper_pg_dir + SWAPPER_DIR_SIZE + adrp x1, swapper_pg_dir + SWAPPER_DIR_SIZE + RESERVED_TTBR0_SIZE dmb sy bl __inval_cache_range @@ -428,7 +428,8 @@ ENDPROC(__create_page_tables) __primary_switched: adrp x4, init_thread_union add sp, x4, #THREAD_SIZE - msr sp_el0, x4 // Save thread_info + adr_l x5, init_task + msr sp_el0, x5 // Save thread_info adr_l x8, vectors // load VBAR_EL1 with virtual msr vbar_el1, x8 // vector table address @@ -524,10 +525,21 @@ set_hcr: msr hcr_el2, x0 isb - /* Generic timers. */ + /* + * Allow Non-secure EL1 and EL0 to access physical timer and counter. + * This is not necessary for VHE, since the host kernel runs in EL2, + * and EL0 accesses are configured in the later stage of boot process. + * Note that when HCR_EL2.E2H == 1, CNTHCTL_EL2 has the same bit layout + * as CNTKCTL_EL1, and CNTKCTL_EL1 accessing instructions are redefined + * to access CNTHCTL_EL2. This allows the kernel designed to run at EL1 + * to transparently mess with the EL0 bits via CNTKCTL_EL1 access in + * EL2. + */ + cbnz x2, 1f mrs x0, cnthctl_el2 orr x0, x0, #3 // Enable EL1 physical timers msr cnthctl_el2, x0 +1: msr cntvoff_el2, xzr // Clear virtual offset #ifdef CONFIG_ARM_GIC_V3 @@ -699,10 +711,10 @@ __secondary_switched: isb adr_l x0, secondary_data - ldr x0, [x0, #CPU_BOOT_STACK] // get secondary_data.stack - mov sp, x0 - and x0, x0, #~(THREAD_SIZE - 1) - msr sp_el0, x0 // save thread_info + ldr x1, [x0, #CPU_BOOT_STACK] // get secondary_data.stack + mov sp, x1 + ldr x2, [x0, #CPU_BOOT_TASK] + msr sp_el0, x2 mov x29, #0 b secondary_start_kernel ENDPROC(__secondary_switched) diff --git a/arch/arm64/kernel/hibernate.c b/arch/arm64/kernel/hibernate.c index d55a7b09959b..fe301cbcb442 100644 --- a/arch/arm64/kernel/hibernate.c +++ b/arch/arm64/kernel/hibernate.c @@ -136,7 +136,7 @@ int arch_hibernation_header_save(void *addr, unsigned int max_size) /* Save the mpidr of the cpu we called cpu_suspend() on... */ if (sleep_cpu < 0) { - pr_err("Failing to hibernate on an unkown CPU.\n"); + pr_err("Failing to hibernate on an unknown CPU.\n"); return -ENODEV; } hdr->sleep_cpu_mpidr = cpu_logical_map(sleep_cpu); @@ -547,7 +547,7 @@ out: int hibernate_resume_nonboot_cpu_disable(void) { if (sleep_cpu < 0) { - pr_err("Failing to resume from hibernate on an unkown CPU.\n"); + pr_err("Failing to resume from hibernate on an unknown CPU.\n"); return -ENODEV; } diff --git a/arch/arm64/kernel/hw_breakpoint.c b/arch/arm64/kernel/hw_breakpoint.c index 948b73148d56..1b3c747fedda 100644 --- a/arch/arm64/kernel/hw_breakpoint.c +++ b/arch/arm64/kernel/hw_breakpoint.c @@ -317,9 +317,21 @@ static int get_hbp_len(u8 hbp_len) case ARM_BREAKPOINT_LEN_2: len_in_bytes = 2; break; + case ARM_BREAKPOINT_LEN_3: + len_in_bytes = 3; + break; case ARM_BREAKPOINT_LEN_4: len_in_bytes = 4; break; + case ARM_BREAKPOINT_LEN_5: + len_in_bytes = 5; + break; + case ARM_BREAKPOINT_LEN_6: + len_in_bytes = 6; + break; + case ARM_BREAKPOINT_LEN_7: + len_in_bytes = 7; + break; case ARM_BREAKPOINT_LEN_8: len_in_bytes = 8; break; @@ -349,7 +361,7 @@ int arch_check_bp_in_kernelspace(struct perf_event *bp) * to generic breakpoint descriptions. */ int arch_bp_generic_fields(struct arch_hw_breakpoint_ctrl ctrl, - int *gen_len, int *gen_type) + int *gen_len, int *gen_type, int *offset) { /* Type */ switch (ctrl.type) { @@ -369,17 +381,33 @@ int arch_bp_generic_fields(struct arch_hw_breakpoint_ctrl ctrl, return -EINVAL; } + if (!ctrl.len) + return -EINVAL; + *offset = __ffs(ctrl.len); + /* Len */ - switch (ctrl.len) { + switch (ctrl.len >> *offset) { case ARM_BREAKPOINT_LEN_1: *gen_len = HW_BREAKPOINT_LEN_1; break; case ARM_BREAKPOINT_LEN_2: *gen_len = HW_BREAKPOINT_LEN_2; break; + case ARM_BREAKPOINT_LEN_3: + *gen_len = HW_BREAKPOINT_LEN_3; + break; case ARM_BREAKPOINT_LEN_4: *gen_len = HW_BREAKPOINT_LEN_4; break; + case ARM_BREAKPOINT_LEN_5: + *gen_len = HW_BREAKPOINT_LEN_5; + break; + case ARM_BREAKPOINT_LEN_6: + *gen_len = HW_BREAKPOINT_LEN_6; + break; + case ARM_BREAKPOINT_LEN_7: + *gen_len = HW_BREAKPOINT_LEN_7; + break; case ARM_BREAKPOINT_LEN_8: *gen_len = HW_BREAKPOINT_LEN_8; break; @@ -423,9 +451,21 @@ static int arch_build_bp_info(struct perf_event *bp) case HW_BREAKPOINT_LEN_2: info->ctrl.len = ARM_BREAKPOINT_LEN_2; break; + case HW_BREAKPOINT_LEN_3: + info->ctrl.len = ARM_BREAKPOINT_LEN_3; + break; case HW_BREAKPOINT_LEN_4: info->ctrl.len = ARM_BREAKPOINT_LEN_4; break; + case HW_BREAKPOINT_LEN_5: + info->ctrl.len = ARM_BREAKPOINT_LEN_5; + break; + case HW_BREAKPOINT_LEN_6: + info->ctrl.len = ARM_BREAKPOINT_LEN_6; + break; + case HW_BREAKPOINT_LEN_7: + info->ctrl.len = ARM_BREAKPOINT_LEN_7; + break; case HW_BREAKPOINT_LEN_8: info->ctrl.len = ARM_BREAKPOINT_LEN_8; break; @@ -517,18 +557,17 @@ int arch_validate_hwbkpt_settings(struct perf_event *bp) default: return -EINVAL; } - - info->address &= ~alignment_mask; - info->ctrl.len <<= offset; } else { if (info->ctrl.type == ARM_BREAKPOINT_EXECUTE) alignment_mask = 0x3; else alignment_mask = 0x7; - if (info->address & alignment_mask) - return -EINVAL; + offset = info->address & alignment_mask; } + info->address &= ~alignment_mask; + info->ctrl.len <<= offset; + /* * Disallow per-task kernel breakpoints since these would * complicate the stepping code. @@ -661,12 +700,47 @@ unlock: } NOKPROBE_SYMBOL(breakpoint_handler); +/* + * Arm64 hardware does not always report a watchpoint hit address that matches + * one of the watchpoints set. It can also report an address "near" the + * watchpoint if a single instruction access both watched and unwatched + * addresses. There is no straight-forward way, short of disassembling the + * offending instruction, to map that address back to the watchpoint. This + * function computes the distance of the memory access from the watchpoint as a + * heuristic for the likelyhood that a given access triggered the watchpoint. + * + * See Section D2.10.5 "Determining the memory location that caused a Watchpoint + * exception" of ARMv8 Architecture Reference Manual for details. + * + * The function returns the distance of the address from the bytes watched by + * the watchpoint. In case of an exact match, it returns 0. + */ +static u64 get_distance_from_watchpoint(unsigned long addr, u64 val, + struct arch_hw_breakpoint_ctrl *ctrl) +{ + u64 wp_low, wp_high; + u32 lens, lene; + + lens = __ffs(ctrl->len); + lene = __fls(ctrl->len); + + wp_low = val + lens; + wp_high = val + lene; + if (addr < wp_low) + return wp_low - addr; + else if (addr > wp_high) + return addr - wp_high; + else + return 0; +} + static int watchpoint_handler(unsigned long addr, unsigned int esr, struct pt_regs *regs) { - int i, step = 0, *kernel_step, access; + int i, step = 0, *kernel_step, access, closest_match = 0; + u64 min_dist = -1, dist; u32 ctrl_reg; - u64 val, alignment_mask; + u64 val; struct perf_event *wp, **slots; struct debug_info *debug_info; struct arch_hw_breakpoint *info; @@ -675,35 +749,15 @@ static int watchpoint_handler(unsigned long addr, unsigned int esr, slots = this_cpu_ptr(wp_on_reg); debug_info = ¤t->thread.debug; + /* + * Find all watchpoints that match the reported address. If no exact + * match is found. Attribute the hit to the closest watchpoint. + */ + rcu_read_lock(); for (i = 0; i < core_num_wrps; ++i) { - rcu_read_lock(); - wp = slots[i]; - if (wp == NULL) - goto unlock; - - info = counter_arch_bp(wp); - /* AArch32 watchpoints are either 4 or 8 bytes aligned. */ - if (is_compat_task()) { - if (info->ctrl.len == ARM_BREAKPOINT_LEN_8) - alignment_mask = 0x7; - else - alignment_mask = 0x3; - } else { - alignment_mask = 0x7; - } - - /* Check if the watchpoint value matches. */ - val = read_wb_reg(AARCH64_DBG_REG_WVR, i); - if (val != (addr & ~alignment_mask)) - goto unlock; - - /* Possible match, check the byte address select to confirm. */ - ctrl_reg = read_wb_reg(AARCH64_DBG_REG_WCR, i); - decode_ctrl_reg(ctrl_reg, &ctrl); - if (!((1 << (addr & alignment_mask)) & ctrl.len)) - goto unlock; + continue; /* * Check that the access type matches. @@ -712,18 +766,41 @@ static int watchpoint_handler(unsigned long addr, unsigned int esr, access = (esr & AARCH64_ESR_ACCESS_MASK) ? HW_BREAKPOINT_W : HW_BREAKPOINT_R; if (!(access & hw_breakpoint_type(wp))) - goto unlock; + continue; + /* Check if the watchpoint value and byte select match. */ + val = read_wb_reg(AARCH64_DBG_REG_WVR, i); + ctrl_reg = read_wb_reg(AARCH64_DBG_REG_WCR, i); + decode_ctrl_reg(ctrl_reg, &ctrl); + dist = get_distance_from_watchpoint(addr, val, &ctrl); + if (dist < min_dist) { + min_dist = dist; + closest_match = i; + } + /* Is this an exact match? */ + if (dist != 0) + continue; + + info = counter_arch_bp(wp); info->trigger = addr; perf_bp_event(wp, regs); /* Do we need to handle the stepping? */ if (is_default_overflow_handler(wp)) step = 1; + } + if (min_dist > 0 && min_dist != -1) { + /* No exact match found. */ + wp = slots[closest_match]; + info = counter_arch_bp(wp); + info->trigger = addr; + perf_bp_event(wp, regs); -unlock: - rcu_read_unlock(); + /* Do we need to handle the stepping? */ + if (is_default_overflow_handler(wp)) + step = 1; } + rcu_read_unlock(); if (!step) return 0; diff --git a/arch/arm64/kernel/insn.c b/arch/arm64/kernel/insn.c index 6f2ac4fc66ca..94b62c1fa4df 100644 --- a/arch/arm64/kernel/insn.c +++ b/arch/arm64/kernel/insn.c @@ -30,7 +30,6 @@ #include <asm/cacheflush.h> #include <asm/debug-monitors.h> #include <asm/fixmap.h> -#include <asm/opcodes.h> #include <asm/insn.h> #define AARCH64_INSN_SF_BIT BIT(31) diff --git a/arch/arm64/kernel/kgdb.c b/arch/arm64/kernel/kgdb.c index e017a9493b92..d217c9e95b06 100644 --- a/arch/arm64/kernel/kgdb.c +++ b/arch/arm64/kernel/kgdb.c @@ -247,6 +247,9 @@ NOKPROBE_SYMBOL(kgdb_compiled_brk_fn); static int kgdb_step_brk_fn(struct pt_regs *regs, unsigned int esr) { + if (!kgdb_single_step) + return DBG_HOOK_ERROR; + kgdb_handle_exception(1, SIGTRAP, 0, regs); return 0; } diff --git a/arch/arm64/kernel/pci.c b/arch/arm64/kernel/pci.c index acf38722457b..4f0e3ebfea4b 100644 --- a/arch/arm64/kernel/pci.c +++ b/arch/arm64/kernel/pci.c @@ -114,6 +114,19 @@ int pcibios_root_bridge_prepare(struct pci_host_bridge *bridge) return 0; } +static int pci_acpi_root_prepare_resources(struct acpi_pci_root_info *ci) +{ + struct resource_entry *entry, *tmp; + int status; + + status = acpi_pci_probe_root_resources(ci); + resource_list_for_each_entry_safe(entry, tmp, &ci->resources) { + if (!(entry->res->flags & IORESOURCE_WINDOW)) + resource_list_destroy_entry(entry); + } + return status; +} + /* * Lookup the bus range for the domain in MCFG, and set up config space * mapping. @@ -121,31 +134,33 @@ int pcibios_root_bridge_prepare(struct pci_host_bridge *bridge) static struct pci_config_window * pci_acpi_setup_ecam_mapping(struct acpi_pci_root *root) { + struct device *dev = &root->device->dev; struct resource *bus_res = &root->secondary; u16 seg = root->segment; - struct pci_config_window *cfg; + struct pci_ecam_ops *ecam_ops; struct resource cfgres; - unsigned int bsz; - - /* Use address from _CBA if present, otherwise lookup MCFG */ - if (!root->mcfg_addr) - root->mcfg_addr = pci_mcfg_lookup(seg, bus_res); + struct acpi_device *adev; + struct pci_config_window *cfg; + int ret; - if (!root->mcfg_addr) { - dev_err(&root->device->dev, "%04x:%pR ECAM region not found\n", - seg, bus_res); + ret = pci_mcfg_lookup(root, &cfgres, &ecam_ops); + if (ret) { + dev_err(dev, "%04x:%pR ECAM region not found\n", seg, bus_res); return NULL; } - bsz = 1 << pci_generic_ecam_ops.bus_shift; - cfgres.start = root->mcfg_addr + bus_res->start * bsz; - cfgres.end = cfgres.start + resource_size(bus_res) * bsz - 1; - cfgres.flags = IORESOURCE_MEM; - cfg = pci_ecam_create(&root->device->dev, &cfgres, bus_res, - &pci_generic_ecam_ops); + adev = acpi_resource_consumer(&cfgres); + if (adev) + dev_info(dev, "ECAM area %pR reserved by %s\n", &cfgres, + dev_name(&adev->dev)); + else + dev_warn(dev, FW_BUG "ECAM area %pR not reserved in ACPI namespace\n", + &cfgres); + + cfg = pci_ecam_create(dev, &cfgres, bus_res, ecam_ops); if (IS_ERR(cfg)) { - dev_err(&root->device->dev, "%04x:%pR error %ld mapping ECAM\n", - seg, bus_res, PTR_ERR(cfg)); + dev_err(dev, "%04x:%pR error %ld mapping ECAM\n", seg, bus_res, + PTR_ERR(cfg)); return NULL; } @@ -159,33 +174,37 @@ static void pci_acpi_generic_release_info(struct acpi_pci_root_info *ci) ri = container_of(ci, struct acpi_pci_generic_root_info, common); pci_ecam_free(ri->cfg); + kfree(ci->ops); kfree(ri); } -static struct acpi_pci_root_ops acpi_pci_root_ops = { - .release_info = pci_acpi_generic_release_info, -}; - /* Interface called from ACPI code to setup PCI host controller */ struct pci_bus *pci_acpi_scan_root(struct acpi_pci_root *root) { int node = acpi_get_node(root->device->handle); struct acpi_pci_generic_root_info *ri; struct pci_bus *bus, *child; + struct acpi_pci_root_ops *root_ops; ri = kzalloc_node(sizeof(*ri), GFP_KERNEL, node); if (!ri) return NULL; + root_ops = kzalloc_node(sizeof(*root_ops), GFP_KERNEL, node); + if (!root_ops) + return NULL; + ri->cfg = pci_acpi_setup_ecam_mapping(root); if (!ri->cfg) { kfree(ri); + kfree(root_ops); return NULL; } - acpi_pci_root_ops.pci_ops = &ri->cfg->ops->pci_ops; - bus = acpi_pci_root_create(root, &acpi_pci_root_ops, &ri->common, - ri->cfg); + root_ops->release_info = pci_acpi_generic_release_info; + root_ops->prepare_resources = pci_acpi_root_prepare_resources; + root_ops->pci_ops = &ri->cfg->ops->pci_ops; + bus = acpi_pci_root_create(root, root_ops, &ri->common, ri->cfg); if (!bus) return NULL; diff --git a/arch/arm64/kernel/perf_event.c b/arch/arm64/kernel/perf_event.c index a9310a69fffd..57ae9d9ed9bb 100644 --- a/arch/arm64/kernel/perf_event.c +++ b/arch/arm64/kernel/perf_event.c @@ -31,17 +31,9 @@ /* * ARMv8 PMUv3 Performance Events handling code. - * Common event types. + * Common event types (some are defined in asm/perf_event.h). */ -/* Required events. */ -#define ARMV8_PMUV3_PERFCTR_SW_INCR 0x00 -#define ARMV8_PMUV3_PERFCTR_L1D_CACHE_REFILL 0x03 -#define ARMV8_PMUV3_PERFCTR_L1D_CACHE 0x04 -#define ARMV8_PMUV3_PERFCTR_BR_MIS_PRED 0x10 -#define ARMV8_PMUV3_PERFCTR_CPU_CYCLES 0x11 -#define ARMV8_PMUV3_PERFCTR_BR_PRED 0x12 - /* At least one of the following is required. */ #define ARMV8_PMUV3_PERFCTR_INST_RETIRED 0x08 #define ARMV8_PMUV3_PERFCTR_INST_SPEC 0x1B diff --git a/arch/arm64/kernel/probes/Makefile b/arch/arm64/kernel/probes/Makefile index ce06312e3d34..89b6df613dde 100644 --- a/arch/arm64/kernel/probes/Makefile +++ b/arch/arm64/kernel/probes/Makefile @@ -1,3 +1,5 @@ obj-$(CONFIG_KPROBES) += kprobes.o decode-insn.o \ kprobes_trampoline.o \ simulate-insn.o +obj-$(CONFIG_UPROBES) += uprobes.o decode-insn.o \ + simulate-insn.o diff --git a/arch/arm64/kernel/probes/decode-insn.c b/arch/arm64/kernel/probes/decode-insn.c index d1731bf977ef..6bf6657a5a52 100644 --- a/arch/arm64/kernel/probes/decode-insn.c +++ b/arch/arm64/kernel/probes/decode-insn.c @@ -17,7 +17,6 @@ #include <linux/kprobes.h> #include <linux/module.h> #include <linux/kallsyms.h> -#include <asm/kprobes.h> #include <asm/insn.h> #include <asm/sections.h> @@ -78,8 +77,8 @@ static bool __kprobes aarch64_insn_is_steppable(u32 insn) * INSN_GOOD If instruction is supported and uses instruction slot, * INSN_GOOD_NO_SLOT If instruction is supported but doesn't use its slot. */ -static enum kprobe_insn __kprobes -arm_probe_decode_insn(kprobe_opcode_t insn, struct arch_specific_insn *asi) +enum probe_insn __kprobes +arm_probe_decode_insn(probe_opcode_t insn, struct arch_probe_insn *api) { /* * Instructions reading or modifying the PC won't work from the XOL @@ -89,26 +88,26 @@ arm_probe_decode_insn(kprobe_opcode_t insn, struct arch_specific_insn *asi) return INSN_GOOD; if (aarch64_insn_is_bcond(insn)) { - asi->handler = simulate_b_cond; + api->handler = simulate_b_cond; } else if (aarch64_insn_is_cbz(insn) || aarch64_insn_is_cbnz(insn)) { - asi->handler = simulate_cbz_cbnz; + api->handler = simulate_cbz_cbnz; } else if (aarch64_insn_is_tbz(insn) || aarch64_insn_is_tbnz(insn)) { - asi->handler = simulate_tbz_tbnz; + api->handler = simulate_tbz_tbnz; } else if (aarch64_insn_is_adr_adrp(insn)) { - asi->handler = simulate_adr_adrp; + api->handler = simulate_adr_adrp; } else if (aarch64_insn_is_b(insn) || aarch64_insn_is_bl(insn)) { - asi->handler = simulate_b_bl; + api->handler = simulate_b_bl; } else if (aarch64_insn_is_br(insn) || aarch64_insn_is_blr(insn) || aarch64_insn_is_ret(insn)) { - asi->handler = simulate_br_blr_ret; + api->handler = simulate_br_blr_ret; } else if (aarch64_insn_is_ldr_lit(insn)) { - asi->handler = simulate_ldr_literal; + api->handler = simulate_ldr_literal; } else if (aarch64_insn_is_ldrsw_lit(insn)) { - asi->handler = simulate_ldrsw_literal; + api->handler = simulate_ldrsw_literal; } else { /* * Instruction cannot be stepped out-of-line and we don't @@ -120,6 +119,7 @@ arm_probe_decode_insn(kprobe_opcode_t insn, struct arch_specific_insn *asi) return INSN_GOOD_NO_SLOT; } +#ifdef CONFIG_KPROBES static bool __kprobes is_probed_address_atomic(kprobe_opcode_t *scan_start, kprobe_opcode_t *scan_end) { @@ -138,12 +138,12 @@ is_probed_address_atomic(kprobe_opcode_t *scan_start, kprobe_opcode_t *scan_end) return false; } -enum kprobe_insn __kprobes +enum probe_insn __kprobes arm_kprobe_decode_insn(kprobe_opcode_t *addr, struct arch_specific_insn *asi) { - enum kprobe_insn decoded; - kprobe_opcode_t insn = le32_to_cpu(*addr); - kprobe_opcode_t *scan_end = NULL; + enum probe_insn decoded; + probe_opcode_t insn = le32_to_cpu(*addr); + probe_opcode_t *scan_end = NULL; unsigned long size = 0, offset = 0; /* @@ -162,7 +162,7 @@ arm_kprobe_decode_insn(kprobe_opcode_t *addr, struct arch_specific_insn *asi) else scan_end = addr - MAX_ATOMIC_CONTEXT_SIZE; } - decoded = arm_probe_decode_insn(insn, asi); + decoded = arm_probe_decode_insn(insn, &asi->api); if (decoded != INSN_REJECTED && scan_end) if (is_probed_address_atomic(addr - 1, scan_end)) @@ -170,3 +170,4 @@ arm_kprobe_decode_insn(kprobe_opcode_t *addr, struct arch_specific_insn *asi) return decoded; } +#endif diff --git a/arch/arm64/kernel/probes/decode-insn.h b/arch/arm64/kernel/probes/decode-insn.h index d438289646a6..76d3f315407f 100644 --- a/arch/arm64/kernel/probes/decode-insn.h +++ b/arch/arm64/kernel/probes/decode-insn.h @@ -23,13 +23,17 @@ */ #define MAX_ATOMIC_CONTEXT_SIZE (128 / sizeof(kprobe_opcode_t)) -enum kprobe_insn { +enum probe_insn { INSN_REJECTED, INSN_GOOD_NO_SLOT, INSN_GOOD, }; -enum kprobe_insn __kprobes +#ifdef CONFIG_KPROBES +enum probe_insn __kprobes arm_kprobe_decode_insn(kprobe_opcode_t *addr, struct arch_specific_insn *asi); +#endif +enum probe_insn __kprobes +arm_probe_decode_insn(probe_opcode_t insn, struct arch_probe_insn *asi); #endif /* _ARM_KERNEL_KPROBES_ARM64_H */ diff --git a/arch/arm64/kernel/probes/kprobes.c b/arch/arm64/kernel/probes/kprobes.c index f5077ea7af6d..1decd2b2c730 100644 --- a/arch/arm64/kernel/probes/kprobes.c +++ b/arch/arm64/kernel/probes/kprobes.c @@ -44,31 +44,31 @@ post_kprobe_handler(struct kprobe_ctlblk *, struct pt_regs *); static void __kprobes arch_prepare_ss_slot(struct kprobe *p) { /* prepare insn slot */ - p->ainsn.insn[0] = cpu_to_le32(p->opcode); + p->ainsn.api.insn[0] = cpu_to_le32(p->opcode); - flush_icache_range((uintptr_t) (p->ainsn.insn), - (uintptr_t) (p->ainsn.insn) + + flush_icache_range((uintptr_t) (p->ainsn.api.insn), + (uintptr_t) (p->ainsn.api.insn) + MAX_INSN_SIZE * sizeof(kprobe_opcode_t)); /* * Needs restoring of return address after stepping xol. */ - p->ainsn.restore = (unsigned long) p->addr + + p->ainsn.api.restore = (unsigned long) p->addr + sizeof(kprobe_opcode_t); } static void __kprobes arch_prepare_simulate(struct kprobe *p) { /* This instructions is not executed xol. No need to adjust the PC */ - p->ainsn.restore = 0; + p->ainsn.api.restore = 0; } static void __kprobes arch_simulate_insn(struct kprobe *p, struct pt_regs *regs) { struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); - if (p->ainsn.handler) - p->ainsn.handler((u32)p->opcode, (long)p->addr, regs); + if (p->ainsn.api.handler) + p->ainsn.api.handler((u32)p->opcode, (long)p->addr, regs); /* single step simulated, now go for post processing */ post_kprobe_handler(kcb, regs); @@ -98,18 +98,18 @@ int __kprobes arch_prepare_kprobe(struct kprobe *p) return -EINVAL; case INSN_GOOD_NO_SLOT: /* insn need simulation */ - p->ainsn.insn = NULL; + p->ainsn.api.insn = NULL; break; case INSN_GOOD: /* instruction uses slot */ - p->ainsn.insn = get_insn_slot(); - if (!p->ainsn.insn) + p->ainsn.api.insn = get_insn_slot(); + if (!p->ainsn.api.insn) return -ENOMEM; break; }; /* prepare the instruction */ - if (p->ainsn.insn) + if (p->ainsn.api.insn) arch_prepare_ss_slot(p); else arch_prepare_simulate(p); @@ -142,9 +142,9 @@ void __kprobes arch_disarm_kprobe(struct kprobe *p) void __kprobes arch_remove_kprobe(struct kprobe *p) { - if (p->ainsn.insn) { - free_insn_slot(p->ainsn.insn, 0); - p->ainsn.insn = NULL; + if (p->ainsn.api.insn) { + free_insn_slot(p->ainsn.api.insn, 0); + p->ainsn.api.insn = NULL; } } @@ -244,9 +244,9 @@ static void __kprobes setup_singlestep(struct kprobe *p, } - if (p->ainsn.insn) { + if (p->ainsn.api.insn) { /* prepare for single stepping */ - slot = (unsigned long)p->ainsn.insn; + slot = (unsigned long)p->ainsn.api.insn; set_ss_context(kcb, slot); /* mark pending ss */ @@ -295,8 +295,8 @@ post_kprobe_handler(struct kprobe_ctlblk *kcb, struct pt_regs *regs) return; /* return addr restore if non-branching insn */ - if (cur->ainsn.restore != 0) - instruction_pointer_set(regs, cur->ainsn.restore); + if (cur->ainsn.api.restore != 0) + instruction_pointer_set(regs, cur->ainsn.api.restore); /* restore back original saved kprobe variables and continue */ if (kcb->kprobe_status == KPROBE_REENTER) { diff --git a/arch/arm64/kernel/probes/simulate-insn.c b/arch/arm64/kernel/probes/simulate-insn.c index 8977ce9d009d..357d3efe1366 100644 --- a/arch/arm64/kernel/probes/simulate-insn.c +++ b/arch/arm64/kernel/probes/simulate-insn.c @@ -13,28 +13,26 @@ * General Public License for more details. */ +#include <linux/bitops.h> #include <linux/kernel.h> #include <linux/kprobes.h> #include "simulate-insn.h" -#define sign_extend(x, signbit) \ - ((x) | (0 - ((x) & (1 << (signbit))))) - #define bbl_displacement(insn) \ - sign_extend(((insn) & 0x3ffffff) << 2, 27) + sign_extend32(((insn) & 0x3ffffff) << 2, 27) #define bcond_displacement(insn) \ - sign_extend(((insn >> 5) & 0x7ffff) << 2, 20) + sign_extend32(((insn >> 5) & 0x7ffff) << 2, 20) #define cbz_displacement(insn) \ - sign_extend(((insn >> 5) & 0x7ffff) << 2, 20) + sign_extend32(((insn >> 5) & 0x7ffff) << 2, 20) #define tbz_displacement(insn) \ - sign_extend(((insn >> 5) & 0x3fff) << 2, 15) + sign_extend32(((insn >> 5) & 0x3fff) << 2, 15) #define ldr_displacement(insn) \ - sign_extend(((insn >> 5) & 0x7ffff) << 2, 20) + sign_extend32(((insn >> 5) & 0x7ffff) << 2, 20) static inline void set_x_reg(struct pt_regs *regs, int reg, u64 val) { @@ -106,7 +104,7 @@ simulate_adr_adrp(u32 opcode, long addr, struct pt_regs *regs) xn = opcode & 0x1f; imm = ((opcode >> 3) & 0x1ffffc) | ((opcode >> 29) & 0x3); - imm = sign_extend(imm, 20); + imm = sign_extend64(imm, 20); if (opcode & 0x80000000) val = (imm<<12) + (addr & 0xfffffffffffff000); else diff --git a/arch/arm64/kernel/probes/uprobes.c b/arch/arm64/kernel/probes/uprobes.c new file mode 100644 index 000000000000..26c998534dca --- /dev/null +++ b/arch/arm64/kernel/probes/uprobes.c @@ -0,0 +1,216 @@ +/* + * Copyright (C) 2014-2016 Pratyush Anand <panand@redhat.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ +#include <linux/highmem.h> +#include <linux/ptrace.h> +#include <linux/uprobes.h> +#include <asm/cacheflush.h> + +#include "decode-insn.h" + +#define UPROBE_INV_FAULT_CODE UINT_MAX + +void arch_uprobe_copy_ixol(struct page *page, unsigned long vaddr, + void *src, unsigned long len) +{ + void *xol_page_kaddr = kmap_atomic(page); + void *dst = xol_page_kaddr + (vaddr & ~PAGE_MASK); + + /* Initialize the slot */ + memcpy(dst, src, len); + + /* flush caches (dcache/icache) */ + sync_icache_aliases(dst, len); + + kunmap_atomic(xol_page_kaddr); +} + +unsigned long uprobe_get_swbp_addr(struct pt_regs *regs) +{ + return instruction_pointer(regs); +} + +int arch_uprobe_analyze_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, + unsigned long addr) +{ + probe_opcode_t insn; + + /* TODO: Currently we do not support AARCH32 instruction probing */ + if (test_bit(TIF_32BIT, &mm->context.flags)) + return -ENOTSUPP; + else if (!IS_ALIGNED(addr, AARCH64_INSN_SIZE)) + return -EINVAL; + + insn = *(probe_opcode_t *)(&auprobe->insn[0]); + + switch (arm_probe_decode_insn(insn, &auprobe->api)) { + case INSN_REJECTED: + return -EINVAL; + + case INSN_GOOD_NO_SLOT: + auprobe->simulate = true; + break; + + default: + break; + } + + return 0; +} + +int arch_uprobe_pre_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) +{ + struct uprobe_task *utask = current->utask; + + /* Initialize with an invalid fault code to detect if ol insn trapped */ + current->thread.fault_code = UPROBE_INV_FAULT_CODE; + + /* Instruction points to execute ol */ + instruction_pointer_set(regs, utask->xol_vaddr); + + user_enable_single_step(current); + + return 0; +} + +int arch_uprobe_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) +{ + struct uprobe_task *utask = current->utask; + + WARN_ON_ONCE(current->thread.fault_code != UPROBE_INV_FAULT_CODE); + + /* Instruction points to execute next to breakpoint address */ + instruction_pointer_set(regs, utask->vaddr + 4); + + user_disable_single_step(current); + + return 0; +} +bool arch_uprobe_xol_was_trapped(struct task_struct *t) +{ + /* + * Between arch_uprobe_pre_xol and arch_uprobe_post_xol, if an xol + * insn itself is trapped, then detect the case with the help of + * invalid fault code which is being set in arch_uprobe_pre_xol + */ + if (t->thread.fault_code != UPROBE_INV_FAULT_CODE) + return true; + + return false; +} + +bool arch_uprobe_skip_sstep(struct arch_uprobe *auprobe, struct pt_regs *regs) +{ + probe_opcode_t insn; + unsigned long addr; + + if (!auprobe->simulate) + return false; + + insn = *(probe_opcode_t *)(&auprobe->insn[0]); + addr = instruction_pointer(regs); + + if (auprobe->api.handler) + auprobe->api.handler(insn, addr, regs); + + return true; +} + +void arch_uprobe_abort_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) +{ + struct uprobe_task *utask = current->utask; + + /* + * Task has received a fatal signal, so reset back to probbed + * address. + */ + instruction_pointer_set(regs, utask->vaddr); + + user_disable_single_step(current); +} + +bool arch_uretprobe_is_alive(struct return_instance *ret, enum rp_check ctx, + struct pt_regs *regs) +{ + /* + * If a simple branch instruction (B) was called for retprobed + * assembly label then return true even when regs->sp and ret->stack + * are same. It will ensure that cleanup and reporting of return + * instances corresponding to callee label is done when + * handle_trampoline for called function is executed. + */ + if (ctx == RP_CHECK_CHAIN_CALL) + return regs->sp <= ret->stack; + else + return regs->sp < ret->stack; +} + +unsigned long +arch_uretprobe_hijack_return_addr(unsigned long trampoline_vaddr, + struct pt_regs *regs) +{ + unsigned long orig_ret_vaddr; + + orig_ret_vaddr = procedure_link_pointer(regs); + /* Replace the return addr with trampoline addr */ + procedure_link_pointer_set(regs, trampoline_vaddr); + + return orig_ret_vaddr; +} + +int arch_uprobe_exception_notify(struct notifier_block *self, + unsigned long val, void *data) +{ + return NOTIFY_DONE; +} + +static int uprobe_breakpoint_handler(struct pt_regs *regs, + unsigned int esr) +{ + if (user_mode(regs) && uprobe_pre_sstep_notifier(regs)) + return DBG_HOOK_HANDLED; + + return DBG_HOOK_ERROR; +} + +static int uprobe_single_step_handler(struct pt_regs *regs, + unsigned int esr) +{ + struct uprobe_task *utask = current->utask; + + if (user_mode(regs)) { + WARN_ON(utask && + (instruction_pointer(regs) != utask->xol_vaddr + 4)); + + if (uprobe_post_sstep_notifier(regs)) + return DBG_HOOK_HANDLED; + } + + return DBG_HOOK_ERROR; +} + +/* uprobe breakpoint handler hook */ +static struct break_hook uprobes_break_hook = { + .esr_mask = BRK64_ESR_MASK, + .esr_val = BRK64_ESR_UPROBES, + .fn = uprobe_breakpoint_handler, +}; + +/* uprobe single step handler hook */ +static struct step_hook uprobes_step_hook = { + .fn = uprobe_single_step_handler, +}; + +static int __init arch_init_uprobes(void) +{ + register_break_hook(&uprobes_break_hook); + register_step_hook(&uprobes_step_hook); + + return 0; +} + +device_initcall(arch_init_uprobes); diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c index 01753cd7d3f0..a3a2816ba73a 100644 --- a/arch/arm64/kernel/process.c +++ b/arch/arm64/kernel/process.c @@ -45,6 +45,7 @@ #include <linux/personality.h> #include <linux/notifier.h> #include <trace/events/power.h> +#include <linux/percpu.h> #include <asm/alternative.h> #include <asm/compat.h> @@ -282,7 +283,7 @@ int copy_thread(unsigned long clone_flags, unsigned long stack_start, memset(childregs, 0, sizeof(struct pt_regs)); childregs->pstate = PSR_MODE_EL1h; if (IS_ENABLED(CONFIG_ARM64_UAO) && - cpus_have_cap(ARM64_HAS_UAO)) + cpus_have_const_cap(ARM64_HAS_UAO)) childregs->pstate |= PSR_UAO_BIT; p->thread.cpu_context.x19 = stack_start; p->thread.cpu_context.x20 = stk_sz; @@ -322,6 +323,20 @@ void uao_thread_switch(struct task_struct *next) } /* + * We store our current task in sp_el0, which is clobbered by userspace. Keep a + * shadow copy so that we can restore this upon entry from userspace. + * + * This is *only* for exception entry from EL0, and is not valid until we + * __switch_to() a user task. + */ +DEFINE_PER_CPU(struct task_struct *, __entry_task); + +static void entry_task_switch(struct task_struct *next) +{ + __this_cpu_write(__entry_task, next); +} + +/* * Thread switching. */ struct task_struct *__switch_to(struct task_struct *prev, @@ -333,6 +348,7 @@ struct task_struct *__switch_to(struct task_struct *prev, tls_thread_switch(next); hw_breakpoint_thread_switch(next); contextidr_thread_switch(next); + entry_task_switch(next); uao_thread_switch(next); /* @@ -350,27 +366,35 @@ struct task_struct *__switch_to(struct task_struct *prev, unsigned long get_wchan(struct task_struct *p) { struct stackframe frame; - unsigned long stack_page; + unsigned long stack_page, ret = 0; int count = 0; if (!p || p == current || p->state == TASK_RUNNING) return 0; + stack_page = (unsigned long)try_get_task_stack(p); + if (!stack_page) + return 0; + frame.fp = thread_saved_fp(p); frame.sp = thread_saved_sp(p); frame.pc = thread_saved_pc(p); #ifdef CONFIG_FUNCTION_GRAPH_TRACER frame.graph = p->curr_ret_stack; #endif - stack_page = (unsigned long)task_stack_page(p); do { if (frame.sp < stack_page || frame.sp >= stack_page + THREAD_SIZE || unwind_frame(p, &frame)) - return 0; - if (!in_sched_functions(frame.pc)) - return frame.pc; + goto out; + if (!in_sched_functions(frame.pc)) { + ret = frame.pc; + goto out; + } } while (count ++ < 16); - return 0; + +out: + put_task_stack(p); + return ret; } unsigned long arch_align_stack(unsigned long sp) diff --git a/arch/arm64/kernel/ptrace.c b/arch/arm64/kernel/ptrace.c index e0c81da60f76..fc35e06ccaac 100644 --- a/arch/arm64/kernel/ptrace.c +++ b/arch/arm64/kernel/ptrace.c @@ -327,13 +327,13 @@ static int ptrace_hbp_fill_attr_ctrl(unsigned int note_type, struct arch_hw_breakpoint_ctrl ctrl, struct perf_event_attr *attr) { - int err, len, type, disabled = !ctrl.enabled; + int err, len, type, offset, disabled = !ctrl.enabled; attr->disabled = disabled; if (disabled) return 0; - err = arch_bp_generic_fields(ctrl, &len, &type); + err = arch_bp_generic_fields(ctrl, &len, &type, &offset); if (err) return err; @@ -352,6 +352,7 @@ static int ptrace_hbp_fill_attr_ctrl(unsigned int note_type, attr->bp_len = len; attr->bp_type = type; + attr->bp_addr += offset; return 0; } @@ -404,7 +405,7 @@ static int ptrace_hbp_get_addr(unsigned int note_type, if (IS_ERR(bp)) return PTR_ERR(bp); - *addr = bp ? bp->attr.bp_addr : 0; + *addr = bp ? counter_arch_bp(bp)->address : 0; return 0; } diff --git a/arch/arm64/kernel/return_address.c b/arch/arm64/kernel/return_address.c index 1718706fde83..12a87f2600f2 100644 --- a/arch/arm64/kernel/return_address.c +++ b/arch/arm64/kernel/return_address.c @@ -12,6 +12,7 @@ #include <linux/export.h> #include <linux/ftrace.h> +#include <asm/stack_pointer.h> #include <asm/stacktrace.h> struct return_address_data { diff --git a/arch/arm64/kernel/setup.c b/arch/arm64/kernel/setup.c index f534f492a268..a53f52ac81c6 100644 --- a/arch/arm64/kernel/setup.c +++ b/arch/arm64/kernel/setup.c @@ -291,6 +291,15 @@ void __init setup_arch(char **cmdline_p) smp_init_cpus(); smp_build_mpidr_hash(); +#ifdef CONFIG_ARM64_SW_TTBR0_PAN + /* + * Make sure init_thread_info.ttbr0 always generates translation + * faults in case uaccess_enable() is inadvertently called by the init + * thread. + */ + init_task.thread_info.ttbr0 = virt_to_phys(empty_zero_page); +#endif + #ifdef CONFIG_VT #if defined(CONFIG_VGA_CONSOLE) conswitchp = &vga_con; diff --git a/arch/arm64/kernel/signal.c b/arch/arm64/kernel/signal.c index 404dd67080b9..c7b6de62f9d3 100644 --- a/arch/arm64/kernel/signal.c +++ b/arch/arm64/kernel/signal.c @@ -414,6 +414,9 @@ asmlinkage void do_notify_resume(struct pt_regs *regs, } else { local_irq_enable(); + if (thread_flags & _TIF_UPROBE) + uprobe_notify_resume(regs); + if (thread_flags & _TIF_SIGPENDING) do_signal(regs); diff --git a/arch/arm64/kernel/sleep.S b/arch/arm64/kernel/sleep.S index 1bec41b5fda3..df67652e46f0 100644 --- a/arch/arm64/kernel/sleep.S +++ b/arch/arm64/kernel/sleep.S @@ -125,9 +125,6 @@ ENTRY(_cpu_resume) /* load sp from context */ ldr x2, [x0, #CPU_CTX_SP] mov sp, x2 - /* save thread_info */ - and x2, x2, #~(THREAD_SIZE - 1) - msr sp_el0, x2 /* * cpu_do_resume expects x0 to contain context address pointer */ diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c index 8507703dabe4..cb87234cfcf2 100644 --- a/arch/arm64/kernel/smp.c +++ b/arch/arm64/kernel/smp.c @@ -58,6 +58,9 @@ #define CREATE_TRACE_POINTS #include <trace/events/ipi.h> +DEFINE_PER_CPU_READ_MOSTLY(int, cpu_number); +EXPORT_PER_CPU_SYMBOL(cpu_number); + /* * as from 2.5, kernels no longer have an init_tasks structure * so we need some other way of telling a new secondary core @@ -146,6 +149,7 @@ int __cpu_up(unsigned int cpu, struct task_struct *idle) * We need to tell the secondary core where to find its stack and the * page tables. */ + secondary_data.task = idle; secondary_data.stack = task_stack_page(idle) + THREAD_START_SP; update_cpu_boot_status(CPU_MMU_OFF); __flush_dcache_area(&secondary_data, sizeof(secondary_data)); @@ -170,6 +174,7 @@ int __cpu_up(unsigned int cpu, struct task_struct *idle) pr_err("CPU%u: failed to boot: %d\n", cpu, ret); } + secondary_data.task = NULL; secondary_data.stack = NULL; status = READ_ONCE(secondary_data.status); if (ret && status) { @@ -208,7 +213,10 @@ int __cpu_up(unsigned int cpu, struct task_struct *idle) asmlinkage void secondary_start_kernel(void) { struct mm_struct *mm = &init_mm; - unsigned int cpu = smp_processor_id(); + unsigned int cpu; + + cpu = task_cpu(current); + set_my_cpu_offset(per_cpu_offset(cpu)); /* * All kernel threads share the same mm context; grab a @@ -217,8 +225,6 @@ asmlinkage void secondary_start_kernel(void) atomic_inc(&mm->mm_count); current->active_mm = mm; - set_my_cpu_offset(per_cpu_offset(smp_processor_id())); - /* * TTBR0 is only used for the identity mapping at this stage. Make it * point to zero page to avoid speculatively fetching new entries. @@ -718,6 +724,8 @@ void __init smp_prepare_cpus(unsigned int max_cpus) */ for_each_possible_cpu(cpu) { + per_cpu(cpu_number, cpu) = cpu; + if (cpu == smp_processor_id()) continue; diff --git a/arch/arm64/kernel/stacktrace.c b/arch/arm64/kernel/stacktrace.c index c2efddfca18c..8a552a33c6ef 100644 --- a/arch/arm64/kernel/stacktrace.c +++ b/arch/arm64/kernel/stacktrace.c @@ -22,6 +22,7 @@ #include <linux/stacktrace.h> #include <asm/irq.h> +#include <asm/stack_pointer.h> #include <asm/stacktrace.h> /* @@ -128,7 +129,6 @@ void notrace walk_stackframe(struct task_struct *tsk, struct stackframe *frame, break; } } -EXPORT_SYMBOL(walk_stackframe); #ifdef CONFIG_STACKTRACE struct stack_trace_data { @@ -181,6 +181,9 @@ void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace) struct stack_trace_data data; struct stackframe frame; + if (!try_get_task_stack(tsk)) + return; + data.trace = trace; data.skip = trace->skip; @@ -202,6 +205,8 @@ void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace) walk_stackframe(tsk, &frame, save_trace, &data); if (trace->nr_entries < trace->max_entries) trace->entries[trace->nr_entries++] = ULONG_MAX; + + put_task_stack(tsk); } void save_stack_trace(struct stack_trace *trace) diff --git a/arch/arm64/kernel/suspend.c b/arch/arm64/kernel/suspend.c index bb0cd787a9d3..1e3be9064cfa 100644 --- a/arch/arm64/kernel/suspend.c +++ b/arch/arm64/kernel/suspend.c @@ -47,12 +47,6 @@ void notrace __cpu_suspend_exit(void) cpu_uninstall_idmap(); /* - * Restore per-cpu offset before any kernel - * subsystem relying on it has a chance to run. - */ - set_my_cpu_offset(per_cpu_offset(cpu)); - - /* * PSTATE was not saved over suspend/resume, re-enable any detected * features that might not have been set correctly. */ diff --git a/arch/arm64/kernel/topology.c b/arch/arm64/kernel/topology.c index 694f6deedbab..23e9e13bd2aa 100644 --- a/arch/arm64/kernel/topology.c +++ b/arch/arm64/kernel/topology.c @@ -19,10 +19,226 @@ #include <linux/nodemask.h> #include <linux/of.h> #include <linux/sched.h> +#include <linux/slab.h> +#include <linux/string.h> +#include <linux/cpufreq.h> +#include <asm/cpu.h> #include <asm/cputype.h> #include <asm/topology.h> +static DEFINE_PER_CPU(unsigned long, cpu_scale) = SCHED_CAPACITY_SCALE; +static DEFINE_MUTEX(cpu_scale_mutex); + +unsigned long arch_scale_cpu_capacity(struct sched_domain *sd, int cpu) +{ + return per_cpu(cpu_scale, cpu); +} + +static void set_capacity_scale(unsigned int cpu, unsigned long capacity) +{ + per_cpu(cpu_scale, cpu) = capacity; +} + +#ifdef CONFIG_PROC_SYSCTL +static ssize_t cpu_capacity_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct cpu *cpu = container_of(dev, struct cpu, dev); + + return sprintf(buf, "%lu\n", + arch_scale_cpu_capacity(NULL, cpu->dev.id)); +} + +static ssize_t cpu_capacity_store(struct device *dev, + struct device_attribute *attr, + const char *buf, + size_t count) +{ + struct cpu *cpu = container_of(dev, struct cpu, dev); + int this_cpu = cpu->dev.id, i; + unsigned long new_capacity; + ssize_t ret; + + if (count) { + ret = kstrtoul(buf, 0, &new_capacity); + if (ret) + return ret; + if (new_capacity > SCHED_CAPACITY_SCALE) + return -EINVAL; + + mutex_lock(&cpu_scale_mutex); + for_each_cpu(i, &cpu_topology[this_cpu].core_sibling) + set_capacity_scale(i, new_capacity); + mutex_unlock(&cpu_scale_mutex); + } + + return count; +} + +static DEVICE_ATTR_RW(cpu_capacity); + +static int register_cpu_capacity_sysctl(void) +{ + int i; + struct device *cpu; + + for_each_possible_cpu(i) { + cpu = get_cpu_device(i); + if (!cpu) { + pr_err("%s: too early to get CPU%d device!\n", + __func__, i); + continue; + } + device_create_file(cpu, &dev_attr_cpu_capacity); + } + + return 0; +} +subsys_initcall(register_cpu_capacity_sysctl); +#endif + +static u32 capacity_scale; +static u32 *raw_capacity; +static bool cap_parsing_failed; + +static void __init parse_cpu_capacity(struct device_node *cpu_node, int cpu) +{ + int ret; + u32 cpu_capacity; + + if (cap_parsing_failed) + return; + + ret = of_property_read_u32(cpu_node, + "capacity-dmips-mhz", + &cpu_capacity); + if (!ret) { + if (!raw_capacity) { + raw_capacity = kcalloc(num_possible_cpus(), + sizeof(*raw_capacity), + GFP_KERNEL); + if (!raw_capacity) { + pr_err("cpu_capacity: failed to allocate memory for raw capacities\n"); + cap_parsing_failed = true; + return; + } + } + capacity_scale = max(cpu_capacity, capacity_scale); + raw_capacity[cpu] = cpu_capacity; + pr_debug("cpu_capacity: %s cpu_capacity=%u (raw)\n", + cpu_node->full_name, raw_capacity[cpu]); + } else { + if (raw_capacity) { + pr_err("cpu_capacity: missing %s raw capacity\n", + cpu_node->full_name); + pr_err("cpu_capacity: partial information: fallback to 1024 for all CPUs\n"); + } + cap_parsing_failed = true; + kfree(raw_capacity); + } +} + +static void normalize_cpu_capacity(void) +{ + u64 capacity; + int cpu; + + if (!raw_capacity || cap_parsing_failed) + return; + + pr_debug("cpu_capacity: capacity_scale=%u\n", capacity_scale); + mutex_lock(&cpu_scale_mutex); + for_each_possible_cpu(cpu) { + pr_debug("cpu_capacity: cpu=%d raw_capacity=%u\n", + cpu, raw_capacity[cpu]); + capacity = (raw_capacity[cpu] << SCHED_CAPACITY_SHIFT) + / capacity_scale; + set_capacity_scale(cpu, capacity); + pr_debug("cpu_capacity: CPU%d cpu_capacity=%lu\n", + cpu, arch_scale_cpu_capacity(NULL, cpu)); + } + mutex_unlock(&cpu_scale_mutex); +} + +#ifdef CONFIG_CPU_FREQ +static cpumask_var_t cpus_to_visit; +static bool cap_parsing_done; +static void parsing_done_workfn(struct work_struct *work); +static DECLARE_WORK(parsing_done_work, parsing_done_workfn); + +static int +init_cpu_capacity_callback(struct notifier_block *nb, + unsigned long val, + void *data) +{ + struct cpufreq_policy *policy = data; + int cpu; + + if (cap_parsing_failed || cap_parsing_done) + return 0; + + switch (val) { + case CPUFREQ_NOTIFY: + pr_debug("cpu_capacity: init cpu capacity for CPUs [%*pbl] (to_visit=%*pbl)\n", + cpumask_pr_args(policy->related_cpus), + cpumask_pr_args(cpus_to_visit)); + cpumask_andnot(cpus_to_visit, + cpus_to_visit, + policy->related_cpus); + for_each_cpu(cpu, policy->related_cpus) { + raw_capacity[cpu] = arch_scale_cpu_capacity(NULL, cpu) * + policy->cpuinfo.max_freq / 1000UL; + capacity_scale = max(raw_capacity[cpu], capacity_scale); + } + if (cpumask_empty(cpus_to_visit)) { + normalize_cpu_capacity(); + kfree(raw_capacity); + pr_debug("cpu_capacity: parsing done\n"); + cap_parsing_done = true; + schedule_work(&parsing_done_work); + } + } + return 0; +} + +static struct notifier_block init_cpu_capacity_notifier = { + .notifier_call = init_cpu_capacity_callback, +}; + +static int __init register_cpufreq_notifier(void) +{ + if (cap_parsing_failed) + return -EINVAL; + + if (!alloc_cpumask_var(&cpus_to_visit, GFP_KERNEL)) { + pr_err("cpu_capacity: failed to allocate memory for cpus_to_visit\n"); + return -ENOMEM; + } + cpumask_copy(cpus_to_visit, cpu_possible_mask); + + return cpufreq_register_notifier(&init_cpu_capacity_notifier, + CPUFREQ_POLICY_NOTIFIER); +} +core_initcall(register_cpufreq_notifier); + +static void parsing_done_workfn(struct work_struct *work) +{ + cpufreq_unregister_notifier(&init_cpu_capacity_notifier, + CPUFREQ_POLICY_NOTIFIER); +} + +#else +static int __init free_raw_capacity(void) +{ + kfree(raw_capacity); + + return 0; +} +core_initcall(free_raw_capacity); +#endif + static int __init get_cpu_for_node(struct device_node *node) { struct device_node *cpu_node; @@ -34,6 +250,7 @@ static int __init get_cpu_for_node(struct device_node *node) for_each_possible_cpu(cpu) { if (of_get_cpu_node(cpu, NULL) == cpu_node) { + parse_cpu_capacity(cpu_node, cpu); of_node_put(cpu_node); return cpu; } @@ -178,13 +395,17 @@ static int __init parse_dt_topology(void) * cluster with restricted subnodes. */ map = of_get_child_by_name(cn, "cpu-map"); - if (!map) + if (!map) { + cap_parsing_failed = true; goto out; + } ret = parse_cluster(map, 0); if (ret != 0) goto out_map; + normalize_cpu_capacity(); + /* * Check that all cores are in the topology; the SMP code will * only mark cores described in the DT as possible. diff --git a/arch/arm64/kernel/traps.c b/arch/arm64/kernel/traps.c index c9986b3e0a96..5b830be79c01 100644 --- a/arch/arm64/kernel/traps.c +++ b/arch/arm64/kernel/traps.c @@ -38,6 +38,7 @@ #include <asm/esr.h> #include <asm/insn.h> #include <asm/traps.h> +#include <asm/stack_pointer.h> #include <asm/stacktrace.h> #include <asm/exception.h> #include <asm/system_misc.h> @@ -147,6 +148,9 @@ static void dump_backtrace(struct pt_regs *regs, struct task_struct *tsk) if (!tsk) tsk = current; + if (!try_get_task_stack(tsk)) + return; + /* * Switching between stacks is valid when tracing current and in * non-preemptible context. @@ -212,6 +216,8 @@ static void dump_backtrace(struct pt_regs *regs, struct task_struct *tsk) stack + sizeof(struct pt_regs)); } } + + put_task_stack(tsk); } void show_stack(struct task_struct *tsk, unsigned long *sp) @@ -227,10 +233,9 @@ void show_stack(struct task_struct *tsk, unsigned long *sp) #endif #define S_SMP " SMP" -static int __die(const char *str, int err, struct thread_info *thread, - struct pt_regs *regs) +static int __die(const char *str, int err, struct pt_regs *regs) { - struct task_struct *tsk = thread->task; + struct task_struct *tsk = current; static int die_counter; int ret; @@ -245,7 +250,8 @@ static int __die(const char *str, int err, struct thread_info *thread, print_modules(); __show_regs(regs); pr_emerg("Process %.*s (pid: %d, stack limit = 0x%p)\n", - TASK_COMM_LEN, tsk->comm, task_pid_nr(tsk), thread + 1); + TASK_COMM_LEN, tsk->comm, task_pid_nr(tsk), + end_of_stack(tsk)); if (!user_mode(regs)) { dump_mem(KERN_EMERG, "Stack: ", regs->sp, @@ -264,7 +270,6 @@ static DEFINE_RAW_SPINLOCK(die_lock); */ void die(const char *str, struct pt_regs *regs, int err) { - struct thread_info *thread = current_thread_info(); int ret; oops_enter(); @@ -272,9 +277,9 @@ void die(const char *str, struct pt_regs *regs, int err) raw_spin_lock_irq(&die_lock); console_verbose(); bust_spinlocks(1); - ret = __die(str, err, thread, regs); + ret = __die(str, err, regs); - if (regs && kexec_should_crash(thread->task)) + if (regs && kexec_should_crash(current)) crash_kexec(regs); bust_spinlocks(0); @@ -435,9 +440,10 @@ int cpu_enable_cache_maint_trap(void *__unused) } #define __user_cache_maint(insn, address, res) \ - if (untagged_addr(address) >= user_addr_max()) \ + if (untagged_addr(address) >= user_addr_max()) { \ res = -EFAULT; \ - else \ + } else { \ + uaccess_ttbr0_enable(); \ asm volatile ( \ "1: " insn ", %1\n" \ " mov %w0, #0\n" \ @@ -449,7 +455,9 @@ int cpu_enable_cache_maint_trap(void *__unused) " .popsection\n" \ _ASM_EXTABLE(1b, 3b) \ : "=r" (res) \ - : "r" (address), "i" (-EFAULT) ) + : "r" (address), "i" (-EFAULT)); \ + uaccess_ttbr0_disable(); \ + } static void user_cache_maint_handler(unsigned int esr, struct pt_regs *regs) { diff --git a/arch/arm64/kernel/vmlinux.lds.S b/arch/arm64/kernel/vmlinux.lds.S index 1105aab1e6d6..b8deffa9e1bf 100644 --- a/arch/arm64/kernel/vmlinux.lds.S +++ b/arch/arm64/kernel/vmlinux.lds.S @@ -216,6 +216,11 @@ SECTIONS swapper_pg_dir = .; . += SWAPPER_DIR_SIZE; +#ifdef CONFIG_ARM64_SW_TTBR0_PAN + reserved_ttbr0 = .; + . += RESERVED_TTBR0_SIZE; +#endif + _end = .; STABS_DEBUG diff --git a/arch/arm64/kvm/Kconfig b/arch/arm64/kvm/Kconfig index 6eaf12c1d627..52cb7ad9b2fd 100644 --- a/arch/arm64/kvm/Kconfig +++ b/arch/arm64/kvm/Kconfig @@ -16,9 +16,6 @@ menuconfig VIRTUALIZATION if VIRTUALIZATION -config KVM_ARM_VGIC_V3_ITS - bool - config KVM bool "Kernel-based Virtual Machine (KVM) support" depends on OF @@ -34,7 +31,6 @@ config KVM select KVM_VFIO select HAVE_KVM_EVENTFD select HAVE_KVM_IRQFD - select KVM_ARM_VGIC_V3_ITS select KVM_ARM_PMU if HW_PERF_EVENTS select HAVE_KVM_MSI select HAVE_KVM_IRQCHIP diff --git a/arch/arm64/kvm/handle_exit.c b/arch/arm64/kvm/handle_exit.c index a204adf29f0a..1bfe30dfbfe7 100644 --- a/arch/arm64/kvm/handle_exit.c +++ b/arch/arm64/kvm/handle_exit.c @@ -57,6 +57,16 @@ static int handle_smc(struct kvm_vcpu *vcpu, struct kvm_run *run) return 1; } +/* + * Guest access to FP/ASIMD registers are routed to this handler only + * when the system doesn't support FP/ASIMD. + */ +static int handle_no_fpsimd(struct kvm_vcpu *vcpu, struct kvm_run *run) +{ + kvm_inject_undefined(vcpu); + return 1; +} + /** * kvm_handle_wfx - handle a wait-for-interrupts or wait-for-event * instruction executed by a guest @@ -144,6 +154,7 @@ static exit_handle_fn arm_exit_handlers[] = { [ESR_ELx_EC_BREAKPT_LOW]= kvm_handle_guest_debug, [ESR_ELx_EC_BKPT32] = kvm_handle_guest_debug, [ESR_ELx_EC_BRK64] = kvm_handle_guest_debug, + [ESR_ELx_EC_FP_ASIMD] = handle_no_fpsimd, }; static exit_handle_fn kvm_get_exit_handler(struct kvm_vcpu *vcpu) diff --git a/arch/arm64/kvm/hyp/hyp-entry.S b/arch/arm64/kvm/hyp/hyp-entry.S index 4e92399f7105..5e9052f087f2 100644 --- a/arch/arm64/kvm/hyp/hyp-entry.S +++ b/arch/arm64/kvm/hyp/hyp-entry.S @@ -106,9 +106,16 @@ el1_trap: * x0: ESR_EC */ - /* Guest accessed VFP/SIMD registers, save host, restore Guest */ + /* + * We trap the first access to the FP/SIMD to save the host context + * and restore the guest context lazily. + * If FP/SIMD is not implemented, handle the trap and inject an + * undefined instruction exception to the guest. + */ +alternative_if_not ARM64_HAS_NO_FPSIMD cmp x0, #ESR_ELx_EC_FP_ASIMD b.eq __fpsimd_guest_restore +alternative_else_nop_endif mrs x1, tpidr_el2 mov x0, #ARM_EXCEPTION_TRAP diff --git a/arch/arm64/kvm/hyp/switch.c b/arch/arm64/kvm/hyp/switch.c index 83037cd62d01..75e83dd40d43 100644 --- a/arch/arm64/kvm/hyp/switch.c +++ b/arch/arm64/kvm/hyp/switch.c @@ -21,6 +21,7 @@ #include <asm/kvm_asm.h> #include <asm/kvm_emulate.h> #include <asm/kvm_hyp.h> +#include <asm/fpsimd.h> static bool __hyp_text __fpsimd_enabled_nvhe(void) { @@ -76,16 +77,24 @@ static void __hyp_text __activate_traps(struct kvm_vcpu *vcpu) * traps are only taken to EL2 if the operation would not otherwise * trap to EL1. Therefore, always make sure that for 32-bit guests, * we set FPEXC.EN to prevent traps to EL1, when setting the TFP bit. + * If FP/ASIMD is not implemented, FPEXC is UNDEFINED and any access to + * it will cause an exception. */ val = vcpu->arch.hcr_el2; - if (!(val & HCR_RW)) { + if (!(val & HCR_RW) && system_supports_fpsimd()) { write_sysreg(1 << 30, fpexc32_el2); isb(); } write_sysreg(val, hcr_el2); /* Trap on AArch32 cp15 c15 accesses (EL1 or EL0) */ write_sysreg(1 << 15, hstr_el2); - /* Make sure we trap PMU access from EL0 to EL2 */ + /* + * Make sure we trap PMU access from EL0 to EL2. Also sanitize + * PMSELR_EL0 to make sure it never contains the cycle + * counter, which could make a PMXEVCNTR_EL0 access UNDEF at + * EL1 instead of being trapped to EL2. + */ + write_sysreg(0, pmselr_el0); write_sysreg(ARMV8_PMU_USERENR_MASK, pmuserenr_el0); write_sysreg(vcpu->arch.mdcr_el2, mdcr_el2); __activate_traps_arch()(); diff --git a/arch/arm64/kvm/hyp/tlb.c b/arch/arm64/kvm/hyp/tlb.c index 9cc0ea784ae6..88e2f2b938f0 100644 --- a/arch/arm64/kvm/hyp/tlb.c +++ b/arch/arm64/kvm/hyp/tlb.c @@ -64,6 +64,21 @@ void __hyp_text __kvm_tlb_flush_vmid(struct kvm *kvm) write_sysreg(0, vttbr_el2); } +void __hyp_text __kvm_tlb_flush_local_vmid(struct kvm_vcpu *vcpu) +{ + struct kvm *kvm = kern_hyp_va(kern_hyp_va(vcpu)->kvm); + + /* Switch to requested VMID */ + write_sysreg(kvm->arch.vttbr, vttbr_el2); + isb(); + + asm volatile("tlbi vmalle1" : : ); + dsb(nsh); + isb(); + + write_sysreg(0, vttbr_el2); +} + void __hyp_text __kvm_flush_vm_context(void) { dsb(ishst); diff --git a/arch/arm64/kvm/reset.c b/arch/arm64/kvm/reset.c index 5bc460884639..e95d4f68bf54 100644 --- a/arch/arm64/kvm/reset.c +++ b/arch/arm64/kvm/reset.c @@ -86,12 +86,6 @@ int kvm_arch_dev_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_VCPU_ATTRIBUTES: r = 1; break; - case KVM_CAP_MSI_DEVID: - if (!kvm) - r = -EINVAL; - else - r = kvm->arch.vgic.msis_require_devid; - break; default: r = 0; } diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index f302fdb3a030..87e7e6608cd8 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -597,8 +597,14 @@ static bool access_pmu_evcntr(struct kvm_vcpu *vcpu, idx = ARMV8_PMU_CYCLE_IDX; } else { - BUG(); + return false; } + } else if (r->CRn == 0 && r->CRm == 9) { + /* PMCCNTR */ + if (pmu_access_event_counter_el0_disabled(vcpu)) + return false; + + idx = ARMV8_PMU_CYCLE_IDX; } else if (r->CRn == 14 && (r->CRm & 12) == 8) { /* PMEVCNTRn_EL0 */ if (pmu_access_event_counter_el0_disabled(vcpu)) @@ -606,7 +612,7 @@ static bool access_pmu_evcntr(struct kvm_vcpu *vcpu, idx = ((r->CRm & 3) << 3) | (r->Op2 & 7); } else { - BUG(); + return false; } if (!pmu_counter_idx_valid(vcpu, idx)) diff --git a/arch/arm64/lib/clear_user.S b/arch/arm64/lib/clear_user.S index 5d1cad3ce6d6..d7150e30438a 100644 --- a/arch/arm64/lib/clear_user.S +++ b/arch/arm64/lib/clear_user.S @@ -17,10 +17,7 @@ */ #include <linux/linkage.h> -#include <asm/alternative.h> -#include <asm/assembler.h> -#include <asm/cpufeature.h> -#include <asm/sysreg.h> +#include <asm/uaccess.h> .text @@ -33,8 +30,7 @@ * Alignment fixed up by hardware. */ ENTRY(__clear_user) -ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(0)), ARM64_ALT_PAN_NOT_UAO, \ - CONFIG_ARM64_PAN) + uaccess_enable_not_uao x2, x3 mov x2, x1 // save the size for fixup return subs x1, x1, #8 b.mi 2f @@ -54,8 +50,7 @@ uao_user_alternative 9f, strh, sttrh, wzr, x0, 2 b.mi 5f uao_user_alternative 9f, strb, sttrb, wzr, x0, 0 5: mov x0, #0 -ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(1)), ARM64_ALT_PAN_NOT_UAO, \ - CONFIG_ARM64_PAN) + uaccess_disable_not_uao x2 ret ENDPROC(__clear_user) diff --git a/arch/arm64/lib/copy_from_user.S b/arch/arm64/lib/copy_from_user.S index 4fd67ea03bb0..cfe13396085b 100644 --- a/arch/arm64/lib/copy_from_user.S +++ b/arch/arm64/lib/copy_from_user.S @@ -16,11 +16,8 @@ #include <linux/linkage.h> -#include <asm/alternative.h> -#include <asm/assembler.h> #include <asm/cache.h> -#include <asm/cpufeature.h> -#include <asm/sysreg.h> +#include <asm/uaccess.h> /* * Copy from user space to a kernel buffer (alignment handled by the hardware) @@ -67,12 +64,10 @@ end .req x5 ENTRY(__arch_copy_from_user) -ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(0)), ARM64_ALT_PAN_NOT_UAO, \ - CONFIG_ARM64_PAN) + uaccess_enable_not_uao x3, x4 add end, x0, x2 #include "copy_template.S" -ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(1)), ARM64_ALT_PAN_NOT_UAO, \ - CONFIG_ARM64_PAN) + uaccess_disable_not_uao x3 mov x0, #0 // Nothing to copy ret ENDPROC(__arch_copy_from_user) diff --git a/arch/arm64/lib/copy_in_user.S b/arch/arm64/lib/copy_in_user.S index f7292dd08c84..718b1c4e2f85 100644 --- a/arch/arm64/lib/copy_in_user.S +++ b/arch/arm64/lib/copy_in_user.S @@ -18,11 +18,8 @@ #include <linux/linkage.h> -#include <asm/alternative.h> -#include <asm/assembler.h> #include <asm/cache.h> -#include <asm/cpufeature.h> -#include <asm/sysreg.h> +#include <asm/uaccess.h> /* * Copy from user space to user space (alignment handled by the hardware) @@ -68,12 +65,10 @@ end .req x5 ENTRY(__copy_in_user) -ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(0)), ARM64_ALT_PAN_NOT_UAO, \ - CONFIG_ARM64_PAN) + uaccess_enable_not_uao x3, x4 add end, x0, x2 #include "copy_template.S" -ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(1)), ARM64_ALT_PAN_NOT_UAO, \ - CONFIG_ARM64_PAN) + uaccess_disable_not_uao x3 mov x0, #0 ret ENDPROC(__copy_in_user) diff --git a/arch/arm64/lib/copy_to_user.S b/arch/arm64/lib/copy_to_user.S index 7a7efe255034..e99e31c9acac 100644 --- a/arch/arm64/lib/copy_to_user.S +++ b/arch/arm64/lib/copy_to_user.S @@ -16,11 +16,8 @@ #include <linux/linkage.h> -#include <asm/alternative.h> -#include <asm/assembler.h> #include <asm/cache.h> -#include <asm/cpufeature.h> -#include <asm/sysreg.h> +#include <asm/uaccess.h> /* * Copy to user space from a kernel buffer (alignment handled by the hardware) @@ -66,12 +63,10 @@ end .req x5 ENTRY(__arch_copy_to_user) -ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(0)), ARM64_ALT_PAN_NOT_UAO, \ - CONFIG_ARM64_PAN) + uaccess_enable_not_uao x3, x4 add end, x0, x2 #include "copy_template.S" -ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(1)), ARM64_ALT_PAN_NOT_UAO, \ - CONFIG_ARM64_PAN) + uaccess_disable_not_uao x3 mov x0, #0 ret ENDPROC(__arch_copy_to_user) diff --git a/arch/arm64/mm/Makefile b/arch/arm64/mm/Makefile index 54bb209cae8e..e703fb9defad 100644 --- a/arch/arm64/mm/Makefile +++ b/arch/arm64/mm/Makefile @@ -3,7 +3,8 @@ obj-y := dma-mapping.o extable.o fault.o init.o \ ioremap.o mmap.o pgd.o mmu.o \ context.o proc.o pageattr.o obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o -obj-$(CONFIG_ARM64_PTDUMP) += dump.o +obj-$(CONFIG_ARM64_PTDUMP_CORE) += dump.o +obj-$(CONFIG_ARM64_PTDUMP_DEBUGFS) += ptdump_debugfs.o obj-$(CONFIG_NUMA) += numa.o obj-$(CONFIG_KASAN) += kasan_init.o diff --git a/arch/arm64/mm/cache.S b/arch/arm64/mm/cache.S index 58b5a906ff78..da9576932322 100644 --- a/arch/arm64/mm/cache.S +++ b/arch/arm64/mm/cache.S @@ -23,6 +23,7 @@ #include <asm/assembler.h> #include <asm/cpufeature.h> #include <asm/alternative.h> +#include <asm/uaccess.h> /* * flush_icache_range(start,end) @@ -48,6 +49,7 @@ ENTRY(flush_icache_range) * - end - virtual end address of region */ ENTRY(__flush_cache_user_range) + uaccess_ttbr0_enable x2, x3 dcache_line_size x2, x3 sub x3, x2, #1 bic x4, x0, x3 @@ -69,10 +71,12 @@ USER(9f, ic ivau, x4 ) // invalidate I line PoU dsb ish isb mov x0, #0 +1: + uaccess_ttbr0_disable x1 ret 9: mov x0, #-EFAULT - ret + b 1b ENDPROC(flush_icache_range) ENDPROC(__flush_cache_user_range) diff --git a/arch/arm64/mm/context.c b/arch/arm64/mm/context.c index efcf1f7ef1e4..4c63cb154859 100644 --- a/arch/arm64/mm/context.c +++ b/arch/arm64/mm/context.c @@ -221,7 +221,12 @@ void check_and_switch_context(struct mm_struct *mm, unsigned int cpu) raw_spin_unlock_irqrestore(&cpu_asid_lock, flags); switch_mm_fastpath: - cpu_switch_mm(mm->pgd, mm); + /* + * Defer TTBR0_EL1 setting for user threads to uaccess_enable() when + * emulating PAN. + */ + if (!system_uses_ttbr0_pan()) + cpu_switch_mm(mm->pgd, mm); } static int asids_init(void) diff --git a/arch/arm64/mm/dma-mapping.c b/arch/arm64/mm/dma-mapping.c index 3f74d0d98de6..290a84f3351f 100644 --- a/arch/arm64/mm/dma-mapping.c +++ b/arch/arm64/mm/dma-mapping.c @@ -796,6 +796,8 @@ static struct dma_map_ops iommu_dma_ops = { .sync_single_for_device = __iommu_sync_single_for_device, .sync_sg_for_cpu = __iommu_sync_sg_for_cpu, .sync_sg_for_device = __iommu_sync_sg_for_device, + .map_resource = iommu_dma_map_resource, + .unmap_resource = iommu_dma_unmap_resource, .dma_supported = iommu_dma_supported, .mapping_error = iommu_dma_mapping_error, }; @@ -938,11 +940,6 @@ static void __iommu_setup_dma_ops(struct device *dev, u64 dma_base, u64 size, void arch_teardown_dma_ops(struct device *dev) { - struct iommu_domain *domain = iommu_get_domain_for_dev(dev); - - if (WARN_ON(domain)) - iommu_detach_device(domain, dev); - dev->archdata.dma_ops = NULL; } diff --git a/arch/arm64/mm/dump.c b/arch/arm64/mm/dump.c index 9c3e75df2180..ca74a2aace42 100644 --- a/arch/arm64/mm/dump.c +++ b/arch/arm64/mm/dump.c @@ -50,6 +50,18 @@ static const struct addr_marker address_markers[] = { { -1, NULL }, }; +#define pt_dump_seq_printf(m, fmt, args...) \ +({ \ + if (m) \ + seq_printf(m, fmt, ##args); \ +}) + +#define pt_dump_seq_puts(m, fmt) \ +({ \ + if (m) \ + seq_printf(m, fmt); \ +}) + /* * The page dumper groups page table entries of the same type into a single * description. It uses pg_state to track the range information while @@ -62,6 +74,9 @@ struct pg_state { unsigned long start_address; unsigned level; u64 current_prot; + bool check_wx; + unsigned long wx_pages; + unsigned long uxn_pages; }; struct prot_bits { @@ -186,10 +201,39 @@ static void dump_prot(struct pg_state *st, const struct prot_bits *bits, s = bits->clear; if (s) - seq_printf(st->seq, " %s", s); + pt_dump_seq_printf(st->seq, " %s", s); } } +static void note_prot_uxn(struct pg_state *st, unsigned long addr) +{ + if (!st->check_wx) + return; + + if ((st->current_prot & PTE_UXN) == PTE_UXN) + return; + + WARN_ONCE(1, "arm64/mm: Found non-UXN mapping at address %p/%pS\n", + (void *)st->start_address, (void *)st->start_address); + + st->uxn_pages += (addr - st->start_address) / PAGE_SIZE; +} + +static void note_prot_wx(struct pg_state *st, unsigned long addr) +{ + if (!st->check_wx) + return; + if ((st->current_prot & PTE_RDONLY) == PTE_RDONLY) + return; + if ((st->current_prot & PTE_PXN) == PTE_PXN) + return; + + WARN_ONCE(1, "arm64/mm: Found insecure W+X mapping at address %p/%pS\n", + (void *)st->start_address, (void *)st->start_address); + + st->wx_pages += (addr - st->start_address) / PAGE_SIZE; +} + static void note_page(struct pg_state *st, unsigned long addr, unsigned level, u64 val) { @@ -200,14 +244,16 @@ static void note_page(struct pg_state *st, unsigned long addr, unsigned level, st->level = level; st->current_prot = prot; st->start_address = addr; - seq_printf(st->seq, "---[ %s ]---\n", st->marker->name); + pt_dump_seq_printf(st->seq, "---[ %s ]---\n", st->marker->name); } else if (prot != st->current_prot || level != st->level || addr >= st->marker[1].start_address) { const char *unit = units; unsigned long delta; if (st->current_prot) { - seq_printf(st->seq, "0x%016lx-0x%016lx ", + note_prot_uxn(st, addr); + note_prot_wx(st, addr); + pt_dump_seq_printf(st->seq, "0x%016lx-0x%016lx ", st->start_address, addr); delta = (addr - st->start_address) >> 10; @@ -215,17 +261,17 @@ static void note_page(struct pg_state *st, unsigned long addr, unsigned level, delta >>= 10; unit++; } - seq_printf(st->seq, "%9lu%c %s", delta, *unit, + pt_dump_seq_printf(st->seq, "%9lu%c %s", delta, *unit, pg_level[st->level].name); if (pg_level[st->level].bits) dump_prot(st, pg_level[st->level].bits, pg_level[st->level].num); - seq_puts(st->seq, "\n"); + pt_dump_seq_puts(st->seq, "\n"); } if (addr >= st->marker[1].start_address) { st->marker++; - seq_printf(st->seq, "---[ %s ]---\n", st->marker->name); + pt_dump_seq_printf(st->seq, "---[ %s ]---\n", st->marker->name); } st->start_address = addr; @@ -235,7 +281,7 @@ static void note_page(struct pg_state *st, unsigned long addr, unsigned level, if (addr >= st->marker[1].start_address) { st->marker++; - seq_printf(st->seq, "---[ %s ]---\n", st->marker->name); + pt_dump_seq_printf(st->seq, "---[ %s ]---\n", st->marker->name); } } @@ -304,9 +350,8 @@ static void walk_pgd(struct pg_state *st, struct mm_struct *mm, } } -static int ptdump_show(struct seq_file *m, void *v) +void ptdump_walk_pgd(struct seq_file *m, struct ptdump_info *info) { - struct ptdump_info *info = m->private; struct pg_state st = { .seq = m, .marker = info->markers, @@ -315,33 +360,16 @@ static int ptdump_show(struct seq_file *m, void *v) walk_pgd(&st, info->mm, info->base_addr); note_page(&st, 0, 0, 0); - return 0; } -static int ptdump_open(struct inode *inode, struct file *file) +static void ptdump_initialize(void) { - return single_open(file, ptdump_show, inode->i_private); -} - -static const struct file_operations ptdump_fops = { - .open = ptdump_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - -int ptdump_register(struct ptdump_info *info, const char *name) -{ - struct dentry *pe; unsigned i, j; for (i = 0; i < ARRAY_SIZE(pg_level); i++) if (pg_level[i].bits) for (j = 0; j < pg_level[i].num; j++) pg_level[i].mask |= pg_level[i].bits[j].mask; - - pe = debugfs_create_file(name, 0400, NULL, info, &ptdump_fops); - return pe ? 0 : -ENOMEM; } static struct ptdump_info kernel_ptdump_info = { @@ -350,8 +378,30 @@ static struct ptdump_info kernel_ptdump_info = { .base_addr = VA_START, }; +void ptdump_check_wx(void) +{ + struct pg_state st = { + .seq = NULL, + .marker = (struct addr_marker[]) { + { 0, NULL}, + { -1, NULL}, + }, + .check_wx = true, + }; + + walk_pgd(&st, &init_mm, 0); + note_page(&st, 0, 0, 0); + if (st.wx_pages || st.uxn_pages) + pr_warn("Checked W+X mappings: FAILED, %lu W+X pages found, %lu non-UXN pages found\n", + st.wx_pages, st.uxn_pages); + else + pr_info("Checked W+X mappings: passed, no W+X pages found\n"); +} + static int ptdump_init(void) { - return ptdump_register(&kernel_ptdump_info, "kernel_page_tables"); + ptdump_initialize(); + return ptdump_debugfs_register(&kernel_ptdump_info, + "kernel_page_tables"); } device_initcall(ptdump_init); diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index 0f8788374815..a78a5c401806 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c @@ -269,13 +269,19 @@ out: return fault; } -static inline bool is_permission_fault(unsigned int esr) +static inline bool is_permission_fault(unsigned int esr, struct pt_regs *regs) { unsigned int ec = ESR_ELx_EC(esr); unsigned int fsc_type = esr & ESR_ELx_FSC_TYPE; - return (ec == ESR_ELx_EC_DABT_CUR && fsc_type == ESR_ELx_FSC_PERM) || - (ec == ESR_ELx_EC_IABT_CUR && fsc_type == ESR_ELx_FSC_PERM); + if (ec != ESR_ELx_EC_DABT_CUR && ec != ESR_ELx_EC_IABT_CUR) + return false; + + if (system_uses_ttbr0_pan()) + return fsc_type == ESR_ELx_FSC_FAULT && + (regs->pstate & PSR_PAN_BIT); + else + return fsc_type == ESR_ELx_FSC_PERM; } static bool is_el0_instruction_abort(unsigned int esr) @@ -315,7 +321,7 @@ static int __kprobes do_page_fault(unsigned long addr, unsigned int esr, mm_flags |= FAULT_FLAG_WRITE; } - if (is_permission_fault(esr) && (addr < USER_DS)) { + if (addr < USER_DS && is_permission_fault(esr, regs)) { /* regs->orig_addr_limit may be 0 if we entered from EL0 */ if (regs->orig_addr_limit == KERNEL_DS) die("Accessing user space memory with fs=KERNEL_DS", regs, esr); @@ -507,10 +513,10 @@ static const struct fault_info { { do_bad, SIGBUS, 0, "unknown 17" }, { do_bad, SIGBUS, 0, "unknown 18" }, { do_bad, SIGBUS, 0, "unknown 19" }, - { do_bad, SIGBUS, 0, "synchronous abort (translation table walk)" }, - { do_bad, SIGBUS, 0, "synchronous abort (translation table walk)" }, - { do_bad, SIGBUS, 0, "synchronous abort (translation table walk)" }, - { do_bad, SIGBUS, 0, "synchronous abort (translation table walk)" }, + { do_bad, SIGBUS, 0, "synchronous external abort (translation table walk)" }, + { do_bad, SIGBUS, 0, "synchronous external abort (translation table walk)" }, + { do_bad, SIGBUS, 0, "synchronous external abort (translation table walk)" }, + { do_bad, SIGBUS, 0, "synchronous external abort (translation table walk)" }, { do_bad, SIGBUS, 0, "synchronous parity error" }, { do_bad, SIGBUS, 0, "unknown 25" }, { do_bad, SIGBUS, 0, "unknown 26" }, diff --git a/arch/arm64/mm/flush.c b/arch/arm64/mm/flush.c index 8377329d8c97..554a2558c12e 100644 --- a/arch/arm64/mm/flush.c +++ b/arch/arm64/mm/flush.c @@ -25,14 +25,7 @@ #include <asm/cachetype.h> #include <asm/tlbflush.h> -void flush_cache_range(struct vm_area_struct *vma, unsigned long start, - unsigned long end) -{ - if (vma->vm_flags & VM_EXEC) - __flush_icache_all(); -} - -static void sync_icache_aliases(void *kaddr, unsigned long len) +void sync_icache_aliases(void *kaddr, unsigned long len) { unsigned long addr = (unsigned long)kaddr; diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c index 2e49bd252fe7..964b7549af5c 100644 --- a/arch/arm64/mm/hugetlbpage.c +++ b/arch/arm64/mm/hugetlbpage.c @@ -51,20 +51,8 @@ static int find_num_contig(struct mm_struct *mm, unsigned long addr, *pgsize = PAGE_SIZE; if (!pte_cont(pte)) return 1; - if (!pgd_present(*pgd)) { - VM_BUG_ON(!pgd_present(*pgd)); - return 1; - } pud = pud_offset(pgd, addr); - if (!pud_present(*pud)) { - VM_BUG_ON(!pud_present(*pud)); - return 1; - } pmd = pmd_offset(pud, addr); - if (!pmd_present(*pmd)) { - VM_BUG_ON(!pmd_present(*pmd)); - return 1; - } if ((pte_t *)pmd == ptep) { *pgsize = PMD_SIZE; return CONT_PMDS; @@ -212,7 +200,7 @@ pte_t huge_ptep_get_and_clear(struct mm_struct *mm, ncontig = find_num_contig(mm, addr, cpte, *cpte, &pgsize); /* save the 1st pte to return */ pte = ptep_get_and_clear(mm, addr, cpte); - for (i = 1; i < ncontig; ++i) { + for (i = 1, addr += pgsize; i < ncontig; ++i, addr += pgsize) { /* * If HW_AFDBM is enabled, then the HW could * turn on the dirty bit for any of the page @@ -250,7 +238,7 @@ int huge_ptep_set_access_flags(struct vm_area_struct *vma, pfn = pte_pfn(*cpte); ncontig = find_num_contig(vma->vm_mm, addr, cpte, *cpte, &pgsize); - for (i = 0; i < ncontig; ++i, ++cpte) { + for (i = 0; i < ncontig; ++i, ++cpte, addr += pgsize) { changed = ptep_set_access_flags(vma, addr, cpte, pfn_pte(pfn, hugeprot), @@ -273,7 +261,7 @@ void huge_ptep_set_wrprotect(struct mm_struct *mm, cpte = huge_pte_offset(mm, addr); ncontig = find_num_contig(mm, addr, cpte, *cpte, &pgsize); - for (i = 0; i < ncontig; ++i, ++cpte) + for (i = 0; i < ncontig; ++i, ++cpte, addr += pgsize) ptep_set_wrprotect(mm, addr, cpte); } else { ptep_set_wrprotect(mm, addr, ptep); @@ -291,7 +279,7 @@ void huge_ptep_clear_flush(struct vm_area_struct *vma, cpte = huge_pte_offset(vma->vm_mm, addr); ncontig = find_num_contig(vma->vm_mm, addr, cpte, *cpte, &pgsize); - for (i = 0; i < ncontig; ++i, ++cpte) + for (i = 0; i < ncontig; ++i, ++cpte, addr += pgsize) ptep_clear_flush(vma, addr, cpte); } else { ptep_clear_flush(vma, addr, ptep); @@ -323,7 +311,7 @@ __setup("hugepagesz=", setup_hugepagesz); static __init int add_default_hugepagesz(void) { if (size_to_hstate(CONT_PTES * PAGE_SIZE) == NULL) - hugetlb_add_hstate(CONT_PMD_SHIFT); + hugetlb_add_hstate(CONT_PTE_SHIFT); return 0; } arch_initcall(add_default_hugepagesz); diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index 05615a3fdc6f..17243e43184e 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -28,8 +28,6 @@ #include <linux/memblock.h> #include <linux/fs.h> #include <linux/io.h> -#include <linux/slab.h> -#include <linux/stop_machine.h> #include <asm/barrier.h> #include <asm/cputype.h> @@ -42,6 +40,7 @@ #include <asm/tlb.h> #include <asm/memblock.h> #include <asm/mmu_context.h> +#include <asm/ptdump.h> u64 idmap_t0sz = TCR_T0SZ(VA_BITS); @@ -95,11 +94,24 @@ static phys_addr_t __init early_pgtable_alloc(void) return phys; } +static bool pgattr_change_is_safe(u64 old, u64 new) +{ + /* + * The following mapping attributes may be updated in live + * kernel mappings without the need for break-before-make. + */ + static const pteval_t mask = PTE_PXN | PTE_RDONLY | PTE_WRITE; + + return old == 0 || new == 0 || ((old ^ new) & ~mask) == 0; +} + static void alloc_init_pte(pmd_t *pmd, unsigned long addr, unsigned long end, unsigned long pfn, pgprot_t prot, - phys_addr_t (*pgtable_alloc)(void)) + phys_addr_t (*pgtable_alloc)(void), + bool page_mappings_only) { + pgprot_t __prot = prot; pte_t *pte; BUG_ON(pmd_sect(*pmd)); @@ -115,8 +127,28 @@ static void alloc_init_pte(pmd_t *pmd, unsigned long addr, pte = pte_set_fixmap_offset(pmd, addr); do { - set_pte(pte, pfn_pte(pfn, prot)); + pte_t old_pte = *pte; + + /* + * Set the contiguous bit for the subsequent group of PTEs if + * its size and alignment are appropriate. + */ + if (((addr | PFN_PHYS(pfn)) & ~CONT_PTE_MASK) == 0) { + if (end - addr >= CONT_PTE_SIZE && !page_mappings_only) + __prot = __pgprot(pgprot_val(prot) | PTE_CONT); + else + __prot = prot; + } + + set_pte(pte, pfn_pte(pfn, __prot)); pfn++; + + /* + * After the PTE entry has been populated once, we + * only allow updates to the permission attributes. + */ + BUG_ON(!pgattr_change_is_safe(pte_val(old_pte), pte_val(*pte))); + } while (pte++, addr += PAGE_SIZE, addr != end); pte_clear_fixmap(); @@ -125,8 +157,9 @@ static void alloc_init_pte(pmd_t *pmd, unsigned long addr, static void alloc_init_pmd(pud_t *pud, unsigned long addr, unsigned long end, phys_addr_t phys, pgprot_t prot, phys_addr_t (*pgtable_alloc)(void), - bool allow_block_mappings) + bool page_mappings_only) { + pgprot_t __prot = prot; pmd_t *pmd; unsigned long next; @@ -146,27 +179,39 @@ static void alloc_init_pmd(pud_t *pud, unsigned long addr, unsigned long end, pmd = pmd_set_fixmap_offset(pud, addr); do { + pmd_t old_pmd = *pmd; + next = pmd_addr_end(addr, end); + /* try section mapping first */ if (((addr | next | phys) & ~SECTION_MASK) == 0 && - allow_block_mappings) { - pmd_t old_pmd =*pmd; - pmd_set_huge(pmd, phys, prot); + !page_mappings_only) { /* - * Check for previous table entries created during - * boot (__create_page_tables) and flush them. + * Set the contiguous bit for the subsequent group of + * PMDs if its size and alignment are appropriate. */ - if (!pmd_none(old_pmd)) { - flush_tlb_all(); - if (pmd_table(old_pmd)) { - phys_addr_t table = pmd_page_paddr(old_pmd); - if (!WARN_ON_ONCE(slab_is_available())) - memblock_free(table, PAGE_SIZE); - } + if (((addr | phys) & ~CONT_PMD_MASK) == 0) { + if (end - addr >= CONT_PMD_SIZE) + __prot = __pgprot(pgprot_val(prot) | + PTE_CONT); + else + __prot = prot; } + pmd_set_huge(pmd, phys, __prot); + + /* + * After the PMD entry has been populated once, we + * only allow updates to the permission attributes. + */ + BUG_ON(!pgattr_change_is_safe(pmd_val(old_pmd), + pmd_val(*pmd))); } else { alloc_init_pte(pmd, addr, next, __phys_to_pfn(phys), - prot, pgtable_alloc); + prot, pgtable_alloc, + page_mappings_only); + + BUG_ON(pmd_val(old_pmd) != 0 && + pmd_val(old_pmd) != pmd_val(*pmd)); } phys += next - addr; } while (pmd++, addr = next, addr != end); @@ -189,7 +234,7 @@ static inline bool use_1G_block(unsigned long addr, unsigned long next, static void alloc_init_pud(pgd_t *pgd, unsigned long addr, unsigned long end, phys_addr_t phys, pgprot_t prot, phys_addr_t (*pgtable_alloc)(void), - bool allow_block_mappings) + bool page_mappings_only) { pud_t *pud; unsigned long next; @@ -204,33 +249,28 @@ static void alloc_init_pud(pgd_t *pgd, unsigned long addr, unsigned long end, pud = pud_set_fixmap_offset(pgd, addr); do { + pud_t old_pud = *pud; + next = pud_addr_end(addr, end); /* * For 4K granule only, attempt to put down a 1GB block */ - if (use_1G_block(addr, next, phys) && allow_block_mappings) { - pud_t old_pud = *pud; + if (use_1G_block(addr, next, phys) && !page_mappings_only) { pud_set_huge(pud, phys, prot); /* - * If we have an old value for a pud, it will - * be pointing to a pmd table that we no longer - * need (from swapper_pg_dir). - * - * Look up the old pmd table and free it. + * After the PUD entry has been populated once, we + * only allow updates to the permission attributes. */ - if (!pud_none(old_pud)) { - flush_tlb_all(); - if (pud_table(old_pud)) { - phys_addr_t table = pud_page_paddr(old_pud); - if (!WARN_ON_ONCE(slab_is_available())) - memblock_free(table, PAGE_SIZE); - } - } + BUG_ON(!pgattr_change_is_safe(pud_val(old_pud), + pud_val(*pud))); } else { alloc_init_pmd(pud, addr, next, phys, prot, - pgtable_alloc, allow_block_mappings); + pgtable_alloc, page_mappings_only); + + BUG_ON(pud_val(old_pud) != 0 && + pud_val(old_pud) != pud_val(*pud)); } phys += next - addr; } while (pud++, addr = next, addr != end); @@ -242,7 +282,7 @@ static void __create_pgd_mapping(pgd_t *pgdir, phys_addr_t phys, unsigned long virt, phys_addr_t size, pgprot_t prot, phys_addr_t (*pgtable_alloc)(void), - bool allow_block_mappings) + bool page_mappings_only) { unsigned long addr, length, end, next; pgd_t *pgd = pgd_offset_raw(pgdir, virt); @@ -262,7 +302,7 @@ static void __create_pgd_mapping(pgd_t *pgdir, phys_addr_t phys, do { next = pgd_addr_end(addr, end); alloc_init_pud(pgd, addr, next, phys, prot, pgtable_alloc, - allow_block_mappings); + page_mappings_only); phys += next - addr; } while (pgd++, addr = next, addr != end); } @@ -291,17 +331,17 @@ static void __init create_mapping_noalloc(phys_addr_t phys, unsigned long virt, &phys, virt); return; } - __create_pgd_mapping(init_mm.pgd, phys, virt, size, prot, NULL, true); + __create_pgd_mapping(init_mm.pgd, phys, virt, size, prot, NULL, false); } void __init create_pgd_mapping(struct mm_struct *mm, phys_addr_t phys, unsigned long virt, phys_addr_t size, - pgprot_t prot, bool allow_block_mappings) + pgprot_t prot, bool page_mappings_only) { BUG_ON(mm == &init_mm); __create_pgd_mapping(mm->pgd, phys, virt, size, prot, - pgd_pgtable_alloc, allow_block_mappings); + pgd_pgtable_alloc, page_mappings_only); } static void create_mapping_late(phys_addr_t phys, unsigned long virt, @@ -314,7 +354,7 @@ static void create_mapping_late(phys_addr_t phys, unsigned long virt, } __create_pgd_mapping(init_mm.pgd, phys, virt, size, prot, - NULL, !debug_pagealloc_enabled()); + NULL, debug_pagealloc_enabled()); } static void __init __map_memblock(pgd_t *pgd, phys_addr_t start, phys_addr_t end) @@ -332,7 +372,7 @@ static void __init __map_memblock(pgd_t *pgd, phys_addr_t start, phys_addr_t end __create_pgd_mapping(pgd, start, __phys_to_virt(start), end - start, PAGE_KERNEL, early_pgtable_alloc, - !debug_pagealloc_enabled()); + debug_pagealloc_enabled()); return; } @@ -345,13 +385,13 @@ static void __init __map_memblock(pgd_t *pgd, phys_addr_t start, phys_addr_t end __phys_to_virt(start), kernel_start - start, PAGE_KERNEL, early_pgtable_alloc, - !debug_pagealloc_enabled()); + debug_pagealloc_enabled()); if (kernel_end < end) __create_pgd_mapping(pgd, kernel_end, __phys_to_virt(kernel_end), end - kernel_end, PAGE_KERNEL, early_pgtable_alloc, - !debug_pagealloc_enabled()); + debug_pagealloc_enabled()); /* * Map the linear alias of the [_text, __init_begin) interval as @@ -361,7 +401,7 @@ static void __init __map_memblock(pgd_t *pgd, phys_addr_t start, phys_addr_t end */ __create_pgd_mapping(pgd, kernel_start, __phys_to_virt(kernel_start), kernel_end - kernel_start, PAGE_KERNEL_RO, - early_pgtable_alloc, !debug_pagealloc_enabled()); + early_pgtable_alloc, debug_pagealloc_enabled()); } static void __init map_mem(pgd_t *pgd) @@ -396,6 +436,11 @@ void mark_rodata_ro(void) section_size = (unsigned long)__init_begin - (unsigned long)__start_rodata; create_mapping_late(__pa(__start_rodata), (unsigned long)__start_rodata, section_size, PAGE_KERNEL_RO); + + /* flush the TLBs after updating live kernel mappings */ + flush_tlb_all(); + + debug_checkwx(); } static void __init map_kernel_segment(pgd_t *pgd, void *va_start, void *va_end, @@ -408,7 +453,7 @@ static void __init map_kernel_segment(pgd_t *pgd, void *va_start, void *va_end, BUG_ON(!PAGE_ALIGNED(size)); __create_pgd_mapping(pgd, pa_start, (unsigned long)va_start, size, prot, - early_pgtable_alloc, !debug_pagealloc_enabled()); + early_pgtable_alloc, debug_pagealloc_enabled()); vma->addr = va_start; vma->phys_addr = pa_start; diff --git a/arch/arm64/mm/numa.c b/arch/arm64/mm/numa.c index 778a985c8a70..4b32168cf91a 100644 --- a/arch/arm64/mm/numa.c +++ b/arch/arm64/mm/numa.c @@ -147,7 +147,7 @@ static int __init early_cpu_to_node(int cpu) static int __init pcpu_cpu_distance(unsigned int from, unsigned int to) { - return node_distance(from, to); + return node_distance(early_cpu_to_node(from), early_cpu_to_node(to)); } static void * __init pcpu_fc_alloc(unsigned int cpu, size_t size, @@ -223,8 +223,11 @@ static void __init setup_node_data(int nid, u64 start_pfn, u64 end_pfn) void *nd; int tnid; - pr_info("Initmem setup node %d [mem %#010Lx-%#010Lx]\n", - nid, start_pfn << PAGE_SHIFT, (end_pfn << PAGE_SHIFT) - 1); + if (start_pfn < end_pfn) + pr_info("Initmem setup node %d [mem %#010Lx-%#010Lx]\n", nid, + start_pfn << PAGE_SHIFT, (end_pfn << PAGE_SHIFT) - 1); + else + pr_info("Initmem setup node %d [<memory-less node>]\n", nid); nd_pa = memblock_alloc_try_nid(nd_size, SMP_CACHE_BYTES, nid); nd = __va(nd_pa); diff --git a/arch/arm64/mm/proc.S b/arch/arm64/mm/proc.S index 352c73b6a59e..32682be978e0 100644 --- a/arch/arm64/mm/proc.S +++ b/arch/arm64/mm/proc.S @@ -70,11 +70,14 @@ ENTRY(cpu_do_suspend) mrs x8, mdscr_el1 mrs x9, oslsr_el1 mrs x10, sctlr_el1 + mrs x11, tpidr_el1 + mrs x12, sp_el0 stp x2, x3, [x0] stp x4, xzr, [x0, #16] stp x5, x6, [x0, #32] stp x7, x8, [x0, #48] stp x9, x10, [x0, #64] + stp x11, x12, [x0, #80] ret ENDPROC(cpu_do_suspend) @@ -90,6 +93,7 @@ ENTRY(cpu_do_resume) ldp x6, x8, [x0, #32] ldp x9, x10, [x0, #48] ldp x11, x12, [x0, #64] + ldp x13, x14, [x0, #80] msr tpidr_el0, x2 msr tpidrro_el0, x3 msr contextidr_el1, x4 @@ -112,6 +116,8 @@ ENTRY(cpu_do_resume) msr mdscr_el1, x10 msr sctlr_el1, x12 + msr tpidr_el1, x13 + msr sp_el0, x14 /* * Restore oslsr_el1 by writing oslar_el1 */ @@ -136,11 +142,7 @@ ENTRY(cpu_do_switch_mm) bfi x0, x1, #48, #16 // set the ASID msr ttbr0_el1, x0 // set TTBR0 isb -alternative_if ARM64_WORKAROUND_CAVIUM_27456 - ic iallu - dsb nsh - isb -alternative_else_nop_endif + post_ttbr0_update_workaround ret ENDPROC(cpu_do_switch_mm) diff --git a/arch/arm64/mm/ptdump_debugfs.c b/arch/arm64/mm/ptdump_debugfs.c new file mode 100644 index 000000000000..eee4d864350c --- /dev/null +++ b/arch/arm64/mm/ptdump_debugfs.c @@ -0,0 +1,31 @@ +#include <linux/debugfs.h> +#include <linux/seq_file.h> + +#include <asm/ptdump.h> + +static int ptdump_show(struct seq_file *m, void *v) +{ + struct ptdump_info *info = m->private; + ptdump_walk_pgd(m, info); + return 0; +} + +static int ptdump_open(struct inode *inode, struct file *file) +{ + return single_open(file, ptdump_show, inode->i_private); +} + +static const struct file_operations ptdump_fops = { + .open = ptdump_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +int ptdump_debugfs_register(struct ptdump_info *info, const char *name) +{ + struct dentry *pe; + pe = debugfs_create_file(name, 0400, NULL, info, &ptdump_fops); + return pe ? 0 : -ENOMEM; + +} diff --git a/arch/arm64/xen/hypercall.S b/arch/arm64/xen/hypercall.S index 329c8027b0a9..b41aff25426d 100644 --- a/arch/arm64/xen/hypercall.S +++ b/arch/arm64/xen/hypercall.S @@ -49,6 +49,7 @@ #include <linux/linkage.h> #include <asm/assembler.h> +#include <asm/uaccess.h> #include <xen/interface/xen.h> @@ -91,6 +92,20 @@ ENTRY(privcmd_call) mov x2, x3 mov x3, x4 mov x4, x5 + /* + * Privcmd calls are issued by the userspace. The kernel needs to + * enable access to TTBR0_EL1 as the hypervisor would issue stage 1 + * translations to user memory via AT instructions. Since AT + * instructions are not affected by the PAN bit (ARMv8.1), we only + * need the explicit uaccess_enable/disable if the TTBR0 PAN emulation + * is enabled (it implies that hardware UAO and PAN disabled). + */ + uaccess_ttbr0_enable x6, x7 hvc XEN_IMM + + /* + * Disable userspace access from kernel once the hyp call completed. + */ + uaccess_ttbr0_disable x6 ret ENDPROC(privcmd_call); |