527 files changed, 10693 insertions, 10772 deletions
diff --git a/arch/arm/boot/dts/amlogic/meson8.dtsi b/arch/arm/boot/dts/amlogic/meson8.dtsi
index 847f7b1f1e96..f785e0de0847 100644
--- a/arch/arm/boot/dts/amlogic/meson8.dtsi
+++ b/arch/arm/boot/dts/amlogic/meson8.dtsi
@@ -451,7 +451,7 @@
 	pwm_ef: pwm@86c0 {
 		compatible = "amlogic,meson8-pwm-v2";
 		clocks = <&xtal>,
-			 <>, /* unknown/untested, the datasheet calls it "Video PLL" */
+			 <0>, /* unknown/untested, the datasheet calls it "Video PLL" */
 			 <&clkc CLKID_FCLK_DIV4>,
 			 <&clkc CLKID_FCLK_DIV3>;
 		reg = <0x86c0 0x10>;
@@ -705,7 +705,7 @@
 &pwm_ab {
 	compatible = "amlogic,meson8-pwm-v2";
 	clocks = <&xtal>,
-		 <>, /* unknown/untested, the datasheet calls it "Video PLL" */
+		 <0>, /* unknown/untested, the datasheet calls it "Video PLL" */
 		 <&clkc CLKID_FCLK_DIV4>,
 		 <&clkc CLKID_FCLK_DIV3>;
 };
@@ -713,7 +713,7 @@
 &pwm_cd {
 	compatible = "amlogic,meson8-pwm-v2";
 	clocks = <&xtal>,
-		 <>, /* unknown/untested, the datasheet calls it "Video PLL" */
+		 <0>, /* unknown/untested, the datasheet calls it "Video PLL" */
 		 <&clkc CLKID_FCLK_DIV4>,
 		 <&clkc CLKID_FCLK_DIV3>;
 };
diff --git a/arch/arm/boot/dts/amlogic/meson8b.dtsi b/arch/arm/boot/dts/amlogic/meson8b.dtsi
index 0876611ce26a..fdb0abe23a0c 100644
--- a/arch/arm/boot/dts/amlogic/meson8b.dtsi
+++ b/arch/arm/boot/dts/amlogic/meson8b.dtsi
@@ -406,7 +406,7 @@
 		compatible = "amlogic,meson8b-pwm-v2", "amlogic,meson8-pwm-v2";
 		reg = <0x86c0 0x10>;
 		clocks = <&xtal>,
-			 <>, /* unknown/untested, the datasheet calls it "Video PLL" */
+			 <0>, /* unknown/untested, the datasheet calls it "Video PLL" */
 			 <&clkc CLKID_FCLK_DIV4>,
 			 <&clkc CLKID_FCLK_DIV3>;
 		#pwm-cells = <3>;
@@ -680,7 +680,7 @@
 &pwm_ab {
 	compatible = "amlogic,meson8b-pwm-v2", "amlogic,meson8-pwm-v2";
 	clocks = <&xtal>,
-		 <>, /* unknown/untested, the datasheet calls it "Video PLL" */
+		 <0>, /* unknown/untested, the datasheet calls it "Video PLL" */
 		 <&clkc CLKID_FCLK_DIV4>,
 		 <&clkc CLKID_FCLK_DIV3>;
 };
@@ -688,7 +688,7 @@
 &pwm_cd {
 	compatible = "amlogic,meson8b-pwm-v2", "amlogic,meson8-pwm-v2";
 	clocks = <&xtal>,
-		 <>, /* unknown/untested, the datasheet calls it "Video PLL" */
+		 <0>, /* unknown/untested, the datasheet calls it "Video PLL" */
 		 <&clkc CLKID_FCLK_DIV4>,
 		 <&clkc CLKID_FCLK_DIV3>;
 };
diff --git a/arch/arm/boot/dts/nxp/imx/imx6ul-imx6ull-opos6ul.dtsi b/arch/arm/boot/dts/nxp/imx/imx6ul-imx6ull-opos6ul.dtsi
index f2386dcb9ff2..dda4fa91b2f2 100644
--- a/arch/arm/boot/dts/nxp/imx/imx6ul-imx6ull-opos6ul.dtsi
+++ b/arch/arm/boot/dts/nxp/imx/imx6ul-imx6ull-opos6ul.dtsi
@@ -40,6 +40,9 @@
 			reg = <1>;
 			interrupt-parent = <&gpio4>;
 			interrupts = <16 IRQ_TYPE_LEVEL_LOW>;
+			micrel,led-mode = <1>;
+			clocks = <&clks IMX6UL_CLK_ENET_REF>;
+			clock-names = "rmii-ref";
 			status = "okay";
 		};
 	};
diff --git a/arch/arm/configs/exynos_defconfig b/arch/arm/configs/exynos_defconfig
index e81a5d6c1c20..e81964cce516 100644
--- a/arch/arm/configs/exynos_defconfig
+++ b/arch/arm/configs/exynos_defconfig
@@ -349,7 +349,7 @@ CONFIG_NLS_ASCII=y
 CONFIG_NLS_ISO8859_1=y
 CONFIG_NLS_UTF8=y
 CONFIG_CRYPTO_USER=m
-CONFIG_CRYPTO_TEST=m
+CONFIG_CRYPTO_BENCHMARK=m
 CONFIG_CRYPTO_DH=m
 CONFIG_CRYPTO_LRW=m
 CONFIG_CRYPTO_XTS=m
@@ -364,7 +364,6 @@ CONFIG_CRYPTO_USER_API_SKCIPHER=m
 CONFIG_CRYPTO_USER_API_RNG=m
 CONFIG_CRYPTO_USER_API_AEAD=m
 CONFIG_CRYPTO_SHA1_ARM_NEON=m
-CONFIG_CRYPTO_SHA256_ARM=m
 CONFIG_CRYPTO_SHA512_ARM=m
 CONFIG_CRYPTO_AES_ARM_BS=m
 CONFIG_CRYPTO_CHACHA20_NEON=m
diff --git a/arch/arm/configs/milbeaut_m10v_defconfig b/arch/arm/configs/milbeaut_m10v_defconfig
index 275ddf7a3a14..242e7d5a3f68 100644
--- a/arch/arm/configs/milbeaut_m10v_defconfig
+++ b/arch/arm/configs/milbeaut_m10v_defconfig
@@ -93,15 +93,13 @@ CONFIG_NLS_CODEPAGE_437=y
 CONFIG_NLS_ISO8859_1=y
 CONFIG_NLS_UTF8=y
 CONFIG_KEYS=y
-CONFIG_CRYPTO_MANAGER=y
-# CONFIG_CRYPTO_MANAGER_DISABLE_TESTS is not set
+CONFIG_CRYPTO_SELFTESTS=y
 # CONFIG_CRYPTO_ECHAINIV is not set
 CONFIG_CRYPTO_AES=y
 CONFIG_CRYPTO_SEQIV=m
 CONFIG_CRYPTO_GHASH_ARM_CE=m
 CONFIG_CRYPTO_SHA1_ARM_NEON=m
 CONFIG_CRYPTO_SHA1_ARM_CE=m
-CONFIG_CRYPTO_SHA2_ARM_CE=m
 CONFIG_CRYPTO_SHA512_ARM=m
 CONFIG_CRYPTO_AES_ARM=m
 CONFIG_CRYPTO_AES_ARM_BS=m
diff --git a/arch/arm/configs/multi_v7_defconfig b/arch/arm/configs/multi_v7_defconfig
index ad037c175fdb..96178acedad0 100644
--- a/arch/arm/configs/multi_v7_defconfig
+++ b/arch/arm/configs/multi_v7_defconfig
@@ -1301,7 +1301,6 @@ CONFIG_CRYPTO_USER_API_AEAD=m
 CONFIG_CRYPTO_GHASH_ARM_CE=m
 CONFIG_CRYPTO_SHA1_ARM_NEON=m
 CONFIG_CRYPTO_SHA1_ARM_CE=m
-CONFIG_CRYPTO_SHA2_ARM_CE=m
 CONFIG_CRYPTO_SHA512_ARM=m
 CONFIG_CRYPTO_AES_ARM=m
 CONFIG_CRYPTO_AES_ARM_BS=m
diff --git a/arch/arm/configs/omap2plus_defconfig b/arch/arm/configs/omap2plus_defconfig
index 75b326bc7830..317f977e509e 100644
--- a/arch/arm/configs/omap2plus_defconfig
+++ b/arch/arm/configs/omap2plus_defconfig
@@ -697,7 +697,6 @@ CONFIG_SECURITY=y
 CONFIG_CRYPTO_MICHAEL_MIC=y
 CONFIG_CRYPTO_GHASH_ARM_CE=m
 CONFIG_CRYPTO_SHA1_ARM_NEON=m
-CONFIG_CRYPTO_SHA256_ARM=m
 CONFIG_CRYPTO_SHA512_ARM=m
 CONFIG_CRYPTO_AES_ARM=m
 CONFIG_CRYPTO_AES_ARM_BS=m
diff --git a/arch/arm/configs/pxa_defconfig b/arch/arm/configs/pxa_defconfig
index 24fca8608554..ded4b9a5accf 100644
--- a/arch/arm/configs/pxa_defconfig
+++ b/arch/arm/configs/pxa_defconfig
@@ -636,10 +636,9 @@ CONFIG_NLS_ISO8859_15=m
 CONFIG_NLS_UTF8=m
 CONFIG_TIMER_STATS=y
 CONFIG_SECURITY=y
-CONFIG_CRYPTO_MANAGER=y
 CONFIG_CRYPTO_CRYPTD=m
 CONFIG_CRYPTO_AUTHENC=m
-CONFIG_CRYPTO_TEST=m
+CONFIG_CRYPTO_BENCHMARK=m
 CONFIG_CRYPTO_BLOWFISH=m
 CONFIG_CRYPTO_CAST5=m
 CONFIG_CRYPTO_CAST6=m
@@ -660,7 +659,6 @@ CONFIG_CRYPTO_XCBC=m
 CONFIG_CRYPTO_DEFLATE=y
 CONFIG_CRYPTO_LZO=y
 CONFIG_CRYPTO_SHA1_ARM=m
-CONFIG_CRYPTO_SHA256_ARM=m
 CONFIG_CRYPTO_SHA512_ARM=m
 CONFIG_CRYPTO_AES_ARM=m
 CONFIG_FONTS=y
diff --git a/arch/arm/configs/spitz_defconfig b/arch/arm/configs/spitz_defconfig
index ffec59e3f49c..ac2a0f998c73 100644
--- a/arch/arm/configs/spitz_defconfig
+++ b/arch/arm/configs/spitz_defconfig
@@ -215,7 +215,7 @@ CONFIG_NLS_ISO8859_1=y
 CONFIG_NLS_UTF8=y
 CONFIG_DEBUG_KERNEL=y
 CONFIG_CRYPTO_NULL=m
-CONFIG_CRYPTO_TEST=m
+CONFIG_CRYPTO_BENCHMARK=m
 CONFIG_CRYPTO_ECB=m
 CONFIG_CRYPTO_SHA256=m
 CONFIG_CRYPTO_AES=m
diff --git a/arch/arm/crypto/Kconfig b/arch/arm/crypto/Kconfig
index 23e4ea067ddb..7efb9a8596e4 100644
--- a/arch/arm/crypto/Kconfig
+++ b/arch/arm/crypto/Kconfig
@@ -46,30 +46,6 @@ config CRYPTO_NHPOLY1305_NEON
 	  Architecture: arm using:
 	  - NEON (Advanced SIMD) extensions
 
-config CRYPTO_POLY1305_ARM
-	tristate
-	select CRYPTO_HASH
-	select CRYPTO_ARCH_HAVE_LIB_POLY1305
-	default CRYPTO_LIB_POLY1305_INTERNAL
-	help
-	  Poly1305 authenticator algorithm (RFC7539)
-
-	  Architecture: arm optionally using
-	  - NEON (Advanced SIMD) extensions
-
-config CRYPTO_BLAKE2S_ARM
-	bool "Hash functions: BLAKE2s"
-	select CRYPTO_ARCH_HAVE_LIB_BLAKE2S
-	help
-	  BLAKE2s cryptographic hash function (RFC 7693)
-
-	  Architecture: arm
-
-	  This is faster than the generic implementations of BLAKE2s and
-	  BLAKE2b, but slower than the NEON implementation of BLAKE2b.
-	  There is no NEON implementation of BLAKE2s, since NEON doesn't
-	  really help with it.
-
 config CRYPTO_BLAKE2B_NEON
 	tristate "Hash functions: BLAKE2b (NEON)"
 	depends on KERNEL_MODE_NEON
@@ -117,27 +93,6 @@ config CRYPTO_SHA1_ARM_CE
 
 	  Architecture: arm using ARMv8 Crypto Extensions
 
-config CRYPTO_SHA2_ARM_CE
-	tristate "Hash functions: SHA-224 and SHA-256 (ARMv8 Crypto Extensions)"
-	depends on KERNEL_MODE_NEON
-	select CRYPTO_SHA256_ARM
-	select CRYPTO_HASH
-	help
-	  SHA-224 and SHA-256 secure hash algorithms (FIPS 180)
-
-	  Architecture: arm using
-	  - ARMv8 Crypto Extensions
-
-config CRYPTO_SHA256_ARM
-	tristate "Hash functions: SHA-224 and SHA-256 (NEON)"
-	select CRYPTO_HASH
-	depends on !CPU_V7M
-	help
-	  SHA-224 and SHA-256 secure hash algorithms (FIPS 180)
-
-	  Architecture: arm using
-	  - NEON (Advanced SIMD) extensions
-
 config CRYPTO_SHA512_ARM
 	tristate "Hash functions: SHA-384 and SHA-512 (NEON)"
 	select CRYPTO_HASH
@@ -172,7 +127,6 @@ config CRYPTO_AES_ARM_BS
 	select CRYPTO_AES_ARM
 	select CRYPTO_SKCIPHER
 	select CRYPTO_LIB_AES
-	select CRYPTO_SIMD
 	help
 	  Length-preserving ciphers: AES cipher algorithms (FIPS-197)
 	  with block cipher modes:
@@ -200,7 +154,6 @@ config CRYPTO_AES_ARM_CE
 	depends on KERNEL_MODE_NEON
 	select CRYPTO_SKCIPHER
 	select CRYPTO_LIB_AES
-	select CRYPTO_SIMD
 	help
 	  Length-preserving ciphers: AES cipher algorithms (FIPS-197)
 	   with block cipher modes:
@@ -214,17 +167,5 @@ config CRYPTO_AES_ARM_CE
 	  Architecture: arm using:
 	  - ARMv8 Crypto Extensions
 
-config CRYPTO_CHACHA20_NEON
-	tristate
-	select CRYPTO_SKCIPHER
-	select CRYPTO_ARCH_HAVE_LIB_CHACHA
-	default CRYPTO_LIB_CHACHA_INTERNAL
-	help
-	  Length-preserving ciphers: ChaCha20, XChaCha20, and XChaCha12
-	  stream cipher algorithms
-
-	  Architecture: arm using:
-	  - NEON (Advanced SIMD) extensions
-
 endmenu
 
diff --git a/arch/arm/crypto/Makefile b/arch/arm/crypto/Makefile
index 3d0e23ff9e74..8479137c6e80 100644
--- a/arch/arm/crypto/Makefile
+++ b/arch/arm/crypto/Makefile
@@ -7,37 +7,25 @@ obj-$(CONFIG_CRYPTO_AES_ARM) += aes-arm.o
 obj-$(CONFIG_CRYPTO_AES_ARM_BS) += aes-arm-bs.o
 obj-$(CONFIG_CRYPTO_SHA1_ARM) += sha1-arm.o
 obj-$(CONFIG_CRYPTO_SHA1_ARM_NEON) += sha1-arm-neon.o
-obj-$(CONFIG_CRYPTO_SHA256_ARM) += sha256-arm.o
 obj-$(CONFIG_CRYPTO_SHA512_ARM) += sha512-arm.o
-obj-$(CONFIG_CRYPTO_BLAKE2S_ARM) += libblake2s-arm.o
 obj-$(CONFIG_CRYPTO_BLAKE2B_NEON) += blake2b-neon.o
-obj-$(CONFIG_CRYPTO_CHACHA20_NEON) += chacha-neon.o
-obj-$(CONFIG_CRYPTO_POLY1305_ARM) += poly1305-arm.o
 obj-$(CONFIG_CRYPTO_NHPOLY1305_NEON) += nhpoly1305-neon.o
 obj-$(CONFIG_CRYPTO_CURVE25519_NEON) += curve25519-neon.o
 
 obj-$(CONFIG_CRYPTO_AES_ARM_CE) += aes-arm-ce.o
 obj-$(CONFIG_CRYPTO_SHA1_ARM_CE) += sha1-arm-ce.o
-obj-$(CONFIG_CRYPTO_SHA2_ARM_CE) += sha2-arm-ce.o
 obj-$(CONFIG_CRYPTO_GHASH_ARM_CE) += ghash-arm-ce.o
 
 aes-arm-y	:= aes-cipher-core.o aes-cipher-glue.o
 aes-arm-bs-y	:= aes-neonbs-core.o aes-neonbs-glue.o
 sha1-arm-y	:= sha1-armv4-large.o sha1_glue.o
 sha1-arm-neon-y	:= sha1-armv7-neon.o sha1_neon_glue.o
-sha256-arm-neon-$(CONFIG_KERNEL_MODE_NEON) := sha256_neon_glue.o
-sha256-arm-y	:= sha256-core.o sha256_glue.o $(sha256-arm-neon-y)
 sha512-arm-neon-$(CONFIG_KERNEL_MODE_NEON) := sha512-neon-glue.o
 sha512-arm-y	:= sha512-core.o sha512-glue.o $(sha512-arm-neon-y)
-libblake2s-arm-y:= blake2s-core.o blake2s-glue.o
 blake2b-neon-y  := blake2b-neon-core.o blake2b-neon-glue.o
 sha1-arm-ce-y	:= sha1-ce-core.o sha1-ce-glue.o
-sha2-arm-ce-y	:= sha2-ce-core.o sha2-ce-glue.o
 aes-arm-ce-y	:= aes-ce-core.o aes-ce-glue.o
 ghash-arm-ce-y	:= ghash-ce-core.o ghash-ce-glue.o
-chacha-neon-y := chacha-scalar-core.o chacha-glue.o
-chacha-neon-$(CONFIG_KERNEL_MODE_NEON) += chacha-neon-core.o
-poly1305-arm-y := poly1305-core.o poly1305-glue.o
 nhpoly1305-neon-y := nh-neon-core.o nhpoly1305-neon-glue.o
 curve25519-neon-y := curve25519-core.o curve25519-glue.o
 
@@ -47,14 +35,8 @@ quiet_cmd_perl = PERL    $@
 $(obj)/%-core.S: $(src)/%-armv4.pl
 	$(call cmd,perl)
 
-clean-files += poly1305-core.S sha256-core.S sha512-core.S
+clean-files += sha512-core.S
 
 aflags-thumb2-$(CONFIG_THUMB2_KERNEL)  := -U__thumb2__ -D__thumb2__=1
 
-AFLAGS_sha256-core.o += $(aflags-thumb2-y)
 AFLAGS_sha512-core.o += $(aflags-thumb2-y)
-
-# massage the perlasm code a bit so we only get the NEON routine if we need it
-poly1305-aflags-$(CONFIG_CPU_V7) := -U__LINUX_ARM_ARCH__ -D__LINUX_ARM_ARCH__=5
-poly1305-aflags-$(CONFIG_KERNEL_MODE_NEON) := -U__LINUX_ARM_ARCH__ -D__LINUX_ARM_ARCH__=7
-AFLAGS_poly1305-core.o += $(poly1305-aflags-y) $(aflags-thumb2-y)
diff --git a/arch/arm/crypto/aes-ce-glue.c b/arch/arm/crypto/aes-ce-glue.c
index 1cf61f51e766..00591895d540 100644
--- a/arch/arm/crypto/aes-ce-glue.c
+++ b/arch/arm/crypto/aes-ce-glue.c
@@ -10,8 +10,6 @@
 #include <asm/simd.h>
 #include <linux/unaligned.h>
 #include <crypto/aes.h>
-#include <crypto/ctr.h>
-#include <crypto/internal/simd.h>
 #include <crypto/internal/skcipher.h>
 #include <crypto/scatterwalk.h>
 #include <linux/cpufeature.h>
@@ -418,29 +416,6 @@ static int ctr_encrypt(struct skcipher_request *req)
 	return err;
 }
 
-static void ctr_encrypt_one(struct crypto_skcipher *tfm, const u8 *src, u8 *dst)
-{
-	struct crypto_aes_ctx *ctx = crypto_skcipher_ctx(tfm);
-	unsigned long flags;
-
-	/*
-	 * Temporarily disable interrupts to avoid races where
-	 * cachelines are evicted when the CPU is interrupted
-	 * to do something else.
-	 */
-	local_irq_save(flags);
-	aes_encrypt(ctx, dst, src);
-	local_irq_restore(flags);
-}
-
-static int ctr_encrypt_sync(struct skcipher_request *req)
-{
-	if (!crypto_simd_usable())
-		return crypto_ctr_encrypt_walk(req, ctr_encrypt_one);
-
-	return ctr_encrypt(req);
-}
-
 static int xts_encrypt(struct skcipher_request *req)
 {
 	struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
@@ -586,10 +561,9 @@ static int xts_decrypt(struct skcipher_request *req)
 }
 
 static struct skcipher_alg aes_algs[] = { {
-	.base.cra_name		= "__ecb(aes)",
-	.base.cra_driver_name	= "__ecb-aes-ce",
+	.base.cra_name		= "ecb(aes)",
+	.base.cra_driver_name	= "ecb-aes-ce",
 	.base.cra_priority	= 300,
-	.base.cra_flags		= CRYPTO_ALG_INTERNAL,
 	.base.cra_blocksize	= AES_BLOCK_SIZE,
 	.base.cra_ctxsize	= sizeof(struct crypto_aes_ctx),
 	.base.cra_module	= THIS_MODULE,
@@ -600,10 +574,9 @@ static struct skcipher_alg aes_algs[] = { {
 	.encrypt		= ecb_encrypt,
 	.decrypt		= ecb_decrypt,
 }, {
-	.base.cra_name		= "__cbc(aes)",
-	.base.cra_driver_name	= "__cbc-aes-ce",
+	.base.cra_name		= "cbc(aes)",
+	.base.cra_driver_name	= "cbc-aes-ce",
 	.base.cra_priority	= 300,
-	.base.cra_flags		= CRYPTO_ALG_INTERNAL,
 	.base.cra_blocksize	= AES_BLOCK_SIZE,
 	.base.cra_ctxsize	= sizeof(struct crypto_aes_ctx),
 	.base.cra_module	= THIS_MODULE,
@@ -615,10 +588,9 @@ static struct skcipher_alg aes_algs[] = { {
 	.encrypt		= cbc_encrypt,
 	.decrypt		= cbc_decrypt,
 }, {
-	.base.cra_name		= "__cts(cbc(aes))",
-	.base.cra_driver_name	= "__cts-cbc-aes-ce",
+	.base.cra_name		= "cts(cbc(aes))",
+	.base.cra_driver_name	= "cts-cbc-aes-ce",
 	.base.cra_priority	= 300,
-	.base.cra_flags		= CRYPTO_ALG_INTERNAL,
 	.base.cra_blocksize	= AES_BLOCK_SIZE,
 	.base.cra_ctxsize	= sizeof(struct crypto_aes_ctx),
 	.base.cra_module	= THIS_MODULE,
@@ -631,10 +603,9 @@ static struct skcipher_alg aes_algs[] = { {
 	.encrypt		= cts_cbc_encrypt,
 	.decrypt		= cts_cbc_decrypt,
 }, {
-	.base.cra_name		= "__ctr(aes)",
-	.base.cra_driver_name	= "__ctr-aes-ce",
+	.base.cra_name		= "ctr(aes)",
+	.base.cra_driver_name	= "ctr-aes-ce",
 	.base.cra_priority	= 300,
-	.base.cra_flags		= CRYPTO_ALG_INTERNAL,
 	.base.cra_blocksize	= 1,
 	.base.cra_ctxsize	= sizeof(struct crypto_aes_ctx),
 	.base.cra_module	= THIS_MODULE,
@@ -647,25 +618,9 @@ static struct skcipher_alg aes_algs[] = { {
 	.encrypt		= ctr_encrypt,
 	.decrypt		= ctr_encrypt,
 }, {
-	.base.cra_name		= "ctr(aes)",
-	.base.cra_driver_name	= "ctr-aes-ce-sync",
-	.base.cra_priority	= 300 - 1,
-	.base.cra_blocksize	= 1,
-	.base.cra_ctxsize	= sizeof(struct crypto_aes_ctx),
-	.base.cra_module	= THIS_MODULE,
-
-	.min_keysize		= AES_MIN_KEY_SIZE,
-	.max_keysize		= AES_MAX_KEY_SIZE,
-	.ivsize			= AES_BLOCK_SIZE,
-	.chunksize		= AES_BLOCK_SIZE,
-	.setkey			= ce_aes_setkey,
-	.encrypt		= ctr_encrypt_sync,
-	.decrypt		= ctr_encrypt_sync,
-}, {
-	.base.cra_name		= "__xts(aes)",
-	.base.cra_driver_name	= "__xts-aes-ce",
+	.base.cra_name		= "xts(aes)",
+	.base.cra_driver_name	= "xts-aes-ce",
 	.base.cra_priority	= 300,
-	.base.cra_flags		= CRYPTO_ALG_INTERNAL,
 	.base.cra_blocksize	= AES_BLOCK_SIZE,
 	.base.cra_ctxsize	= sizeof(struct crypto_aes_xts_ctx),
 	.base.cra_module	= THIS_MODULE,
@@ -679,51 +634,14 @@ static struct skcipher_alg aes_algs[] = { {
 	.decrypt		= xts_decrypt,
 } };
 
-static struct simd_skcipher_alg *aes_simd_algs[ARRAY_SIZE(aes_algs)];
-
 static void aes_exit(void)
 {
-	int i;
-
-	for (i = 0; i < ARRAY_SIZE(aes_simd_algs) && aes_simd_algs[i]; i++)
-		simd_skcipher_free(aes_simd_algs[i]);
-
 	crypto_unregister_skciphers(aes_algs, ARRAY_SIZE(aes_algs));
 }
 
 static int __init aes_init(void)
 {
-	struct simd_skcipher_alg *simd;
-	const char *basename;
-	const char *algname;
-	const char *drvname;
-	int err;
-	int i;
-
-	err = crypto_register_skciphers(aes_algs, ARRAY_SIZE(aes_algs));
-	if (err)
-		return err;
-
-	for (i = 0; i < ARRAY_SIZE(aes_algs); i++) {
-		if (!(aes_algs[i].base.cra_flags & CRYPTO_ALG_INTERNAL))
-			continue;
-
-		algname = aes_algs[i].base.cra_name + 2;
-		drvname = aes_algs[i].base.cra_driver_name + 2;
-		basename = aes_algs[i].base.cra_driver_name;
-		simd = simd_skcipher_create_compat(aes_algs + i, algname, drvname, basename);
-		err = PTR_ERR(simd);
-		if (IS_ERR(simd))
-			goto unregister_simds;
-
-		aes_simd_algs[i] = simd;
-	}
-
-	return 0;
-
-unregister_simds:
-	aes_exit();
-	return err;
+	return crypto_register_skciphers(aes_algs, ARRAY_SIZE(aes_algs));
 }
 
 module_cpu_feature_match(AES, aes_init);
diff --git a/arch/arm/crypto/aes-neonbs-glue.c b/arch/arm/crypto/aes-neonbs-glue.c
index f6be80b5938b..c60104dc1585 100644
--- a/arch/arm/crypto/aes-neonbs-glue.c
+++ b/arch/arm/crypto/aes-neonbs-glue.c
@@ -8,8 +8,6 @@
 #include <asm/neon.h>
 #include <asm/simd.h>
 #include <crypto/aes.h>
-#include <crypto/ctr.h>
-#include <crypto/internal/simd.h>
 #include <crypto/internal/skcipher.h>
 #include <crypto/scatterwalk.h>
 #include <crypto/xts.h>
@@ -59,11 +57,6 @@ struct aesbs_xts_ctx {
 	struct crypto_aes_ctx	tweak_key;
 };
 
-struct aesbs_ctr_ctx {
-	struct aesbs_ctx	key;		/* must be first member */
-	struct crypto_aes_ctx	fallback;
-};
-
 static int aesbs_setkey(struct crypto_skcipher *tfm, const u8 *in_key,
 			unsigned int key_len)
 {
@@ -200,25 +193,6 @@ static int cbc_decrypt(struct skcipher_request *req)
 	return err;
 }
 
-static int aesbs_ctr_setkey_sync(struct crypto_skcipher *tfm, const u8 *in_key,
-				 unsigned int key_len)
-{
-	struct aesbs_ctr_ctx *ctx = crypto_skcipher_ctx(tfm);
-	int err;
-
-	err = aes_expandkey(&ctx->fallback, in_key, key_len);
-	if (err)
-		return err;
-
-	ctx->key.rounds = 6 + key_len / 4;
-
-	kernel_neon_begin();
-	aesbs_convert_key(ctx->key.rk, ctx->fallback.key_enc, ctx->key.rounds);
-	kernel_neon_end();
-
-	return 0;
-}
-
 static int ctr_encrypt(struct skcipher_request *req)
 {
 	struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
@@ -254,21 +228,6 @@ static int ctr_encrypt(struct skcipher_request *req)
 	return err;
 }
 
-static void ctr_encrypt_one(struct crypto_skcipher *tfm, const u8 *src, u8 *dst)
-{
-	struct aesbs_ctr_ctx *ctx = crypto_skcipher_ctx(tfm);
-
-	__aes_arm_encrypt(ctx->fallback.key_enc, ctx->key.rounds, src, dst);
-}
-
-static int ctr_encrypt_sync(struct skcipher_request *req)
-{
-	if (!crypto_simd_usable())
-		return crypto_ctr_encrypt_walk(req, ctr_encrypt_one);
-
-	return ctr_encrypt(req);
-}
-
 static int aesbs_xts_setkey(struct crypto_skcipher *tfm, const u8 *in_key,
 			    unsigned int key_len)
 {
@@ -374,13 +333,12 @@ static int xts_decrypt(struct skcipher_request *req)
 }
 
 static struct skcipher_alg aes_algs[] = { {
-	.base.cra_name		= "__ecb(aes)",
-	.base.cra_driver_name	= "__ecb-aes-neonbs",
+	.base.cra_name		= "ecb(aes)",
+	.base.cra_driver_name	= "ecb-aes-neonbs",
 	.base.cra_priority	= 250,
 	.base.cra_blocksize	= AES_BLOCK_SIZE,
 	.base.cra_ctxsize	= sizeof(struct aesbs_ctx),
 	.base.cra_module	= THIS_MODULE,
-	.base.cra_flags		= CRYPTO_ALG_INTERNAL,
 
 	.min_keysize		= AES_MIN_KEY_SIZE,
 	.max_keysize		= AES_MAX_KEY_SIZE,
@@ -389,13 +347,12 @@ static struct skcipher_alg aes_algs[] = { {
 	.encrypt		= ecb_encrypt,
 	.decrypt		= ecb_decrypt,
 }, {
-	.base.cra_name		= "__cbc(aes)",
-	.base.cra_driver_name	= "__cbc-aes-neonbs",
+	.base.cra_name		= "cbc(aes)",
+	.base.cra_driver_name	= "cbc-aes-neonbs",
 	.base.cra_priority	= 250,
 	.base.cra_blocksize	= AES_BLOCK_SIZE,
 	.base.cra_ctxsize	= sizeof(struct aesbs_cbc_ctx),
 	.base.cra_module	= THIS_MODULE,
-	.base.cra_flags		= CRYPTO_ALG_INTERNAL,
 
 	.min_keysize		= AES_MIN_KEY_SIZE,
 	.max_keysize		= AES_MAX_KEY_SIZE,
@@ -405,13 +362,12 @@ static struct skcipher_alg aes_algs[] = { {
 	.encrypt		= cbc_encrypt,
 	.decrypt		= cbc_decrypt,
 }, {
-	.base.cra_name		= "__ctr(aes)",
-	.base.cra_driver_name	= "__ctr-aes-neonbs",
+	.base.cra_name		= "ctr(aes)",
+	.base.cra_driver_name	= "ctr-aes-neonbs",
 	.base.cra_priority	= 250,
 	.base.cra_blocksize	= 1,
 	.base.cra_ctxsize	= sizeof(struct aesbs_ctx),
 	.base.cra_module	= THIS_MODULE,
-	.base.cra_flags		= CRYPTO_ALG_INTERNAL,
 
 	.min_keysize		= AES_MIN_KEY_SIZE,
 	.max_keysize		= AES_MAX_KEY_SIZE,
@@ -422,29 +378,12 @@ static struct skcipher_alg aes_algs[] = { {
 	.encrypt		= ctr_encrypt,
 	.decrypt		= ctr_encrypt,
 }, {
-	.base.cra_name		= "ctr(aes)",
-	.base.cra_driver_name	= "ctr-aes-neonbs-sync",
-	.base.cra_priority	= 250 - 1,
-	.base.cra_blocksize	= 1,
-	.base.cra_ctxsize	= sizeof(struct aesbs_ctr_ctx),
-	.base.cra_module	= THIS_MODULE,
-
-	.min_keysize		= AES_MIN_KEY_SIZE,
-	.max_keysize		= AES_MAX_KEY_SIZE,
-	.chunksize		= AES_BLOCK_SIZE,
-	.walksize		= 8 * AES_BLOCK_SIZE,
-	.ivsize			= AES_BLOCK_SIZE,
-	.setkey			= aesbs_ctr_setkey_sync,
-	.encrypt		= ctr_encrypt_sync,
-	.decrypt		= ctr_encrypt_sync,
-}, {
-	.base.cra_name		= "__xts(aes)",
-	.base.cra_driver_name	= "__xts-aes-neonbs",
+	.base.cra_name		= "xts(aes)",
+	.base.cra_driver_name	= "xts-aes-neonbs",
 	.base.cra_priority	= 250,
 	.base.cra_blocksize	= AES_BLOCK_SIZE,
 	.base.cra_ctxsize	= sizeof(struct aesbs_xts_ctx),
 	.base.cra_module	= THIS_MODULE,
-	.base.cra_flags		= CRYPTO_ALG_INTERNAL,
 
 	.min_keysize		= 2 * AES_MIN_KEY_SIZE,
 	.max_keysize		= 2 * AES_MAX_KEY_SIZE,
@@ -455,55 +394,18 @@ static struct skcipher_alg aes_algs[] = { {
 	.decrypt		= xts_decrypt,
 } };
 
-static struct simd_skcipher_alg *aes_simd_algs[ARRAY_SIZE(aes_algs)];
-
 static void aes_exit(void)
 {
-	int i;
-
-	for (i = 0; i < ARRAY_SIZE(aes_simd_algs); i++)
-		if (aes_simd_algs[i])
-			simd_skcipher_free(aes_simd_algs[i]);
-
 	crypto_unregister_skciphers(aes_algs, ARRAY_SIZE(aes_algs));
 }
 
 static int __init aes_init(void)
 {
-	struct simd_skcipher_alg *simd;
-	const char *basename;
-	const char *algname;
-	const char *drvname;
-	int err;
-	int i;
-
 	if (!(elf_hwcap & HWCAP_NEON))
 		return -ENODEV;
 
-	err = crypto_register_skciphers(aes_algs, ARRAY_SIZE(aes_algs));
-	if (err)
-		return err;
-
-	for (i = 0; i < ARRAY_SIZE(aes_algs); i++) {
-		if (!(aes_algs[i].base.cra_flags & CRYPTO_ALG_INTERNAL))
-			continue;
-
-		algname = aes_algs[i].base.cra_name + 2;
-		drvname = aes_algs[i].base.cra_driver_name + 2;
-		basename = aes_algs[i].base.cra_driver_name;
-		simd = simd_skcipher_create_compat(aes_algs + i, algname, drvname, basename);
-		err = PTR_ERR(simd);
-		if (IS_ERR(simd))
-			goto unregister_simds;
-
-		aes_simd_algs[i] = simd;
-	}
-	return 0;
-
-unregister_simds:
-	aes_exit();
-	return err;
+	return crypto_register_skciphers(aes_algs, ARRAY_SIZE(aes_algs));
 }
 
-late_initcall(aes_init);
+module_init(aes_init);
 module_exit(aes_exit);
diff --git a/arch/arm/crypto/blake2b-neon-glue.c b/arch/arm/crypto/blake2b-neon-glue.c
index 4b59d027ba4a..2ff443a91724 100644
--- a/arch/arm/crypto/blake2b-neon-glue.c
+++ b/arch/arm/crypto/blake2b-neon-glue.c
@@ -7,7 +7,6 @@
 
 #include <crypto/internal/blake2b.h>
 #include <crypto/internal/hash.h>
-#include <crypto/internal/simd.h>
 
 #include <linux/module.h>
 #include <linux/sizes.h>
@@ -21,11 +20,6 @@ asmlinkage void blake2b_compress_neon(struct blake2b_state *state,
 static void blake2b_compress_arch(struct blake2b_state *state,
 				  const u8 *block, size_t nblocks, u32 inc)
 {
-	if (!crypto_simd_usable()) {
-		blake2b_compress_generic(state, block, nblocks, inc);
-		return;
-	}
-
 	do {
 		const size_t blocks = min_t(size_t, nblocks,
 					    SZ_4K / BLAKE2B_BLOCK_SIZE);
@@ -42,12 +36,14 @@ static void blake2b_compress_arch(struct blake2b_state *state,
 static int crypto_blake2b_update_neon(struct shash_desc *desc,
 				      const u8 *in, unsigned int inlen)
 {
-	return crypto_blake2b_update(desc, in, inlen, blake2b_compress_arch);
+	return crypto_blake2b_update_bo(desc, in, inlen, blake2b_compress_arch);
 }
 
-static int crypto_blake2b_final_neon(struct shash_desc *desc, u8 *out)
+static int crypto_blake2b_finup_neon(struct shash_desc *desc, const u8 *in,
+				     unsigned int inlen, u8 *out)
 {
-	return crypto_blake2b_final(desc, out, blake2b_compress_arch);
+	return crypto_blake2b_finup(desc, in, inlen, out,
+				    blake2b_compress_arch);
 }
 
 #define BLAKE2B_ALG(name, driver_name, digest_size)			\
@@ -55,7 +51,9 @@ static int crypto_blake2b_final_neon(struct shash_desc *desc, u8 *out)
 		.base.cra_name		= name,				\
 		.base.cra_driver_name	= driver_name,			\
 		.base.cra_priority	= 200,				\
-		.base.cra_flags		= CRYPTO_ALG_OPTIONAL_KEY,	\
+		.base.cra_flags		= CRYPTO_ALG_OPTIONAL_KEY |	\
+					  CRYPTO_AHASH_ALG_BLOCK_ONLY |	\
+					  CRYPTO_AHASH_ALG_FINAL_NONZERO, \
 		.base.cra_blocksize	= BLAKE2B_BLOCK_SIZE,		\
 		.base.cra_ctxsize	= sizeof(struct blake2b_tfm_ctx), \
 		.base.cra_module	= THIS_MODULE,			\
@@ -63,8 +61,9 @@ static int crypto_blake2b_final_neon(struct shash_desc *desc, u8 *out)
 		.setkey			= crypto_blake2b_setkey,	\
 		.init			= crypto_blake2b_init,		\
 		.update			= crypto_blake2b_update_neon,	\
-		.final			= crypto_blake2b_final_neon,	\
+		.finup			= crypto_blake2b_finup_neon,	\
 		.descsize		= sizeof(struct blake2b_state),	\
+		.statesize		= BLAKE2B_STATE_SIZE,		\
 	}
 
 static struct shash_alg blake2b_neon_algs[] = {
diff --git a/arch/arm/crypto/chacha-glue.c b/arch/arm/crypto/chacha-glue.c
deleted file mode 100644
index 50e635512046..000000000000
--- a/arch/arm/crypto/chacha-glue.c
+++ /dev/null
@@ -1,352 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * ARM NEON accelerated ChaCha and XChaCha stream ciphers,
- * including ChaCha20 (RFC7539)
- *
- * Copyright (C) 2016-2019 Linaro, Ltd. <ard.biesheuvel@linaro.org>
- * Copyright (C) 2015 Martin Willi
- */
-
-#include <crypto/algapi.h>
-#include <crypto/internal/chacha.h>
-#include <crypto/internal/simd.h>
-#include <crypto/internal/skcipher.h>
-#include <linux/jump_label.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-
-#include <asm/cputype.h>
-#include <asm/hwcap.h>
-#include <asm/neon.h>
-#include <asm/simd.h>
-
-asmlinkage void chacha_block_xor_neon(const u32 *state, u8 *dst, const u8 *src,
-				      int nrounds);
-asmlinkage void chacha_4block_xor_neon(const u32 *state, u8 *dst, const u8 *src,
-				       int nrounds, unsigned int nbytes);
-asmlinkage void hchacha_block_arm(const u32 *state, u32 *out, int nrounds);
-asmlinkage void hchacha_block_neon(const u32 *state, u32 *out, int nrounds);
-
-asmlinkage void chacha_doarm(u8 *dst, const u8 *src, unsigned int bytes,
-			     const u32 *state, int nrounds);
-
-static __ro_after_init DEFINE_STATIC_KEY_FALSE(use_neon);
-
-static inline bool neon_usable(void)
-{
-	return static_branch_likely(&use_neon) && crypto_simd_usable();
-}
-
-static void chacha_doneon(u32 *state, u8 *dst, const u8 *src,
-			  unsigned int bytes, int nrounds)
-{
-	u8 buf[CHACHA_BLOCK_SIZE];
-
-	while (bytes > CHACHA_BLOCK_SIZE) {
-		unsigned int l = min(bytes, CHACHA_BLOCK_SIZE * 4U);
-
-		chacha_4block_xor_neon(state, dst, src, nrounds, l);
-		bytes -= l;
-		src += l;
-		dst += l;
-		state[12] += DIV_ROUND_UP(l, CHACHA_BLOCK_SIZE);
-	}
-	if (bytes) {
-		const u8 *s = src;
-		u8 *d = dst;
-
-		if (bytes != CHACHA_BLOCK_SIZE)
-			s = d = memcpy(buf, src, bytes);
-		chacha_block_xor_neon(state, d, s, nrounds);
-		if (d != dst)
-			memcpy(dst, buf, bytes);
-		state[12]++;
-	}
-}
-
-void hchacha_block_arch(const u32 *state, u32 *stream, int nrounds)
-{
-	if (!IS_ENABLED(CONFIG_KERNEL_MODE_NEON) || !neon_usable()) {
-		hchacha_block_arm(state, stream, nrounds);
-	} else {
-		kernel_neon_begin();
-		hchacha_block_neon(state, stream, nrounds);
-		kernel_neon_end();
-	}
-}
-EXPORT_SYMBOL(hchacha_block_arch);
-
-void chacha_crypt_arch(u32 *state, u8 *dst, const u8 *src, unsigned int bytes,
-		       int nrounds)
-{
-	if (!IS_ENABLED(CONFIG_KERNEL_MODE_NEON) || !neon_usable() ||
-	    bytes <= CHACHA_BLOCK_SIZE) {
-		chacha_doarm(dst, src, bytes, state, nrounds);
-		state[12] += DIV_ROUND_UP(bytes, CHACHA_BLOCK_SIZE);
-		return;
-	}
-
-	do {
-		unsigned int todo = min_t(unsigned int, bytes, SZ_4K);
-
-		kernel_neon_begin();
-		chacha_doneon(state, dst, src, todo, nrounds);
-		kernel_neon_end();
-
-		bytes -= todo;
-		src += todo;
-		dst += todo;
-	} while (bytes);
-}
-EXPORT_SYMBOL(chacha_crypt_arch);
-
-static int chacha_stream_xor(struct skcipher_request *req,
-			     const struct chacha_ctx *ctx, const u8 *iv,
-			     bool neon)
-{
-	struct skcipher_walk walk;
-	u32 state[16];
-	int err;
-
-	err = skcipher_walk_virt(&walk, req, false);
-
-	chacha_init(state, ctx->key, iv);
-
-	while (walk.nbytes > 0) {
-		unsigned int nbytes = walk.nbytes;
-
-		if (nbytes < walk.total)
-			nbytes = round_down(nbytes, walk.stride);
-
-		if (!IS_ENABLED(CONFIG_KERNEL_MODE_NEON) || !neon) {
-			chacha_doarm(walk.dst.virt.addr, walk.src.virt.addr,
-				     nbytes, state, ctx->nrounds);
-			state[12] += DIV_ROUND_UP(nbytes, CHACHA_BLOCK_SIZE);
-		} else {
-			kernel_neon_begin();
-			chacha_doneon(state, walk.dst.virt.addr,
-				      walk.src.virt.addr, nbytes, ctx->nrounds);
-			kernel_neon_end();
-		}
-		err = skcipher_walk_done(&walk, walk.nbytes - nbytes);
-	}
-
-	return err;
-}
-
-static int do_chacha(struct skcipher_request *req, bool neon)
-{
-	struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
-	struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm);
-
-	return chacha_stream_xor(req, ctx, req->iv, neon);
-}
-
-static int chacha_arm(struct skcipher_request *req)
-{
-	return do_chacha(req, false);
-}
-
-static int chacha_neon(struct skcipher_request *req)
-{
-	return do_chacha(req, neon_usable());
-}
-
-static int do_xchacha(struct skcipher_request *req, bool neon)
-{
-	struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
-	struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm);
-	struct chacha_ctx subctx;
-	u32 state[16];
-	u8 real_iv[16];
-
-	chacha_init(state, ctx->key, req->iv);
-
-	if (!IS_ENABLED(CONFIG_KERNEL_MODE_NEON) || !neon) {
-		hchacha_block_arm(state, subctx.key, ctx->nrounds);
-	} else {
-		kernel_neon_begin();
-		hchacha_block_neon(state, subctx.key, ctx->nrounds);
-		kernel_neon_end();
-	}
-	subctx.nrounds = ctx->nrounds;
-
-	memcpy(&real_iv[0], req->iv + 24, 8);
-	memcpy(&real_iv[8], req->iv + 16, 8);
-	return chacha_stream_xor(req, &subctx, real_iv, neon);
-}
-
-static int xchacha_arm(struct skcipher_request *req)
-{
-	return do_xchacha(req, false);
-}
-
-static int xchacha_neon(struct skcipher_request *req)
-{
-	return do_xchacha(req, neon_usable());
-}
-
-static struct skcipher_alg arm_algs[] = {
-	{
-		.base.cra_name		= "chacha20",
-		.base.cra_driver_name	= "chacha20-arm",
-		.base.cra_priority	= 200,
-		.base.cra_blocksize	= 1,
-		.base.cra_ctxsize	= sizeof(struct chacha_ctx),
-		.base.cra_module	= THIS_MODULE,
-
-		.min_keysize		= CHACHA_KEY_SIZE,
-		.max_keysize		= CHACHA_KEY_SIZE,
-		.ivsize			= CHACHA_IV_SIZE,
-		.chunksize		= CHACHA_BLOCK_SIZE,
-		.setkey			= chacha20_setkey,
-		.encrypt		= chacha_arm,
-		.decrypt		= chacha_arm,
-	}, {
-		.base.cra_name		= "xchacha20",
-		.base.cra_driver_name	= "xchacha20-arm",
-		.base.cra_priority	= 200,
-		.base.cra_blocksize	= 1,
-		.base.cra_ctxsize	= sizeof(struct chacha_ctx),
-		.base.cra_module	= THIS_MODULE,
-
-		.min_keysize		= CHACHA_KEY_SIZE,
-		.max_keysize		= CHACHA_KEY_SIZE,
-		.ivsize			= XCHACHA_IV_SIZE,
-		.chunksize		= CHACHA_BLOCK_SIZE,
-		.setkey			= chacha20_setkey,
-		.encrypt		= xchacha_arm,
-		.decrypt		= xchacha_arm,
-	}, {
-		.base.cra_name		= "xchacha12",
-		.base.cra_driver_name	= "xchacha12-arm",
-		.base.cra_priority	= 200,
-		.base.cra_blocksize	= 1,
-		.base.cra_ctxsize	= sizeof(struct chacha_ctx),
-		.base.cra_module	= THIS_MODULE,
-
-		.min_keysize		= CHACHA_KEY_SIZE,
-		.max_keysize		= CHACHA_KEY_SIZE,
-		.ivsize			= XCHACHA_IV_SIZE,
-		.chunksize		= CHACHA_BLOCK_SIZE,
-		.setkey			= chacha12_setkey,
-		.encrypt		= xchacha_arm,
-		.decrypt		= xchacha_arm,
-	},
-};
-
-static struct skcipher_alg neon_algs[] = {
-	{
-		.base.cra_name		= "chacha20",
-		.base.cra_driver_name	= "chacha20-neon",
-		.base.cra_priority	= 300,
-		.base.cra_blocksize	= 1,
-		.base.cra_ctxsize	= sizeof(struct chacha_ctx),
-		.base.cra_module	= THIS_MODULE,
-
-		.min_keysize		= CHACHA_KEY_SIZE,
-		.max_keysize		= CHACHA_KEY_SIZE,
-		.ivsize			= CHACHA_IV_SIZE,
-		.chunksize		= CHACHA_BLOCK_SIZE,
-		.walksize		= 4 * CHACHA_BLOCK_SIZE,
-		.setkey			= chacha20_setkey,
-		.encrypt		= chacha_neon,
-		.decrypt		= chacha_neon,
-	}, {
-		.base.cra_name		= "xchacha20",
-		.base.cra_driver_name	= "xchacha20-neon",
-		.base.cra_priority	= 300,
-		.base.cra_blocksize	= 1,
-		.base.cra_ctxsize	= sizeof(struct chacha_ctx),
-		.base.cra_module	= THIS_MODULE,
-
-		.min_keysize		= CHACHA_KEY_SIZE,
-		.max_keysize		= CHACHA_KEY_SIZE,
-		.ivsize			= XCHACHA_IV_SIZE,
-		.chunksize		= CHACHA_BLOCK_SIZE,
-		.walksize		= 4 * CHACHA_BLOCK_SIZE,
-		.setkey			= chacha20_setkey,
-		.encrypt		= xchacha_neon,
-		.decrypt		= xchacha_neon,
-	}, {
-		.base.cra_name		= "xchacha12",
-		.base.cra_driver_name	= "xchacha12-neon",
-		.base.cra_priority	= 300,
-		.base.cra_blocksize	= 1,
-		.base.cra_ctxsize	= sizeof(struct chacha_ctx),
-		.base.cra_module	= THIS_MODULE,
-
-		.min_keysize		= CHACHA_KEY_SIZE,
-		.max_keysize		= CHACHA_KEY_SIZE,
-		.ivsize			= XCHACHA_IV_SIZE,
-		.chunksize		= CHACHA_BLOCK_SIZE,
-		.walksize		= 4 * CHACHA_BLOCK_SIZE,
-		.setkey			= chacha12_setkey,
-		.encrypt		= xchacha_neon,
-		.decrypt		= xchacha_neon,
-	}
-};
-
-static int __init chacha_simd_mod_init(void)
-{
-	int err = 0;
-
-	if (IS_REACHABLE(CONFIG_CRYPTO_SKCIPHER)) {
-		err = crypto_register_skciphers(arm_algs, ARRAY_SIZE(arm_algs));
-		if (err)
-			return err;
-	}
-
-	if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && (elf_hwcap & HWCAP_NEON)) {
-		int i;
-
-		switch (read_cpuid_part()) {
-		case ARM_CPU_PART_CORTEX_A7:
-		case ARM_CPU_PART_CORTEX_A5:
-			/*
-			 * The Cortex-A7 and Cortex-A5 do not perform well with
-			 * the NEON implementation but do incredibly with the
-			 * scalar one and use less power.
-			 */
-			for (i = 0; i < ARRAY_SIZE(neon_algs); i++)
-				neon_algs[i].base.cra_priority = 0;
-			break;
-		default:
-			static_branch_enable(&use_neon);
-		}
-
-		if (IS_REACHABLE(CONFIG_CRYPTO_SKCIPHER)) {
-			err = crypto_register_skciphers(neon_algs, ARRAY_SIZE(neon_algs));
-			if (err)
-				crypto_unregister_skciphers(arm_algs, ARRAY_SIZE(arm_algs));
-		}
-	}
-	return err;
-}
-
-static void __exit chacha_simd_mod_fini(void)
-{
-	if (IS_REACHABLE(CONFIG_CRYPTO_SKCIPHER)) {
-		crypto_unregister_skciphers(arm_algs, ARRAY_SIZE(arm_algs));
-		if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && (elf_hwcap & HWCAP_NEON))
-			crypto_unregister_skciphers(neon_algs, ARRAY_SIZE(neon_algs));
-	}
-}
-
-module_init(chacha_simd_mod_init);
-module_exit(chacha_simd_mod_fini);
-
-MODULE_DESCRIPTION("ChaCha and XChaCha stream ciphers (scalar and NEON accelerated)");
-MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
-MODULE_LICENSE("GPL v2");
-MODULE_ALIAS_CRYPTO("chacha20");
-MODULE_ALIAS_CRYPTO("chacha20-arm");
-MODULE_ALIAS_CRYPTO("xchacha20");
-MODULE_ALIAS_CRYPTO("xchacha20-arm");
-MODULE_ALIAS_CRYPTO("xchacha12");
-MODULE_ALIAS_CRYPTO("xchacha12-arm");
-#ifdef CONFIG_KERNEL_MODE_NEON
-MODULE_ALIAS_CRYPTO("chacha20-neon");
-MODULE_ALIAS_CRYPTO("xchacha20-neon");
-MODULE_ALIAS_CRYPTO("xchacha12-neon");
-#endif
diff --git a/arch/arm/crypto/ghash-ce-glue.c b/arch/arm/crypto/ghash-ce-glue.c
index aabfcf522a2c..a52dcc8c1e33 100644
--- a/arch/arm/crypto/ghash-ce-glue.c
+++ b/arch/arm/crypto/ghash-ce-glue.c
@@ -8,22 +8,22 @@
 
 #include <asm/hwcap.h>
 #include <asm/neon.h>
-#include <asm/simd.h>
-#include <linux/unaligned.h>
 #include <crypto/aes.h>
-#include <crypto/gcm.h>
 #include <crypto/b128ops.h>
-#include <crypto/cryptd.h>
+#include <crypto/gcm.h>
+#include <crypto/gf128mul.h>
+#include <crypto/ghash.h>
 #include <crypto/internal/aead.h>
 #include <crypto/internal/hash.h>
-#include <crypto/internal/simd.h>
 #include <crypto/internal/skcipher.h>
-#include <crypto/gf128mul.h>
 #include <crypto/scatterwalk.h>
 #include <linux/cpufeature.h>
-#include <linux/crypto.h>
+#include <linux/errno.h>
 #include <linux/jump_label.h>
+#include <linux/kernel.h>
 #include <linux/module.h>
+#include <linux/string.h>
+#include <linux/unaligned.h>
 
 MODULE_DESCRIPTION("GHASH hash function using ARMv8 Crypto Extensions");
 MODULE_AUTHOR("Ard Biesheuvel <ardb@kernel.org>");
@@ -32,9 +32,6 @@ MODULE_ALIAS_CRYPTO("ghash");
 MODULE_ALIAS_CRYPTO("gcm(aes)");
 MODULE_ALIAS_CRYPTO("rfc4106(gcm(aes))");
 
-#define GHASH_BLOCK_SIZE	16
-#define GHASH_DIGEST_SIZE	16
-
 #define RFC4106_NONCE_SIZE	4
 
 struct ghash_key {
@@ -49,10 +46,8 @@ struct gcm_key {
 	u8	nonce[];	// for RFC4106 nonce
 };
 
-struct ghash_desc_ctx {
+struct arm_ghash_desc_ctx {
 	u64 digest[GHASH_DIGEST_SIZE/sizeof(u64)];
-	u8 buf[GHASH_BLOCK_SIZE];
-	u32 count;
 };
 
 asmlinkage void pmull_ghash_update_p64(int blocks, u64 dg[], const char *src,
@@ -65,9 +60,9 @@ static __ro_after_init DEFINE_STATIC_KEY_FALSE(use_p64);
 
 static int ghash_init(struct shash_desc *desc)
 {
-	struct ghash_desc_ctx *ctx = shash_desc_ctx(desc);
+	struct arm_ghash_desc_ctx *ctx = shash_desc_ctx(desc);
 
-	*ctx = (struct ghash_desc_ctx){};
+	*ctx = (struct arm_ghash_desc_ctx){};
 	return 0;
 }
 
@@ -85,52 +80,49 @@ static void ghash_do_update(int blocks, u64 dg[], const char *src,
 static int ghash_update(struct shash_desc *desc, const u8 *src,
 			unsigned int len)
 {
-	struct ghash_desc_ctx *ctx = shash_desc_ctx(desc);
-	unsigned int partial = ctx->count % GHASH_BLOCK_SIZE;
-
-	ctx->count += len;
+	struct ghash_key *key = crypto_shash_ctx(desc->tfm);
+	struct arm_ghash_desc_ctx *ctx = shash_desc_ctx(desc);
+	int blocks;
 
-	if ((partial + len) >= GHASH_BLOCK_SIZE) {
-		struct ghash_key *key = crypto_shash_ctx(desc->tfm);
-		int blocks;
+	blocks = len / GHASH_BLOCK_SIZE;
+	ghash_do_update(blocks, ctx->digest, src, key, NULL);
+	return len - blocks * GHASH_BLOCK_SIZE;
+}
 
-		if (partial) {
-			int p = GHASH_BLOCK_SIZE - partial;
+static int ghash_export(struct shash_desc *desc, void *out)
+{
+	struct arm_ghash_desc_ctx *ctx = shash_desc_ctx(desc);
+	u8 *dst = out;
 
-			memcpy(ctx->buf + partial, src, p);
-			src += p;
-			len -= p;
-		}
+	put_unaligned_be64(ctx->digest[1], dst);
+	put_unaligned_be64(ctx->digest[0], dst + 8);
+	return 0;
+}
 
-		blocks = len / GHASH_BLOCK_SIZE;
-		len %= GHASH_BLOCK_SIZE;
+static int ghash_import(struct shash_desc *desc, const void *in)
+{
+	struct arm_ghash_desc_ctx *ctx = shash_desc_ctx(desc);
+	const u8 *src = in;
 
-		ghash_do_update(blocks, ctx->digest, src, key,
-				partial ? ctx->buf : NULL);
-		src += blocks * GHASH_BLOCK_SIZE;
-		partial = 0;
-	}
-	if (len)
-		memcpy(ctx->buf + partial, src, len);
+	ctx->digest[1] = get_unaligned_be64(src);
+	ctx->digest[0] = get_unaligned_be64(src + 8);
 	return 0;
 }
 
-static int ghash_final(struct shash_desc *desc, u8 *dst)
+static int ghash_finup(struct shash_desc *desc, const u8 *src,
+		       unsigned int len, u8 *dst)
 {
-	struct ghash_desc_ctx *ctx = shash_desc_ctx(desc);
-	unsigned int partial = ctx->count % GHASH_BLOCK_SIZE;
+	struct ghash_key *key = crypto_shash_ctx(desc->tfm);
+	struct arm_ghash_desc_ctx *ctx = shash_desc_ctx(desc);
 
-	if (partial) {
-		struct ghash_key *key = crypto_shash_ctx(desc->tfm);
+	if (len) {
+		u8 buf[GHASH_BLOCK_SIZE] = {};
 
-		memset(ctx->buf + partial, 0, GHASH_BLOCK_SIZE - partial);
-		ghash_do_update(1, ctx->digest, ctx->buf, key, NULL);
+		memcpy(buf, src, len);
+		ghash_do_update(1, ctx->digest, buf, key, NULL);
+		memzero_explicit(buf, sizeof(buf));
 	}
-	put_unaligned_be64(ctx->digest[1], dst);
-	put_unaligned_be64(ctx->digest[0], dst + 8);
-
-	*ctx = (struct ghash_desc_ctx){};
-	return 0;
+	return ghash_export(desc, dst);
 }
 
 static void ghash_reflect(u64 h[], const be128 *k)
@@ -175,13 +167,17 @@ static struct shash_alg ghash_alg = {
 	.digestsize		= GHASH_DIGEST_SIZE,
 	.init			= ghash_init,
 	.update			= ghash_update,
-	.final			= ghash_final,
+	.finup			= ghash_finup,
 	.setkey			= ghash_setkey,
-	.descsize		= sizeof(struct ghash_desc_ctx),
+	.export			= ghash_export,
+	.import			= ghash_import,
+	.descsize		= sizeof(struct arm_ghash_desc_ctx),
+	.statesize		= sizeof(struct ghash_desc_ctx),
 
 	.base.cra_name		= "ghash",
 	.base.cra_driver_name	= "ghash-ce",
 	.base.cra_priority	= 300,
+	.base.cra_flags		= CRYPTO_AHASH_ALG_BLOCK_ONLY,
 	.base.cra_blocksize	= GHASH_BLOCK_SIZE,
 	.base.cra_ctxsize	= sizeof(struct ghash_key) + sizeof(u64[2]),
 	.base.cra_module	= THIS_MODULE,
@@ -317,9 +313,6 @@ static int gcm_encrypt(struct aead_request *req, const u8 *iv, u32 assoclen)
 	u8 *tag, *dst;
 	int tail, err;
 
-	if (WARN_ON_ONCE(!may_use_simd()))
-		return -EBUSY;
-
 	err = skcipher_walk_aead_encrypt(&walk, req, false);
 
 	kernel_neon_begin();
@@ -409,9 +402,6 @@ static int gcm_decrypt(struct aead_request *req, const u8 *iv, u32 assoclen)
 	u8 *tag, *dst;
 	int tail, err, ret;
 
-	if (WARN_ON_ONCE(!may_use_simd()))
-		return -EBUSY;
-
 	scatterwalk_map_and_copy(otag, req->src,
 				 req->assoclen + req->cryptlen - authsize,
 				 authsize, 0);
diff --git a/arch/arm/crypto/poly1305-glue.c b/arch/arm/crypto/poly1305-glue.c
deleted file mode 100644
index 4464ffbf8fd1..000000000000
--- a/arch/arm/crypto/poly1305-glue.c
+++ /dev/null
@@ -1,274 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * OpenSSL/Cryptogams accelerated Poly1305 transform for ARM
- *
- * Copyright (C) 2019 Linaro Ltd. <ard.biesheuvel@linaro.org>
- */
-
-#include <asm/hwcap.h>
-#include <asm/neon.h>
-#include <asm/simd.h>
-#include <linux/unaligned.h>
-#include <crypto/algapi.h>
-#include <crypto/internal/hash.h>
-#include <crypto/internal/poly1305.h>
-#include <crypto/internal/simd.h>
-#include <linux/cpufeature.h>
-#include <linux/crypto.h>
-#include <linux/jump_label.h>
-#include <linux/module.h>
-
-void poly1305_init_arm(void *state, const u8 *key);
-void poly1305_blocks_arm(void *state, const u8 *src, u32 len, u32 hibit);
-void poly1305_blocks_neon(void *state, const u8 *src, u32 len, u32 hibit);
-void poly1305_emit_arm(void *state, u8 *digest, const u32 *nonce);
-
-void __weak poly1305_blocks_neon(void *state, const u8 *src, u32 len, u32 hibit)
-{
-}
-
-static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon);
-
-void poly1305_init_arch(struct poly1305_desc_ctx *dctx, const u8 key[POLY1305_KEY_SIZE])
-{
-	poly1305_init_arm(&dctx->h, key);
-	dctx->s[0] = get_unaligned_le32(key + 16);
-	dctx->s[1] = get_unaligned_le32(key + 20);
-	dctx->s[2] = get_unaligned_le32(key + 24);
-	dctx->s[3] = get_unaligned_le32(key + 28);
-	dctx->buflen = 0;
-}
-EXPORT_SYMBOL(poly1305_init_arch);
-
-static int arm_poly1305_init(struct shash_desc *desc)
-{
-	struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
-
-	dctx->buflen = 0;
-	dctx->rset = 0;
-	dctx->sset = false;
-
-	return 0;
-}
-
-static void arm_poly1305_blocks(struct poly1305_desc_ctx *dctx, const u8 *src,
-				 u32 len, u32 hibit, bool do_neon)
-{
-	if (unlikely(!dctx->sset)) {
-		if (!dctx->rset) {
-			poly1305_init_arm(&dctx->h, src);
-			src += POLY1305_BLOCK_SIZE;
-			len -= POLY1305_BLOCK_SIZE;
-			dctx->rset = 1;
-		}
-		if (len >= POLY1305_BLOCK_SIZE) {
-			dctx->s[0] = get_unaligned_le32(src +  0);
-			dctx->s[1] = get_unaligned_le32(src +  4);
-			dctx->s[2] = get_unaligned_le32(src +  8);
-			dctx->s[3] = get_unaligned_le32(src + 12);
-			src += POLY1305_BLOCK_SIZE;
-			len -= POLY1305_BLOCK_SIZE;
-			dctx->sset = true;
-		}
-		if (len < POLY1305_BLOCK_SIZE)
-			return;
-	}
-
-	len &= ~(POLY1305_BLOCK_SIZE - 1);
-
-	if (static_branch_likely(&have_neon) && likely(do_neon))
-		poly1305_blocks_neon(&dctx->h, src, len, hibit);
-	else
-		poly1305_blocks_arm(&dctx->h, src, len, hibit);
-}
-
-static void arm_poly1305_do_update(struct poly1305_desc_ctx *dctx,
-				    const u8 *src, u32 len, bool do_neon)
-{
-	if (unlikely(dctx->buflen)) {
-		u32 bytes = min(len, POLY1305_BLOCK_SIZE - dctx->buflen);
-
-		memcpy(dctx->buf + dctx->buflen, src, bytes);
-		src += bytes;
-		len -= bytes;
-		dctx->buflen += bytes;
-
-		if (dctx->buflen == POLY1305_BLOCK_SIZE) {
-			arm_poly1305_blocks(dctx, dctx->buf,
-					    POLY1305_BLOCK_SIZE, 1, false);
-			dctx->buflen = 0;
-		}
-	}
-
-	if (likely(len >= POLY1305_BLOCK_SIZE)) {
-		arm_poly1305_blocks(dctx, src, len, 1, do_neon);
-		src += round_down(len, POLY1305_BLOCK_SIZE);
-		len %= POLY1305_BLOCK_SIZE;
-	}
-
-	if (unlikely(len)) {
-		dctx->buflen = len;
-		memcpy(dctx->buf, src, len);
-	}
-}
-
-static int arm_poly1305_update(struct shash_desc *desc,
-			       const u8 *src, unsigned int srclen)
-{
-	struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
-
-	arm_poly1305_do_update(dctx, src, srclen, false);
-	return 0;
-}
-
-static int __maybe_unused arm_poly1305_update_neon(struct shash_desc *desc,
-						   const u8 *src,
-						   unsigned int srclen)
-{
-	struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
-	bool do_neon = crypto_simd_usable() && srclen > 128;
-
-	if (static_branch_likely(&have_neon) && do_neon)
-		kernel_neon_begin();
-	arm_poly1305_do_update(dctx, src, srclen, do_neon);
-	if (static_branch_likely(&have_neon) && do_neon)
-		kernel_neon_end();
-	return 0;
-}
-
-void poly1305_update_arch(struct poly1305_desc_ctx *dctx, const u8 *src,
-			  unsigned int nbytes)
-{
-	bool do_neon = IS_ENABLED(CONFIG_KERNEL_MODE_NEON) &&
-		       crypto_simd_usable();
-
-	if (unlikely(dctx->buflen)) {
-		u32 bytes = min(nbytes, POLY1305_BLOCK_SIZE - dctx->buflen);
-
-		memcpy(dctx->buf + dctx->buflen, src, bytes);
-		src += bytes;
-		nbytes -= bytes;
-		dctx->buflen += bytes;
-
-		if (dctx->buflen == POLY1305_BLOCK_SIZE) {
-			poly1305_blocks_arm(&dctx->h, dctx->buf,
-					    POLY1305_BLOCK_SIZE, 1);
-			dctx->buflen = 0;
-		}
-	}
-
-	if (likely(nbytes >= POLY1305_BLOCK_SIZE)) {
-		unsigned int len = round_down(nbytes, POLY1305_BLOCK_SIZE);
-
-		if (static_branch_likely(&have_neon) && do_neon) {
-			do {
-				unsigned int todo = min_t(unsigned int, len, SZ_4K);
-
-				kernel_neon_begin();
-				poly1305_blocks_neon(&dctx->h, src, todo, 1);
-				kernel_neon_end();
-
-				len -= todo;
-				src += todo;
-			} while (len);
-		} else {
-			poly1305_blocks_arm(&dctx->h, src, len, 1);
-			src += len;
-		}
-		nbytes %= POLY1305_BLOCK_SIZE;
-	}
-
-	if (unlikely(nbytes)) {
-		dctx->buflen = nbytes;
-		memcpy(dctx->buf, src, nbytes);
-	}
-}
-EXPORT_SYMBOL(poly1305_update_arch);
-
-void poly1305_final_arch(struct poly1305_desc_ctx *dctx, u8 *dst)
-{
-	if (unlikely(dctx->buflen)) {
-		dctx->buf[dctx->buflen++] = 1;
-		memset(dctx->buf + dctx->buflen, 0,
-		       POLY1305_BLOCK_SIZE - dctx->buflen);
-		poly1305_blocks_arm(&dctx->h, dctx->buf, POLY1305_BLOCK_SIZE, 0);
-	}
-
-	poly1305_emit_arm(&dctx->h, dst, dctx->s);
-	*dctx = (struct poly1305_desc_ctx){};
-}
-EXPORT_SYMBOL(poly1305_final_arch);
-
-static int arm_poly1305_final(struct shash_desc *desc, u8 *dst)
-{
-	struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
-
-	if (unlikely(!dctx->sset))
-		return -ENOKEY;
-
-	poly1305_final_arch(dctx, dst);
-	return 0;
-}
-
-static struct shash_alg arm_poly1305_algs[] = {{
-	.init			= arm_poly1305_init,
-	.update			= arm_poly1305_update,
-	.final			= arm_poly1305_final,
-	.digestsize		= POLY1305_DIGEST_SIZE,
-	.descsize		= sizeof(struct poly1305_desc_ctx),
-
-	.base.cra_name		= "poly1305",
-	.base.cra_driver_name	= "poly1305-arm",
-	.base.cra_priority	= 150,
-	.base.cra_blocksize	= POLY1305_BLOCK_SIZE,
-	.base.cra_module	= THIS_MODULE,
-#ifdef CONFIG_KERNEL_MODE_NEON
-}, {
-	.init			= arm_poly1305_init,
-	.update			= arm_poly1305_update_neon,
-	.final			= arm_poly1305_final,
-	.digestsize		= POLY1305_DIGEST_SIZE,
-	.descsize		= sizeof(struct poly1305_desc_ctx),
-
-	.base.cra_name		= "poly1305",
-	.base.cra_driver_name	= "poly1305-neon",
-	.base.cra_priority	= 200,
-	.base.cra_blocksize	= POLY1305_BLOCK_SIZE,
-	.base.cra_module	= THIS_MODULE,
-#endif
-}};
-
-static int __init arm_poly1305_mod_init(void)
-{
-	if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) &&
-	    (elf_hwcap & HWCAP_NEON))
-		static_branch_enable(&have_neon);
-	else if (IS_REACHABLE(CONFIG_CRYPTO_HASH))
-		/* register only the first entry */
-		return crypto_register_shash(&arm_poly1305_algs[0]);
-
-	return IS_REACHABLE(CONFIG_CRYPTO_HASH) ?
-		crypto_register_shashes(arm_poly1305_algs,
-					ARRAY_SIZE(arm_poly1305_algs)) : 0;
-}
-
-static void __exit arm_poly1305_mod_exit(void)
-{
-	if (!IS_REACHABLE(CONFIG_CRYPTO_HASH))
-		return;
-	if (!static_branch_likely(&have_neon)) {
-		crypto_unregister_shash(&arm_poly1305_algs[0]);
-		return;
-	}
-	crypto_unregister_shashes(arm_poly1305_algs,
-				  ARRAY_SIZE(arm_poly1305_algs));
-}
-
-module_init(arm_poly1305_mod_init);
-module_exit(arm_poly1305_mod_exit);
-
-MODULE_DESCRIPTION("Accelerated Poly1305 transform for ARM");
-MODULE_LICENSE("GPL v2");
-MODULE_ALIAS_CRYPTO("poly1305");
-MODULE_ALIAS_CRYPTO("poly1305-arm");
-MODULE_ALIAS_CRYPTO("poly1305-neon");
diff --git a/arch/arm/crypto/sha1-ce-glue.c b/arch/arm/crypto/sha1-ce-glue.c
index de9100c67b37..fac07a4799de 100644
--- a/arch/arm/crypto/sha1-ce-glue.c
+++ b/arch/arm/crypto/sha1-ce-glue.c
@@ -5,20 +5,14 @@
  * Copyright (C) 2015 Linaro Ltd <ard.biesheuvel@linaro.org>
  */
 
+#include <asm/neon.h>
 #include <crypto/internal/hash.h>
-#include <crypto/internal/simd.h>
 #include <crypto/sha1.h>
 #include <crypto/sha1_base.h>
 #include <linux/cpufeature.h>
-#include <linux/crypto.h>
+#include <linux/kernel.h>
 #include <linux/module.h>
 
-#include <asm/hwcap.h>
-#include <asm/neon.h>
-#include <asm/simd.h>
-
-#include "sha1.h"
-
 MODULE_DESCRIPTION("SHA1 secure hash using ARMv8 Crypto Extensions");
 MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
 MODULE_LICENSE("GPL v2");
@@ -29,50 +23,36 @@ asmlinkage void sha1_ce_transform(struct sha1_state *sst, u8 const *src,
 static int sha1_ce_update(struct shash_desc *desc, const u8 *data,
 			  unsigned int len)
 {
-	struct sha1_state *sctx = shash_desc_ctx(desc);
-
-	if (!crypto_simd_usable() ||
-	    (sctx->count % SHA1_BLOCK_SIZE) + len < SHA1_BLOCK_SIZE)
-		return sha1_update_arm(desc, data, len);
+	int remain;
 
 	kernel_neon_begin();
-	sha1_base_do_update(desc, data, len, sha1_ce_transform);
+	remain = sha1_base_do_update_blocks(desc, data, len, sha1_ce_transform);
 	kernel_neon_end();
 
-	return 0;
+	return remain;
 }
 
 static int sha1_ce_finup(struct shash_desc *desc, const u8 *data,
 			 unsigned int len, u8 *out)
 {
-	if (!crypto_simd_usable())
-		return sha1_finup_arm(desc, data, len, out);
-
 	kernel_neon_begin();
-	if (len)
-		sha1_base_do_update(desc, data, len, sha1_ce_transform);
-	sha1_base_do_finalize(desc, sha1_ce_transform);
+	sha1_base_do_finup(desc, data, len, sha1_ce_transform);
 	kernel_neon_end();
 
 	return sha1_base_finish(desc, out);
 }
 
-static int sha1_ce_final(struct shash_desc *desc, u8 *out)
-{
-	return sha1_ce_finup(desc, NULL, 0, out);
-}
-
 static struct shash_alg alg = {
 	.init			= sha1_base_init,
 	.update			= sha1_ce_update,
-	.final			= sha1_ce_final,
 	.finup			= sha1_ce_finup,
-	.descsize		= sizeof(struct sha1_state),
+	.descsize		= SHA1_STATE_SIZE,
 	.digestsize		= SHA1_DIGEST_SIZE,
 	.base			= {
 		.cra_name		= "sha1",
 		.cra_driver_name	= "sha1-ce",
 		.cra_priority		= 200,
+		.cra_flags		= CRYPTO_AHASH_ALG_BLOCK_ONLY,
 		.cra_blocksize		= SHA1_BLOCK_SIZE,
 		.cra_module		= THIS_MODULE,
 	}
diff --git a/arch/arm/crypto/sha1.h b/arch/arm/crypto/sha1.h
deleted file mode 100644
index b1b7e21da2c3..000000000000
--- a/arch/arm/crypto/sha1.h
+++ /dev/null
@@ -1,14 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef ASM_ARM_CRYPTO_SHA1_H
-#define ASM_ARM_CRYPTO_SHA1_H
-
-#include <linux/crypto.h>
-#include <crypto/sha1.h>
-
-extern int sha1_update_arm(struct shash_desc *desc, const u8 *data,
-			   unsigned int len);
-
-extern int sha1_finup_arm(struct shash_desc *desc, const u8 *data,
-			   unsigned int len, u8 *out);
-
-#endif
diff --git a/arch/arm/crypto/sha1_glue.c b/arch/arm/crypto/sha1_glue.c
index 95a727bcd664..255da00c7d98 100644
--- a/arch/arm/crypto/sha1_glue.c
+++ b/arch/arm/crypto/sha1_glue.c
@@ -12,53 +12,42 @@
  */
 
 #include <crypto/internal/hash.h>
-#include <linux/init.h>
-#include <linux/module.h>
-#include <linux/types.h>
 #include <crypto/sha1.h>
 #include <crypto/sha1_base.h>
-#include <asm/byteorder.h>
-
-#include "sha1.h"
+#include <linux/kernel.h>
+#include <linux/module.h>
 
 asmlinkage void sha1_block_data_order(struct sha1_state *digest,
 		const u8 *data, int rounds);
 
-int sha1_update_arm(struct shash_desc *desc, const u8 *data,
-		    unsigned int len)
+static int sha1_update_arm(struct shash_desc *desc, const u8 *data,
+			   unsigned int len)
 {
 	/* make sure signature matches sha1_block_fn() */
 	BUILD_BUG_ON(offsetof(struct sha1_state, state) != 0);
 
-	return sha1_base_do_update(desc, data, len, sha1_block_data_order);
+	return sha1_base_do_update_blocks(desc, data, len,
+					  sha1_block_data_order);
 }
-EXPORT_SYMBOL_GPL(sha1_update_arm);
 
-static int sha1_final(struct shash_desc *desc, u8 *out)
+static int sha1_finup_arm(struct shash_desc *desc, const u8 *data,
+			  unsigned int len, u8 *out)
 {
-	sha1_base_do_finalize(desc, sha1_block_data_order);
+	sha1_base_do_finup(desc, data, len, sha1_block_data_order);
 	return sha1_base_finish(desc, out);
 }
 
-int sha1_finup_arm(struct shash_desc *desc, const u8 *data,
-		   unsigned int len, u8 *out)
-{
-	sha1_base_do_update(desc, data, len, sha1_block_data_order);
-	return sha1_final(desc, out);
-}
-EXPORT_SYMBOL_GPL(sha1_finup_arm);
-
 static struct shash_alg alg = {
 	.digestsize	=	SHA1_DIGEST_SIZE,
 	.init		=	sha1_base_init,
 	.update		=	sha1_update_arm,
-	.final		=	sha1_final,
 	.finup		=	sha1_finup_arm,
-	.descsize	=	sizeof(struct sha1_state),
+	.descsize	=	SHA1_STATE_SIZE,
 	.base		=	{
 		.cra_name	=	"sha1",
 		.cra_driver_name=	"sha1-asm",
 		.cra_priority	=	150,
+		.cra_flags	=	CRYPTO_AHASH_ALG_BLOCK_ONLY,
 		.cra_blocksize	=	SHA1_BLOCK_SIZE,
 		.cra_module	=	THIS_MODULE,
 	}
diff --git a/arch/arm/crypto/sha1_neon_glue.c b/arch/arm/crypto/sha1_neon_glue.c
index 9c70b87e69f7..d321850f22a6 100644
--- a/arch/arm/crypto/sha1_neon_glue.c
+++ b/arch/arm/crypto/sha1_neon_glue.c
@@ -13,18 +13,12 @@
  *  Copyright (c) Chandramouli Narayanan <mouli@linux.intel.com>
  */
 
+#include <asm/neon.h>
 #include <crypto/internal/hash.h>
-#include <crypto/internal/simd.h>
-#include <linux/init.h>
-#include <linux/module.h>
-#include <linux/mm.h>
-#include <linux/types.h>
 #include <crypto/sha1.h>
 #include <crypto/sha1_base.h>
-#include <asm/neon.h>
-#include <asm/simd.h>
-
-#include "sha1.h"
+#include <linux/kernel.h>
+#include <linux/module.h>
 
 asmlinkage void sha1_transform_neon(struct sha1_state *state_h,
 				    const u8 *data, int rounds);
@@ -32,50 +26,37 @@ asmlinkage void sha1_transform_neon(struct sha1_state *state_h,
 static int sha1_neon_update(struct shash_desc *desc, const u8 *data,
 			  unsigned int len)
 {
-	struct sha1_state *sctx = shash_desc_ctx(desc);
-
-	if (!crypto_simd_usable() ||
-	    (sctx->count % SHA1_BLOCK_SIZE) + len < SHA1_BLOCK_SIZE)
-		return sha1_update_arm(desc, data, len);
+	int remain;
 
 	kernel_neon_begin();
-	sha1_base_do_update(desc, data, len, sha1_transform_neon);
+	remain = sha1_base_do_update_blocks(desc, data, len,
+					    sha1_transform_neon);
 	kernel_neon_end();
 
-	return 0;
+	return remain;
 }
 
 static int sha1_neon_finup(struct shash_desc *desc, const u8 *data,
 			   unsigned int len, u8 *out)
 {
-	if (!crypto_simd_usable())
-		return sha1_finup_arm(desc, data, len, out);
-
 	kernel_neon_begin();
-	if (len)
-		sha1_base_do_update(desc, data, len, sha1_transform_neon);
-	sha1_base_do_finalize(desc, sha1_transform_neon);
+	sha1_base_do_finup(desc, data, len, sha1_transform_neon);
 	kernel_neon_end();
 
 	return sha1_base_finish(desc, out);
 }
 
-static int sha1_neon_final(struct shash_desc *desc, u8 *out)
-{
-	return sha1_neon_finup(desc, NULL, 0, out);
-}
-
 static struct shash_alg alg = {
 	.digestsize	=	SHA1_DIGEST_SIZE,
 	.init		=	sha1_base_init,
 	.update		=	sha1_neon_update,
-	.final		=	sha1_neon_final,
 	.finup		=	sha1_neon_finup,
-	.descsize	=	sizeof(struct sha1_state),
+	.descsize		= SHA1_STATE_SIZE,
 	.base		=	{
 		.cra_name		= "sha1",
 		.cra_driver_name	= "sha1-neon",
 		.cra_priority		= 250,
+		.cra_flags		= CRYPTO_AHASH_ALG_BLOCK_ONLY,
 		.cra_blocksize		= SHA1_BLOCK_SIZE,
 		.cra_module		= THIS_MODULE,
 	}
diff --git a/arch/arm/crypto/sha2-ce-glue.c b/arch/arm/crypto/sha2-ce-glue.c
deleted file mode 100644
index aeac45bfbf9f..000000000000
--- a/arch/arm/crypto/sha2-ce-glue.c
+++ /dev/null
@@ -1,109 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * sha2-ce-glue.c - SHA-224/SHA-256 using ARMv8 Crypto Extensions
- *
- * Copyright (C) 2015 Linaro Ltd <ard.biesheuvel@linaro.org>
- */
-
-#include <crypto/internal/hash.h>
-#include <crypto/internal/simd.h>
-#include <crypto/sha2.h>
-#include <crypto/sha256_base.h>
-#include <linux/cpufeature.h>
-#include <linux/crypto.h>
-#include <linux/module.h>
-
-#include <asm/hwcap.h>
-#include <asm/simd.h>
-#include <asm/neon.h>
-#include <linux/unaligned.h>
-
-#include "sha256_glue.h"
-
-MODULE_DESCRIPTION("SHA-224/SHA-256 secure hash using ARMv8 Crypto Extensions");
-MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
-MODULE_LICENSE("GPL v2");
-
-asmlinkage void sha2_ce_transform(struct sha256_state *sst, u8 const *src,
-				  int blocks);
-
-static int sha2_ce_update(struct shash_desc *desc, const u8 *data,
-			  unsigned int len)
-{
-	struct sha256_state *sctx = shash_desc_ctx(desc);
-
-	if (!crypto_simd_usable() ||
-	    (sctx->count % SHA256_BLOCK_SIZE) + len < SHA256_BLOCK_SIZE)
-		return crypto_sha256_arm_update(desc, data, len);
-
-	kernel_neon_begin();
-	sha256_base_do_update(desc, data, len,
-			      (sha256_block_fn *)sha2_ce_transform);
-	kernel_neon_end();
-
-	return 0;
-}
-
-static int sha2_ce_finup(struct shash_desc *desc, const u8 *data,
-			 unsigned int len, u8 *out)
-{
-	if (!crypto_simd_usable())
-		return crypto_sha256_arm_finup(desc, data, len, out);
-
-	kernel_neon_begin();
-	if (len)
-		sha256_base_do_update(desc, data, len,
-				      (sha256_block_fn *)sha2_ce_transform);
-	sha256_base_do_finalize(desc, (sha256_block_fn *)sha2_ce_transform);
-	kernel_neon_end();
-
-	return sha256_base_finish(desc, out);
-}
-
-static int sha2_ce_final(struct shash_desc *desc, u8 *out)
-{
-	return sha2_ce_finup(desc, NULL, 0, out);
-}
-
-static struct shash_alg algs[] = { {
-	.init			= sha224_base_init,
-	.update			= sha2_ce_update,
-	.final			= sha2_ce_final,
-	.finup			= sha2_ce_finup,
-	.descsize		= sizeof(struct sha256_state),
-	.digestsize		= SHA224_DIGEST_SIZE,
-	.base			= {
-		.cra_name		= "sha224",
-		.cra_driver_name	= "sha224-ce",
-		.cra_priority		= 300,
-		.cra_blocksize		= SHA256_BLOCK_SIZE,
-		.cra_module		= THIS_MODULE,
-	}
-}, {
-	.init			= sha256_base_init,
-	.update			= sha2_ce_update,
-	.final			= sha2_ce_final,
-	.finup			= sha2_ce_finup,
-	.descsize		= sizeof(struct sha256_state),
-	.digestsize		= SHA256_DIGEST_SIZE,
-	.base			= {
-		.cra_name		= "sha256",
-		.cra_driver_name	= "sha256-ce",
-		.cra_priority		= 300,
-		.cra_blocksize		= SHA256_BLOCK_SIZE,
-		.cra_module		= THIS_MODULE,
-	}
-} };
-
-static int __init sha2_ce_mod_init(void)
-{
-	return crypto_register_shashes(algs, ARRAY_SIZE(algs));
-}
-
-static void __exit sha2_ce_mod_fini(void)
-{
-	crypto_unregister_shashes(algs, ARRAY_SIZE(algs));
-}
-
-module_cpu_feature_match(SHA2, sha2_ce_mod_init);
-module_exit(sha2_ce_mod_fini);
diff --git a/arch/arm/crypto/sha256_glue.c b/arch/arm/crypto/sha256_glue.c
deleted file mode 100644
index f85933fdec75..000000000000
--- a/arch/arm/crypto/sha256_glue.c
+++ /dev/null
@@ -1,117 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * Glue code for the SHA256 Secure Hash Algorithm assembly implementation
- * using optimized ARM assembler and NEON instructions.
- *
- * Copyright © 2015 Google Inc.
- *
- * This file is based on sha256_ssse3_glue.c:
- *   Copyright (C) 2013 Intel Corporation
- *   Author: Tim Chen <tim.c.chen@linux.intel.com>
- */
-
-#include <crypto/internal/hash.h>
-#include <linux/crypto.h>
-#include <linux/init.h>
-#include <linux/module.h>
-#include <linux/mm.h>
-#include <linux/types.h>
-#include <linux/string.h>
-#include <crypto/sha2.h>
-#include <crypto/sha256_base.h>
-#include <asm/simd.h>
-#include <asm/neon.h>
-
-#include "sha256_glue.h"
-
-asmlinkage void sha256_block_data_order(struct sha256_state *state,
-					const u8 *data, int num_blks);
-
-int crypto_sha256_arm_update(struct shash_desc *desc, const u8 *data,
-			     unsigned int len)
-{
-	/* make sure casting to sha256_block_fn() is safe */
-	BUILD_BUG_ON(offsetof(struct sha256_state, state) != 0);
-
-	return sha256_base_do_update(desc, data, len, sha256_block_data_order);
-}
-EXPORT_SYMBOL(crypto_sha256_arm_update);
-
-static int crypto_sha256_arm_final(struct shash_desc *desc, u8 *out)
-{
-	sha256_base_do_finalize(desc, sha256_block_data_order);
-	return sha256_base_finish(desc, out);
-}
-
-int crypto_sha256_arm_finup(struct shash_desc *desc, const u8 *data,
-			    unsigned int len, u8 *out)
-{
-	sha256_base_do_update(desc, data, len, sha256_block_data_order);
-	return crypto_sha256_arm_final(desc, out);
-}
-EXPORT_SYMBOL(crypto_sha256_arm_finup);
-
-static struct shash_alg algs[] = { {
-	.digestsize	=	SHA256_DIGEST_SIZE,
-	.init		=	sha256_base_init,
-	.update		=	crypto_sha256_arm_update,
-	.final		=	crypto_sha256_arm_final,
-	.finup		=	crypto_sha256_arm_finup,
-	.descsize	=	sizeof(struct sha256_state),
-	.base		=	{
-		.cra_name	=	"sha256",
-		.cra_driver_name =	"sha256-asm",
-		.cra_priority	=	150,
-		.cra_blocksize	=	SHA256_BLOCK_SIZE,
-		.cra_module	=	THIS_MODULE,
-	}
-}, {
-	.digestsize	=	SHA224_DIGEST_SIZE,
-	.init		=	sha224_base_init,
-	.update		=	crypto_sha256_arm_update,
-	.final		=	crypto_sha256_arm_final,
-	.finup		=	crypto_sha256_arm_finup,
-	.descsize	=	sizeof(struct sha256_state),
-	.base		=	{
-		.cra_name	=	"sha224",
-		.cra_driver_name =	"sha224-asm",
-		.cra_priority	=	150,
-		.cra_blocksize	=	SHA224_BLOCK_SIZE,
-		.cra_module	=	THIS_MODULE,
-	}
-} };
-
-static int __init sha256_mod_init(void)
-{
-	int res = crypto_register_shashes(algs, ARRAY_SIZE(algs));
-
-	if (res < 0)
-		return res;
-
-	if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && cpu_has_neon()) {
-		res = crypto_register_shashes(sha256_neon_algs,
-					      ARRAY_SIZE(sha256_neon_algs));
-
-		if (res < 0)
-			crypto_unregister_shashes(algs, ARRAY_SIZE(algs));
-	}
-
-	return res;
-}
-
-static void __exit sha256_mod_fini(void)
-{
-	crypto_unregister_shashes(algs, ARRAY_SIZE(algs));
-
-	if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && cpu_has_neon())
-		crypto_unregister_shashes(sha256_neon_algs,
-					  ARRAY_SIZE(sha256_neon_algs));
-}
-
-module_init(sha256_mod_init);
-module_exit(sha256_mod_fini);
-
-MODULE_LICENSE("GPL");
-MODULE_DESCRIPTION("SHA256 Secure Hash Algorithm (ARM), including NEON");
-
-MODULE_ALIAS_CRYPTO("sha256");
diff --git a/arch/arm/crypto/sha256_glue.h b/arch/arm/crypto/sha256_glue.h
deleted file mode 100644
index 9f0d578bab5f..000000000000
--- a/arch/arm/crypto/sha256_glue.h
+++ /dev/null
@@ -1,15 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _CRYPTO_SHA256_GLUE_H
-#define _CRYPTO_SHA256_GLUE_H
-
-#include <linux/crypto.h>
-
-extern struct shash_alg sha256_neon_algs[2];
-
-int crypto_sha256_arm_update(struct shash_desc *desc, const u8 *data,
-			     unsigned int len);
-
-int crypto_sha256_arm_finup(struct shash_desc *desc, const u8 *data,
-			    unsigned int len, u8 *hash);
-
-#endif /* _CRYPTO_SHA256_GLUE_H */
diff --git a/arch/arm/crypto/sha256_neon_glue.c b/arch/arm/crypto/sha256_neon_glue.c
deleted file mode 100644
index ccdcfff71910..000000000000
--- a/arch/arm/crypto/sha256_neon_glue.c
+++ /dev/null
@@ -1,92 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * Glue code for the SHA256 Secure Hash Algorithm assembly implementation
- * using NEON instructions.
- *
- * Copyright © 2015 Google Inc.
- *
- * This file is based on sha512_neon_glue.c:
- *   Copyright © 2014 Jussi Kivilinna <jussi.kivilinna@iki.fi>
- */
-
-#include <crypto/internal/hash.h>
-#include <crypto/internal/simd.h>
-#include <linux/types.h>
-#include <linux/string.h>
-#include <crypto/sha2.h>
-#include <crypto/sha256_base.h>
-#include <asm/byteorder.h>
-#include <asm/simd.h>
-#include <asm/neon.h>
-
-#include "sha256_glue.h"
-
-asmlinkage void sha256_block_data_order_neon(struct sha256_state *digest,
-					     const u8 *data, int num_blks);
-
-static int crypto_sha256_neon_update(struct shash_desc *desc, const u8 *data,
-				     unsigned int len)
-{
-	struct sha256_state *sctx = shash_desc_ctx(desc);
-
-	if (!crypto_simd_usable() ||
-	    (sctx->count % SHA256_BLOCK_SIZE) + len < SHA256_BLOCK_SIZE)
-		return crypto_sha256_arm_update(desc, data, len);
-
-	kernel_neon_begin();
-	sha256_base_do_update(desc, data, len, sha256_block_data_order_neon);
-	kernel_neon_end();
-
-	return 0;
-}
-
-static int crypto_sha256_neon_finup(struct shash_desc *desc, const u8 *data,
-				    unsigned int len, u8 *out)
-{
-	if (!crypto_simd_usable())
-		return crypto_sha256_arm_finup(desc, data, len, out);
-
-	kernel_neon_begin();
-	if (len)
-		sha256_base_do_update(desc, data, len,
-				      sha256_block_data_order_neon);
-	sha256_base_do_finalize(desc, sha256_block_data_order_neon);
-	kernel_neon_end();
-
-	return sha256_base_finish(desc, out);
-}
-
-static int crypto_sha256_neon_final(struct shash_desc *desc, u8 *out)
-{
-	return crypto_sha256_neon_finup(desc, NULL, 0, out);
-}
-
-struct shash_alg sha256_neon_algs[] = { {
-	.digestsize	=	SHA256_DIGEST_SIZE,
-	.init		=	sha256_base_init,
-	.update		=	crypto_sha256_neon_update,
-	.final		=	crypto_sha256_neon_final,
-	.finup		=	crypto_sha256_neon_finup,
-	.descsize	=	sizeof(struct sha256_state),
-	.base		=	{
-		.cra_name	=	"sha256",
-		.cra_driver_name =	"sha256-neon",
-		.cra_priority	=	250,
-		.cra_blocksize	=	SHA256_BLOCK_SIZE,
-		.cra_module	=	THIS_MODULE,
-	}
-}, {
-	.digestsize	=	SHA224_DIGEST_SIZE,
-	.init		=	sha224_base_init,
-	.update		=	crypto_sha256_neon_update,
-	.final		=	crypto_sha256_neon_final,
-	.finup		=	crypto_sha256_neon_finup,
-	.descsize	=	sizeof(struct sha256_state),
-	.base		=	{
-		.cra_name	=	"sha224",
-		.cra_driver_name =	"sha224-neon",
-		.cra_priority	=	250,
-		.cra_blocksize	=	SHA224_BLOCK_SIZE,
-		.cra_module	=	THIS_MODULE,
-	}
-} };
diff --git a/arch/arm/crypto/sha512-glue.c b/arch/arm/crypto/sha512-glue.c
index 1be5bd498af3..f8a6480889b1 100644
--- a/arch/arm/crypto/sha512-glue.c
+++ b/arch/arm/crypto/sha512-glue.c
@@ -5,15 +5,14 @@
  * Copyright (C) 2015 Linaro Ltd <ard.biesheuvel@linaro.org>
  */
 
+#include <asm/hwcap.h>
+#include <asm/neon.h>
 #include <crypto/internal/hash.h>
 #include <crypto/sha2.h>
 #include <crypto/sha512_base.h>
-#include <linux/crypto.h>
+#include <linux/kernel.h>
 #include <linux/module.h>
 
-#include <asm/hwcap.h>
-#include <asm/neon.h>
-
 #include "sha512.h"
 
 MODULE_DESCRIPTION("Accelerated SHA-384/SHA-512 secure hash for ARM");
@@ -28,50 +27,47 @@ MODULE_ALIAS_CRYPTO("sha512-arm");
 asmlinkage void sha512_block_data_order(struct sha512_state *state,
 					u8 const *src, int blocks);
 
-int sha512_arm_update(struct shash_desc *desc, const u8 *data,
-		      unsigned int len)
+static int sha512_arm_update(struct shash_desc *desc, const u8 *data,
+			     unsigned int len)
 {
-	return sha512_base_do_update(desc, data, len, sha512_block_data_order);
+	return sha512_base_do_update_blocks(desc, data, len,
+					    sha512_block_data_order);
 }
 
-static int sha512_arm_final(struct shash_desc *desc, u8 *out)
+static int sha512_arm_finup(struct shash_desc *desc, const u8 *data,
+			    unsigned int len, u8 *out)
 {
-	sha512_base_do_finalize(desc, sha512_block_data_order);
+	sha512_base_do_finup(desc, data, len, sha512_block_data_order);
 	return sha512_base_finish(desc, out);
 }
 
-int sha512_arm_finup(struct shash_desc *desc, const u8 *data,
-		     unsigned int len, u8 *out)
-{
-	sha512_base_do_update(desc, data, len, sha512_block_data_order);
-	return sha512_arm_final(desc, out);
-}
-
 static struct shash_alg sha512_arm_algs[] = { {
 	.init			= sha384_base_init,
 	.update			= sha512_arm_update,
-	.final			= sha512_arm_final,
 	.finup			= sha512_arm_finup,
-	.descsize		= sizeof(struct sha512_state),
+	.descsize		= SHA512_STATE_SIZE,
 	.digestsize		= SHA384_DIGEST_SIZE,
 	.base			= {
 		.cra_name		= "sha384",
 		.cra_driver_name	= "sha384-arm",
 		.cra_priority		= 250,
+		.cra_flags		= CRYPTO_AHASH_ALG_BLOCK_ONLY |
+					  CRYPTO_AHASH_ALG_FINUP_MAX,
 		.cra_blocksize		= SHA512_BLOCK_SIZE,
 		.cra_module		= THIS_MODULE,
 	}
 },  {
 	.init			= sha512_base_init,
 	.update			= sha512_arm_update,
-	.final			= sha512_arm_final,
 	.finup			= sha512_arm_finup,
-	.descsize		= sizeof(struct sha512_state),
+	.descsize		= SHA512_STATE_SIZE,
 	.digestsize		= SHA512_DIGEST_SIZE,
 	.base			= {
 		.cra_name		= "sha512",
 		.cra_driver_name	= "sha512-arm",
 		.cra_priority		= 250,
+		.cra_flags		= CRYPTO_AHASH_ALG_BLOCK_ONLY |
+					  CRYPTO_AHASH_ALG_FINUP_MAX,
 		.cra_blocksize		= SHA512_BLOCK_SIZE,
 		.cra_module		= THIS_MODULE,
 	}
diff --git a/arch/arm/crypto/sha512-neon-glue.c b/arch/arm/crypto/sha512-neon-glue.c
index c6e58fe475ac..bd528077fefb 100644
--- a/arch/arm/crypto/sha512-neon-glue.c
+++ b/arch/arm/crypto/sha512-neon-glue.c
@@ -5,16 +5,13 @@
  * Copyright (C) 2015 Linaro Ltd <ard.biesheuvel@linaro.org>
  */
 
+#include <asm/neon.h>
 #include <crypto/internal/hash.h>
-#include <crypto/internal/simd.h>
 #include <crypto/sha2.h>
 #include <crypto/sha512_base.h>
-#include <linux/crypto.h>
+#include <linux/kernel.h>
 #include <linux/module.h>
 
-#include <asm/simd.h>
-#include <asm/neon.h>
-
 #include "sha512.h"
 
 MODULE_ALIAS_CRYPTO("sha384-neon");
@@ -26,51 +23,36 @@ asmlinkage void sha512_block_data_order_neon(struct sha512_state *state,
 static int sha512_neon_update(struct shash_desc *desc, const u8 *data,
 			      unsigned int len)
 {
-	struct sha512_state *sctx = shash_desc_ctx(desc);
-
-	if (!crypto_simd_usable() ||
-	    (sctx->count[0] % SHA512_BLOCK_SIZE) + len < SHA512_BLOCK_SIZE)
-		return sha512_arm_update(desc, data, len);
+	int remain;
 
 	kernel_neon_begin();
-	sha512_base_do_update(desc, data, len, sha512_block_data_order_neon);
+	remain = sha512_base_do_update_blocks(desc, data, len,
+					      sha512_block_data_order_neon);
 	kernel_neon_end();
-
-	return 0;
+	return remain;
 }
 
 static int sha512_neon_finup(struct shash_desc *desc, const u8 *data,
 			     unsigned int len, u8 *out)
 {
-	if (!crypto_simd_usable())
-		return sha512_arm_finup(desc, data, len, out);
-
 	kernel_neon_begin();
-	if (len)
-		sha512_base_do_update(desc, data, len,
-				      sha512_block_data_order_neon);
-	sha512_base_do_finalize(desc, sha512_block_data_order_neon);
+	sha512_base_do_finup(desc, data, len, sha512_block_data_order_neon);
 	kernel_neon_end();
-
 	return sha512_base_finish(desc, out);
 }
 
-static int sha512_neon_final(struct shash_desc *desc, u8 *out)
-{
-	return sha512_neon_finup(desc, NULL, 0, out);
-}
-
 struct shash_alg sha512_neon_algs[] = { {
 	.init			= sha384_base_init,
 	.update			= sha512_neon_update,
-	.final			= sha512_neon_final,
 	.finup			= sha512_neon_finup,
-	.descsize		= sizeof(struct sha512_state),
+	.descsize		= SHA512_STATE_SIZE,
 	.digestsize		= SHA384_DIGEST_SIZE,
 	.base			= {
 		.cra_name		= "sha384",
 		.cra_driver_name	= "sha384-neon",
 		.cra_priority		= 300,
+		.cra_flags		= CRYPTO_AHASH_ALG_BLOCK_ONLY |
+					  CRYPTO_AHASH_ALG_FINUP_MAX,
 		.cra_blocksize		= SHA384_BLOCK_SIZE,
 		.cra_module		= THIS_MODULE,
 
@@ -78,14 +60,15 @@ struct shash_alg sha512_neon_algs[] = { {
 },  {
 	.init			= sha512_base_init,
 	.update			= sha512_neon_update,
-	.final			= sha512_neon_final,
 	.finup			= sha512_neon_finup,
-	.descsize		= sizeof(struct sha512_state),
+	.descsize		= SHA512_STATE_SIZE,
 	.digestsize		= SHA512_DIGEST_SIZE,
 	.base			= {
 		.cra_name		= "sha512",
 		.cra_driver_name	= "sha512-neon",
 		.cra_priority		= 300,
+		.cra_flags		= CRYPTO_AHASH_ALG_BLOCK_ONLY |
+					  CRYPTO_AHASH_ALG_FINUP_MAX,
 		.cra_blocksize		= SHA512_BLOCK_SIZE,
 		.cra_module		= THIS_MODULE,
 	}
diff --git a/arch/arm/crypto/sha512.h b/arch/arm/crypto/sha512.h
index e14572be76d1..eeaee52cda69 100644
--- a/arch/arm/crypto/sha512.h
+++ b/arch/arm/crypto/sha512.h
@@ -1,9 +1,3 @@
 /* SPDX-License-Identifier: GPL-2.0 */
 
-int sha512_arm_update(struct shash_desc *desc, const u8 *data,
-		      unsigned int len);
-
-int sha512_arm_finup(struct shash_desc *desc, const u8 *data,
-		     unsigned int len, u8 *out);
-
 extern struct shash_alg sha512_neon_algs[2];
diff --git a/arch/arm/include/asm/simd.h b/arch/arm/include/asm/simd.h
index 82191dbd7e78..d37559762180 100644
--- a/arch/arm/include/asm/simd.h
+++ b/arch/arm/include/asm/simd.h
@@ -1,8 +1,14 @@
 /* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_SIMD_H
+#define _ASM_SIMD_H
 
-#include <linux/hardirq.h>
+#include <linux/compiler_attributes.h>
+#include <linux/preempt.h>
+#include <linux/types.h>
 
 static __must_check inline bool may_use_simd(void)
 {
 	return IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && !in_hardirq();
 }
+
+#endif	/* _ASM_SIMD_H */
diff --git a/arch/arm/lib/Makefile b/arch/arm/lib/Makefile
index 007874320937..91ea0e29107a 100644
--- a/arch/arm/lib/Makefile
+++ b/arch/arm/lib/Makefile
@@ -5,6 +5,8 @@
 # Copyright (C) 1995-2000 Russell King
 #
 
+obj-y += crypto/
+
 lib-y		:= changebit.o csumipv6.o csumpartial.o               \
 		   csumpartialcopy.o csumpartialcopyuser.o clearbit.o \
 		   delay.o delay-loop.o findbit.o memchr.o memcpy.o   \
@@ -47,7 +49,7 @@ endif
 obj-$(CONFIG_FUNCTION_ERROR_INJECTION) += error-inject.o
 
 obj-$(CONFIG_CRC32_ARCH) += crc32-arm.o
-crc32-arm-y := crc32-glue.o crc32-core.o
+crc32-arm-y := crc32.o crc32-core.o
 
 obj-$(CONFIG_CRC_T10DIF_ARCH) += crc-t10dif-arm.o
-crc-t10dif-arm-y := crc-t10dif-glue.o crc-t10dif-core.o
+crc-t10dif-arm-y := crc-t10dif.o crc-t10dif-core.o
diff --git a/arch/arm/lib/crc-t10dif-glue.c b/arch/arm/lib/crc-t10dif.c
index 6efad3d78284..1093f8ec13b0 100644
--- a/arch/arm/lib/crc-t10dif-glue.c
+++ b/arch/arm/lib/crc-t10dif.c
@@ -16,8 +16,8 @@
 #include <asm/neon.h>
 #include <asm/simd.h>
 
-static DEFINE_STATIC_KEY_FALSE(have_neon);
-static DEFINE_STATIC_KEY_FALSE(have_pmull);
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon);
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_pmull);
 
 #define CRC_T10DIF_PMULL_CHUNK_SIZE	16U
 
@@ -60,7 +60,7 @@ static int __init crc_t10dif_arm_init(void)
 	}
 	return 0;
 }
-arch_initcall(crc_t10dif_arm_init);
+subsys_initcall(crc_t10dif_arm_init);
 
 static void __exit crc_t10dif_arm_exit(void)
 {
diff --git a/arch/arm/lib/crc32-glue.c b/arch/arm/lib/crc32.c
index 4340351dbde8..f2bef8849c7c 100644
--- a/arch/arm/lib/crc32-glue.c
+++ b/arch/arm/lib/crc32.c
@@ -18,8 +18,8 @@
 #include <asm/neon.h>
 #include <asm/simd.h>
 
-static DEFINE_STATIC_KEY_FALSE(have_crc32);
-static DEFINE_STATIC_KEY_FALSE(have_pmull);
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_crc32);
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_pmull);
 
 #define PMULL_MIN_LEN	64	/* min size of buffer for pmull functions */
 
@@ -103,7 +103,7 @@ static int __init crc32_arm_init(void)
 		static_branch_enable(&have_pmull);
 	return 0;
 }
-arch_initcall(crc32_arm_init);
+subsys_initcall(crc32_arm_init);
 
 static void __exit crc32_arm_exit(void)
 {
diff --git a/arch/arm/lib/crypto/.gitignore b/arch/arm/lib/crypto/.gitignore
new file mode 100644
index 000000000000..12d74d8b03d0
--- /dev/null
+++ b/arch/arm/lib/crypto/.gitignore
@@ -0,0 +1,3 @@
+# SPDX-License-Identifier: GPL-2.0-only
+poly1305-core.S
+sha256-core.S
diff --git a/arch/arm/lib/crypto/Kconfig b/arch/arm/lib/crypto/Kconfig
new file mode 100644
index 000000000000..d1ad664f0c67
--- /dev/null
+++ b/arch/arm/lib/crypto/Kconfig
@@ -0,0 +1,31 @@
+# SPDX-License-Identifier: GPL-2.0-only
+
+config CRYPTO_BLAKE2S_ARM
+	bool "Hash functions: BLAKE2s"
+	select CRYPTO_ARCH_HAVE_LIB_BLAKE2S
+	help
+	  BLAKE2s cryptographic hash function (RFC 7693)
+
+	  Architecture: arm
+
+	  This is faster than the generic implementations of BLAKE2s and
+	  BLAKE2b, but slower than the NEON implementation of BLAKE2b.
+	  There is no NEON implementation of BLAKE2s, since NEON doesn't
+	  really help with it.
+
+config CRYPTO_CHACHA20_NEON
+	tristate
+	default CRYPTO_LIB_CHACHA
+	select CRYPTO_ARCH_HAVE_LIB_CHACHA
+
+config CRYPTO_POLY1305_ARM
+	tristate
+	default CRYPTO_LIB_POLY1305
+	select CRYPTO_ARCH_HAVE_LIB_POLY1305
+
+config CRYPTO_SHA256_ARM
+	tristate
+	depends on !CPU_V7M
+	default CRYPTO_LIB_SHA256
+	select CRYPTO_ARCH_HAVE_LIB_SHA256
+	select CRYPTO_ARCH_HAVE_LIB_SHA256_SIMD
diff --git a/arch/arm/lib/crypto/Makefile b/arch/arm/lib/crypto/Makefile
new file mode 100644
index 000000000000..431f77c3ff6f
--- /dev/null
+++ b/arch/arm/lib/crypto/Makefile
@@ -0,0 +1,32 @@
+# SPDX-License-Identifier: GPL-2.0-only
+
+obj-$(CONFIG_CRYPTO_BLAKE2S_ARM) += libblake2s-arm.o
+libblake2s-arm-y := blake2s-core.o blake2s-glue.o
+
+obj-$(CONFIG_CRYPTO_CHACHA20_NEON) += chacha-neon.o
+chacha-neon-y := chacha-scalar-core.o chacha-glue.o
+chacha-neon-$(CONFIG_KERNEL_MODE_NEON) += chacha-neon-core.o
+
+obj-$(CONFIG_CRYPTO_POLY1305_ARM) += poly1305-arm.o
+poly1305-arm-y := poly1305-core.o poly1305-glue.o
+
+obj-$(CONFIG_CRYPTO_SHA256_ARM) += sha256-arm.o
+sha256-arm-y := sha256.o sha256-core.o
+sha256-arm-$(CONFIG_KERNEL_MODE_NEON) += sha256-ce.o
+
+quiet_cmd_perl = PERL    $@
+      cmd_perl = $(PERL) $(<) > $(@)
+
+$(obj)/%-core.S: $(src)/%-armv4.pl
+	$(call cmd,perl)
+
+clean-files += poly1305-core.S sha256-core.S
+
+aflags-thumb2-$(CONFIG_THUMB2_KERNEL)  := -U__thumb2__ -D__thumb2__=1
+
+# massage the perlasm code a bit so we only get the NEON routine if we need it
+poly1305-aflags-$(CONFIG_CPU_V7) := -U__LINUX_ARM_ARCH__ -D__LINUX_ARM_ARCH__=5
+poly1305-aflags-$(CONFIG_KERNEL_MODE_NEON) := -U__LINUX_ARM_ARCH__ -D__LINUX_ARM_ARCH__=7
+AFLAGS_poly1305-core.o += $(poly1305-aflags-y) $(aflags-thumb2-y)
+
+AFLAGS_sha256-core.o += $(aflags-thumb2-y)
diff --git a/arch/arm/crypto/blake2s-core.S b/arch/arm/lib/crypto/blake2s-core.S
index df40e46601f1..df40e46601f1 100644
--- a/arch/arm/crypto/blake2s-core.S
+++ b/arch/arm/lib/crypto/blake2s-core.S
diff --git a/arch/arm/crypto/blake2s-glue.c b/arch/arm/lib/crypto/blake2s-glue.c
index 0238a70d9581..0238a70d9581 100644
--- a/arch/arm/crypto/blake2s-glue.c
+++ b/arch/arm/lib/crypto/blake2s-glue.c
diff --git a/arch/arm/lib/crypto/chacha-glue.c b/arch/arm/lib/crypto/chacha-glue.c
new file mode 100644
index 000000000000..88ec96415283
--- /dev/null
+++ b/arch/arm/lib/crypto/chacha-glue.c
@@ -0,0 +1,138 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * ChaCha and HChaCha functions (ARM optimized)
+ *
+ * Copyright (C) 2016-2019 Linaro, Ltd. <ard.biesheuvel@linaro.org>
+ * Copyright (C) 2015 Martin Willi
+ */
+
+#include <crypto/chacha.h>
+#include <crypto/internal/simd.h>
+#include <linux/jump_label.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+
+#include <asm/cputype.h>
+#include <asm/hwcap.h>
+#include <asm/neon.h>
+#include <asm/simd.h>
+
+asmlinkage void chacha_block_xor_neon(const struct chacha_state *state,
+				      u8 *dst, const u8 *src, int nrounds);
+asmlinkage void chacha_4block_xor_neon(const struct chacha_state *state,
+				       u8 *dst, const u8 *src,
+				       int nrounds, unsigned int nbytes);
+asmlinkage void hchacha_block_arm(const struct chacha_state *state,
+				  u32 out[HCHACHA_OUT_WORDS], int nrounds);
+asmlinkage void hchacha_block_neon(const struct chacha_state *state,
+				   u32 out[HCHACHA_OUT_WORDS], int nrounds);
+
+asmlinkage void chacha_doarm(u8 *dst, const u8 *src, unsigned int bytes,
+			     const struct chacha_state *state, int nrounds);
+
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(use_neon);
+
+static inline bool neon_usable(void)
+{
+	return static_branch_likely(&use_neon) && crypto_simd_usable();
+}
+
+static void chacha_doneon(struct chacha_state *state, u8 *dst, const u8 *src,
+			  unsigned int bytes, int nrounds)
+{
+	u8 buf[CHACHA_BLOCK_SIZE];
+
+	while (bytes > CHACHA_BLOCK_SIZE) {
+		unsigned int l = min(bytes, CHACHA_BLOCK_SIZE * 4U);
+
+		chacha_4block_xor_neon(state, dst, src, nrounds, l);
+		bytes -= l;
+		src += l;
+		dst += l;
+		state->x[12] += DIV_ROUND_UP(l, CHACHA_BLOCK_SIZE);
+	}
+	if (bytes) {
+		const u8 *s = src;
+		u8 *d = dst;
+
+		if (bytes != CHACHA_BLOCK_SIZE)
+			s = d = memcpy(buf, src, bytes);
+		chacha_block_xor_neon(state, d, s, nrounds);
+		if (d != dst)
+			memcpy(dst, buf, bytes);
+		state->x[12]++;
+	}
+}
+
+void hchacha_block_arch(const struct chacha_state *state,
+			u32 out[HCHACHA_OUT_WORDS], int nrounds)
+{
+	if (!IS_ENABLED(CONFIG_KERNEL_MODE_NEON) || !neon_usable()) {
+		hchacha_block_arm(state, out, nrounds);
+	} else {
+		kernel_neon_begin();
+		hchacha_block_neon(state, out, nrounds);
+		kernel_neon_end();
+	}
+}
+EXPORT_SYMBOL(hchacha_block_arch);
+
+void chacha_crypt_arch(struct chacha_state *state, u8 *dst, const u8 *src,
+		       unsigned int bytes, int nrounds)
+{
+	if (!IS_ENABLED(CONFIG_KERNEL_MODE_NEON) || !neon_usable() ||
+	    bytes <= CHACHA_BLOCK_SIZE) {
+		chacha_doarm(dst, src, bytes, state, nrounds);
+		state->x[12] += DIV_ROUND_UP(bytes, CHACHA_BLOCK_SIZE);
+		return;
+	}
+	/* NEON: at most SZ_4K bytes per kernel_neon_begin()/end() section. */
+	do {
+		unsigned int todo = min_t(unsigned int, bytes, SZ_4K);
+
+		kernel_neon_begin();
+		chacha_doneon(state, dst, src, todo, nrounds);
+		kernel_neon_end();
+
+		bytes -= todo;
+		src += todo;
+		dst += todo;
+	} while (bytes);
+}
+EXPORT_SYMBOL(chacha_crypt_arch);
+
+bool chacha_is_arch_optimized(void)
+{
+	/* We always can use at least the ARM scalar implementation. */
+	return true;
+}
+EXPORT_SYMBOL(chacha_is_arch_optimized);
+
+static int __init chacha_arm_mod_init(void)
+{
+	if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && (elf_hwcap & HWCAP_NEON)) {
+		switch (read_cpuid_part()) {
+		case ARM_CPU_PART_CORTEX_A7:
+		case ARM_CPU_PART_CORTEX_A5:
+			/*
+			 * Cortex-A7 and Cortex-A5 perform poorly with the NEON
+			 * code but very well with the scalar code, which also
+			 * uses less power: leave use_neon disabled on them.
+			 */
+			break;
+		default:
+			static_branch_enable(&use_neon);
+		}
+	}
+	return 0;
+}
+subsys_initcall(chacha_arm_mod_init);
+
+static void __exit chacha_arm_mod_exit(void)
+{
+}
+module_exit(chacha_arm_mod_exit);
+
+MODULE_DESCRIPTION("ChaCha and HChaCha functions (ARM optimized)");
+MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
+MODULE_LICENSE("GPL v2");
diff --git a/arch/arm/crypto/chacha-neon-core.S b/arch/arm/lib/crypto/chacha-neon-core.S
index 13d12f672656..ddd62b6294a5 100644
--- a/arch/arm/crypto/chacha-neon-core.S
+++ b/arch/arm/lib/crypto/chacha-neon-core.S
@@ -1,5 +1,5 @@
 /*
- * ChaCha/XChaCha NEON helper functions
+ * ChaCha/HChaCha NEON helper functions
  *
  * Copyright (C) 2016 Linaro, Ltd. <ard.biesheuvel@linaro.org>
  *
diff --git a/arch/arm/crypto/chacha-scalar-core.S b/arch/arm/lib/crypto/chacha-scalar-core.S
index 083fe1ab96d0..4951df05c158 100644
--- a/arch/arm/crypto/chacha-scalar-core.S
+++ b/arch/arm/lib/crypto/chacha-scalar-core.S
@@ -367,7 +367,7 @@
 
 /*
  * void chacha_doarm(u8 *dst, const u8 *src, unsigned int bytes,
- *		     const u32 *state, int nrounds);
+ *		     const struct chacha_state *state, int nrounds);
  */
 ENTRY(chacha_doarm)
 	cmp		r2, #0			// len == 0?
@@ -407,7 +407,8 @@ ENTRY(chacha_doarm)
 ENDPROC(chacha_doarm)
 
 /*
- * void hchacha_block_arm(const u32 state[16], u32 out[8], int nrounds);
+ * void hchacha_block_arm(const struct chacha_state *state,
+ *			  u32 out[HCHACHA_OUT_WORDS], int nrounds);
  */
 ENTRY(hchacha_block_arm)
 	push		{r1,r4-r11,lr}
diff --git a/arch/arm/crypto/poly1305-armv4.pl b/arch/arm/lib/crypto/poly1305-armv4.pl
index 6d79498d3115..d57c6e2fc84a 100644
--- a/arch/arm/crypto/poly1305-armv4.pl
+++ b/arch/arm/lib/crypto/poly1305-armv4.pl
@@ -43,9 +43,9 @@ $code.=<<___;
 #else
 # define __ARM_ARCH__ __LINUX_ARM_ARCH__
 # define __ARM_MAX_ARCH__ __LINUX_ARM_ARCH__
-# define poly1305_init   poly1305_init_arm
+# define poly1305_init   poly1305_block_init_arch
 # define poly1305_blocks poly1305_blocks_arm
-# define poly1305_emit   poly1305_emit_arm
+# define poly1305_emit   poly1305_emit_arch
 .globl	poly1305_blocks_neon
 #endif
 
diff --git a/arch/arm/lib/crypto/poly1305-glue.c b/arch/arm/lib/crypto/poly1305-glue.c
new file mode 100644
index 000000000000..2603b0771f2c
--- /dev/null
+++ b/arch/arm/lib/crypto/poly1305-glue.c
@@ -0,0 +1,81 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * OpenSSL/Cryptogams accelerated Poly1305 transform for ARM
+ *
+ * Copyright (C) 2019 Linaro Ltd. <ard.biesheuvel@linaro.org>
+ */
+
+#include <asm/hwcap.h>
+#include <asm/neon.h>
+#include <asm/simd.h>
+#include <crypto/internal/poly1305.h>
+#include <linux/cpufeature.h>
+#include <linux/jump_label.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/unaligned.h>
+
+asmlinkage void poly1305_block_init_arch(
+	struct poly1305_block_state *state,
+	const u8 raw_key[POLY1305_BLOCK_SIZE]);
+EXPORT_SYMBOL_GPL(poly1305_block_init_arch);
+asmlinkage void poly1305_blocks_arm(struct poly1305_block_state *state,
+				    const u8 *src, u32 len, u32 hibit);
+asmlinkage void poly1305_blocks_neon(struct poly1305_block_state *state,
+				     const u8 *src, u32 len, u32 hibit);
+asmlinkage void poly1305_emit_arch(const struct poly1305_state *state,
+				   u8 digest[POLY1305_DIGEST_SIZE],
+				   const u32 nonce[4]);
+EXPORT_SYMBOL_GPL(poly1305_emit_arch);
+
+void __weak poly1305_blocks_neon(struct poly1305_block_state *state,
+				 const u8 *src, u32 len, u32 hibit)
+{
+}
+
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon);
+
+/* Hash whole blocks; scalar code when NEON is absent or SIMD is unusable. */
+void poly1305_blocks_arch(struct poly1305_block_state *state, const u8 *src,
+			  unsigned int len, u32 padbit)
+{
+	len = round_down(len, POLY1305_BLOCK_SIZE);
+	if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) &&
+	    static_branch_likely(&have_neon) && likely(may_use_simd())) {
+		do {	/* bound each NEON section to SZ_4K bytes */
+			unsigned int todo = min_t(unsigned int, len, SZ_4K);
+
+			kernel_neon_begin();
+			poly1305_blocks_neon(state, src, todo, padbit);
+			kernel_neon_end();
+			len -= todo;
+			src += todo;
+		} while (len);
+	} else
+		poly1305_blocks_arm(state, src, len, padbit);
+}
+EXPORT_SYMBOL_GPL(poly1305_blocks_arch);
+
+bool poly1305_is_arch_optimized(void)
+{
+	/* We always can use at least the ARM scalar implementation. */
+	return true;
+}
+EXPORT_SYMBOL(poly1305_is_arch_optimized);
+
+static int __init arm_poly1305_mod_init(void)
+{
+	if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) &&
+	    (elf_hwcap & HWCAP_NEON))
+		static_branch_enable(&have_neon);
+	return 0;
+}
+subsys_initcall(arm_poly1305_mod_init);
+
+static void __exit arm_poly1305_mod_exit(void)
+{
+}
+module_exit(arm_poly1305_mod_exit);
+
+MODULE_DESCRIPTION("Accelerated Poly1305 transform for ARM");
+MODULE_LICENSE("GPL v2");
diff --git a/arch/arm/crypto/sha256-armv4.pl b/arch/arm/lib/crypto/sha256-armv4.pl
index f3a2b54efd4e..8122db7fd599 100644
--- a/arch/arm/crypto/sha256-armv4.pl
+++ b/arch/arm/lib/crypto/sha256-armv4.pl
@@ -204,18 +204,18 @@ K256:
 .word	0				@ terminator
 #if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
 .LOPENSSL_armcap:
-.word	OPENSSL_armcap_P-sha256_block_data_order
+.word	OPENSSL_armcap_P-sha256_blocks_arch
 #endif
 .align	5
 
-.global	sha256_block_data_order
-.type	sha256_block_data_order,%function
-sha256_block_data_order:
-.Lsha256_block_data_order:
+.global	sha256_blocks_arch
+.type	sha256_blocks_arch,%function
+sha256_blocks_arch:
+.Lsha256_blocks_arch:
 #if __ARM_ARCH__<7
-	sub	r3,pc,#8		@ sha256_block_data_order
+	sub	r3,pc,#8		@ sha256_blocks_arch
 #else
-	adr	r3,.Lsha256_block_data_order
+	adr	r3,.Lsha256_blocks_arch
 #endif
 #if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
 	ldr	r12,.LOPENSSL_armcap
@@ -282,7 +282,7 @@ $code.=<<___;
 	moveq	pc,lr			@ be binary compatible with V4, yet
 	bx	lr			@ interoperable with Thumb ISA:-)
 #endif
-.size	sha256_block_data_order,.-sha256_block_data_order
+.size	sha256_blocks_arch,.-sha256_blocks_arch
 ___
 ######################################################################
 # NEON stuff
@@ -470,8 +470,8 @@ sha256_block_data_order_neon:
 	stmdb	sp!,{r4-r12,lr}
 
 	sub	$H,sp,#16*4+16
-	adr	$Ktbl,.Lsha256_block_data_order
-	sub	$Ktbl,$Ktbl,#.Lsha256_block_data_order-K256
+	adr	$Ktbl,.Lsha256_blocks_arch
+	sub	$Ktbl,$Ktbl,#.Lsha256_blocks_arch-K256
 	bic	$H,$H,#15		@ align for 128-bit stores
 	mov	$t2,sp
 	mov	sp,$H			@ alloca
diff --git a/arch/arm/crypto/sha2-ce-core.S b/arch/arm/lib/crypto/sha256-ce.S
index b6369d2440a1..ac2c9b01b22d 100644
--- a/arch/arm/crypto/sha2-ce-core.S
+++ b/arch/arm/lib/crypto/sha256-ce.S
@@ -1,6 +1,6 @@
 /* SPDX-License-Identifier: GPL-2.0-only */
 /*
- * sha2-ce-core.S - SHA-224/256 secure hash using ARMv8 Crypto Extensions
+ * sha256-ce.S - SHA-224/256 secure hash using ARMv8 Crypto Extensions
  *
  * Copyright (C) 2015 Linaro Ltd.
  * Author: Ard Biesheuvel <ard.biesheuvel@linaro.org>
@@ -67,10 +67,10 @@
 	.word		0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
 
 	/*
-	 * void sha2_ce_transform(struct sha256_state *sst, u8 const *src,
-				  int blocks);
+	 * void sha256_ce_transform(u32 state[SHA256_STATE_WORDS],
+	 *			    const u8 *data, size_t nblocks);
 	 */
-ENTRY(sha2_ce_transform)
+ENTRY(sha256_ce_transform)
 	/* load state */
 	vld1.32		{dga-dgb}, [r0]
 
@@ -120,4 +120,4 @@ ENTRY(sha2_ce_transform)
 	/* store new state */
 	vst1.32		{dga-dgb}, [r0]
 	bx		lr
-ENDPROC(sha2_ce_transform)
+ENDPROC(sha256_ce_transform)
diff --git a/arch/arm/lib/crypto/sha256.c b/arch/arm/lib/crypto/sha256.c
new file mode 100644
index 000000000000..109192e54b0f
--- /dev/null
+++ b/arch/arm/lib/crypto/sha256.c
@@ -0,0 +1,64 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * SHA-256 optimized for ARM
+ *
+ * Copyright 2025 Google LLC
+ */
+#include <asm/neon.h>
+#include <crypto/internal/sha2.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+
+asmlinkage void sha256_blocks_arch(u32 state[SHA256_STATE_WORDS],
+				   const u8 *data, size_t nblocks);
+EXPORT_SYMBOL_GPL(sha256_blocks_arch);
+asmlinkage void sha256_block_data_order_neon(u32 state[SHA256_STATE_WORDS],
+					     const u8 *data, size_t nblocks);
+asmlinkage void sha256_ce_transform(u32 state[SHA256_STATE_WORDS],
+				    const u8 *data, size_t nblocks);
+
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon);
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_ce);
+
+void sha256_blocks_simd(u32 state[SHA256_STATE_WORDS],
+			const u8 *data, size_t nblocks)
+{
+	if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) &&
+	    static_branch_likely(&have_neon)) {
+		kernel_neon_begin();
+		if (static_branch_likely(&have_ce))
+			sha256_ce_transform(state, data, nblocks);
+		else
+			sha256_block_data_order_neon(state, data, nblocks);
+		kernel_neon_end();
+	} else {
+		sha256_blocks_arch(state, data, nblocks);
+	}
+}
+EXPORT_SYMBOL_GPL(sha256_blocks_simd);
+
+bool sha256_is_arch_optimized(void)
+{
+	/* We always can use at least the ARM scalar implementation. */
+	return true;
+}
+EXPORT_SYMBOL_GPL(sha256_is_arch_optimized);
+
+static int __init sha256_arm_mod_init(void)
+{
+	if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && (elf_hwcap & HWCAP_NEON)) {
+		static_branch_enable(&have_neon);
+		if (elf_hwcap2 & HWCAP2_SHA2)
+			static_branch_enable(&have_ce);
+	}
+	return 0;
+}
+subsys_initcall(sha256_arm_mod_init);
+
+static void __exit sha256_arm_mod_exit(void)
+{
+}
+module_exit(sha256_arm_mod_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("SHA-256 optimized for ARM");
diff --git a/arch/arm64/boot/dts/allwinner/sun50i-h6-beelink-gs1.dts b/arch/arm64/boot/dts/allwinner/sun50i-h6-beelink-gs1.dts
index 13a0e63afeaf..2c64d834a2c4 100644
--- a/arch/arm64/boot/dts/allwinner/sun50i-h6-beelink-gs1.dts
+++ b/arch/arm64/boot/dts/allwinner/sun50i-h6-beelink-gs1.dts
@@ -152,28 +152,12 @@
 	vcc-pg-supply = <&reg_aldo1>;
 };
 
-&r_ir {
-	linux,rc-map-name = "rc-beelink-gs1";
-	status = "okay";
-};
-
-&r_pio {
-	/*
-	 * FIXME: We can't add that supply for now since it would
-	 * create a circular dependency between pinctrl, the regulator
-	 * and the RSB Bus.
-	 *
-	 * vcc-pl-supply = <&reg_aldo1>;
-	 */
-	vcc-pm-supply = <&reg_aldo1>;
-};
-
-&r_rsb {
+&r_i2c {
 	status = "okay";
 
-	axp805: pmic@745 {
+	axp805: pmic@36 {
 		compatible = "x-powers,axp805", "x-powers,axp806";
-		reg = <0x745>;
+		reg = <0x36>;
 		interrupt-parent = <&r_intc>;
 		interrupts = <GIC_SPI 96 IRQ_TYPE_LEVEL_LOW>;
 		interrupt-controller;
@@ -291,6 +275,22 @@
 	};
 };
 
+&r_ir {
+	linux,rc-map-name = "rc-beelink-gs1";
+	status = "okay";
+};
+
+&r_pio {
+	/*
+	 * PL0 and PL1 carry the I2C bus to the PMIC, so the
+	 * pl-supply must not be enabled here: doing so makes
+	 * the board fail at boot.
+	 *
+	 * vcc-pl-supply = <&reg_aldo1>;
+	 */
+	vcc-pm-supply = <&reg_aldo1>;
+};
+
 &spdif {
 	pinctrl-names = "default";
 	pinctrl-0 = <&spdif_tx_pin>;
diff --git a/arch/arm64/boot/dts/allwinner/sun50i-h6-orangepi-3.dts b/arch/arm64/boot/dts/allwinner/sun50i-h6-orangepi-3.dts
index ab87c3447cd7..f005072c68a1 100644
--- a/arch/arm64/boot/dts/allwinner/sun50i-h6-orangepi-3.dts
+++ b/arch/arm64/boot/dts/allwinner/sun50i-h6-orangepi-3.dts
@@ -176,16 +176,12 @@
 	vcc-pg-supply = <&reg_vcc_wifi_io>;
 };
 
-&r_ir {
-	status = "okay";
-};
-
-&r_rsb {
+&r_i2c {
 	status = "okay";
 
-	axp805: pmic@745 {
+	axp805: pmic@36 {
 		compatible = "x-powers,axp805", "x-powers,axp806";
-		reg = <0x745>;
+		reg = <0x36>;
 		interrupt-parent = <&r_intc>;
 		interrupts = <GIC_SPI 96 IRQ_TYPE_LEVEL_LOW>;
 		interrupt-controller;
@@ -296,6 +292,10 @@
 	};
 };
 
+&r_ir {
+	status = "okay";
+};
+
 &rtc {
 	clocks = <&ext_osc32k>;
 };
diff --git a/arch/arm64/boot/dts/allwinner/sun50i-h6-orangepi.dtsi b/arch/arm64/boot/dts/allwinner/sun50i-h6-orangepi.dtsi
index d05dc5d6e6b9..e34dbb992021 100644
--- a/arch/arm64/boot/dts/allwinner/sun50i-h6-orangepi.dtsi
+++ b/arch/arm64/boot/dts/allwinner/sun50i-h6-orangepi.dtsi
@@ -113,20 +113,12 @@
 	vcc-pg-supply = <&reg_aldo1>;
 };
 
-&r_ir {
-	status = "okay";
-};
-
-&r_pio {
-	vcc-pm-supply = <&reg_bldo3>;
-};
-
-&r_rsb {
+&r_i2c {
 	status = "okay";
 
-	axp805: pmic@745 {
+	axp805: pmic@36 {
 		compatible = "x-powers,axp805", "x-powers,axp806";
-		reg = <0x745>;
+		reg = <0x36>;
 		interrupt-parent = <&r_intc>;
 		interrupts = <GIC_SPI 96 IRQ_TYPE_LEVEL_LOW>;
 		interrupt-controller;
@@ -241,6 +233,14 @@
 	};
 };
 
+&r_ir {
+	status = "okay";
+};
+
+&r_pio {
+	vcc-pm-supply = <&reg_bldo3>;
+};
+
 &rtc {
 	clocks = <&ext_osc32k>;
 };
diff --git a/arch/arm64/boot/dts/amazon/alpine-v2.dtsi b/arch/arm64/boot/dts/amazon/alpine-v2.dtsi
index da9de4986660..5a72f0b64247 100644
--- a/arch/arm64/boot/dts/amazon/alpine-v2.dtsi
+++ b/arch/arm64/boot/dts/amazon/alpine-v2.dtsi
@@ -151,7 +151,7 @@
 			al,msi-num-spis = <160>;
 		};
 
-		io-fabric@fc000000 {
+		io-bus@fc000000 {
 			compatible = "simple-bus";
 			#address-cells = <1>;
 			#size-cells = <1>;
diff --git a/arch/arm64/boot/dts/amazon/alpine-v3.dtsi b/arch/arm64/boot/dts/amazon/alpine-v3.dtsi
index 8b6156b5af65..dea60d136c2e 100644
--- a/arch/arm64/boot/dts/amazon/alpine-v3.dtsi
+++ b/arch/arm64/boot/dts/amazon/alpine-v3.dtsi
@@ -361,7 +361,7 @@
 			interrupt-parent = <&gic>;
 		};
 
-		io-fabric@fc000000 {
+		io-bus@fc000000 {
 			compatible = "simple-bus";
 			#address-cells = <1>;
 			#size-cells = <1>;
diff --git a/arch/arm64/boot/dts/amlogic/meson-g12-common.dtsi b/arch/arm64/boot/dts/amlogic/meson-g12-common.dtsi
index ab2b3f15ef19..69834b49673d 100644
--- a/arch/arm64/boot/dts/amlogic/meson-g12-common.dtsi
+++ b/arch/arm64/boot/dts/amlogic/meson-g12-common.dtsi
@@ -2313,7 +2313,7 @@
 					     "amlogic,meson8-pwm-v2";
 				reg = <0x0 0x19000 0x0 0x20>;
 				clocks = <&xtal>,
-					 <>, /* unknown/untested, the datasheet calls it "vid_pll" */
+					 <0>, /* unknown/untested, the datasheet calls it "vid_pll" */
 					 <&clkc CLKID_FCLK_DIV4>,
 					 <&clkc CLKID_FCLK_DIV3>;
 				#pwm-cells = <3>;
@@ -2325,7 +2325,7 @@
 					     "amlogic,meson8-pwm-v2";
 				reg = <0x0 0x1a000 0x0 0x20>;
 				clocks = <&xtal>,
-					 <>, /* unknown/untested, the datasheet calls it "vid_pll" */
+					 <0>, /* unknown/untested, the datasheet calls it "vid_pll" */
 					 <&clkc CLKID_FCLK_DIV4>,
 					 <&clkc CLKID_FCLK_DIV3>;
 				#pwm-cells = <3>;
@@ -2337,7 +2337,7 @@
 					     "amlogic,meson8-pwm-v2";
 				reg = <0x0 0x1b000 0x0 0x20>;
 				clocks = <&xtal>,
-					 <>, /* unknown/untested, the datasheet calls it "vid_pll" */
+					 <0>, /* unknown/untested, the datasheet calls it "vid_pll" */
 					 <&clkc CLKID_FCLK_DIV4>,
 					 <&clkc CLKID_FCLK_DIV3>;
 				#pwm-cells = <3>;
diff --git a/arch/arm64/boot/dts/amlogic/meson-g12b-dreambox.dtsi b/arch/arm64/boot/dts/amlogic/meson-g12b-dreambox.dtsi
index de35fa2d7a6d..8e3e3354ed67 100644
--- a/arch/arm64/boot/dts/amlogic/meson-g12b-dreambox.dtsi
+++ b/arch/arm64/boot/dts/amlogic/meson-g12b-dreambox.dtsi
@@ -116,6 +116,10 @@
 	status = "okay";
 };
 
+&clkc_audio {
+	status = "okay";
+};
+
 &frddr_a {
 	status = "okay";
 };
diff --git a/arch/arm64/boot/dts/amlogic/meson-gxbb.dtsi b/arch/arm64/boot/dts/amlogic/meson-gxbb.dtsi
index 8ebce7114a60..6c134592c7bb 100644
--- a/arch/arm64/boot/dts/amlogic/meson-gxbb.dtsi
+++ b/arch/arm64/boot/dts/amlogic/meson-gxbb.dtsi
@@ -741,7 +741,7 @@
 
 &pwm_ab {
 	clocks = <&xtal>,
-		 <>, /* unknown/untested, the datasheet calls it "vid_pll" */
+		 <0>, /* unknown/untested, the datasheet calls it "vid_pll" */
 		 <&clkc CLKID_FCLK_DIV4>,
 		 <&clkc CLKID_FCLK_DIV3>;
 };
@@ -752,14 +752,14 @@
 
 &pwm_cd {
 	clocks = <&xtal>,
-		 <>, /* unknown/untested, the datasheet calls it "vid_pll" */
+		 <0>, /* unknown/untested, the datasheet calls it "vid_pll" */
 		 <&clkc CLKID_FCLK_DIV4>,
 		 <&clkc CLKID_FCLK_DIV3>;
 };
 
 &pwm_ef {
 	clocks = <&xtal>,
-		 <>, /* unknown/untested, the datasheet calls it "vid_pll" */
+		 <0>, /* unknown/untested, the datasheet calls it "vid_pll" */
 		 <&clkc CLKID_FCLK_DIV4>,
 		 <&clkc CLKID_FCLK_DIV3>;
 };
diff --git a/arch/arm64/boot/dts/amlogic/meson-gxl.dtsi b/arch/arm64/boot/dts/amlogic/meson-gxl.dtsi
index 2dc2fdaecf9f..19b8a39de6a0 100644
--- a/arch/arm64/boot/dts/amlogic/meson-gxl.dtsi
+++ b/arch/arm64/boot/dts/amlogic/meson-gxl.dtsi
@@ -811,7 +811,7 @@
 
 &pwm_ab {
 	clocks = <&xtal>,
-		 <>, /* unknown/untested, the datasheet calls it "vid_pll" */
+		 <0>, /* unknown/untested, the datasheet calls it "vid_pll" */
 		 <&clkc CLKID_FCLK_DIV4>,
 		 <&clkc CLKID_FCLK_DIV3>;
 };
@@ -822,14 +822,14 @@
 
 &pwm_cd {
 	clocks = <&xtal>,
-		 <>, /* unknown/untested, the datasheet calls it "vid_pll" */
+		 <0>, /* unknown/untested, the datasheet calls it "vid_pll" */
 		 <&clkc CLKID_FCLK_DIV4>,
 		 <&clkc CLKID_FCLK_DIV3>;
 };
 
 &pwm_ef {
 	clocks = <&xtal>,
-		 <>, /* unknown/untested, the datasheet calls it "vid_pll" */
+		 <0>, /* unknown/untested, the datasheet calls it "vid_pll" */
 		 <&clkc CLKID_FCLK_DIV4>,
 		 <&clkc CLKID_FCLK_DIV3>;
 };
diff --git a/arch/arm64/boot/dts/apple/t8103-j293.dts b/arch/arm64/boot/dts/apple/t8103-j293.dts
index 2dfe7b895b2b..e2d9439397f7 100644
--- a/arch/arm64/boot/dts/apple/t8103-j293.dts
+++ b/arch/arm64/boot/dts/apple/t8103-j293.dts
@@ -77,6 +77,16 @@
 	};
 };
 
+/*
+ * The driver depends on boot loader initialized state which resets when this
+ * power-domain is powered off. This happens on suspend or when the driver is
+ * missing during boot. Mark the domain as always on until the driver can
+ * handle this.
+ */
+&ps_dispdfr_be {
+	apple,always-on;
+};
+
 &display_dfr {
 	status = "okay";
 };
diff --git a/arch/arm64/boot/dts/apple/t8112-j493.dts b/arch/arm64/boot/dts/apple/t8112-j493.dts
index 3d73f9ee2f46..be86d34c6696 100644
--- a/arch/arm64/boot/dts/apple/t8112-j493.dts
+++ b/arch/arm64/boot/dts/apple/t8112-j493.dts
@@ -40,6 +40,16 @@
 	};
 };
 
+/*
+ * The driver depends on boot loader initialized state which resets when this
+ * power-domain is powered off. This happens on suspend or when the driver is
+ * missing during boot. Mark the domain as always on until the driver can
+ * handle this.
+ */
+&ps_dispdfr_be {
+	apple,always-on;
+};
+
 &display_dfr {
 	status = "okay";
 };
diff --git a/arch/arm64/boot/dts/arm/morello.dtsi b/arch/arm64/boot/dts/arm/morello.dtsi
index 0bab0b3ea969..5bc1c725dc86 100644
--- a/arch/arm64/boot/dts/arm/morello.dtsi
+++ b/arch/arm64/boot/dts/arm/morello.dtsi
@@ -44,7 +44,7 @@
 			next-level-cache = <&l2_0>;
 			clocks = <&scmi_dvfs 0>;
 
-			l2_0: l2-cache-0 {
+			l2_0: l2-cache {
 				compatible = "cache";
 				cache-level = <2>;
 				/* 8 ways set associative */
@@ -53,13 +53,6 @@
 				cache-sets = <2048>;
 				cache-unified;
 				next-level-cache = <&l3_0>;
-
-				l3_0: l3-cache {
-					compatible = "cache";
-					cache-level = <3>;
-					cache-size = <0x100000>;
-					cache-unified;
-				};
 			};
 		};
 
@@ -78,7 +71,7 @@
 			next-level-cache = <&l2_1>;
 			clocks = <&scmi_dvfs 0>;
 
-			l2_1: l2-cache-1 {
+			l2_1: l2-cache {
 				compatible = "cache";
 				cache-level = <2>;
 				/* 8 ways set associative */
@@ -105,7 +98,7 @@
 			next-level-cache = <&l2_2>;
 			clocks = <&scmi_dvfs 1>;
 
-			l2_2: l2-cache-2 {
+			l2_2: l2-cache {
 				compatible = "cache";
 				cache-level = <2>;
 				/* 8 ways set associative */
@@ -132,7 +125,7 @@
 			next-level-cache = <&l2_3>;
 			clocks = <&scmi_dvfs 1>;
 
-			l2_3: l2-cache-3 {
+			l2_3: l2-cache {
 				compatible = "cache";
 				cache-level = <2>;
 				/* 8 ways set associative */
@@ -143,6 +136,13 @@
 				next-level-cache = <&l3_0>;
 			};
 		};
+
+		l3_0: l3-cache {
+			compatible = "cache";
+			cache-level = <3>;
+			cache-size = <0x100000>;
+			cache-unified;
+		};
 	};
 
 	firmware {
diff --git a/arch/arm64/boot/dts/freescale/imx8mm-verdin.dtsi b/arch/arm64/boot/dts/freescale/imx8mm-verdin.dtsi
index 7251ad3a0017..b46566f3ce20 100644
--- a/arch/arm64/boot/dts/freescale/imx8mm-verdin.dtsi
+++ b/arch/arm64/boot/dts/freescale/imx8mm-verdin.dtsi
@@ -144,6 +144,19 @@
 		startup-delay-us = <20000>;
 	};
 
+	reg_usdhc2_vqmmc: regulator-usdhc2-vqmmc {
+		compatible = "regulator-gpio";
+		pinctrl-names = "default";
+		pinctrl-0 = <&pinctrl_usdhc2_vsel>;
+		gpios = <&gpio1 4 GPIO_ACTIVE_HIGH>;
+		regulator-max-microvolt = <3300000>;
+		regulator-min-microvolt = <1800000>;
+		states = <1800000 0x1>,
+			 <3300000 0x0>;
+		regulator-name = "PMIC_USDHC_VSELECT";
+		vin-supply = <&reg_nvcc_sd>;
+	};
+
 	reserved-memory {
 		#address-cells = <2>;
 		#size-cells = <2>;
@@ -269,7 +282,7 @@
 			  "SODIMM_19",
 			  "",
 			  "",
-			  "",
+			  "PMIC_USDHC_VSELECT",
 			  "",
 			  "",
 			  "",
@@ -785,6 +798,7 @@
 	pinctrl-2 = <&pinctrl_usdhc2_200mhz>, <&pinctrl_usdhc2_cd>;
 	pinctrl-3 = <&pinctrl_usdhc2_sleep>, <&pinctrl_usdhc2_cd_sleep>;
 	vmmc-supply = <&reg_usdhc2_vmmc>;
+	vqmmc-supply = <&reg_usdhc2_vqmmc>;
 };
 
 &wdog1 {
@@ -1206,13 +1220,17 @@
 			<MX8MM_IOMUXC_NAND_CLE_GPIO3_IO5		0x6>;	/* SODIMM 76 */
 	};
 
+	pinctrl_usdhc2_vsel: usdhc2vselgrp {
+		fsl,pins =
+			<MX8MM_IOMUXC_GPIO1_IO04_GPIO1_IO4	0x10>; /* PMIC_USDHC_VSELECT */
+	};
+
 	/*
 	 * Note: Due to ERR050080 we use discrete external on-module resistors pulling-up to the
 	 * on-module +V3.3_1.8_SD (LDO5) rail and explicitly disable the internal pull-ups here.
 	 */
 	pinctrl_usdhc2: usdhc2grp {
 		fsl,pins =
-			<MX8MM_IOMUXC_GPIO1_IO04_USDHC2_VSELECT		0x10>,
 			<MX8MM_IOMUXC_SD2_CLK_USDHC2_CLK		0x90>,	/* SODIMM 78 */
 			<MX8MM_IOMUXC_SD2_CMD_USDHC2_CMD		0x90>,	/* SODIMM 74 */
 			<MX8MM_IOMUXC_SD2_DATA0_USDHC2_DATA0		0x90>,	/* SODIMM 80 */
@@ -1223,7 +1241,6 @@
 
 	pinctrl_usdhc2_100mhz: usdhc2-100mhzgrp {
 		fsl,pins =
-			<MX8MM_IOMUXC_GPIO1_IO04_USDHC2_VSELECT		0x10>,
 			<MX8MM_IOMUXC_SD2_CLK_USDHC2_CLK		0x94>,
 			<MX8MM_IOMUXC_SD2_CMD_USDHC2_CMD		0x94>,
 			<MX8MM_IOMUXC_SD2_DATA0_USDHC2_DATA0		0x94>,
@@ -1234,7 +1251,6 @@
 
 	pinctrl_usdhc2_200mhz: usdhc2-200mhzgrp {
 		fsl,pins =
-			<MX8MM_IOMUXC_GPIO1_IO04_USDHC2_VSELECT		0x10>,
 			<MX8MM_IOMUXC_SD2_CLK_USDHC2_CLK		0x96>,
 			<MX8MM_IOMUXC_SD2_CMD_USDHC2_CMD		0x96>,
 			<MX8MM_IOMUXC_SD2_DATA0_USDHC2_DATA0		0x96>,
@@ -1246,7 +1262,6 @@
 	/* Avoid backfeeding with removed card power */
 	pinctrl_usdhc2_sleep: usdhc2slpgrp {
 		fsl,pins =
-			<MX8MM_IOMUXC_GPIO1_IO04_USDHC2_VSELECT		0x0>,
 			<MX8MM_IOMUXC_SD2_CLK_USDHC2_CLK		0x0>,
 			<MX8MM_IOMUXC_SD2_CMD_USDHC2_CMD		0x0>,
 			<MX8MM_IOMUXC_SD2_DATA0_USDHC2_DATA0		0x0>,
diff --git a/arch/arm64/boot/dts/freescale/imx8mp-nominal.dtsi b/arch/arm64/boot/dts/freescale/imx8mp-nominal.dtsi
index a1b75c9068b2..2ce1860b244d 100644
--- a/arch/arm64/boot/dts/freescale/imx8mp-nominal.dtsi
+++ b/arch/arm64/boot/dts/freescale/imx8mp-nominal.dtsi
@@ -24,6 +24,20 @@
 	fsl,operating-mode = "nominal";
 };
 
+&gpu2d {
+	assigned-clocks = <&clk IMX8MP_CLK_GPU2D_CORE>;
+	assigned-clock-parents = <&clk IMX8MP_SYS_PLL1_800M>;
+	assigned-clock-rates = <800000000>;
+};
+
+&gpu3d {
+	assigned-clocks = <&clk IMX8MP_CLK_GPU3D_CORE>,
+			  <&clk IMX8MP_CLK_GPU3D_SHADER_CORE>;
+	assigned-clock-parents = <&clk IMX8MP_SYS_PLL1_800M>,
+				 <&clk IMX8MP_SYS_PLL1_800M>;
+	assigned-clock-rates = <800000000>, <800000000>;
+};
+
 &pgc_hdmimix {
 	assigned-clocks = <&clk IMX8MP_CLK_HDMI_AXI>,
 			  <&clk IMX8MP_CLK_HDMI_APB>;
@@ -46,6 +60,18 @@
 	assigned-clock-rates = <600000000>, <300000000>;
 };
 
+&pgc_mlmix {
+	assigned-clocks = <&clk IMX8MP_CLK_ML_CORE>,
+			  <&clk IMX8MP_CLK_ML_AXI>,
+			  <&clk IMX8MP_CLK_ML_AHB>;
+	assigned-clock-parents = <&clk IMX8MP_SYS_PLL1_800M>,
+				 <&clk IMX8MP_SYS_PLL1_800M>,
+				 <&clk IMX8MP_SYS_PLL1_800M>;
+	assigned-clock-rates = <800000000>,
+			       <800000000>,
+			       <300000000>;
+};
+
 &media_blk_ctrl {
 	assigned-clocks = <&clk IMX8MP_CLK_MEDIA_AXI>,
 			  <&clk IMX8MP_CLK_MEDIA_APB>,
@@ -62,3 +88,5 @@
 			       <0>, <0>, <400000000>,
 			       <1039500000>;
 };
+
+/delete-node/ &{noc_opp_table/opp-1000000000};
diff --git a/arch/arm64/boot/dts/freescale/imx8mp-var-som.dtsi b/arch/arm64/boot/dts/freescale/imx8mp-var-som.dtsi
index b2ac2583a592..b59da91fdd04 100644
--- a/arch/arm64/boot/dts/freescale/imx8mp-var-som.dtsi
+++ b/arch/arm64/boot/dts/freescale/imx8mp-var-som.dtsi
@@ -35,7 +35,6 @@
 		      <0x1 0x00000000 0 0xc0000000>;
 	};
 
-
 	reg_usdhc2_vmmc: regulator-usdhc2-vmmc {
 	        compatible = "regulator-fixed";
 	        regulator-name = "VSD_3V3";
@@ -46,6 +45,16 @@
 	        startup-delay-us = <100>;
 	        off-on-delay-us = <12000>;
 	};
+
+	reg_usdhc2_vqmmc: regulator-usdhc2-vqmmc {
+		compatible = "regulator-gpio";
+		regulator-name = "VSD_VSEL";
+		regulator-min-microvolt = <1800000>;
+		regulator-max-microvolt = <3300000>;
+		gpios = <&gpio2 12 GPIO_ACTIVE_HIGH>;
+		states = <3300000 0x0 1800000 0x1>;
+		vin-supply = <&ldo5>;
+	};
 };
 
 &A53_0 {
@@ -205,6 +214,7 @@
         pinctrl-2 = <&pinctrl_usdhc2_200mhz>, <&pinctrl_usdhc2_gpio>;
         cd-gpios = <&gpio1 14 GPIO_ACTIVE_LOW>;
         vmmc-supply = <&reg_usdhc2_vmmc>;
+	vqmmc-supply = <&reg_usdhc2_vqmmc>;
         bus-width = <4>;
         status = "okay";
 };
diff --git a/arch/arm64/boot/dts/freescale/imx8mp.dtsi b/arch/arm64/boot/dts/freescale/imx8mp.dtsi
index ce6793b2d57e..7c1c87eab54c 100644
--- a/arch/arm64/boot/dts/freescale/imx8mp.dtsi
+++ b/arch/arm64/boot/dts/freescale/imx8mp.dtsi
@@ -1645,6 +1645,12 @@
 					opp-hz = /bits/ 64 <200000000>;
 				};
 
+				/* Nominal drive mode maximum */
+				opp-800000000 {
+					opp-hz = /bits/ 64 <800000000>;
+				};
+
+				/* Overdrive mode maximum */
 				opp-1000000000 {
 					opp-hz = /bits/ 64 <1000000000>;
 				};
diff --git a/arch/arm64/boot/dts/freescale/imx95.dtsi b/arch/arm64/boot/dts/freescale/imx95.dtsi
index 9bb26b466a06..59f057ba6fa7 100644
--- a/arch/arm64/boot/dts/freescale/imx95.dtsi
+++ b/arch/arm64/boot/dts/freescale/imx95.dtsi
@@ -1626,7 +1626,7 @@
 			reg = <0 0x4c300000 0 0x10000>,
 			      <0 0x60100000 0 0xfe00000>,
 			      <0 0x4c360000 0 0x10000>,
-			      <0 0x4c340000 0 0x2000>;
+			      <0 0x4c340000 0 0x4000>;
 			reg-names = "dbi", "config", "atu", "app";
 			ranges = <0x81000000 0x0 0x00000000 0x0 0x6ff00000 0 0x00100000>,
 				 <0x82000000 0x0 0x10000000 0x9 0x10000000 0 0x10000000>;
@@ -1673,7 +1673,7 @@
 			reg = <0 0x4c300000 0 0x10000>,
 			      <0 0x4c360000 0 0x1000>,
 			      <0 0x4c320000 0 0x1000>,
-			      <0 0x4c340000 0 0x2000>,
+			      <0 0x4c340000 0 0x4000>,
 			      <0 0x4c370000 0 0x10000>,
 			      <0x9 0 1 0>;
 			reg-names = "dbi","atu", "dbi2", "app", "dma", "addr_space";
@@ -1700,7 +1700,7 @@
 			reg = <0 0x4c380000 0 0x10000>,
 			      <8 0x80100000 0 0xfe00000>,
 			      <0 0x4c3e0000 0 0x10000>,
-			      <0 0x4c3c0000 0 0x2000>;
+			      <0 0x4c3c0000 0 0x4000>;
 			reg-names = "dbi", "config", "atu", "app";
 			ranges = <0x81000000 0 0x00000000 0x8 0x8ff00000 0 0x00100000>,
 				 <0x82000000 0 0x10000000 0xa 0x10000000 0 0x10000000>;
@@ -1749,7 +1749,7 @@
 			reg = <0 0x4c380000 0 0x10000>,
 			      <0 0x4c3e0000 0 0x1000>,
 			      <0 0x4c3a0000 0 0x1000>,
-			      <0 0x4c3c0000 0 0x2000>,
+			      <0 0x4c3c0000 0 0x4000>,
 			      <0 0x4c3f0000 0 0x10000>,
 			      <0xa 0 1 0>;
 			reg-names = "dbi", "atu", "dbi2", "app", "dma", "addr_space";
diff --git a/arch/arm64/boot/dts/marvell/armada-3720-uDPU.dtsi b/arch/arm64/boot/dts/marvell/armada-3720-uDPU.dtsi
index 3a9b6907185d..242820845707 100644
--- a/arch/arm64/boot/dts/marvell/armada-3720-uDPU.dtsi
+++ b/arch/arm64/boot/dts/marvell/armada-3720-uDPU.dtsi
@@ -26,6 +26,8 @@
 
 	leds {
 		compatible = "gpio-leds";
+		pinctrl-names = "default";
+		pinctrl-0 = <&spi_quad_pins>;
 
 		led-power1 {
 			label = "udpu:green:power";
@@ -82,8 +84,6 @@
 
 &spi0 {
 	status = "okay";
-	pinctrl-names = "default";
-	pinctrl-0 = <&spi_quad_pins>;
 
 	flash@0 {
 		compatible = "jedec,spi-nor";
@@ -108,6 +108,10 @@
 	};
 };
 
+&spi_quad_pins {
+	function = "gpio";
+};
+
 &pinctrl_nb {
 	i2c2_recovery_pins: i2c2-recovery-pins {
 		groups = "i2c2";
diff --git a/arch/arm64/boot/dts/rockchip/px30-engicam-common.dtsi b/arch/arm64/boot/dts/rockchip/px30-engicam-common.dtsi
index 1edfd643b25a..a334ef0629d1 100644
--- a/arch/arm64/boot/dts/rockchip/px30-engicam-common.dtsi
+++ b/arch/arm64/boot/dts/rockchip/px30-engicam-common.dtsi
@@ -31,7 +31,7 @@
 	};
 
 	vcc3v3_btreg: vcc3v3-btreg {
-		compatible = "regulator-gpio";
+		compatible = "regulator-fixed";
 		enable-active-high;
 		pinctrl-names = "default";
 		pinctrl-0 = <&bt_enable_h>;
@@ -39,7 +39,6 @@
 		regulator-min-microvolt = <3300000>;
 		regulator-max-microvolt = <3300000>;
 		regulator-always-on;
-		states = <3300000 0x0>;
 	};
 
 	vcc3v3_rf_aux_mod: regulator-vcc3v3-rf-aux-mod {
diff --git a/arch/arm64/boot/dts/rockchip/px30-engicam-ctouch2.dtsi b/arch/arm64/boot/dts/rockchip/px30-engicam-ctouch2.dtsi
index 80db778c9684..b60e68faa83a 100644
--- a/arch/arm64/boot/dts/rockchip/px30-engicam-ctouch2.dtsi
+++ b/arch/arm64/boot/dts/rockchip/px30-engicam-ctouch2.dtsi
@@ -26,5 +26,5 @@
 };
 
 &vcc3v3_btreg {
-	enable-gpios = <&gpio1 RK_PC3 GPIO_ACTIVE_HIGH>;
+	gpios = <&gpio1 RK_PC3 GPIO_ACTIVE_HIGH>;
 };
diff --git a/arch/arm64/boot/dts/rockchip/px30-engicam-px30-core-edimm2.2.dts b/arch/arm64/boot/dts/rockchip/px30-engicam-px30-core-edimm2.2.dts
index 165d09ccb942..5886b802c520 100644
--- a/arch/arm64/boot/dts/rockchip/px30-engicam-px30-core-edimm2.2.dts
+++ b/arch/arm64/boot/dts/rockchip/px30-engicam-px30-core-edimm2.2.dts
@@ -39,5 +39,5 @@
 };
 
 &vcc3v3_btreg {
-	enable-gpios = <&gpio1 RK_PC2 GPIO_ACTIVE_HIGH>;
+	gpios = <&gpio1 RK_PC2 GPIO_ACTIVE_HIGH>;
 };
diff --git a/arch/arm64/boot/dts/rockchip/rk3399-rock-pi-4.dtsi b/arch/arm64/boot/dts/rockchip/rk3399-rock-pi-4.dtsi
index 541dca12bf1a..046dbe329017 100644
--- a/arch/arm64/boot/dts/rockchip/rk3399-rock-pi-4.dtsi
+++ b/arch/arm64/boot/dts/rockchip/rk3399-rock-pi-4.dtsi
@@ -43,7 +43,7 @@
 	sdio_pwrseq: sdio-pwrseq {
 		compatible = "mmc-pwrseq-simple";
 		clocks = <&rk808 1>;
-		clock-names = "lpo";
+		clock-names = "ext_clock";
 		pinctrl-names = "default";
 		pinctrl-0 = <&wifi_enable_h>;
 		reset-gpios = <&gpio0 RK_PB2 GPIO_ACTIVE_LOW>;
diff --git a/arch/arm64/boot/dts/rockchip/rk3566-bigtreetech-cb2.dtsi b/arch/arm64/boot/dts/rockchip/rk3566-bigtreetech-cb2.dtsi
index a48351471764..e7ba477e75f9 100644
--- a/arch/arm64/boot/dts/rockchip/rk3566-bigtreetech-cb2.dtsi
+++ b/arch/arm64/boot/dts/rockchip/rk3566-bigtreetech-cb2.dtsi
@@ -775,7 +775,7 @@
 	rockchip,default-sample-phase = <90>;
 	status = "okay";
 
-	sdio-wifi@1 {
+	wifi@1 {
 		compatible = "brcm,bcm4329-fmac";
 		reg = <1>;
 		interrupt-parent = <&gpio2>;
diff --git a/arch/arm64/boot/dts/rockchip/rk3568-qnap-ts433.dts b/arch/arm64/boot/dts/rockchip/rk3568-qnap-ts433.dts
index 7bd32d230ad2..b80d628c426b 100644
--- a/arch/arm64/boot/dts/rockchip/rk3568-qnap-ts433.dts
+++ b/arch/arm64/boot/dts/rockchip/rk3568-qnap-ts433.dts
@@ -619,6 +619,8 @@
 	bus-width = <8>;
 	max-frequency = <200000000>;
 	non-removable;
+	pinctrl-names = "default";
+	pinctrl-0 = <&emmc_bus8 &emmc_clk &emmc_cmd &emmc_datastrobe>;
 	status = "okay";
 };
 
diff --git a/arch/arm64/boot/dts/rockchip/rk3576-armsom-sige5.dts b/arch/arm64/boot/dts/rockchip/rk3576-armsom-sige5.dts
index 828bde7fab68..314067ba6f3c 100644
--- a/arch/arm64/boot/dts/rockchip/rk3576-armsom-sige5.dts
+++ b/arch/arm64/boot/dts/rockchip/rk3576-armsom-sige5.dts
@@ -610,7 +610,7 @@
 		reg = <0x51>;
 		clock-output-names = "hym8563";
 		interrupt-parent = <&gpio0>;
-		interrupts = <RK_PB0 IRQ_TYPE_LEVEL_LOW>;
+		interrupts = <RK_PA0 IRQ_TYPE_LEVEL_LOW>;
 		pinctrl-names = "default";
 		pinctrl-0 = <&hym8563_int>;
 		wakeup-source;
diff --git a/arch/arm64/boot/dts/rockchip/rk3588-friendlyelec-cm3588.dtsi b/arch/arm64/boot/dts/rockchip/rk3588-friendlyelec-cm3588.dtsi
index 1af0a30866f6..af431fdcbea7 100644
--- a/arch/arm64/boot/dts/rockchip/rk3588-friendlyelec-cm3588.dtsi
+++ b/arch/arm64/boot/dts/rockchip/rk3588-friendlyelec-cm3588.dtsi
@@ -222,6 +222,10 @@
 		compatible = "realtek,rt5616";
 		reg = <0x1b>;
 		#sound-dai-cells = <0>;
+		assigned-clocks = <&cru I2S0_8CH_MCLKOUT>;
+		assigned-clock-rates = <12288000>;
+		clocks = <&cru I2S0_8CH_MCLKOUT>;
+		clock-names = "mclk";
 	};
 };
 
diff --git a/arch/arm64/boot/dts/rockchip/rk3588-turing-rk1.dtsi b/arch/arm64/boot/dts/rockchip/rk3588-turing-rk1.dtsi
index 711ac4f2c7cb..60ad272982ad 100644
--- a/arch/arm64/boot/dts/rockchip/rk3588-turing-rk1.dtsi
+++ b/arch/arm64/boot/dts/rockchip/rk3588-turing-rk1.dtsi
@@ -214,6 +214,8 @@
 };
 
 &package_thermal {
+	polling-delay = <1000>;
+
 	trips {
 		package_active1: trip-active1 {
 			temperature = <45000>;
diff --git a/arch/arm64/boot/dts/rockchip/rk3588j.dtsi b/arch/arm64/boot/dts/rockchip/rk3588j.dtsi
index bce72bac4503..3045cb3bd68c 100644
--- a/arch/arm64/boot/dts/rockchip/rk3588j.dtsi
+++ b/arch/arm64/boot/dts/rockchip/rk3588j.dtsi
@@ -11,20 +11,15 @@
 		compatible = "operating-points-v2";
 		opp-shared;
 
-		opp-1416000000 {
-			opp-hz = /bits/ 64 <1416000000>;
+		opp-1200000000 {
+			opp-hz = /bits/ 64 <1200000000>;
 			opp-microvolt = <750000 750000 950000>;
 			clock-latency-ns = <40000>;
 			opp-suspend;
 		};
-		opp-1608000000 {
-			opp-hz = /bits/ 64 <1608000000>;
-			opp-microvolt = <887500 887500 950000>;
-			clock-latency-ns = <40000>;
-		};
-		opp-1704000000 {
-			opp-hz = /bits/ 64 <1704000000>;
-			opp-microvolt = <937500 937500 950000>;
+		opp-1296000000 {
+			opp-hz = /bits/ 64 <1296000000>;
+			opp-microvolt = <775000 775000 950000>;
 			clock-latency-ns = <40000>;
 		};
 	};
@@ -33,9 +28,14 @@
 		compatible = "operating-points-v2";
 		opp-shared;
 
+		opp-1200000000 {
+			opp-hz = /bits/ 64 <1200000000>;
+			opp-microvolt = <750000 750000 950000>;
+			clock-latency-ns = <40000>;
+		};
 		opp-1416000000 {
 			opp-hz = /bits/ 64 <1416000000>;
-			opp-microvolt = <750000 750000 950000>;
+			opp-microvolt = <762500 762500 950000>;
 			clock-latency-ns = <40000>;
 		};
 		opp-1608000000 {
@@ -43,25 +43,20 @@
 			opp-microvolt = <787500 787500 950000>;
 			clock-latency-ns = <40000>;
 		};
-		opp-1800000000 {
-			opp-hz = /bits/ 64 <1800000000>;
-			opp-microvolt = <875000 875000 950000>;
-			clock-latency-ns = <40000>;
-		};
-		opp-2016000000 {
-			opp-hz = /bits/ 64 <2016000000>;
-			opp-microvolt = <950000 950000 950000>;
-			clock-latency-ns = <40000>;
-		};
 	};
 
 	cluster2_opp_table: opp-table-cluster2 {
 		compatible = "operating-points-v2";
 		opp-shared;
 
+		opp-1200000000 {
+			opp-hz = /bits/ 64 <1200000000>;
+			opp-microvolt = <750000 750000 950000>;
+			clock-latency-ns = <40000>;
+		};
 		opp-1416000000 {
 			opp-hz = /bits/ 64 <1416000000>;
-			opp-microvolt = <750000 750000 950000>;
+			opp-microvolt = <762500 762500 950000>;
 			clock-latency-ns = <40000>;
 		};
 		opp-1608000000 {
@@ -69,16 +64,6 @@
 			opp-microvolt = <787500 787500 950000>;
 			clock-latency-ns = <40000>;
 		};
-		opp-1800000000 {
-			opp-hz = /bits/ 64 <1800000000>;
-			opp-microvolt = <875000 875000 950000>;
-			clock-latency-ns = <40000>;
-		};
-		opp-2016000000 {
-			opp-hz = /bits/ 64 <2016000000>;
-			opp-microvolt = <950000 950000 950000>;
-			clock-latency-ns = <40000>;
-		};
 	};
 
 	gpu_opp_table: opp-table {
@@ -104,10 +89,6 @@
 			opp-hz = /bits/ 64 <700000000>;
 			opp-microvolt = <750000 750000 850000>;
 		};
-		opp-850000000 {
-			opp-hz = /bits/ 64 <800000000>;
-			opp-microvolt = <787500 787500 850000>;
-		};
 	};
 };
 
diff --git a/arch/arm64/boot/dts/st/stm32mp211.dtsi b/arch/arm64/boot/dts/st/stm32mp211.dtsi
index 6dd1377f3e1d..bf888d60cd4f 100644
--- a/arch/arm64/boot/dts/st/stm32mp211.dtsi
+++ b/arch/arm64/boot/dts/st/stm32mp211.dtsi
@@ -116,11 +116,11 @@
 		};
 
 		intc: interrupt-controller@4ac10000 {
-			compatible = "arm,cortex-a7-gic";
+			compatible = "arm,gic-400";
 			reg = <0x4ac10000 0x0 0x1000>,
-			      <0x4ac20000 0x0 0x2000>,
-			      <0x4ac40000 0x0 0x2000>,
-			      <0x4ac60000 0x0 0x2000>;
+			      <0x4ac20000 0x0 0x20000>,
+			      <0x4ac40000 0x0 0x20000>,
+			      <0x4ac60000 0x0 0x20000>;
 			      #interrupt-cells = <3>;
 			      interrupt-controller;
 		};
diff --git a/arch/arm64/boot/dts/st/stm32mp231.dtsi b/arch/arm64/boot/dts/st/stm32mp231.dtsi
index 8820d219a33e..75697acd1345 100644
--- a/arch/arm64/boot/dts/st/stm32mp231.dtsi
+++ b/arch/arm64/boot/dts/st/stm32mp231.dtsi
@@ -1201,13 +1201,12 @@
 		};
 
 		intc: interrupt-controller@4ac10000 {
-			compatible = "arm,cortex-a7-gic";
+			compatible = "arm,gic-400";
 			reg = <0x4ac10000 0x1000>,
-			      <0x4ac20000 0x2000>,
-			      <0x4ac40000 0x2000>,
-			      <0x4ac60000 0x2000>;
+			      <0x4ac20000 0x20000>,
+			      <0x4ac40000 0x20000>,
+			      <0x4ac60000 0x20000>;
 			#interrupt-cells = <3>;
-			#address-cells = <1>;
 			interrupt-controller;
 		};
 	};
diff --git a/arch/arm64/boot/dts/st/stm32mp251.dtsi b/arch/arm64/boot/dts/st/stm32mp251.dtsi
index f3c6cdfd7008..87110f91e489 100644
--- a/arch/arm64/boot/dts/st/stm32mp251.dtsi
+++ b/arch/arm64/boot/dts/st/stm32mp251.dtsi
@@ -115,14 +115,13 @@
 	};
 
 	intc: interrupt-controller@4ac00000 {
-		compatible = "arm,cortex-a7-gic";
+		compatible = "arm,gic-400";
 		#interrupt-cells = <3>;
-		#address-cells = <1>;
 		interrupt-controller;
 		reg = <0x0 0x4ac10000 0x0 0x1000>,
-		      <0x0 0x4ac20000 0x0 0x2000>,
-		      <0x0 0x4ac40000 0x0 0x2000>,
-		      <0x0 0x4ac60000 0x0 0x2000>;
+		      <0x0 0x4ac20000 0x0 0x20000>,
+		      <0x0 0x4ac40000 0x0 0x20000>,
+		      <0x0 0x4ac60000 0x0 0x20000>;
 	};
 
 	psci {
diff --git a/arch/arm64/configs/defconfig b/arch/arm64/configs/defconfig
index 5bb8f09422a2..370ad70b4be8 100644
--- a/arch/arm64/configs/defconfig
+++ b/arch/arm64/configs/defconfig
@@ -1729,15 +1729,14 @@ CONFIG_NLS_CODEPAGE_437=y
 CONFIG_NLS_ISO8859_1=y
 CONFIG_SECURITY=y
 CONFIG_CRYPTO_USER=y
-CONFIG_CRYPTO_TEST=m
+CONFIG_CRYPTO_CHACHA20=m
+CONFIG_CRYPTO_BENCHMARK=m
 CONFIG_CRYPTO_ECHAINIV=y
 CONFIG_CRYPTO_MICHAEL_MIC=m
 CONFIG_CRYPTO_ANSI_CPRNG=y
 CONFIG_CRYPTO_USER_API_RNG=m
-CONFIG_CRYPTO_CHACHA20_NEON=m
 CONFIG_CRYPTO_GHASH_ARM64_CE=y
 CONFIG_CRYPTO_SHA1_ARM64_CE=y
-CONFIG_CRYPTO_SHA2_ARM64_CE=y
 CONFIG_CRYPTO_SHA512_ARM64_CE=m
 CONFIG_CRYPTO_SHA3_ARM64=m
 CONFIG_CRYPTO_SM3_ARM64_CE=m
diff --git a/arch/arm64/crypto/Kconfig b/arch/arm64/crypto/Kconfig
index 3418c8d3c78d..c44b0f202a1f 100644
--- a/arch/arm64/crypto/Kconfig
+++ b/arch/arm64/crypto/Kconfig
@@ -25,18 +25,6 @@ config CRYPTO_NHPOLY1305_NEON
 	  Architecture: arm64 using:
 	  - NEON (Advanced SIMD) extensions
 
-config CRYPTO_POLY1305_NEON
-	tristate
-	depends on KERNEL_MODE_NEON
-	select CRYPTO_HASH
-	select CRYPTO_ARCH_HAVE_LIB_POLY1305
-	default CRYPTO_LIB_POLY1305_INTERNAL
-	help
-	  Poly1305 authenticator algorithm (RFC7539)
-
-	  Architecture: arm64 using:
-	  - NEON (Advanced SIMD) extensions
-
 config CRYPTO_SHA1_ARM64_CE
 	tristate "Hash functions: SHA-1 (ARMv8 Crypto Extensions)"
 	depends on KERNEL_MODE_NEON
@@ -48,25 +36,6 @@ config CRYPTO_SHA1_ARM64_CE
 	  Architecture: arm64 using:
 	  - ARMv8 Crypto Extensions
 
-config CRYPTO_SHA256_ARM64
-	tristate "Hash functions: SHA-224 and SHA-256"
-	select CRYPTO_HASH
-	help
-	  SHA-224 and SHA-256 secure hash algorithms (FIPS 180)
-
-	  Architecture: arm64
-
-config CRYPTO_SHA2_ARM64_CE
-	tristate "Hash functions: SHA-224 and SHA-256 (ARMv8 Crypto Extensions)"
-	depends on KERNEL_MODE_NEON
-	select CRYPTO_HASH
-	select CRYPTO_SHA256_ARM64
-	help
-	  SHA-224 and SHA-256 secure hash algorithms (FIPS 180)
-
-	  Architecture: arm64 using:
-	  - ARMv8 Crypto Extensions
-
 config CRYPTO_SHA512_ARM64
 	tristate "Hash functions: SHA-384 and SHA-512"
 	select CRYPTO_HASH
@@ -101,7 +70,7 @@ config CRYPTO_SM3_NEON
 	tristate "Hash functions: SM3 (NEON)"
 	depends on KERNEL_MODE_NEON
 	select CRYPTO_HASH
-	select CRYPTO_SM3
+	select CRYPTO_LIB_SM3
 	help
 	  SM3 (ShangMi 3) secure hash function (OSCCA GM/T 0004-2012)
 
@@ -112,7 +81,7 @@ config CRYPTO_SM3_ARM64_CE
 	tristate "Hash functions: SM3 (ARMv8.2 Crypto Extensions)"
 	depends on KERNEL_MODE_NEON
 	select CRYPTO_HASH
-	select CRYPTO_SM3
+	select CRYPTO_LIB_SM3
 	help
 	  SM3 (ShangMi 3) secure hash function (OSCCA GM/T 0004-2012)
 
@@ -143,7 +112,7 @@ config CRYPTO_AES_ARM64
 
 config CRYPTO_AES_ARM64_CE
 	tristate "Ciphers: AES (ARMv8 Crypto Extensions)"
-	depends on ARM64 && KERNEL_MODE_NEON
+	depends on KERNEL_MODE_NEON
 	select CRYPTO_ALGAPI
 	select CRYPTO_LIB_AES
 	help
@@ -186,20 +155,6 @@ config CRYPTO_AES_ARM64_NEON_BLK
 	  Architecture: arm64 using:
 	  - NEON (Advanced SIMD) extensions
 
-config CRYPTO_CHACHA20_NEON
-	tristate
-	depends on KERNEL_MODE_NEON
-	select CRYPTO_SKCIPHER
-	select CRYPTO_LIB_CHACHA_GENERIC
-	select CRYPTO_ARCH_HAVE_LIB_CHACHA
-	default CRYPTO_LIB_CHACHA_INTERNAL
-	help
-	  Length-preserving ciphers: ChaCha20, XChaCha20, and XChaCha12
-	  stream cipher algorithms
-
-	  Architecture: arm64 using:
-	  - NEON (Advanced SIMD) extensions
-
 config CRYPTO_AES_ARM64_BS
 	tristate "Ciphers: AES, modes: ECB/CBC/CTR/XCTR/XTS modes (bit-sliced NEON)"
 	depends on KERNEL_MODE_NEON
@@ -267,7 +222,7 @@ config CRYPTO_SM4_ARM64_NEON_BLK
 
 config CRYPTO_AES_ARM64_CE_CCM
 	tristate "AEAD cipher: AES in CCM mode (ARMv8 Crypto Extensions)"
-	depends on ARM64 && KERNEL_MODE_NEON
+	depends on KERNEL_MODE_NEON
 	select CRYPTO_ALGAPI
 	select CRYPTO_AES_ARM64_CE
 	select CRYPTO_AES_ARM64_CE_BLK
diff --git a/arch/arm64/crypto/Makefile b/arch/arm64/crypto/Makefile
index e7139c4768ce..c231c980c514 100644
--- a/arch/arm64/crypto/Makefile
+++ b/arch/arm64/crypto/Makefile
@@ -8,9 +8,6 @@
 obj-$(CONFIG_CRYPTO_SHA1_ARM64_CE) += sha1-ce.o
 sha1-ce-y := sha1-ce-glue.o sha1-ce-core.o
 
-obj-$(CONFIG_CRYPTO_SHA2_ARM64_CE) += sha2-ce.o
-sha2-ce-y := sha2-ce-glue.o sha2-ce-core.o
-
 obj-$(CONFIG_CRYPTO_SHA512_ARM64_CE) += sha512-ce.o
 sha512-ce-y := sha512-ce-glue.o sha512-ce-core.o
 
@@ -56,19 +53,9 @@ aes-ce-blk-y := aes-glue-ce.o aes-ce.o
 obj-$(CONFIG_CRYPTO_AES_ARM64_NEON_BLK) += aes-neon-blk.o
 aes-neon-blk-y := aes-glue-neon.o aes-neon.o
 
-obj-$(CONFIG_CRYPTO_SHA256_ARM64) += sha256-arm64.o
-sha256-arm64-y := sha256-glue.o sha256-core.o
-
 obj-$(CONFIG_CRYPTO_SHA512_ARM64) += sha512-arm64.o
 sha512-arm64-y := sha512-glue.o sha512-core.o
 
-obj-$(CONFIG_CRYPTO_CHACHA20_NEON) += chacha-neon.o
-chacha-neon-y := chacha-neon-core.o chacha-neon-glue.o
-
-obj-$(CONFIG_CRYPTO_POLY1305_NEON) += poly1305-neon.o
-poly1305-neon-y := poly1305-core.o poly1305-glue.o
-AFLAGS_poly1305-core.o += -Dpoly1305_init=poly1305_init_arm64
-
 obj-$(CONFIG_CRYPTO_NHPOLY1305_NEON) += nhpoly1305-neon.o
 nhpoly1305-neon-y := nh-neon-core.o nhpoly1305-neon-glue.o
 
@@ -81,10 +68,7 @@ aes-neon-bs-y := aes-neonbs-core.o aes-neonbs-glue.o
 quiet_cmd_perlasm = PERLASM $@
       cmd_perlasm = $(PERL) $(<) void $(@)
 
-$(obj)/%-core.S: $(src)/%-armv8.pl
-	$(call cmd,perlasm)
-
-$(obj)/sha256-core.S: $(src)/sha512-armv8.pl
+$(obj)/sha512-core.S: $(src)/../lib/crypto/sha2-armv8.pl
 	$(call cmd,perlasm)
 
-clean-files += poly1305-core.S sha256-core.S sha512-core.S
+clean-files += sha512-core.S
diff --git a/arch/arm64/crypto/aes-glue.c b/arch/arm64/crypto/aes-glue.c
index b0150999743f..81560f722b9d 100644
--- a/arch/arm64/crypto/aes-glue.c
+++ b/arch/arm64/crypto/aes-glue.c
@@ -5,19 +5,20 @@
  * Copyright (C) 2013 - 2017 Linaro Ltd <ard.biesheuvel@linaro.org>
  */
 
-#include <asm/neon.h>
 #include <asm/hwcap.h>
-#include <asm/simd.h>
+#include <asm/neon.h>
 #include <crypto/aes.h>
 #include <crypto/ctr.h>
-#include <crypto/sha2.h>
 #include <crypto/internal/hash.h>
-#include <crypto/internal/simd.h>
 #include <crypto/internal/skcipher.h>
 #include <crypto/scatterwalk.h>
-#include <linux/module.h>
-#include <linux/cpufeature.h>
+#include <crypto/sha2.h>
+#include <crypto/utils.h>
 #include <crypto/xts.h>
+#include <linux/cpufeature.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/string.h>
 
 #include "aes-ce-setkey.h"
 
@@ -130,7 +131,6 @@ struct mac_tfm_ctx {
 };
 
 struct mac_desc_ctx {
-	unsigned int len;
 	u8 dg[AES_BLOCK_SIZE];
 };
 
@@ -869,109 +869,64 @@ static int mac_init(struct shash_desc *desc)
 	struct mac_desc_ctx *ctx = shash_desc_ctx(desc);
 
 	memset(ctx->dg, 0, AES_BLOCK_SIZE);
-	ctx->len = 0;
-
 	return 0;
 }
 
 static void mac_do_update(struct crypto_aes_ctx *ctx, u8 const in[], int blocks,
-			  u8 dg[], int enc_before, int enc_after)
+			  u8 dg[], int enc_before)
 {
 	int rounds = 6 + ctx->key_length / 4;
+	int rem;
 
-	if (crypto_simd_usable()) {
-		int rem;
-
-		do {
-			kernel_neon_begin();
-			rem = aes_mac_update(in, ctx->key_enc, rounds, blocks,
-					     dg, enc_before, enc_after);
-			kernel_neon_end();
-			in += (blocks - rem) * AES_BLOCK_SIZE;
-			blocks = rem;
-			enc_before = 0;
-		} while (blocks);
-	} else {
-		if (enc_before)
-			aes_encrypt(ctx, dg, dg);
-
-		while (blocks--) {
-			crypto_xor(dg, in, AES_BLOCK_SIZE);
-			in += AES_BLOCK_SIZE;
-
-			if (blocks || enc_after)
-				aes_encrypt(ctx, dg, dg);
-		}
-	}
+	do {
+		kernel_neon_begin();
+		rem = aes_mac_update(in, ctx->key_enc, rounds, blocks,
+				     dg, enc_before, !enc_before);
+		kernel_neon_end();
+		in += (blocks - rem) * AES_BLOCK_SIZE;
+		blocks = rem;
+	} while (blocks);
 }
 
 static int mac_update(struct shash_desc *desc, const u8 *p, unsigned int len)
 {
 	struct mac_tfm_ctx *tctx = crypto_shash_ctx(desc->tfm);
 	struct mac_desc_ctx *ctx = shash_desc_ctx(desc);
+	int blocks = len / AES_BLOCK_SIZE;
 
-	while (len > 0) {
-		unsigned int l;
-
-		if ((ctx->len % AES_BLOCK_SIZE) == 0 &&
-		    (ctx->len + len) > AES_BLOCK_SIZE) {
-
-			int blocks = len / AES_BLOCK_SIZE;
-
-			len %= AES_BLOCK_SIZE;
-
-			mac_do_update(&tctx->key, p, blocks, ctx->dg,
-				      (ctx->len != 0), (len != 0));
-
-			p += blocks * AES_BLOCK_SIZE;
-
-			if (!len) {
-				ctx->len = AES_BLOCK_SIZE;
-				break;
-			}
-			ctx->len = 0;
-		}
-
-		l = min(len, AES_BLOCK_SIZE - ctx->len);
-
-		if (l <= AES_BLOCK_SIZE) {
-			crypto_xor(ctx->dg + ctx->len, p, l);
-			ctx->len += l;
-			len -= l;
-			p += l;
-		}
-	}
-
-	return 0;
+	len %= AES_BLOCK_SIZE;
+	mac_do_update(&tctx->key, p, blocks, ctx->dg, 0);
+	return len;
 }
 
-static int cbcmac_final(struct shash_desc *desc, u8 *out)
+static int cbcmac_finup(struct shash_desc *desc, const u8 *src,
+			unsigned int len, u8 *out)
 {
 	struct mac_tfm_ctx *tctx = crypto_shash_ctx(desc->tfm);
 	struct mac_desc_ctx *ctx = shash_desc_ctx(desc);
 
-	mac_do_update(&tctx->key, NULL, 0, ctx->dg, (ctx->len != 0), 0);
-
+	if (len) {
+		crypto_xor(ctx->dg, src, len);
+		mac_do_update(&tctx->key, NULL, 0, ctx->dg, 1);
+	}
 	memcpy(out, ctx->dg, AES_BLOCK_SIZE);
-
 	return 0;
 }
 
-static int cmac_final(struct shash_desc *desc, u8 *out)
+static int cmac_finup(struct shash_desc *desc, const u8 *src, unsigned int len,
+		      u8 *out)
 {
 	struct mac_tfm_ctx *tctx = crypto_shash_ctx(desc->tfm);
 	struct mac_desc_ctx *ctx = shash_desc_ctx(desc);
 	u8 *consts = tctx->consts;
 
-	if (ctx->len != AES_BLOCK_SIZE) {
-		ctx->dg[ctx->len] ^= 0x80;
+	crypto_xor(ctx->dg, src, len);
+	if (len != AES_BLOCK_SIZE) {
+		ctx->dg[len] ^= 0x80;
 		consts += AES_BLOCK_SIZE;
 	}
-
-	mac_do_update(&tctx->key, consts, 1, ctx->dg, 0, 1);
-
+	mac_do_update(&tctx->key, consts, 1, ctx->dg, 0);
 	memcpy(out, ctx->dg, AES_BLOCK_SIZE);
-
 	return 0;
 }
 
@@ -979,6 +934,8 @@ static struct shash_alg mac_algs[] = { {
 	.base.cra_name		= "cmac(aes)",
 	.base.cra_driver_name	= "cmac-aes-" MODE,
 	.base.cra_priority	= PRIO,
+	.base.cra_flags		= CRYPTO_AHASH_ALG_BLOCK_ONLY |
+				  CRYPTO_AHASH_ALG_FINAL_NONZERO,
 	.base.cra_blocksize	= AES_BLOCK_SIZE,
 	.base.cra_ctxsize	= sizeof(struct mac_tfm_ctx) +
 				  2 * AES_BLOCK_SIZE,
@@ -987,13 +944,15 @@ static struct shash_alg mac_algs[] = { {
 	.digestsize		= AES_BLOCK_SIZE,
 	.init			= mac_init,
 	.update			= mac_update,
-	.final			= cmac_final,
+	.finup			= cmac_finup,
 	.setkey			= cmac_setkey,
 	.descsize		= sizeof(struct mac_desc_ctx),
 }, {
 	.base.cra_name		= "xcbc(aes)",
 	.base.cra_driver_name	= "xcbc-aes-" MODE,
 	.base.cra_priority	= PRIO,
+	.base.cra_flags		= CRYPTO_AHASH_ALG_BLOCK_ONLY |
+				  CRYPTO_AHASH_ALG_FINAL_NONZERO,
 	.base.cra_blocksize	= AES_BLOCK_SIZE,
 	.base.cra_ctxsize	= sizeof(struct mac_tfm_ctx) +
 				  2 * AES_BLOCK_SIZE,
@@ -1002,21 +961,22 @@ static struct shash_alg mac_algs[] = { {
 	.digestsize		= AES_BLOCK_SIZE,
 	.init			= mac_init,
 	.update			= mac_update,
-	.final			= cmac_final,
+	.finup			= cmac_finup,
 	.setkey			= xcbc_setkey,
 	.descsize		= sizeof(struct mac_desc_ctx),
 }, {
 	.base.cra_name		= "cbcmac(aes)",
 	.base.cra_driver_name	= "cbcmac-aes-" MODE,
 	.base.cra_priority	= PRIO,
-	.base.cra_blocksize	= 1,
+	.base.cra_flags		= CRYPTO_AHASH_ALG_BLOCK_ONLY,
+	.base.cra_blocksize	= AES_BLOCK_SIZE,
 	.base.cra_ctxsize	= sizeof(struct mac_tfm_ctx),
 	.base.cra_module	= THIS_MODULE,
 
 	.digestsize		= AES_BLOCK_SIZE,
 	.init			= mac_init,
 	.update			= mac_update,
-	.final			= cbcmac_final,
+	.finup			= cbcmac_finup,
 	.setkey			= cbcmac_setkey,
 	.descsize		= sizeof(struct mac_desc_ctx),
 } };
diff --git a/arch/arm64/crypto/chacha-neon-glue.c b/arch/arm64/crypto/chacha-neon-glue.c
deleted file mode 100644
index 229876acfc58..000000000000
--- a/arch/arm64/crypto/chacha-neon-glue.c
+++ /dev/null
@@ -1,237 +0,0 @@
-/*
- * ARM NEON and scalar accelerated ChaCha and XChaCha stream ciphers,
- * including ChaCha20 (RFC7539)
- *
- * Copyright (C) 2016 - 2017 Linaro, Ltd. <ard.biesheuvel@linaro.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * Based on:
- * ChaCha20 256-bit cipher algorithm, RFC7539, SIMD glue code
- *
- * Copyright (C) 2015 Martin Willi
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- */
-
-#include <crypto/algapi.h>
-#include <crypto/internal/chacha.h>
-#include <crypto/internal/simd.h>
-#include <crypto/internal/skcipher.h>
-#include <linux/jump_label.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-
-#include <asm/hwcap.h>
-#include <asm/neon.h>
-#include <asm/simd.h>
-
-asmlinkage void chacha_block_xor_neon(u32 *state, u8 *dst, const u8 *src,
-				      int nrounds);
-asmlinkage void chacha_4block_xor_neon(u32 *state, u8 *dst, const u8 *src,
-				       int nrounds, int bytes);
-asmlinkage void hchacha_block_neon(const u32 *state, u32 *out, int nrounds);
-
-static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon);
-
-static void chacha_doneon(u32 *state, u8 *dst, const u8 *src,
-			  int bytes, int nrounds)
-{
-	while (bytes > 0) {
-		int l = min(bytes, CHACHA_BLOCK_SIZE * 5);
-
-		if (l <= CHACHA_BLOCK_SIZE) {
-			u8 buf[CHACHA_BLOCK_SIZE];
-
-			memcpy(buf, src, l);
-			chacha_block_xor_neon(state, buf, buf, nrounds);
-			memcpy(dst, buf, l);
-			state[12] += 1;
-			break;
-		}
-		chacha_4block_xor_neon(state, dst, src, nrounds, l);
-		bytes -= l;
-		src += l;
-		dst += l;
-		state[12] += DIV_ROUND_UP(l, CHACHA_BLOCK_SIZE);
-	}
-}
-
-void hchacha_block_arch(const u32 *state, u32 *stream, int nrounds)
-{
-	if (!static_branch_likely(&have_neon) || !crypto_simd_usable()) {
-		hchacha_block_generic(state, stream, nrounds);
-	} else {
-		kernel_neon_begin();
-		hchacha_block_neon(state, stream, nrounds);
-		kernel_neon_end();
-	}
-}
-EXPORT_SYMBOL(hchacha_block_arch);
-
-void chacha_crypt_arch(u32 *state, u8 *dst, const u8 *src, unsigned int bytes,
-		       int nrounds)
-{
-	if (!static_branch_likely(&have_neon) || bytes <= CHACHA_BLOCK_SIZE ||
-	    !crypto_simd_usable())
-		return chacha_crypt_generic(state, dst, src, bytes, nrounds);
-
-	do {
-		unsigned int todo = min_t(unsigned int, bytes, SZ_4K);
-
-		kernel_neon_begin();
-		chacha_doneon(state, dst, src, todo, nrounds);
-		kernel_neon_end();
-
-		bytes -= todo;
-		src += todo;
-		dst += todo;
-	} while (bytes);
-}
-EXPORT_SYMBOL(chacha_crypt_arch);
-
-static int chacha_neon_stream_xor(struct skcipher_request *req,
-				  const struct chacha_ctx *ctx, const u8 *iv)
-{
-	struct skcipher_walk walk;
-	u32 state[16];
-	int err;
-
-	err = skcipher_walk_virt(&walk, req, false);
-
-	chacha_init(state, ctx->key, iv);
-
-	while (walk.nbytes > 0) {
-		unsigned int nbytes = walk.nbytes;
-
-		if (nbytes < walk.total)
-			nbytes = rounddown(nbytes, walk.stride);
-
-		if (!static_branch_likely(&have_neon) ||
-		    !crypto_simd_usable()) {
-			chacha_crypt_generic(state, walk.dst.virt.addr,
-					     walk.src.virt.addr, nbytes,
-					     ctx->nrounds);
-		} else {
-			kernel_neon_begin();
-			chacha_doneon(state, walk.dst.virt.addr,
-				      walk.src.virt.addr, nbytes, ctx->nrounds);
-			kernel_neon_end();
-		}
-		err = skcipher_walk_done(&walk, walk.nbytes - nbytes);
-	}
-
-	return err;
-}
-
-static int chacha_neon(struct skcipher_request *req)
-{
-	struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
-	struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm);
-
-	return chacha_neon_stream_xor(req, ctx, req->iv);
-}
-
-static int xchacha_neon(struct skcipher_request *req)
-{
-	struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
-	struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm);
-	struct chacha_ctx subctx;
-	u32 state[16];
-	u8 real_iv[16];
-
-	chacha_init(state, ctx->key, req->iv);
-	hchacha_block_arch(state, subctx.key, ctx->nrounds);
-	subctx.nrounds = ctx->nrounds;
-
-	memcpy(&real_iv[0], req->iv + 24, 8);
-	memcpy(&real_iv[8], req->iv + 16, 8);
-	return chacha_neon_stream_xor(req, &subctx, real_iv);
-}
-
-static struct skcipher_alg algs[] = {
-	{
-		.base.cra_name		= "chacha20",
-		.base.cra_driver_name	= "chacha20-neon",
-		.base.cra_priority	= 300,
-		.base.cra_blocksize	= 1,
-		.base.cra_ctxsize	= sizeof(struct chacha_ctx),
-		.base.cra_module	= THIS_MODULE,
-
-		.min_keysize		= CHACHA_KEY_SIZE,
-		.max_keysize		= CHACHA_KEY_SIZE,
-		.ivsize			= CHACHA_IV_SIZE,
-		.chunksize		= CHACHA_BLOCK_SIZE,
-		.walksize		= 5 * CHACHA_BLOCK_SIZE,
-		.setkey			= chacha20_setkey,
-		.encrypt		= chacha_neon,
-		.decrypt		= chacha_neon,
-	}, {
-		.base.cra_name		= "xchacha20",
-		.base.cra_driver_name	= "xchacha20-neon",
-		.base.cra_priority	= 300,
-		.base.cra_blocksize	= 1,
-		.base.cra_ctxsize	= sizeof(struct chacha_ctx),
-		.base.cra_module	= THIS_MODULE,
-
-		.min_keysize		= CHACHA_KEY_SIZE,
-		.max_keysize		= CHACHA_KEY_SIZE,
-		.ivsize			= XCHACHA_IV_SIZE,
-		.chunksize		= CHACHA_BLOCK_SIZE,
-		.walksize		= 5 * CHACHA_BLOCK_SIZE,
-		.setkey			= chacha20_setkey,
-		.encrypt		= xchacha_neon,
-		.decrypt		= xchacha_neon,
-	}, {
-		.base.cra_name		= "xchacha12",
-		.base.cra_driver_name	= "xchacha12-neon",
-		.base.cra_priority	= 300,
-		.base.cra_blocksize	= 1,
-		.base.cra_ctxsize	= sizeof(struct chacha_ctx),
-		.base.cra_module	= THIS_MODULE,
-
-		.min_keysize		= CHACHA_KEY_SIZE,
-		.max_keysize		= CHACHA_KEY_SIZE,
-		.ivsize			= XCHACHA_IV_SIZE,
-		.chunksize		= CHACHA_BLOCK_SIZE,
-		.walksize		= 5 * CHACHA_BLOCK_SIZE,
-		.setkey			= chacha12_setkey,
-		.encrypt		= xchacha_neon,
-		.decrypt		= xchacha_neon,
-	}
-};
-
-static int __init chacha_simd_mod_init(void)
-{
-	if (!cpu_have_named_feature(ASIMD))
-		return 0;
-
-	static_branch_enable(&have_neon);
-
-	return IS_REACHABLE(CONFIG_CRYPTO_SKCIPHER) ?
-		crypto_register_skciphers(algs, ARRAY_SIZE(algs)) : 0;
-}
-
-static void __exit chacha_simd_mod_fini(void)
-{
-	if (IS_REACHABLE(CONFIG_CRYPTO_SKCIPHER) && cpu_have_named_feature(ASIMD))
-		crypto_unregister_skciphers(algs, ARRAY_SIZE(algs));
-}
-
-module_init(chacha_simd_mod_init);
-module_exit(chacha_simd_mod_fini);
-
-MODULE_DESCRIPTION("ChaCha and XChaCha stream ciphers (NEON accelerated)");
-MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
-MODULE_LICENSE("GPL v2");
-MODULE_ALIAS_CRYPTO("chacha20");
-MODULE_ALIAS_CRYPTO("chacha20-neon");
-MODULE_ALIAS_CRYPTO("xchacha20");
-MODULE_ALIAS_CRYPTO("xchacha20-neon");
-MODULE_ALIAS_CRYPTO("xchacha12");
-MODULE_ALIAS_CRYPTO("xchacha12-neon");
diff --git a/arch/arm64/crypto/ghash-ce-glue.c b/arch/arm64/crypto/ghash-ce-glue.c
index 071e122f9c37..4995b6e22335 100644
--- a/arch/arm64/crypto/ghash-ce-glue.c
+++ b/arch/arm64/crypto/ghash-ce-glue.c
@@ -6,30 +6,27 @@
  */
 
 #include <asm/neon.h>
-#include <asm/simd.h>
-#include <linux/unaligned.h>
 #include <crypto/aes.h>
-#include <crypto/gcm.h>
-#include <crypto/algapi.h>
 #include <crypto/b128ops.h>
+#include <crypto/gcm.h>
+#include <crypto/ghash.h>
 #include <crypto/gf128mul.h>
 #include <crypto/internal/aead.h>
 #include <crypto/internal/hash.h>
-#include <crypto/internal/simd.h>
 #include <crypto/internal/skcipher.h>
 #include <crypto/scatterwalk.h>
 #include <linux/cpufeature.h>
-#include <linux/crypto.h>
+#include <linux/errno.h>
+#include <linux/kernel.h>
 #include <linux/module.h>
+#include <linux/string.h>
+#include <linux/unaligned.h>
 
 MODULE_DESCRIPTION("GHASH and AES-GCM using ARMv8 Crypto Extensions");
 MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
 MODULE_LICENSE("GPL v2");
 MODULE_ALIAS_CRYPTO("ghash");
 
-#define GHASH_BLOCK_SIZE	16
-#define GHASH_DIGEST_SIZE	16
-
 #define RFC4106_NONCE_SIZE	4
 
 struct ghash_key {
@@ -37,10 +34,8 @@ struct ghash_key {
 	u64			h[][2];
 };
 
-struct ghash_desc_ctx {
+struct arm_ghash_desc_ctx {
 	u64 digest[GHASH_DIGEST_SIZE/sizeof(u64)];
-	u8 buf[GHASH_BLOCK_SIZE];
-	u32 count;
 };
 
 struct gcm_aes_ctx {
@@ -65,36 +60,12 @@ asmlinkage int pmull_gcm_decrypt(int bytes, u8 dst[], const u8 src[],
 
 static int ghash_init(struct shash_desc *desc)
 {
-	struct ghash_desc_ctx *ctx = shash_desc_ctx(desc);
+	struct arm_ghash_desc_ctx *ctx = shash_desc_ctx(desc);
 
-	*ctx = (struct ghash_desc_ctx){};
+	*ctx = (struct arm_ghash_desc_ctx){};
 	return 0;
 }
 
-static void ghash_do_update(int blocks, u64 dg[], const char *src,
-			    struct ghash_key *key, const char *head)
-{
-	be128 dst = { cpu_to_be64(dg[1]), cpu_to_be64(dg[0]) };
-
-	do {
-		const u8 *in = src;
-
-		if (head) {
-			in = head;
-			blocks++;
-			head = NULL;
-		} else {
-			src += GHASH_BLOCK_SIZE;
-		}
-
-		crypto_xor((u8 *)&dst, in, GHASH_BLOCK_SIZE);
-		gf128mul_lle(&dst, &key->k);
-	} while (--blocks);
-
-	dg[0] = be64_to_cpu(dst.b);
-	dg[1] = be64_to_cpu(dst.a);
-}
-
 static __always_inline
 void ghash_do_simd_update(int blocks, u64 dg[], const char *src,
 			  struct ghash_key *key, const char *head,
@@ -103,13 +74,9 @@ void ghash_do_simd_update(int blocks, u64 dg[], const char *src,
 					      u64 const h[][2],
 					      const char *head))
 {
-	if (likely(crypto_simd_usable())) {
-		kernel_neon_begin();
-		simd_update(blocks, dg, src, key->h, head);
-		kernel_neon_end();
-	} else {
-		ghash_do_update(blocks, dg, src, key, head);
-	}
+	kernel_neon_begin();
+	simd_update(blocks, dg, src, key->h, head);
+	kernel_neon_end();
 }
 
 /* avoid hogging the CPU for too long */
@@ -118,61 +85,59 @@ void ghash_do_simd_update(int blocks, u64 dg[], const char *src,
 static int ghash_update(struct shash_desc *desc, const u8 *src,
 			unsigned int len)
 {
-	struct ghash_desc_ctx *ctx = shash_desc_ctx(desc);
-	unsigned int partial = ctx->count % GHASH_BLOCK_SIZE;
+	struct arm_ghash_desc_ctx *ctx = shash_desc_ctx(desc);
+	struct ghash_key *key = crypto_shash_ctx(desc->tfm);
+	int blocks;
 
-	ctx->count += len;
+	blocks = len / GHASH_BLOCK_SIZE;
+	len -= blocks * GHASH_BLOCK_SIZE;
 
-	if ((partial + len) >= GHASH_BLOCK_SIZE) {
-		struct ghash_key *key = crypto_shash_ctx(desc->tfm);
-		int blocks;
-
-		if (partial) {
-			int p = GHASH_BLOCK_SIZE - partial;
+	do {
+		int chunk = min(blocks, MAX_BLOCKS);
 
-			memcpy(ctx->buf + partial, src, p);
-			src += p;
-			len -= p;
-		}
+		ghash_do_simd_update(chunk, ctx->digest, src, key, NULL,
+				     pmull_ghash_update_p8);
+		blocks -= chunk;
+		src += chunk * GHASH_BLOCK_SIZE;
+	} while (unlikely(blocks > 0));
+	return len;
+}
 
-		blocks = len / GHASH_BLOCK_SIZE;
-		len %= GHASH_BLOCK_SIZE;
+static int ghash_export(struct shash_desc *desc, void *out)
+{
+	struct arm_ghash_desc_ctx *ctx = shash_desc_ctx(desc);
+	u8 *dst = out;
 
-		do {
-			int chunk = min(blocks, MAX_BLOCKS);
+	put_unaligned_be64(ctx->digest[1], dst);
+	put_unaligned_be64(ctx->digest[0], dst + 8);
+	return 0;
+}
 
-			ghash_do_simd_update(chunk, ctx->digest, src, key,
-					     partial ? ctx->buf : NULL,
-					     pmull_ghash_update_p8);
+static int ghash_import(struct shash_desc *desc, const void *in)
+{
+	struct arm_ghash_desc_ctx *ctx = shash_desc_ctx(desc);
+	const u8 *src = in;
 
-			blocks -= chunk;
-			src += chunk * GHASH_BLOCK_SIZE;
-			partial = 0;
-		} while (unlikely(blocks > 0));
-	}
-	if (len)
-		memcpy(ctx->buf + partial, src, len);
+	ctx->digest[1] = get_unaligned_be64(src);
+	ctx->digest[0] = get_unaligned_be64(src + 8);
 	return 0;
 }
 
-static int ghash_final(struct shash_desc *desc, u8 *dst)
+static int ghash_finup(struct shash_desc *desc, const u8 *src,
+		       unsigned int len, u8 *dst)
 {
-	struct ghash_desc_ctx *ctx = shash_desc_ctx(desc);
-	unsigned int partial = ctx->count % GHASH_BLOCK_SIZE;
-
-	if (partial) {
-		struct ghash_key *key = crypto_shash_ctx(desc->tfm);
+	struct arm_ghash_desc_ctx *ctx = shash_desc_ctx(desc);
+	struct ghash_key *key = crypto_shash_ctx(desc->tfm);
 
-		memset(ctx->buf + partial, 0, GHASH_BLOCK_SIZE - partial);
+	if (len) {
+		u8 buf[GHASH_BLOCK_SIZE] = {};
 
-		ghash_do_simd_update(1, ctx->digest, ctx->buf, key, NULL,
+		memcpy(buf, src, len);	/* tail, zero-padded to a full block */
+		ghash_do_simd_update(1, ctx->digest, buf, key, NULL,
 				     pmull_ghash_update_p8);
+		memzero_explicit(buf, sizeof(buf));
 	}
-	put_unaligned_be64(ctx->digest[1], dst);
-	put_unaligned_be64(ctx->digest[0], dst + 8);
-
-	memzero_explicit(ctx, sizeof(*ctx));
-	return 0;
+	return ghash_export(desc, dst);
 }
 
 static void ghash_reflect(u64 h[], const be128 *k)
@@ -205,6 +170,7 @@ static struct shash_alg ghash_alg = {
 	.base.cra_name		= "ghash",
 	.base.cra_driver_name	= "ghash-neon",
 	.base.cra_priority	= 150,
+	.base.cra_flags		= CRYPTO_AHASH_ALG_BLOCK_ONLY,
 	.base.cra_blocksize	= GHASH_BLOCK_SIZE,
 	.base.cra_ctxsize	= sizeof(struct ghash_key) + sizeof(u64[2]),
 	.base.cra_module	= THIS_MODULE,
@@ -212,9 +178,12 @@ static struct shash_alg ghash_alg = {
 	.digestsize		= GHASH_DIGEST_SIZE,
 	.init			= ghash_init,
 	.update			= ghash_update,
-	.final			= ghash_final,
+	.finup			= ghash_finup,
 	.setkey			= ghash_setkey,
-	.descsize		= sizeof(struct ghash_desc_ctx),
+	.export			= ghash_export,
+	.import			= ghash_import,
+	.descsize		= sizeof(struct arm_ghash_desc_ctx),
+	.statesize		= sizeof(struct ghash_desc_ctx),
 };
 
 static int num_rounds(struct crypto_aes_ctx *ctx)
diff --git a/arch/arm64/crypto/poly1305-glue.c b/arch/arm64/crypto/poly1305-glue.c
deleted file mode 100644
index 18883ea438f3..000000000000
--- a/arch/arm64/crypto/poly1305-glue.c
+++ /dev/null
@@ -1,232 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * OpenSSL/Cryptogams accelerated Poly1305 transform for arm64
- *
- * Copyright (C) 2019 Linaro Ltd. <ard.biesheuvel@linaro.org>
- */
-
-#include <asm/hwcap.h>
-#include <asm/neon.h>
-#include <asm/simd.h>
-#include <linux/unaligned.h>
-#include <crypto/algapi.h>
-#include <crypto/internal/hash.h>
-#include <crypto/internal/poly1305.h>
-#include <crypto/internal/simd.h>
-#include <linux/cpufeature.h>
-#include <linux/crypto.h>
-#include <linux/jump_label.h>
-#include <linux/module.h>
-
-asmlinkage void poly1305_init_arm64(void *state, const u8 *key);
-asmlinkage void poly1305_blocks(void *state, const u8 *src, u32 len, u32 hibit);
-asmlinkage void poly1305_blocks_neon(void *state, const u8 *src, u32 len, u32 hibit);
-asmlinkage void poly1305_emit(void *state, u8 *digest, const u32 *nonce);
-
-static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon);
-
-void poly1305_init_arch(struct poly1305_desc_ctx *dctx, const u8 key[POLY1305_KEY_SIZE])
-{
-	poly1305_init_arm64(&dctx->h, key);
-	dctx->s[0] = get_unaligned_le32(key + 16);
-	dctx->s[1] = get_unaligned_le32(key + 20);
-	dctx->s[2] = get_unaligned_le32(key + 24);
-	dctx->s[3] = get_unaligned_le32(key + 28);
-	dctx->buflen = 0;
-}
-EXPORT_SYMBOL(poly1305_init_arch);
-
-static int neon_poly1305_init(struct shash_desc *desc)
-{
-	struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
-
-	dctx->buflen = 0;
-	dctx->rset = 0;
-	dctx->sset = false;
-
-	return 0;
-}
-
-static void neon_poly1305_blocks(struct poly1305_desc_ctx *dctx, const u8 *src,
-				 u32 len, u32 hibit, bool do_neon)
-{
-	if (unlikely(!dctx->sset)) {
-		if (!dctx->rset) {
-			poly1305_init_arm64(&dctx->h, src);
-			src += POLY1305_BLOCK_SIZE;
-			len -= POLY1305_BLOCK_SIZE;
-			dctx->rset = 1;
-		}
-		if (len >= POLY1305_BLOCK_SIZE) {
-			dctx->s[0] = get_unaligned_le32(src +  0);
-			dctx->s[1] = get_unaligned_le32(src +  4);
-			dctx->s[2] = get_unaligned_le32(src +  8);
-			dctx->s[3] = get_unaligned_le32(src + 12);
-			src += POLY1305_BLOCK_SIZE;
-			len -= POLY1305_BLOCK_SIZE;
-			dctx->sset = true;
-		}
-		if (len < POLY1305_BLOCK_SIZE)
-			return;
-	}
-
-	len &= ~(POLY1305_BLOCK_SIZE - 1);
-
-	if (static_branch_likely(&have_neon) && likely(do_neon))
-		poly1305_blocks_neon(&dctx->h, src, len, hibit);
-	else
-		poly1305_blocks(&dctx->h, src, len, hibit);
-}
-
-static void neon_poly1305_do_update(struct poly1305_desc_ctx *dctx,
-				    const u8 *src, u32 len, bool do_neon)
-{
-	if (unlikely(dctx->buflen)) {
-		u32 bytes = min(len, POLY1305_BLOCK_SIZE - dctx->buflen);
-
-		memcpy(dctx->buf + dctx->buflen, src, bytes);
-		src += bytes;
-		len -= bytes;
-		dctx->buflen += bytes;
-
-		if (dctx->buflen == POLY1305_BLOCK_SIZE) {
-			neon_poly1305_blocks(dctx, dctx->buf,
-					     POLY1305_BLOCK_SIZE, 1, false);
-			dctx->buflen = 0;
-		}
-	}
-
-	if (likely(len >= POLY1305_BLOCK_SIZE)) {
-		neon_poly1305_blocks(dctx, src, len, 1, do_neon);
-		src += round_down(len, POLY1305_BLOCK_SIZE);
-		len %= POLY1305_BLOCK_SIZE;
-	}
-
-	if (unlikely(len)) {
-		dctx->buflen = len;
-		memcpy(dctx->buf, src, len);
-	}
-}
-
-static int neon_poly1305_update(struct shash_desc *desc,
-				const u8 *src, unsigned int srclen)
-{
-	bool do_neon = crypto_simd_usable() && srclen > 128;
-	struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
-
-	if (static_branch_likely(&have_neon) && do_neon)
-		kernel_neon_begin();
-	neon_poly1305_do_update(dctx, src, srclen, do_neon);
-	if (static_branch_likely(&have_neon) && do_neon)
-		kernel_neon_end();
-	return 0;
-}
-
-void poly1305_update_arch(struct poly1305_desc_ctx *dctx, const u8 *src,
-			  unsigned int nbytes)
-{
-	if (unlikely(dctx->buflen)) {
-		u32 bytes = min(nbytes, POLY1305_BLOCK_SIZE - dctx->buflen);
-
-		memcpy(dctx->buf + dctx->buflen, src, bytes);
-		src += bytes;
-		nbytes -= bytes;
-		dctx->buflen += bytes;
-
-		if (dctx->buflen == POLY1305_BLOCK_SIZE) {
-			poly1305_blocks(&dctx->h, dctx->buf, POLY1305_BLOCK_SIZE, 1);
-			dctx->buflen = 0;
-		}
-	}
-
-	if (likely(nbytes >= POLY1305_BLOCK_SIZE)) {
-		unsigned int len = round_down(nbytes, POLY1305_BLOCK_SIZE);
-
-		if (static_branch_likely(&have_neon) && crypto_simd_usable()) {
-			do {
-				unsigned int todo = min_t(unsigned int, len, SZ_4K);
-
-				kernel_neon_begin();
-				poly1305_blocks_neon(&dctx->h, src, todo, 1);
-				kernel_neon_end();
-
-				len -= todo;
-				src += todo;
-			} while (len);
-		} else {
-			poly1305_blocks(&dctx->h, src, len, 1);
-			src += len;
-		}
-		nbytes %= POLY1305_BLOCK_SIZE;
-	}
-
-	if (unlikely(nbytes)) {
-		dctx->buflen = nbytes;
-		memcpy(dctx->buf, src, nbytes);
-	}
-}
-EXPORT_SYMBOL(poly1305_update_arch);
-
-void poly1305_final_arch(struct poly1305_desc_ctx *dctx, u8 *dst)
-{
-	if (unlikely(dctx->buflen)) {
-		dctx->buf[dctx->buflen++] = 1;
-		memset(dctx->buf + dctx->buflen, 0,
-		       POLY1305_BLOCK_SIZE - dctx->buflen);
-		poly1305_blocks(&dctx->h, dctx->buf, POLY1305_BLOCK_SIZE, 0);
-	}
-
-	poly1305_emit(&dctx->h, dst, dctx->s);
-	memzero_explicit(dctx, sizeof(*dctx));
-}
-EXPORT_SYMBOL(poly1305_final_arch);
-
-static int neon_poly1305_final(struct shash_desc *desc, u8 *dst)
-{
-	struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
-
-	if (unlikely(!dctx->sset))
-		return -ENOKEY;
-
-	poly1305_final_arch(dctx, dst);
-	return 0;
-}
-
-static struct shash_alg neon_poly1305_alg = {
-	.init			= neon_poly1305_init,
-	.update			= neon_poly1305_update,
-	.final			= neon_poly1305_final,
-	.digestsize		= POLY1305_DIGEST_SIZE,
-	.descsize		= sizeof(struct poly1305_desc_ctx),
-
-	.base.cra_name		= "poly1305",
-	.base.cra_driver_name	= "poly1305-neon",
-	.base.cra_priority	= 200,
-	.base.cra_blocksize	= POLY1305_BLOCK_SIZE,
-	.base.cra_module	= THIS_MODULE,
-};
-
-static int __init neon_poly1305_mod_init(void)
-{
-	if (!cpu_have_named_feature(ASIMD))
-		return 0;
-
-	static_branch_enable(&have_neon);
-
-	return IS_REACHABLE(CONFIG_CRYPTO_HASH) ?
-		crypto_register_shash(&neon_poly1305_alg) : 0;
-}
-
-static void __exit neon_poly1305_mod_exit(void)
-{
-	if (IS_REACHABLE(CONFIG_CRYPTO_HASH) && cpu_have_named_feature(ASIMD))
-		crypto_unregister_shash(&neon_poly1305_alg);
-}
-
-module_init(neon_poly1305_mod_init);
-module_exit(neon_poly1305_mod_exit);
-
-MODULE_DESCRIPTION("Poly1305 transform using NEON instructions");
-MODULE_LICENSE("GPL v2");
-MODULE_ALIAS_CRYPTO("poly1305");
-MODULE_ALIAS_CRYPTO("poly1305-neon");
diff --git a/arch/arm64/crypto/polyval-ce-glue.c b/arch/arm64/crypto/polyval-ce-glue.c
index 0a3b5718df85..c4e653688ea0 100644
--- a/arch/arm64/crypto/polyval-ce-glue.c
+++ b/arch/arm64/crypto/polyval-ce-glue.c
@@ -15,17 +15,15 @@
  * ARMv8 Crypto Extensions instructions to implement the finite field operations.
  */
 
-#include <crypto/algapi.h>
+#include <asm/neon.h>
 #include <crypto/internal/hash.h>
-#include <crypto/internal/simd.h>
 #include <crypto/polyval.h>
-#include <linux/crypto.h>
-#include <linux/init.h>
+#include <crypto/utils.h>
+#include <linux/cpufeature.h>
+#include <linux/errno.h>
 #include <linux/kernel.h>
 #include <linux/module.h>
-#include <linux/cpufeature.h>
-#include <asm/neon.h>
-#include <asm/simd.h>
+#include <linux/string.h>
 
 #define NUM_KEY_POWERS	8
 
@@ -38,7 +36,6 @@ struct polyval_tfm_ctx {
 
 struct polyval_desc_ctx {
 	u8 buffer[POLYVAL_BLOCK_SIZE];
-	u32 bytes;
 };
 
 asmlinkage void pmull_polyval_update(const struct polyval_tfm_ctx *keys,
@@ -48,25 +45,16 @@ asmlinkage void pmull_polyval_mul(u8 *op1, const u8 *op2);
 static void internal_polyval_update(const struct polyval_tfm_ctx *keys,
 	const u8 *in, size_t nblocks, u8 *accumulator)
 {
-	if (likely(crypto_simd_usable())) {
-		kernel_neon_begin();
-		pmull_polyval_update(keys, in, nblocks, accumulator);
-		kernel_neon_end();
-	} else {
-		polyval_update_non4k(keys->key_powers[NUM_KEY_POWERS-1], in,
-			nblocks, accumulator);
-	}
+	kernel_neon_begin();
+	pmull_polyval_update(keys, in, nblocks, accumulator);
+	kernel_neon_end();
 }
 
 static void internal_polyval_mul(u8 *op1, const u8 *op2)
 {
-	if (likely(crypto_simd_usable())) {
-		kernel_neon_begin();
-		pmull_polyval_mul(op1, op2);
-		kernel_neon_end();
-	} else {
-		polyval_mul_non4k(op1, op2);
-	}
+	kernel_neon_begin();
+	pmull_polyval_mul(op1, op2);
+	kernel_neon_end();
 }
 
 static int polyval_arm64_setkey(struct crypto_shash *tfm,
@@ -103,49 +91,27 @@ static int polyval_arm64_update(struct shash_desc *desc,
 {
 	struct polyval_desc_ctx *dctx = shash_desc_ctx(desc);
 	const struct polyval_tfm_ctx *tctx = crypto_shash_ctx(desc->tfm);
-	u8 *pos;
 	unsigned int nblocks;
-	unsigned int n;
-
-	if (dctx->bytes) {
-		n = min(srclen, dctx->bytes);
-		pos = dctx->buffer + POLYVAL_BLOCK_SIZE - dctx->bytes;
-
-		dctx->bytes -= n;
-		srclen -= n;
 
-		while (n--)
-			*pos++ ^= *src++;
-
-		if (!dctx->bytes)
-			internal_polyval_mul(dctx->buffer,
-					    tctx->key_powers[NUM_KEY_POWERS-1]);
-	}
-
-	while (srclen >= POLYVAL_BLOCK_SIZE) {
+	do {
 		/* allow rescheduling every 4K bytes */
 		nblocks = min(srclen, 4096U) / POLYVAL_BLOCK_SIZE;
 		internal_polyval_update(tctx, src, nblocks, dctx->buffer);
 		srclen -= nblocks * POLYVAL_BLOCK_SIZE;
 		src += nblocks * POLYVAL_BLOCK_SIZE;
-	}
+	} while (srclen >= POLYVAL_BLOCK_SIZE);
 
-	if (srclen) {
-		dctx->bytes = POLYVAL_BLOCK_SIZE - srclen;
-		pos = dctx->buffer;
-		while (srclen--)
-			*pos++ ^= *src++;
-	}
-
-	return 0;
+	return srclen;
 }
 
-static int polyval_arm64_final(struct shash_desc *desc, u8 *dst)
+static int polyval_arm64_finup(struct shash_desc *desc, const u8 *src,
+			       unsigned int len, u8 *dst)
 {
 	struct polyval_desc_ctx *dctx = shash_desc_ctx(desc);
 	const struct polyval_tfm_ctx *tctx = crypto_shash_ctx(desc->tfm);
 
-	if (dctx->bytes) {
+	if (len) {
+		crypto_xor(dctx->buffer, src, len);
 		internal_polyval_mul(dctx->buffer,
 				     tctx->key_powers[NUM_KEY_POWERS-1]);
 	}
@@ -159,13 +125,14 @@ static struct shash_alg polyval_alg = {
 	.digestsize	= POLYVAL_DIGEST_SIZE,
 	.init		= polyval_arm64_init,
 	.update		= polyval_arm64_update,
-	.final		= polyval_arm64_final,
+	.finup		= polyval_arm64_finup,
 	.setkey		= polyval_arm64_setkey,
 	.descsize	= sizeof(struct polyval_desc_ctx),
 	.base		= {
 		.cra_name		= "polyval",
 		.cra_driver_name	= "polyval-ce",
 		.cra_priority		= 200,
+		.cra_flags		= CRYPTO_AHASH_ALG_BLOCK_ONLY,
 		.cra_blocksize		= POLYVAL_BLOCK_SIZE,
 		.cra_ctxsize		= sizeof(struct polyval_tfm_ctx),
 		.cra_module		= THIS_MODULE,
diff --git a/arch/arm64/crypto/sha1-ce-glue.c b/arch/arm64/crypto/sha1-ce-glue.c
index cbd14f208f83..65b6980817e5 100644
--- a/arch/arm64/crypto/sha1-ce-glue.c
+++ b/arch/arm64/crypto/sha1-ce-glue.c
@@ -7,14 +7,14 @@
 
 #include <asm/neon.h>
 #include <asm/simd.h>
-#include <linux/unaligned.h>
 #include <crypto/internal/hash.h>
 #include <crypto/internal/simd.h>
 #include <crypto/sha1.h>
 #include <crypto/sha1_base.h>
 #include <linux/cpufeature.h>
-#include <linux/crypto.h>
+#include <linux/kernel.h>
 #include <linux/module.h>
+#include <linux/string.h>
 
 MODULE_DESCRIPTION("SHA1 secure hash using ARMv8 Crypto Extensions");
 MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
@@ -56,79 +56,49 @@ static int sha1_ce_update(struct shash_desc *desc, const u8 *data,
 {
 	struct sha1_ce_state *sctx = shash_desc_ctx(desc);
 
-	if (!crypto_simd_usable())
-		return crypto_sha1_update(desc, data, len);
-
 	sctx->finalize = 0;
-	sha1_base_do_update(desc, data, len, sha1_ce_transform);
-
-	return 0;
+	return sha1_base_do_update_blocks(desc, data, len, sha1_ce_transform);
 }
 
 static int sha1_ce_finup(struct shash_desc *desc, const u8 *data,
 			 unsigned int len, u8 *out)
 {
 	struct sha1_ce_state *sctx = shash_desc_ctx(desc);
-	bool finalize = !sctx->sst.count && !(len % SHA1_BLOCK_SIZE) && len;
-
-	if (!crypto_simd_usable())
-		return crypto_sha1_finup(desc, data, len, out);
+	bool finalized = false;
 
 	/*
 	 * Allow the asm code to perform the finalization if there is no
 	 * partial data and the input is a round multiple of the block size.
 	 */
-	sctx->finalize = finalize;
-
-	sha1_base_do_update(desc, data, len, sha1_ce_transform);
-	if (!finalize)
-		sha1_base_do_finalize(desc, sha1_ce_transform);
-	return sha1_base_finish(desc, out);
-}
-
-static int sha1_ce_final(struct shash_desc *desc, u8 *out)
-{
-	struct sha1_ce_state *sctx = shash_desc_ctx(desc);
-
-	if (!crypto_simd_usable())
-		return crypto_sha1_finup(desc, NULL, 0, out);
-
-	sctx->finalize = 0;
-	sha1_base_do_finalize(desc, sha1_ce_transform);
+	if (len >= SHA1_BLOCK_SIZE) {
+		unsigned int remain = len - round_down(len, SHA1_BLOCK_SIZE);
+
+		finalized = !remain;
+		sctx->finalize = finalized;
+		sha1_base_do_update_blocks(desc, data, len, sha1_ce_transform);
+		data += len - remain;
+		len = remain;
+	}
+	if (!finalized) {
+		sctx->finalize = 0;
+		sha1_base_do_finup(desc, data, len, sha1_ce_transform);
+	}
 	return sha1_base_finish(desc, out);
 }
 
-static int sha1_ce_export(struct shash_desc *desc, void *out)
-{
-	struct sha1_ce_state *sctx = shash_desc_ctx(desc);
-
-	memcpy(out, &sctx->sst, sizeof(struct sha1_state));
-	return 0;
-}
-
-static int sha1_ce_import(struct shash_desc *desc, const void *in)
-{
-	struct sha1_ce_state *sctx = shash_desc_ctx(desc);
-
-	memcpy(&sctx->sst, in, sizeof(struct sha1_state));
-	sctx->finalize = 0;
-	return 0;
-}
-
 static struct shash_alg alg = {
 	.init			= sha1_base_init,
 	.update			= sha1_ce_update,
-	.final			= sha1_ce_final,
 	.finup			= sha1_ce_finup,
-	.import			= sha1_ce_import,
-	.export			= sha1_ce_export,
 	.descsize		= sizeof(struct sha1_ce_state),
-	.statesize		= sizeof(struct sha1_state),
+	.statesize		= SHA1_STATE_SIZE,
 	.digestsize		= SHA1_DIGEST_SIZE,
 	.base			= {
 		.cra_name		= "sha1",
 		.cra_driver_name	= "sha1-ce",
 		.cra_priority		= 200,
+		.cra_flags		= CRYPTO_AHASH_ALG_BLOCK_ONLY |
+					  CRYPTO_AHASH_ALG_FINUP_MAX,
 		.cra_blocksize		= SHA1_BLOCK_SIZE,
 		.cra_module		= THIS_MODULE,
 	}
diff --git a/arch/arm64/crypto/sha2-ce-glue.c b/arch/arm64/crypto/sha2-ce-glue.c
deleted file mode 100644
index 6b4866a88ded..000000000000
--- a/arch/arm64/crypto/sha2-ce-glue.c
+++ /dev/null
@@ -1,192 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * sha2-ce-glue.c - SHA-224/SHA-256 using ARMv8 Crypto Extensions
- *
- * Copyright (C) 2014 - 2017 Linaro Ltd <ard.biesheuvel@linaro.org>
- */
-
-#include <asm/neon.h>
-#include <asm/simd.h>
-#include <linux/unaligned.h>
-#include <crypto/internal/hash.h>
-#include <crypto/internal/simd.h>
-#include <crypto/sha2.h>
-#include <crypto/sha256_base.h>
-#include <linux/cpufeature.h>
-#include <linux/crypto.h>
-#include <linux/module.h>
-
-MODULE_DESCRIPTION("SHA-224/SHA-256 secure hash using ARMv8 Crypto Extensions");
-MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
-MODULE_LICENSE("GPL v2");
-MODULE_ALIAS_CRYPTO("sha224");
-MODULE_ALIAS_CRYPTO("sha256");
-
-struct sha256_ce_state {
-	struct sha256_state	sst;
-	u32			finalize;
-};
-
-extern const u32 sha256_ce_offsetof_count;
-extern const u32 sha256_ce_offsetof_finalize;
-
-asmlinkage int __sha256_ce_transform(struct sha256_ce_state *sst, u8 const *src,
-				     int blocks);
-
-static void sha256_ce_transform(struct sha256_state *sst, u8 const *src,
-				int blocks)
-{
-	while (blocks) {
-		int rem;
-
-		kernel_neon_begin();
-		rem = __sha256_ce_transform(container_of(sst,
-							 struct sha256_ce_state,
-							 sst), src, blocks);
-		kernel_neon_end();
-		src += (blocks - rem) * SHA256_BLOCK_SIZE;
-		blocks = rem;
-	}
-}
-
-const u32 sha256_ce_offsetof_count = offsetof(struct sha256_ce_state,
-					      sst.count);
-const u32 sha256_ce_offsetof_finalize = offsetof(struct sha256_ce_state,
-						 finalize);
-
-asmlinkage void sha256_block_data_order(u32 *digest, u8 const *src, int blocks);
-
-static void sha256_arm64_transform(struct sha256_state *sst, u8 const *src,
-				   int blocks)
-{
-	sha256_block_data_order(sst->state, src, blocks);
-}
-
-static int sha256_ce_update(struct shash_desc *desc, const u8 *data,
-			    unsigned int len)
-{
-	struct sha256_ce_state *sctx = shash_desc_ctx(desc);
-
-	if (!crypto_simd_usable())
-		return sha256_base_do_update(desc, data, len,
-					     sha256_arm64_transform);
-
-	sctx->finalize = 0;
-	sha256_base_do_update(desc, data, len, sha256_ce_transform);
-
-	return 0;
-}
-
-static int sha256_ce_finup(struct shash_desc *desc, const u8 *data,
-			   unsigned int len, u8 *out)
-{
-	struct sha256_ce_state *sctx = shash_desc_ctx(desc);
-	bool finalize = !sctx->sst.count && !(len % SHA256_BLOCK_SIZE) && len;
-
-	if (!crypto_simd_usable()) {
-		if (len)
-			sha256_base_do_update(desc, data, len,
-					      sha256_arm64_transform);
-		sha256_base_do_finalize(desc, sha256_arm64_transform);
-		return sha256_base_finish(desc, out);
-	}
-
-	/*
-	 * Allow the asm code to perform the finalization if there is no
-	 * partial data and the input is a round multiple of the block size.
-	 */
-	sctx->finalize = finalize;
-
-	sha256_base_do_update(desc, data, len, sha256_ce_transform);
-	if (!finalize)
-		sha256_base_do_finalize(desc, sha256_ce_transform);
-	return sha256_base_finish(desc, out);
-}
-
-static int sha256_ce_final(struct shash_desc *desc, u8 *out)
-{
-	struct sha256_ce_state *sctx = shash_desc_ctx(desc);
-
-	if (!crypto_simd_usable()) {
-		sha256_base_do_finalize(desc, sha256_arm64_transform);
-		return sha256_base_finish(desc, out);
-	}
-
-	sctx->finalize = 0;
-	sha256_base_do_finalize(desc, sha256_ce_transform);
-	return sha256_base_finish(desc, out);
-}
-
-static int sha256_ce_digest(struct shash_desc *desc, const u8 *data,
-			    unsigned int len, u8 *out)
-{
-	sha256_base_init(desc);
-	return sha256_ce_finup(desc, data, len, out);
-}
-
-static int sha256_ce_export(struct shash_desc *desc, void *out)
-{
-	struct sha256_ce_state *sctx = shash_desc_ctx(desc);
-
-	memcpy(out, &sctx->sst, sizeof(struct sha256_state));
-	return 0;
-}
-
-static int sha256_ce_import(struct shash_desc *desc, const void *in)
-{
-	struct sha256_ce_state *sctx = shash_desc_ctx(desc);
-
-	memcpy(&sctx->sst, in, sizeof(struct sha256_state));
-	sctx->finalize = 0;
-	return 0;
-}
-
-static struct shash_alg algs[] = { {
-	.init			= sha224_base_init,
-	.update			= sha256_ce_update,
-	.final			= sha256_ce_final,
-	.finup			= sha256_ce_finup,
-	.export			= sha256_ce_export,
-	.import			= sha256_ce_import,
-	.descsize		= sizeof(struct sha256_ce_state),
-	.statesize		= sizeof(struct sha256_state),
-	.digestsize		= SHA224_DIGEST_SIZE,
-	.base			= {
-		.cra_name		= "sha224",
-		.cra_driver_name	= "sha224-ce",
-		.cra_priority		= 200,
-		.cra_blocksize		= SHA256_BLOCK_SIZE,
-		.cra_module		= THIS_MODULE,
-	}
-}, {
-	.init			= sha256_base_init,
-	.update			= sha256_ce_update,
-	.final			= sha256_ce_final,
-	.finup			= sha256_ce_finup,
-	.digest			= sha256_ce_digest,
-	.export			= sha256_ce_export,
-	.import			= sha256_ce_import,
-	.descsize		= sizeof(struct sha256_ce_state),
-	.statesize		= sizeof(struct sha256_state),
-	.digestsize		= SHA256_DIGEST_SIZE,
-	.base			= {
-		.cra_name		= "sha256",
-		.cra_driver_name	= "sha256-ce",
-		.cra_priority		= 200,
-		.cra_blocksize		= SHA256_BLOCK_SIZE,
-		.cra_module		= THIS_MODULE,
-	}
-} };
-
-static int __init sha2_ce_mod_init(void)
-{
-	return crypto_register_shashes(algs, ARRAY_SIZE(algs));
-}
-
-static void __exit sha2_ce_mod_fini(void)
-{
-	crypto_unregister_shashes(algs, ARRAY_SIZE(algs));
-}
-
-module_cpu_feature_match(SHA2, sha2_ce_mod_init);
-module_exit(sha2_ce_mod_fini);
diff --git a/arch/arm64/crypto/sha256-glue.c b/arch/arm64/crypto/sha256-glue.c
deleted file mode 100644
index 35356987cc1e..000000000000
--- a/arch/arm64/crypto/sha256-glue.c
+++ /dev/null
@@ -1,194 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * Linux/arm64 port of the OpenSSL SHA256 implementation for AArch64
- *
- * Copyright (c) 2016 Linaro Ltd. <ard.biesheuvel@linaro.org>
- */
-
-#include <asm/hwcap.h>
-#include <asm/neon.h>
-#include <asm/simd.h>
-#include <crypto/internal/hash.h>
-#include <crypto/internal/simd.h>
-#include <crypto/sha2.h>
-#include <crypto/sha256_base.h>
-#include <linux/module.h>
-#include <linux/string.h>
-#include <linux/types.h>
-
-MODULE_DESCRIPTION("SHA-224/SHA-256 secure hash for arm64");
-MODULE_AUTHOR("Andy Polyakov <appro@openssl.org>");
-MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
-MODULE_LICENSE("GPL v2");
-MODULE_ALIAS_CRYPTO("sha224");
-MODULE_ALIAS_CRYPTO("sha256");
-
-asmlinkage void sha256_block_data_order(u32 *digest, const void *data,
-					unsigned int num_blks);
-EXPORT_SYMBOL(sha256_block_data_order);
-
-static void sha256_arm64_transform(struct sha256_state *sst, u8 const *src,
-				   int blocks)
-{
-	sha256_block_data_order(sst->state, src, blocks);
-}
-
-asmlinkage void sha256_block_neon(u32 *digest, const void *data,
-				  unsigned int num_blks);
-
-static void sha256_neon_transform(struct sha256_state *sst, u8 const *src,
-				  int blocks)
-{
-	sha256_block_neon(sst->state, src, blocks);
-}
-
-static int crypto_sha256_arm64_update(struct shash_desc *desc, const u8 *data,
-				      unsigned int len)
-{
-	return sha256_base_do_update(desc, data, len, sha256_arm64_transform);
-}
-
-static int crypto_sha256_arm64_finup(struct shash_desc *desc, const u8 *data,
-				     unsigned int len, u8 *out)
-{
-	if (len)
-		sha256_base_do_update(desc, data, len, sha256_arm64_transform);
-	sha256_base_do_finalize(desc, sha256_arm64_transform);
-
-	return sha256_base_finish(desc, out);
-}
-
-static int crypto_sha256_arm64_final(struct shash_desc *desc, u8 *out)
-{
-	return crypto_sha256_arm64_finup(desc, NULL, 0, out);
-}
-
-static struct shash_alg algs[] = { {
-	.digestsize		= SHA256_DIGEST_SIZE,
-	.init			= sha256_base_init,
-	.update			= crypto_sha256_arm64_update,
-	.final			= crypto_sha256_arm64_final,
-	.finup			= crypto_sha256_arm64_finup,
-	.descsize		= sizeof(struct sha256_state),
-	.base.cra_name		= "sha256",
-	.base.cra_driver_name	= "sha256-arm64",
-	.base.cra_priority	= 125,
-	.base.cra_blocksize	= SHA256_BLOCK_SIZE,
-	.base.cra_module	= THIS_MODULE,
-}, {
-	.digestsize		= SHA224_DIGEST_SIZE,
-	.init			= sha224_base_init,
-	.update			= crypto_sha256_arm64_update,
-	.final			= crypto_sha256_arm64_final,
-	.finup			= crypto_sha256_arm64_finup,
-	.descsize		= sizeof(struct sha256_state),
-	.base.cra_name		= "sha224",
-	.base.cra_driver_name	= "sha224-arm64",
-	.base.cra_priority	= 125,
-	.base.cra_blocksize	= SHA224_BLOCK_SIZE,
-	.base.cra_module	= THIS_MODULE,
-} };
-
-static int sha256_update_neon(struct shash_desc *desc, const u8 *data,
-			      unsigned int len)
-{
-	struct sha256_state *sctx = shash_desc_ctx(desc);
-
-	if (!crypto_simd_usable())
-		return sha256_base_do_update(desc, data, len,
-				sha256_arm64_transform);
-
-	while (len > 0) {
-		unsigned int chunk = len;
-
-		/*
-		 * Don't hog the CPU for the entire time it takes to process all
-		 * input when running on a preemptible kernel, but process the
-		 * data block by block instead.
-		 */
-		if (IS_ENABLED(CONFIG_PREEMPTION) &&
-		    chunk + sctx->count % SHA256_BLOCK_SIZE > SHA256_BLOCK_SIZE)
-			chunk = SHA256_BLOCK_SIZE -
-				sctx->count % SHA256_BLOCK_SIZE;
-
-		kernel_neon_begin();
-		sha256_base_do_update(desc, data, chunk, sha256_neon_transform);
-		kernel_neon_end();
-		data += chunk;
-		len -= chunk;
-	}
-	return 0;
-}
-
-static int sha256_finup_neon(struct shash_desc *desc, const u8 *data,
-			     unsigned int len, u8 *out)
-{
-	if (!crypto_simd_usable()) {
-		if (len)
-			sha256_base_do_update(desc, data, len,
-				sha256_arm64_transform);
-		sha256_base_do_finalize(desc, sha256_arm64_transform);
-	} else {
-		if (len)
-			sha256_update_neon(desc, data, len);
-		kernel_neon_begin();
-		sha256_base_do_finalize(desc, sha256_neon_transform);
-		kernel_neon_end();
-	}
-	return sha256_base_finish(desc, out);
-}
-
-static int sha256_final_neon(struct shash_desc *desc, u8 *out)
-{
-	return sha256_finup_neon(desc, NULL, 0, out);
-}
-
-static struct shash_alg neon_algs[] = { {
-	.digestsize		= SHA256_DIGEST_SIZE,
-	.init			= sha256_base_init,
-	.update			= sha256_update_neon,
-	.final			= sha256_final_neon,
-	.finup			= sha256_finup_neon,
-	.descsize		= sizeof(struct sha256_state),
-	.base.cra_name		= "sha256",
-	.base.cra_driver_name	= "sha256-arm64-neon",
-	.base.cra_priority	= 150,
-	.base.cra_blocksize	= SHA256_BLOCK_SIZE,
-	.base.cra_module	= THIS_MODULE,
-}, {
-	.digestsize		= SHA224_DIGEST_SIZE,
-	.init			= sha224_base_init,
-	.update			= sha256_update_neon,
-	.final			= sha256_final_neon,
-	.finup			= sha256_finup_neon,
-	.descsize		= sizeof(struct sha256_state),
-	.base.cra_name		= "sha224",
-	.base.cra_driver_name	= "sha224-arm64-neon",
-	.base.cra_priority	= 150,
-	.base.cra_blocksize	= SHA224_BLOCK_SIZE,
-	.base.cra_module	= THIS_MODULE,
-} };
-
-static int __init sha256_mod_init(void)
-{
-	int ret = crypto_register_shashes(algs, ARRAY_SIZE(algs));
-	if (ret)
-		return ret;
-
-	if (cpu_have_named_feature(ASIMD)) {
-		ret = crypto_register_shashes(neon_algs, ARRAY_SIZE(neon_algs));
-		if (ret)
-			crypto_unregister_shashes(algs, ARRAY_SIZE(algs));
-	}
-	return ret;
-}
-
-static void __exit sha256_mod_fini(void)
-{
-	if (cpu_have_named_feature(ASIMD))
-		crypto_unregister_shashes(neon_algs, ARRAY_SIZE(neon_algs));
-	crypto_unregister_shashes(algs, ARRAY_SIZE(algs));
-}
-
-module_init(sha256_mod_init);
-module_exit(sha256_mod_fini);
diff --git a/arch/arm64/crypto/sha3-ce-glue.c b/arch/arm64/crypto/sha3-ce-glue.c
index 5662c3ac49e9..b4f1001046c9 100644
--- a/arch/arm64/crypto/sha3-ce-glue.c
+++ b/arch/arm64/crypto/sha3-ce-glue.c
@@ -12,13 +12,13 @@
 #include <asm/hwcap.h>
 #include <asm/neon.h>
 #include <asm/simd.h>
-#include <linux/unaligned.h>
 #include <crypto/internal/hash.h>
-#include <crypto/internal/simd.h>
 #include <crypto/sha3.h>
 #include <linux/cpufeature.h>
-#include <linux/crypto.h>
+#include <linux/kernel.h>
 #include <linux/module.h>
+#include <linux/string.h>
+#include <linux/unaligned.h>
 
 MODULE_DESCRIPTION("SHA3 secure hash using ARMv8 Crypto Extensions");
 MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
@@ -35,74 +35,55 @@ static int sha3_update(struct shash_desc *desc, const u8 *data,
 		       unsigned int len)
 {
 	struct sha3_state *sctx = shash_desc_ctx(desc);
-	unsigned int digest_size = crypto_shash_digestsize(desc->tfm);
-
-	if (!crypto_simd_usable())
-		return crypto_sha3_update(desc, data, len);
-
-	if ((sctx->partial + len) >= sctx->rsiz) {
-		int blocks;
-
-		if (sctx->partial) {
-			int p = sctx->rsiz - sctx->partial;
-
-			memcpy(sctx->buf + sctx->partial, data, p);
-			kernel_neon_begin();
-			sha3_ce_transform(sctx->st, sctx->buf, 1, digest_size);
-			kernel_neon_end();
-
-			data += p;
-			len -= p;
-			sctx->partial = 0;
-		}
-
-		blocks = len / sctx->rsiz;
-		len %= sctx->rsiz;
-
-		while (blocks) {
-			int rem;
-
-			kernel_neon_begin();
-			rem = sha3_ce_transform(sctx->st, data, blocks,
-						digest_size);
-			kernel_neon_end();
-			data += (blocks - rem) * sctx->rsiz;
-			blocks = rem;
-		}
-	}
-
-	if (len) {
-		memcpy(sctx->buf + sctx->partial, data, len);
-		sctx->partial += len;
-	}
-	return 0;
+	struct crypto_shash *tfm = desc->tfm;
+	unsigned int bs, ds;
+	int blocks;
+
+	ds = crypto_shash_digestsize(tfm);
+	bs = crypto_shash_blocksize(tfm);
+	blocks = len / bs;
+	len -= blocks * bs;
+	do {
+		int rem;
+
+		kernel_neon_begin();
+		rem = sha3_ce_transform(sctx->st, data, blocks, ds);
+		kernel_neon_end();
+		data += (blocks - rem) * bs;
+		blocks = rem;
+	} while (blocks);
+	return len;
 }
 
-static int sha3_final(struct shash_desc *desc, u8 *out)
+static int sha3_finup(struct shash_desc *desc, const u8 *src, unsigned int len,
+		      u8 *out)
 {
 	struct sha3_state *sctx = shash_desc_ctx(desc);
-	unsigned int digest_size = crypto_shash_digestsize(desc->tfm);
+	struct crypto_shash *tfm = desc->tfm;
 	__le64 *digest = (__le64 *)out;
+	u8 block[SHA3_224_BLOCK_SIZE];
+	unsigned int bs, ds;
 	int i;
 
-	if (!crypto_simd_usable())
-		return crypto_sha3_final(desc, out);
+	ds = crypto_shash_digestsize(tfm);
+	bs = crypto_shash_blocksize(tfm);
+	memcpy(block, src, len);
 
-	sctx->buf[sctx->partial++] = 0x06;
-	memset(sctx->buf + sctx->partial, 0, sctx->rsiz - sctx->partial);
-	sctx->buf[sctx->rsiz - 1] |= 0x80;
+	block[len++] = 0x06;
+	memset(block + len, 0, bs - len);
+	block[bs - 1] |= 0x80;
 
 	kernel_neon_begin();
-	sha3_ce_transform(sctx->st, sctx->buf, 1, digest_size);
+	sha3_ce_transform(sctx->st, block, 1, ds);
 	kernel_neon_end();
+	memzero_explicit(block, sizeof(block));
 
-	for (i = 0; i < digest_size / 8; i++)
+	for (i = 0; i < ds / 8; i++)
 		put_unaligned_le64(sctx->st[i], digest++);
 
-	if (digest_size & 4)
+	if (ds & 4)
 		put_unaligned_le32(sctx->st[i], (__le32 *)digest);
 
-	memzero_explicit(sctx, sizeof(*sctx));
 	return 0;
 }
 
@@ -110,10 +91,11 @@ static struct shash_alg algs[] = { {
 	.digestsize		= SHA3_224_DIGEST_SIZE,
 	.init			= crypto_sha3_init,
 	.update			= sha3_update,
-	.final			= sha3_final,
-	.descsize		= sizeof(struct sha3_state),
+	.finup			= sha3_finup,
+	.descsize		= SHA3_STATE_SIZE,
 	.base.cra_name		= "sha3-224",
 	.base.cra_driver_name	= "sha3-224-ce",
+	.base.cra_flags		= CRYPTO_AHASH_ALG_BLOCK_ONLY,
 	.base.cra_blocksize	= SHA3_224_BLOCK_SIZE,
 	.base.cra_module	= THIS_MODULE,
 	.base.cra_priority	= 200,
@@ -121,10 +103,11 @@ static struct shash_alg algs[] = { {
 	.digestsize		= SHA3_256_DIGEST_SIZE,
 	.init			= crypto_sha3_init,
 	.update			= sha3_update,
-	.final			= sha3_final,
-	.descsize		= sizeof(struct sha3_state),
+	.finup			= sha3_finup,
+	.descsize		= SHA3_STATE_SIZE,
 	.base.cra_name		= "sha3-256",
 	.base.cra_driver_name	= "sha3-256-ce",
+	.base.cra_flags		= CRYPTO_AHASH_ALG_BLOCK_ONLY,
 	.base.cra_blocksize	= SHA3_256_BLOCK_SIZE,
 	.base.cra_module	= THIS_MODULE,
 	.base.cra_priority	= 200,
@@ -132,10 +115,11 @@ static struct shash_alg algs[] = { {
 	.digestsize		= SHA3_384_DIGEST_SIZE,
 	.init			= crypto_sha3_init,
 	.update			= sha3_update,
-	.final			= sha3_final,
-	.descsize		= sizeof(struct sha3_state),
+	.finup			= sha3_finup,
+	.descsize		= SHA3_STATE_SIZE,
 	.base.cra_name		= "sha3-384",
 	.base.cra_driver_name	= "sha3-384-ce",
+	.base.cra_flags		= CRYPTO_AHASH_ALG_BLOCK_ONLY,
 	.base.cra_blocksize	= SHA3_384_BLOCK_SIZE,
 	.base.cra_module	= THIS_MODULE,
 	.base.cra_priority	= 200,
@@ -143,10 +127,11 @@ static struct shash_alg algs[] = { {
 	.digestsize		= SHA3_512_DIGEST_SIZE,
 	.init			= crypto_sha3_init,
 	.update			= sha3_update,
-	.final			= sha3_final,
-	.descsize		= sizeof(struct sha3_state),
+	.finup			= sha3_finup,
+	.descsize		= SHA3_STATE_SIZE,
 	.base.cra_name		= "sha3-512",
 	.base.cra_driver_name	= "sha3-512-ce",
+	.base.cra_flags		= CRYPTO_AHASH_ALG_BLOCK_ONLY,
 	.base.cra_blocksize	= SHA3_512_BLOCK_SIZE,
 	.base.cra_module	= THIS_MODULE,
 	.base.cra_priority	= 200,
diff --git a/arch/arm64/crypto/sha512-ce-glue.c b/arch/arm64/crypto/sha512-ce-glue.c
index 071f64293227..6fb3001fa2c9 100644
--- a/arch/arm64/crypto/sha512-ce-glue.c
+++ b/arch/arm64/crypto/sha512-ce-glue.c
@@ -10,14 +10,11 @@
  */
 
 #include <asm/neon.h>
-#include <asm/simd.h>
-#include <linux/unaligned.h>
 #include <crypto/internal/hash.h>
-#include <crypto/internal/simd.h>
 #include <crypto/sha2.h>
 #include <crypto/sha512_base.h>
 #include <linux/cpufeature.h>
-#include <linux/crypto.h>
+#include <linux/kernel.h>
 #include <linux/module.h>
 
 MODULE_DESCRIPTION("SHA-384/SHA-512 secure hash using ARMv8 Crypto Extensions");
@@ -29,12 +26,10 @@ MODULE_ALIAS_CRYPTO("sha512");
 asmlinkage int __sha512_ce_transform(struct sha512_state *sst, u8 const *src,
 				     int blocks);
 
-asmlinkage void sha512_block_data_order(u64 *digest, u8 const *src, int blocks);
-
 static void sha512_ce_transform(struct sha512_state *sst, u8 const *src,
 				int blocks)
 {
-	while (blocks) {
+	do {
 		int rem;
 
 		kernel_neon_begin();
@@ -42,67 +37,47 @@ static void sha512_ce_transform(struct sha512_state *sst, u8 const *src,
 		kernel_neon_end();
 		src += (blocks - rem) * SHA512_BLOCK_SIZE;
 		blocks = rem;
-	}
-}
-
-static void sha512_arm64_transform(struct sha512_state *sst, u8 const *src,
-				   int blocks)
-{
-	sha512_block_data_order(sst->state, src, blocks);
+	} while (blocks);
 }
 
 static int sha512_ce_update(struct shash_desc *desc, const u8 *data,
 			    unsigned int len)
 {
-	sha512_block_fn *fn = crypto_simd_usable() ? sha512_ce_transform
-						   : sha512_arm64_transform;
-
-	sha512_base_do_update(desc, data, len, fn);
-	return 0;
+	return sha512_base_do_update_blocks(desc, data, len,
+					    sha512_ce_transform);
 }
 
 static int sha512_ce_finup(struct shash_desc *desc, const u8 *data,
 			   unsigned int len, u8 *out)
 {
-	sha512_block_fn *fn = crypto_simd_usable() ? sha512_ce_transform
-						   : sha512_arm64_transform;
-
-	sha512_base_do_update(desc, data, len, fn);
-	sha512_base_do_finalize(desc, fn);
-	return sha512_base_finish(desc, out);
-}
-
-static int sha512_ce_final(struct shash_desc *desc, u8 *out)
-{
-	sha512_block_fn *fn = crypto_simd_usable() ? sha512_ce_transform
-						   : sha512_arm64_transform;
-
-	sha512_base_do_finalize(desc, fn);
+	sha512_base_do_finup(desc, data, len, sha512_ce_transform);
 	return sha512_base_finish(desc, out);
 }
 
 static struct shash_alg algs[] = { {
 	.init			= sha384_base_init,
 	.update			= sha512_ce_update,
-	.final			= sha512_ce_final,
 	.finup			= sha512_ce_finup,
-	.descsize		= sizeof(struct sha512_state),
+	.descsize		= SHA512_STATE_SIZE,
 	.digestsize		= SHA384_DIGEST_SIZE,
 	.base.cra_name		= "sha384",
 	.base.cra_driver_name	= "sha384-ce",
 	.base.cra_priority	= 200,
+	.base.cra_flags		= CRYPTO_AHASH_ALG_BLOCK_ONLY |
+				  CRYPTO_AHASH_ALG_FINUP_MAX,
 	.base.cra_blocksize	= SHA512_BLOCK_SIZE,
 	.base.cra_module	= THIS_MODULE,
 }, {
 	.init			= sha512_base_init,
 	.update			= sha512_ce_update,
-	.final			= sha512_ce_final,
 	.finup			= sha512_ce_finup,
-	.descsize		= sizeof(struct sha512_state),
+	.descsize		= SHA512_STATE_SIZE,
 	.digestsize		= SHA512_DIGEST_SIZE,
 	.base.cra_name		= "sha512",
 	.base.cra_driver_name	= "sha512-ce",
 	.base.cra_priority	= 200,
+	.base.cra_flags		= CRYPTO_AHASH_ALG_BLOCK_ONLY |
+				  CRYPTO_AHASH_ALG_FINUP_MAX,
 	.base.cra_blocksize	= SHA512_BLOCK_SIZE,
 	.base.cra_module	= THIS_MODULE,
 } };
diff --git a/arch/arm64/crypto/sha512-glue.c b/arch/arm64/crypto/sha512-glue.c
index 62f129dea83d..15aa9d8b7b2c 100644
--- a/arch/arm64/crypto/sha512-glue.c
+++ b/arch/arm64/crypto/sha512-glue.c
@@ -6,11 +6,10 @@
  */
 
 #include <crypto/internal/hash.h>
-#include <linux/types.h>
-#include <linux/string.h>
 #include <crypto/sha2.h>
 #include <crypto/sha512_base.h>
-#include <asm/neon.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
 
 MODULE_DESCRIPTION("SHA-384/SHA-512 secure hash for arm64");
 MODULE_AUTHOR("Andy Polyakov <appro@openssl.org>");
@@ -19,59 +18,53 @@ MODULE_LICENSE("GPL v2");
 MODULE_ALIAS_CRYPTO("sha384");
 MODULE_ALIAS_CRYPTO("sha512");
 
-asmlinkage void sha512_block_data_order(u64 *digest, const void *data,
-					unsigned int num_blks);
-EXPORT_SYMBOL(sha512_block_data_order);
+asmlinkage void sha512_blocks_arch(u64 *digest, const void *data,
+				   unsigned int num_blks);
 
 static void sha512_arm64_transform(struct sha512_state *sst, u8 const *src,
 				   int blocks)
 {
-	sha512_block_data_order(sst->state, src, blocks);
+	sha512_blocks_arch(sst->state, src, blocks);
 }
 
 static int sha512_update(struct shash_desc *desc, const u8 *data,
 			 unsigned int len)
 {
-	return sha512_base_do_update(desc, data, len, sha512_arm64_transform);
+	return sha512_base_do_update_blocks(desc, data, len,
+					    sha512_arm64_transform);
 }
 
 static int sha512_finup(struct shash_desc *desc, const u8 *data,
 			unsigned int len, u8 *out)
 {
-	if (len)
-		sha512_base_do_update(desc, data, len, sha512_arm64_transform);
-	sha512_base_do_finalize(desc, sha512_arm64_transform);
-
+	sha512_base_do_finup(desc, data, len, sha512_arm64_transform);
 	return sha512_base_finish(desc, out);
 }
 
-static int sha512_final(struct shash_desc *desc, u8 *out)
-{
-	return sha512_finup(desc, NULL, 0, out);
-}
-
 static struct shash_alg algs[] = { {
 	.digestsize		= SHA512_DIGEST_SIZE,
 	.init			= sha512_base_init,
 	.update			= sha512_update,
-	.final			= sha512_final,
 	.finup			= sha512_finup,
-	.descsize		= sizeof(struct sha512_state),
+	.descsize		= SHA512_STATE_SIZE,
 	.base.cra_name		= "sha512",
 	.base.cra_driver_name	= "sha512-arm64",
 	.base.cra_priority	= 150,
+	.base.cra_flags		= CRYPTO_AHASH_ALG_BLOCK_ONLY |
+				  CRYPTO_AHASH_ALG_FINUP_MAX,
 	.base.cra_blocksize	= SHA512_BLOCK_SIZE,
 	.base.cra_module	= THIS_MODULE,
 }, {
 	.digestsize		= SHA384_DIGEST_SIZE,
 	.init			= sha384_base_init,
 	.update			= sha512_update,
-	.final			= sha512_final,
 	.finup			= sha512_finup,
-	.descsize		= sizeof(struct sha512_state),
+	.descsize		= SHA512_STATE_SIZE,
 	.base.cra_name		= "sha384",
 	.base.cra_driver_name	= "sha384-arm64",
 	.base.cra_priority	= 150,
+	.base.cra_flags		= CRYPTO_AHASH_ALG_BLOCK_ONLY |
+				  CRYPTO_AHASH_ALG_FINUP_MAX,
 	.base.cra_blocksize	= SHA384_BLOCK_SIZE,
 	.base.cra_module	= THIS_MODULE,
 } };
diff --git a/arch/arm64/crypto/sm3-ce-glue.c b/arch/arm64/crypto/sm3-ce-glue.c
index 1a71788c4cda..eac6f5fa0abe 100644
--- a/arch/arm64/crypto/sm3-ce-glue.c
+++ b/arch/arm64/crypto/sm3-ce-glue.c
@@ -6,14 +6,11 @@
  */
 
 #include <asm/neon.h>
-#include <asm/simd.h>
-#include <linux/unaligned.h>
 #include <crypto/internal/hash.h>
-#include <crypto/internal/simd.h>
 #include <crypto/sm3.h>
 #include <crypto/sm3_base.h>
 #include <linux/cpufeature.h>
-#include <linux/crypto.h>
+#include <linux/kernel.h>
 #include <linux/module.h>
 
 MODULE_DESCRIPTION("SM3 secure hash using ARMv8 Crypto Extensions");
@@ -26,50 +23,20 @@ asmlinkage void sm3_ce_transform(struct sm3_state *sst, u8 const *src,
 static int sm3_ce_update(struct shash_desc *desc, const u8 *data,
 			 unsigned int len)
 {
-	if (!crypto_simd_usable()) {
-		sm3_update(shash_desc_ctx(desc), data, len);
-		return 0;
-	}
+	int remain;
 
 	kernel_neon_begin();
-	sm3_base_do_update(desc, data, len, sm3_ce_transform);
+	remain = sm3_base_do_update_blocks(desc, data, len, sm3_ce_transform);
 	kernel_neon_end();
-
-	return 0;
-}
-
-static int sm3_ce_final(struct shash_desc *desc, u8 *out)
-{
-	if (!crypto_simd_usable()) {
-		sm3_final(shash_desc_ctx(desc), out);
-		return 0;
-	}
-
-	kernel_neon_begin();
-	sm3_base_do_finalize(desc, sm3_ce_transform);
-	kernel_neon_end();
-
-	return sm3_base_finish(desc, out);
+	return remain;
 }
 
 static int sm3_ce_finup(struct shash_desc *desc, const u8 *data,
 			unsigned int len, u8 *out)
 {
-	if (!crypto_simd_usable()) {
-		struct sm3_state *sctx = shash_desc_ctx(desc);
-
-		if (len)
-			sm3_update(sctx, data, len);
-		sm3_final(sctx, out);
-		return 0;
-	}
-
 	kernel_neon_begin();
-	if (len)
-		sm3_base_do_update(desc, data, len, sm3_ce_transform);
-	sm3_base_do_finalize(desc, sm3_ce_transform);
+	sm3_base_do_finup(desc, data, len, sm3_ce_transform);
 	kernel_neon_end();
-
 	return sm3_base_finish(desc, out);
 }
 
@@ -77,11 +44,12 @@ static struct shash_alg sm3_alg = {
 	.digestsize		= SM3_DIGEST_SIZE,
 	.init			= sm3_base_init,
 	.update			= sm3_ce_update,
-	.final			= sm3_ce_final,
 	.finup			= sm3_ce_finup,
-	.descsize		= sizeof(struct sm3_state),
+	.descsize		= SM3_STATE_SIZE,
 	.base.cra_name		= "sm3",
 	.base.cra_driver_name	= "sm3-ce",
+	.base.cra_flags		= CRYPTO_AHASH_ALG_BLOCK_ONLY |
+				  CRYPTO_AHASH_ALG_FINUP_MAX,
 	.base.cra_blocksize	= SM3_BLOCK_SIZE,
 	.base.cra_module	= THIS_MODULE,
 	.base.cra_priority	= 400,
diff --git a/arch/arm64/crypto/sm3-neon-glue.c b/arch/arm64/crypto/sm3-neon-glue.c
index 8dd71ce79b69..6c4611a503a3 100644
--- a/arch/arm64/crypto/sm3-neon-glue.c
+++ b/arch/arm64/crypto/sm3-neon-glue.c
@@ -6,14 +6,11 @@
  */
 
 #include <asm/neon.h>
-#include <asm/simd.h>
-#include <linux/unaligned.h>
 #include <crypto/internal/hash.h>
-#include <crypto/internal/simd.h>
 #include <crypto/sm3.h>
 #include <crypto/sm3_base.h>
 #include <linux/cpufeature.h>
-#include <linux/crypto.h>
+#include <linux/kernel.h>
 #include <linux/module.h>
 
 
@@ -23,50 +20,20 @@ asmlinkage void sm3_neon_transform(struct sm3_state *sst, u8 const *src,
 static int sm3_neon_update(struct shash_desc *desc, const u8 *data,
 			   unsigned int len)
 {
-	if (!crypto_simd_usable()) {
-		sm3_update(shash_desc_ctx(desc), data, len);
-		return 0;
-	}
+	int remain;
 
 	kernel_neon_begin();
-	sm3_base_do_update(desc, data, len, sm3_neon_transform);
+	remain = sm3_base_do_update_blocks(desc, data, len, sm3_neon_transform);
 	kernel_neon_end();
-
-	return 0;
-}
-
-static int sm3_neon_final(struct shash_desc *desc, u8 *out)
-{
-	if (!crypto_simd_usable()) {
-		sm3_final(shash_desc_ctx(desc), out);
-		return 0;
-	}
-
-	kernel_neon_begin();
-	sm3_base_do_finalize(desc, sm3_neon_transform);
-	kernel_neon_end();
-
-	return sm3_base_finish(desc, out);
+	return remain;
 }
 
 static int sm3_neon_finup(struct shash_desc *desc, const u8 *data,
 			  unsigned int len, u8 *out)
 {
-	if (!crypto_simd_usable()) {
-		struct sm3_state *sctx = shash_desc_ctx(desc);
-
-		if (len)
-			sm3_update(sctx, data, len);
-		sm3_final(sctx, out);
-		return 0;
-	}
-
 	kernel_neon_begin();
-	if (len)
-		sm3_base_do_update(desc, data, len, sm3_neon_transform);
-	sm3_base_do_finalize(desc, sm3_neon_transform);
+	sm3_base_do_finup(desc, data, len, sm3_neon_transform);
 	kernel_neon_end();
-
 	return sm3_base_finish(desc, out);
 }
 
@@ -74,11 +41,12 @@ static struct shash_alg sm3_alg = {
 	.digestsize		= SM3_DIGEST_SIZE,
 	.init			= sm3_base_init,
 	.update			= sm3_neon_update,
-	.final			= sm3_neon_final,
 	.finup			= sm3_neon_finup,
-	.descsize		= sizeof(struct sm3_state),
+	.descsize		= SM3_STATE_SIZE,
 	.base.cra_name		= "sm3",
 	.base.cra_driver_name	= "sm3-neon",
+	.base.cra_flags		= CRYPTO_AHASH_ALG_BLOCK_ONLY |
+				  CRYPTO_AHASH_ALG_FINUP_MAX,
 	.base.cra_blocksize	= SM3_BLOCK_SIZE,
 	.base.cra_module	= THIS_MODULE,
 	.base.cra_priority	= 200,
diff --git a/arch/arm64/crypto/sm4-ce-glue.c b/arch/arm64/crypto/sm4-ce-glue.c
index 43741bed874e..7a60e7b559dc 100644
--- a/arch/arm64/crypto/sm4-ce-glue.c
+++ b/arch/arm64/crypto/sm4-ce-glue.c
@@ -8,19 +8,18 @@
  * Copyright (C) 2022 Tianjia Zhang <tianjia.zhang@linux.alibaba.com>
  */
 
-#include <linux/module.h>
-#include <linux/crypto.h>
-#include <linux/kernel.h>
-#include <linux/cpufeature.h>
 #include <asm/neon.h>
-#include <asm/simd.h>
 #include <crypto/b128ops.h>
-#include <crypto/internal/simd.h>
-#include <crypto/internal/skcipher.h>
 #include <crypto/internal/hash.h>
+#include <crypto/internal/skcipher.h>
 #include <crypto/scatterwalk.h>
-#include <crypto/xts.h>
 #include <crypto/sm4.h>
+#include <crypto/utils.h>
+#include <crypto/xts.h>
+#include <linux/cpufeature.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/string.h>
 
 #define BYTES2BLKS(nbytes)	((nbytes) >> 4)
 
@@ -64,7 +63,6 @@ struct sm4_mac_tfm_ctx {
 };
 
 struct sm4_mac_desc_ctx {
-	unsigned int len;
 	u8 digest[SM4_BLOCK_SIZE];
 };
 
@@ -591,8 +589,6 @@ static int sm4_mac_init(struct shash_desc *desc)
 	struct sm4_mac_desc_ctx *ctx = shash_desc_ctx(desc);
 
 	memset(ctx->digest, 0, SM4_BLOCK_SIZE);
-	ctx->len = 0;
-
 	return 0;
 }
 
@@ -601,87 +597,50 @@ static int sm4_mac_update(struct shash_desc *desc, const u8 *p,
 {
 	struct sm4_mac_tfm_ctx *tctx = crypto_shash_ctx(desc->tfm);
 	struct sm4_mac_desc_ctx *ctx = shash_desc_ctx(desc);
-	unsigned int l, nblocks;
-
-	if (len == 0)
-		return 0;
-
-	if (ctx->len || ctx->len + len < SM4_BLOCK_SIZE) {
-		l = min(len, SM4_BLOCK_SIZE - ctx->len);
-
-		crypto_xor(ctx->digest + ctx->len, p, l);
-		ctx->len += l;
-		len -= l;
-		p += l;
-	}
-
-	if (len && (ctx->len % SM4_BLOCK_SIZE) == 0) {
-		kernel_neon_begin();
-
-		if (len < SM4_BLOCK_SIZE && ctx->len == SM4_BLOCK_SIZE) {
-			sm4_ce_crypt_block(tctx->key.rkey_enc,
-					   ctx->digest, ctx->digest);
-			ctx->len = 0;
-		} else {
-			nblocks = len / SM4_BLOCK_SIZE;
-			len %= SM4_BLOCK_SIZE;
+	unsigned int nblocks = len / SM4_BLOCK_SIZE;
 
-			sm4_ce_mac_update(tctx->key.rkey_enc, ctx->digest, p,
-					  nblocks, (ctx->len == SM4_BLOCK_SIZE),
-					  (len != 0));
-
-			p += nblocks * SM4_BLOCK_SIZE;
-
-			if (len == 0)
-				ctx->len = SM4_BLOCK_SIZE;
-		}
-
-		kernel_neon_end();
-
-		if (len) {
-			crypto_xor(ctx->digest, p, len);
-			ctx->len = len;
-		}
-	}
-
-	return 0;
+	len %= SM4_BLOCK_SIZE;
+	kernel_neon_begin();
+	sm4_ce_mac_update(tctx->key.rkey_enc, ctx->digest, p,
+			  nblocks, false, true);
+	kernel_neon_end();
+	return len;
 }
 
-static int sm4_cmac_final(struct shash_desc *desc, u8 *out)
+static int sm4_cmac_finup(struct shash_desc *desc, const u8 *src,
+			  unsigned int len, u8 *out)
 {
 	struct sm4_mac_tfm_ctx *tctx = crypto_shash_ctx(desc->tfm);
 	struct sm4_mac_desc_ctx *ctx = shash_desc_ctx(desc);
 	const u8 *consts = tctx->consts;
 
-	if (ctx->len != SM4_BLOCK_SIZE) {
-		ctx->digest[ctx->len] ^= 0x80;
+	crypto_xor(ctx->digest, src, len);
+	if (len != SM4_BLOCK_SIZE) {
+		ctx->digest[len] ^= 0x80;
 		consts += SM4_BLOCK_SIZE;
 	}
-
 	kernel_neon_begin();
 	sm4_ce_mac_update(tctx->key.rkey_enc, ctx->digest, consts, 1,
 			  false, true);
 	kernel_neon_end();
-
 	memcpy(out, ctx->digest, SM4_BLOCK_SIZE);
-
 	return 0;
 }
 
-static int sm4_cbcmac_final(struct shash_desc *desc, u8 *out)
+static int sm4_cbcmac_finup(struct shash_desc *desc, const u8 *src,
+			    unsigned int len, u8 *out)
 {
 	struct sm4_mac_tfm_ctx *tctx = crypto_shash_ctx(desc->tfm);
 	struct sm4_mac_desc_ctx *ctx = shash_desc_ctx(desc);
 
-	if (ctx->len) {
+	if (len) {
+		crypto_xor(ctx->digest, src, len);
 		kernel_neon_begin();
 		sm4_ce_crypt_block(tctx->key.rkey_enc, ctx->digest,
 				   ctx->digest);
 		kernel_neon_end();
 	}
-
 	memcpy(out, ctx->digest, SM4_BLOCK_SIZE);
-
 	return 0;
 }
 
@@ -691,6 +650,8 @@ static struct shash_alg sm4_mac_algs[] = {
 			.cra_name		= "cmac(sm4)",
 			.cra_driver_name	= "cmac-sm4-ce",
 			.cra_priority		= 400,
+			.cra_flags		= CRYPTO_AHASH_ALG_BLOCK_ONLY |
+						  CRYPTO_AHASH_ALG_FINAL_NONZERO,
 			.cra_blocksize		= SM4_BLOCK_SIZE,
 			.cra_ctxsize		= sizeof(struct sm4_mac_tfm_ctx)
 							+ SM4_BLOCK_SIZE * 2,
@@ -699,7 +660,7 @@ static struct shash_alg sm4_mac_algs[] = {
 		.digestsize	= SM4_BLOCK_SIZE,
 		.init		= sm4_mac_init,
 		.update		= sm4_mac_update,
-		.final		= sm4_cmac_final,
+		.finup		= sm4_cmac_finup,
 		.setkey		= sm4_cmac_setkey,
 		.descsize	= sizeof(struct sm4_mac_desc_ctx),
 	}, {
@@ -707,6 +668,8 @@ static struct shash_alg sm4_mac_algs[] = {
 			.cra_name		= "xcbc(sm4)",
 			.cra_driver_name	= "xcbc-sm4-ce",
 			.cra_priority		= 400,
+			.cra_flags		= CRYPTO_AHASH_ALG_BLOCK_ONLY |
+						  CRYPTO_AHASH_ALG_FINAL_NONZERO,
 			.cra_blocksize		= SM4_BLOCK_SIZE,
 			.cra_ctxsize		= sizeof(struct sm4_mac_tfm_ctx)
 							+ SM4_BLOCK_SIZE * 2,
@@ -715,7 +678,7 @@ static struct shash_alg sm4_mac_algs[] = {
 		.digestsize	= SM4_BLOCK_SIZE,
 		.init		= sm4_mac_init,
 		.update		= sm4_mac_update,
-		.final		= sm4_cmac_final,
+		.finup		= sm4_cmac_finup,
 		.setkey		= sm4_xcbc_setkey,
 		.descsize	= sizeof(struct sm4_mac_desc_ctx),
 	}, {
@@ -723,14 +686,15 @@ static struct shash_alg sm4_mac_algs[] = {
 			.cra_name		= "cbcmac(sm4)",
 			.cra_driver_name	= "cbcmac-sm4-ce",
 			.cra_priority		= 400,
-			.cra_blocksize		= 1,
+			.cra_flags		= CRYPTO_AHASH_ALG_BLOCK_ONLY,
+			.cra_blocksize		= SM4_BLOCK_SIZE,
 			.cra_ctxsize		= sizeof(struct sm4_mac_tfm_ctx),
 			.cra_module		= THIS_MODULE,
 		},
 		.digestsize	= SM4_BLOCK_SIZE,
 		.init		= sm4_mac_init,
 		.update		= sm4_mac_update,
-		.final		= sm4_cbcmac_final,
+		.finup		= sm4_cbcmac_finup,
 		.setkey		= sm4_cbcmac_setkey,
 		.descsize	= sizeof(struct sm4_mac_desc_ctx),
 	}
diff --git a/arch/arm64/include/asm/cputype.h b/arch/arm64/include/asm/cputype.h
index d1cc0571798b..dffff6763812 100644
--- a/arch/arm64/include/asm/cputype.h
+++ b/arch/arm64/include/asm/cputype.h
@@ -81,6 +81,7 @@
 #define ARM_CPU_PART_CORTEX_A78AE	0xD42
 #define ARM_CPU_PART_CORTEX_X1		0xD44
 #define ARM_CPU_PART_CORTEX_A510	0xD46
+#define ARM_CPU_PART_CORTEX_X1C		0xD4C
 #define ARM_CPU_PART_CORTEX_A520	0xD80
 #define ARM_CPU_PART_CORTEX_A710	0xD47
 #define ARM_CPU_PART_CORTEX_A715	0xD4D
@@ -168,6 +169,7 @@
 #define MIDR_CORTEX_A78AE	MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A78AE)
 #define MIDR_CORTEX_X1	MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_X1)
 #define MIDR_CORTEX_A510 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A510)
+#define MIDR_CORTEX_X1C MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_X1C)
 #define MIDR_CORTEX_A520 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A520)
 #define MIDR_CORTEX_A710 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A710)
 #define MIDR_CORTEX_A715 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A715)
diff --git a/arch/arm64/include/asm/el2_setup.h b/arch/arm64/include/asm/el2_setup.h
index ebceaae3c749..d40e427ddad9 100644
--- a/arch/arm64/include/asm/el2_setup.h
+++ b/arch/arm64/include/asm/el2_setup.h
@@ -52,7 +52,7 @@
 	mrs	x0, id_aa64mmfr1_el1
 	ubfx	x0, x0, #ID_AA64MMFR1_EL1_HCX_SHIFT, #4
 	cbz	x0, .Lskip_hcrx_\@
-	mov_q	x0, HCRX_HOST_FLAGS
+	mov_q	x0, (HCRX_EL2_MSCEn | HCRX_EL2_TCR2En | HCRX_EL2_EnFPM)
 
         /* Enable GCS if supported */
 	mrs_s	x1, SYS_ID_AA64PFR1_EL1
diff --git a/arch/arm64/include/asm/insn.h b/arch/arm64/include/asm/insn.h
index 39577f1d079a..18c7811774d3 100644
--- a/arch/arm64/include/asm/insn.h
+++ b/arch/arm64/include/asm/insn.h
@@ -706,6 +706,7 @@ u32 aarch64_insn_gen_cas(enum aarch64_insn_register result,
 }
 #endif
 u32 aarch64_insn_gen_dmb(enum aarch64_insn_mb_type type);
+u32 aarch64_insn_gen_dsb(enum aarch64_insn_mb_type type);
 u32 aarch64_insn_gen_mrs(enum aarch64_insn_register result,
 			 enum aarch64_insn_system_register sysreg);
 
diff --git a/arch/arm64/include/asm/kvm_arm.h b/arch/arm64/include/asm/kvm_arm.h
index 974d72b5905b..e9c8a581e16f 100644
--- a/arch/arm64/include/asm/kvm_arm.h
+++ b/arch/arm64/include/asm/kvm_arm.h
@@ -100,9 +100,8 @@
 			 HCR_FMO | HCR_IMO | HCR_PTW | HCR_TID3 | HCR_TID1)
 #define HCR_HOST_NVHE_FLAGS (HCR_RW | HCR_API | HCR_APK | HCR_ATA)
 #define HCR_HOST_NVHE_PROTECTED_FLAGS (HCR_HOST_NVHE_FLAGS | HCR_TSC)
-#define HCR_HOST_VHE_FLAGS (HCR_RW | HCR_TGE | HCR_E2H)
+#define HCR_HOST_VHE_FLAGS (HCR_RW | HCR_TGE | HCR_E2H | HCR_AMO | HCR_IMO | HCR_FMO)
 
-#define HCRX_HOST_FLAGS (HCRX_EL2_MSCEn | HCRX_EL2_TCR2En | HCRX_EL2_EnFPM)
 #define MPAMHCR_HOST_FLAGS	0
 
 /* TCR_EL2 Registers bits */
diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
index e98cfe7855a6..08ba91e6fb03 100644
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@@ -1588,4 +1588,9 @@ void kvm_set_vm_id_reg(struct kvm *kvm, u32 reg, u64 val);
 #define kvm_has_s1poe(k)				\
 	(kvm_has_feat((k), ID_AA64MMFR3_EL1, S1POE, IMP))
 
+static inline bool kvm_arch_has_irq_bypass(void)
+{
+	return true;
+}
+
 #endif /* __ARM64_KVM_HOST_H__ */
diff --git a/arch/arm64/include/asm/mmu.h b/arch/arm64/include/asm/mmu.h
index 30a29e88994b..6e8aa8e72601 100644
--- a/arch/arm64/include/asm/mmu.h
+++ b/arch/arm64/include/asm/mmu.h
@@ -94,17 +94,6 @@ static inline bool kaslr_requires_kpti(void)
 			return false;
 	}
 
-	/*
-	 * Systems affected by Cavium erratum 24756 are incompatible
-	 * with KPTI.
-	 */
-	if (IS_ENABLED(CONFIG_CAVIUM_ERRATUM_27456)) {
-		extern const struct midr_range cavium_erratum_27456_cpus[];
-
-		if (is_midr_in_range_list(cavium_erratum_27456_cpus))
-			return false;
-	}
-
 	return true;
 }
 
diff --git a/arch/arm64/include/asm/spectre.h b/arch/arm64/include/asm/spectre.h
index f1524cdeacf1..8fef12626090 100644
--- a/arch/arm64/include/asm/spectre.h
+++ b/arch/arm64/include/asm/spectre.h
@@ -97,6 +97,9 @@ enum mitigation_state arm64_get_meltdown_state(void);
 
 enum mitigation_state arm64_get_spectre_bhb_state(void);
 bool is_spectre_bhb_affected(const struct arm64_cpu_capabilities *entry, int scope);
+extern bool __nospectre_bhb;
+u8 get_spectre_bhb_loop_value(void);
+bool is_spectre_bhb_fw_mitigated(void);
 void spectre_bhb_enable_mitigation(const struct arm64_cpu_capabilities *__unused);
 bool try_emulate_el1_ssbs(struct pt_regs *regs, u32 instr);
 
diff --git a/arch/arm64/include/asm/vdso/gettimeofday.h b/arch/arm64/include/asm/vdso/gettimeofday.h
index 92a2b59a9f3d..3322c7047d84 100644
--- a/arch/arm64/include/asm/vdso/gettimeofday.h
+++ b/arch/arm64/include/asm/vdso/gettimeofday.h
@@ -99,6 +99,19 @@ static __always_inline u64 __arch_get_hw_counter(s32 clock_mode,
 	return res;
 }
 
+#if IS_ENABLED(CONFIG_CC_IS_GCC) && IS_ENABLED(CONFIG_PAGE_SIZE_64KB)
+static __always_inline const struct vdso_time_data *__arch_get_vdso_u_time_data(void)
+{
+	const struct vdso_time_data *ret = &vdso_u_time_data;
+
+	/* Work around invalid absolute relocations */
+	OPTIMIZER_HIDE_VAR(ret);
+
+	return ret;
+}
+#define __arch_get_vdso_u_time_data __arch_get_vdso_u_time_data
+#endif /* IS_ENABLED(CONFIG_CC_IS_GCC) && IS_ENABLED(CONFIG_PAGE_SIZE_64KB) */
+
 #endif /* !__ASSEMBLY__ */
 
 #endif /* __ASM_VDSO_GETTIMEOFDAY_H */
diff --git a/arch/arm64/kernel/cpu_errata.c b/arch/arm64/kernel/cpu_errata.c
index b55f5f705750..6b0ad5070d3e 100644
--- a/arch/arm64/kernel/cpu_errata.c
+++ b/arch/arm64/kernel/cpu_errata.c
@@ -335,7 +335,7 @@ static const struct midr_range cavium_erratum_23154_cpus[] = {
 #endif
 
 #ifdef CONFIG_CAVIUM_ERRATUM_27456
-const struct midr_range cavium_erratum_27456_cpus[] = {
+static const struct midr_range cavium_erratum_27456_cpus[] = {
 	/* Cavium ThunderX, T88 pass 1.x - 2.1 */
 	MIDR_RANGE(MIDR_THUNDERX, 0, 0, 1, 1),
 	/* Cavium ThunderX, T81 pass 1.0 */
diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c
index 9c4d6d552b25..4c46d80aa64b 100644
--- a/arch/arm64/kernel/cpufeature.c
+++ b/arch/arm64/kernel/cpufeature.c
@@ -114,7 +114,14 @@ static struct arm64_cpu_capabilities const __ro_after_init *cpucap_ptrs[ARM64_NC
 
 DECLARE_BITMAP(boot_cpucaps, ARM64_NCAPS);
 
-bool arm64_use_ng_mappings = false;
+/*
+ * arm64_use_ng_mappings must be placed in the .data section, otherwise it
+ * ends up in the .bss section where it is initialized in early_map_kernel()
+ * after the MMU (with the idmap) was enabled. create_init_idmap() - which
+ * runs before early_map_kernel() and reads the variable via PTE_MAYBE_NG -
+ * may end up generating incorrect idmap page table attributes.
+ */
+bool arm64_use_ng_mappings __read_mostly = false;
 EXPORT_SYMBOL(arm64_use_ng_mappings);
 
 DEFINE_PER_CPU_READ_MOSTLY(const char *, this_cpu_vector) = vectors;
diff --git a/arch/arm64/kernel/image-vars.h b/arch/arm64/kernel/image-vars.h
index 5e3c4b58f279..2004b4f41ade 100644
--- a/arch/arm64/kernel/image-vars.h
+++ b/arch/arm64/kernel/image-vars.h
@@ -47,10 +47,6 @@ PROVIDE(__pi_id_aa64smfr0_override	= id_aa64smfr0_override);
 PROVIDE(__pi_id_aa64zfr0_override	= id_aa64zfr0_override);
 PROVIDE(__pi_arm64_sw_feature_override	= arm64_sw_feature_override);
 PROVIDE(__pi_arm64_use_ng_mappings	= arm64_use_ng_mappings);
-#ifdef CONFIG_CAVIUM_ERRATUM_27456
-PROVIDE(__pi_cavium_erratum_27456_cpus	= cavium_erratum_27456_cpus);
-PROVIDE(__pi_is_midr_in_range_list	= is_midr_in_range_list);
-#endif
 PROVIDE(__pi__ctype			= _ctype);
 PROVIDE(__pi_memstart_offset_seed	= memstart_offset_seed);
 
diff --git a/arch/arm64/kernel/pi/map_kernel.c b/arch/arm64/kernel/pi/map_kernel.c
index e57b043f324b..c6650cfe706c 100644
--- a/arch/arm64/kernel/pi/map_kernel.c
+++ b/arch/arm64/kernel/pi/map_kernel.c
@@ -207,6 +207,29 @@ static void __init map_fdt(u64 fdt)
 	dsb(ishst);
 }
 
+/*
+ * PI version of the Cavium erratum 27456 detection: returns false on
+ * affected CPUs, where non-global mappings cannot be used.
+ */
+static bool __init ng_mappings_allowed(void)
+{
+	static const struct midr_range cavium_erratum_27456_cpus[] __initconst = {
+		/* Cavium ThunderX, T88 pass 1.x - 2.1 */
+		MIDR_RANGE(MIDR_THUNDERX, 0, 0, 1, 1),
+		/* Cavium ThunderX, T81 pass 1.0 */
+		MIDR_REV(MIDR_THUNDERX_81XX, 0, 0),
+		{},
+	};
+
+	for (const struct midr_range *r = cavium_erratum_27456_cpus; r->model; r++) {
+		if (midr_is_cpu_model_range(read_cpuid_id(), r->model,
+					    r->rv_min, r->rv_max))
+			return false;
+	}
+
+	return true;
+}
+
 asmlinkage void __init early_map_kernel(u64 boot_status, void *fdt)
 {
 	static char const chosen_str[] __initconst = "/chosen";
@@ -246,7 +269,7 @@ asmlinkage void __init early_map_kernel(u64 boot_status, void *fdt)
 		u64 kaslr_seed = kaslr_early_init(fdt, chosen);
 
 		if (kaslr_seed && kaslr_requires_kpti())
-			arm64_use_ng_mappings = true;
+			arm64_use_ng_mappings = ng_mappings_allowed();
 
 		kaslr_offset |= kaslr_seed & ~(MIN_KIMG_ALIGN - 1);
 	}
diff --git a/arch/arm64/kernel/proton-pack.c b/arch/arm64/kernel/proton-pack.c
index b198dde79e59..edf1783ffc81 100644
--- a/arch/arm64/kernel/proton-pack.c
+++ b/arch/arm64/kernel/proton-pack.c
@@ -879,16 +879,19 @@ static u8 spectre_bhb_loop_affected(void)
 	static const struct midr_range spectre_bhb_k132_list[] = {
 		MIDR_ALL_VERSIONS(MIDR_CORTEX_X3),
 		MIDR_ALL_VERSIONS(MIDR_NEOVERSE_V2),
+		{},
 	};
 	static const struct midr_range spectre_bhb_k38_list[] = {
 		MIDR_ALL_VERSIONS(MIDR_CORTEX_A715),
 		MIDR_ALL_VERSIONS(MIDR_CORTEX_A720),
+		{},
 	};
 	static const struct midr_range spectre_bhb_k32_list[] = {
 		MIDR_ALL_VERSIONS(MIDR_CORTEX_A78),
 		MIDR_ALL_VERSIONS(MIDR_CORTEX_A78AE),
 		MIDR_ALL_VERSIONS(MIDR_CORTEX_A78C),
 		MIDR_ALL_VERSIONS(MIDR_CORTEX_X1),
+		MIDR_ALL_VERSIONS(MIDR_CORTEX_X1C),
 		MIDR_ALL_VERSIONS(MIDR_CORTEX_A710),
 		MIDR_ALL_VERSIONS(MIDR_CORTEX_X2),
 		MIDR_ALL_VERSIONS(MIDR_NEOVERSE_N2),
@@ -997,6 +1000,11 @@ bool is_spectre_bhb_affected(const struct arm64_cpu_capabilities *entry,
 	return true;
 }
 
+u8 get_spectre_bhb_loop_value(void)
+{
+	return max_bhb_k;
+}
+
 static void this_cpu_set_vectors(enum arm64_bp_harden_el1_vectors slot)
 {
 	const char *v = arm64_get_bp_hardening_vector(slot);
@@ -1014,7 +1022,7 @@ static void this_cpu_set_vectors(enum arm64_bp_harden_el1_vectors slot)
 	isb();
 }
 
-static bool __read_mostly __nospectre_bhb;
+bool __read_mostly __nospectre_bhb;
 static int __init parse_spectre_bhb_param(char *str)
 {
 	__nospectre_bhb = true;
@@ -1092,6 +1100,11 @@ void spectre_bhb_enable_mitigation(const struct arm64_cpu_capabilities *entry)
 	update_mitigation_state(&spectre_bhb_state, state);
 }
 
+bool is_spectre_bhb_fw_mitigated(void)
+{
+	return test_bit(BHB_FW, &system_bhb_mitigations);
+}
+
 /* Patched to NOP when enabled */
 void noinstr spectre_bhb_patch_loop_mitigation_enable(struct alt_instr *alt,
 						     __le32 *origptr,
diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
index 68fec8c95fee..19ca57def629 100644
--- a/arch/arm64/kvm/arm.c
+++ b/arch/arm64/kvm/arm.c
@@ -2743,11 +2743,6 @@ bool kvm_arch_irqchip_in_kernel(struct kvm *kvm)
 	return irqchip_in_kernel(kvm);
 }
 
-bool kvm_arch_has_irq_bypass(void)
-{
-	return true;
-}
-
 int kvm_arch_irq_bypass_add_producer(struct irq_bypass_consumer *cons,
 				      struct irq_bypass_producer *prod)
 {
diff --git a/arch/arm64/kvm/hyp/include/hyp/switch.h b/arch/arm64/kvm/hyp/include/hyp/switch.h
index b741ea6aefa5..96f625dc7256 100644
--- a/arch/arm64/kvm/hyp/include/hyp/switch.h
+++ b/arch/arm64/kvm/hyp/include/hyp/switch.h
@@ -235,6 +235,8 @@ static inline void __deactivate_traps_mpam(void)
 
 static inline void __activate_traps_common(struct kvm_vcpu *vcpu)
 {
+	struct kvm_cpu_context *hctxt = host_data_ptr(host_ctxt);
+
 	/* Trap on AArch32 cp15 c15 (impdef sysregs) accesses (EL1 or EL0) */
 	write_sysreg(1 << 15, hstr_el2);
 
@@ -245,11 +247,8 @@ static inline void __activate_traps_common(struct kvm_vcpu *vcpu)
 	 * EL1 instead of being trapped to EL2.
 	 */
 	if (system_supports_pmuv3()) {
-		struct kvm_cpu_context *hctxt;
-
 		write_sysreg(0, pmselr_el0);
 
-		hctxt = host_data_ptr(host_ctxt);
 		ctxt_sys_reg(hctxt, PMUSERENR_EL0) = read_sysreg(pmuserenr_el0);
 		write_sysreg(ARMV8_PMU_USERENR_MASK, pmuserenr_el0);
 		vcpu_set_flag(vcpu, PMUSERENR_ON_CPU);
@@ -269,6 +268,7 @@ static inline void __activate_traps_common(struct kvm_vcpu *vcpu)
 			hcrx &= ~clr;
 		}
 
+		ctxt_sys_reg(hctxt, HCRX_EL2) = read_sysreg_s(SYS_HCRX_EL2);
 		write_sysreg_s(hcrx, SYS_HCRX_EL2);
 	}
 
@@ -278,19 +278,18 @@ static inline void __activate_traps_common(struct kvm_vcpu *vcpu)
 
 static inline void __deactivate_traps_common(struct kvm_vcpu *vcpu)
 {
+	struct kvm_cpu_context *hctxt = host_data_ptr(host_ctxt);
+
 	write_sysreg(*host_data_ptr(host_debug_state.mdcr_el2), mdcr_el2);
 
 	write_sysreg(0, hstr_el2);
 	if (system_supports_pmuv3()) {
-		struct kvm_cpu_context *hctxt;
-
-		hctxt = host_data_ptr(host_ctxt);
 		write_sysreg(ctxt_sys_reg(hctxt, PMUSERENR_EL0), pmuserenr_el0);
 		vcpu_clear_flag(vcpu, PMUSERENR_ON_CPU);
 	}
 
 	if (cpus_have_final_cap(ARM64_HAS_HCX))
-		write_sysreg_s(HCRX_HOST_FLAGS, SYS_HCRX_EL2);
+		write_sysreg_s(ctxt_sys_reg(hctxt, HCRX_EL2), SYS_HCRX_EL2);
 
 	__deactivate_traps_hfgxtr(vcpu);
 	__deactivate_traps_mpam();
diff --git a/arch/arm64/kvm/hyp/nvhe/mem_protect.c b/arch/arm64/kvm/hyp/nvhe/mem_protect.c
index 2a5284f749b4..e80f3ebd3e2a 100644
--- a/arch/arm64/kvm/hyp/nvhe/mem_protect.c
+++ b/arch/arm64/kvm/hyp/nvhe/mem_protect.c
@@ -503,7 +503,7 @@ int host_stage2_set_owner_locked(phys_addr_t addr, u64 size, u8 owner_id)
 {
 	int ret;
 
-	if (!addr_is_memory(addr))
+	if (!range_is_memory(addr, addr + size))
 		return -EPERM;
 
 	ret = host_stage2_try(kvm_pgtable_stage2_set_owner, &host_mmu.pgt,
diff --git a/arch/arm64/kvm/hyp/vgic-v3-sr.c b/arch/arm64/kvm/hyp/vgic-v3-sr.c
index ed363aa3027e..50aa8dbcae75 100644
--- a/arch/arm64/kvm/hyp/vgic-v3-sr.c
+++ b/arch/arm64/kvm/hyp/vgic-v3-sr.c
@@ -429,23 +429,27 @@ u64 __vgic_v3_get_gic_config(void)
 	/*
 	 * To check whether we have a MMIO-based (GICv2 compatible)
 	 * CPU interface, we need to disable the system register
-	 * view. To do that safely, we have to prevent any interrupt
-	 * from firing (which would be deadly).
+	 * view.
 	 *
-	 * Note that this only makes sense on VHE, as interrupts are
-	 * already masked for nVHE as part of the exception entry to
-	 * EL2.
-	 */
-	if (has_vhe())
-		flags = local_daif_save();
-
-	/*
 	 * Table 11-2 "Permitted ICC_SRE_ELx.SRE settings" indicates
 	 * that to be able to set ICC_SRE_EL1.SRE to 0, all the
 	 * interrupt overrides must be set. You've got to love this.
+	 *
+	 * As we always run VHE with HCR_xMO set, no extra xMO
+	 * manipulation is required in that case.
+	 *
+	 * To safely disable SRE, we have to prevent any interrupt
+	 * from firing (which would be deadly). This only makes sense
+	 * on VHE, as interrupts are already masked for nVHE as part
+	 * of the exception entry to EL2.
 	 */
-	sysreg_clear_set(hcr_el2, 0, HCR_AMO | HCR_FMO | HCR_IMO);
-	isb();
+	if (has_vhe()) {
+		flags = local_daif_save();
+	} else {
+		sysreg_clear_set(hcr_el2, 0, HCR_AMO | HCR_FMO | HCR_IMO);
+		isb();
+	}
+
 	write_gicreg(0, ICC_SRE_EL1);
 	isb();
 
@@ -453,11 +457,13 @@ u64 __vgic_v3_get_gic_config(void)
 
 	write_gicreg(sre, ICC_SRE_EL1);
 	isb();
-	sysreg_clear_set(hcr_el2, HCR_AMO | HCR_FMO | HCR_IMO, 0);
-	isb();
 
-	if (has_vhe())
+	if (has_vhe()) {
 		local_daif_restore(flags);
+	} else {
+		sysreg_clear_set(hcr_el2, HCR_AMO | HCR_FMO | HCR_IMO, 0);
+		isb();
+	}
 
 	val  = (val & ICC_SRE_EL1_SRE) ? 0 : (1ULL << 63);
 	val |= read_gicreg(ICH_VTR_EL2);
diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
index 754f2fe0cc67..eeda92330ade 100644
--- a/arch/arm64/kvm/mmu.c
+++ b/arch/arm64/kvm/mmu.c
@@ -1501,6 +1501,11 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 		return -EFAULT;
 	}
 
+	if (!is_protected_kvm_enabled())
+		memcache = &vcpu->arch.mmu_page_cache;
+	else
+		memcache = &vcpu->arch.pkvm_memcache;
+
 	/*
 	 * Permission faults just need to update the existing leaf entry,
 	 * and so normally don't require allocations from the memcache. The
@@ -1510,13 +1515,11 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 	if (!fault_is_perm || (logging_active && write_fault)) {
 		int min_pages = kvm_mmu_cache_min_pages(vcpu->arch.hw_mmu);
 
-		if (!is_protected_kvm_enabled()) {
-			memcache = &vcpu->arch.mmu_page_cache;
+		if (!is_protected_kvm_enabled())
 			ret = kvm_mmu_topup_memory_cache(memcache, min_pages);
-		} else {
-			memcache = &vcpu->arch.pkvm_memcache;
+		else
 			ret = topup_hyp_memcache(memcache, min_pages);
-		}
+
 		if (ret)
 			return ret;
 	}
diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c
index 005ad28f7306..5dde9285afc8 100644
--- a/arch/arm64/kvm/sys_regs.c
+++ b/arch/arm64/kvm/sys_regs.c
@@ -1945,6 +1945,12 @@ static int set_id_aa64pfr0_el1(struct kvm_vcpu *vcpu,
 	if ((hw_val & mpam_mask) == (user_val & mpam_mask))
 		user_val &= ~ID_AA64PFR0_EL1_MPAM_MASK;
 
+	/* Fail the guest's request to disable the AA64 ISA at EL{0,1,2} */
+	if (!FIELD_GET(ID_AA64PFR0_EL1_EL0, user_val) ||
+	    !FIELD_GET(ID_AA64PFR0_EL1_EL1, user_val) ||
+	    (vcpu_has_nv(vcpu) && !FIELD_GET(ID_AA64PFR0_EL1_EL2, user_val)))
+		return -EINVAL;
+
 	return set_id_reg(vcpu, rd, user_val);
 }
 
diff --git a/arch/arm64/lib/Makefile b/arch/arm64/lib/Makefile
index 4d49dff721a8..027bfa9689c6 100644
--- a/arch/arm64/lib/Makefile
+++ b/arch/arm64/lib/Makefile
@@ -1,4 +1,7 @@
 # SPDX-License-Identifier: GPL-2.0
+
+obj-y += crypto/
+
 lib-y		:= clear_user.o delay.o copy_from_user.o		\
 		   copy_to_user.o copy_page.o				\
 		   clear_page.o csum.o insn.o memchr.o memcpy.o		\
@@ -14,10 +17,10 @@ endif
 lib-$(CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE) += uaccess_flushcache.o
 
 obj-$(CONFIG_CRC32_ARCH) += crc32-arm64.o
-crc32-arm64-y := crc32.o crc32-glue.o
+crc32-arm64-y := crc32.o crc32-core.o
 
 obj-$(CONFIG_CRC_T10DIF_ARCH) += crc-t10dif-arm64.o
-crc-t10dif-arm64-y := crc-t10dif-glue.o crc-t10dif-core.o
+crc-t10dif-arm64-y := crc-t10dif.o crc-t10dif-core.o
 
 obj-$(CONFIG_FUNCTION_ERROR_INJECTION) += error-inject.o
 
diff --git a/arch/arm64/lib/crc-t10dif-glue.c b/arch/arm64/lib/crc-t10dif.c
index bacd18f23168..c2ffe4fdb59d 100644
--- a/arch/arm64/lib/crc-t10dif-glue.c
+++ b/arch/arm64/lib/crc-t10dif.c
@@ -17,8 +17,8 @@
 #include <asm/neon.h>
 #include <asm/simd.h>
 
-static DEFINE_STATIC_KEY_FALSE(have_asimd);
-static DEFINE_STATIC_KEY_FALSE(have_pmull);
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_asimd);
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_pmull);
 
 #define CRC_T10DIF_PMULL_CHUNK_SIZE	16U
 
@@ -61,7 +61,7 @@ static int __init crc_t10dif_arm64_init(void)
 	}
 	return 0;
 }
-arch_initcall(crc_t10dif_arm64_init);
+subsys_initcall(crc_t10dif_arm64_init);
 
 static void __exit crc_t10dif_arm64_exit(void)
 {
diff --git a/arch/arm64/lib/crc32.S b/arch/arm64/lib/crc32-core.S
index 68825317460f..68825317460f 100644
--- a/arch/arm64/lib/crc32.S
+++ b/arch/arm64/lib/crc32-core.S
diff --git a/arch/arm64/lib/crc32-glue.c b/arch/arm64/lib/crc32.c
index ed3acd71178f..ed3acd71178f 100644
--- a/arch/arm64/lib/crc32-glue.c
+++ b/arch/arm64/lib/crc32.c
diff --git a/arch/arm64/lib/crypto/.gitignore b/arch/arm64/lib/crypto/.gitignore
new file mode 100644
index 000000000000..12d74d8b03d0
--- /dev/null
+++ b/arch/arm64/lib/crypto/.gitignore
@@ -0,0 +1,3 @@
+# SPDX-License-Identifier: GPL-2.0-only
+poly1305-core.S
+sha256-core.S
diff --git a/arch/arm64/lib/crypto/Kconfig b/arch/arm64/lib/crypto/Kconfig
new file mode 100644
index 000000000000..129a7685cb4c
--- /dev/null
+++ b/arch/arm64/lib/crypto/Kconfig
@@ -0,0 +1,20 @@
+# SPDX-License-Identifier: GPL-2.0-only
+
+config CRYPTO_CHACHA20_NEON
+	tristate
+	depends on KERNEL_MODE_NEON
+	default CRYPTO_LIB_CHACHA
+	select CRYPTO_LIB_CHACHA_GENERIC
+	select CRYPTO_ARCH_HAVE_LIB_CHACHA
+
+config CRYPTO_POLY1305_NEON
+	tristate
+	depends on KERNEL_MODE_NEON
+	default CRYPTO_LIB_POLY1305
+	select CRYPTO_ARCH_HAVE_LIB_POLY1305
+
+config CRYPTO_SHA256_ARM64
+	tristate
+	default CRYPTO_LIB_SHA256
+	select CRYPTO_ARCH_HAVE_LIB_SHA256
+	select CRYPTO_ARCH_HAVE_LIB_SHA256_SIMD
diff --git a/arch/arm64/lib/crypto/Makefile b/arch/arm64/lib/crypto/Makefile
new file mode 100644
index 000000000000..946c09903711
--- /dev/null
+++ b/arch/arm64/lib/crypto/Makefile
@@ -0,0 +1,24 @@
+# SPDX-License-Identifier: GPL-2.0-only
+
+obj-$(CONFIG_CRYPTO_CHACHA20_NEON) += chacha-neon.o
+chacha-neon-y := chacha-neon-core.o chacha-neon-glue.o
+
+obj-$(CONFIG_CRYPTO_POLY1305_NEON) += poly1305-neon.o
+poly1305-neon-y := poly1305-core.o poly1305-glue.o
+AFLAGS_poly1305-core.o += -Dpoly1305_init=poly1305_block_init_arch
+AFLAGS_poly1305-core.o += -Dpoly1305_emit=poly1305_emit_arch
+
+obj-$(CONFIG_CRYPTO_SHA256_ARM64) += sha256-arm64.o
+sha256-arm64-y := sha256.o sha256-core.o
+sha256-arm64-$(CONFIG_KERNEL_MODE_NEON) += sha256-ce.o
+
+quiet_cmd_perlasm = PERLASM $@
+      cmd_perlasm = $(PERL) $(<) void $(@)
+
+$(obj)/%-core.S: $(src)/%-armv8.pl
+	$(call cmd,perlasm)
+
+$(obj)/sha256-core.S: $(src)/sha2-armv8.pl
+	$(call cmd,perlasm)
+
+clean-files += poly1305-core.S sha256-core.S
diff --git a/arch/arm64/crypto/chacha-neon-core.S b/arch/arm64/lib/crypto/chacha-neon-core.S
index b70ac76f2610..80079586ecc7 100644
--- a/arch/arm64/crypto/chacha-neon-core.S
+++ b/arch/arm64/lib/crypto/chacha-neon-core.S
@@ -1,5 +1,5 @@
 /*
- * ChaCha/XChaCha NEON helper functions
+ * ChaCha/HChaCha NEON helper functions
  *
  * Copyright (C) 2016-2018 Linaro, Ltd. <ard.biesheuvel@linaro.org>
  *
diff --git a/arch/arm64/lib/crypto/chacha-neon-glue.c b/arch/arm64/lib/crypto/chacha-neon-glue.c
new file mode 100644
index 000000000000..d0188f974ca5
--- /dev/null
+++ b/arch/arm64/lib/crypto/chacha-neon-glue.c
@@ -0,0 +1,119 @@
+/*
+ * ChaCha and HChaCha functions (ARM64 optimized)
+ *
+ * Copyright (C) 2016 - 2017 Linaro, Ltd. <ard.biesheuvel@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Based on:
+ * ChaCha20 256-bit cipher algorithm, RFC7539, SIMD glue code
+ *
+ * Copyright (C) 2015 Martin Willi
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <crypto/chacha.h>
+#include <crypto/internal/simd.h>
+#include <linux/jump_label.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+
+#include <asm/hwcap.h>
+#include <asm/neon.h>
+#include <asm/simd.h>
+
+asmlinkage void chacha_block_xor_neon(const struct chacha_state *state,
+				      u8 *dst, const u8 *src, int nrounds);
+asmlinkage void chacha_4block_xor_neon(const struct chacha_state *state,
+				       u8 *dst, const u8 *src,
+				       int nrounds, int bytes);
+asmlinkage void hchacha_block_neon(const struct chacha_state *state,
+				   u32 out[HCHACHA_OUT_WORDS], int nrounds);
+
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon);
+
+static void chacha_doneon(struct chacha_state *state, u8 *dst, const u8 *src,
+			  int bytes, int nrounds)
+{
+	while (bytes > 0) {
+		int l = min(bytes, CHACHA_BLOCK_SIZE * 5); /* at most five blocks per pass */
+
+		if (l <= CHACHA_BLOCK_SIZE) {
+			u8 buf[CHACHA_BLOCK_SIZE]; /* full-size staging buffer for a short tail */
+
+			memcpy(buf, src, l);
+			chacha_block_xor_neon(state, buf, buf, nrounds);
+			memcpy(dst, buf, l);
+			state->x[12] += 1;
+			break;
+		}
+		chacha_4block_xor_neon(state, dst, src, nrounds, l);
+		bytes -= l;
+		src += l;
+		dst += l;
+		state->x[12] += DIV_ROUND_UP(l, CHACHA_BLOCK_SIZE); /* a partial block counts as one */
+	}
+}
+
+void hchacha_block_arch(const struct chacha_state *state,
+			u32 out[HCHACHA_OUT_WORDS], int nrounds)
+{
+	if (static_branch_likely(&have_neon) && crypto_simd_usable()) {
+		kernel_neon_begin();
+		hchacha_block_neon(state, out, nrounds);
+		kernel_neon_end();
+	} else {	/* NEON not detected, or SIMD not usable in this context */
+		hchacha_block_generic(state, out, nrounds);
+	}
+}
+EXPORT_SYMBOL(hchacha_block_arch);
+
+void chacha_crypt_arch(struct chacha_state *state, u8 *dst, const u8 *src,
+		       unsigned int bytes, int nrounds)
+{
+	/* Generic code unless NEON is present, input exceeds one block, and SIMD is usable. */
+	if (!(static_branch_likely(&have_neon) && bytes > CHACHA_BLOCK_SIZE &&
+	      crypto_simd_usable()))
+		return chacha_crypt_generic(state, dst, src, bytes, nrounds);
+	do {
+		/* Each kernel_neon_begin()/end() section covers at most 4K of input. */
+		unsigned int chunk = min_t(unsigned int, bytes, SZ_4K);
+
+		kernel_neon_begin();
+		chacha_doneon(state, dst, src, chunk, nrounds);
+		kernel_neon_end();
+		src += chunk;
+		dst += chunk;
+		bytes -= chunk;
+	} while (bytes);
+}
+EXPORT_SYMBOL(chacha_crypt_arch);
+
+bool chacha_is_arch_optimized(void)
+{
+	return static_key_enabled(&have_neon);	/* enabled at init only if ASIMD was found */
+}
+EXPORT_SYMBOL(chacha_is_arch_optimized);
+
+static int __init chacha_simd_mod_init(void)
+{
+	if (cpu_have_named_feature(ASIMD))
+		static_branch_enable(&have_neon);	/* otherwise the key keeps its false default */
+	return 0;
+}
+subsys_initcall(chacha_simd_mod_init);
+
+static void __exit chacha_simd_mod_exit(void)
+{
+}
+module_exit(chacha_simd_mod_exit);
+
+MODULE_DESCRIPTION("ChaCha and HChaCha functions (ARM64 optimized)");
+MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
+MODULE_LICENSE("GPL v2");
diff --git a/arch/arm64/crypto/poly1305-armv8.pl b/arch/arm64/lib/crypto/poly1305-armv8.pl
index 22c9069c0650..22c9069c0650 100644
--- a/arch/arm64/crypto/poly1305-armv8.pl
+++ b/arch/arm64/lib/crypto/poly1305-armv8.pl
diff --git a/arch/arm64/lib/crypto/poly1305-glue.c b/arch/arm64/lib/crypto/poly1305-glue.c
new file mode 100644
index 000000000000..6a661cf04821
--- /dev/null
+++ b/arch/arm64/lib/crypto/poly1305-glue.c
@@ -0,0 +1,73 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * OpenSSL/Cryptogams accelerated Poly1305 transform for arm64
+ *
+ * Copyright (C) 2019 Linaro Ltd. <ard.biesheuvel@linaro.org>
+ */
+
+#include <asm/hwcap.h>
+#include <asm/neon.h>
+#include <crypto/internal/poly1305.h>
+#include <linux/cpufeature.h>
+#include <linux/jump_label.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/unaligned.h>
+
+asmlinkage void poly1305_block_init_arch(
+	struct poly1305_block_state *state,
+	const u8 raw_key[POLY1305_BLOCK_SIZE]);
+EXPORT_SYMBOL_GPL(poly1305_block_init_arch);
+asmlinkage void poly1305_blocks(struct poly1305_block_state *state,
+				const u8 *src, u32 len, u32 hibit);
+asmlinkage void poly1305_blocks_neon(struct poly1305_block_state *state,
+				     const u8 *src, u32 len, u32 hibit);
+asmlinkage void poly1305_emit_arch(const struct poly1305_state *state,
+				   u8 digest[POLY1305_DIGEST_SIZE],
+				   const u32 nonce[4]);
+EXPORT_SYMBOL_GPL(poly1305_emit_arch);
+
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon);
+
+void poly1305_blocks_arch(struct poly1305_block_state *state, const u8 *src,
+			  unsigned int len, u32 padbit)
+{
+	len = round_down(len, POLY1305_BLOCK_SIZE);	/* whole blocks only */
+	if (static_branch_likely(&have_neon)) {
+		do {
+			unsigned int todo = min_t(unsigned int, len, SZ_4K);
+
+			/* At most 4K per NEON section; pass the caller's padbit through. */
+			kernel_neon_begin();
+			poly1305_blocks_neon(state, src, todo, padbit);
+			kernel_neon_end();
+			len -= todo;
+			src += todo;
+		} while (len);
+	} else
+		poly1305_blocks(state, src, len, padbit);
+}
+EXPORT_SYMBOL_GPL(poly1305_blocks_arch);
+
+bool poly1305_is_arch_optimized(void)
+{
+	/* The ARM64 scalar implementation is always usable, even without NEON. */
+	return true;
+}
+EXPORT_SYMBOL(poly1305_is_arch_optimized);
+
+static int __init neon_poly1305_mod_init(void)
+{
+	if (cpu_have_named_feature(ASIMD))
+		static_branch_enable(&have_neon);	/* otherwise the key keeps its false default */
+	return 0;
+}
+subsys_initcall(neon_poly1305_mod_init);
+
+static void __exit neon_poly1305_mod_exit(void)
+{
+}
+module_exit(neon_poly1305_mod_exit);
+
+MODULE_DESCRIPTION("Poly1305 authenticator (ARM64 optimized)");
+MODULE_LICENSE("GPL v2");
diff --git a/arch/arm64/crypto/sha512-armv8.pl b/arch/arm64/lib/crypto/sha2-armv8.pl
index 35ec9ae99fe1..4aebd20c498b 100644
--- a/arch/arm64/crypto/sha512-armv8.pl
+++ b/arch/arm64/lib/crypto/sha2-armv8.pl
@@ -95,7 +95,7 @@ if ($output =~ /512/) {
 	$reg_t="w";
 }
 
-$func="sha${BITS}_block_data_order";
+$func="sha${BITS}_blocks_arch";
 
 ($ctx,$inp,$num,$Ktbl)=map("x$_",(0..2,30));
 
diff --git a/arch/arm64/crypto/sha2-ce-core.S b/arch/arm64/lib/crypto/sha256-ce.S
index fce84d88ddb2..f3e21c6d87d2 100644
--- a/arch/arm64/crypto/sha2-ce-core.S
+++ b/arch/arm64/lib/crypto/sha256-ce.S
@@ -71,8 +71,8 @@
 	.word		0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
 
 	/*
-	 * int __sha256_ce_transform(struct sha256_ce_state *sst, u8 const *src,
-	 *			     int blocks)
+	 * size_t __sha256_ce_transform(u32 state[SHA256_STATE_WORDS],
+	 *				const u8 *data, size_t nblocks);
 	 */
 	.text
 SYM_FUNC_START(__sha256_ce_transform)
@@ -86,20 +86,16 @@ SYM_FUNC_START(__sha256_ce_transform)
 	/* load state */
 	ld1		{dgav.4s, dgbv.4s}, [x0]
 
-	/* load sha256_ce_state::finalize */
-	ldr_l		w4, sha256_ce_offsetof_finalize, x4
-	ldr		w4, [x0, x4]
-
 	/* load input */
 0:	ld1		{v16.4s-v19.4s}, [x1], #64
-	sub		w2, w2, #1
+	sub		x2, x2, #1
 
 CPU_LE(	rev32		v16.16b, v16.16b	)
 CPU_LE(	rev32		v17.16b, v17.16b	)
 CPU_LE(	rev32		v18.16b, v18.16b	)
 CPU_LE(	rev32		v19.16b, v19.16b	)
 
-1:	add		t0.4s, v16.4s, v0.4s
+	add		t0.4s, v16.4s, v0.4s
 	mov		dg0v.16b, dgav.16b
 	mov		dg1v.16b, dgbv.16b
 
@@ -127,31 +123,14 @@ CPU_LE(	rev32		v19.16b, v19.16b	)
 	add		dgav.4s, dgav.4s, dg0v.4s
 	add		dgbv.4s, dgbv.4s, dg1v.4s
 
-	/* handled all input blocks? */
-	cbz		w2, 2f
-	cond_yield	3f, x5, x6
-	b		0b
+	/* return early if voluntary preemption is needed */
+	cond_yield	1f, x5, x6
 
-	/*
-	 * Final block: add padding and total bit count.
-	 * Skip if the input size was not a round multiple of the block size,
-	 * the padding is handled by the C code in that case.
-	 */
-2:	cbz		x4, 3f
-	ldr_l		w4, sha256_ce_offsetof_count, x4
-	ldr		x4, [x0, x4]
-	movi		v17.2d, #0
-	mov		x8, #0x80000000
-	movi		v18.2d, #0
-	ror		x7, x4, #29		// ror(lsl(x4, 3), 32)
-	fmov		d16, x8
-	mov		x4, #0
-	mov		v19.d[0], xzr
-	mov		v19.d[1], x7
-	b		1b
+	/* handled all input blocks? */
+	cbnz		x2, 0b
 
 	/* store new state */
-3:	st1		{dgav.4s, dgbv.4s}, [x0]
-	mov		w0, w2
+1:	st1		{dgav.4s, dgbv.4s}, [x0]
+	mov		x0, x2
 	ret
 SYM_FUNC_END(__sha256_ce_transform)
diff --git a/arch/arm64/lib/crypto/sha256.c b/arch/arm64/lib/crypto/sha256.c
new file mode 100644
index 000000000000..bcf7a3adc0c4
--- /dev/null
+++ b/arch/arm64/lib/crypto/sha256.c
@@ -0,0 +1,75 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * SHA-256 optimized for ARM64
+ *
+ * Copyright 2025 Google LLC
+ */
+#include <asm/neon.h>
+#include <crypto/internal/sha2.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+
+asmlinkage void sha256_blocks_arch(u32 state[SHA256_STATE_WORDS],
+				   const u8 *data, size_t nblocks);
+EXPORT_SYMBOL_GPL(sha256_blocks_arch);
+asmlinkage void sha256_block_neon(u32 state[SHA256_STATE_WORDS],
+				  const u8 *data, size_t nblocks);
+asmlinkage size_t __sha256_ce_transform(u32 state[SHA256_STATE_WORDS],
+					const u8 *data, size_t nblocks);
+
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon);
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_ce);
+
+void sha256_blocks_simd(u32 state[SHA256_STATE_WORDS],
+			const u8 *data, size_t nblocks)
+{	/* Picks the CE, NEON or sha256_blocks_arch() block function. */
+	if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) &&
+	    static_branch_likely(&have_neon)) {
+		if (static_branch_likely(&have_ce)) {
+			do {	/* one NEON section per asm call */
+				size_t rem;	/* blocks not yet hashed */
+
+				kernel_neon_begin();
+				rem = __sha256_ce_transform(state,
+							    data, nblocks);
+				kernel_neon_end();
+				data += (nblocks - rem) * SHA256_BLOCK_SIZE;
+				nblocks = rem;
+			} while (nblocks);	/* asm may return early */
+		} else {
+			kernel_neon_begin();
+			sha256_block_neon(state, data, nblocks);
+			kernel_neon_end();
+		}
+	} else {
+		sha256_blocks_arch(state, data, nblocks);
+	}
+}
+EXPORT_SYMBOL_GPL(sha256_blocks_simd);
+
+bool sha256_is_arch_optimized(void)
+{
+	/* We can always use at least the ARM64 scalar implementation. */
+	return true;
+}
+EXPORT_SYMBOL_GPL(sha256_is_arch_optimized);
+
+static int __init sha256_arm64_mod_init(void)
+{	/* Enable the have_neon/have_ce static keys from CPU features. */
+	if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) &&
+	    cpu_have_named_feature(ASIMD)) {
+		static_branch_enable(&have_neon);
+		if (cpu_have_named_feature(SHA2))
+			static_branch_enable(&have_ce);	/* implies have_neon */
+	}
+	return 0;
+}
+subsys_initcall(sha256_arm64_mod_init);
+
+static void __exit sha256_arm64_mod_exit(void)
+{
+}
+module_exit(sha256_arm64_mod_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("SHA-256 optimized for ARM64");
diff --git a/arch/arm64/lib/insn.c b/arch/arm64/lib/insn.c
index 9bef696e2230..4e298baddc2e 100644
--- a/arch/arm64/lib/insn.c
+++ b/arch/arm64/lib/insn.c
@@ -5,6 +5,7 @@
  *
  * Copyright (C) 2014-2016 Zi Shen Lim <zlim.lnx@gmail.com>
  */
+#include <linux/bitfield.h>
 #include <linux/bitops.h>
 #include <linux/bug.h>
 #include <linux/printk.h>
@@ -1500,43 +1501,41 @@ u32 aarch64_insn_gen_extr(enum aarch64_insn_variant variant,
 	return aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RM, insn, Rm);
 }
 
-u32 aarch64_insn_gen_dmb(enum aarch64_insn_mb_type type)
+static u32 __get_barrier_crm_val(enum aarch64_insn_mb_type type)
 {
-	u32 opt;
-	u32 insn;
-
 	switch (type) {
 	case AARCH64_INSN_MB_SY:
-		opt = 0xf;
-		break;
+		return 0xf;
 	case AARCH64_INSN_MB_ST:
-		opt = 0xe;
-		break;
+		return 0xe;
 	case AARCH64_INSN_MB_LD:
-		opt = 0xd;
-		break;
+		return 0xd;
 	case AARCH64_INSN_MB_ISH:
-		opt = 0xb;
-		break;
+		return 0xb;
 	case AARCH64_INSN_MB_ISHST:
-		opt = 0xa;
-		break;
+		return 0xa;
 	case AARCH64_INSN_MB_ISHLD:
-		opt = 0x9;
-		break;
+		return 0x9;
 	case AARCH64_INSN_MB_NSH:
-		opt = 0x7;
-		break;
+		return 0x7;
 	case AARCH64_INSN_MB_NSHST:
-		opt = 0x6;
-		break;
+		return 0x6;
 	case AARCH64_INSN_MB_NSHLD:
-		opt = 0x5;
-		break;
+		return 0x5;
 	default:
-		pr_err("%s: unknown dmb type %d\n", __func__, type);
+		pr_err("%s: unknown barrier type %d\n", __func__, type);
 		return AARCH64_BREAK_FAULT;
 	}
+}
+
+u32 aarch64_insn_gen_dmb(enum aarch64_insn_mb_type type)
+{
+	u32 opt;
+	u32 insn;
+
+	opt = __get_barrier_crm_val(type);
+	if (opt == AARCH64_BREAK_FAULT)
+		return AARCH64_BREAK_FAULT;
 
 	insn = aarch64_insn_get_dmb_value();
 	insn &= ~GENMASK(11, 8);
@@ -1545,6 +1544,21 @@ u32 aarch64_insn_gen_dmb(enum aarch64_insn_mb_type type)
 	return insn;
 }
 
+u32 aarch64_insn_gen_dsb(enum aarch64_insn_mb_type type)
+{	/* Encode a DSB for barrier @type; AARCH64_BREAK_FAULT if unknown. */
+	u32 opt, insn;
+
+	opt = __get_barrier_crm_val(type);
+	if (opt == AARCH64_BREAK_FAULT)
+		return AARCH64_BREAK_FAULT;
+
+	insn = aarch64_insn_get_dsb_base_value();
+	insn &= ~GENMASK(11, 8);	/* clear bits 11:8 */
+	insn |= (opt << 8);		/* insert opt at bits 11:8 */
+
+	return insn;
+}
+
 u32 aarch64_insn_gen_mrs(enum aarch64_insn_register result,
 			 enum aarch64_insn_system_register sysreg)
 {
diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c
index 70d7c89d3ac9..634d78422adb 100644
--- a/arch/arm64/net/bpf_jit_comp.c
+++ b/arch/arm64/net/bpf_jit_comp.c
@@ -7,6 +7,7 @@
 
 #define pr_fmt(fmt) "bpf_jit: " fmt
 
+#include <linux/arm-smccc.h>
 #include <linux/bitfield.h>
 #include <linux/bpf.h>
 #include <linux/filter.h>
@@ -17,6 +18,7 @@
 #include <asm/asm-extable.h>
 #include <asm/byteorder.h>
 #include <asm/cacheflush.h>
+#include <asm/cpufeature.h>
 #include <asm/debug-monitors.h>
 #include <asm/insn.h>
 #include <asm/text-patching.h>
@@ -939,7 +941,51 @@ static void build_plt(struct jit_ctx *ctx)
 		plt->target = (u64)&dummy_tramp;
 }
 
-static void build_epilogue(struct jit_ctx *ctx)
+/* Emit the BHB mitigation sequence. Clobbers BPF registers 1-4, aka x0-x3 */
+static void __maybe_unused build_bhb_mitigation(struct jit_ctx *ctx)
+{
+	const u8 r1 = bpf2a64[BPF_REG_1]; /* aka x0 */
+	u8 k = get_spectre_bhb_loop_value();	/* loop count; 0 = no loop */
+
+	if (!IS_ENABLED(CONFIG_MITIGATE_SPECTRE_BRANCH_HISTORY) ||
+	    cpu_mitigations_off() || __nospectre_bhb ||
+	    arm64_get_spectre_v2_state() == SPECTRE_VULNERABLE)
+		return;
+
+	if (capable(CAP_SYS_ADMIN))
+		return;
+
+	if (supports_clearbhb(SCOPE_SYSTEM)) {
+		emit(aarch64_insn_gen_hint(AARCH64_INSN_HINT_CLEARBHB), ctx);
+		return;		/* only the CLEARBHB hint is emitted */
+	}
+
+	if (k) {	/* countdown loop on r1, then DSB ISH + ISB */
+		emit_a64_mov_i64(r1, k, ctx);
+		emit(A64_B(1), ctx);
+		emit(A64_SUBS_I(true, r1, r1, 1), ctx);
+		emit(A64_B_(A64_COND_NE, -2), ctx);
+		emit(aarch64_insn_gen_dsb(AARCH64_INSN_MB_ISH), ctx);
+		emit(aarch64_insn_get_isb_value(), ctx);
+	}
+
+	if (is_spectre_bhb_fw_mitigated()) {
+		emit(A64_ORR_I(false, r1, AARCH64_INSN_REG_ZR,
+			       ARM_SMCCC_ARCH_WORKAROUND_3), ctx);
+		switch (arm_smccc_1_1_get_conduit()) {
+		case SMCCC_CONDUIT_HVC:
+			emit(aarch64_insn_get_hvc_value(), ctx);
+			break;
+		case SMCCC_CONDUIT_SMC:
+			emit(aarch64_insn_get_smc_value(), ctx);
+			break;
+		default:	/* no call instruction is emitted */
+			pr_err_once("Firmware mitigation enabled with unknown conduit\n");
+		}
+	}
+}
+
+static void build_epilogue(struct jit_ctx *ctx, bool was_classic)
 {
 	const u8 r0 = bpf2a64[BPF_REG_0];
 	const u8 ptr = bpf2a64[TCCNT_PTR];
@@ -952,10 +998,13 @@ static void build_epilogue(struct jit_ctx *ctx)
 
 	emit(A64_POP(A64_ZR, ptr, A64_SP), ctx);
 
+	if (was_classic)
+		build_bhb_mitigation(ctx);
+
 	/* Restore FP/LR registers */
 	emit(A64_POP(A64_FP, A64_LR, A64_SP), ctx);
 
-	/* Set return value */
+	/* Move the return value from bpf:r0 (aka x7) to x0 */
 	emit(A64_MOV(1, A64_R(0), r0), ctx);
 
 	/* Authenticate lr */
@@ -1898,7 +1947,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
 	}
 
 	ctx.epilogue_offset = ctx.idx;
-	build_epilogue(&ctx);
+	build_epilogue(&ctx, was_classic);
 	build_plt(&ctx);
 
 	extable_align = __alignof__(struct exception_table_entry);
@@ -1961,7 +2010,7 @@ skip_init_ctx:
 		goto out_free_hdr;
 	}
 
-	build_epilogue(&ctx);
+	build_epilogue(&ctx, was_classic);
 	build_plt(&ctx);
 
 	/* Extra pass to validate JITed code. */
diff --git a/arch/loongarch/Kconfig b/arch/loongarch/Kconfig
index 067c0b994648..1a2cf012b8f2 100644
--- a/arch/loongarch/Kconfig
+++ b/arch/loongarch/Kconfig
@@ -73,6 +73,7 @@ config LOONGARCH
 	select ARCH_SUPPORTS_RT
 	select ARCH_USE_BUILTIN_BSWAP
 	select ARCH_USE_CMPXCHG_LOCKREF
+	select ARCH_USE_MEMTEST
 	select ARCH_USE_QUEUED_RWLOCKS
 	select ARCH_USE_QUEUED_SPINLOCKS
 	select ARCH_WANT_DEFAULT_BPF_JIT
diff --git a/arch/loongarch/configs/loongson3_defconfig b/arch/loongarch/configs/loongson3_defconfig
index 90f21dfe22b1..0d59af6007b7 100644
--- a/arch/loongarch/configs/loongson3_defconfig
+++ b/arch/loongarch/configs/loongson3_defconfig
@@ -1026,7 +1026,7 @@ CONFIG_SECURITY_APPARMOR=y
 CONFIG_SECURITY_YAMA=y
 CONFIG_DEFAULT_SECURITY_DAC=y
 CONFIG_CRYPTO_USER=m
-# CONFIG_CRYPTO_MANAGER_DISABLE_TESTS is not set
+CONFIG_CRYPTO_SELFTESTS=y
 CONFIG_CRYPTO_PCRYPT=m
 CONFIG_CRYPTO_CRYPTD=m
 CONFIG_CRYPTO_ANUBIS=m
diff --git a/arch/loongarch/include/asm/fpu.h b/arch/loongarch/include/asm/fpu.h
index 3177674228f8..45514f314664 100644
--- a/arch/loongarch/include/asm/fpu.h
+++ b/arch/loongarch/include/asm/fpu.h
@@ -22,22 +22,29 @@
 struct sigcontext;
 
 #define kernel_fpu_available() cpu_has_fpu
-extern void kernel_fpu_begin(void);
-extern void kernel_fpu_end(void);
-
-extern void _init_fpu(unsigned int);
-extern void _save_fp(struct loongarch_fpu *);
-extern void _restore_fp(struct loongarch_fpu *);
-
-extern void _save_lsx(struct loongarch_fpu *fpu);
-extern void _restore_lsx(struct loongarch_fpu *fpu);
-extern void _init_lsx_upper(void);
-extern void _restore_lsx_upper(struct loongarch_fpu *fpu);
-
-extern void _save_lasx(struct loongarch_fpu *fpu);
-extern void _restore_lasx(struct loongarch_fpu *fpu);
-extern void _init_lasx_upper(void);
-extern void _restore_lasx_upper(struct loongarch_fpu *fpu);
+
+void kernel_fpu_begin(void);
+void kernel_fpu_end(void);
+
+asmlinkage void _init_fpu(unsigned int);
+asmlinkage void _save_fp(struct loongarch_fpu *);
+asmlinkage void _restore_fp(struct loongarch_fpu *);
+asmlinkage int _save_fp_context(void __user *fpregs, void __user *fcc, void __user *csr);
+asmlinkage int _restore_fp_context(void __user *fpregs, void __user *fcc, void __user *csr);
+
+asmlinkage void _save_lsx(struct loongarch_fpu *fpu);
+asmlinkage void _restore_lsx(struct loongarch_fpu *fpu);
+asmlinkage void _init_lsx_upper(void);
+asmlinkage void _restore_lsx_upper(struct loongarch_fpu *fpu);
+asmlinkage int _save_lsx_context(void __user *fpregs, void __user *fcc, void __user *fcsr);
+asmlinkage int _restore_lsx_context(void __user *fpregs, void __user *fcc, void __user *fcsr);
+
+asmlinkage void _save_lasx(struct loongarch_fpu *fpu);
+asmlinkage void _restore_lasx(struct loongarch_fpu *fpu);
+asmlinkage void _init_lasx_upper(void);
+asmlinkage void _restore_lasx_upper(struct loongarch_fpu *fpu);
+asmlinkage int _save_lasx_context(void __user *fpregs, void __user *fcc, void __user *fcsr);
+asmlinkage int _restore_lasx_context(void __user *fpregs, void __user *fcc, void __user *fcsr);
 
 static inline void enable_lsx(void);
 static inline void disable_lsx(void);
diff --git a/arch/loongarch/include/asm/lbt.h b/arch/loongarch/include/asm/lbt.h
index e671978bf552..38566574e562 100644
--- a/arch/loongarch/include/asm/lbt.h
+++ b/arch/loongarch/include/asm/lbt.h
@@ -12,9 +12,13 @@
 #include <asm/loongarch.h>
 #include <asm/processor.h>
 
-extern void _init_lbt(void);
-extern void _save_lbt(struct loongarch_lbt *);
-extern void _restore_lbt(struct loongarch_lbt *);
+asmlinkage void _init_lbt(void);
+asmlinkage void _save_lbt(struct loongarch_lbt *);
+asmlinkage void _restore_lbt(struct loongarch_lbt *);
+asmlinkage int _save_lbt_context(void __user *regs, void __user *eflags);
+asmlinkage int _restore_lbt_context(void __user *regs, void __user *eflags);
+asmlinkage int _save_ftop_context(void __user *ftop);
+asmlinkage int _restore_ftop_context(void __user *ftop);
 
 static inline int is_lbt_enabled(void)
 {
diff --git a/arch/loongarch/include/asm/ptrace.h b/arch/loongarch/include/asm/ptrace.h
index f3ddaed9ef7f..e5d21e836d99 100644
--- a/arch/loongarch/include/asm/ptrace.h
+++ b/arch/loongarch/include/asm/ptrace.h
@@ -33,9 +33,9 @@ struct pt_regs {
 	unsigned long __last[];
 } __aligned(8);
 
-static inline int regs_irqs_disabled(struct pt_regs *regs)
+static __always_inline bool regs_irqs_disabled(struct pt_regs *regs)
 {
-	return arch_irqs_disabled_flags(regs->csr_prmd);
+	return !(regs->csr_prmd & CSR_PRMD_PIE);
 }
 
 static inline unsigned long kernel_stack_pointer(struct pt_regs *regs)
@@ -55,7 +55,7 @@ static inline void instruction_pointer_set(struct pt_regs *regs, unsigned long v
 
 /* Query offset/name of register from its name/offset */
 extern int regs_query_register_offset(const char *name);
-#define MAX_REG_OFFSET (offsetof(struct pt_regs, __last))
+#define MAX_REG_OFFSET (offsetof(struct pt_regs, __last) - sizeof(unsigned long))
 
 /**
  * regs_get_register() - get register value from its offset
diff --git a/arch/loongarch/include/asm/uprobes.h b/arch/loongarch/include/asm/uprobes.h
index 99a0d198927f..025fc3f0a102 100644
--- a/arch/loongarch/include/asm/uprobes.h
+++ b/arch/loongarch/include/asm/uprobes.h
@@ -15,7 +15,6 @@ typedef u32 uprobe_opcode_t;
 #define UPROBE_XOLBP_INSN	__emit_break(BRK_UPROBE_XOLBP)
 
 struct arch_uprobe {
-	unsigned long	resume_era;
 	u32	insn[2];
 	u32	ixol[2];
 	bool	simulate;
diff --git a/arch/loongarch/kernel/Makefile b/arch/loongarch/kernel/Makefile
index 4853e8b04c6f..f9dcaa60033d 100644
--- a/arch/loongarch/kernel/Makefile
+++ b/arch/loongarch/kernel/Makefile
@@ -21,10 +21,10 @@ obj-$(CONFIG_CPU_HAS_LBT)	+= lbt.o
 
 obj-$(CONFIG_ARCH_STRICT_ALIGN)	+= unaligned.o
 
-CFLAGS_module.o		+= $(call cc-option,-Wno-override-init,)
-CFLAGS_syscall.o	+= $(call cc-option,-Wno-override-init,)
-CFLAGS_traps.o		+= $(call cc-option,-Wno-override-init,)
-CFLAGS_perf_event.o	+= $(call cc-option,-Wno-override-init,)
+CFLAGS_module.o		+= $(call cc-disable-warning, override-init)
+CFLAGS_syscall.o	+= $(call cc-disable-warning, override-init)
+CFLAGS_traps.o		+= $(call cc-disable-warning, override-init)
+CFLAGS_perf_event.o	+= $(call cc-disable-warning, override-init)
 
 ifdef CONFIG_FUNCTION_TRACER
   ifndef CONFIG_DYNAMIC_FTRACE
diff --git a/arch/loongarch/kernel/fpu.S b/arch/loongarch/kernel/fpu.S
index 6ab640101457..28caf416ae36 100644
--- a/arch/loongarch/kernel/fpu.S
+++ b/arch/loongarch/kernel/fpu.S
@@ -458,6 +458,7 @@ SYM_FUNC_START(_save_fp_context)
 	li.w		a0, 0				# success
 	jr		ra
 SYM_FUNC_END(_save_fp_context)
+EXPORT_SYMBOL_GPL(_save_fp_context)
 
 /*
  * a0: fpregs
@@ -471,6 +472,7 @@ SYM_FUNC_START(_restore_fp_context)
 	li.w		a0, 0				# success
 	jr		ra
 SYM_FUNC_END(_restore_fp_context)
+EXPORT_SYMBOL_GPL(_restore_fp_context)
 
 /*
  * a0: fpregs
@@ -484,6 +486,7 @@ SYM_FUNC_START(_save_lsx_context)
 	li.w	a0, 0					# success
 	jr	ra
 SYM_FUNC_END(_save_lsx_context)
+EXPORT_SYMBOL_GPL(_save_lsx_context)
 
 /*
  * a0: fpregs
@@ -497,6 +500,7 @@ SYM_FUNC_START(_restore_lsx_context)
 	li.w	a0, 0					# success
 	jr	ra
 SYM_FUNC_END(_restore_lsx_context)
+EXPORT_SYMBOL_GPL(_restore_lsx_context)
 
 /*
  * a0: fpregs
@@ -510,6 +514,7 @@ SYM_FUNC_START(_save_lasx_context)
 	li.w	a0, 0					# success
 	jr	ra
 SYM_FUNC_END(_save_lasx_context)
+EXPORT_SYMBOL_GPL(_save_lasx_context)
 
 /*
  * a0: fpregs
@@ -523,6 +528,7 @@ SYM_FUNC_START(_restore_lasx_context)
 	li.w	a0, 0					# success
 	jr	ra
 SYM_FUNC_END(_restore_lasx_context)
+EXPORT_SYMBOL_GPL(_restore_lasx_context)
 
 .L_fpu_fault:
 	li.w	a0, -EFAULT				# failure
diff --git a/arch/loongarch/kernel/genex.S b/arch/loongarch/kernel/genex.S
index 4f0912141781..733a7665e434 100644
--- a/arch/loongarch/kernel/genex.S
+++ b/arch/loongarch/kernel/genex.S
@@ -16,6 +16,7 @@
 #include <asm/stackframe.h>
 #include <asm/thread_info.h>
 
+	.section .cpuidle.text, "ax"
 	.align	5
 SYM_FUNC_START(__arch_cpu_idle)
 	/* start of idle interrupt region */
@@ -31,14 +32,16 @@ SYM_FUNC_START(__arch_cpu_idle)
 	 */
 	idle	0
 	/* end of idle interrupt region */
-1:	jr	ra
+idle_exit:
+	jr	ra
 SYM_FUNC_END(__arch_cpu_idle)
+	.previous
 
 SYM_CODE_START(handle_vint)
 	UNWIND_HINT_UNDEFINED
 	BACKUP_T0T1
 	SAVE_ALL
-	la_abs	t1, 1b
+	la_abs	t1, idle_exit
 	LONG_L	t0, sp, PT_ERA
 	/* 3 instructions idle interrupt region */
 	ori	t0, t0, 0b1100
diff --git a/arch/loongarch/kernel/kfpu.c b/arch/loongarch/kernel/kfpu.c
index ec5b28e570c9..4c476904227f 100644
--- a/arch/loongarch/kernel/kfpu.c
+++ b/arch/loongarch/kernel/kfpu.c
@@ -18,11 +18,28 @@ static unsigned int euen_mask = CSR_EUEN_FPEN;
 static DEFINE_PER_CPU(bool, in_kernel_fpu);
 static DEFINE_PER_CPU(unsigned int, euen_current);
 
+static inline void fpregs_lock(void)
+{	/* PREEMPT_RT: disable preemption; otherwise disable bottom halves. */
+	if (IS_ENABLED(CONFIG_PREEMPT_RT))
+		preempt_disable();
+	else
+		local_bh_disable();
+}
+
+static inline void fpregs_unlock(void)
+{	/* Undo fpregs_lock(): re-enable preemption or bottom halves. */
+	if (IS_ENABLED(CONFIG_PREEMPT_RT))
+		preempt_enable();
+	else
+		local_bh_enable();
+}
+
 void kernel_fpu_begin(void)
 {
 	unsigned int *euen_curr;
 
-	preempt_disable();
+	if (!irqs_disabled())
+		fpregs_lock();
 
 	WARN_ON(this_cpu_read(in_kernel_fpu));
 
@@ -73,7 +90,8 @@ void kernel_fpu_end(void)
 
 	this_cpu_write(in_kernel_fpu, false);
 
-	preempt_enable();
+	if (!irqs_disabled())
+		fpregs_unlock();
 }
 EXPORT_SYMBOL_GPL(kernel_fpu_end);
 
diff --git a/arch/loongarch/kernel/lbt.S b/arch/loongarch/kernel/lbt.S
index 001f061d226a..71678912d24c 100644
--- a/arch/loongarch/kernel/lbt.S
+++ b/arch/loongarch/kernel/lbt.S
@@ -90,6 +90,7 @@ SYM_FUNC_START(_save_lbt_context)
 	li.w		a0, 0			# success
 	jr		ra
 SYM_FUNC_END(_save_lbt_context)
+EXPORT_SYMBOL_GPL(_save_lbt_context)
 
 /*
  * a0: scr
@@ -110,6 +111,7 @@ SYM_FUNC_START(_restore_lbt_context)
 	li.w		a0, 0			# success
 	jr		ra
 SYM_FUNC_END(_restore_lbt_context)
+EXPORT_SYMBOL_GPL(_restore_lbt_context)
 
 /*
  * a0: ftop
@@ -120,6 +122,7 @@ SYM_FUNC_START(_save_ftop_context)
 	li.w		a0, 0			# success
 	jr		ra
 SYM_FUNC_END(_save_ftop_context)
+EXPORT_SYMBOL_GPL(_save_ftop_context)
 
 /*
  * a0: ftop
@@ -150,6 +153,7 @@ SYM_FUNC_START(_restore_ftop_context)
 	li.w		a0, 0			# success
 	jr		ra
 SYM_FUNC_END(_restore_ftop_context)
+EXPORT_SYMBOL_GPL(_restore_ftop_context)
 
 .L_lbt_fault:
 	li.w		a0, -EFAULT		# failure
diff --git a/arch/loongarch/kernel/signal.c b/arch/loongarch/kernel/signal.c
index 7a555b600171..4740cb5b2388 100644
--- a/arch/loongarch/kernel/signal.c
+++ b/arch/loongarch/kernel/signal.c
@@ -51,27 +51,6 @@
 #define lock_lbt_owner()	({ preempt_disable(); pagefault_disable(); })
 #define unlock_lbt_owner()	({ pagefault_enable(); preempt_enable(); })
 
-/* Assembly functions to move context to/from the FPU */
-extern asmlinkage int
-_save_fp_context(void __user *fpregs, void __user *fcc, void __user *csr);
-extern asmlinkage int
-_restore_fp_context(void __user *fpregs, void __user *fcc, void __user *csr);
-extern asmlinkage int
-_save_lsx_context(void __user *fpregs, void __user *fcc, void __user *fcsr);
-extern asmlinkage int
-_restore_lsx_context(void __user *fpregs, void __user *fcc, void __user *fcsr);
-extern asmlinkage int
-_save_lasx_context(void __user *fpregs, void __user *fcc, void __user *fcsr);
-extern asmlinkage int
-_restore_lasx_context(void __user *fpregs, void __user *fcc, void __user *fcsr);
-
-#ifdef CONFIG_CPU_HAS_LBT
-extern asmlinkage int _save_lbt_context(void __user *regs, void __user *eflags);
-extern asmlinkage int _restore_lbt_context(void __user *regs, void __user *eflags);
-extern asmlinkage int _save_ftop_context(void __user *ftop);
-extern asmlinkage int _restore_ftop_context(void __user *ftop);
-#endif
-
 struct rt_sigframe {
 	struct siginfo rs_info;
 	struct ucontext rs_uctx;
diff --git a/arch/loongarch/kernel/time.c b/arch/loongarch/kernel/time.c
index e2d3bfeb6366..bc75a3a69fc8 100644
--- a/arch/loongarch/kernel/time.c
+++ b/arch/loongarch/kernel/time.c
@@ -111,7 +111,7 @@ static unsigned long __init get_loops_per_jiffy(void)
 	return lpj;
 }
 
-static long init_offset __nosavedata;
+static long init_offset;
 
 void save_counter(void)
 {
diff --git a/arch/loongarch/kernel/traps.c b/arch/loongarch/kernel/traps.c
index 2ec3106c0da3..47fc2de6d150 100644
--- a/arch/loongarch/kernel/traps.c
+++ b/arch/loongarch/kernel/traps.c
@@ -553,9 +553,10 @@ asmlinkage void noinstr do_ale(struct pt_regs *regs)
 	die_if_kernel("Kernel ale access", regs);
 	force_sig_fault(SIGBUS, BUS_ADRALN, (void __user *)regs->csr_badvaddr);
 #else
+	bool pie = regs_irqs_disabled(regs);
 	unsigned int *pc;
 
-	if (regs->csr_prmd & CSR_PRMD_PIE)
+	if (!pie)
 		local_irq_enable();
 
 	perf_sw_event(PERF_COUNT_SW_ALIGNMENT_FAULTS, 1, regs, regs->csr_badvaddr);
@@ -582,7 +583,7 @@ sigbus:
 	die_if_kernel("Kernel ale access", regs);
 	force_sig_fault(SIGBUS, BUS_ADRALN, (void __user *)regs->csr_badvaddr);
 out:
-	if (regs->csr_prmd & CSR_PRMD_PIE)
+	if (!pie)
 		local_irq_disable();
 #endif
 	irqentry_exit(regs, state);
@@ -621,12 +622,13 @@ static void bug_handler(struct pt_regs *regs)
 asmlinkage void noinstr do_bce(struct pt_regs *regs)
 {
 	bool user = user_mode(regs);
+	bool pie = regs_irqs_disabled(regs);
 	unsigned long era = exception_era(regs);
 	u64 badv = 0, lower = 0, upper = ULONG_MAX;
 	union loongarch_instruction insn;
 	irqentry_state_t state = irqentry_enter(regs);
 
-	if (regs->csr_prmd & CSR_PRMD_PIE)
+	if (!pie)
 		local_irq_enable();
 
 	current->thread.trap_nr = read_csr_excode();
@@ -692,7 +694,7 @@ asmlinkage void noinstr do_bce(struct pt_regs *regs)
 	force_sig_bnderr((void __user *)badv, (void __user *)lower, (void __user *)upper);
 
 out:
-	if (regs->csr_prmd & CSR_PRMD_PIE)
+	if (!pie)
 		local_irq_disable();
 
 	irqentry_exit(regs, state);
@@ -710,11 +712,12 @@ bad_era:
 asmlinkage void noinstr do_bp(struct pt_regs *regs)
 {
 	bool user = user_mode(regs);
+	bool pie = regs_irqs_disabled(regs);
 	unsigned int opcode, bcode;
 	unsigned long era = exception_era(regs);
 	irqentry_state_t state = irqentry_enter(regs);
 
-	if (regs->csr_prmd & CSR_PRMD_PIE)
+	if (!pie)
 		local_irq_enable();
 
 	if (__get_inst(&opcode, (u32 *)era, user))
@@ -780,7 +783,7 @@ asmlinkage void noinstr do_bp(struct pt_regs *regs)
 	}
 
 out:
-	if (regs->csr_prmd & CSR_PRMD_PIE)
+	if (!pie)
 		local_irq_disable();
 
 	irqentry_exit(regs, state);
@@ -1015,6 +1018,7 @@ static void init_restore_lbt(void)
 
 asmlinkage void noinstr do_lbt(struct pt_regs *regs)
 {
+	bool pie = regs_irqs_disabled(regs);
 	irqentry_state_t state = irqentry_enter(regs);
 
 	/*
@@ -1024,7 +1028,7 @@ asmlinkage void noinstr do_lbt(struct pt_regs *regs)
 	 * (including the user using 'MOVGR2GCSR' to turn on TM, which
 	 * will not trigger the BTE), we need to check PRMD first.
 	 */
-	if (regs->csr_prmd & CSR_PRMD_PIE)
+	if (!pie)
 		local_irq_enable();
 
 	if (!cpu_has_lbt) {
@@ -1038,7 +1042,7 @@ asmlinkage void noinstr do_lbt(struct pt_regs *regs)
 	preempt_enable();
 
 out:
-	if (regs->csr_prmd & CSR_PRMD_PIE)
+	if (!pie)
 		local_irq_disable();
 
 	irqentry_exit(regs, state);
diff --git a/arch/loongarch/kernel/uprobes.c b/arch/loongarch/kernel/uprobes.c
index 87abc7137b73..6022eb0f71db 100644
--- a/arch/loongarch/kernel/uprobes.c
+++ b/arch/loongarch/kernel/uprobes.c
@@ -42,7 +42,6 @@ int arch_uprobe_pre_xol(struct arch_uprobe *auprobe, struct pt_regs *regs)
 	utask->autask.saved_trap_nr = current->thread.trap_nr;
 	current->thread.trap_nr = UPROBE_TRAP_NR;
 	instruction_pointer_set(regs, utask->xol_vaddr);
-	user_enable_single_step(current);
 
 	return 0;
 }
@@ -53,13 +52,7 @@ int arch_uprobe_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs)
 
 	WARN_ON_ONCE(current->thread.trap_nr != UPROBE_TRAP_NR);
 	current->thread.trap_nr = utask->autask.saved_trap_nr;
-
-	if (auprobe->simulate)
-		instruction_pointer_set(regs, auprobe->resume_era);
-	else
-		instruction_pointer_set(regs, utask->vaddr + LOONGARCH_INSN_SIZE);
-
-	user_disable_single_step(current);
+	instruction_pointer_set(regs, utask->vaddr + LOONGARCH_INSN_SIZE);
 
 	return 0;
 }
@@ -70,7 +63,6 @@ void arch_uprobe_abort_xol(struct arch_uprobe *auprobe, struct pt_regs *regs)
 
 	current->thread.trap_nr = utask->autask.saved_trap_nr;
 	instruction_pointer_set(regs, utask->vaddr);
-	user_disable_single_step(current);
 }
 
 bool arch_uprobe_xol_was_trapped(struct task_struct *t)
@@ -90,7 +82,6 @@ bool arch_uprobe_skip_sstep(struct arch_uprobe *auprobe, struct pt_regs *regs)
 
 	insn.word = auprobe->insn[0];
 	arch_simulate_insn(insn, regs);
-	auprobe->resume_era = regs->csr_era;
 
 	return true;
 }
diff --git a/arch/loongarch/kvm/Makefile b/arch/loongarch/kvm/Makefile
index f4c8e35c216a..cb41d9265662 100644
--- a/arch/loongarch/kvm/Makefile
+++ b/arch/loongarch/kvm/Makefile
@@ -21,4 +21,4 @@ kvm-y += intc/eiointc.o
 kvm-y += intc/pch_pic.o
 kvm-y += irqfd.o
 
-CFLAGS_exit.o	+= $(call cc-option,-Wno-override-init,)
+CFLAGS_exit.o	+= $(call cc-disable-warning, override-init)
diff --git a/arch/loongarch/kvm/intc/ipi.c b/arch/loongarch/kvm/intc/ipi.c
index 93f4acd44523..fe734dc062ed 100644
--- a/arch/loongarch/kvm/intc/ipi.c
+++ b/arch/loongarch/kvm/intc/ipi.c
@@ -111,7 +111,7 @@ static int send_ipi_data(struct kvm_vcpu *vcpu, gpa_t addr, uint64_t data)
 		ret = kvm_io_bus_read(vcpu, KVM_IOCSR_BUS, addr, sizeof(val), &val);
 		srcu_read_unlock(&vcpu->kvm->srcu, idx);
 		if (unlikely(ret)) {
-			kvm_err("%s: : read date from addr %llx failed\n", __func__, addr);
+			kvm_err("%s: : read data from addr %llx failed\n", __func__, addr);
 			return ret;
 		}
 		/* Construct the mask by scanning the bit 27-30 */
@@ -127,7 +127,7 @@ static int send_ipi_data(struct kvm_vcpu *vcpu, gpa_t addr, uint64_t data)
 	ret = kvm_io_bus_write(vcpu, KVM_IOCSR_BUS, addr, sizeof(val), &val);
 	srcu_read_unlock(&vcpu->kvm->srcu, idx);
 	if (unlikely(ret))
-		kvm_err("%s: : write date to addr %llx failed\n", __func__, addr);
+		kvm_err("%s: : write data to addr %llx failed\n", __func__, addr);
 
 	return ret;
 }
diff --git a/arch/loongarch/kvm/main.c b/arch/loongarch/kvm/main.c
index d165cd38c6bb..80ea63d465b8 100644
--- a/arch/loongarch/kvm/main.c
+++ b/arch/loongarch/kvm/main.c
@@ -296,10 +296,10 @@ int kvm_arch_enable_virtualization_cpu(void)
 	/*
 	 * Enable virtualization features granting guest direct control of
 	 * certain features:
-	 * GCI=2:       Trap on init or unimplement cache instruction.
+	 * GCI=2:       Trap on init or unimplemented cache instruction.
 	 * TORU=0:      Trap on Root Unimplement.
 	 * CACTRL=1:    Root control cache.
-	 * TOP=0:       Trap on Previlege.
+	 * TOP=0:       Trap on Privilege.
 	 * TOE=0:       Trap on Exception.
 	 * TIT=0:       Trap on Timer.
 	 */
diff --git a/arch/loongarch/kvm/vcpu.c b/arch/loongarch/kvm/vcpu.c
index 8e427b379661..5af32ec62cb1 100644
--- a/arch/loongarch/kvm/vcpu.c
+++ b/arch/loongarch/kvm/vcpu.c
@@ -294,6 +294,7 @@ static int kvm_pre_enter_guest(struct kvm_vcpu *vcpu)
 		vcpu->arch.aux_inuse &= ~KVM_LARCH_SWCSR_LATEST;
 
 		if (kvm_request_pending(vcpu) || xfer_to_guest_mode_work_pending()) {
+			kvm_lose_pmu(vcpu);
 			/* make sure the vcpu mode has been written */
 			smp_store_mb(vcpu->mode, OUTSIDE_GUEST_MODE);
 			local_irq_enable();
@@ -902,6 +903,13 @@ static int kvm_set_one_reg(struct kvm_vcpu *vcpu,
 			vcpu->arch.st.guest_addr = 0;
 			memset(&vcpu->arch.irq_pending, 0, sizeof(vcpu->arch.irq_pending));
 			memset(&vcpu->arch.irq_clear, 0, sizeof(vcpu->arch.irq_clear));
+
+			/*
+			 * On vCPU reset, clear the ESTAT and GINTC registers.
+			 * Other CSR registers are cleared by _kvm_setcsr().
+			 */
+			kvm_write_sw_gcsr(vcpu->arch.csr, LOONGARCH_CSR_GINTC, 0);
+			kvm_write_sw_gcsr(vcpu->arch.csr, LOONGARCH_CSR_ESTAT, 0);
 			break;
 		default:
 			ret = -EINVAL;
diff --git a/arch/loongarch/lib/crc32-loongarch.c b/arch/loongarch/lib/crc32-loongarch.c
index c44ee4f32557..b37cd8537b45 100644
--- a/arch/loongarch/lib/crc32-loongarch.c
+++ b/arch/loongarch/lib/crc32-loongarch.c
@@ -26,7 +26,7 @@ do {							\
 #define CRC32(crc, value, size)		_CRC32(crc, value, size, crc)
 #define CRC32C(crc, value, size)	_CRC32(crc, value, size, crcc)
 
-static DEFINE_STATIC_KEY_FALSE(have_crc32);
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_crc32);
 
 u32 crc32_le_arch(u32 crc, const u8 *p, size_t len)
 {
@@ -114,7 +114,7 @@ static int __init crc32_loongarch_init(void)
 		static_branch_enable(&have_crc32);
 	return 0;
 }
-arch_initcall(crc32_loongarch_init);
+subsys_initcall(crc32_loongarch_init);
 
 static void __exit crc32_loongarch_exit(void)
 {
diff --git a/arch/loongarch/mm/hugetlbpage.c b/arch/loongarch/mm/hugetlbpage.c
index e4068906143b..cea84d7f2b91 100644
--- a/arch/loongarch/mm/hugetlbpage.c
+++ b/arch/loongarch/mm/hugetlbpage.c
@@ -47,7 +47,8 @@ pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr,
 				pmd = pmd_offset(pud, addr);
 		}
 	}
-	return (pte_t *) pmd;
+	/* pmd is only assigned inside the nested checks above; guard NULL. */
+	return (!pmd || pmd_none(pmdp_get(pmd))) ? NULL : (pte_t *) pmd;
 }
 
 uint64_t pmd_to_entrylo(unsigned long pmd_val)
diff --git a/arch/loongarch/mm/init.c b/arch/loongarch/mm/init.c
index fdb7f73ad160..06f11d9e4ec1 100644
--- a/arch/loongarch/mm/init.c
+++ b/arch/loongarch/mm/init.c
@@ -65,9 +65,6 @@ void __init paging_init(void)
 {
 	unsigned long max_zone_pfns[MAX_NR_ZONES];
 
-#ifdef CONFIG_ZONE_DMA
-	max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN;
-#endif
 #ifdef CONFIG_ZONE_DMA32
 	max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN;
 #endif
diff --git a/arch/loongarch/power/hibernate.c b/arch/loongarch/power/hibernate.c
index 1e0590542f98..e7b7346592cb 100644
--- a/arch/loongarch/power/hibernate.c
+++ b/arch/loongarch/power/hibernate.c
@@ -2,6 +2,7 @@
 #include <asm/fpu.h>
 #include <asm/loongson.h>
 #include <asm/sections.h>
+#include <asm/time.h>
 #include <asm/tlbflush.h>
 #include <linux/suspend.h>
 
@@ -14,6 +15,7 @@ struct pt_regs saved_regs;
 
 void save_processor_state(void)
 {
+	save_counter();
 	saved_crmd = csr_read32(LOONGARCH_CSR_CRMD);
 	saved_prmd = csr_read32(LOONGARCH_CSR_PRMD);
 	saved_euen = csr_read32(LOONGARCH_CSR_EUEN);
@@ -26,6 +28,7 @@ void save_processor_state(void)
 
 void restore_processor_state(void)
 {
+	sync_counter();
 	csr_write32(saved_crmd, LOONGARCH_CSR_CRMD);
 	csr_write32(saved_prmd, LOONGARCH_CSR_PRMD);
 	csr_write32(saved_euen, LOONGARCH_CSR_EUEN);
diff --git a/arch/m68k/configs/amcore_defconfig b/arch/m68k/configs/amcore_defconfig
index 110279a64aa4..60767811e34a 100644
--- a/arch/m68k/configs/amcore_defconfig
+++ b/arch/m68k/configs/amcore_defconfig
@@ -2,7 +2,6 @@ CONFIG_LOCALVERSION="amcore-002"
 CONFIG_DEFAULT_HOSTNAME="amcore"
 CONFIG_SYSVIPC=y
 # CONFIG_FHANDLE is not set
-# CONFIG_USELIB is not set
 CONFIG_LOG_BUF_SHIFT=14
 CONFIG_CC_OPTIMIZE_FOR_SIZE=y
 # CONFIG_AIO is not set
diff --git a/arch/m68k/configs/amiga_defconfig b/arch/m68k/configs/amiga_defconfig
index 31ecb8b7b9f1..77f78d326a32 100644
--- a/arch/m68k/configs/amiga_defconfig
+++ b/arch/m68k/configs/amiga_defconfig
@@ -551,7 +551,7 @@ CONFIG_ENCRYPTED_KEYS=m
 CONFIG_HARDENED_USERCOPY=y
 CONFIG_CRYPTO_USER=m
 CONFIG_CRYPTO_CRYPTD=m
-CONFIG_CRYPTO_TEST=m
+CONFIG_CRYPTO_BENCHMARK=m
 CONFIG_CRYPTO_RSA=m
 CONFIG_CRYPTO_DH=m
 CONFIG_CRYPTO_ECDH=m
diff --git a/arch/m68k/configs/apollo_defconfig b/arch/m68k/configs/apollo_defconfig
index 1f57514624d5..f4031aa5d37f 100644
--- a/arch/m68k/configs/apollo_defconfig
+++ b/arch/m68k/configs/apollo_defconfig
@@ -508,7 +508,7 @@ CONFIG_ENCRYPTED_KEYS=m
 CONFIG_HARDENED_USERCOPY=y
 CONFIG_CRYPTO_USER=m
 CONFIG_CRYPTO_CRYPTD=m
-CONFIG_CRYPTO_TEST=m
+CONFIG_CRYPTO_BENCHMARK=m
 CONFIG_CRYPTO_RSA=m
 CONFIG_CRYPTO_DH=m
 CONFIG_CRYPTO_ECDH=m
diff --git a/arch/m68k/configs/atari_defconfig b/arch/m68k/configs/atari_defconfig
index 02db7a48e57e..fa92131cf4b3 100644
--- a/arch/m68k/configs/atari_defconfig
+++ b/arch/m68k/configs/atari_defconfig
@@ -528,7 +528,7 @@ CONFIG_ENCRYPTED_KEYS=m
 CONFIG_HARDENED_USERCOPY=y
 CONFIG_CRYPTO_USER=m
 CONFIG_CRYPTO_CRYPTD=m
-CONFIG_CRYPTO_TEST=m
+CONFIG_CRYPTO_BENCHMARK=m
 CONFIG_CRYPTO_RSA=m
 CONFIG_CRYPTO_DH=m
 CONFIG_CRYPTO_ECDH=m
diff --git a/arch/m68k/configs/bvme6000_defconfig b/arch/m68k/configs/bvme6000_defconfig
index f0e673cb17eb..9c2afc477061 100644
--- a/arch/m68k/configs/bvme6000_defconfig
+++ b/arch/m68k/configs/bvme6000_defconfig
@@ -500,7 +500,7 @@ CONFIG_ENCRYPTED_KEYS=m
 CONFIG_HARDENED_USERCOPY=y
 CONFIG_CRYPTO_USER=m
 CONFIG_CRYPTO_CRYPTD=m
-CONFIG_CRYPTO_TEST=m
+CONFIG_CRYPTO_BENCHMARK=m
 CONFIG_CRYPTO_RSA=m
 CONFIG_CRYPTO_DH=m
 CONFIG_CRYPTO_ECDH=m
diff --git a/arch/m68k/configs/hp300_defconfig b/arch/m68k/configs/hp300_defconfig
index e8ca5a50b86d..e7cdab059d96 100644
--- a/arch/m68k/configs/hp300_defconfig
+++ b/arch/m68k/configs/hp300_defconfig
@@ -510,7 +510,7 @@ CONFIG_ENCRYPTED_KEYS=m
 CONFIG_HARDENED_USERCOPY=y
 CONFIG_CRYPTO_USER=m
 CONFIG_CRYPTO_CRYPTD=m
-CONFIG_CRYPTO_TEST=m
+CONFIG_CRYPTO_BENCHMARK=m
 CONFIG_CRYPTO_RSA=m
 CONFIG_CRYPTO_DH=m
 CONFIG_CRYPTO_ECDH=m
diff --git a/arch/m68k/configs/mac_defconfig b/arch/m68k/configs/mac_defconfig
index b3a270441bb1..0a79751c20a5 100644
--- a/arch/m68k/configs/mac_defconfig
+++ b/arch/m68k/configs/mac_defconfig
@@ -527,7 +527,7 @@ CONFIG_ENCRYPTED_KEYS=m
 CONFIG_HARDENED_USERCOPY=y
 CONFIG_CRYPTO_USER=m
 CONFIG_CRYPTO_CRYPTD=m
-CONFIG_CRYPTO_TEST=m
+CONFIG_CRYPTO_BENCHMARK=m
 CONFIG_CRYPTO_RSA=m
 CONFIG_CRYPTO_DH=m
 CONFIG_CRYPTO_ECDH=m
diff --git a/arch/m68k/configs/multi_defconfig b/arch/m68k/configs/multi_defconfig
index d215dba006ce..f8ca490ee65a 100644
--- a/arch/m68k/configs/multi_defconfig
+++ b/arch/m68k/configs/multi_defconfig
@@ -614,7 +614,7 @@ CONFIG_ENCRYPTED_KEYS=m
 CONFIG_HARDENED_USERCOPY=y
 CONFIG_CRYPTO_USER=m
 CONFIG_CRYPTO_CRYPTD=m
-CONFIG_CRYPTO_TEST=m
+CONFIG_CRYPTO_BENCHMARK=m
 CONFIG_CRYPTO_RSA=m
 CONFIG_CRYPTO_DH=m
 CONFIG_CRYPTO_ECDH=m
diff --git a/arch/m68k/configs/mvme147_defconfig b/arch/m68k/configs/mvme147_defconfig
index a888ed93ff82..88fdcea906f3 100644
--- a/arch/m68k/configs/mvme147_defconfig
+++ b/arch/m68k/configs/mvme147_defconfig
@@ -500,7 +500,7 @@ CONFIG_ENCRYPTED_KEYS=m
 CONFIG_HARDENED_USERCOPY=y
 CONFIG_CRYPTO_USER=m
 CONFIG_CRYPTO_CRYPTD=m
-CONFIG_CRYPTO_TEST=m
+CONFIG_CRYPTO_BENCHMARK=m
 CONFIG_CRYPTO_RSA=m
 CONFIG_CRYPTO_DH=m
 CONFIG_CRYPTO_ECDH=m
diff --git a/arch/m68k/configs/mvme16x_defconfig b/arch/m68k/configs/mvme16x_defconfig
index b481782375f6..8acbe83dac72 100644
--- a/arch/m68k/configs/mvme16x_defconfig
+++ b/arch/m68k/configs/mvme16x_defconfig
@@ -501,7 +501,7 @@ CONFIG_ENCRYPTED_KEYS=m
 CONFIG_HARDENED_USERCOPY=y
 CONFIG_CRYPTO_USER=m
 CONFIG_CRYPTO_CRYPTD=m
-CONFIG_CRYPTO_TEST=m
+CONFIG_CRYPTO_BENCHMARK=m
 CONFIG_CRYPTO_RSA=m
 CONFIG_CRYPTO_DH=m
 CONFIG_CRYPTO_ECDH=m
diff --git a/arch/m68k/configs/q40_defconfig b/arch/m68k/configs/q40_defconfig
index 6eba743d8eb5..e3095301f3c5 100644
--- a/arch/m68k/configs/q40_defconfig
+++ b/arch/m68k/configs/q40_defconfig
@@ -517,7 +517,7 @@ CONFIG_ENCRYPTED_KEYS=m
 CONFIG_HARDENED_USERCOPY=y
 CONFIG_CRYPTO_USER=m
 CONFIG_CRYPTO_CRYPTD=m
-CONFIG_CRYPTO_TEST=m
+CONFIG_CRYPTO_BENCHMARK=m
 CONFIG_CRYPTO_RSA=m
 CONFIG_CRYPTO_DH=m
 CONFIG_CRYPTO_ECDH=m
diff --git a/arch/m68k/configs/sun3_defconfig b/arch/m68k/configs/sun3_defconfig
index 9bdbb418ffa8..948e48ddd128 100644
--- a/arch/m68k/configs/sun3_defconfig
+++ b/arch/m68k/configs/sun3_defconfig
@@ -498,7 +498,7 @@ CONFIG_ENCRYPTED_KEYS=m
 CONFIG_HARDENED_USERCOPY=y
 CONFIG_CRYPTO_USER=m
 CONFIG_CRYPTO_CRYPTD=m
-CONFIG_CRYPTO_TEST=m
+CONFIG_CRYPTO_BENCHMARK=m
 CONFIG_CRYPTO_RSA=m
 CONFIG_CRYPTO_DH=m
 CONFIG_CRYPTO_ECDH=m
diff --git a/arch/m68k/configs/sun3x_defconfig b/arch/m68k/configs/sun3x_defconfig
index e1cf20fa5343..5bcf9181c37c 100644
--- a/arch/m68k/configs/sun3x_defconfig
+++ b/arch/m68k/configs/sun3x_defconfig
@@ -498,7 +498,7 @@ CONFIG_ENCRYPTED_KEYS=m
 CONFIG_HARDENED_USERCOPY=y
 CONFIG_CRYPTO_USER=m
 CONFIG_CRYPTO_CRYPTD=m
-CONFIG_CRYPTO_TEST=m
+CONFIG_CRYPTO_BENCHMARK=m
 CONFIG_CRYPTO_RSA=m
 CONFIG_CRYPTO_DH=m
 CONFIG_CRYPTO_ECDH=m
diff --git a/arch/mips/cavium-octeon/Kconfig b/arch/mips/cavium-octeon/Kconfig
index 450e979ef5d9..11f4aa6e80e9 100644
--- a/arch/mips/cavium-octeon/Kconfig
+++ b/arch/mips/cavium-octeon/Kconfig
@@ -23,6 +23,12 @@ config CAVIUM_OCTEON_CVMSEG_SIZE
 	  legally range is from zero to 54 cache blocks (i.e. CVMSEG LM is
 	  between zero and 6192 bytes).
 
+config CRYPTO_SHA256_OCTEON
+	tristate
+	default CRYPTO_LIB_SHA256
+	select CRYPTO_ARCH_HAVE_LIB_SHA256
+	select CRYPTO_LIB_SHA256_GENERIC
+
 endif # CPU_CAVIUM_OCTEON
 
 if CAVIUM_OCTEON_SOC
diff --git a/arch/mips/cavium-octeon/crypto/octeon-md5.c b/arch/mips/cavium-octeon/crypto/octeon-md5.c
index 5ee4ade99b99..fbc84eb7fedf 100644
--- a/arch/mips/cavium-octeon/crypto/octeon-md5.c
+++ b/arch/mips/cavium-octeon/crypto/octeon-md5.c
@@ -19,22 +19,26 @@
  * any later version.
  */
 
+#include <asm/octeon/octeon.h>
+#include <crypto/internal/hash.h>
 #include <crypto/md5.h>
-#include <linux/init.h>
-#include <linux/types.h>
+#include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/string.h>
-#include <asm/byteorder.h>
-#include <asm/octeon/octeon.h>
-#include <crypto/internal/hash.h>
+#include <linux/unaligned.h>
 
 #include "octeon-crypto.h"
 
+struct octeon_md5_state {
+	__le32 hash[MD5_HASH_WORDS];
+	u64 byte_count;
+};
+
 /*
  * We pass everything as 64-bit. OCTEON can handle misaligned data.
  */
 
-static void octeon_md5_store_hash(struct md5_state *ctx)
+static void octeon_md5_store_hash(struct octeon_md5_state *ctx)
 {
 	u64 *hash = (u64 *)ctx->hash;
 
@@ -42,7 +46,7 @@ static void octeon_md5_store_hash(struct md5_state *ctx)
 	write_octeon_64bit_hash_dword(hash[1], 1);
 }
 
-static void octeon_md5_read_hash(struct md5_state *ctx)
+static void octeon_md5_read_hash(struct octeon_md5_state *ctx)
 {
 	u64 *hash = (u64 *)ctx->hash;
 
@@ -66,13 +70,12 @@ static void octeon_md5_transform(const void *_block)
 
 static int octeon_md5_init(struct shash_desc *desc)
 {
-	struct md5_state *mctx = shash_desc_ctx(desc);
+	struct octeon_md5_state *mctx = shash_desc_ctx(desc);
 
-	mctx->hash[0] = MD5_H0;
-	mctx->hash[1] = MD5_H1;
-	mctx->hash[2] = MD5_H2;
-	mctx->hash[3] = MD5_H3;
-	cpu_to_le32_array(mctx->hash, 4);
+	mctx->hash[0] = cpu_to_le32(MD5_H0);
+	mctx->hash[1] = cpu_to_le32(MD5_H1);
+	mctx->hash[2] = cpu_to_le32(MD5_H2);
+	mctx->hash[3] = cpu_to_le32(MD5_H3);
 	mctx->byte_count = 0;
 
 	return 0;
@@ -81,52 +84,38 @@ static int octeon_md5_init(struct shash_desc *desc)
 static int octeon_md5_update(struct shash_desc *desc, const u8 *data,
 			     unsigned int len)
 {
-	struct md5_state *mctx = shash_desc_ctx(desc);
-	const u32 avail = sizeof(mctx->block) - (mctx->byte_count & 0x3f);
+	struct octeon_md5_state *mctx = shash_desc_ctx(desc);
 	struct octeon_cop2_state state;
 	unsigned long flags;
 
 	mctx->byte_count += len;
-
-	if (avail > len) {
-		memcpy((char *)mctx->block + (sizeof(mctx->block) - avail),
-		       data, len);
-		return 0;
-	}
-
-	memcpy((char *)mctx->block + (sizeof(mctx->block) - avail), data,
-	       avail);
-
 	flags = octeon_crypto_enable(&state);
 	octeon_md5_store_hash(mctx);
 
-	octeon_md5_transform(mctx->block);
-	data += avail;
-	len -= avail;
-
-	while (len >= sizeof(mctx->block)) {
+	do {
 		octeon_md5_transform(data);
-		data += sizeof(mctx->block);
-		len -= sizeof(mctx->block);
-	}
+		data += MD5_HMAC_BLOCK_SIZE;
+		len -= MD5_HMAC_BLOCK_SIZE;
+	} while (len >= MD5_HMAC_BLOCK_SIZE);
 
 	octeon_md5_read_hash(mctx);
 	octeon_crypto_disable(&state, flags);
-
-	memcpy(mctx->block, data, len);
-
-	return 0;
+	mctx->byte_count -= len;
+	return len;
 }
 
-static int octeon_md5_final(struct shash_desc *desc, u8 *out)
+static int octeon_md5_finup(struct shash_desc *desc, const u8 *src,
+			    unsigned int offset, u8 *out)
 {
-	struct md5_state *mctx = shash_desc_ctx(desc);
-	const unsigned int offset = mctx->byte_count & 0x3f;
-	char *p = (char *)mctx->block + offset;
+	struct octeon_md5_state *mctx = shash_desc_ctx(desc);
 	int padding = 56 - (offset + 1);
 	struct octeon_cop2_state state;
+	u32 block[MD5_BLOCK_WORDS];
 	unsigned long flags;
+	char *p;
 
+	p = memcpy(block, src, offset);
+	p += offset;
 	*p++ = 0x80;
 
 	flags = octeon_crypto_enable(&state);
@@ -134,39 +123,56 @@ static int octeon_md5_final(struct shash_desc *desc, u8 *out)
 
 	if (padding < 0) {
 		memset(p, 0x00, padding + sizeof(u64));
-		octeon_md5_transform(mctx->block);
-		p = (char *)mctx->block;
+		octeon_md5_transform(block);
+		p = (char *)block;
 		padding = 56;
 	}
 
 	memset(p, 0, padding);
-	mctx->block[14] = mctx->byte_count << 3;
-	mctx->block[15] = mctx->byte_count >> 29;
-	cpu_to_le32_array(mctx->block + 14, 2);
-	octeon_md5_transform(mctx->block);
+	mctx->byte_count += offset;
+	block[14] = mctx->byte_count << 3;
+	block[15] = mctx->byte_count >> 29;
+	cpu_to_le32_array(block + 14, 2);
+	octeon_md5_transform(block);
 
 	octeon_md5_read_hash(mctx);
 	octeon_crypto_disable(&state, flags);
 
+	memzero_explicit(block, sizeof(block));
 	memcpy(out, mctx->hash, sizeof(mctx->hash));
-	memset(mctx, 0, sizeof(*mctx));
 
 	return 0;
 }
 
 static int octeon_md5_export(struct shash_desc *desc, void *out)
 {
-	struct md5_state *ctx = shash_desc_ctx(desc);
-
-	memcpy(out, ctx, sizeof(*ctx));
+	struct octeon_md5_state *ctx = shash_desc_ctx(desc);
+	union {
+		u8 *u8;
+		u32 *u32;
+		u64 *u64;
+	} p = { .u8 = out };
+	int i;
+
+	for (i = 0; i < MD5_HASH_WORDS; i++)
+		put_unaligned(le32_to_cpu(ctx->hash[i]), p.u32++);
+	put_unaligned(ctx->byte_count, p.u64);
 	return 0;
 }
 
 static int octeon_md5_import(struct shash_desc *desc, const void *in)
 {
-	struct md5_state *ctx = shash_desc_ctx(desc);
-
-	memcpy(ctx, in, sizeof(*ctx));
+	struct octeon_md5_state *ctx = shash_desc_ctx(desc);
+	union {
+		const u8 *u8;
+		const u32 *u32;
+		const u64 *u64;
+	} p = { .u8 = in };
+	int i;
+
+	for (i = 0; i < MD5_HASH_WORDS; i++)
+		ctx->hash[i] = cpu_to_le32(get_unaligned(p.u32++));
+	ctx->byte_count = get_unaligned(p.u64);
 	return 0;
 }
 
@@ -174,15 +180,16 @@ static struct shash_alg alg = {
 	.digestsize	=	MD5_DIGEST_SIZE,
 	.init		=	octeon_md5_init,
 	.update		=	octeon_md5_update,
-	.final		=	octeon_md5_final,
+	.finup		=	octeon_md5_finup,
 	.export		=	octeon_md5_export,
 	.import		=	octeon_md5_import,
-	.descsize	=	sizeof(struct md5_state),
-	.statesize	=	sizeof(struct md5_state),
+	.statesize	=	MD5_STATE_SIZE,
+	.descsize	=	sizeof(struct octeon_md5_state),
 	.base		=	{
 		.cra_name	=	"md5",
 		.cra_driver_name=	"octeon-md5",
 		.cra_priority	=	OCTEON_CR_OPCODE_PRIORITY,
+		.cra_flags	=	CRYPTO_AHASH_ALG_BLOCK_ONLY,
 		.cra_blocksize	=	MD5_HMAC_BLOCK_SIZE,
 		.cra_module	=	THIS_MODULE,
 	}
diff --git a/arch/mips/cavium-octeon/crypto/octeon-sha1.c b/arch/mips/cavium-octeon/crypto/octeon-sha1.c
index 37a07b3c4568..e70f21a473da 100644
--- a/arch/mips/cavium-octeon/crypto/octeon-sha1.c
+++ b/arch/mips/cavium-octeon/crypto/octeon-sha1.c
@@ -13,15 +13,13 @@
  * Copyright (c) Jean-Francois Dive <jef@linuxbe.org>
  */
 
-#include <linux/mm.h>
+#include <asm/octeon/octeon.h>
+#include <crypto/internal/hash.h>
 #include <crypto/sha1.h>
 #include <crypto/sha1_base.h>
-#include <linux/init.h>
-#include <linux/types.h>
+#include <linux/errno.h>
+#include <linux/kernel.h>
 #include <linux/module.h>
-#include <asm/byteorder.h>
-#include <asm/octeon/octeon.h>
-#include <crypto/internal/hash.h>
 
 #include "octeon-crypto.h"
 
@@ -58,49 +56,23 @@ static void octeon_sha1_read_hash(struct sha1_state *sctx)
 	memzero_explicit(&hash_tail.dword, sizeof(hash_tail.dword));
 }
 
-static void octeon_sha1_transform(const void *_block)
+static void octeon_sha1_transform(struct sha1_state *sctx, const u8 *src,
+				  int blocks)
 {
-	const u64 *block = _block;
-
-	write_octeon_64bit_block_dword(block[0], 0);
-	write_octeon_64bit_block_dword(block[1], 1);
-	write_octeon_64bit_block_dword(block[2], 2);
-	write_octeon_64bit_block_dword(block[3], 3);
-	write_octeon_64bit_block_dword(block[4], 4);
-	write_octeon_64bit_block_dword(block[5], 5);
-	write_octeon_64bit_block_dword(block[6], 6);
-	octeon_sha1_start(block[7]);
-}
-
-static void __octeon_sha1_update(struct sha1_state *sctx, const u8 *data,
-				 unsigned int len)
-{
-	unsigned int partial;
-	unsigned int done;
-	const u8 *src;
-
-	partial = sctx->count % SHA1_BLOCK_SIZE;
-	sctx->count += len;
-	done = 0;
-	src = data;
-
-	if ((partial + len) >= SHA1_BLOCK_SIZE) {
-		if (partial) {
-			done = -partial;
-			memcpy(sctx->buffer + partial, data,
-			       done + SHA1_BLOCK_SIZE);
-			src = sctx->buffer;
-		}
-
-		do {
-			octeon_sha1_transform(src);
-			done += SHA1_BLOCK_SIZE;
-			src = data + done;
-		} while (done + SHA1_BLOCK_SIZE <= len);
-
-		partial = 0;
-	}
-	memcpy(sctx->buffer + partial, src, len - done);
+	do {
+		const u64 *block = (const u64 *)src;
+
+		write_octeon_64bit_block_dword(block[0], 0);
+		write_octeon_64bit_block_dword(block[1], 1);
+		write_octeon_64bit_block_dword(block[2], 2);
+		write_octeon_64bit_block_dword(block[3], 3);
+		write_octeon_64bit_block_dword(block[4], 4);
+		write_octeon_64bit_block_dword(block[5], 5);
+		write_octeon_64bit_block_dword(block[6], 6);
+		octeon_sha1_start(block[7]);
+
+		src += SHA1_BLOCK_SIZE;
+	} while (--blocks);
 }
 
 static int octeon_sha1_update(struct shash_desc *desc, const u8 *data,
@@ -109,95 +81,47 @@ static int octeon_sha1_update(struct shash_desc *desc, const u8 *data,
 	struct sha1_state *sctx = shash_desc_ctx(desc);
 	struct octeon_cop2_state state;
 	unsigned long flags;
-
-	/*
-	 * Small updates never reach the crypto engine, so the generic sha1 is
-	 * faster because of the heavyweight octeon_crypto_enable() /
-	 * octeon_crypto_disable().
-	 */
-	if ((sctx->count % SHA1_BLOCK_SIZE) + len < SHA1_BLOCK_SIZE)
-		return crypto_sha1_update(desc, data, len);
+	int remain;
 
 	flags = octeon_crypto_enable(&state);
 	octeon_sha1_store_hash(sctx);
 
-	__octeon_sha1_update(sctx, data, len);
+	remain = sha1_base_do_update_blocks(desc, data, len,
+					    octeon_sha1_transform);
 
 	octeon_sha1_read_hash(sctx);
 	octeon_crypto_disable(&state, flags);
-
-	return 0;
+	return remain;
 }
 
-static int octeon_sha1_final(struct shash_desc *desc, u8 *out)
+static int octeon_sha1_finup(struct shash_desc *desc, const u8 *src,
+			     unsigned int len, u8 *out)
 {
 	struct sha1_state *sctx = shash_desc_ctx(desc);
-	static const u8 padding[64] = { 0x80, };
 	struct octeon_cop2_state state;
-	__be32 *dst = (__be32 *)out;
-	unsigned int pad_len;
 	unsigned long flags;
-	unsigned int index;
-	__be64 bits;
-	int i;
-
-	/* Save number of bits. */
-	bits = cpu_to_be64(sctx->count << 3);
-
-	/* Pad out to 56 mod 64. */
-	index = sctx->count & 0x3f;
-	pad_len = (index < 56) ? (56 - index) : ((64+56) - index);
 
 	flags = octeon_crypto_enable(&state);
 	octeon_sha1_store_hash(sctx);
 
-	__octeon_sha1_update(sctx, padding, pad_len);
-
-	/* Append length (before padding). */
-	__octeon_sha1_update(sctx, (const u8 *)&bits, sizeof(bits));
+	sha1_base_do_finup(desc, src, len, octeon_sha1_transform);
 
 	octeon_sha1_read_hash(sctx);
 	octeon_crypto_disable(&state, flags);
-
-	/* Store state in digest */
-	for (i = 0; i < 5; i++)
-		dst[i] = cpu_to_be32(sctx->state[i]);
-
-	/* Zeroize sensitive information. */
-	memset(sctx, 0, sizeof(*sctx));
-
-	return 0;
-}
-
-static int octeon_sha1_export(struct shash_desc *desc, void *out)
-{
-	struct sha1_state *sctx = shash_desc_ctx(desc);
-
-	memcpy(out, sctx, sizeof(*sctx));
-	return 0;
-}
-
-static int octeon_sha1_import(struct shash_desc *desc, const void *in)
-{
-	struct sha1_state *sctx = shash_desc_ctx(desc);
-
-	memcpy(sctx, in, sizeof(*sctx));
-	return 0;
+	return sha1_base_finish(desc, out);
 }
 
 static struct shash_alg octeon_sha1_alg = {
 	.digestsize	=	SHA1_DIGEST_SIZE,
 	.init		=	sha1_base_init,
 	.update		=	octeon_sha1_update,
-	.final		=	octeon_sha1_final,
-	.export		=	octeon_sha1_export,
-	.import		=	octeon_sha1_import,
-	.descsize	=	sizeof(struct sha1_state),
-	.statesize	=	sizeof(struct sha1_state),
+	.finup		=	octeon_sha1_finup,
+	.descsize	=	SHA1_STATE_SIZE,
 	.base		=	{
 		.cra_name	=	"sha1",
 		.cra_driver_name=	"octeon-sha1",
 		.cra_priority	=	OCTEON_CR_OPCODE_PRIORITY,
+		.cra_flags	=	CRYPTO_AHASH_ALG_BLOCK_ONLY,
 		.cra_blocksize	=	SHA1_BLOCK_SIZE,
 		.cra_module	=	THIS_MODULE,
 	}
diff --git a/arch/mips/cavium-octeon/crypto/octeon-sha256.c b/arch/mips/cavium-octeon/crypto/octeon-sha256.c
index 435e4a6e7f13..f93faaf1f4af 100644
--- a/arch/mips/cavium-octeon/crypto/octeon-sha256.c
+++ b/arch/mips/cavium-octeon/crypto/octeon-sha256.c
@@ -1,8 +1,6 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
 /*
- * Cryptographic API.
- *
- * SHA-224 and SHA-256 Secure Hash Algorithm.
+ * SHA-256 Secure Hash Algorithm.
  *
  * Adapted for OCTEON by Aaro Koskinen <aaro.koskinen@iki.fi>.
  *
@@ -14,15 +12,10 @@
  * SHA224 Support Copyright 2007 Intel Corporation <jonathan.lynch@intel.com>
  */
 
-#include <linux/mm.h>
-#include <crypto/sha2.h>
-#include <crypto/sha256_base.h>
-#include <linux/init.h>
-#include <linux/types.h>
-#include <linux/module.h>
-#include <asm/byteorder.h>
 #include <asm/octeon/octeon.h>
-#include <crypto/internal/hash.h>
+#include <crypto/internal/sha2.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
 
 #include "octeon-crypto.h"
 
@@ -30,212 +23,51 @@
  * We pass everything as 64-bit. OCTEON can handle misaligned data.
  */
 
-static void octeon_sha256_store_hash(struct sha256_state *sctx)
-{
-	u64 *hash = (u64 *)sctx->state;
-
-	write_octeon_64bit_hash_dword(hash[0], 0);
-	write_octeon_64bit_hash_dword(hash[1], 1);
-	write_octeon_64bit_hash_dword(hash[2], 2);
-	write_octeon_64bit_hash_dword(hash[3], 3);
-}
-
-static void octeon_sha256_read_hash(struct sha256_state *sctx)
-{
-	u64 *hash = (u64 *)sctx->state;
-
-	hash[0] = read_octeon_64bit_hash_dword(0);
-	hash[1] = read_octeon_64bit_hash_dword(1);
-	hash[2] = read_octeon_64bit_hash_dword(2);
-	hash[3] = read_octeon_64bit_hash_dword(3);
-}
-
-static void octeon_sha256_transform(const void *_block)
-{
-	const u64 *block = _block;
-
-	write_octeon_64bit_block_dword(block[0], 0);
-	write_octeon_64bit_block_dword(block[1], 1);
-	write_octeon_64bit_block_dword(block[2], 2);
-	write_octeon_64bit_block_dword(block[3], 3);
-	write_octeon_64bit_block_dword(block[4], 4);
-	write_octeon_64bit_block_dword(block[5], 5);
-	write_octeon_64bit_block_dword(block[6], 6);
-	octeon_sha256_start(block[7]);
-}
-
-static void __octeon_sha256_update(struct sha256_state *sctx, const u8 *data,
-				   unsigned int len)
-{
-	unsigned int partial;
-	unsigned int done;
-	const u8 *src;
-
-	partial = sctx->count % SHA256_BLOCK_SIZE;
-	sctx->count += len;
-	done = 0;
-	src = data;
-
-	if ((partial + len) >= SHA256_BLOCK_SIZE) {
-		if (partial) {
-			done = -partial;
-			memcpy(sctx->buf + partial, data,
-			       done + SHA256_BLOCK_SIZE);
-			src = sctx->buf;
-		}
-
-		do {
-			octeon_sha256_transform(src);
-			done += SHA256_BLOCK_SIZE;
-			src = data + done;
-		} while (done + SHA256_BLOCK_SIZE <= len);
-
-		partial = 0;
-	}
-	memcpy(sctx->buf + partial, src, len - done);
-}
-
-static int octeon_sha256_update(struct shash_desc *desc, const u8 *data,
-				unsigned int len)
-{
-	struct sha256_state *sctx = shash_desc_ctx(desc);
-	struct octeon_cop2_state state;
-	unsigned long flags;
-
-	/*
-	 * Small updates never reach the crypto engine, so the generic sha256 is
-	 * faster because of the heavyweight octeon_crypto_enable() /
-	 * octeon_crypto_disable().
-	 */
-	if ((sctx->count % SHA256_BLOCK_SIZE) + len < SHA256_BLOCK_SIZE)
-		return crypto_sha256_update(desc, data, len);
-
-	flags = octeon_crypto_enable(&state);
-	octeon_sha256_store_hash(sctx);
-
-	__octeon_sha256_update(sctx, data, len);
-
-	octeon_sha256_read_hash(sctx);
-	octeon_crypto_disable(&state, flags);
-
-	return 0;
-}
-
-static int octeon_sha256_final(struct shash_desc *desc, u8 *out)
+void sha256_blocks_arch(u32 state[SHA256_STATE_WORDS],
+			const u8 *data, size_t nblocks)
 {
-	struct sha256_state *sctx = shash_desc_ctx(desc);
-	static const u8 padding[64] = { 0x80, };
-	struct octeon_cop2_state state;
-	__be32 *dst = (__be32 *)out;
-	unsigned int pad_len;
+	struct octeon_cop2_state cop2_state;
+	u64 *state64 = (u64 *)state;
 	unsigned long flags;
-	unsigned int index;
-	__be64 bits;
-	int i;
-
-	/* Save number of bits. */
-	bits = cpu_to_be64(sctx->count << 3);
-
-	/* Pad out to 56 mod 64. */
-	index = sctx->count & 0x3f;
-	pad_len = (index < 56) ? (56 - index) : ((64+56) - index);
-
-	flags = octeon_crypto_enable(&state);
-	octeon_sha256_store_hash(sctx);
-
-	__octeon_sha256_update(sctx, padding, pad_len);
-
-	/* Append length (before padding). */
-	__octeon_sha256_update(sctx, (const u8 *)&bits, sizeof(bits));
-
-	octeon_sha256_read_hash(sctx);
-	octeon_crypto_disable(&state, flags);
-
-	/* Store state in digest */
-	for (i = 0; i < 8; i++)
-		dst[i] = cpu_to_be32(sctx->state[i]);
-
-	/* Zeroize sensitive information. */
-	memset(sctx, 0, sizeof(*sctx));
-
-	return 0;
-}
-
-static int octeon_sha224_final(struct shash_desc *desc, u8 *hash)
-{
-	u8 D[SHA256_DIGEST_SIZE];
-
-	octeon_sha256_final(desc, D);
 
-	memcpy(hash, D, SHA224_DIGEST_SIZE);
-	memzero_explicit(D, SHA256_DIGEST_SIZE);
-
-	return 0;
-}
-
-static int octeon_sha256_export(struct shash_desc *desc, void *out)
-{
-	struct sha256_state *sctx = shash_desc_ctx(desc);
-
-	memcpy(out, sctx, sizeof(*sctx));
-	return 0;
-}
-
-static int octeon_sha256_import(struct shash_desc *desc, const void *in)
-{
-	struct sha256_state *sctx = shash_desc_ctx(desc);
-
-	memcpy(sctx, in, sizeof(*sctx));
-	return 0;
-}
-
-static struct shash_alg octeon_sha256_algs[2] = { {
-	.digestsize	=	SHA256_DIGEST_SIZE,
-	.init		=	sha256_base_init,
-	.update		=	octeon_sha256_update,
-	.final		=	octeon_sha256_final,
-	.export		=	octeon_sha256_export,
-	.import		=	octeon_sha256_import,
-	.descsize	=	sizeof(struct sha256_state),
-	.statesize	=	sizeof(struct sha256_state),
-	.base		=	{
-		.cra_name	=	"sha256",
-		.cra_driver_name=	"octeon-sha256",
-		.cra_priority	=	OCTEON_CR_OPCODE_PRIORITY,
-		.cra_blocksize	=	SHA256_BLOCK_SIZE,
-		.cra_module	=	THIS_MODULE,
-	}
-}, {
-	.digestsize	=	SHA224_DIGEST_SIZE,
-	.init		=	sha224_base_init,
-	.update		=	octeon_sha256_update,
-	.final		=	octeon_sha224_final,
-	.descsize	=	sizeof(struct sha256_state),
-	.base		=	{
-		.cra_name	=	"sha224",
-		.cra_driver_name=	"octeon-sha224",
-		.cra_blocksize	=	SHA224_BLOCK_SIZE,
-		.cra_module	=	THIS_MODULE,
-	}
-} };
-
-static int __init octeon_sha256_mod_init(void)
-{
 	if (!octeon_has_crypto())
-		return -ENOTSUPP;
-	return crypto_register_shashes(octeon_sha256_algs,
-				       ARRAY_SIZE(octeon_sha256_algs));
+		return sha256_blocks_generic(state, data, nblocks);
+
+	flags = octeon_crypto_enable(&cop2_state);
+	write_octeon_64bit_hash_dword(state64[0], 0);
+	write_octeon_64bit_hash_dword(state64[1], 1);
+	write_octeon_64bit_hash_dword(state64[2], 2);
+	write_octeon_64bit_hash_dword(state64[3], 3);
+
+	do {
+		const u64 *block = (const u64 *)data;
+
+		write_octeon_64bit_block_dword(block[0], 0);
+		write_octeon_64bit_block_dword(block[1], 1);
+		write_octeon_64bit_block_dword(block[2], 2);
+		write_octeon_64bit_block_dword(block[3], 3);
+		write_octeon_64bit_block_dword(block[4], 4);
+		write_octeon_64bit_block_dword(block[5], 5);
+		write_octeon_64bit_block_dword(block[6], 6);
+		octeon_sha256_start(block[7]);
+
+		data += SHA256_BLOCK_SIZE;
+	} while (--nblocks);
+
+	state64[0] = read_octeon_64bit_hash_dword(0);
+	state64[1] = read_octeon_64bit_hash_dword(1);
+	state64[2] = read_octeon_64bit_hash_dword(2);
+	state64[3] = read_octeon_64bit_hash_dword(3);
+	octeon_crypto_disable(&cop2_state, flags);
 }
+EXPORT_SYMBOL_GPL(sha256_blocks_arch);
 
-static void __exit octeon_sha256_mod_fini(void)
+bool sha256_is_arch_optimized(void)
 {
-	crypto_unregister_shashes(octeon_sha256_algs,
-				  ARRAY_SIZE(octeon_sha256_algs));
+	return octeon_has_crypto();
 }
-
-module_init(octeon_sha256_mod_init);
-module_exit(octeon_sha256_mod_fini);
+EXPORT_SYMBOL_GPL(sha256_is_arch_optimized);
 
 MODULE_LICENSE("GPL");
-MODULE_DESCRIPTION("SHA-224 and SHA-256 Secure Hash Algorithm (OCTEON)");
+MODULE_DESCRIPTION("SHA-256 Secure Hash Algorithm (OCTEON)");
 MODULE_AUTHOR("Aaro Koskinen <aaro.koskinen@iki.fi>");
diff --git a/arch/mips/cavium-octeon/crypto/octeon-sha512.c b/arch/mips/cavium-octeon/crypto/octeon-sha512.c
index 2dee9354e33f..215311053db3 100644
--- a/arch/mips/cavium-octeon/crypto/octeon-sha512.c
+++ b/arch/mips/cavium-octeon/crypto/octeon-sha512.c
@@ -13,15 +13,12 @@
  * Copyright (c) 2003 Kyle McMartin <kyle@debian.org>
  */
 
-#include <linux/mm.h>
+#include <asm/octeon/octeon.h>
+#include <crypto/internal/hash.h>
 #include <crypto/sha2.h>
 #include <crypto/sha512_base.h>
-#include <linux/init.h>
-#include <linux/types.h>
+#include <linux/kernel.h>
 #include <linux/module.h>
-#include <asm/byteorder.h>
-#include <asm/octeon/octeon.h>
-#include <crypto/internal/hash.h>
 
 #include "octeon-crypto.h"
 
@@ -53,60 +50,31 @@ static void octeon_sha512_read_hash(struct sha512_state *sctx)
 	sctx->state[7] = read_octeon_64bit_hash_sha512(7);
 }
 
-static void octeon_sha512_transform(const void *_block)
+static void octeon_sha512_transform(struct sha512_state *sctx,
+				    const u8 *src, int blocks)
 {
-	const u64 *block = _block;
-
-	write_octeon_64bit_block_sha512(block[0], 0);
-	write_octeon_64bit_block_sha512(block[1], 1);
-	write_octeon_64bit_block_sha512(block[2], 2);
-	write_octeon_64bit_block_sha512(block[3], 3);
-	write_octeon_64bit_block_sha512(block[4], 4);
-	write_octeon_64bit_block_sha512(block[5], 5);
-	write_octeon_64bit_block_sha512(block[6], 6);
-	write_octeon_64bit_block_sha512(block[7], 7);
-	write_octeon_64bit_block_sha512(block[8], 8);
-	write_octeon_64bit_block_sha512(block[9], 9);
-	write_octeon_64bit_block_sha512(block[10], 10);
-	write_octeon_64bit_block_sha512(block[11], 11);
-	write_octeon_64bit_block_sha512(block[12], 12);
-	write_octeon_64bit_block_sha512(block[13], 13);
-	write_octeon_64bit_block_sha512(block[14], 14);
-	octeon_sha512_start(block[15]);
-}
-
-static void __octeon_sha512_update(struct sha512_state *sctx, const u8 *data,
-				   unsigned int len)
-{
-	unsigned int part_len;
-	unsigned int index;
-	unsigned int i;
-
-	/* Compute number of bytes mod 128. */
-	index = sctx->count[0] % SHA512_BLOCK_SIZE;
-
-	/* Update number of bytes. */
-	if ((sctx->count[0] += len) < len)
-		sctx->count[1]++;
-
-	part_len = SHA512_BLOCK_SIZE - index;
-
-	/* Transform as many times as possible. */
-	if (len >= part_len) {
-		memcpy(&sctx->buf[index], data, part_len);
-		octeon_sha512_transform(sctx->buf);
-
-		for (i = part_len; i + SHA512_BLOCK_SIZE <= len;
-			i += SHA512_BLOCK_SIZE)
-			octeon_sha512_transform(&data[i]);
-
-		index = 0;
-	} else {
-		i = 0;
-	}
-
-	/* Buffer remaining input. */
-	memcpy(&sctx->buf[index], &data[i], len - i);
+	do {
+		const u64 *block = (const u64 *)src;
+
+		write_octeon_64bit_block_sha512(block[0], 0);
+		write_octeon_64bit_block_sha512(block[1], 1);
+		write_octeon_64bit_block_sha512(block[2], 2);
+		write_octeon_64bit_block_sha512(block[3], 3);
+		write_octeon_64bit_block_sha512(block[4], 4);
+		write_octeon_64bit_block_sha512(block[5], 5);
+		write_octeon_64bit_block_sha512(block[6], 6);
+		write_octeon_64bit_block_sha512(block[7], 7);
+		write_octeon_64bit_block_sha512(block[8], 8);
+		write_octeon_64bit_block_sha512(block[9], 9);
+		write_octeon_64bit_block_sha512(block[10], 10);
+		write_octeon_64bit_block_sha512(block[11], 11);
+		write_octeon_64bit_block_sha512(block[12], 12);
+		write_octeon_64bit_block_sha512(block[13], 13);
+		write_octeon_64bit_block_sha512(block[14], 14);
+		octeon_sha512_start(block[15]);
+
+		src += SHA512_BLOCK_SIZE;
+	} while (--blocks);
 }
 
 static int octeon_sha512_update(struct shash_desc *desc, const u8 *data,
@@ -115,89 +83,48 @@ static int octeon_sha512_update(struct shash_desc *desc, const u8 *data,
 	struct sha512_state *sctx = shash_desc_ctx(desc);
 	struct octeon_cop2_state state;
 	unsigned long flags;
-
-	/*
-	 * Small updates never reach the crypto engine, so the generic sha512 is
-	 * faster because of the heavyweight octeon_crypto_enable() /
-	 * octeon_crypto_disable().
-	 */
-	if ((sctx->count[0] % SHA512_BLOCK_SIZE) + len < SHA512_BLOCK_SIZE)
-		return crypto_sha512_update(desc, data, len);
+	int remain;
 
 	flags = octeon_crypto_enable(&state);
 	octeon_sha512_store_hash(sctx);
 
-	__octeon_sha512_update(sctx, data, len);
+	remain = sha512_base_do_update_blocks(desc, data, len,
+					      octeon_sha512_transform);
 
 	octeon_sha512_read_hash(sctx);
 	octeon_crypto_disable(&state, flags);
-
-	return 0;
+	return remain;
 }
 
-static int octeon_sha512_final(struct shash_desc *desc, u8 *hash)
+static int octeon_sha512_finup(struct shash_desc *desc, const u8 *src,
+			       unsigned int len, u8 *hash)
 {
 	struct sha512_state *sctx = shash_desc_ctx(desc);
-	static u8 padding[128] = { 0x80, };
 	struct octeon_cop2_state state;
-	__be64 *dst = (__be64 *)hash;
-	unsigned int pad_len;
 	unsigned long flags;
-	unsigned int index;
-	__be64 bits[2];
-	int i;
-
-	/* Save number of bits. */
-	bits[1] = cpu_to_be64(sctx->count[0] << 3);
-	bits[0] = cpu_to_be64(sctx->count[1] << 3 | sctx->count[0] >> 61);
-
-	/* Pad out to 112 mod 128. */
-	index = sctx->count[0] & 0x7f;
-	pad_len = (index < 112) ? (112 - index) : ((128+112) - index);
 
 	flags = octeon_crypto_enable(&state);
 	octeon_sha512_store_hash(sctx);
 
-	__octeon_sha512_update(sctx, padding, pad_len);
-
-	/* Append length (before padding). */
-	__octeon_sha512_update(sctx, (const u8 *)bits, sizeof(bits));
+	sha512_base_do_finup(desc, src, len, octeon_sha512_transform);
 
 	octeon_sha512_read_hash(sctx);
 	octeon_crypto_disable(&state, flags);
-
-	/* Store state in digest. */
-	for (i = 0; i < 8; i++)
-		dst[i] = cpu_to_be64(sctx->state[i]);
-
-	/* Zeroize sensitive information. */
-	memset(sctx, 0, sizeof(struct sha512_state));
-
-	return 0;
-}
-
-static int octeon_sha384_final(struct shash_desc *desc, u8 *hash)
-{
-	u8 D[64];
-
-	octeon_sha512_final(desc, D);
-
-	memcpy(hash, D, 48);
-	memzero_explicit(D, 64);
-
-	return 0;
+	return sha512_base_finish(desc, hash);
 }
 
 static struct shash_alg octeon_sha512_algs[2] = { {
 	.digestsize	=	SHA512_DIGEST_SIZE,
 	.init		=	sha512_base_init,
 	.update		=	octeon_sha512_update,
-	.final		=	octeon_sha512_final,
-	.descsize	=	sizeof(struct sha512_state),
+	.finup		=	octeon_sha512_finup,
+	.descsize	=	SHA512_STATE_SIZE,
 	.base		=	{
 		.cra_name	=	"sha512",
 		.cra_driver_name=	"octeon-sha512",
 		.cra_priority	=	OCTEON_CR_OPCODE_PRIORITY,
+		.cra_flags	=	CRYPTO_AHASH_ALG_BLOCK_ONLY |
+					CRYPTO_AHASH_ALG_FINUP_MAX,
 		.cra_blocksize	=	SHA512_BLOCK_SIZE,
 		.cra_module	=	THIS_MODULE,
 	}
@@ -205,12 +132,14 @@ static struct shash_alg octeon_sha512_algs[2] = { {
 	.digestsize	=	SHA384_DIGEST_SIZE,
 	.init		=	sha384_base_init,
 	.update		=	octeon_sha512_update,
-	.final		=	octeon_sha384_final,
-	.descsize	=	sizeof(struct sha512_state),
+	.finup		=	octeon_sha512_finup,
+	.descsize	=	SHA512_STATE_SIZE,
 	.base		=	{
 		.cra_name	=	"sha384",
 		.cra_driver_name=	"octeon-sha384",
 		.cra_priority	=	OCTEON_CR_OPCODE_PRIORITY,
+		.cra_flags	=	CRYPTO_AHASH_ALG_BLOCK_ONLY |
+					CRYPTO_AHASH_ALG_FINUP_MAX,
 		.cra_blocksize	=	SHA384_BLOCK_SIZE,
 		.cra_module	=	THIS_MODULE,
 	}
diff --git a/arch/mips/configs/cavium_octeon_defconfig b/arch/mips/configs/cavium_octeon_defconfig
index f523ee6f25bf..88ae0aa85364 100644
--- a/arch/mips/configs/cavium_octeon_defconfig
+++ b/arch/mips/configs/cavium_octeon_defconfig
@@ -157,7 +157,6 @@ CONFIG_CRYPTO_CBC=y
 CONFIG_CRYPTO_HMAC=y
 CONFIG_CRYPTO_MD5_OCTEON=y
 CONFIG_CRYPTO_SHA1_OCTEON=m
-CONFIG_CRYPTO_SHA256_OCTEON=m
 CONFIG_CRYPTO_SHA512_OCTEON=m
 CONFIG_CRYPTO_DES=y
 CONFIG_DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT=y
diff --git a/arch/mips/configs/decstation_64_defconfig b/arch/mips/configs/decstation_64_defconfig
index 9655567614aa..85a4472cb058 100644
--- a/arch/mips/configs/decstation_64_defconfig
+++ b/arch/mips/configs/decstation_64_defconfig
@@ -168,7 +168,6 @@ CONFIG_NLS_ISO8859_14=m
 CONFIG_NLS_ISO8859_15=m
 CONFIG_NLS_UTF8=m
 CONFIG_CRYPTO_RSA=m
-CONFIG_CRYPTO_MANAGER=y
 CONFIG_CRYPTO_CCM=m
 CONFIG_CRYPTO_GCM=m
 CONFIG_CRYPTO_CHACHA20POLY1305=m
diff --git a/arch/mips/configs/decstation_defconfig b/arch/mips/configs/decstation_defconfig
index 1539fe8eb34d..a3b2c8da2dde 100644
--- a/arch/mips/configs/decstation_defconfig
+++ b/arch/mips/configs/decstation_defconfig
@@ -163,7 +163,6 @@ CONFIG_NLS_ISO8859_14=m
 CONFIG_NLS_ISO8859_15=m
 CONFIG_NLS_UTF8=m
 CONFIG_CRYPTO_RSA=m
-CONFIG_CRYPTO_MANAGER=y
 CONFIG_CRYPTO_CCM=m
 CONFIG_CRYPTO_GCM=m
 CONFIG_CRYPTO_CHACHA20POLY1305=m
diff --git a/arch/mips/configs/decstation_r4k_defconfig b/arch/mips/configs/decstation_r4k_defconfig
index 58c36720c94a..a476717b8a6a 100644
--- a/arch/mips/configs/decstation_r4k_defconfig
+++ b/arch/mips/configs/decstation_r4k_defconfig
@@ -163,7 +163,6 @@ CONFIG_NLS_ISO8859_14=m
 CONFIG_NLS_ISO8859_15=m
 CONFIG_NLS_UTF8=m
 CONFIG_CRYPTO_RSA=m
-CONFIG_CRYPTO_MANAGER=y
 CONFIG_CRYPTO_CCM=m
 CONFIG_CRYPTO_GCM=m
 CONFIG_CRYPTO_CHACHA20POLY1305=m
diff --git a/arch/mips/configs/gcw0_defconfig b/arch/mips/configs/gcw0_defconfig
index bc1ef66e3999..8b7ad877e07a 100644
--- a/arch/mips/configs/gcw0_defconfig
+++ b/arch/mips/configs/gcw0_defconfig
@@ -13,7 +13,6 @@ CONFIG_MIPS_CMDLINE_DTB_EXTEND=y
 CONFIG_MODULES=y
 CONFIG_MODULE_UNLOAD=y
 # CONFIG_BLK_DEV_BSG is not set
-# CONFIG_BOUNCE is not set
 CONFIG_NET=y
 CONFIG_PACKET=y
 CONFIG_UNIX=y
diff --git a/arch/mips/configs/gpr_defconfig b/arch/mips/configs/gpr_defconfig
index 12f3eed8a946..48c8feec958f 100644
--- a/arch/mips/configs/gpr_defconfig
+++ b/arch/mips/configs/gpr_defconfig
@@ -273,7 +273,7 @@ CONFIG_NLS_CODEPAGE_437=y
 CONFIG_NLS_CODEPAGE_850=y
 CONFIG_NLS_ISO8859_1=y
 CONFIG_CRYPTO_AUTHENC=m
-CONFIG_CRYPTO_TEST=m
+CONFIG_CRYPTO_BENCHMARK=m
 CONFIG_CRYPTO_PCBC=m
 CONFIG_CRYPTO_MD4=m
 CONFIG_CRYPTO_MICHAEL_MIC=m
diff --git a/arch/mips/configs/ip28_defconfig b/arch/mips/configs/ip28_defconfig
index e0040110a3ee..6db21e498faa 100644
--- a/arch/mips/configs/ip28_defconfig
+++ b/arch/mips/configs/ip28_defconfig
@@ -60,6 +60,5 @@ CONFIG_TMPFS_POSIX_ACL=y
 CONFIG_NFS_FS=y
 CONFIG_NFS_V3_ACL=y
 CONFIG_ROOT_NFS=y
-CONFIG_CRYPTO_MANAGER=y
 # CONFIG_CRYPTO_HW is not set
 CONFIG_MAGIC_SYSRQ=y
diff --git a/arch/mips/configs/lemote2f_defconfig b/arch/mips/configs/lemote2f_defconfig
index 71d6340497c9..5038a27d035f 100644
--- a/arch/mips/configs/lemote2f_defconfig
+++ b/arch/mips/configs/lemote2f_defconfig
@@ -297,7 +297,7 @@ CONFIG_NLS_KOI8_R=m
 CONFIG_NLS_KOI8_U=m
 CONFIG_NLS_UTF8=y
 CONFIG_CRYPTO_CRYPTD=m
-CONFIG_CRYPTO_TEST=m
+CONFIG_CRYPTO_BENCHMARK=m
 CONFIG_CRYPTO_BLOWFISH=m
 CONFIG_CRYPTO_CAMELLIA=m
 CONFIG_CRYPTO_CAST5=m
diff --git a/arch/mips/configs/mtx1_defconfig b/arch/mips/configs/mtx1_defconfig
index 06b7a0b97eca..cbf9c35a6177 100644
--- a/arch/mips/configs/mtx1_defconfig
+++ b/arch/mips/configs/mtx1_defconfig
@@ -662,7 +662,7 @@ CONFIG_NLS_ISO8859_15=m
 CONFIG_NLS_KOI8_R=m
 CONFIG_NLS_KOI8_U=m
 CONFIG_NLS_UTF8=m
-CONFIG_CRYPTO_TEST=m
+CONFIG_CRYPTO_BENCHMARK=m
 CONFIG_CRYPTO_PCBC=m
 CONFIG_CRYPTO_HMAC=y
 CONFIG_CRYPTO_MD5=y
diff --git a/arch/mips/configs/rb532_defconfig b/arch/mips/configs/rb532_defconfig
index 42b161d587c7..9fb114ef5e2d 100644
--- a/arch/mips/configs/rb532_defconfig
+++ b/arch/mips/configs/rb532_defconfig
@@ -153,6 +153,6 @@ CONFIG_JFFS2_FS=y
 CONFIG_JFFS2_SUMMARY=y
 CONFIG_JFFS2_COMPRESSION_OPTIONS=y
 CONFIG_SQUASHFS=y
-CONFIG_CRYPTO_TEST=m
+CONFIG_CRYPTO_BENCHMARK=m
 # CONFIG_CRYPTO_HW is not set
 CONFIG_STRIP_ASM_SYMS=y
diff --git a/arch/mips/crypto/Kconfig b/arch/mips/crypto/Kconfig
index 545fc0e12422..6bf073ae7613 100644
--- a/arch/mips/crypto/Kconfig
+++ b/arch/mips/crypto/Kconfig
@@ -2,17 +2,6 @@
 
 menu "Accelerated Cryptographic Algorithms for CPU (mips)"
 
-config CRYPTO_POLY1305_MIPS
-	tristate
-	depends on MIPS
-	select CRYPTO_HASH
-	select CRYPTO_ARCH_HAVE_LIB_POLY1305
-	default CRYPTO_LIB_POLY1305_INTERNAL
-	help
-	  Poly1305 authenticator algorithm (RFC7539)
-
-	  Architecture: mips
-
 config CRYPTO_MD5_OCTEON
 	tristate "Digests: MD5 (OCTEON)"
 	depends on CPU_CAVIUM_OCTEON
@@ -33,16 +22,6 @@ config CRYPTO_SHA1_OCTEON
 
 	  Architecture: mips OCTEON
 
-config CRYPTO_SHA256_OCTEON
-	tristate "Hash functions: SHA-224 and SHA-256 (OCTEON)"
-	depends on CPU_CAVIUM_OCTEON
-	select CRYPTO_SHA256
-	select CRYPTO_HASH
-	help
-	  SHA-224 and SHA-256 secure hash algorithms (FIPS 180)
-
-	  Architecture: mips OCTEON using crypto instructions, when available
-
 config CRYPTO_SHA512_OCTEON
 	tristate "Hash functions: SHA-384 and SHA-512 (OCTEON)"
 	depends on CPU_CAVIUM_OCTEON
@@ -53,16 +32,4 @@ config CRYPTO_SHA512_OCTEON
 
 	  Architecture: mips OCTEON using crypto instructions, when available
 
-config CRYPTO_CHACHA_MIPS
-	tristate
-	depends on CPU_MIPS32_R2
-	select CRYPTO_SKCIPHER
-	select CRYPTO_ARCH_HAVE_LIB_CHACHA
-	default CRYPTO_LIB_CHACHA_INTERNAL
-	help
-	  Length-preserving ciphers: ChaCha20, XChaCha20, and XChaCha12
-	  stream cipher algorithms
-
-	  Architecture: MIPS32r2
-
 endmenu
diff --git a/arch/mips/crypto/Makefile b/arch/mips/crypto/Makefile
index fddc88281412..5adb631a69c1 100644
--- a/arch/mips/crypto/Makefile
+++ b/arch/mips/crypto/Makefile
@@ -3,20 +3,3 @@
 # Makefile for MIPS crypto files..
 #
 
-obj-$(CONFIG_CRYPTO_CHACHA_MIPS) += chacha-mips.o
-chacha-mips-y := chacha-core.o chacha-glue.o
-AFLAGS_chacha-core.o += -O2 # needed to fill branch delay slots
-
-obj-$(CONFIG_CRYPTO_POLY1305_MIPS) += poly1305-mips.o
-poly1305-mips-y := poly1305-core.o poly1305-glue.o
-
-perlasm-flavour-$(CONFIG_32BIT) := o32
-perlasm-flavour-$(CONFIG_64BIT) := 64
-
-quiet_cmd_perlasm = PERLASM $@
-      cmd_perlasm = $(PERL) $(<) $(perlasm-flavour-y) $(@)
-
-$(obj)/poly1305-core.S: $(src)/poly1305-mips.pl FORCE
-	$(call if_changed,perlasm)
-
-targets += poly1305-core.S
diff --git a/arch/mips/crypto/chacha-glue.c b/arch/mips/crypto/chacha-glue.c
deleted file mode 100644
index f6fc2e1079a1..000000000000
--- a/arch/mips/crypto/chacha-glue.c
+++ /dev/null
@@ -1,146 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * MIPS accelerated ChaCha and XChaCha stream ciphers,
- * including ChaCha20 (RFC7539)
- *
- * Copyright (C) 2019 Linaro, Ltd. <ard.biesheuvel@linaro.org>
- */
-
-#include <asm/byteorder.h>
-#include <crypto/algapi.h>
-#include <crypto/internal/chacha.h>
-#include <crypto/internal/skcipher.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-
-asmlinkage void chacha_crypt_arch(u32 *state, u8 *dst, const u8 *src,
-				  unsigned int bytes, int nrounds);
-EXPORT_SYMBOL(chacha_crypt_arch);
-
-asmlinkage void hchacha_block_arch(const u32 *state, u32 *stream, int nrounds);
-EXPORT_SYMBOL(hchacha_block_arch);
-
-static int chacha_mips_stream_xor(struct skcipher_request *req,
-				  const struct chacha_ctx *ctx, const u8 *iv)
-{
-	struct skcipher_walk walk;
-	u32 state[16];
-	int err;
-
-	err = skcipher_walk_virt(&walk, req, false);
-
-	chacha_init(state, ctx->key, iv);
-
-	while (walk.nbytes > 0) {
-		unsigned int nbytes = walk.nbytes;
-
-		if (nbytes < walk.total)
-			nbytes = round_down(nbytes, walk.stride);
-
-		chacha_crypt(state, walk.dst.virt.addr, walk.src.virt.addr,
-			     nbytes, ctx->nrounds);
-		err = skcipher_walk_done(&walk, walk.nbytes - nbytes);
-	}
-
-	return err;
-}
-
-static int chacha_mips(struct skcipher_request *req)
-{
-	struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
-	struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm);
-
-	return chacha_mips_stream_xor(req, ctx, req->iv);
-}
-
-static int xchacha_mips(struct skcipher_request *req)
-{
-	struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
-	struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm);
-	struct chacha_ctx subctx;
-	u32 state[16];
-	u8 real_iv[16];
-
-	chacha_init(state, ctx->key, req->iv);
-
-	hchacha_block(state, subctx.key, ctx->nrounds);
-	subctx.nrounds = ctx->nrounds;
-
-	memcpy(&real_iv[0], req->iv + 24, 8);
-	memcpy(&real_iv[8], req->iv + 16, 8);
-	return chacha_mips_stream_xor(req, &subctx, real_iv);
-}
-
-static struct skcipher_alg algs[] = {
-	{
-		.base.cra_name		= "chacha20",
-		.base.cra_driver_name	= "chacha20-mips",
-		.base.cra_priority	= 200,
-		.base.cra_blocksize	= 1,
-		.base.cra_ctxsize	= sizeof(struct chacha_ctx),
-		.base.cra_module	= THIS_MODULE,
-
-		.min_keysize		= CHACHA_KEY_SIZE,
-		.max_keysize		= CHACHA_KEY_SIZE,
-		.ivsize			= CHACHA_IV_SIZE,
-		.chunksize		= CHACHA_BLOCK_SIZE,
-		.setkey			= chacha20_setkey,
-		.encrypt		= chacha_mips,
-		.decrypt		= chacha_mips,
-	}, {
-		.base.cra_name		= "xchacha20",
-		.base.cra_driver_name	= "xchacha20-mips",
-		.base.cra_priority	= 200,
-		.base.cra_blocksize	= 1,
-		.base.cra_ctxsize	= sizeof(struct chacha_ctx),
-		.base.cra_module	= THIS_MODULE,
-
-		.min_keysize		= CHACHA_KEY_SIZE,
-		.max_keysize		= CHACHA_KEY_SIZE,
-		.ivsize			= XCHACHA_IV_SIZE,
-		.chunksize		= CHACHA_BLOCK_SIZE,
-		.setkey			= chacha20_setkey,
-		.encrypt		= xchacha_mips,
-		.decrypt		= xchacha_mips,
-	}, {
-		.base.cra_name		= "xchacha12",
-		.base.cra_driver_name	= "xchacha12-mips",
-		.base.cra_priority	= 200,
-		.base.cra_blocksize	= 1,
-		.base.cra_ctxsize	= sizeof(struct chacha_ctx),
-		.base.cra_module	= THIS_MODULE,
-
-		.min_keysize		= CHACHA_KEY_SIZE,
-		.max_keysize		= CHACHA_KEY_SIZE,
-		.ivsize			= XCHACHA_IV_SIZE,
-		.chunksize		= CHACHA_BLOCK_SIZE,
-		.setkey			= chacha12_setkey,
-		.encrypt		= xchacha_mips,
-		.decrypt		= xchacha_mips,
-	}
-};
-
-static int __init chacha_simd_mod_init(void)
-{
-	return IS_REACHABLE(CONFIG_CRYPTO_SKCIPHER) ?
-		crypto_register_skciphers(algs, ARRAY_SIZE(algs)) : 0;
-}
-
-static void __exit chacha_simd_mod_fini(void)
-{
-	if (IS_REACHABLE(CONFIG_CRYPTO_SKCIPHER))
-		crypto_unregister_skciphers(algs, ARRAY_SIZE(algs));
-}
-
-module_init(chacha_simd_mod_init);
-module_exit(chacha_simd_mod_fini);
-
-MODULE_DESCRIPTION("ChaCha and XChaCha stream ciphers (MIPS accelerated)");
-MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
-MODULE_LICENSE("GPL v2");
-MODULE_ALIAS_CRYPTO("chacha20");
-MODULE_ALIAS_CRYPTO("chacha20-mips");
-MODULE_ALIAS_CRYPTO("xchacha20");
-MODULE_ALIAS_CRYPTO("xchacha20-mips");
-MODULE_ALIAS_CRYPTO("xchacha12");
-MODULE_ALIAS_CRYPTO("xchacha12-mips");
diff --git a/arch/mips/crypto/poly1305-glue.c b/arch/mips/crypto/poly1305-glue.c
deleted file mode 100644
index c03ad0bbe69c..000000000000
--- a/arch/mips/crypto/poly1305-glue.c
+++ /dev/null
@@ -1,192 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * OpenSSL/Cryptogams accelerated Poly1305 transform for MIPS
- *
- * Copyright (C) 2019 Linaro Ltd. <ard.biesheuvel@linaro.org>
- */
-
-#include <linux/unaligned.h>
-#include <crypto/algapi.h>
-#include <crypto/internal/hash.h>
-#include <crypto/internal/poly1305.h>
-#include <linux/cpufeature.h>
-#include <linux/crypto.h>
-#include <linux/module.h>
-
-asmlinkage void poly1305_init_mips(void *state, const u8 *key);
-asmlinkage void poly1305_blocks_mips(void *state, const u8 *src, u32 len, u32 hibit);
-asmlinkage void poly1305_emit_mips(void *state, u8 *digest, const u32 *nonce);
-
-void poly1305_init_arch(struct poly1305_desc_ctx *dctx, const u8 key[POLY1305_KEY_SIZE])
-{
-	poly1305_init_mips(&dctx->h, key);
-	dctx->s[0] = get_unaligned_le32(key + 16);
-	dctx->s[1] = get_unaligned_le32(key + 20);
-	dctx->s[2] = get_unaligned_le32(key + 24);
-	dctx->s[3] = get_unaligned_le32(key + 28);
-	dctx->buflen = 0;
-}
-EXPORT_SYMBOL(poly1305_init_arch);
-
-static int mips_poly1305_init(struct shash_desc *desc)
-{
-	struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
-
-	dctx->buflen = 0;
-	dctx->rset = 0;
-	dctx->sset = false;
-
-	return 0;
-}
-
-static void mips_poly1305_blocks(struct poly1305_desc_ctx *dctx, const u8 *src,
-				 u32 len, u32 hibit)
-{
-	if (unlikely(!dctx->sset)) {
-		if (!dctx->rset) {
-			poly1305_init_mips(&dctx->h, src);
-			src += POLY1305_BLOCK_SIZE;
-			len -= POLY1305_BLOCK_SIZE;
-			dctx->rset = 1;
-		}
-		if (len >= POLY1305_BLOCK_SIZE) {
-			dctx->s[0] = get_unaligned_le32(src +  0);
-			dctx->s[1] = get_unaligned_le32(src +  4);
-			dctx->s[2] = get_unaligned_le32(src +  8);
-			dctx->s[3] = get_unaligned_le32(src + 12);
-			src += POLY1305_BLOCK_SIZE;
-			len -= POLY1305_BLOCK_SIZE;
-			dctx->sset = true;
-		}
-		if (len < POLY1305_BLOCK_SIZE)
-			return;
-	}
-
-	len &= ~(POLY1305_BLOCK_SIZE - 1);
-
-	poly1305_blocks_mips(&dctx->h, src, len, hibit);
-}
-
-static int mips_poly1305_update(struct shash_desc *desc, const u8 *src,
-				unsigned int len)
-{
-	struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
-
-	if (unlikely(dctx->buflen)) {
-		u32 bytes = min(len, POLY1305_BLOCK_SIZE - dctx->buflen);
-
-		memcpy(dctx->buf + dctx->buflen, src, bytes);
-		src += bytes;
-		len -= bytes;
-		dctx->buflen += bytes;
-
-		if (dctx->buflen == POLY1305_BLOCK_SIZE) {
-			mips_poly1305_blocks(dctx, dctx->buf, POLY1305_BLOCK_SIZE, 1);
-			dctx->buflen = 0;
-		}
-	}
-
-	if (likely(len >= POLY1305_BLOCK_SIZE)) {
-		mips_poly1305_blocks(dctx, src, len, 1);
-		src += round_down(len, POLY1305_BLOCK_SIZE);
-		len %= POLY1305_BLOCK_SIZE;
-	}
-
-	if (unlikely(len)) {
-		dctx->buflen = len;
-		memcpy(dctx->buf, src, len);
-	}
-	return 0;
-}
-
-void poly1305_update_arch(struct poly1305_desc_ctx *dctx, const u8 *src,
-			  unsigned int nbytes)
-{
-	if (unlikely(dctx->buflen)) {
-		u32 bytes = min(nbytes, POLY1305_BLOCK_SIZE - dctx->buflen);
-
-		memcpy(dctx->buf + dctx->buflen, src, bytes);
-		src += bytes;
-		nbytes -= bytes;
-		dctx->buflen += bytes;
-
-		if (dctx->buflen == POLY1305_BLOCK_SIZE) {
-			poly1305_blocks_mips(&dctx->h, dctx->buf,
-					     POLY1305_BLOCK_SIZE, 1);
-			dctx->buflen = 0;
-		}
-	}
-
-	if (likely(nbytes >= POLY1305_BLOCK_SIZE)) {
-		unsigned int len = round_down(nbytes, POLY1305_BLOCK_SIZE);
-
-		poly1305_blocks_mips(&dctx->h, src, len, 1);
-		src += len;
-		nbytes %= POLY1305_BLOCK_SIZE;
-	}
-
-	if (unlikely(nbytes)) {
-		dctx->buflen = nbytes;
-		memcpy(dctx->buf, src, nbytes);
-	}
-}
-EXPORT_SYMBOL(poly1305_update_arch);
-
-void poly1305_final_arch(struct poly1305_desc_ctx *dctx, u8 *dst)
-{
-	if (unlikely(dctx->buflen)) {
-		dctx->buf[dctx->buflen++] = 1;
-		memset(dctx->buf + dctx->buflen, 0,
-		       POLY1305_BLOCK_SIZE - dctx->buflen);
-		poly1305_blocks_mips(&dctx->h, dctx->buf, POLY1305_BLOCK_SIZE, 0);
-	}
-
-	poly1305_emit_mips(&dctx->h, dst, dctx->s);
-	*dctx = (struct poly1305_desc_ctx){};
-}
-EXPORT_SYMBOL(poly1305_final_arch);
-
-static int mips_poly1305_final(struct shash_desc *desc, u8 *dst)
-{
-	struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
-
-	if (unlikely(!dctx->sset))
-		return -ENOKEY;
-
-	poly1305_final_arch(dctx, dst);
-	return 0;
-}
-
-static struct shash_alg mips_poly1305_alg = {
-	.init			= mips_poly1305_init,
-	.update			= mips_poly1305_update,
-	.final			= mips_poly1305_final,
-	.digestsize		= POLY1305_DIGEST_SIZE,
-	.descsize		= sizeof(struct poly1305_desc_ctx),
-
-	.base.cra_name		= "poly1305",
-	.base.cra_driver_name	= "poly1305-mips",
-	.base.cra_priority	= 200,
-	.base.cra_blocksize	= POLY1305_BLOCK_SIZE,
-	.base.cra_module	= THIS_MODULE,
-};
-
-static int __init mips_poly1305_mod_init(void)
-{
-	return IS_REACHABLE(CONFIG_CRYPTO_HASH) ?
-		crypto_register_shash(&mips_poly1305_alg) : 0;
-}
-
-static void __exit mips_poly1305_mod_exit(void)
-{
-	if (IS_REACHABLE(CONFIG_CRYPTO_HASH))
-		crypto_unregister_shash(&mips_poly1305_alg);
-}
-
-module_init(mips_poly1305_mod_init);
-module_exit(mips_poly1305_mod_exit);
-
-MODULE_DESCRIPTION("Poly1305 transform (MIPS accelerated");
-MODULE_LICENSE("GPL v2");
-MODULE_ALIAS_CRYPTO("poly1305");
-MODULE_ALIAS_CRYPTO("poly1305-mips");
diff --git a/arch/mips/include/asm/idle.h b/arch/mips/include/asm/idle.h
index 0992cad9c632..c7d75807d13f 100644
--- a/arch/mips/include/asm/idle.h
+++ b/arch/mips/include/asm/idle.h
@@ -6,11 +6,10 @@
 #include <linux/linkage.h>
 
 extern void (*cpu_wait)(void);
-extern void r4k_wait(void);
-extern asmlinkage void __r4k_wait(void);
+extern asmlinkage void r4k_wait(void);
 extern void r4k_wait_irqoff(void);
 
-static inline int using_rollback_handler(void)
+static inline int using_skipover_handler(void)
 {
 	return cpu_wait == r4k_wait;
 }
diff --git a/arch/mips/include/asm/ptrace.h b/arch/mips/include/asm/ptrace.h
index 85fa9962266a..ef72c46b5568 100644
--- a/arch/mips/include/asm/ptrace.h
+++ b/arch/mips/include/asm/ptrace.h
@@ -65,7 +65,8 @@ static inline void instruction_pointer_set(struct pt_regs *regs,
 
 /* Query offset/name of register from its name/offset */
 extern int regs_query_register_offset(const char *name);
-#define MAX_REG_OFFSET (offsetof(struct pt_regs, __last))
+#define MAX_REG_OFFSET \
+	(offsetof(struct pt_regs, __last) - sizeof(unsigned long))
 
 /**
  * regs_get_register() - get register value from its offset
diff --git a/arch/mips/include/asm/socket.h b/arch/mips/include/asm/socket.h
index 4724a563c5bf..43a09f0dd3ff 100644
--- a/arch/mips/include/asm/socket.h
+++ b/arch/mips/include/asm/socket.h
@@ -36,15 +36,6 @@ enum sock_type {
 	SOCK_PACKET	= 10,
 };
 
-#define SOCK_MAX (SOCK_PACKET + 1)
-/* Mask which covers at least up to SOCK_MASK-1.  The
- *  * remaining bits are used as flags. */
-#define SOCK_TYPE_MASK 0xf
-
-/* Flags for socket, socketpair, paccept */
-#define SOCK_CLOEXEC	O_CLOEXEC
-#define SOCK_NONBLOCK	O_NONBLOCK
-
 #define ARCH_HAS_SOCKET_TYPES 1
 
 #endif /* _ASM_SOCKET_H */
diff --git a/arch/mips/kernel/genex.S b/arch/mips/kernel/genex.S
index a572ce36a24f..08c0a01d9a29 100644
--- a/arch/mips/kernel/genex.S
+++ b/arch/mips/kernel/genex.S
@@ -104,48 +104,59 @@ handle_vcei:
 
 	__FINIT
 
-	.align	5	/* 32 byte rollback region */
-LEAF(__r4k_wait)
-	.set	push
-	.set	noreorder
-	/* start of rollback region */
-	LONG_L	t0, TI_FLAGS($28)
-	nop
-	andi	t0, _TIF_NEED_RESCHED
-	bnez	t0, 1f
-	 nop
-	nop
-	nop
-#ifdef CONFIG_CPU_MICROMIPS
-	nop
-	nop
-	nop
-	nop
-#endif
+	.section .cpuidle.text,"ax"
+	/* Align to 32 bytes for the maximum idle interrupt region size. */
+	.align	5
+LEAF(r4k_wait)
+	/* Keep the ISA bit clear for calculations on local labels here. */
+0:	.fill 	0
+	/* Start of idle interrupt region. */
+	local_irq_enable
+	/*
+	 * If an interrupt lands here, before going idle on the next
+	 * instruction, we must *NOT* go idle since the interrupt could
+	 * have set TIF_NEED_RESCHED or caused a timer to need resched.
+	 * Fall through -- see skipover_handler below -- and have the
+	 * idle loop take care of things.
+	 */
+1:	.fill	0
+	/* The R2 EI/EHB sequence takes 8 bytes, otherwise pad up.  */
+	.if		1b - 0b > 32
+	.error	"overlong idle interrupt region"
+	.elseif	1b - 0b > 8
+	.align	4
+	.endif
+2:	.fill	0
+	.equ	r4k_wait_idle_size, 2b - 0b
+	/* End of idle interrupt region; size has to be a power of 2. */
 	.set	MIPS_ISA_ARCH_LEVEL_RAW
+r4k_wait_insn:
 	wait
-	/* end of rollback region (the region size must be power of two) */
-1:
+r4k_wait_exit:
+	.set	mips0
+	local_irq_disable
 	jr	ra
-	 nop
-	.set	pop
-	END(__r4k_wait)
+	END(r4k_wait)
+	.previous
 
-	.macro	BUILD_ROLLBACK_PROLOGUE handler
-	FEXPORT(rollback_\handler)
+	.macro	BUILD_SKIPOVER_PROLOGUE handler
+	FEXPORT(skipover_\handler)
 	.set	push
 	.set	noat
 	MFC0	k0, CP0_EPC
-	PTR_LA	k1, __r4k_wait
-	ori	k0, 0x1f	/* 32 byte rollback region */
-	xori	k0, 0x1f
+	/* Subtract/add 2 to let the ISA bit propagate through the mask.  */
+	PTR_LA	k1, r4k_wait_insn - 2
+	ori 	k0, r4k_wait_idle_size - 2
+	.set	noreorder
 	bne	k0, k1, \handler
+	PTR_ADDIU 	k0, r4k_wait_exit - r4k_wait_insn + 2
+	.set	reorder
 	MTC0	k0, CP0_EPC
 	.set pop
 	.endm
 
 	.align	5
-BUILD_ROLLBACK_PROLOGUE handle_int
+BUILD_SKIPOVER_PROLOGUE handle_int
 NESTED(handle_int, PT_SIZE, sp)
 	.cfi_signal_frame
 #ifdef CONFIG_TRACE_IRQFLAGS
@@ -265,7 +276,7 @@ NESTED(except_vec_ejtag_debug, 0, sp)
  * This prototype is copied to ebase + n*IntCtl.VS and patched
  * to invoke the handler
  */
-BUILD_ROLLBACK_PROLOGUE except_vec_vi
+BUILD_SKIPOVER_PROLOGUE except_vec_vi
 NESTED(except_vec_vi, 0, sp)
 	SAVE_SOME docfi=1
 	SAVE_AT docfi=1
diff --git a/arch/mips/kernel/idle.c b/arch/mips/kernel/idle.c
index 5abc8b7340f8..80e8a04a642e 100644
--- a/arch/mips/kernel/idle.c
+++ b/arch/mips/kernel/idle.c
@@ -35,13 +35,6 @@ static void __cpuidle r3081_wait(void)
 	write_c0_conf(cfg | R30XX_CONF_HALT);
 }
 
-void __cpuidle r4k_wait(void)
-{
-	raw_local_irq_enable();
-	__r4k_wait();
-	raw_local_irq_disable();
-}
-
 /*
  * This variant is preferable as it allows testing need_resched and going to
  * sleep depending on the outcome atomically.  Unfortunately the "It is
diff --git a/arch/mips/kernel/smp-cps.c b/arch/mips/kernel/smp-cps.c
index e85bd087467e..cc26d56f3ab6 100644
--- a/arch/mips/kernel/smp-cps.c
+++ b/arch/mips/kernel/smp-cps.c
@@ -332,6 +332,8 @@ static void __init cps_prepare_cpus(unsigned int max_cpus)
 	mips_cps_cluster_bootcfg = kcalloc(nclusters,
 					   sizeof(*mips_cps_cluster_bootcfg),
 					   GFP_KERNEL);
+	if (!mips_cps_cluster_bootcfg)
+		goto err_out;
 
 	if (nclusters > 1)
 		mips_cm_update_property();
@@ -348,6 +350,8 @@ static void __init cps_prepare_cpus(unsigned int max_cpus)
 		mips_cps_cluster_bootcfg[cl].core_power =
 			kcalloc(BITS_TO_LONGS(ncores), sizeof(unsigned long),
 				GFP_KERNEL);
+		if (!mips_cps_cluster_bootcfg[cl].core_power)
+			goto err_out;
 
 		/* Allocate VPE boot configuration structs */
 		for (c = 0; c < ncores; c++) {
diff --git a/arch/mips/kernel/traps.c b/arch/mips/kernel/traps.c
index 39e248d0ed59..8ec1e185b35c 100644
--- a/arch/mips/kernel/traps.c
+++ b/arch/mips/kernel/traps.c
@@ -77,7 +77,7 @@
 #include "access-helper.h"
 
 extern void check_wait(void);
-extern asmlinkage void rollback_handle_int(void);
+extern asmlinkage void skipover_handle_int(void);
 extern asmlinkage void handle_int(void);
 extern asmlinkage void handle_adel(void);
 extern asmlinkage void handle_ades(void);
@@ -2066,7 +2066,7 @@ void *set_vi_handler(int n, vi_handler_t addr)
 {
 	extern const u8 except_vec_vi[];
 	extern const u8 except_vec_vi_ori[], except_vec_vi_end[];
-	extern const u8 rollback_except_vec_vi[];
+	extern const u8 skipover_except_vec_vi[];
 	unsigned long handler;
 	unsigned long old_handler = vi_handlers[n];
 	int srssets = current_cpu_data.srsets;
@@ -2095,7 +2095,7 @@ void *set_vi_handler(int n, vi_handler_t addr)
 			change_c0_srsmap(0xf << n*4, 0 << n*4);
 	}
 
-	vec_start = using_rollback_handler() ? rollback_except_vec_vi :
+	vec_start = using_skipover_handler() ? skipover_except_vec_vi :
 					       except_vec_vi;
 #if defined(CONFIG_CPU_MICROMIPS) || defined(CONFIG_CPU_BIG_ENDIAN)
 	ori_offset = except_vec_vi_ori - vec_start + 2;
@@ -2426,8 +2426,8 @@ void __init trap_init(void)
 	if (board_be_init)
 		board_be_init();
 
-	set_except_vector(EXCCODE_INT, using_rollback_handler() ?
-					rollback_handle_int : handle_int);
+	set_except_vector(EXCCODE_INT, using_skipover_handler() ?
+					skipover_handle_int : handle_int);
 	set_except_vector(EXCCODE_MOD, handle_tlbm);
 	set_except_vector(EXCCODE_TLBL, handle_tlbl);
 	set_except_vector(EXCCODE_TLBS, handle_tlbs);
diff --git a/arch/mips/lib/Makefile b/arch/mips/lib/Makefile
index 9c024e6d5e54..9d75845ef78e 100644
--- a/arch/mips/lib/Makefile
+++ b/arch/mips/lib/Makefile
@@ -3,6 +3,8 @@
 # Makefile for MIPS-specific library files..
 #
 
+obj-y	+= crypto/
+
 lib-y	+= bitops.o csum_partial.o delay.o memcpy.o memset.o \
 	   mips-atomic.o strncpy_user.o \
 	   strnlen_user.o uncached.o
diff --git a/arch/mips/lib/crc32-mips.c b/arch/mips/lib/crc32-mips.c
index 676a4b3e290b..45e4d2c9fbf5 100644
--- a/arch/mips/lib/crc32-mips.c
+++ b/arch/mips/lib/crc32-mips.c
@@ -62,7 +62,7 @@ do {							\
 #define CRC32C(crc, value, size) \
 	_CRC32(crc, value, size, crc32c)
 
-static DEFINE_STATIC_KEY_FALSE(have_crc32);
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_crc32);
 
 u32 crc32_le_arch(u32 crc, const u8 *p, size_t len)
 {
@@ -163,7 +163,7 @@ static int __init crc32_mips_init(void)
 		static_branch_enable(&have_crc32);
 	return 0;
 }
-arch_initcall(crc32_mips_init);
+subsys_initcall(crc32_mips_init);
 
 static void __exit crc32_mips_exit(void)
 {
diff --git a/arch/mips/lib/crypto/.gitignore b/arch/mips/lib/crypto/.gitignore
new file mode 100644
index 000000000000..0d47d4f21c6d
--- /dev/null
+++ b/arch/mips/lib/crypto/.gitignore
@@ -0,0 +1,2 @@
+# SPDX-License-Identifier: GPL-2.0-only
+poly1305-core.S
diff --git a/arch/mips/lib/crypto/Kconfig b/arch/mips/lib/crypto/Kconfig
new file mode 100644
index 000000000000..0670a170c1be
--- /dev/null
+++ b/arch/mips/lib/crypto/Kconfig
@@ -0,0 +1,12 @@
+# SPDX-License-Identifier: GPL-2.0-only
+
+config CRYPTO_CHACHA_MIPS
+	tristate
+	depends on CPU_MIPS32_R2
+	default CRYPTO_LIB_CHACHA
+	select CRYPTO_ARCH_HAVE_LIB_CHACHA
+
+config CRYPTO_POLY1305_MIPS
+	tristate
+	default CRYPTO_LIB_POLY1305
+	select CRYPTO_ARCH_HAVE_LIB_POLY1305
diff --git a/arch/mips/lib/crypto/Makefile b/arch/mips/lib/crypto/Makefile
new file mode 100644
index 000000000000..804488c7aded
--- /dev/null
+++ b/arch/mips/lib/crypto/Makefile
@@ -0,0 +1,19 @@
+# SPDX-License-Identifier: GPL-2.0-only
+
+obj-$(CONFIG_CRYPTO_CHACHA_MIPS) += chacha-mips.o
+chacha-mips-y := chacha-core.o chacha-glue.o
+AFLAGS_chacha-core.o += -O2 # needed to fill branch delay slots
+
+obj-$(CONFIG_CRYPTO_POLY1305_MIPS) += poly1305-mips.o
+poly1305-mips-y := poly1305-core.o poly1305-glue.o
+
+perlasm-flavour-$(CONFIG_32BIT) := o32
+perlasm-flavour-$(CONFIG_64BIT) := 64
+
+quiet_cmd_perlasm = PERLASM $@
+      cmd_perlasm = $(PERL) $(<) $(perlasm-flavour-y) $(@)
+
+$(obj)/poly1305-core.S: $(src)/poly1305-mips.pl FORCE
+	$(call if_changed,perlasm)
+
+targets += poly1305-core.S
diff --git a/arch/mips/crypto/chacha-core.S b/arch/mips/lib/crypto/chacha-core.S
index 5755f69cfe00..5755f69cfe00 100644
--- a/arch/mips/crypto/chacha-core.S
+++ b/arch/mips/lib/crypto/chacha-core.S
diff --git a/arch/mips/lib/crypto/chacha-glue.c b/arch/mips/lib/crypto/chacha-glue.c
new file mode 100644
index 000000000000..88c097594eb0
--- /dev/null
+++ b/arch/mips/lib/crypto/chacha-glue.c
@@ -0,0 +1,29 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * ChaCha and HChaCha functions (MIPS optimized)
+ *
+ * Copyright (C) 2019 Linaro, Ltd. <ard.biesheuvel@linaro.org>
+ */
+
+#include <crypto/chacha.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+
+asmlinkage void chacha_crypt_arch(struct chacha_state *state,
+				  u8 *dst, const u8 *src,
+				  unsigned int bytes, int nrounds);
+EXPORT_SYMBOL(chacha_crypt_arch);
+
+asmlinkage void hchacha_block_arch(const struct chacha_state *state,
+				   u32 out[HCHACHA_OUT_WORDS], int nrounds);
+EXPORT_SYMBOL(hchacha_block_arch);
+
+bool chacha_is_arch_optimized(void)
+{
+	return true;
+}
+EXPORT_SYMBOL(chacha_is_arch_optimized);
+
+MODULE_DESCRIPTION("ChaCha and HChaCha functions (MIPS optimized)");
+MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
+MODULE_LICENSE("GPL v2");
diff --git a/arch/mips/lib/crypto/poly1305-glue.c b/arch/mips/lib/crypto/poly1305-glue.c
new file mode 100644
index 000000000000..764a38a65200
--- /dev/null
+++ b/arch/mips/lib/crypto/poly1305-glue.c
@@ -0,0 +1,33 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * OpenSSL/Cryptogams accelerated Poly1305 transform for MIPS
+ *
+ * Copyright (C) 2019 Linaro Ltd. <ard.biesheuvel@linaro.org>
+ */
+
+#include <crypto/internal/poly1305.h>
+#include <linux/cpufeature.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/unaligned.h>
+
+asmlinkage void poly1305_block_init_arch(
+	struct poly1305_block_state *state,
+	const u8 raw_key[POLY1305_BLOCK_SIZE]);
+EXPORT_SYMBOL_GPL(poly1305_block_init_arch);
+asmlinkage void poly1305_blocks_arch(struct poly1305_block_state *state,
+				     const u8 *src, u32 len, u32 hibit);
+EXPORT_SYMBOL_GPL(poly1305_blocks_arch);
+asmlinkage void poly1305_emit_arch(const struct poly1305_state *state,
+				   u8 digest[POLY1305_DIGEST_SIZE],
+				   const u32 nonce[4]);
+EXPORT_SYMBOL_GPL(poly1305_emit_arch);
+
+bool poly1305_is_arch_optimized(void)
+{
+	return true;
+}
+EXPORT_SYMBOL(poly1305_is_arch_optimized);
+
+MODULE_DESCRIPTION("Poly1305 transform (MIPS accelerated)");
+MODULE_LICENSE("GPL v2");
diff --git a/arch/mips/crypto/poly1305-mips.pl b/arch/mips/lib/crypto/poly1305-mips.pl
index b05bab884ed2..399f10c3e385 100644
--- a/arch/mips/crypto/poly1305-mips.pl
+++ b/arch/mips/lib/crypto/poly1305-mips.pl
@@ -93,9 +93,9 @@ $code.=<<___;
 #endif
 
 #ifdef	__KERNEL__
-# define poly1305_init   poly1305_init_mips
-# define poly1305_blocks poly1305_blocks_mips
-# define poly1305_emit   poly1305_emit_mips
+# define poly1305_init   poly1305_block_init_arch
+# define poly1305_blocks poly1305_blocks_arch
+# define poly1305_emit   poly1305_emit_arch
 #endif
 
 #if defined(__MIPSEB__) && !defined(MIPSEB)
@@ -565,9 +565,9 @@ $code.=<<___;
 #endif
 
 #ifdef	__KERNEL__
-# define poly1305_init   poly1305_init_mips
-# define poly1305_blocks poly1305_blocks_mips
-# define poly1305_emit   poly1305_emit_mips
+# define poly1305_init   poly1305_block_init_arch
+# define poly1305_blocks poly1305_blocks_arch
+# define poly1305_emit   poly1305_emit_arch
 #endif
 
 #if defined(__MIPSEB__) && !defined(MIPSEB)
diff --git a/arch/openrisc/include/asm/cacheflush.h b/arch/openrisc/include/asm/cacheflush.h
index 984c331ff5f4..0e60af486ec1 100644
--- a/arch/openrisc/include/asm/cacheflush.h
+++ b/arch/openrisc/include/asm/cacheflush.h
@@ -23,6 +23,9 @@
  */
 extern void local_dcache_page_flush(struct page *page);
 extern void local_icache_page_inv(struct page *page);
+extern void local_dcache_range_flush(unsigned long start, unsigned long end);
+extern void local_dcache_range_inv(unsigned long start, unsigned long end);
+extern void local_icache_range_inv(unsigned long start, unsigned long end);
 
 /*
  * Data cache flushing always happen on the local cpu. Instruction cache
@@ -39,6 +42,20 @@ extern void smp_icache_page_inv(struct page *page);
 #endif /* CONFIG_SMP */
 
 /*
+ * Even if the actual block size is larger than L1_CACHE_BYTES, paddr can be
+ * incremented by L1_CACHE_BYTES. When paddr is written to the invalidate
+ * register, the entire cache line encompassing this address is invalidated,
+ * and later references to the same cache line do not affect the invalidation.
+ * These macros evaluate addr twice: do not pass side-effecting expressions.
+ */
+#define local_dcache_block_flush(addr) \
+	local_dcache_range_flush((addr), (addr) + L1_CACHE_BYTES)
+#define local_dcache_block_inv(addr) \
+	local_dcache_range_inv((addr), (addr) + L1_CACHE_BYTES)
+#define local_icache_block_inv(addr) \
+	local_icache_range_inv((addr), (addr) + L1_CACHE_BYTES)
+
+/*
  * Synchronizes caches. Whenever a cpu writes executable code to memory, this
  * should be called to make sure the processor sees the newly written code.
  */
diff --git a/arch/openrisc/include/asm/cpuinfo.h b/arch/openrisc/include/asm/cpuinfo.h
index 5e4744153d0e..3cfc4cf0b019 100644
--- a/arch/openrisc/include/asm/cpuinfo.h
+++ b/arch/openrisc/include/asm/cpuinfo.h
@@ -15,16 +15,21 @@
 #ifndef __ASM_OPENRISC_CPUINFO_H
 #define __ASM_OPENRISC_CPUINFO_H
 
+#include <asm/spr.h>
+#include <asm/spr_defs.h>
+
+struct cache_desc {
+	u32 size;		/* total bytes: sets * ways * block_size */
+	u32 sets;		/* number of sets */
+	u32 block_size;		/* bytes per cache line */
+	u32 ways;		/* number of ways */
+};
+
 struct cpuinfo_or1k {
 	u32 clock_frequency;
 
-	u32 icache_size;
-	u32 icache_block_size;
-	u32 icache_ways;
-
-	u32 dcache_size;
-	u32 dcache_block_size;
-	u32 dcache_ways;
+	struct cache_desc icache;
+	struct cache_desc dcache;
 
 	u16 coreid;
 };
@@ -32,4 +37,9 @@ struct cpuinfo_or1k {
 extern struct cpuinfo_or1k cpuinfo_or1k[NR_CPUS];
 extern void setup_cpuinfo(void);
 
+/*
+ * Check if the cache component exists.
+ */
+extern bool cpu_cache_is_present(const unsigned int cache_type);
+
 #endif /* __ASM_OPENRISC_CPUINFO_H */
diff --git a/arch/openrisc/kernel/Makefile b/arch/openrisc/kernel/Makefile
index 79129161f3e0..e4c7d9bdd598 100644
--- a/arch/openrisc/kernel/Makefile
+++ b/arch/openrisc/kernel/Makefile
@@ -7,7 +7,7 @@ extra-y	:= vmlinux.lds
 
 obj-y	:= head.o setup.o or32_ksyms.o process.o dma.o \
 	   traps.o time.o irq.o entry.o ptrace.o signal.o \
-	   sys_call_table.o unwinder.o
+	   sys_call_table.o unwinder.o cacheinfo.o
 
 obj-$(CONFIG_SMP)		+= smp.o sync-timer.o
 obj-$(CONFIG_STACKTRACE)	+= stacktrace.o
diff --git a/arch/openrisc/kernel/cacheinfo.c b/arch/openrisc/kernel/cacheinfo.c
new file mode 100644
index 000000000000..61230545e4ff
--- /dev/null
+++ b/arch/openrisc/kernel/cacheinfo.c
@@ -0,0 +1,127 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * OpenRISC cacheinfo support
+ *
+ * Based on work done for MIPS and LoongArch. All original copyrights
+ * apply as per the original source declaration.
+ *
+ * OpenRISC implementation:
+ * Copyright (C) 2025 Sahil Siddiq <sahilcdq@proton.me>
+ */
+
+#include <linux/cacheinfo.h>
+#include <asm/cpuinfo.h>
+#include <asm/spr.h>
+#include <asm/spr_defs.h>
+
+/*
+ * Fill one cacheinfo leaf from the geometry in @cache and set @cpu in the
+ * leaf's shared_cpu_map.
+ */
+static inline void ci_leaf_init(struct cacheinfo *this_leaf, enum cache_type type,
+				unsigned int level, struct cache_desc *cache, int cpu)
+{
+	this_leaf->type = type;
+	this_leaf->level = level;
+	this_leaf->coherency_line_size = cache->block_size;
+	this_leaf->number_of_sets = cache->sets;
+	this_leaf->ways_of_associativity = cache->ways;
+	this_leaf->size = cache->size;
+	cpumask_set_cpu(cpu, &this_leaf->shared_cpu_map);
+}
+
+/*
+ * Read the cache configuration SPRs, store the dcache/icache geometry in
+ * cpuinfo_or1k[] and set the number of cache leaves and levels for @cpu.
+ *
+ * The cpuinfo_or1k[] slot is selected with smp_processor_id(), not @cpu.
+ *
+ * Returns 0 on success, or -ENOENT when SPR_UPR_UP is clear or when neither
+ * cache is present.
+ */
+int init_cache_level(unsigned int cpu)
+{
+	struct cpuinfo_or1k *cpuinfo = &cpuinfo_or1k[smp_processor_id()];
+	struct cpu_cacheinfo *this_cpu_ci = get_cpu_cacheinfo(cpu);
+	int leaves = 0, levels = 0;
+	unsigned long upr = mfspr(SPR_UPR);
+	unsigned long iccfgr, dccfgr;
+
+	if (!(upr & SPR_UPR_UP)) {
+		printk(KERN_INFO
+		       "-- no UPR register... unable to detect configuration\n");
+		return -ENOENT;
+	}
+
+	if (cpu_cache_is_present(SPR_UPR_DCP)) {
+		dccfgr = mfspr(SPR_DCCFGR);
+		cpuinfo->dcache.ways = 1 << (dccfgr & SPR_DCCFGR_NCW);
+		cpuinfo->dcache.sets = 1 << ((dccfgr & SPR_DCCFGR_NCS) >> 3);
+		cpuinfo->dcache.block_size = 16 << ((dccfgr & SPR_DCCFGR_CBS) >> 7);
+		cpuinfo->dcache.size =
+		    cpuinfo->dcache.sets * cpuinfo->dcache.ways * cpuinfo->dcache.block_size;
+		leaves += 1;
+		printk(KERN_INFO
+		       "-- dcache: %u bytes total, %u bytes/line, %u set(s), %u way(s)\n",
+		       cpuinfo->dcache.size, cpuinfo->dcache.block_size,
+		       cpuinfo->dcache.sets, cpuinfo->dcache.ways);
+	} else {
+		printk(KERN_INFO "-- dcache disabled\n");
+	}
+
+	if (cpu_cache_is_present(SPR_UPR_ICP)) {
+		iccfgr = mfspr(SPR_ICCFGR);
+		cpuinfo->icache.ways = 1 << (iccfgr & SPR_ICCFGR_NCW);
+		cpuinfo->icache.sets = 1 << ((iccfgr & SPR_ICCFGR_NCS) >> 3);
+		cpuinfo->icache.block_size = 16 << ((iccfgr & SPR_ICCFGR_CBS) >> 7);
+		cpuinfo->icache.size =
+		    cpuinfo->icache.sets * cpuinfo->icache.ways * cpuinfo->icache.block_size;
+		leaves += 1;
+		printk(KERN_INFO
+		       "-- icache: %u bytes total, %u bytes/line, %u set(s), %u way(s)\n",
+		       cpuinfo->icache.size, cpuinfo->icache.block_size,
+		       cpuinfo->icache.sets, cpuinfo->icache.ways);
+	} else {
+		printk(KERN_INFO "-- icache disabled\n");
+	}
+
+	if (!leaves)
+		return -ENOENT;
+
+	levels = 1;
+
+	this_cpu_ci->num_leaves = leaves;
+	this_cpu_ci->num_levels = levels;
+
+	return 0;
+}
+
+/*
+ * Initialise the cache leaves of @cpu: the dcache leaf first (when a dcache
+ * is present), then the icache leaf. The dcache leaf is marked write-back
+ * when SPR_DCCFGR_CWS is set and write-through otherwise. The geometry is
+ * taken from the cpuinfo_or1k[] slot selected with smp_processor_id().
+ *
+ * Always returns 0.
+ */
+int populate_cache_leaves(unsigned int cpu)
+{
+	struct cpuinfo_or1k *cpuinfo = &cpuinfo_or1k[smp_processor_id()];
+	struct cpu_cacheinfo *this_cpu_ci = get_cpu_cacheinfo(cpu);
+	struct cacheinfo *this_leaf = this_cpu_ci->info_list;
+	int level = 1;
+
+	if (cpu_cache_is_present(SPR_UPR_DCP)) {
+		ci_leaf_init(this_leaf, CACHE_TYPE_DATA, level, &cpuinfo->dcache, cpu);
+		this_leaf->attributes = ((mfspr(SPR_DCCFGR) & SPR_DCCFGR_CWS) >> 8) ?
+					CACHE_WRITE_BACK : CACHE_WRITE_THROUGH;
+		this_leaf++;
+	}
+
+	if (cpu_cache_is_present(SPR_UPR_ICP))
+		ci_leaf_init(this_leaf, CACHE_TYPE_INST, level, &cpuinfo->icache, cpu);
+
+	this_cpu_ci->cpu_map_populated = true;
+
+	return 0;
+}
diff --git a/arch/openrisc/kernel/dma.c b/arch/openrisc/kernel/dma.c
index b3edbb33b621..3a7b5baaa450 100644
--- a/arch/openrisc/kernel/dma.c
+++ b/arch/openrisc/kernel/dma.c
@@ -17,6 +17,7 @@
 #include <linux/pagewalk.h>
 
 #include <asm/cpuinfo.h>
+#include <asm/cacheflush.h>
 #include <asm/spr_defs.h>
 #include <asm/tlbflush.h>
 
@@ -24,9 +25,6 @@ static int
 page_set_nocache(pte_t *pte, unsigned long addr,
 		 unsigned long next, struct mm_walk *walk)
 {
-	unsigned long cl;
-	struct cpuinfo_or1k *cpuinfo = &cpuinfo_or1k[smp_processor_id()];
-
 	pte_val(*pte) |= _PAGE_CI;
 
 	/*
@@ -36,8 +34,7 @@ page_set_nocache(pte_t *pte, unsigned long addr,
 	flush_tlb_kernel_range(addr, addr + PAGE_SIZE);
 
 	/* Flush page out of dcache */
-	for (cl = __pa(addr); cl < __pa(next); cl += cpuinfo->dcache_block_size)
-		mtspr(SPR_DCBFR, cl);
+	local_dcache_range_flush(__pa(addr), __pa(next));
 
 	return 0;
 }
@@ -98,21 +95,14 @@ void arch_dma_clear_uncached(void *cpu_addr, size_t size)
 void arch_sync_dma_for_device(phys_addr_t addr, size_t size,
 		enum dma_data_direction dir)
 {
-	unsigned long cl;
-	struct cpuinfo_or1k *cpuinfo = &cpuinfo_or1k[smp_processor_id()];
-
 	switch (dir) {
 	case DMA_TO_DEVICE:
 		/* Flush the dcache for the requested range */
-		for (cl = addr; cl < addr + size;
-		     cl += cpuinfo->dcache_block_size)
-			mtspr(SPR_DCBFR, cl);
+		local_dcache_range_flush(addr, addr + size);
 		break;
 	case DMA_FROM_DEVICE:
 		/* Invalidate the dcache for the requested range */
-		for (cl = addr; cl < addr + size;
-		     cl += cpuinfo->dcache_block_size)
-			mtspr(SPR_DCBIR, cl);
+		local_dcache_range_inv(addr, addr + size);
 		break;
 	default:
 		/*
diff --git a/arch/openrisc/kernel/setup.c b/arch/openrisc/kernel/setup.c
index be56eaafc8b9..a9fb9cc6779e 100644
--- a/arch/openrisc/kernel/setup.c
+++ b/arch/openrisc/kernel/setup.c
@@ -113,21 +113,6 @@ static void print_cpuinfo(void)
 		return;
 	}
 
-	if (upr & SPR_UPR_DCP)
-		printk(KERN_INFO
-		       "-- dcache: %4d bytes total, %2d bytes/line, %d way(s)\n",
-		       cpuinfo->dcache_size, cpuinfo->dcache_block_size,
-		       cpuinfo->dcache_ways);
-	else
-		printk(KERN_INFO "-- dcache disabled\n");
-	if (upr & SPR_UPR_ICP)
-		printk(KERN_INFO
-		       "-- icache: %4d bytes total, %2d bytes/line, %d way(s)\n",
-		       cpuinfo->icache_size, cpuinfo->icache_block_size,
-		       cpuinfo->icache_ways);
-	else
-		printk(KERN_INFO "-- icache disabled\n");
-
 	if (upr & SPR_UPR_DMP)
 		printk(KERN_INFO "-- dmmu: %4d entries, %lu way(s)\n",
 		       1 << ((mfspr(SPR_DMMUCFGR) & SPR_DMMUCFGR_NTS) >> 2),
@@ -155,8 +140,6 @@ static void print_cpuinfo(void)
 void __init setup_cpuinfo(void)
 {
 	struct device_node *cpu;
-	unsigned long iccfgr, dccfgr;
-	unsigned long cache_set_size;
 	int cpu_id = smp_processor_id();
 	struct cpuinfo_or1k *cpuinfo = &cpuinfo_or1k[cpu_id];
 
@@ -164,20 +147,6 @@ void __init setup_cpuinfo(void)
 	if (!cpu)
 		panic("Couldn't find CPU%d in device tree...\n", cpu_id);
 
-	iccfgr = mfspr(SPR_ICCFGR);
-	cpuinfo->icache_ways = 1 << (iccfgr & SPR_ICCFGR_NCW);
-	cache_set_size = 1 << ((iccfgr & SPR_ICCFGR_NCS) >> 3);
-	cpuinfo->icache_block_size = 16 << ((iccfgr & SPR_ICCFGR_CBS) >> 7);
-	cpuinfo->icache_size =
-	    cache_set_size * cpuinfo->icache_ways * cpuinfo->icache_block_size;
-
-	dccfgr = mfspr(SPR_DCCFGR);
-	cpuinfo->dcache_ways = 1 << (dccfgr & SPR_DCCFGR_NCW);
-	cache_set_size = 1 << ((dccfgr & SPR_DCCFGR_NCS) >> 3);
-	cpuinfo->dcache_block_size = 16 << ((dccfgr & SPR_DCCFGR_CBS) >> 7);
-	cpuinfo->dcache_size =
-	    cache_set_size * cpuinfo->dcache_ways * cpuinfo->dcache_block_size;
-
 	if (of_property_read_u32(cpu, "clock-frequency",
 				 &cpuinfo->clock_frequency)) {
 		printk(KERN_WARNING
@@ -294,14 +263,14 @@ static int show_cpuinfo(struct seq_file *m, void *v)
 	unsigned int vr, cpucfgr;
 	unsigned int avr;
 	unsigned int version;
+#ifdef CONFIG_SMP
 	struct cpuinfo_or1k *cpuinfo = v;
+	seq_printf(m, "processor\t\t: %d\n", cpuinfo->coreid);
+#endif
 
 	vr = mfspr(SPR_VR);
 	cpucfgr = mfspr(SPR_CPUCFGR);
 
-#ifdef CONFIG_SMP
-	seq_printf(m, "processor\t\t: %d\n", cpuinfo->coreid);
-#endif
 	if (vr & SPR_VR_UVRP) {
 		vr = mfspr(SPR_VR2);
 		version = vr & SPR_VR2_VER;
@@ -320,14 +289,6 @@ static int show_cpuinfo(struct seq_file *m, void *v)
 		seq_printf(m, "revision\t\t: %d\n", vr & SPR_VR_REV);
 	}
 	seq_printf(m, "frequency\t\t: %ld\n", loops_per_jiffy * HZ);
-	seq_printf(m, "dcache size\t\t: %d bytes\n", cpuinfo->dcache_size);
-	seq_printf(m, "dcache block size\t: %d bytes\n",
-		   cpuinfo->dcache_block_size);
-	seq_printf(m, "dcache ways\t\t: %d\n", cpuinfo->dcache_ways);
-	seq_printf(m, "icache size\t\t: %d bytes\n", cpuinfo->icache_size);
-	seq_printf(m, "icache block size\t: %d bytes\n",
-		   cpuinfo->icache_block_size);
-	seq_printf(m, "icache ways\t\t: %d\n", cpuinfo->icache_ways);
 	seq_printf(m, "immu\t\t\t: %d entries, %lu ways\n",
 		   1 << ((mfspr(SPR_DMMUCFGR) & SPR_DMMUCFGR_NTS) >> 2),
 		   1 + (mfspr(SPR_DMMUCFGR) & SPR_DMMUCFGR_NTW));
diff --git a/arch/openrisc/mm/cache.c b/arch/openrisc/mm/cache.c
index eb43b73f3855..0f265b8e73ec 100644
--- a/arch/openrisc/mm/cache.c
+++ b/arch/openrisc/mm/cache.c
@@ -14,31 +14,70 @@
 #include <asm/spr_defs.h>
 #include <asm/cache.h>
 #include <asm/cacheflush.h>
+#include <asm/cpuinfo.h>
 #include <asm/tlbflush.h>
 
-static __always_inline void cache_loop(struct page *page, const unsigned int reg)
+/*
+ * Return true when SPR_UPR has both SPR_UPR_UP and @cache_type set.
+ */
+bool cpu_cache_is_present(const unsigned int cache_type)
 {
-	unsigned long paddr = page_to_pfn(page) << PAGE_SHIFT;
-	unsigned long line = paddr & ~(L1_CACHE_BYTES - 1);
+	const unsigned long required = SPR_UPR_UP | cache_type;
+	const unsigned long upr = mfspr(SPR_UPR);
+
+	return (upr & required) == required;
+}
+
+static __always_inline void cache_loop(unsigned long paddr, unsigned long end,
+				       const unsigned short reg, const unsigned int cache_type)
+{
+	if (!cpu_cache_is_present(cache_type))
+		return;
 
-	while (line < paddr + PAGE_SIZE) {
-		mtspr(reg, line);
-		line += L1_CACHE_BYTES;
+	/* Align down: an unaligned start could step past the last line */
+	paddr &= ~(L1_CACHE_BYTES - 1);
+	while (paddr < end) {
+		mtspr(reg, paddr);
+		paddr += L1_CACHE_BYTES;
 	}
 }
 
+static __always_inline void cache_loop_page(struct page *page, const unsigned short reg,
+					    const unsigned int cache_type)
+{
+	unsigned long paddr = page_to_pfn(page) << PAGE_SHIFT;
+	unsigned long end = paddr + PAGE_SIZE;
+
+	cache_loop(paddr, end, reg, cache_type);
+}
+
 void local_dcache_page_flush(struct page *page)
 {
-	cache_loop(page, SPR_DCBFR);
+	cache_loop_page(page, SPR_DCBFR, SPR_UPR_DCP);
 }
 EXPORT_SYMBOL(local_dcache_page_flush);
 
 void local_icache_page_inv(struct page *page)
 {
-	cache_loop(page, SPR_ICBIR);
+	cache_loop_page(page, SPR_ICBIR, SPR_UPR_ICP);
 }
 EXPORT_SYMBOL(local_icache_page_inv);
 
+void local_dcache_range_flush(unsigned long start, unsigned long end)
+{
+	cache_loop(start, end, SPR_DCBFR, SPR_UPR_DCP);
+}
+
+void local_dcache_range_inv(unsigned long start, unsigned long end)
+{
+	cache_loop(start, end, SPR_DCBIR, SPR_UPR_DCP);
+}
+
+void local_icache_range_inv(unsigned long start, unsigned long end)
+{
+	cache_loop(start, end, SPR_ICBIR, SPR_UPR_ICP);
+}
+
 void update_cache(struct vm_area_struct *vma, unsigned long address,
 	pte_t *pte)
 {
@@ -58,4 +97,3 @@ void update_cache(struct vm_area_struct *vma, unsigned long address,
 			sync_icache_dcache(folio_page(folio, nr));
 	}
 }
-
diff --git a/arch/openrisc/mm/init.c b/arch/openrisc/mm/init.c
index be1c2eb8bb94..e4904ca6f0a0 100644
--- a/arch/openrisc/mm/init.c
+++ b/arch/openrisc/mm/init.c
@@ -35,6 +35,7 @@
 #include <asm/fixmap.h>
 #include <asm/tlbflush.h>
 #include <asm/sections.h>
+#include <asm/cacheflush.h>
 
 int mem_init_done;
 
@@ -176,8 +177,8 @@ void __init paging_init(void)
 	barrier();
 
 	/* Invalidate instruction caches after code modification */
-	mtspr(SPR_ICBIR, 0x900);
-	mtspr(SPR_ICBIR, 0xa00);
+	local_icache_block_inv(0x900);
+	local_icache_block_inv(0xa00);
 
 	/* New TLB miss handlers and kernel page tables are in now place.
 	 * Make sure that page flags get updated for all pages in TLB by
diff --git a/arch/parisc/configs/generic-32bit_defconfig b/arch/parisc/configs/generic-32bit_defconfig
index 5b65c9859613..94928d114d4c 100644
--- a/arch/parisc/configs/generic-32bit_defconfig
+++ b/arch/parisc/configs/generic-32bit_defconfig
@@ -251,7 +251,7 @@ CONFIG_CIFS=m
 CONFIG_CIFS_XATTR=y
 CONFIG_CIFS_POSIX=y
 # CONFIG_CIFS_DEBUG is not set
-CONFIG_CRYPTO_TEST=m
+CONFIG_CRYPTO_BENCHMARK=m
 CONFIG_CRYPTO_BLOWFISH=m
 CONFIG_CRYPTO_CAST5=m
 CONFIG_CRYPTO_CAST6=m
diff --git a/arch/parisc/configs/generic-64bit_defconfig b/arch/parisc/configs/generic-64bit_defconfig
index ecc9ffcc11cd..d8cd7f858b2a 100644
--- a/arch/parisc/configs/generic-64bit_defconfig
+++ b/arch/parisc/configs/generic-64bit_defconfig
@@ -283,7 +283,6 @@ CONFIG_NLS_ASCII=m
 CONFIG_NLS_ISO8859_1=m
 CONFIG_NLS_ISO8859_2=m
 CONFIG_NLS_UTF8=m
-CONFIG_CRYPTO_MANAGER=y
 CONFIG_CRYPTO_FCRYPT=m
 CONFIG_CRYPTO_ECB=m
 CONFIG_CRYPTO_PCBC=m
diff --git a/arch/parisc/math-emu/driver.c b/arch/parisc/math-emu/driver.c
index 34495446e051..71829cb7bc81 100644
--- a/arch/parisc/math-emu/driver.c
+++ b/arch/parisc/math-emu/driver.c
@@ -97,9 +97,19 @@ handle_fpe(struct pt_regs *regs)
 
 	memcpy(regs->fr, frcopy, sizeof regs->fr);
 	if (signalcode != 0) {
-	    force_sig_fault(signalcode >> 24, signalcode & 0xffffff,
-			    (void __user *) regs->iaoq[0]);
-	    return -1;
+		int sig = signalcode >> 24;
+
+		if (sig == SIGFPE) {
+			/*
+			 * Clear floating point trap bit to avoid trapping
+			 * again on the first floating-point instruction in
+			 * the userspace signal handler.
+			 */
+			regs->fr[0] &= ~(1ULL << 38);
+		}
+		force_sig_fault(sig, signalcode & 0xffffff,
+				(void __user *) regs->iaoq[0]);
+		return -1;
 	}
 
 	return signalcode ? -1 : 0;
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 6722625a406a..c3e0cc83f120 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -277,6 +277,7 @@ config PPC
 	select HAVE_PERF_EVENTS_NMI		if PPC64
 	select HAVE_PERF_REGS
 	select HAVE_PERF_USER_STACK_DUMP
+	select HAVE_PREEMPT_DYNAMIC_KEY
 	select HAVE_RETHOOK			if KPROBES
 	select HAVE_REGS_AND_STACK_ACCESS_API
 	select HAVE_RELIABLE_STACKTRACE
@@ -894,7 +895,7 @@ config DATA_SHIFT
 	int "Data shift" if DATA_SHIFT_BOOL
 	default 24 if STRICT_KERNEL_RWX && PPC64
 	range 17 28 if (STRICT_KERNEL_RWX || DEBUG_PAGEALLOC || KFENCE) && PPC_BOOK3S_32
-	range 19 23 if (STRICT_KERNEL_RWX || DEBUG_PAGEALLOC || KFENCE) && PPC_8xx
+	range 14 23 if (STRICT_KERNEL_RWX || DEBUG_PAGEALLOC || KFENCE) && PPC_8xx
 	range 20 24 if (STRICT_KERNEL_RWX || DEBUG_PAGEALLOC || KFENCE) && PPC_85xx
 	default 22 if STRICT_KERNEL_RWX && PPC_BOOK3S_32
 	default 18 if (DEBUG_PAGEALLOC || KFENCE) && PPC_BOOK3S_32
@@ -907,10 +908,10 @@ config DATA_SHIFT
 	  On Book3S 32 (603+), DBATs are used to map kernel text and rodata RO.
 	  Smaller is the alignment, greater is the number of necessary DBATs.
 
-	  On 8xx, large pages (512kb or 8M) are used to map kernel linear
-	  memory. Aligning to 8M reduces TLB misses as only 8M pages are used
-	  in that case. If PIN_TLB is selected, it must be aligned to 8M as
-	  8M pages will be pinned.
+	  On 8xx, large pages (16kb or 512kb or 8M) are used to map kernel
+	  linear memory. Aligning to 8M reduces TLB misses as only 8M pages
+	  are used in that case. If PIN_TLB is selected, it must be aligned
+	  to 8M as 8M pages will be pinned.
 
 config ARCH_FORCE_MAX_ORDER
 	int "Order of maximal physically contiguous allocations"
diff --git a/arch/powerpc/boot/Makefile b/arch/powerpc/boot/Makefile
index 184d0680e661..a7ab087d412c 100644
--- a/arch/powerpc/boot/Makefile
+++ b/arch/powerpc/boot/Makefile
@@ -70,6 +70,7 @@ BOOTCPPFLAGS	:= -nostdinc $(LINUXINCLUDE)
 BOOTCPPFLAGS	+= -isystem $(shell $(BOOTCC) -print-file-name=include)
 
 BOOTCFLAGS	:= $(BOOTTARGETFLAGS) \
+		   -std=gnu11 \
 		   -Wall -Wundef -Wstrict-prototypes -Wno-trigraphs \
 		   -fno-strict-aliasing -O2 \
 		   -msoft-float -mno-altivec -mno-vsx \
diff --git a/arch/powerpc/boot/rs6000.h b/arch/powerpc/boot/rs6000.h
index a9d879155ef9..16df8f3c43f1 100644
--- a/arch/powerpc/boot/rs6000.h
+++ b/arch/powerpc/boot/rs6000.h
@@ -1,11 +1,7 @@
 /* SPDX-License-Identifier: GPL-2.0 */
 /* IBM RS/6000 "XCOFF" file definitions for BFD.
    Copyright (C) 1990, 1991 Free Software Foundation, Inc.
-   FIXME: Can someone provide a transliteration of this name into ASCII?
-   Using the following chars caused a compiler warning on HIUX (so I replaced
-   them with octal escapes), and isn't useful without an understanding of what
-   character set it is.
-   Written by Mimi Ph\373\364ng-Th\345o V\365 of IBM
+   Written by Mimi Phuong-Thao Vo of IBM
    and John Gilmore of Cygnus Support.  */
 
 /********************** FILE HEADER **********************/
diff --git a/arch/powerpc/boot/wrapper b/arch/powerpc/boot/wrapper
index 1db60fe13802..3d8dc822282a 100755
--- a/arch/powerpc/boot/wrapper
+++ b/arch/powerpc/boot/wrapper
@@ -234,10 +234,8 @@ fi
 
 # suppress some warnings in recent ld versions
 nowarn="-z noexecstack"
-if ! ld_is_lld; then
-	if [ "$LD_VERSION" -ge "$(echo 2.39 | ld_version)" ]; then
-		nowarn="$nowarn --no-warn-rwx-segments"
-	fi
+if "${CROSS}ld" -v --no-warn-rwx-segments >/dev/null 2>&1; then
+	nowarn="$nowarn --no-warn-rwx-segments"
 fi
 
 platformo=$object/"$platform".o
diff --git a/arch/powerpc/configs/g5_defconfig b/arch/powerpc/configs/g5_defconfig
index 7e58f3e6c987..428f17b45513 100644
--- a/arch/powerpc/configs/g5_defconfig
+++ b/arch/powerpc/configs/g5_defconfig
@@ -235,7 +235,7 @@ CONFIG_MAGIC_SYSRQ=y
 CONFIG_DEBUG_KERNEL=y
 CONFIG_DEBUG_MUTEXES=y
 CONFIG_BOOTX_TEXT=y
-CONFIG_CRYPTO_TEST=m
+CONFIG_CRYPTO_BENCHMARK=m
 CONFIG_CRYPTO_PCBC=m
 CONFIG_CRYPTO_HMAC=y
 CONFIG_CRYPTO_MICHAEL_MIC=m
diff --git a/arch/powerpc/configs/powernv_defconfig b/arch/powerpc/configs/powernv_defconfig
index 6b6d7467fecf..379229c982a4 100644
--- a/arch/powerpc/configs/powernv_defconfig
+++ b/arch/powerpc/configs/powernv_defconfig
@@ -317,7 +317,7 @@ CONFIG_CODE_PATCHING_SELFTEST=y
 CONFIG_FTR_FIXUP_SELFTEST=y
 CONFIG_MSI_BITMAP_SELFTEST=y
 CONFIG_XMON=y
-CONFIG_CRYPTO_TEST=m
+CONFIG_CRYPTO_BENCHMARK=m
 CONFIG_CRYPTO_PCBC=m
 CONFIG_CRYPTO_HMAC=y
 CONFIG_CRYPTO_MD5_PPC=m
diff --git a/arch/powerpc/configs/ppc64_defconfig b/arch/powerpc/configs/ppc64_defconfig
index 5fa154185efa..3423c405cad4 100644
--- a/arch/powerpc/configs/ppc64_defconfig
+++ b/arch/powerpc/configs/ppc64_defconfig
@@ -377,7 +377,7 @@ CONFIG_IMA_WRITE_POLICY=y
 CONFIG_IMA_APPRAISE=y
 CONFIG_IMA_ARCH_POLICY=y
 CONFIG_IMA_APPRAISE_MODSIG=y
-CONFIG_CRYPTO_TEST=m
+CONFIG_CRYPTO_BENCHMARK=m
 CONFIG_CRYPTO_BLOWFISH=m
 CONFIG_CRYPTO_CAST6=m
 CONFIG_CRYPTO_SERPENT=m
diff --git a/arch/powerpc/configs/ppc64e_defconfig b/arch/powerpc/configs/ppc64e_defconfig
index d2e659a2d8cb..90247b2a0ab0 100644
--- a/arch/powerpc/configs/ppc64e_defconfig
+++ b/arch/powerpc/configs/ppc64e_defconfig
@@ -220,7 +220,7 @@ CONFIG_CODE_PATCHING_SELFTEST=y
 CONFIG_FTR_FIXUP_SELFTEST=y
 CONFIG_MSI_BITMAP_SELFTEST=y
 CONFIG_XMON=y
-CONFIG_CRYPTO_TEST=m
+CONFIG_CRYPTO_BENCHMARK=m
 CONFIG_CRYPTO_CCM=m
 CONFIG_CRYPTO_GCM=m
 CONFIG_CRYPTO_PCBC=m
diff --git a/arch/powerpc/configs/ppc6xx_defconfig b/arch/powerpc/configs/ppc6xx_defconfig
index a91a766b71a4..242c1fab9d46 100644
--- a/arch/powerpc/configs/ppc6xx_defconfig
+++ b/arch/powerpc/configs/ppc6xx_defconfig
@@ -1073,7 +1073,7 @@ CONFIG_SECURITY_NETWORK_XFRM=y
 CONFIG_SECURITY_SELINUX=y
 CONFIG_SECURITY_SELINUX_BOOTPARAM=y
 CONFIG_SECURITY_SELINUX_DISABLE=y
-CONFIG_CRYPTO_TEST=m
+CONFIG_CRYPTO_BENCHMARK=m
 CONFIG_CRYPTO_CTS=m
 CONFIG_CRYPTO_LRW=m
 CONFIG_CRYPTO_PCBC=m
diff --git a/arch/powerpc/crypto/Kconfig b/arch/powerpc/crypto/Kconfig
index 370db8192ce6..caaa359f4742 100644
--- a/arch/powerpc/crypto/Kconfig
+++ b/arch/powerpc/crypto/Kconfig
@@ -17,7 +17,6 @@ config CRYPTO_CURVE25519_PPC64
 
 config CRYPTO_MD5_PPC
 	tristate "Digests: MD5"
-	depends on PPC
 	select CRYPTO_HASH
 	help
 	  MD5 message digest algorithm (RFC1321)
@@ -26,7 +25,6 @@ config CRYPTO_MD5_PPC
 
 config CRYPTO_SHA1_PPC
 	tristate "Hash functions: SHA-1"
-	depends on PPC
 	help
 	  SHA-1 secure hash algorithm (FIPS 180)
 
@@ -34,27 +32,16 @@ config CRYPTO_SHA1_PPC
 
 config CRYPTO_SHA1_PPC_SPE
 	tristate "Hash functions: SHA-1 (SPE)"
-	depends on PPC && SPE
+	depends on SPE
 	help
 	  SHA-1 secure hash algorithm (FIPS 180)
 
 	  Architecture: powerpc using
 	  - SPE (Signal Processing Engine) extensions
 
-config CRYPTO_SHA256_PPC_SPE
-	tristate "Hash functions: SHA-224 and SHA-256 (SPE)"
-	depends on PPC && SPE
-	select CRYPTO_SHA256
-	select CRYPTO_HASH
-	help
-	  SHA-224 and SHA-256 secure hash algorithms (FIPS 180)
-
-	  Architecture: powerpc using
-	  - SPE (Signal Processing Engine) extensions
-
 config CRYPTO_AES_PPC_SPE
 	tristate "Ciphers: AES, modes: ECB/CBC/CTR/XTS (SPE)"
-	depends on PPC && SPE
+	depends on SPE
 	select CRYPTO_SKCIPHER
 	help
 	  Block ciphers: AES cipher algorithms (FIPS-197)
@@ -92,33 +79,6 @@ config CRYPTO_AES_GCM_P10
 	  Support for cryptographic acceleration instructions on Power10 or
 	  later CPU. This module supports stitched acceleration for AES/GCM.
 
-config CRYPTO_CHACHA20_P10
-	tristate
-	depends on PPC64 && CPU_LITTLE_ENDIAN && VSX
-	select CRYPTO_SKCIPHER
-	select CRYPTO_LIB_CHACHA_GENERIC
-	select CRYPTO_ARCH_HAVE_LIB_CHACHA
-	default CRYPTO_LIB_CHACHA_INTERNAL
-	help
-	  Length-preserving ciphers: ChaCha20, XChaCha20, and XChaCha12
-	  stream cipher algorithms
-
-	  Architecture: PowerPC64
-	  - Power10 or later
-	  - Little-endian
-
-config CRYPTO_POLY1305_P10
-	tristate "Hash functions: Poly1305 (P10 or later)"
-	depends on PPC64 && CPU_LITTLE_ENDIAN && VSX
-	select CRYPTO_HASH
-	select CRYPTO_LIB_POLY1305_GENERIC
-	help
-	  Poly1305 authenticator algorithm (RFC7539)
-
-	  Architecture: PowerPC64
-	  - Power10 or later
-	  - Little-endian
-
 config CRYPTO_DEV_VMX
         bool "Support for VMX cryptographic acceleration instructions"
         depends on PPC64 && VSX
diff --git a/arch/powerpc/crypto/Makefile b/arch/powerpc/crypto/Makefile
index 2f00b22b0823..8c2936ae466f 100644
--- a/arch/powerpc/crypto/Makefile
+++ b/arch/powerpc/crypto/Makefile
@@ -9,10 +9,7 @@ obj-$(CONFIG_CRYPTO_AES_PPC_SPE) += aes-ppc-spe.o
 obj-$(CONFIG_CRYPTO_MD5_PPC) += md5-ppc.o
 obj-$(CONFIG_CRYPTO_SHA1_PPC) += sha1-powerpc.o
 obj-$(CONFIG_CRYPTO_SHA1_PPC_SPE) += sha1-ppc-spe.o
-obj-$(CONFIG_CRYPTO_SHA256_PPC_SPE) += sha256-ppc-spe.o
 obj-$(CONFIG_CRYPTO_AES_GCM_P10) += aes-gcm-p10-crypto.o
-obj-$(CONFIG_CRYPTO_CHACHA20_P10) += chacha-p10-crypto.o
-obj-$(CONFIG_CRYPTO_POLY1305_P10) += poly1305-p10-crypto.o
 obj-$(CONFIG_CRYPTO_DEV_VMX_ENCRYPT) += vmx-crypto.o
 obj-$(CONFIG_CRYPTO_CURVE25519_PPC64) += curve25519-ppc64le.o
 
@@ -20,10 +17,7 @@ aes-ppc-spe-y := aes-spe-core.o aes-spe-keys.o aes-tab-4k.o aes-spe-modes.o aes-
 md5-ppc-y := md5-asm.o md5-glue.o
 sha1-powerpc-y := sha1-powerpc-asm.o sha1.o
 sha1-ppc-spe-y := sha1-spe-asm.o sha1-spe-glue.o
-sha256-ppc-spe-y := sha256-spe-asm.o sha256-spe-glue.o
 aes-gcm-p10-crypto-y := aes-gcm-p10-glue.o aes-gcm-p10.o ghashp10-ppc.o aesp10-ppc.o
-chacha-p10-crypto-y := chacha-p10-glue.o chacha-p10le-8x.o
-poly1305-p10-crypto-y := poly1305-p10-glue.o poly1305-p10le_64.o
 vmx-crypto-objs := vmx.o aesp8-ppc.o ghashp8-ppc.o aes.o aes_cbc.o aes_ctr.o aes_xts.o ghash.o
 curve25519-ppc64le-y := curve25519-ppc64le-core.o curve25519-ppc64le_asm.o
 
diff --git a/arch/powerpc/crypto/aes.c b/arch/powerpc/crypto/aes.c
index ec06189fbf99..3f1e5e894902 100644
--- a/arch/powerpc/crypto/aes.c
+++ b/arch/powerpc/crypto/aes.c
@@ -7,15 +7,15 @@
  * Author: Marcelo Henrique Cerri <mhcerri@br.ibm.com>
  */
 
-#include <linux/types.h>
-#include <linux/err.h>
-#include <linux/crypto.h>
-#include <linux/delay.h>
 #include <asm/simd.h>
 #include <asm/switch_to.h>
 #include <crypto/aes.h>
 #include <crypto/internal/cipher.h>
 #include <crypto/internal/simd.h>
+#include <linux/err.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/uaccess.h>
 
 #include "aesp8-ppc.h"
 
diff --git a/arch/powerpc/crypto/aes_cbc.c b/arch/powerpc/crypto/aes_cbc.c
index ed0debc7acb5..5f2a4f375eef 100644
--- a/arch/powerpc/crypto/aes_cbc.c
+++ b/arch/powerpc/crypto/aes_cbc.c
@@ -12,6 +12,10 @@
 #include <crypto/aes.h>
 #include <crypto/internal/simd.h>
 #include <crypto/internal/skcipher.h>
+#include <linux/err.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/uaccess.h>
 
 #include "aesp8-ppc.h"
 
diff --git a/arch/powerpc/crypto/aes_ctr.c b/arch/powerpc/crypto/aes_ctr.c
index 3da75f42529a..e27c4036e711 100644
--- a/arch/powerpc/crypto/aes_ctr.c
+++ b/arch/powerpc/crypto/aes_ctr.c
@@ -12,6 +12,10 @@
 #include <crypto/aes.h>
 #include <crypto/internal/simd.h>
 #include <crypto/internal/skcipher.h>
+#include <linux/err.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/uaccess.h>
 
 #include "aesp8-ppc.h"
 
diff --git a/arch/powerpc/crypto/aes_xts.c b/arch/powerpc/crypto/aes_xts.c
index dabbccb41550..9440e771cede 100644
--- a/arch/powerpc/crypto/aes_xts.c
+++ b/arch/powerpc/crypto/aes_xts.c
@@ -13,6 +13,10 @@
 #include <crypto/internal/simd.h>
 #include <crypto/internal/skcipher.h>
 #include <crypto/xts.h>
+#include <linux/err.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/uaccess.h>
 
 #include "aesp8-ppc.h"
 
diff --git a/arch/powerpc/crypto/chacha-p10-glue.c b/arch/powerpc/crypto/chacha-p10-glue.c
deleted file mode 100644
index d8796decc1fb..000000000000
--- a/arch/powerpc/crypto/chacha-p10-glue.c
+++ /dev/null
@@ -1,221 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * PowerPC P10 (ppc64le) accelerated ChaCha and XChaCha stream ciphers,
- * including ChaCha20 (RFC7539)
- *
- * Copyright 2023- IBM Corp. All rights reserved.
- */
-
-#include <crypto/algapi.h>
-#include <crypto/internal/chacha.h>
-#include <crypto/internal/simd.h>
-#include <crypto/internal/skcipher.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/cpufeature.h>
-#include <linux/sizes.h>
-#include <asm/simd.h>
-#include <asm/switch_to.h>
-
-asmlinkage void chacha_p10le_8x(u32 *state, u8 *dst, const u8 *src,
-				unsigned int len, int nrounds);
-
-static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_p10);
-
-static void vsx_begin(void)
-{
-	preempt_disable();
-	enable_kernel_vsx();
-}
-
-static void vsx_end(void)
-{
-	disable_kernel_vsx();
-	preempt_enable();
-}
-
-static void chacha_p10_do_8x(u32 *state, u8 *dst, const u8 *src,
-			     unsigned int bytes, int nrounds)
-{
-	unsigned int l = bytes & ~0x0FF;
-
-	if (l > 0) {
-		chacha_p10le_8x(state, dst, src, l, nrounds);
-		bytes -= l;
-		src += l;
-		dst += l;
-		state[12] += l / CHACHA_BLOCK_SIZE;
-	}
-
-	if (bytes > 0)
-		chacha_crypt_generic(state, dst, src, bytes, nrounds);
-}
-
-void hchacha_block_arch(const u32 *state, u32 *stream, int nrounds)
-{
-	hchacha_block_generic(state, stream, nrounds);
-}
-EXPORT_SYMBOL(hchacha_block_arch);
-
-void chacha_crypt_arch(u32 *state, u8 *dst, const u8 *src, unsigned int bytes,
-		       int nrounds)
-{
-	if (!static_branch_likely(&have_p10) || bytes <= CHACHA_BLOCK_SIZE ||
-	    !crypto_simd_usable())
-		return chacha_crypt_generic(state, dst, src, bytes, nrounds);
-
-	do {
-		unsigned int todo = min_t(unsigned int, bytes, SZ_4K);
-
-		vsx_begin();
-		chacha_p10_do_8x(state, dst, src, todo, nrounds);
-		vsx_end();
-
-		bytes -= todo;
-		src += todo;
-		dst += todo;
-	} while (bytes);
-}
-EXPORT_SYMBOL(chacha_crypt_arch);
-
-static int chacha_p10_stream_xor(struct skcipher_request *req,
-				 const struct chacha_ctx *ctx, const u8 *iv)
-{
-	struct skcipher_walk walk;
-	u32 state[16];
-	int err;
-
-	err = skcipher_walk_virt(&walk, req, false);
-	if (err)
-		return err;
-
-	chacha_init(state, ctx->key, iv);
-
-	while (walk.nbytes > 0) {
-		unsigned int nbytes = walk.nbytes;
-
-		if (nbytes < walk.total)
-			nbytes = rounddown(nbytes, walk.stride);
-
-		if (!crypto_simd_usable()) {
-			chacha_crypt_generic(state, walk.dst.virt.addr,
-					     walk.src.virt.addr, nbytes,
-					     ctx->nrounds);
-		} else {
-			vsx_begin();
-			chacha_p10_do_8x(state, walk.dst.virt.addr,
-				      walk.src.virt.addr, nbytes, ctx->nrounds);
-			vsx_end();
-		}
-		err = skcipher_walk_done(&walk, walk.nbytes - nbytes);
-		if (err)
-			break;
-	}
-
-	return err;
-}
-
-static int chacha_p10(struct skcipher_request *req)
-{
-	struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
-	struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm);
-
-	return chacha_p10_stream_xor(req, ctx, req->iv);
-}
-
-static int xchacha_p10(struct skcipher_request *req)
-{
-	struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
-	struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm);
-	struct chacha_ctx subctx;
-	u32 state[16];
-	u8 real_iv[16];
-
-	chacha_init(state, ctx->key, req->iv);
-	hchacha_block_arch(state, subctx.key, ctx->nrounds);
-	subctx.nrounds = ctx->nrounds;
-
-	memcpy(&real_iv[0], req->iv + 24, 8);
-	memcpy(&real_iv[8], req->iv + 16, 8);
-	return chacha_p10_stream_xor(req, &subctx, real_iv);
-}
-
-static struct skcipher_alg algs[] = {
-	{
-		.base.cra_name		= "chacha20",
-		.base.cra_driver_name	= "chacha20-p10",
-		.base.cra_priority	= 300,
-		.base.cra_blocksize	= 1,
-		.base.cra_ctxsize	= sizeof(struct chacha_ctx),
-		.base.cra_module	= THIS_MODULE,
-
-		.min_keysize		= CHACHA_KEY_SIZE,
-		.max_keysize		= CHACHA_KEY_SIZE,
-		.ivsize			= CHACHA_IV_SIZE,
-		.chunksize		= CHACHA_BLOCK_SIZE,
-		.setkey			= chacha20_setkey,
-		.encrypt		= chacha_p10,
-		.decrypt		= chacha_p10,
-	}, {
-		.base.cra_name		= "xchacha20",
-		.base.cra_driver_name	= "xchacha20-p10",
-		.base.cra_priority	= 300,
-		.base.cra_blocksize	= 1,
-		.base.cra_ctxsize	= sizeof(struct chacha_ctx),
-		.base.cra_module	= THIS_MODULE,
-
-		.min_keysize		= CHACHA_KEY_SIZE,
-		.max_keysize		= CHACHA_KEY_SIZE,
-		.ivsize			= XCHACHA_IV_SIZE,
-		.chunksize		= CHACHA_BLOCK_SIZE,
-		.setkey			= chacha20_setkey,
-		.encrypt		= xchacha_p10,
-		.decrypt		= xchacha_p10,
-	}, {
-		.base.cra_name		= "xchacha12",
-		.base.cra_driver_name	= "xchacha12-p10",
-		.base.cra_priority	= 300,
-		.base.cra_blocksize	= 1,
-		.base.cra_ctxsize	= sizeof(struct chacha_ctx),
-		.base.cra_module	= THIS_MODULE,
-
-		.min_keysize		= CHACHA_KEY_SIZE,
-		.max_keysize		= CHACHA_KEY_SIZE,
-		.ivsize			= XCHACHA_IV_SIZE,
-		.chunksize		= CHACHA_BLOCK_SIZE,
-		.setkey			= chacha12_setkey,
-		.encrypt		= xchacha_p10,
-		.decrypt		= xchacha_p10,
-	}
-};
-
-static int __init chacha_p10_init(void)
-{
-	if (!cpu_has_feature(CPU_FTR_ARCH_31))
-		return 0;
-
-	static_branch_enable(&have_p10);
-
-	return crypto_register_skciphers(algs, ARRAY_SIZE(algs));
-}
-
-static void __exit chacha_p10_exit(void)
-{
-	if (!static_branch_likely(&have_p10))
-		return;
-
-	crypto_unregister_skciphers(algs, ARRAY_SIZE(algs));
-}
-
-module_init(chacha_p10_init);
-module_exit(chacha_p10_exit);
-
-MODULE_DESCRIPTION("ChaCha and XChaCha stream ciphers (P10 accelerated)");
-MODULE_AUTHOR("Danny Tsen <dtsen@linux.ibm.com>");
-MODULE_LICENSE("GPL v2");
-MODULE_ALIAS_CRYPTO("chacha20");
-MODULE_ALIAS_CRYPTO("chacha20-p10");
-MODULE_ALIAS_CRYPTO("xchacha20");
-MODULE_ALIAS_CRYPTO("xchacha20-p10");
-MODULE_ALIAS_CRYPTO("xchacha12");
-MODULE_ALIAS_CRYPTO("xchacha12-p10");
diff --git a/arch/powerpc/crypto/ghash.c b/arch/powerpc/crypto/ghash.c
index 77eca20bc7ac..7308735bdb33 100644
--- a/arch/powerpc/crypto/ghash.c
+++ b/arch/powerpc/crypto/ghash.c
@@ -11,19 +11,18 @@
  *   Copyright (C) 2014 - 2018 Linaro Ltd. <ard.biesheuvel@linaro.org>
  */
 
-#include <linux/types.h>
-#include <linux/err.h>
-#include <linux/crypto.h>
-#include <linux/delay.h>
-#include <asm/simd.h>
+#include "aesp8-ppc.h"
 #include <asm/switch_to.h>
 #include <crypto/aes.h>
+#include <crypto/gf128mul.h>
 #include <crypto/ghash.h>
-#include <crypto/scatterwalk.h>
 #include <crypto/internal/hash.h>
 #include <crypto/internal/simd.h>
-#include <crypto/b128ops.h>
-#include "aesp8-ppc.h"
+#include <linux/err.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/string.h>
+#include <linux/uaccess.h>
 
 void gcm_init_p8(u128 htable[16], const u64 Xi[2]);
 void gcm_gmult_p8(u64 Xi[2], const u128 htable[16]);
@@ -39,15 +38,12 @@ struct p8_ghash_ctx {
 
 struct p8_ghash_desc_ctx {
 	u64 shash[2];
-	u8 buffer[GHASH_DIGEST_SIZE];
-	int bytes;
 };
 
 static int p8_ghash_init(struct shash_desc *desc)
 {
 	struct p8_ghash_desc_ctx *dctx = shash_desc_ctx(desc);
 
-	dctx->bytes = 0;
 	memset(dctx->shash, 0, GHASH_DIGEST_SIZE);
 	return 0;
 }
@@ -74,27 +70,30 @@ static int p8_ghash_setkey(struct crypto_shash *tfm, const u8 *key,
 }
 
 static inline void __ghash_block(struct p8_ghash_ctx *ctx,
-				 struct p8_ghash_desc_ctx *dctx)
+				 struct p8_ghash_desc_ctx *dctx,
+				 const u8 *src)
 {
 	if (crypto_simd_usable()) {
 		preempt_disable();
 		pagefault_disable();
 		enable_kernel_vsx();
-		gcm_ghash_p8(dctx->shash, ctx->htable,
-				dctx->buffer, GHASH_DIGEST_SIZE);
+		gcm_ghash_p8(dctx->shash, ctx->htable, src, GHASH_BLOCK_SIZE);
 		disable_kernel_vsx();
 		pagefault_enable();
 		preempt_enable();
 	} else {
-		crypto_xor((u8 *)dctx->shash, dctx->buffer, GHASH_BLOCK_SIZE);
+		crypto_xor((u8 *)dctx->shash, src, GHASH_BLOCK_SIZE);
 		gf128mul_lle((be128 *)dctx->shash, &ctx->key);
 	}
 }
 
-static inline void __ghash_blocks(struct p8_ghash_ctx *ctx,
-				  struct p8_ghash_desc_ctx *dctx,
-				  const u8 *src, unsigned int srclen)
+static inline int __ghash_blocks(struct p8_ghash_ctx *ctx,
+				 struct p8_ghash_desc_ctx *dctx,
+				 const u8 *src, unsigned int srclen)
 {
+	int remain = srclen - round_down(srclen, GHASH_BLOCK_SIZE);
+
+	srclen -= remain;
 	if (crypto_simd_usable()) {
 		preempt_disable();
 		pagefault_disable();
@@ -105,62 +104,38 @@ static inline void __ghash_blocks(struct p8_ghash_ctx *ctx,
 		pagefault_enable();
 		preempt_enable();
 	} else {
-		while (srclen >= GHASH_BLOCK_SIZE) {
+		do {
 			crypto_xor((u8 *)dctx->shash, src, GHASH_BLOCK_SIZE);
 			gf128mul_lle((be128 *)dctx->shash, &ctx->key);
 			srclen -= GHASH_BLOCK_SIZE;
 			src += GHASH_BLOCK_SIZE;
-		}
+		} while (srclen);
 	}
+
+	return remain;
 }
 
 static int p8_ghash_update(struct shash_desc *desc,
 			   const u8 *src, unsigned int srclen)
 {
-	unsigned int len;
 	struct p8_ghash_ctx *ctx = crypto_tfm_ctx(crypto_shash_tfm(desc->tfm));
 	struct p8_ghash_desc_ctx *dctx = shash_desc_ctx(desc);
 
-	if (dctx->bytes) {
-		if (dctx->bytes + srclen < GHASH_DIGEST_SIZE) {
-			memcpy(dctx->buffer + dctx->bytes, src,
-				srclen);
-			dctx->bytes += srclen;
-			return 0;
-		}
-		memcpy(dctx->buffer + dctx->bytes, src,
-			GHASH_DIGEST_SIZE - dctx->bytes);
-
-		__ghash_block(ctx, dctx);
-
-		src += GHASH_DIGEST_SIZE - dctx->bytes;
-		srclen -= GHASH_DIGEST_SIZE - dctx->bytes;
-		dctx->bytes = 0;
-	}
-	len = srclen & ~(GHASH_DIGEST_SIZE - 1);
-	if (len) {
-		__ghash_blocks(ctx, dctx, src, len);
-		src += len;
-		srclen -= len;
-	}
-	if (srclen) {
-		memcpy(dctx->buffer, src, srclen);
-		dctx->bytes = srclen;
-	}
-	return 0;
+	return __ghash_blocks(ctx, dctx, src, srclen);
 }
 
-static int p8_ghash_final(struct shash_desc *desc, u8 *out)
+static int p8_ghash_finup(struct shash_desc *desc, const u8 *src,
+			  unsigned int len, u8 *out)
 {
-	int i;
 	struct p8_ghash_ctx *ctx = crypto_tfm_ctx(crypto_shash_tfm(desc->tfm));
 	struct p8_ghash_desc_ctx *dctx = shash_desc_ctx(desc);
 
-	if (dctx->bytes) {
-		for (i = dctx->bytes; i < GHASH_DIGEST_SIZE; i++)
-			dctx->buffer[i] = 0;
-		__ghash_block(ctx, dctx);
-		dctx->bytes = 0;
+	if (len) {
+		u8 buf[GHASH_BLOCK_SIZE] = {};
+
+		memcpy(buf, src, len);
+		__ghash_block(ctx, dctx, buf);
+		memzero_explicit(buf, sizeof(buf));
 	}
 	memcpy(out, dctx->shash, GHASH_DIGEST_SIZE);
 	return 0;
@@ -170,14 +145,14 @@ struct shash_alg p8_ghash_alg = {
 	.digestsize = GHASH_DIGEST_SIZE,
 	.init = p8_ghash_init,
 	.update = p8_ghash_update,
-	.final = p8_ghash_final,
+	.finup = p8_ghash_finup,
 	.setkey = p8_ghash_setkey,
-	.descsize = sizeof(struct p8_ghash_desc_ctx)
-		+ sizeof(struct ghash_desc_ctx),
+	.descsize = sizeof(struct p8_ghash_desc_ctx),
 	.base = {
 		 .cra_name = "ghash",
 		 .cra_driver_name = "p8_ghash",
 		 .cra_priority = 1000,
+		 .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY,
 		 .cra_blocksize = GHASH_BLOCK_SIZE,
 		 .cra_ctxsize = sizeof(struct p8_ghash_ctx),
 		 .cra_module = THIS_MODULE,
diff --git a/arch/powerpc/crypto/md5-glue.c b/arch/powerpc/crypto/md5-glue.c
index c24f605033bd..204440a90cd8 100644
--- a/arch/powerpc/crypto/md5-glue.c
+++ b/arch/powerpc/crypto/md5-glue.c
@@ -8,25 +8,13 @@
  */
 
 #include <crypto/internal/hash.h>
-#include <linux/init.h>
-#include <linux/module.h>
-#include <linux/mm.h>
-#include <linux/types.h>
 #include <crypto/md5.h>
-#include <asm/byteorder.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/string.h>
 
 extern void ppc_md5_transform(u32 *state, const u8 *src, u32 blocks);
 
-static inline void ppc_md5_clear_context(struct md5_state *sctx)
-{
-	int count = sizeof(struct md5_state) >> 2;
-	u32 *ptr = (u32 *)sctx;
-
-	/* make sure we can clear the fast way */
-	BUILD_BUG_ON(sizeof(struct md5_state) % 4);
-	do { *ptr++ = 0; } while (--count);
-}
-
 static int ppc_md5_init(struct shash_desc *desc)
 {
 	struct md5_state *sctx = shash_desc_ctx(desc);
@@ -44,79 +32,34 @@ static int ppc_md5_update(struct shash_desc *desc, const u8 *data,
 			unsigned int len)
 {
 	struct md5_state *sctx = shash_desc_ctx(desc);
-	const unsigned int offset = sctx->byte_count & 0x3f;
-	unsigned int avail = 64 - offset;
-	const u8 *src = data;
 
-	sctx->byte_count += len;
-
-	if (avail > len) {
-		memcpy((char *)sctx->block + offset, src, len);
-		return 0;
-	}
-
-	if (offset) {
-		memcpy((char *)sctx->block + offset, src, avail);
-		ppc_md5_transform(sctx->hash, (const u8 *)sctx->block, 1);
-		len -= avail;
-		src += avail;
-	}
-
-	if (len > 63) {
-		ppc_md5_transform(sctx->hash, src, len >> 6);
-		src += len & ~0x3f;
-		len &= 0x3f;
-	}
-
-	memcpy((char *)sctx->block, src, len);
-	return 0;
+	sctx->byte_count += round_down(len, MD5_HMAC_BLOCK_SIZE);
+	ppc_md5_transform(sctx->hash, data, len >> 6);
+	return len - round_down(len, MD5_HMAC_BLOCK_SIZE);
 }
 
-static int ppc_md5_final(struct shash_desc *desc, u8 *out)
+static int ppc_md5_finup(struct shash_desc *desc, const u8 *src,
+			 unsigned int offset, u8 *out)
 {
 	struct md5_state *sctx = shash_desc_ctx(desc);
-	const unsigned int offset = sctx->byte_count & 0x3f;
-	const u8 *src = (const u8 *)sctx->block;
-	u8 *p = (u8 *)src + offset;
-	int padlen = 55 - offset;
-	__le64 *pbits = (__le64 *)((char *)sctx->block + 56);
+	__le64 block[MD5_BLOCK_WORDS] = {};
+	u8 *p = memcpy(block, src, offset);
 	__le32 *dst = (__le32 *)out;
+	__le64 *pbits;
 
+	src = p;
+	p += offset;
 	*p++ = 0x80;
-
-	if (padlen < 0) {
-		memset(p, 0x00, padlen + sizeof (u64));
-		ppc_md5_transform(sctx->hash, src, 1);
-		p = (char *)sctx->block;
-		padlen = 56;
-	}
-
-	memset(p, 0, padlen);
+	sctx->byte_count += offset;
+	pbits = &block[(MD5_BLOCK_WORDS / (offset > 55 ? 1 : 2)) - 1];
 	*pbits = cpu_to_le64(sctx->byte_count << 3);
-	ppc_md5_transform(sctx->hash, src, 1);
+	ppc_md5_transform(sctx->hash, src, (pbits - block + 1) / 8);
+	memzero_explicit(block, sizeof(block));
 
 	dst[0] = cpu_to_le32(sctx->hash[0]);
 	dst[1] = cpu_to_le32(sctx->hash[1]);
 	dst[2] = cpu_to_le32(sctx->hash[2]);
 	dst[3] = cpu_to_le32(sctx->hash[3]);
-
-	ppc_md5_clear_context(sctx);
-	return 0;
-}
-
-static int ppc_md5_export(struct shash_desc *desc, void *out)
-{
-	struct md5_state *sctx = shash_desc_ctx(desc);
-
-	memcpy(out, sctx, sizeof(*sctx));
-	return 0;
-}
-
-static int ppc_md5_import(struct shash_desc *desc, const void *in)
-{
-	struct md5_state *sctx = shash_desc_ctx(desc);
-
-	memcpy(sctx, in, sizeof(*sctx));
 	return 0;
 }
 
@@ -124,15 +67,13 @@ static struct shash_alg alg = {
 	.digestsize	=	MD5_DIGEST_SIZE,
 	.init		=	ppc_md5_init,
 	.update		=	ppc_md5_update,
-	.final		=	ppc_md5_final,
-	.export		=	ppc_md5_export,
-	.import		=	ppc_md5_import,
-	.descsize	=	sizeof(struct md5_state),
-	.statesize	=	sizeof(struct md5_state),
+	.finup		=	ppc_md5_finup,
+	.descsize	=	MD5_STATE_SIZE,
 	.base		=	{
 		.cra_name	=	"md5",
 		.cra_driver_name=	"md5-ppc",
 		.cra_priority	=	200,
+		.cra_flags	=	CRYPTO_AHASH_ALG_BLOCK_ONLY,
 		.cra_blocksize	=	MD5_HMAC_BLOCK_SIZE,
 		.cra_module	=	THIS_MODULE,
 	}
diff --git a/arch/powerpc/crypto/poly1305-p10-glue.c b/arch/powerpc/crypto/poly1305-p10-glue.c
deleted file mode 100644
index 369686e9370b..000000000000
--- a/arch/powerpc/crypto/poly1305-p10-glue.c
+++ /dev/null
@@ -1,186 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Poly1305 authenticator algorithm, RFC7539.
- *
- * Copyright 2023- IBM Corp. All rights reserved.
- */
-
-#include <crypto/algapi.h>
-#include <linux/crypto.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/jump_label.h>
-#include <crypto/internal/hash.h>
-#include <crypto/internal/poly1305.h>
-#include <crypto/internal/simd.h>
-#include <linux/cpufeature.h>
-#include <linux/unaligned.h>
-#include <asm/simd.h>
-#include <asm/switch_to.h>
-
-asmlinkage void poly1305_p10le_4blocks(void *h, const u8 *m, u32 mlen);
-asmlinkage void poly1305_64s(void *h, const u8 *m, u32 mlen, int highbit);
-asmlinkage void poly1305_emit_64(void *h, void *s, u8 *dst);
-
-static void vsx_begin(void)
-{
-	preempt_disable();
-	enable_kernel_vsx();
-}
-
-static void vsx_end(void)
-{
-	disable_kernel_vsx();
-	preempt_enable();
-}
-
-static int crypto_poly1305_p10_init(struct shash_desc *desc)
-{
-	struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
-
-	poly1305_core_init(&dctx->h);
-	dctx->buflen = 0;
-	dctx->rset = 0;
-	dctx->sset = false;
-
-	return 0;
-}
-
-static unsigned int crypto_poly1305_setdctxkey(struct poly1305_desc_ctx *dctx,
-					       const u8 *inp, unsigned int len)
-{
-	unsigned int acc = 0;
-
-	if (unlikely(!dctx->sset)) {
-		if (!dctx->rset && len >= POLY1305_BLOCK_SIZE) {
-			struct poly1305_core_key *key = &dctx->core_r;
-
-			key->key.r64[0] = get_unaligned_le64(&inp[0]);
-			key->key.r64[1] = get_unaligned_le64(&inp[8]);
-			inp += POLY1305_BLOCK_SIZE;
-			len -= POLY1305_BLOCK_SIZE;
-			acc += POLY1305_BLOCK_SIZE;
-			dctx->rset = 1;
-		}
-		if (len >= POLY1305_BLOCK_SIZE) {
-			dctx->s[0] = get_unaligned_le32(&inp[0]);
-			dctx->s[1] = get_unaligned_le32(&inp[4]);
-			dctx->s[2] = get_unaligned_le32(&inp[8]);
-			dctx->s[3] = get_unaligned_le32(&inp[12]);
-			acc += POLY1305_BLOCK_SIZE;
-			dctx->sset = true;
-		}
-	}
-	return acc;
-}
-
-static int crypto_poly1305_p10_update(struct shash_desc *desc,
-				      const u8 *src, unsigned int srclen)
-{
-	struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
-	unsigned int bytes, used;
-
-	if (unlikely(dctx->buflen)) {
-		bytes = min(srclen, POLY1305_BLOCK_SIZE - dctx->buflen);
-		memcpy(dctx->buf + dctx->buflen, src, bytes);
-		src += bytes;
-		srclen -= bytes;
-		dctx->buflen += bytes;
-
-		if (dctx->buflen == POLY1305_BLOCK_SIZE) {
-			if (likely(!crypto_poly1305_setdctxkey(dctx, dctx->buf,
-							       POLY1305_BLOCK_SIZE))) {
-				vsx_begin();
-				poly1305_64s(&dctx->h, dctx->buf,
-						  POLY1305_BLOCK_SIZE, 1);
-				vsx_end();
-			}
-			dctx->buflen = 0;
-		}
-	}
-
-	if (likely(srclen >= POLY1305_BLOCK_SIZE)) {
-		bytes = round_down(srclen, POLY1305_BLOCK_SIZE);
-		used = crypto_poly1305_setdctxkey(dctx, src, bytes);
-		if (likely(used)) {
-			srclen -= used;
-			src += used;
-		}
-		if (crypto_simd_usable() && (srclen >= POLY1305_BLOCK_SIZE*4)) {
-			vsx_begin();
-			poly1305_p10le_4blocks(&dctx->h, src, srclen);
-			vsx_end();
-			src += srclen - (srclen % (POLY1305_BLOCK_SIZE * 4));
-			srclen %= POLY1305_BLOCK_SIZE * 4;
-		}
-		while (srclen >= POLY1305_BLOCK_SIZE) {
-			vsx_begin();
-			poly1305_64s(&dctx->h, src, POLY1305_BLOCK_SIZE, 1);
-			vsx_end();
-			srclen -= POLY1305_BLOCK_SIZE;
-			src += POLY1305_BLOCK_SIZE;
-		}
-	}
-
-	if (unlikely(srclen)) {
-		dctx->buflen = srclen;
-		memcpy(dctx->buf, src, srclen);
-	}
-
-	return 0;
-}
-
-static int crypto_poly1305_p10_final(struct shash_desc *desc, u8 *dst)
-{
-	struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
-
-	if (unlikely(!dctx->sset))
-		return -ENOKEY;
-
-	if ((dctx->buflen)) {
-		dctx->buf[dctx->buflen++] = 1;
-		memset(dctx->buf + dctx->buflen, 0,
-		       POLY1305_BLOCK_SIZE - dctx->buflen);
-		vsx_begin();
-		poly1305_64s(&dctx->h, dctx->buf, POLY1305_BLOCK_SIZE, 0);
-		vsx_end();
-		dctx->buflen = 0;
-	}
-
-	poly1305_emit_64(&dctx->h, &dctx->s, dst);
-	return 0;
-}
-
-static struct shash_alg poly1305_alg = {
-	.digestsize	= POLY1305_DIGEST_SIZE,
-	.init		= crypto_poly1305_p10_init,
-	.update		= crypto_poly1305_p10_update,
-	.final		= crypto_poly1305_p10_final,
-	.descsize	= sizeof(struct poly1305_desc_ctx),
-	.base		= {
-		.cra_name		= "poly1305",
-		.cra_driver_name	= "poly1305-p10",
-		.cra_priority		= 300,
-		.cra_blocksize		= POLY1305_BLOCK_SIZE,
-		.cra_module		= THIS_MODULE,
-	},
-};
-
-static int __init poly1305_p10_init(void)
-{
-	return crypto_register_shash(&poly1305_alg);
-}
-
-static void __exit poly1305_p10_exit(void)
-{
-	crypto_unregister_shash(&poly1305_alg);
-}
-
-module_cpu_feature_match(PPC_MODULE_FEATURE_P10, poly1305_p10_init);
-module_exit(poly1305_p10_exit);
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Danny Tsen <dtsen@linux.ibm.com>");
-MODULE_DESCRIPTION("Optimized Poly1305 for P10");
-MODULE_ALIAS_CRYPTO("poly1305");
-MODULE_ALIAS_CRYPTO("poly1305-p10");
diff --git a/arch/powerpc/crypto/sha1-spe-glue.c b/arch/powerpc/crypto/sha1-spe-glue.c
index 9170892a8557..04c88e173ce1 100644
--- a/arch/powerpc/crypto/sha1-spe-glue.c
+++ b/arch/powerpc/crypto/sha1-spe-glue.c
@@ -7,16 +7,13 @@
  * Copyright (c) 2015 Markus Stockhausen <stockhausen@collogia.de>
  */
 
+#include <asm/switch_to.h>
 #include <crypto/internal/hash.h>
-#include <linux/init.h>
-#include <linux/module.h>
-#include <linux/mm.h>
-#include <linux/types.h>
 #include <crypto/sha1.h>
 #include <crypto/sha1_base.h>
-#include <asm/byteorder.h>
-#include <asm/switch_to.h>
-#include <linux/hardirq.h>
+#include <linux/kernel.h>
+#include <linux/preempt.h>
+#include <linux/module.h>
 
 /*
  * MAX_BYTES defines the number of bytes that are allowed to be processed
@@ -30,7 +27,7 @@
  */
 #define MAX_BYTES 2048
 
-extern void ppc_spe_sha1_transform(u32 *state, const u8 *src, u32 blocks);
+asmlinkage void ppc_spe_sha1_transform(u32 *state, const u8 *src, u32 blocks);
 
 static void spe_begin(void)
 {
@@ -46,126 +43,45 @@ static void spe_end(void)
 	preempt_enable();
 }
 
-static inline void ppc_sha1_clear_context(struct sha1_state *sctx)
+static void ppc_spe_sha1_block(struct sha1_state *sctx, const u8 *src,
+			       int blocks)
 {
-	int count = sizeof(struct sha1_state) >> 2;
-	u32 *ptr = (u32 *)sctx;
-
-	/* make sure we can clear the fast way */
-	BUILD_BUG_ON(sizeof(struct sha1_state) % 4);
-	do { *ptr++ = 0; } while (--count);
-}
-
-static int ppc_spe_sha1_update(struct shash_desc *desc, const u8 *data,
-			unsigned int len)
-{
-	struct sha1_state *sctx = shash_desc_ctx(desc);
-	const unsigned int offset = sctx->count & 0x3f;
-	const unsigned int avail = 64 - offset;
-	unsigned int bytes;
-	const u8 *src = data;
-
-	if (avail > len) {
-		sctx->count += len;
-		memcpy((char *)sctx->buffer + offset, src, len);
-		return 0;
-	}
-
-	sctx->count += len;
-
-	if (offset) {
-		memcpy((char *)sctx->buffer + offset, src, avail);
+	do {
+		int unit = min(blocks, MAX_BYTES / SHA1_BLOCK_SIZE);
 
 		spe_begin();
-		ppc_spe_sha1_transform(sctx->state, (const u8 *)sctx->buffer, 1);
+		ppc_spe_sha1_transform(sctx->state, src, unit);
 		spe_end();
 
-		len -= avail;
-		src += avail;
-	}
-
-	while (len > 63) {
-		bytes = (len > MAX_BYTES) ? MAX_BYTES : len;
-		bytes = bytes & ~0x3f;
-
-		spe_begin();
-		ppc_spe_sha1_transform(sctx->state, src, bytes >> 6);
-		spe_end();
-
-		src += bytes;
-		len -= bytes;
-	}
-
-	memcpy((char *)sctx->buffer, src, len);
-	return 0;
-}
-
-static int ppc_spe_sha1_final(struct shash_desc *desc, u8 *out)
-{
-	struct sha1_state *sctx = shash_desc_ctx(desc);
-	const unsigned int offset = sctx->count & 0x3f;
-	char *p = (char *)sctx->buffer + offset;
-	int padlen;
-	__be64 *pbits = (__be64 *)(((char *)&sctx->buffer) + 56);
-	__be32 *dst = (__be32 *)out;
-
-	padlen = 55 - offset;
-	*p++ = 0x80;
-
-	spe_begin();
-
-	if (padlen < 0) {
-		memset(p, 0x00, padlen + sizeof (u64));
-		ppc_spe_sha1_transform(sctx->state, sctx->buffer, 1);
-		p = (char *)sctx->buffer;
-		padlen = 56;
-	}
-
-	memset(p, 0, padlen);
-	*pbits = cpu_to_be64(sctx->count << 3);
-	ppc_spe_sha1_transform(sctx->state, sctx->buffer, 1);
-
-	spe_end();
-
-	dst[0] = cpu_to_be32(sctx->state[0]);
-	dst[1] = cpu_to_be32(sctx->state[1]);
-	dst[2] = cpu_to_be32(sctx->state[2]);
-	dst[3] = cpu_to_be32(sctx->state[3]);
-	dst[4] = cpu_to_be32(sctx->state[4]);
-
-	ppc_sha1_clear_context(sctx);
-	return 0;
+		src += unit * SHA1_BLOCK_SIZE;
+		blocks -= unit;
+	} while (blocks);
 }
 
-static int ppc_spe_sha1_export(struct shash_desc *desc, void *out)
+static int ppc_spe_sha1_update(struct shash_desc *desc, const u8 *data,
+			unsigned int len)
 {
-	struct sha1_state *sctx = shash_desc_ctx(desc);
-
-	memcpy(out, sctx, sizeof(*sctx));
-	return 0;
+	return sha1_base_do_update_blocks(desc, data, len, ppc_spe_sha1_block);
 }
 
-static int ppc_spe_sha1_import(struct shash_desc *desc, const void *in)
+static int ppc_spe_sha1_finup(struct shash_desc *desc, const u8 *src,
+			      unsigned int len, u8 *out)
 {
-	struct sha1_state *sctx = shash_desc_ctx(desc);
-
-	memcpy(sctx, in, sizeof(*sctx));
-	return 0;
+	sha1_base_do_finup(desc, src, len, ppc_spe_sha1_block);
+	return sha1_base_finish(desc, out);
 }
 
 static struct shash_alg alg = {
 	.digestsize	=	SHA1_DIGEST_SIZE,
 	.init		=	sha1_base_init,
 	.update		=	ppc_spe_sha1_update,
-	.final		=	ppc_spe_sha1_final,
-	.export		=	ppc_spe_sha1_export,
-	.import		=	ppc_spe_sha1_import,
-	.descsize	=	sizeof(struct sha1_state),
-	.statesize	=	sizeof(struct sha1_state),
+	.finup		=	ppc_spe_sha1_finup,
+	.descsize	=	SHA1_STATE_SIZE,
 	.base		=	{
 		.cra_name	=	"sha1",
 		.cra_driver_name=	"sha1-ppc-spe",
 		.cra_priority	=	300,
+		.cra_flags	=	CRYPTO_AHASH_ALG_BLOCK_ONLY,
 		.cra_blocksize	=	SHA1_BLOCK_SIZE,
 		.cra_module	=	THIS_MODULE,
 	}
diff --git a/arch/powerpc/crypto/sha1.c b/arch/powerpc/crypto/sha1.c
index f283bbd3f121..4593946aa9b3 100644
--- a/arch/powerpc/crypto/sha1.c
+++ b/arch/powerpc/crypto/sha1.c
@@ -13,107 +13,46 @@
  * Copyright (c) Jean-Francois Dive <jef@linuxbe.org>
  */
 #include <crypto/internal/hash.h>
-#include <linux/init.h>
-#include <linux/module.h>
-#include <linux/mm.h>
-#include <linux/types.h>
 #include <crypto/sha1.h>
 #include <crypto/sha1_base.h>
-#include <asm/byteorder.h>
-
-void powerpc_sha_transform(u32 *state, const u8 *src);
-
-static int powerpc_sha1_update(struct shash_desc *desc, const u8 *data,
-			       unsigned int len)
-{
-	struct sha1_state *sctx = shash_desc_ctx(desc);
-	unsigned int partial, done;
-	const u8 *src;
-
-	partial = sctx->count & 0x3f;
-	sctx->count += len;
-	done = 0;
-	src = data;
-
-	if ((partial + len) > 63) {
-
-		if (partial) {
-			done = -partial;
-			memcpy(sctx->buffer + partial, data, done + 64);
-			src = sctx->buffer;
-		}
-
-		do {
-			powerpc_sha_transform(sctx->state, src);
-			done += 64;
-			src = data + done;
-		} while (done + 63 < len);
-
-		partial = 0;
-	}
-	memcpy(sctx->buffer + partial, src, len - done);
-
-	return 0;
-}
+#include <linux/kernel.h>
+#include <linux/module.h>
 
+asmlinkage void powerpc_sha_transform(u32 *state, const u8 *src);
 
-/* Add padding and return the message digest. */
-static int powerpc_sha1_final(struct shash_desc *desc, u8 *out)
+static void powerpc_sha_block(struct sha1_state *sctx, const u8 *data,
+			      int blocks)
 {
-	struct sha1_state *sctx = shash_desc_ctx(desc);
-	__be32 *dst = (__be32 *)out;
-	u32 i, index, padlen;
-	__be64 bits;
-	static const u8 padding[64] = { 0x80, };
-
-	bits = cpu_to_be64(sctx->count << 3);
-
-	/* Pad out to 56 mod 64 */
-	index = sctx->count & 0x3f;
-	padlen = (index < 56) ? (56 - index) : ((64+56) - index);
-	powerpc_sha1_update(desc, padding, padlen);
-
-	/* Append length */
-	powerpc_sha1_update(desc, (const u8 *)&bits, sizeof(bits));
-
-	/* Store state in digest */
-	for (i = 0; i < 5; i++)
-		dst[i] = cpu_to_be32(sctx->state[i]);
-
-	/* Wipe context */
-	memset(sctx, 0, sizeof *sctx);
-
-	return 0;
+	do {
+		powerpc_sha_transform(sctx->state, data);
+		data += 64;
+	} while (--blocks);
 }
 
-static int powerpc_sha1_export(struct shash_desc *desc, void *out)
+static int powerpc_sha1_update(struct shash_desc *desc, const u8 *data,
+			       unsigned int len)
 {
-	struct sha1_state *sctx = shash_desc_ctx(desc);
-
-	memcpy(out, sctx, sizeof(*sctx));
-	return 0;
+	return sha1_base_do_update_blocks(desc, data, len, powerpc_sha_block);
 }
 
-static int powerpc_sha1_import(struct shash_desc *desc, const void *in)
+/* Add padding and return the message digest. */
+static int powerpc_sha1_finup(struct shash_desc *desc, const u8 *src,
+			      unsigned int len, u8 *out)
 {
-	struct sha1_state *sctx = shash_desc_ctx(desc);
-
-	memcpy(sctx, in, sizeof(*sctx));
-	return 0;
+	sha1_base_do_finup(desc, src, len, powerpc_sha_block);
+	return sha1_base_finish(desc, out);
 }
 
 static struct shash_alg alg = {
 	.digestsize	=	SHA1_DIGEST_SIZE,
 	.init		=	sha1_base_init,
 	.update		=	powerpc_sha1_update,
-	.final		=	powerpc_sha1_final,
-	.export		=	powerpc_sha1_export,
-	.import		=	powerpc_sha1_import,
-	.descsize	=	sizeof(struct sha1_state),
-	.statesize	=	sizeof(struct sha1_state),
+	.finup		=	powerpc_sha1_finup,
+	.descsize	=	SHA1_STATE_SIZE,
 	.base		=	{
 		.cra_name	=	"sha1",
 		.cra_driver_name=	"sha1-powerpc",
+		.cra_flags	=	CRYPTO_AHASH_ALG_BLOCK_ONLY,
 		.cra_blocksize	=	SHA1_BLOCK_SIZE,
 		.cra_module	=	THIS_MODULE,
 	}
diff --git a/arch/powerpc/crypto/sha256-spe-glue.c b/arch/powerpc/crypto/sha256-spe-glue.c
deleted file mode 100644
index 2997d13236e0..000000000000
--- a/arch/powerpc/crypto/sha256-spe-glue.c
+++ /dev/null
@@ -1,235 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * Glue code for SHA-256 implementation for SPE instructions (PPC)
- *
- * Based on generic implementation. The assembler module takes care 
- * about the SPE registers so it can run from interrupt context.
- *
- * Copyright (c) 2015 Markus Stockhausen <stockhausen@collogia.de>
- */
-
-#include <crypto/internal/hash.h>
-#include <linux/init.h>
-#include <linux/module.h>
-#include <linux/mm.h>
-#include <linux/types.h>
-#include <crypto/sha2.h>
-#include <crypto/sha256_base.h>
-#include <asm/byteorder.h>
-#include <asm/switch_to.h>
-#include <linux/hardirq.h>
-
-/*
- * MAX_BYTES defines the number of bytes that are allowed to be processed
- * between preempt_disable() and preempt_enable(). SHA256 takes ~2,000
- * operations per 64 bytes. e500 cores can issue two arithmetic instructions
- * per clock cycle using one 32/64 bit unit (SU1) and one 32 bit unit (SU2).
- * Thus 1KB of input data will need an estimated maximum of 18,000 cycles.
- * Headroom for cache misses included. Even with the low end model clocked
- * at 667 MHz this equals to a critical time window of less than 27us.
- *
- */
-#define MAX_BYTES 1024
-
-extern void ppc_spe_sha256_transform(u32 *state, const u8 *src, u32 blocks);
-
-static void spe_begin(void)
-{
-	/* We just start SPE operations and will save SPE registers later. */
-	preempt_disable();
-	enable_kernel_spe();
-}
-
-static void spe_end(void)
-{
-	disable_kernel_spe();
-	/* reenable preemption */
-	preempt_enable();
-}
-
-static inline void ppc_sha256_clear_context(struct sha256_state *sctx)
-{
-	int count = sizeof(struct sha256_state) >> 2;
-	u32 *ptr = (u32 *)sctx;
-
-	/* make sure we can clear the fast way */
-	BUILD_BUG_ON(sizeof(struct sha256_state) % 4);
-	do { *ptr++ = 0; } while (--count);
-}
-
-static int ppc_spe_sha256_update(struct shash_desc *desc, const u8 *data,
-			unsigned int len)
-{
-	struct sha256_state *sctx = shash_desc_ctx(desc);
-	const unsigned int offset = sctx->count & 0x3f;
-	const unsigned int avail = 64 - offset;
-	unsigned int bytes;
-	const u8 *src = data;
-
-	if (avail > len) {
-		sctx->count += len;
-		memcpy((char *)sctx->buf + offset, src, len);
-		return 0;
-	}
-
-	sctx->count += len;
-
-	if (offset) {
-		memcpy((char *)sctx->buf + offset, src, avail);
-
-		spe_begin();
-		ppc_spe_sha256_transform(sctx->state, (const u8 *)sctx->buf, 1);
-		spe_end();
-
-		len -= avail;
-		src += avail;
-	}
-
-	while (len > 63) {
-		/* cut input data into smaller blocks */
-		bytes = (len > MAX_BYTES) ? MAX_BYTES : len;
-		bytes = bytes & ~0x3f;
-
-		spe_begin();
-		ppc_spe_sha256_transform(sctx->state, src, bytes >> 6);
-		spe_end();
-
-		src += bytes;
-		len -= bytes;
-	}
-
-	memcpy((char *)sctx->buf, src, len);
-	return 0;
-}
-
-static int ppc_spe_sha256_final(struct shash_desc *desc, u8 *out)
-{
-	struct sha256_state *sctx = shash_desc_ctx(desc);
-	const unsigned int offset = sctx->count & 0x3f;
-	char *p = (char *)sctx->buf + offset;
-	int padlen;
-	__be64 *pbits = (__be64 *)(((char *)&sctx->buf) + 56);
-	__be32 *dst = (__be32 *)out;
-
-	padlen = 55 - offset;
-	*p++ = 0x80;
-
-	spe_begin();
-
-	if (padlen < 0) {
-		memset(p, 0x00, padlen + sizeof (u64));
-		ppc_spe_sha256_transform(sctx->state, sctx->buf, 1);
-		p = (char *)sctx->buf;
-		padlen = 56;
-	}
-
-	memset(p, 0, padlen);
-	*pbits = cpu_to_be64(sctx->count << 3);
-	ppc_spe_sha256_transform(sctx->state, sctx->buf, 1);
-
-	spe_end();
-
-	dst[0] = cpu_to_be32(sctx->state[0]);
-	dst[1] = cpu_to_be32(sctx->state[1]);
-	dst[2] = cpu_to_be32(sctx->state[2]);
-	dst[3] = cpu_to_be32(sctx->state[3]);
-	dst[4] = cpu_to_be32(sctx->state[4]);
-	dst[5] = cpu_to_be32(sctx->state[5]);
-	dst[6] = cpu_to_be32(sctx->state[6]);
-	dst[7] = cpu_to_be32(sctx->state[7]);
-
-	ppc_sha256_clear_context(sctx);
-	return 0;
-}
-
-static int ppc_spe_sha224_final(struct shash_desc *desc, u8 *out)
-{
-	__be32 D[SHA256_DIGEST_SIZE >> 2];
-	__be32 *dst = (__be32 *)out;
-
-	ppc_spe_sha256_final(desc, (u8 *)D);
-
-	/* avoid bytewise memcpy */
-	dst[0] = D[0];
-	dst[1] = D[1];
-	dst[2] = D[2];
-	dst[3] = D[3];
-	dst[4] = D[4];
-	dst[5] = D[5];
-	dst[6] = D[6];
-
-	/* clear sensitive data */
-	memzero_explicit(D, SHA256_DIGEST_SIZE);
-	return 0;
-}
-
-static int ppc_spe_sha256_export(struct shash_desc *desc, void *out)
-{
-	struct sha256_state *sctx = shash_desc_ctx(desc);
-
-	memcpy(out, sctx, sizeof(*sctx));
-	return 0;
-}
-
-static int ppc_spe_sha256_import(struct shash_desc *desc, const void *in)
-{
-	struct sha256_state *sctx = shash_desc_ctx(desc);
-
-	memcpy(sctx, in, sizeof(*sctx));
-	return 0;
-}
-
-static struct shash_alg algs[2] = { {
-	.digestsize	=	SHA256_DIGEST_SIZE,
-	.init		=	sha256_base_init,
-	.update		=	ppc_spe_sha256_update,
-	.final		=	ppc_spe_sha256_final,
-	.export		=	ppc_spe_sha256_export,
-	.import		=	ppc_spe_sha256_import,
-	.descsize	=	sizeof(struct sha256_state),
-	.statesize	=	sizeof(struct sha256_state),
-	.base		=	{
-		.cra_name	=	"sha256",
-		.cra_driver_name=	"sha256-ppc-spe",
-		.cra_priority	=	300,
-		.cra_blocksize	=	SHA256_BLOCK_SIZE,
-		.cra_module	=	THIS_MODULE,
-	}
-}, {
-	.digestsize	=	SHA224_DIGEST_SIZE,
-	.init		=	sha224_base_init,
-	.update		=	ppc_spe_sha256_update,
-	.final		=	ppc_spe_sha224_final,
-	.export		=	ppc_spe_sha256_export,
-	.import		=	ppc_spe_sha256_import,
-	.descsize	=	sizeof(struct sha256_state),
-	.statesize	=	sizeof(struct sha256_state),
-	.base		=	{
-		.cra_name	=	"sha224",
-		.cra_driver_name=	"sha224-ppc-spe",
-		.cra_priority	=	300,
-		.cra_blocksize	=	SHA224_BLOCK_SIZE,
-		.cra_module	=	THIS_MODULE,
-	}
-} };
-
-static int __init ppc_spe_sha256_mod_init(void)
-{
-	return crypto_register_shashes(algs, ARRAY_SIZE(algs));
-}
-
-static void __exit ppc_spe_sha256_mod_fini(void)
-{
-	crypto_unregister_shashes(algs, ARRAY_SIZE(algs));
-}
-
-module_init(ppc_spe_sha256_mod_init);
-module_exit(ppc_spe_sha256_mod_fini);
-
-MODULE_LICENSE("GPL");
-MODULE_DESCRIPTION("SHA-224 and SHA-256 Secure Hash Algorithm, SPE optimized");
-
-MODULE_ALIAS_CRYPTO("sha224");
-MODULE_ALIAS_CRYPTO("sha224-ppc-spe");
-MODULE_ALIAS_CRYPTO("sha256");
-MODULE_ALIAS_CRYPTO("sha256-ppc-spe");
diff --git a/arch/powerpc/include/asm/guest-state-buffer.h b/arch/powerpc/include/asm/guest-state-buffer.h
index d107abe1468f..acd61eb36d59 100644
--- a/arch/powerpc/include/asm/guest-state-buffer.h
+++ b/arch/powerpc/include/asm/guest-state-buffer.h
@@ -28,6 +28,21 @@
  /* Process Table Info */
 #define KVMPPC_GSID_PROCESS_TABLE		0x0006
 
+/* Guest Management Heap Size */
+#define KVMPPC_GSID_L0_GUEST_HEAP		0x0800
+
+/* Guest Management Heap Max Size */
+#define KVMPPC_GSID_L0_GUEST_HEAP_MAX		0x0801
+
+/* Guest Pagetable Size */
+#define KVMPPC_GSID_L0_GUEST_PGTABLE_SIZE	0x0802
+
+/* Guest Pagetable Max Size */
+#define KVMPPC_GSID_L0_GUEST_PGTABLE_SIZE_MAX	0x0803
+
+/* Guest Pagetable Reclaim in bytes */
+#define KVMPPC_GSID_L0_GUEST_PGTABLE_RECLAIM	0x0804
+
 /* H_GUEST_RUN_VCPU input buffer Info */
 #define KVMPPC_GSID_RUN_INPUT			0x0C00
 /* H_GUEST_RUN_VCPU output buffer Info */
@@ -106,6 +121,11 @@
 #define KVMPPC_GSE_GUESTWIDE_COUNT \
 	(KVMPPC_GSE_GUESTWIDE_END - KVMPPC_GSE_GUESTWIDE_START + 1)
 
+#define KVMPPC_GSE_HOSTWIDE_START KVMPPC_GSID_L0_GUEST_HEAP
+#define KVMPPC_GSE_HOSTWIDE_END KVMPPC_GSID_L0_GUEST_PGTABLE_RECLAIM
+#define KVMPPC_GSE_HOSTWIDE_COUNT \
+	(KVMPPC_GSE_HOSTWIDE_END - KVMPPC_GSE_HOSTWIDE_START + 1)
+
 #define KVMPPC_GSE_META_START KVMPPC_GSID_RUN_INPUT
 #define KVMPPC_GSE_META_END KVMPPC_GSID_VPA
 #define KVMPPC_GSE_META_COUNT (KVMPPC_GSE_META_END - KVMPPC_GSE_META_START + 1)
@@ -130,7 +150,8 @@
 	(KVMPPC_GSE_INTR_REGS_END - KVMPPC_GSE_INTR_REGS_START + 1)
 
 #define KVMPPC_GSE_IDEN_COUNT                                 \
-	(KVMPPC_GSE_GUESTWIDE_COUNT + KVMPPC_GSE_META_COUNT + \
+	(KVMPPC_GSE_HOSTWIDE_COUNT + \
+	 KVMPPC_GSE_GUESTWIDE_COUNT + KVMPPC_GSE_META_COUNT + \
 	 KVMPPC_GSE_DW_REGS_COUNT + KVMPPC_GSE_W_REGS_COUNT + \
 	 KVMPPC_GSE_VSRS_COUNT + KVMPPC_GSE_INTR_REGS_COUNT)
 
@@ -139,10 +160,11 @@
  */
 enum {
 	KVMPPC_GS_CLASS_GUESTWIDE = 0x01,
-	KVMPPC_GS_CLASS_META = 0x02,
-	KVMPPC_GS_CLASS_DWORD_REG = 0x04,
-	KVMPPC_GS_CLASS_WORD_REG = 0x08,
-	KVMPPC_GS_CLASS_VECTOR = 0x10,
+	KVMPPC_GS_CLASS_HOSTWIDE = 0x02,
+	KVMPPC_GS_CLASS_META = 0x04,
+	KVMPPC_GS_CLASS_DWORD_REG = 0x08,
+	KVMPPC_GS_CLASS_WORD_REG = 0x10,
+	KVMPPC_GS_CLASS_VECTOR = 0x18,
 	KVMPPC_GS_CLASS_INTR = 0x20,
 };
 
@@ -164,6 +186,7 @@ enum {
  */
 enum {
 	KVMPPC_GS_FLAGS_WIDE = 0x01,
+	KVMPPC_GS_FLAGS_HOST_WIDE = 0x02,
 };
 
 /**
@@ -287,7 +310,7 @@ struct kvmppc_gs_msg_ops {
  * struct kvmppc_gs_msg - a guest state message
  * @bitmap: the guest state ids that should be included
  * @ops: modify message behavior for reading and writing to buffers
- * @flags: guest wide or thread wide
+ * @flags: host wide, guest wide or thread wide
  * @data: location where buffer data will be written to or from.
  *
  * A guest state message is allows flexibility in sending in receiving data
diff --git a/arch/powerpc/include/asm/hvcall.h b/arch/powerpc/include/asm/hvcall.h
index eeef13db2770..6df6dbbe1e7c 100644
--- a/arch/powerpc/include/asm/hvcall.h
+++ b/arch/powerpc/include/asm/hvcall.h
@@ -490,14 +490,15 @@
 #define H_RPTI_PAGE_ALL (-1UL)
 
 /* Flags for H_GUEST_{S,G}_STATE */
-#define H_GUEST_FLAGS_WIDE     (1UL<<(63-0))
+#define H_GUEST_FLAGS_WIDE     (1UL << (63 - 0))
+#define H_GUEST_FLAGS_HOST_WIDE	(1UL << (63 - 1))
 
 /* Flag values used for H_{S,G}SET_GUEST_CAPABILITIES */
-#define H_GUEST_CAP_COPY_MEM	(1UL<<(63-0))
-#define H_GUEST_CAP_POWER9	(1UL<<(63-1))
-#define H_GUEST_CAP_POWER10	(1UL<<(63-2))
-#define H_GUEST_CAP_POWER11	(1UL<<(63-3))
-#define H_GUEST_CAP_BITMAP2	(1UL<<(63-63))
+#define H_GUEST_CAP_COPY_MEM	(1UL << (63 - 0))
+#define H_GUEST_CAP_POWER9	(1UL << (63 - 1))
+#define H_GUEST_CAP_POWER10	(1UL << (63 - 2))
+#define H_GUEST_CAP_POWER11	(1UL << (63 - 3))
+#define H_GUEST_CAP_BITMAP2	(1UL << (63 - 63))
 
 /*
  * Defines for H_HTM - Macros for hardware trace macro (HTM) function.
diff --git a/arch/powerpc/include/asm/plpar_wrappers.h b/arch/powerpc/include/asm/plpar_wrappers.h
index 91be7b885944..f2b6cc4341bb 100644
--- a/arch/powerpc/include/asm/plpar_wrappers.h
+++ b/arch/powerpc/include/asm/plpar_wrappers.h
@@ -65,6 +65,14 @@ static inline long register_dtl(unsigned long cpu, unsigned long vpa)
 	return vpa_call(H_VPA_REG_DTL, cpu, vpa);
 }
 
+/*
+ * Invokes H_HTM hcall with parameters passed from htm_hcall_wrapper.
+ * flags: Set to hardwareTarget.
+ * target: Specifies target using node index, nodal chip index and core index.
+ * operation: action to perform, i.e. configure, start, stop, deconfigure or
+ * trace, based on the HTM type.
+ * param1, param2, param3: parameters for each action.
+ */
 static inline long htm_call(unsigned long flags, unsigned long target,
                unsigned long operation, unsigned long param1,
                unsigned long param2, unsigned long param3)
@@ -73,17 +81,17 @@ static inline long htm_call(unsigned long flags, unsigned long target,
                                  param1, param2, param3);
 }
 
-static inline long htm_get_dump_hardware(unsigned long nodeindex,
+static inline long htm_hcall_wrapper(unsigned long flags, unsigned long nodeindex,
                unsigned long nodalchipindex, unsigned long coreindexonchip,
-               unsigned long type, unsigned long addr, unsigned long size,
-               unsigned long offset)
+	       unsigned long type, unsigned long htm_op, unsigned long param1, unsigned long param2,
+	       unsigned long param3)
 {
-       return htm_call(H_HTM_FLAGS_HARDWARE_TARGET,
+	return htm_call(H_HTM_FLAGS_HARDWARE_TARGET | flags,
                        H_HTM_TARGET_NODE_INDEX(nodeindex) |
                        H_HTM_TARGET_NODAL_CHIP_INDEX(nodalchipindex) |
                        H_HTM_TARGET_CORE_INDEX_ON_CHIP(coreindexonchip),
-                       H_HTM_OP(H_HTM_OP_DUMP_DATA) | H_HTM_TYPE(type),
-                       addr, size, offset);
+		       H_HTM_OP(htm_op) | H_HTM_TYPE(type),
+		       param1, param2, param3);
 }
 
 extern void vpa_init(int cpu);
diff --git a/arch/powerpc/include/asm/preempt.h b/arch/powerpc/include/asm/preempt.h
new file mode 100644
index 000000000000..000e2b9681f3
--- /dev/null
+++ b/arch/powerpc/include/asm/preempt.h
@@ -0,0 +1,16 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __ASM_POWERPC_PREEMPT_H
+#define __ASM_POWERPC_PREEMPT_H
+
+#include <asm-generic/preempt.h>
+
+#if defined(CONFIG_PREEMPT_DYNAMIC)
+#include <linux/jump_label.h>
+DECLARE_STATIC_KEY_TRUE(sk_dynamic_irqentry_exit_cond_resched);
+#define need_irq_preemption() \
+	(static_branch_unlikely(&sk_dynamic_irqentry_exit_cond_resched))
+#else
+#define need_irq_preemption()   (IS_ENABLED(CONFIG_PREEMPTION))
+#endif
+
+#endif /* __ASM_POWERPC_PREEMPT_H */
diff --git a/arch/powerpc/include/asm/rtas.h b/arch/powerpc/include/asm/rtas.h
index 04406162fc5a..75fa0293c508 100644
--- a/arch/powerpc/include/asm/rtas.h
+++ b/arch/powerpc/include/asm/rtas.h
@@ -515,6 +515,10 @@ extern char rtas_data_buf[RTAS_DATA_BUF_SIZE];
 extern unsigned long rtas_rmo_buf;
 
 extern struct mutex rtas_ibm_get_vpd_lock;
+extern struct mutex rtas_ibm_get_indices_lock;
+extern struct mutex rtas_ibm_set_dynamic_indicator_lock;
+extern struct mutex rtas_ibm_get_dynamic_sensor_state_lock;
+extern struct mutex rtas_ibm_physical_attestation_lock;
 
 #define GLOBAL_INTERRUPT_QUEUE 9005
 
diff --git a/arch/powerpc/include/uapi/asm/papr-indices.h b/arch/powerpc/include/uapi/asm/papr-indices.h
new file mode 100644
index 000000000000..c2999d89d52a
--- /dev/null
+++ b/arch/powerpc/include/uapi/asm/papr-indices.h
@@ -0,0 +1,41 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+#ifndef _UAPI_PAPR_INDICES_H_
+#define _UAPI_PAPR_INDICES_H_
+
+#include <linux/types.h>
+#include <asm/ioctl.h>
+#include <asm/papr-miscdev.h>
+
+#define LOC_CODE_SIZE			80
+#define RTAS_GET_INDICES_BUF_SIZE	4096 /* 4K; SZ_4K is not a UAPI macro */
+
+struct papr_indices_io_block {
+	union {
+		struct {
+			__u8 is_sensor; /* 0 for indicator and 1 for sensor */
+			__u32 indice_type;
+		} indices;
+		struct {
+			__u32 token; /* Sensor or indicator token */
+			__u32 state; /* get / set state */
+			/*
+			 * PAPR+ 12.3.2.4 Converged Location Code Rules - Length
+			 * Restrictions. 79 characters plus null.
+			 */
+			char location_code_str[LOC_CODE_SIZE]; /* location code */
+		} dynamic_param;
+	};
+};
+
+/*
+ * ioctls for /dev/papr-indices.
+ * PAPR_INDICES_IOC_GET: Returns a get-indices handle fd to read data
+ * PAPR_DYNAMIC_SENSOR_IOC_GET: Gets the state of the input sensor
+ * PAPR_DYNAMIC_INDICATOR_IOC_SET: Sets the new state for the input indicator
+ */
+#define PAPR_INDICES_IOC_GET		_IOW(PAPR_MISCDEV_IOC_ID, 3, struct papr_indices_io_block)
+#define PAPR_DYNAMIC_SENSOR_IOC_GET	_IOWR(PAPR_MISCDEV_IOC_ID, 4, struct papr_indices_io_block)
+#define PAPR_DYNAMIC_INDICATOR_IOC_SET	_IOW(PAPR_MISCDEV_IOC_ID, 5, struct papr_indices_io_block)
+
+
+#endif /* _UAPI_PAPR_INDICES_H_ */
diff --git a/arch/powerpc/include/uapi/asm/papr-physical-attestation.h b/arch/powerpc/include/uapi/asm/papr-physical-attestation.h
new file mode 100644
index 000000000000..ea746837bb9a
--- /dev/null
+++ b/arch/powerpc/include/uapi/asm/papr-physical-attestation.h
@@ -0,0 +1,31 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+#ifndef _UAPI_PAPR_PHYSICAL_ATTESTATION_H_
+#define _UAPI_PAPR_PHYSICAL_ATTESTATION_H_
+
+#include <linux/types.h>
+#include <asm/ioctl.h>
+#include <asm/papr-miscdev.h>
+
+#define PAPR_PHYATTEST_MAX_INPUT 4084 /* Max 4K buffer: 4K-12 */
+
+/*
+ * Defined in PAPR 2.13+ 21.6 Attestation Command Structures.
+ * User space passes this struct; its maximum size is 4K.
+ */
+struct papr_phy_attest_io_block {
+	__u8 version;
+	__u8 command;
+	__u8 TCG_major_ver;
+	__u8 TCG_minor_ver;
+	__be32 length;
+	__be32 correlator;
+	__u8 payload[PAPR_PHYATTEST_MAX_INPUT];
+};
+
+/*
+ * ioctl for /dev/papr-physical-attestation. Returns an attestation
+ * command fd handle.
+ */
+#define PAPR_PHY_ATTEST_IOC_HANDLE _IOW(PAPR_MISCDEV_IOC_ID, 8, struct papr_phy_attest_io_block)
+
+#endif /* _UAPI_PAPR_PHYSICAL_ATTESTATION_H_ */
diff --git a/arch/powerpc/include/uapi/asm/papr-platform-dump.h b/arch/powerpc/include/uapi/asm/papr-platform-dump.h
new file mode 100644
index 000000000000..8a1c060e89a9
--- /dev/null
+++ b/arch/powerpc/include/uapi/asm/papr-platform-dump.h
@@ -0,0 +1,16 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+#ifndef _UAPI_PAPR_PLATFORM_DUMP_H_
+#define _UAPI_PAPR_PLATFORM_DUMP_H_
+
+#include <linux/types.h>
+#include <asm/ioctl.h>
+#include <asm/papr-miscdev.h>
+
+/*
+ * ioctl for /dev/papr-platform-dump. Returns a platform-dump handle fd
+ * corresponding to dump tag.
+ */
+#define PAPR_PLATFORM_DUMP_IOC_CREATE_HANDLE _IOW(PAPR_MISCDEV_IOC_ID, 6, __u64)
+#define PAPR_PLATFORM_DUMP_IOC_INVALIDATE    _IOW(PAPR_MISCDEV_IOC_ID, 7, __u64)
+
+#endif /* _UAPI_PAPR_PLATFORM_DUMP_H_ */
diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile
index 6ac621155ec3..9d1ab3971694 100644
--- a/arch/powerpc/kernel/Makefile
+++ b/arch/powerpc/kernel/Makefile
@@ -160,9 +160,7 @@ endif
 
 obj64-$(CONFIG_PPC_TRANSACTIONAL_MEM)	+= tm.o
 
-ifneq ($(CONFIG_XMON)$(CONFIG_KEXEC_CORE)(CONFIG_PPC_BOOK3S),)
 obj-y				+= ppc_save_regs.o
-endif
 
 obj-$(CONFIG_EPAPR_PARAVIRT)	+= epapr_paravirt.o epapr_hcalls.o
 obj-$(CONFIG_KVM_GUEST)		+= kvm.o kvm_emul.o
diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c
index df16c7f547ab..8ca49e40c473 100644
--- a/arch/powerpc/kernel/fadump.c
+++ b/arch/powerpc/kernel/fadump.c
@@ -290,10 +290,8 @@ static void __init fadump_show_config(void)
 	if (!fw_dump.fadump_supported)
 		return;
 
-	pr_debug("Fadump enabled    : %s\n",
-				(fw_dump.fadump_enabled ? "yes" : "no"));
-	pr_debug("Dump Active       : %s\n",
-				(fw_dump.dump_active ? "yes" : "no"));
+	pr_debug("Fadump enabled    : %s\n", str_yes_no(fw_dump.fadump_enabled));
+	pr_debug("Dump Active       : %s\n", str_yes_no(fw_dump.dump_active));
 	pr_debug("Dump section sizes:\n");
 	pr_debug("    CPU state data size: %lx\n", fw_dump.cpu_state_data_size);
 	pr_debug("    HPTE region size   : %lx\n", fw_dump.hpte_region_size);
diff --git a/arch/powerpc/kernel/interrupt.c b/arch/powerpc/kernel/interrupt.c
index 8f4acc55407b..e0c681d0b076 100644
--- a/arch/powerpc/kernel/interrupt.c
+++ b/arch/powerpc/kernel/interrupt.c
@@ -25,6 +25,10 @@
 unsigned long global_dbcr0[NR_CPUS];
 #endif
 
+#if defined(CONFIG_PREEMPT_DYNAMIC)
+DEFINE_STATIC_KEY_TRUE(sk_dynamic_irqentry_exit_cond_resched);
+#endif
+
 #ifdef CONFIG_PPC_BOOK3S_64
 DEFINE_STATIC_KEY_FALSE(interrupt_exit_not_reentrant);
 static inline bool exit_must_hard_disable(void)
@@ -396,7 +400,7 @@ notrace unsigned long interrupt_exit_kernel_prepare(struct pt_regs *regs)
 		/* Returning to a kernel context with local irqs enabled. */
 		WARN_ON_ONCE(!(regs->msr & MSR_EE));
 again:
-		if (IS_ENABLED(CONFIG_PREEMPTION)) {
+		if (need_irq_preemption()) {
 			/* Return to preemptible kernel context */
 			if (unlikely(read_thread_flags() & _TIF_NEED_RESCHED)) {
 				if (preempt_count() == 0)
diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
index 0ebae6e4c19d..244eb4857e7f 100644
--- a/arch/powerpc/kernel/iommu.c
+++ b/arch/powerpc/kernel/iommu.c
@@ -16,6 +16,7 @@
 #include <linux/mm.h>
 #include <linux/spinlock.h>
 #include <linux/string.h>
+#include <linux/string_choices.h>
 #include <linux/dma-mapping.h>
 #include <linux/bitmap.h>
 #include <linux/iommu-helper.h>
@@ -769,8 +770,8 @@ struct iommu_table *iommu_init_table(struct iommu_table *tbl, int nid,
 	iommu_table_clear(tbl);
 
 	if (!welcomed) {
-		printk(KERN_INFO "IOMMU table initialized, virtual merging %s\n",
-		       novmerge ? "disabled" : "enabled");
+		pr_info("IOMMU table initialized, virtual merging %s\n",
+			str_disabled_enabled(novmerge));
 		welcomed = 1;
 	}
 
diff --git a/arch/powerpc/kernel/module_64.c b/arch/powerpc/kernel/module_64.c
index 34a5aec4908f..126bf3b06ab7 100644
--- a/arch/powerpc/kernel/module_64.c
+++ b/arch/powerpc/kernel/module_64.c
@@ -258,10 +258,6 @@ static unsigned long get_stubs_size(const Elf64_Ehdr *hdr,
 			break;
 		}
 	}
-	if (i == hdr->e_shnum) {
-		pr_err("%s: doesn't contain __patchable_function_entries.\n", me->name);
-		return -ENOEXEC;
-	}
 #endif
 
 	pr_debug("Looks like a total of %lu stubs, max\n", relocs);
diff --git a/arch/powerpc/kernel/proc_powerpc.c b/arch/powerpc/kernel/proc_powerpc.c
index 3816a2bf2b84..d083b4517065 100644
--- a/arch/powerpc/kernel/proc_powerpc.c
+++ b/arch/powerpc/kernel/proc_powerpc.c
@@ -9,6 +9,7 @@
 #include <linux/proc_fs.h>
 #include <linux/kernel.h>
 #include <linux/of.h>
+#include <linux/string.h>
 
 #include <asm/machdep.h>
 #include <asm/vdso_datapage.h>
@@ -56,7 +57,7 @@ static int __init proc_ppc64_init(void)
 {
 	struct proc_dir_entry *pde;
 
-	strcpy((char *)systemcfg->eye_catcher, "SYSTEMCFG:PPC64");
+	strscpy(systemcfg->eye_catcher, "SYSTEMCFG:PPC64");
 	systemcfg->version.major = SYSTEMCFG_MAJOR;
 	systemcfg->version.minor = SYSTEMCFG_MINOR;
 	systemcfg->processor = mfspr(SPRN_PVR);
diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c
index ef91f71e07c4..855e09886503 100644
--- a/arch/powerpc/kernel/process.c
+++ b/arch/powerpc/kernel/process.c
@@ -1000,7 +1000,7 @@ static inline void tm_reclaim_task(struct task_struct *tsk)
 
 	WARN_ON(tm_suspend_disabled);
 
-	TM_DEBUG("--- tm_reclaim on pid %d (NIP=%lx, "
+	TM_DEBUG("---- tm_reclaim on pid %d (NIP=%lx, "
 		 "ccr=%lx, msr=%lx, trap=%lx)\n",
 		 tsk->pid, thr->regs->nip,
 		 thr->regs->ccr, thr->regs->msr,
@@ -1008,7 +1008,7 @@ static inline void tm_reclaim_task(struct task_struct *tsk)
 
 	tm_reclaim_thread(thr, TM_CAUSE_RESCHED);
 
-	TM_DEBUG("--- tm_reclaim on pid %d complete\n",
+	TM_DEBUG("---- tm_reclaim on pid %d complete\n",
 		 tsk->pid);
 
 out_and_saveregs:
@@ -2367,14 +2367,14 @@ void __no_sanitize_address show_stack(struct task_struct *tsk,
 				(sp + STACK_INT_FRAME_REGS);
 
 			lr = regs->link;
-			printk("%s--- interrupt: %lx at %pS\n",
+			printk("%s---- interrupt: %lx at %pS\n",
 			       loglvl, regs->trap, (void *)regs->nip);
 
 			// Detect the case of an empty pt_regs at the very base
 			// of the stack and suppress showing it in full.
 			if (!empty_user_regs(regs, tsk)) {
 				__show_regs(regs);
-				printk("%s--- interrupt: %lx\n", loglvl, regs->trap);
+				printk("%s---- interrupt: %lx\n", loglvl, regs->trap);
 			}
 
 			firstframe = 1;
diff --git a/arch/powerpc/kernel/rtas.c b/arch/powerpc/kernel/rtas.c
index d7a738f1858d..e61245c4468e 100644
--- a/arch/powerpc/kernel/rtas.c
+++ b/arch/powerpc/kernel/rtas.c
@@ -92,12 +92,12 @@ struct rtas_function {
  * Per-function locks for sequence-based RTAS functions.
  */
 static DEFINE_MUTEX(rtas_ibm_activate_firmware_lock);
-static DEFINE_MUTEX(rtas_ibm_get_dynamic_sensor_state_lock);
-static DEFINE_MUTEX(rtas_ibm_get_indices_lock);
 static DEFINE_MUTEX(rtas_ibm_lpar_perftools_lock);
-static DEFINE_MUTEX(rtas_ibm_physical_attestation_lock);
-static DEFINE_MUTEX(rtas_ibm_set_dynamic_indicator_lock);
+DEFINE_MUTEX(rtas_ibm_physical_attestation_lock);
 DEFINE_MUTEX(rtas_ibm_get_vpd_lock);
+DEFINE_MUTEX(rtas_ibm_get_indices_lock);
+DEFINE_MUTEX(rtas_ibm_set_dynamic_indicator_lock);
+DEFINE_MUTEX(rtas_ibm_get_dynamic_sensor_state_lock);
 
 static struct rtas_function rtas_function_table[] __ro_after_init = {
 	[RTAS_FNIDX__CHECK_EXCEPTION] = {
diff --git a/arch/powerpc/kernel/trace/ftrace_entry.S b/arch/powerpc/kernel/trace/ftrace_entry.S
index 2c1b24100eca..3565c67fc638 100644
--- a/arch/powerpc/kernel/trace/ftrace_entry.S
+++ b/arch/powerpc/kernel/trace/ftrace_entry.S
@@ -212,10 +212,10 @@
 	bne-	1f
 
 	mr	r3, r15
+1:	mtlr	r3
 	.if \allregs == 0
 	REST_GPR(15, r1)
 	.endif
-1:	mtlr	r3
 #endif
 
 	/* Restore gprs */
diff --git a/arch/powerpc/kexec/crash.c b/arch/powerpc/kexec/crash.c
index 9ac3266e4965..a325c1c02f96 100644
--- a/arch/powerpc/kexec/crash.c
+++ b/arch/powerpc/kexec/crash.c
@@ -359,7 +359,10 @@ void default_machine_crash_shutdown(struct pt_regs *regs)
 	if (TRAP(regs) == INTERRUPT_SYSTEM_RESET)
 		is_via_system_reset = 1;
 
-	crash_smp_send_stop();
+	if (IS_ENABLED(CONFIG_SMP))
+		crash_smp_send_stop();
+	else
+		crash_kexec_prepare();
 
 	crash_save_cpu(regs, crashing_cpu);
 
diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig
index dbfdc126bf14..2f2702c867f7 100644
--- a/arch/powerpc/kvm/Kconfig
+++ b/arch/powerpc/kvm/Kconfig
@@ -83,6 +83,7 @@ config KVM_BOOK3S_64_HV
 	depends on KVM_BOOK3S_64 && PPC_POWERNV
 	select KVM_BOOK3S_HV_POSSIBLE
 	select KVM_GENERIC_MMU_NOTIFIER
+	select KVM_BOOK3S_HV_PMU
 	select CMA
 	help
 	  Support running unmodified book3s_64 guest kernels in
@@ -171,6 +172,18 @@ config KVM_BOOK3S_HV_NESTED_PMU_WORKAROUND
 	  those buggy L1s which saves the L2 state, at the cost of performance
 	  in all nested-capable guest entry/exit.
 
+config KVM_BOOK3S_HV_PMU
+	tristate "Hypervisor Perf events for KVM Book3s-HV"
+	depends on KVM_BOOK3S_64_HV
+	help
+	  Enable Book3s-HV Hypervisor Perf events PMU named 'kvm-hv'. These
+	  Perf events give an overview of overall hypervisor performance
+	  rather than that of a specific guest. Currently the PMU reports
+	  L0-Hypervisor stats on a kvm-hv enabled PSeries LPAR like:
+	  * Total/Used Guest-Heap
+	  * Total/Used Guest Page-table Memory
+	  * Total amount of Guest Page-table Memory reclaimed
+
 config KVM_BOOKE_HV
 	bool
 
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 19f4d298dd17..7667563fb9ff 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -6541,10 +6541,6 @@ static struct kvmppc_ops kvm_ops_hv = {
 	.fast_vcpu_kick = kvmppc_fast_vcpu_kick_hv,
 	.arch_vm_ioctl  = kvm_arch_vm_ioctl_hv,
 	.hcall_implemented = kvmppc_hcall_impl_hv,
-#ifdef CONFIG_KVM_XICS
-	.irq_bypass_add_producer = kvmppc_irq_bypass_add_producer_hv,
-	.irq_bypass_del_producer = kvmppc_irq_bypass_del_producer_hv,
-#endif
 	.configure_mmu = kvmhv_configure_mmu,
 	.get_rmmu_info = kvmhv_get_rmmu_info,
 	.set_smt_mode = kvmhv_set_smt_mode,
@@ -6662,6 +6658,22 @@ static int kvmppc_book3s_init_hv(void)
 		return r;
 	}
 
+#if defined(CONFIG_KVM_XICS)
+	/*
+	 * IRQ bypass is supported only for interrupts whose EOI operations are
+	 * handled via OPAL calls. Therefore, register IRQ bypass handlers
+	 * exclusively for PowerNV KVM when booted with 'xive=off', indicating
+	 * the use of the emulated XICS interrupt controller.
+	 */
+	if (!kvmhv_on_pseries()) {
+		pr_info("KVM-HV: Enabling IRQ bypass\n");
+		kvm_ops_hv.irq_bypass_add_producer =
+			kvmppc_irq_bypass_add_producer_hv;
+		kvm_ops_hv.irq_bypass_del_producer =
+			kvmppc_irq_bypass_del_producer_hv;
+	}
+#endif
+
 	kvm_ops_hv.owner = THIS_MODULE;
 	kvmppc_hv_ops = &kvm_ops_hv;
 
diff --git a/arch/powerpc/kvm/book3s_hv_nestedv2.c b/arch/powerpc/kvm/book3s_hv_nestedv2.c
index e5c7ce1fb761..87691cf86cae 100644
--- a/arch/powerpc/kvm/book3s_hv_nestedv2.c
+++ b/arch/powerpc/kvm/book3s_hv_nestedv2.c
@@ -123,6 +123,12 @@ static size_t gs_msg_ops_vcpu_get_size(struct kvmppc_gs_msg *gsm)
 		case KVMPPC_GSID_PROCESS_TABLE:
 		case KVMPPC_GSID_RUN_INPUT:
 		case KVMPPC_GSID_RUN_OUTPUT:
+		  /* Host wide counters */
+		case KVMPPC_GSID_L0_GUEST_HEAP:
+		case KVMPPC_GSID_L0_GUEST_HEAP_MAX:
+		case KVMPPC_GSID_L0_GUEST_PGTABLE_SIZE:
+		case KVMPPC_GSID_L0_GUEST_PGTABLE_SIZE_MAX:
+		case KVMPPC_GSID_L0_GUEST_PGTABLE_RECLAIM:
 			break;
 		default:
 			size += kvmppc_gse_total_size(kvmppc_gsid_size(iden));
diff --git a/arch/powerpc/kvm/guest-state-buffer.c b/arch/powerpc/kvm/guest-state-buffer.c
index b80dbc58621f..871cf60ddeb6 100644
--- a/arch/powerpc/kvm/guest-state-buffer.c
+++ b/arch/powerpc/kvm/guest-state-buffer.c
@@ -92,6 +92,10 @@ static int kvmppc_gsid_class(u16 iden)
 	    (iden <= KVMPPC_GSE_GUESTWIDE_END))
 		return KVMPPC_GS_CLASS_GUESTWIDE;
 
+	if ((iden >= KVMPPC_GSE_HOSTWIDE_START) &&
+	    (iden <= KVMPPC_GSE_HOSTWIDE_END))
+		return KVMPPC_GS_CLASS_HOSTWIDE;
+
 	if ((iden >= KVMPPC_GSE_META_START) && (iden <= KVMPPC_GSE_META_END))
 		return KVMPPC_GS_CLASS_META;
 
@@ -118,6 +122,21 @@ static int kvmppc_gsid_type(u16 iden)
 	int type = -1;
 
 	switch (kvmppc_gsid_class(iden)) {
+	case KVMPPC_GS_CLASS_HOSTWIDE:
+		switch (iden) {
+		case KVMPPC_GSID_L0_GUEST_HEAP:
+			fallthrough;
+		case KVMPPC_GSID_L0_GUEST_HEAP_MAX:
+			fallthrough;
+		case KVMPPC_GSID_L0_GUEST_PGTABLE_SIZE:
+			fallthrough;
+		case KVMPPC_GSID_L0_GUEST_PGTABLE_SIZE_MAX:
+			fallthrough;
+		case KVMPPC_GSID_L0_GUEST_PGTABLE_RECLAIM:
+			type = KVMPPC_GSE_BE64;
+			break;
+		}
+		break;
 	case KVMPPC_GS_CLASS_GUESTWIDE:
 		switch (iden) {
 		case KVMPPC_GSID_HOST_STATE_SIZE:
@@ -187,6 +206,9 @@ unsigned long kvmppc_gsid_flags(u16 iden)
 	case KVMPPC_GS_CLASS_GUESTWIDE:
 		flags = KVMPPC_GS_FLAGS_WIDE;
 		break;
+	case KVMPPC_GS_CLASS_HOSTWIDE:
+		flags = KVMPPC_GS_FLAGS_HOST_WIDE;
+		break;
 	case KVMPPC_GS_CLASS_META:
 	case KVMPPC_GS_CLASS_DWORD_REG:
 	case KVMPPC_GS_CLASS_WORD_REG:
@@ -310,6 +332,13 @@ static inline int kvmppc_gse_flatten_iden(u16 iden)
 
 	bit += KVMPPC_GSE_GUESTWIDE_COUNT;
 
+	if (class == KVMPPC_GS_CLASS_HOSTWIDE) {
+		bit += iden - KVMPPC_GSE_HOSTWIDE_START;
+		return bit;
+	}
+
+	bit += KVMPPC_GSE_HOSTWIDE_COUNT;
+
 	if (class == KVMPPC_GS_CLASS_META) {
 		bit += iden - KVMPPC_GSE_META_START;
 		return bit;
@@ -356,6 +385,12 @@ static inline u16 kvmppc_gse_unflatten_iden(int bit)
 	}
 	bit -= KVMPPC_GSE_GUESTWIDE_COUNT;
 
+	if (bit < KVMPPC_GSE_HOSTWIDE_COUNT) {
+		iden = KVMPPC_GSE_HOSTWIDE_START + bit;
+		return iden;
+	}
+	bit -= KVMPPC_GSE_HOSTWIDE_COUNT;
+
 	if (bit < KVMPPC_GSE_META_COUNT) {
 		iden = KVMPPC_GSE_META_START + bit;
 		return iden;
@@ -588,6 +623,8 @@ int kvmppc_gsb_send(struct kvmppc_gs_buff *gsb, unsigned long flags)
 
 	if (flags & KVMPPC_GS_FLAGS_WIDE)
 		hflags |= H_GUEST_FLAGS_WIDE;
+	if (flags & KVMPPC_GS_FLAGS_HOST_WIDE)
+		hflags |= H_GUEST_FLAGS_HOST_WIDE;
 
 	rc = plpar_guest_set_state(hflags, gsb->guest_id, gsb->vcpu_id,
 				   __pa(gsb->hdr), gsb->capacity, &i);
@@ -613,6 +650,8 @@ int kvmppc_gsb_recv(struct kvmppc_gs_buff *gsb, unsigned long flags)
 
 	if (flags & KVMPPC_GS_FLAGS_WIDE)
 		hflags |= H_GUEST_FLAGS_WIDE;
+	if (flags & KVMPPC_GS_FLAGS_HOST_WIDE)
+		hflags |= H_GUEST_FLAGS_HOST_WIDE;
 
 	rc = plpar_guest_get_state(hflags, gsb->guest_id, gsb->vcpu_id,
 				   __pa(gsb->hdr), gsb->capacity, &i);
diff --git a/arch/powerpc/kvm/test-guest-state-buffer.c b/arch/powerpc/kvm/test-guest-state-buffer.c
index bfd225329a18..5ccca306997a 100644
--- a/arch/powerpc/kvm/test-guest-state-buffer.c
+++ b/arch/powerpc/kvm/test-guest-state-buffer.c
@@ -5,6 +5,7 @@
 #include <kunit/test.h>
 
 #include <asm/guest-state-buffer.h>
+#include <asm/kvm_ppc.h>
 
 static void test_creating_buffer(struct kunit *test)
 {
@@ -141,6 +142,16 @@ static void test_gs_bitmap(struct kunit *test)
 		i++;
 	}
 
+	for (u16 iden = KVMPPC_GSID_L0_GUEST_HEAP;
+	     iden <= KVMPPC_GSID_L0_GUEST_PGTABLE_RECLAIM; iden++) {
+		kvmppc_gsbm_set(&gsbm, iden);
+		kvmppc_gsbm_set(&gsbm1, iden);
+		KUNIT_EXPECT_TRUE(test, kvmppc_gsbm_test(&gsbm, iden));
+		kvmppc_gsbm_clear(&gsbm, iden);
+		KUNIT_EXPECT_FALSE(test, kvmppc_gsbm_test(&gsbm, iden));
+		i++;
+	}
+
 	for (u16 iden = KVMPPC_GSID_RUN_INPUT; iden <= KVMPPC_GSID_VPA;
 	     iden++) {
 		kvmppc_gsbm_set(&gsbm, iden);
@@ -309,12 +320,215 @@ static void test_gs_msg(struct kunit *test)
 	kvmppc_gsm_free(gsm);
 }
 
+/* Test data struct for hostwide/L0 counters */
+struct kvmppc_gs_msg_test_hostwide_data {
+	u64 guest_heap;
+	u64 guest_heap_max;
+	u64 guest_pgtable_size;
+	u64 guest_pgtable_size_max;
+	u64 guest_pgtable_reclaim;
+};
+
+static size_t test_hostwide_get_size(struct kvmppc_gs_msg *gsm)
+
+{
+	size_t size = 0;
+	u16 ids[] = {
+		KVMPPC_GSID_L0_GUEST_HEAP,
+		KVMPPC_GSID_L0_GUEST_HEAP_MAX,
+		KVMPPC_GSID_L0_GUEST_PGTABLE_SIZE,
+		KVMPPC_GSID_L0_GUEST_PGTABLE_SIZE_MAX,
+		KVMPPC_GSID_L0_GUEST_PGTABLE_RECLAIM
+	};
+
+	for (int i = 0; i < ARRAY_SIZE(ids); i++)
+		size += kvmppc_gse_total_size(kvmppc_gsid_size(ids[i]));
+	return size;
+}
+
+static int test_hostwide_fill_info(struct kvmppc_gs_buff *gsb,
+				   struct kvmppc_gs_msg *gsm)
+{
+	struct kvmppc_gs_msg_test_hostwide_data *data = gsm->data;
+
+	if (kvmppc_gsm_includes(gsm, KVMPPC_GSID_L0_GUEST_HEAP))
+		kvmppc_gse_put_u64(gsb, KVMPPC_GSID_L0_GUEST_HEAP,
+				   data->guest_heap);
+	if (kvmppc_gsm_includes(gsm, KVMPPC_GSID_L0_GUEST_HEAP_MAX))
+		kvmppc_gse_put_u64(gsb, KVMPPC_GSID_L0_GUEST_HEAP_MAX,
+				   data->guest_heap_max);
+	if (kvmppc_gsm_includes(gsm, KVMPPC_GSID_L0_GUEST_PGTABLE_SIZE))
+		kvmppc_gse_put_u64(gsb, KVMPPC_GSID_L0_GUEST_PGTABLE_SIZE,
+				   data->guest_pgtable_size);
+	if (kvmppc_gsm_includes(gsm, KVMPPC_GSID_L0_GUEST_PGTABLE_SIZE_MAX))
+		kvmppc_gse_put_u64(gsb, KVMPPC_GSID_L0_GUEST_PGTABLE_SIZE_MAX,
+				   data->guest_pgtable_size_max);
+	if (kvmppc_gsm_includes(gsm, KVMPPC_GSID_L0_GUEST_PGTABLE_RECLAIM))
+		kvmppc_gse_put_u64(gsb, KVMPPC_GSID_L0_GUEST_PGTABLE_RECLAIM,
+				   data->guest_pgtable_reclaim);
+
+	return 0;
+}
+
+static int test_hostwide_refresh_info(struct kvmppc_gs_msg *gsm,
+				      struct kvmppc_gs_buff *gsb)
+{
+	struct kvmppc_gs_parser gsp = { 0 };
+	struct kvmppc_gs_msg_test_hostwide_data *data = gsm->data;
+	struct kvmppc_gs_elem *gse;
+	int rc;
+
+	rc = kvmppc_gse_parse(&gsp, gsb);
+	if (rc < 0)
+		return rc;
+
+	gse = kvmppc_gsp_lookup(&gsp, KVMPPC_GSID_L0_GUEST_HEAP);
+	if (gse)
+		data->guest_heap = kvmppc_gse_get_u64(gse);
+
+	gse = kvmppc_gsp_lookup(&gsp, KVMPPC_GSID_L0_GUEST_HEAP_MAX);
+	if (gse)
+		data->guest_heap_max = kvmppc_gse_get_u64(gse);
+
+	gse = kvmppc_gsp_lookup(&gsp, KVMPPC_GSID_L0_GUEST_PGTABLE_SIZE);
+	if (gse)
+		data->guest_pgtable_size = kvmppc_gse_get_u64(gse);
+
+	gse = kvmppc_gsp_lookup(&gsp, KVMPPC_GSID_L0_GUEST_PGTABLE_SIZE_MAX);
+	if (gse)
+		data->guest_pgtable_size_max = kvmppc_gse_get_u64(gse);
+
+	gse = kvmppc_gsp_lookup(&gsp, KVMPPC_GSID_L0_GUEST_PGTABLE_RECLAIM);
+	if (gse)
+		data->guest_pgtable_reclaim = kvmppc_gse_get_u64(gse);
+
+	return 0;
+}
+
+static struct kvmppc_gs_msg_ops gs_msg_test_hostwide_ops = {
+	.get_size = test_hostwide_get_size,
+	.fill_info = test_hostwide_fill_info,
+	.refresh_info = test_hostwide_refresh_info,
+};
+
+/* Round-trip the hostwide counters through a guest state message and buffer */
+static void test_gs_hostwide_msg(struct kunit *test)
+{
+	struct kvmppc_gs_msg_test_hostwide_data test_data = {
+		.guest_heap = 0xdeadbeef,
+		.guest_heap_max = ~0ULL,
+		.guest_pgtable_size = 0xff,
+		.guest_pgtable_size_max = 0xffffff,
+		.guest_pgtable_reclaim = 0xdeadbeef,
+	};
+	struct kvmppc_gs_msg *gsm;
+	struct kvmppc_gs_buff *gsb;
+
+	gsm = kvmppc_gsm_new(&gs_msg_test_hostwide_ops, &test_data, GSM_SEND,
+			     GFP_KERNEL);
+	KUNIT_ASSERT_NOT_ERR_OR_NULL(test, gsm);
+
+	gsb = kvmppc_gsb_new(kvmppc_gsm_size(gsm), 0, 0, GFP_KERNEL);
+	KUNIT_ASSERT_NOT_ERR_OR_NULL(test, gsb);
+
+	kvmppc_gsm_include(gsm, KVMPPC_GSID_L0_GUEST_HEAP);
+	kvmppc_gsm_include(gsm, KVMPPC_GSID_L0_GUEST_HEAP_MAX);
+	kvmppc_gsm_include(gsm, KVMPPC_GSID_L0_GUEST_PGTABLE_SIZE);
+	kvmppc_gsm_include(gsm, KVMPPC_GSID_L0_GUEST_PGTABLE_SIZE_MAX);
+	kvmppc_gsm_include(gsm, KVMPPC_GSID_L0_GUEST_PGTABLE_RECLAIM);
+	/* Fill gsb from test_data, wipe test_data, then read it back from gsb */
+	kvmppc_gsm_fill_info(gsm, gsb);
+	memset(&test_data, 0, sizeof(test_data));
+	kvmppc_gsm_refresh_info(gsm, gsb);
+	KUNIT_EXPECT_EQ(test, test_data.guest_heap, 0xdeadbeef);
+	KUNIT_EXPECT_EQ(test, test_data.guest_heap_max, ~0ULL);
+	KUNIT_EXPECT_EQ(test, test_data.guest_pgtable_size, 0xff);
+	KUNIT_EXPECT_EQ(test, test_data.guest_pgtable_size_max, 0xffffff);
+	KUNIT_EXPECT_EQ(test, test_data.guest_pgtable_reclaim, 0xdeadbeef);
+
+	kvmppc_gsm_free(gsm);
+	kvmppc_gsb_free(gsb);
+}
+
+/* Test that H_GUEST_GET_STATE works for the hostwide counters */
+static void test_gs_hostwide_counters(struct kunit *test)
+{
+	/* Zeroed because test_hostwide_fill_info() reads these fields */
+	struct kvmppc_gs_msg_test_hostwide_data test_data = { 0 };
+	struct kvmppc_gs_parser gsp = { 0 };
+	struct kvmppc_gs_msg *gsm;
+	struct kvmppc_gs_buff *gsb;
+	struct kvmppc_gs_elem *gse;
+	int rc;
+
+	if (!kvmhv_on_pseries())
+		kunit_skip(test, "This test needs a kvm-hv guest");
+
+	gsm = kvmppc_gsm_new(&gs_msg_test_hostwide_ops, &test_data, GSM_SEND,
+			     GFP_KERNEL);
+	KUNIT_ASSERT_NOT_ERR_OR_NULL(test, gsm);
+
+	gsb = kvmppc_gsb_new(kvmppc_gsm_size(gsm), 0, 0, GFP_KERNEL);
+	KUNIT_ASSERT_NOT_ERR_OR_NULL(test, gsb);
+
+	kvmppc_gsm_include(gsm, KVMPPC_GSID_L0_GUEST_HEAP);
+
+	kvmppc_gsm_include(gsm, KVMPPC_GSID_L0_GUEST_HEAP_MAX);
+
+	kvmppc_gsm_include(gsm, KVMPPC_GSID_L0_GUEST_PGTABLE_SIZE);
+
+	kvmppc_gsm_include(gsm, KVMPPC_GSID_L0_GUEST_PGTABLE_SIZE_MAX);
+
+	kvmppc_gsm_include(gsm, KVMPPC_GSID_L0_GUEST_PGTABLE_RECLAIM);
+
+	kvmppc_gsm_fill_info(gsm, gsb);
+
+	/* With HOST_WIDE flags guestid and vcpuid will be ignored */
+	rc = kvmppc_gsb_recv(gsb, KVMPPC_GS_FLAGS_HOST_WIDE);
+	KUNIT_ASSERT_EQ(test, rc, 0);
+
+	/* The received guest state buffer must parse successfully */
+	rc = kvmppc_gse_parse(&gsp, gsb);
+	KUNIT_ASSERT_EQ(test, rc, 0);
+
+	/* Look up each counter in the parsed buffer and log its value */
+	gse = kvmppc_gsp_lookup(&gsp, KVMPPC_GSID_L0_GUEST_HEAP);
+	KUNIT_ASSERT_NOT_NULL_MSG(test, gse, "L0 Heap counter missing");
+	kunit_info(test, "Guest Heap Size=%llu bytes",
+		   kvmppc_gse_get_u64(gse));
+
+	gse = kvmppc_gsp_lookup(&gsp, KVMPPC_GSID_L0_GUEST_HEAP_MAX);
+	KUNIT_ASSERT_NOT_NULL_MSG(test, gse, "L0 Heap counter max missing");
+	kunit_info(test, "Guest Heap Size Max=%llu bytes",
+		   kvmppc_gse_get_u64(gse));
+
+	gse = kvmppc_gsp_lookup(&gsp, KVMPPC_GSID_L0_GUEST_PGTABLE_SIZE);
+	KUNIT_ASSERT_NOT_NULL_MSG(test, gse, "L0 page-table size missing");
+	kunit_info(test, "Guest Page-table Size=%llu bytes",
+		   kvmppc_gse_get_u64(gse));
+
+	gse = kvmppc_gsp_lookup(&gsp, KVMPPC_GSID_L0_GUEST_PGTABLE_SIZE_MAX);
+	KUNIT_ASSERT_NOT_NULL_MSG(test, gse, "L0 page-table size-max missing");
+	kunit_info(test, "Guest Page-table Size Max=%llu bytes",
+		   kvmppc_gse_get_u64(gse));
+
+	gse = kvmppc_gsp_lookup(&gsp, KVMPPC_GSID_L0_GUEST_PGTABLE_RECLAIM);
+	KUNIT_ASSERT_NOT_NULL_MSG(test, gse, "L0 page-table reclaim size missing");
+	kunit_info(test, "Guest Page-table Reclaim Size=%llu bytes",
+		   kvmppc_gse_get_u64(gse));
+
+	kvmppc_gsm_free(gsm);
+	kvmppc_gsb_free(gsb);
+}
+
 static struct kunit_case guest_state_buffer_testcases[] = {
 	KUNIT_CASE(test_creating_buffer),
 	KUNIT_CASE(test_adding_element),
 	KUNIT_CASE(test_gs_bitmap),
 	KUNIT_CASE(test_gs_parsing),
 	KUNIT_CASE(test_gs_msg),
+	KUNIT_CASE(test_gs_hostwide_msg),
+	KUNIT_CASE(test_gs_hostwide_counters),
 	{}
 };
 
diff --git a/arch/powerpc/kvm/timing.h b/arch/powerpc/kvm/timing.h
index 45817ab82bb4..14b0e23f601f 100644
--- a/arch/powerpc/kvm/timing.h
+++ b/arch/powerpc/kvm/timing.h
@@ -38,11 +38,7 @@ static inline void kvmppc_set_exit_type(struct kvm_vcpu *vcpu, int type) {}
 static inline void kvmppc_account_exit_stat(struct kvm_vcpu *vcpu, int type)
 {
 	/* type has to be known at build time for optimization */
-
-	/* The BUILD_BUG_ON below breaks in funny ways, commented out
-	 * for now ... -BenH
 	BUILD_BUG_ON(!__builtin_constant_p(type));
-	*/
 	switch (type) {
 	case EXT_INTR_EXITS:
 		vcpu->stat.ext_intr_exits++;
diff --git a/arch/powerpc/lib/Makefile b/arch/powerpc/lib/Makefile
index dd8a4b52a0cc..481f968e42c7 100644
--- a/arch/powerpc/lib/Makefile
+++ b/arch/powerpc/lib/Makefile
@@ -3,6 +3,8 @@
 # Makefile for ppc-specific library files..
 #
 
+obj-y += crypto/
+
 CFLAGS_code-patching.o += -fno-stack-protector
 CFLAGS_feature-fixups.o += -fno-stack-protector
 
@@ -79,9 +81,9 @@ CFLAGS_xor_vmx.o += -mhard-float -maltivec $(call cc-option,-mabi=altivec)
 CFLAGS_xor_vmx.o += -isystem $(shell $(CC) -print-file-name=include)
 
 obj-$(CONFIG_CRC32_ARCH) += crc32-powerpc.o
-crc32-powerpc-y := crc32-glue.o crc32c-vpmsum_asm.o
+crc32-powerpc-y := crc32.o crc32c-vpmsum_asm.o
 
 obj-$(CONFIG_CRC_T10DIF_ARCH) += crc-t10dif-powerpc.o
-crc-t10dif-powerpc-y := crc-t10dif-glue.o crct10dif-vpmsum_asm.o
+crc-t10dif-powerpc-y := crc-t10dif.o crct10dif-vpmsum_asm.o
 
 obj-$(CONFIG_PPC64) += $(obj64-y)
diff --git a/arch/powerpc/lib/crc-t10dif-glue.c b/arch/powerpc/lib/crc-t10dif.c
index f411b0120cc5..be23ded3a9df 100644
--- a/arch/powerpc/lib/crc-t10dif-glue.c
+++ b/arch/powerpc/lib/crc-t10dif.c
@@ -6,22 +6,22 @@
  * [based on crc32c-vpmsum_glue.c]
  */
 
-#include <linux/crc-t10dif.h>
+#include <asm/switch_to.h>
 #include <crypto/internal/simd.h>
-#include <linux/init.h>
-#include <linux/module.h>
-#include <linux/string.h>
-#include <linux/kernel.h>
 #include <linux/cpufeature.h>
-#include <asm/simd.h>
-#include <asm/switch_to.h>
+#include <linux/crc-t10dif.h>
+#include <linux/jump_label.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/preempt.h>
+#include <linux/uaccess.h>
 
 #define VMX_ALIGN		16
 #define VMX_ALIGN_MASK		(VMX_ALIGN-1)
 
 #define VECTOR_BREAKPOINT	64
 
-static DEFINE_STATIC_KEY_FALSE(have_vec_crypto);
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_vec_crypto);
 
 u32 __crct10dif_vpmsum(u32 crc, unsigned char const *p, size_t len);
 
@@ -71,7 +71,7 @@ static int __init crc_t10dif_powerpc_init(void)
 		static_branch_enable(&have_vec_crypto);
 	return 0;
 }
-arch_initcall(crc_t10dif_powerpc_init);
+subsys_initcall(crc_t10dif_powerpc_init);
 
 static void __exit crc_t10dif_powerpc_exit(void)
 {
diff --git a/arch/powerpc/lib/crc32-vpmsum_core.S b/arch/powerpc/lib/crc-vpmsum-template.S
index b0f87f595b26..b0f87f595b26 100644
--- a/arch/powerpc/lib/crc32-vpmsum_core.S
+++ b/arch/powerpc/lib/crc-vpmsum-template.S
diff --git a/arch/powerpc/lib/crc32-glue.c b/arch/powerpc/lib/crc32.c
index dbd10f339183..0d9befb6e7b8 100644
--- a/arch/powerpc/lib/crc32-glue.c
+++ b/arch/powerpc/lib/crc32.c
@@ -1,19 +1,20 @@
 // SPDX-License-Identifier: GPL-2.0-only
-#include <linux/crc32.h>
+#include <asm/switch_to.h>
 #include <crypto/internal/simd.h>
-#include <linux/init.h>
-#include <linux/module.h>
-#include <linux/kernel.h>
 #include <linux/cpufeature.h>
-#include <asm/simd.h>
-#include <asm/switch_to.h>
+#include <linux/crc32.h>
+#include <linux/jump_label.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/preempt.h>
+#include <linux/uaccess.h>
 
 #define VMX_ALIGN		16
 #define VMX_ALIGN_MASK		(VMX_ALIGN-1)
 
 #define VECTOR_BREAKPOINT	512
 
-static DEFINE_STATIC_KEY_FALSE(have_vec_crypto);
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_vec_crypto);
 
 u32 __crc32c_vpmsum(u32 crc, const u8 *p, size_t len);
 
@@ -72,7 +73,7 @@ static int __init crc32_powerpc_init(void)
 		static_branch_enable(&have_vec_crypto);
 	return 0;
 }
-arch_initcall(crc32_powerpc_init);
+subsys_initcall(crc32_powerpc_init);
 
 static void __exit crc32_powerpc_exit(void)
 {
diff --git a/arch/powerpc/lib/crc32c-vpmsum_asm.S b/arch/powerpc/lib/crc32c-vpmsum_asm.S
index bf442004ea1f..1b35c55cce0a 100644
--- a/arch/powerpc/lib/crc32c-vpmsum_asm.S
+++ b/arch/powerpc/lib/crc32c-vpmsum_asm.S
@@ -839,4 +839,4 @@
 
 #define CRC_FUNCTION_NAME __crc32c_vpmsum
 #define REFLECT
-#include "crc32-vpmsum_core.S"
+#include "crc-vpmsum-template.S"
diff --git a/arch/powerpc/lib/crct10dif-vpmsum_asm.S b/arch/powerpc/lib/crct10dif-vpmsum_asm.S
index f0b93a0fe168..47a6266d89a8 100644
--- a/arch/powerpc/lib/crct10dif-vpmsum_asm.S
+++ b/arch/powerpc/lib/crct10dif-vpmsum_asm.S
@@ -842,4 +842,4 @@
 	.octa 0x0000000000000000000000018bb70000
 
 #define CRC_FUNCTION_NAME __crct10dif_vpmsum
-#include "crc32-vpmsum_core.S"
+#include "crc-vpmsum-template.S"
diff --git a/arch/powerpc/lib/crypto/Kconfig b/arch/powerpc/lib/crypto/Kconfig
new file mode 100644
index 000000000000..3f9e1bbd9905
--- /dev/null
+++ b/arch/powerpc/lib/crypto/Kconfig
@@ -0,0 +1,22 @@
+# SPDX-License-Identifier: GPL-2.0-only
+
+config CRYPTO_CHACHA20_P10
+	tristate
+	depends on PPC64 && CPU_LITTLE_ENDIAN && VSX
+	default CRYPTO_LIB_CHACHA
+	select CRYPTO_LIB_CHACHA_GENERIC
+	select CRYPTO_ARCH_HAVE_LIB_CHACHA
+
+config CRYPTO_POLY1305_P10
+	tristate
+	depends on PPC64 && CPU_LITTLE_ENDIAN && VSX
+	depends on BROKEN # Needs to be fixed to work in softirq context
+	default CRYPTO_LIB_POLY1305
+	select CRYPTO_ARCH_HAVE_LIB_POLY1305
+	select CRYPTO_LIB_POLY1305_GENERIC
+
+config CRYPTO_SHA256_PPC_SPE
+	tristate
+	depends on SPE
+	default CRYPTO_LIB_SHA256
+	select CRYPTO_ARCH_HAVE_LIB_SHA256
diff --git a/arch/powerpc/lib/crypto/Makefile b/arch/powerpc/lib/crypto/Makefile
new file mode 100644
index 000000000000..27f231f8e334
--- /dev/null
+++ b/arch/powerpc/lib/crypto/Makefile
@@ -0,0 +1,10 @@
+# SPDX-License-Identifier: GPL-2.0-only
+
+obj-$(CONFIG_CRYPTO_CHACHA20_P10) += chacha-p10-crypto.o
+chacha-p10-crypto-y := chacha-p10-glue.o chacha-p10le-8x.o
+
+obj-$(CONFIG_CRYPTO_POLY1305_P10) += poly1305-p10-crypto.o
+poly1305-p10-crypto-y := poly1305-p10-glue.o poly1305-p10le_64.o
+
+obj-$(CONFIG_CRYPTO_SHA256_PPC_SPE) += sha256-ppc-spe.o
+sha256-ppc-spe-y := sha256.o sha256-spe-asm.o
diff --git a/arch/powerpc/lib/crypto/chacha-p10-glue.c b/arch/powerpc/lib/crypto/chacha-p10-glue.c
new file mode 100644
index 000000000000..fcd23c6f1590
--- /dev/null
+++ b/arch/powerpc/lib/crypto/chacha-p10-glue.c
@@ -0,0 +1,100 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * ChaCha stream cipher (P10 accelerated)
+ *
+ * Copyright 2023- IBM Corp. All rights reserved.
+ */
+
+#include <crypto/chacha.h>
+#include <crypto/internal/simd.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/cpufeature.h>
+#include <linux/sizes.h>
+#include <asm/simd.h>
+#include <asm/switch_to.h>
+
+asmlinkage void chacha_p10le_8x(const struct chacha_state *state, u8 *dst,
+				const u8 *src, unsigned int len, int nrounds);
+
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_p10);
+
+static void vsx_begin(void)
+{
+	preempt_disable();
+	enable_kernel_vsx();
+}
+
+static void vsx_end(void)
+{
+	disable_kernel_vsx();
+	preempt_enable();
+}
+
+static void chacha_p10_do_8x(struct chacha_state *state, u8 *dst, const u8 *src,
+			     unsigned int bytes, int nrounds)
+{
+	unsigned int l = bytes & ~0x0FF;
+
+	if (l > 0) {
+		chacha_p10le_8x(state, dst, src, l, nrounds);
+		bytes -= l;
+		src += l;
+		dst += l;
+		state->x[12] += l / CHACHA_BLOCK_SIZE;
+	}
+
+	if (bytes > 0)
+		chacha_crypt_generic(state, dst, src, bytes, nrounds);
+}
+
+void hchacha_block_arch(const struct chacha_state *state,
+			u32 out[HCHACHA_OUT_WORDS], int nrounds)
+{
+	hchacha_block_generic(state, out, nrounds);
+}
+EXPORT_SYMBOL(hchacha_block_arch);
+
+void chacha_crypt_arch(struct chacha_state *state, u8 *dst, const u8 *src,
+		       unsigned int bytes, int nrounds)
+{
+	if (!static_branch_likely(&have_p10) || bytes <= CHACHA_BLOCK_SIZE ||
+	    !crypto_simd_usable())
+		return chacha_crypt_generic(state, dst, src, bytes, nrounds);
+
+	do {
+		unsigned int todo = min_t(unsigned int, bytes, SZ_4K);
+
+		vsx_begin();
+		chacha_p10_do_8x(state, dst, src, todo, nrounds);
+		vsx_end();
+
+		bytes -= todo;
+		src += todo;
+		dst += todo;
+	} while (bytes);
+}
+EXPORT_SYMBOL(chacha_crypt_arch);
+
+bool chacha_is_arch_optimized(void)
+{
+	return static_key_enabled(&have_p10);
+}
+EXPORT_SYMBOL(chacha_is_arch_optimized);
+
+static int __init chacha_p10_init(void)
+{
+	if (cpu_has_feature(CPU_FTR_ARCH_31))
+		static_branch_enable(&have_p10);
+	return 0;
+}
+subsys_initcall(chacha_p10_init);
+
+static void __exit chacha_p10_exit(void)
+{
+}
+module_exit(chacha_p10_exit);
+
+MODULE_DESCRIPTION("ChaCha stream cipher (P10 accelerated)");
+MODULE_AUTHOR("Danny Tsen <dtsen@linux.ibm.com>");
+MODULE_LICENSE("GPL v2");
diff --git a/arch/powerpc/crypto/chacha-p10le-8x.S b/arch/powerpc/lib/crypto/chacha-p10le-8x.S
index 17bedb66b822..b29562bd5d40 100644
--- a/arch/powerpc/crypto/chacha-p10le-8x.S
+++ b/arch/powerpc/lib/crypto/chacha-p10le-8x.S
@@ -7,9 +7,6 @@
 #===================================================================================
 # Written by Danny Tsen <dtsen@us.ibm.com>
 #
-# chacha_p10le_8x(u32 *state, byte *dst, const byte *src,
-#				 size_t len, int nrounds);
-#
 # do rounds,  8 quarter rounds
 # 1.  a += b; d ^= a; d <<<= 16;
 # 2.  c += d; b ^= c; b <<<= 12;
@@ -575,7 +572,8 @@
 .endm
 
 #
-# chacha20_p10le_8x(u32 *state, byte *dst, const byte *src, size_t len, int nrounds);
+# void chacha_p10le_8x(const struct chacha_state *state, u8 *dst, const u8 *src,
+#		       unsigned int len, int nrounds);
 #
 SYM_FUNC_START(chacha_p10le_8x)
 .align 5
diff --git a/arch/powerpc/lib/crypto/poly1305-p10-glue.c b/arch/powerpc/lib/crypto/poly1305-p10-glue.c
new file mode 100644
index 000000000000..3f1664a724b6
--- /dev/null
+++ b/arch/powerpc/lib/crypto/poly1305-p10-glue.c
@@ -0,0 +1,96 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Poly1305 authenticator algorithm, RFC7539.
+ *
+ * Copyright 2023- IBM Corp. All rights reserved.
+ */
+#include <asm/switch_to.h>
+#include <crypto/internal/poly1305.h>
+#include <linux/cpufeature.h>
+#include <linux/jump_label.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/unaligned.h>
+
+asmlinkage void poly1305_p10le_4blocks(struct poly1305_block_state *state, const u8 *m, u32 mlen);
+asmlinkage void poly1305_64s(struct poly1305_block_state *state, const u8 *m, u32 mlen, int highbit);
+asmlinkage void poly1305_emit_64(const struct poly1305_state *state, const u32 nonce[4], u8 digest[POLY1305_DIGEST_SIZE]);
+
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_p10);
+
+static void vsx_begin(void)
+{
+	preempt_disable();
+	enable_kernel_vsx();
+}
+
+static void vsx_end(void)
+{
+	disable_kernel_vsx();
+	preempt_enable();
+}
+
+void poly1305_block_init_arch(struct poly1305_block_state *dctx,
+			      const u8 raw_key[POLY1305_BLOCK_SIZE])
+{
+	if (!static_key_enabled(&have_p10))
+		return poly1305_block_init_generic(dctx, raw_key);
+
+	dctx->h = (struct poly1305_state){};
+	dctx->core_r.key.r64[0] = get_unaligned_le64(raw_key + 0);
+	dctx->core_r.key.r64[1] = get_unaligned_le64(raw_key + 8);
+}
+EXPORT_SYMBOL_GPL(poly1305_block_init_arch);
+
+void poly1305_blocks_arch(struct poly1305_block_state *state, const u8 *src,
+			  unsigned int len, u32 padbit)
+{
+	if (!static_key_enabled(&have_p10))
+		return poly1305_blocks_generic(state, src, len, padbit);
+	vsx_begin();
+	if (len >= POLY1305_BLOCK_SIZE * 4) {
+		poly1305_p10le_4blocks(state, src, len);
+		src += len - (len % (POLY1305_BLOCK_SIZE * 4));
+		len %= POLY1305_BLOCK_SIZE * 4;
+	}
+	while (len >= POLY1305_BLOCK_SIZE) {
+		poly1305_64s(state, src, POLY1305_BLOCK_SIZE, padbit);
+		len -= POLY1305_BLOCK_SIZE;
+		src += POLY1305_BLOCK_SIZE;
+	}
+	vsx_end();
+}
+EXPORT_SYMBOL_GPL(poly1305_blocks_arch);
+
+void poly1305_emit_arch(const struct poly1305_state *state,
+			u8 digest[POLY1305_DIGEST_SIZE],
+			const u32 nonce[4])
+{
+	if (!static_key_enabled(&have_p10))
+		return poly1305_emit_generic(state, digest, nonce);
+	poly1305_emit_64(state, nonce, digest);
+}
+EXPORT_SYMBOL_GPL(poly1305_emit_arch);
+
+bool poly1305_is_arch_optimized(void)
+{
+	return static_key_enabled(&have_p10);
+}
+EXPORT_SYMBOL(poly1305_is_arch_optimized);
+
+static int __init poly1305_p10_init(void)
+{
+	if (cpu_has_feature(CPU_FTR_ARCH_31))
+		static_branch_enable(&have_p10);
+	return 0;
+}
+subsys_initcall(poly1305_p10_init);
+
+static void __exit poly1305_p10_exit(void)
+{
+}
+module_exit(poly1305_p10_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Danny Tsen <dtsen@linux.ibm.com>");
+MODULE_DESCRIPTION("Optimized Poly1305 for P10");
diff --git a/arch/powerpc/crypto/poly1305-p10le_64.S b/arch/powerpc/lib/crypto/poly1305-p10le_64.S
index a3c1987f1ecd..a3c1987f1ecd 100644
--- a/arch/powerpc/crypto/poly1305-p10le_64.S
+++ b/arch/powerpc/lib/crypto/poly1305-p10le_64.S
diff --git a/arch/powerpc/crypto/sha256-spe-asm.S b/arch/powerpc/lib/crypto/sha256-spe-asm.S
index cd99d71dae34..cd99d71dae34 100644
--- a/arch/powerpc/crypto/sha256-spe-asm.S
+++ b/arch/powerpc/lib/crypto/sha256-spe-asm.S
diff --git a/arch/powerpc/lib/crypto/sha256.c b/arch/powerpc/lib/crypto/sha256.c
new file mode 100644
index 000000000000..6b0f079587eb
--- /dev/null
+++ b/arch/powerpc/lib/crypto/sha256.c
@@ -0,0 +1,70 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * SHA-256 Secure Hash Algorithm, SPE optimized
+ *
+ * Based on generic implementation. The assembler module takes care
+ * of the SPE registers so it can run from interrupt context.
+ *
+ * Copyright (c) 2015 Markus Stockhausen <stockhausen@collogia.de>
+ */
+
+#include <asm/switch_to.h>
+#include <crypto/internal/sha2.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/preempt.h>
+
+/*
+ * MAX_BYTES defines the number of bytes that are allowed to be processed
+ * between preempt_disable() and preempt_enable(). SHA256 takes ~2,000
+ * operations per 64 bytes. e500 cores can issue two arithmetic instructions
+ * per clock cycle using one 32/64 bit unit (SU1) and one 32 bit unit (SU2).
+ * Thus 1KB of input data will need an estimated maximum of 18,000 cycles.
+ * Headroom for cache misses included. Even with the low end model clocked
+ * at 667 MHz this equals to a critical time window of less than 27us.
+ *
+ */
+#define MAX_BYTES 1024
+
+extern void ppc_spe_sha256_transform(u32 *state, const u8 *src, u32 blocks);
+
+static void spe_begin(void)
+{
+	/* We just start SPE operations and will save SPE registers later. */
+	preempt_disable();
+	enable_kernel_spe();
+}
+
+static void spe_end(void)
+{
+	disable_kernel_spe();
+	/* reenable preemption */
+	preempt_enable();
+}
+
+void sha256_blocks_arch(u32 state[SHA256_STATE_WORDS],
+			const u8 *data, size_t nblocks)
+{
+	do {
+		/* cut input data into smaller blocks */
+		u32 unit = min_t(size_t, nblocks,
+				 MAX_BYTES / SHA256_BLOCK_SIZE);
+
+		spe_begin();
+		ppc_spe_sha256_transform(state, data, unit);
+		spe_end();
+
+		data += unit * SHA256_BLOCK_SIZE;
+		nblocks -= unit;
+	} while (nblocks);
+}
+EXPORT_SYMBOL_GPL(sha256_blocks_arch);
+
+bool sha256_is_arch_optimized(void)
+{
+	return true;
+}
+EXPORT_SYMBOL_GPL(sha256_is_arch_optimized);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("SHA-256 Secure Hash Algorithm, SPE optimized");
diff --git a/arch/powerpc/lib/vmx-helper.c b/arch/powerpc/lib/vmx-helper.c
index 58ed6bd613a6..54340912398f 100644
--- a/arch/powerpc/lib/vmx-helper.c
+++ b/arch/powerpc/lib/vmx-helper.c
@@ -45,7 +45,7 @@ int exit_vmx_usercopy(void)
 	 * set and we are preemptible. The hack here is to schedule a
 	 * decrementer to fire here and reschedule for us if necessary.
 	 */
-	if (IS_ENABLED(CONFIG_PREEMPTION) && need_resched())
+	if (need_irq_preemption() && need_resched())
 		set_dec(1);
 	return 0;
 }
diff --git a/arch/powerpc/mm/book3s64/radix_pgtable.c b/arch/powerpc/mm/book3s64/radix_pgtable.c
index 311e2112d782..9f764bc42b8c 100644
--- a/arch/powerpc/mm/book3s64/radix_pgtable.c
+++ b/arch/powerpc/mm/book3s64/radix_pgtable.c
@@ -976,7 +976,7 @@ int __meminit radix__vmemmap_create_mapping(unsigned long start,
 	return 0;
 }
 
-
+#ifdef CONFIG_ARCH_WANT_OPTIMIZE_DAX_VMEMMAP
 bool vmemmap_can_optimize(struct vmem_altmap *altmap, struct dev_pagemap *pgmap)
 {
 	if (radix_enabled())
@@ -984,6 +984,7 @@ bool vmemmap_can_optimize(struct vmem_altmap *altmap, struct dev_pagemap *pgmap)
 
 	return false;
 }
+#endif
 
 int __meminit vmemmap_check_pmd(pmd_t *pmdp, int node,
 				unsigned long addr, unsigned long next)
@@ -1120,6 +1121,19 @@ int __meminit radix__vmemmap_populate(unsigned long start, unsigned long end, in
 	pmd_t *pmd;
 	pte_t *pte;
 
+	/*
+	 * Make sure we align the start vmemmap addr so that we calculate
+	 * the correct start_pfn in altmap boundary check to decide whether
+	 * we should use altmap or RAM based backing memory allocation. Also
+	 * the address needs to be aligned for set_pte operation.
+	 *
+	 * If the start addr is already PMD_SIZE aligned we will try to use
+	 * a pmd mapping. We don't want to be too aggressive here because
+	 * that will cause more allocations in RAM. So only if the namespace
+	 * vmemmap start addr is PMD_SIZE aligned we will use PMD mapping.
+	 */
+
+	start = ALIGN_DOWN(start, PAGE_SIZE);
 	for (addr = start; addr < end; addr = next) {
 		next = pmd_addr_end(addr, end);
 
@@ -1145,8 +1159,8 @@ int __meminit radix__vmemmap_populate(unsigned long start, unsigned long end, in
 			 * in altmap block allocation failures, in which case
 			 * we fallback to RAM for vmemmap allocation.
 			 */
-			if (altmap && (!IS_ALIGNED(addr, PMD_SIZE) ||
-				       altmap_cross_boundary(altmap, addr, PMD_SIZE))) {
+			if (!IS_ALIGNED(addr, PMD_SIZE) || (altmap &&
+			    altmap_cross_boundary(altmap, addr, PMD_SIZE))) {
 				/*
 				 * make sure we don't create altmap mappings
 				 * covering things outside the device.
diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c
index c156fe0d53c3..806c74e0d5ab 100644
--- a/arch/powerpc/mm/fault.c
+++ b/arch/powerpc/mm/fault.c
@@ -17,6 +17,7 @@
 #include <linux/kernel.h>
 #include <linux/errno.h>
 #include <linux/string.h>
+#include <linux/string_choices.h>
 #include <linux/types.h>
 #include <linux/pagemap.h>
 #include <linux/ptrace.h>
@@ -218,7 +219,7 @@ static bool bad_kernel_fault(struct pt_regs *regs, unsigned long error_code,
 	// Read/write fault blocked by KUAP is bad, it can never succeed.
 	if (bad_kuap_fault(regs, address, is_write)) {
 		pr_crit_ratelimited("Kernel attempted to %s user page (%lx) - exploit attempt? (uid: %d)\n",
-				    is_write ? "write" : "read", address,
+				    str_write_read(is_write), address,
 				    from_kuid(&init_user_ns, current_uid()));
 
 		// Fault on user outside of certain regions (eg. copy_tofrom_user()) is bad
@@ -625,7 +626,7 @@ static void __bad_page_fault(struct pt_regs *regs, int sig)
 	case INTERRUPT_DATA_STORAGE:
 	case INTERRUPT_H_DATA_STORAGE:
 		pr_alert("BUG: %s on %s at 0x%08lx\n", msg,
-			 is_write ? "write" : "read", regs->dar);
+			 str_write_read(is_write), regs->dar);
 		break;
 	case INTERRUPT_DATA_SEGMENT:
 		pr_alert("BUG: %s at 0x%08lx\n", msg, regs->dar);
diff --git a/arch/powerpc/mm/nohash/8xx.c b/arch/powerpc/mm/nohash/8xx.c
index 8b54f12d1889..ab1505cf42bf 100644
--- a/arch/powerpc/mm/nohash/8xx.c
+++ b/arch/powerpc/mm/nohash/8xx.c
@@ -54,20 +54,13 @@ static int __ref __early_map_kernel_hugepage(unsigned long va, phys_addr_t pa,
 {
 	pmd_t *pmdp = pmd_off_k(va);
 	pte_t *ptep;
-
-	if (WARN_ON(psize != MMU_PAGE_512K && psize != MMU_PAGE_8M))
-		return -EINVAL;
+	unsigned int shift = mmu_psize_to_shift(psize);
 
 	if (new) {
 		if (WARN_ON(slab_is_available()))
 			return -EINVAL;
 
-		if (psize == MMU_PAGE_512K) {
-			ptep = early_pte_alloc_kernel(pmdp, va);
-			/* The PTE should never be already present */
-			if (WARN_ON(pte_present(*ptep) && pgprot_val(prot)))
-				return -EINVAL;
-		} else {
+		if (psize == MMU_PAGE_8M) {
 			if (WARN_ON(!pmd_none(*pmdp) || !pmd_none(*(pmdp + 1))))
 				return -EINVAL;
 
@@ -78,20 +71,25 @@ static int __ref __early_map_kernel_hugepage(unsigned long va, phys_addr_t pa,
 			pmd_populate_kernel(&init_mm, pmdp + 1, ptep);
 
 			ptep = (pte_t *)pmdp;
+		} else {
+			ptep = early_pte_alloc_kernel(pmdp, va);
+			/* The PTE should never be already present */
+			if (WARN_ON(pte_present(*ptep) && pgprot_val(prot)))
+				return -EINVAL;
 		}
 	} else {
-		if (psize == MMU_PAGE_512K)
-			ptep = pte_offset_kernel(pmdp, va);
-		else
+		if (psize == MMU_PAGE_8M)
 			ptep = (pte_t *)pmdp;
+		else
+			ptep = pte_offset_kernel(pmdp, va);
 	}
 
 	if (WARN_ON(!ptep))
 		return -ENOMEM;
 
 	set_huge_pte_at(&init_mm, va, ptep,
-			pte_mkhuge(pfn_pte(pa >> PAGE_SHIFT, prot)),
-			1UL << mmu_psize_to_shift(psize));
+			arch_make_huge_pte(pfn_pte(pa >> PAGE_SHIFT, prot), shift, 0),
+			1UL << shift);
 
 	return 0;
 }
@@ -123,14 +121,18 @@ static int mmu_mapin_ram_chunk(unsigned long offset, unsigned long top,
 	unsigned long p = offset;
 	int err = 0;
 
-	WARN_ON(!IS_ALIGNED(offset, SZ_512K) || !IS_ALIGNED(top, SZ_512K));
+	WARN_ON(!IS_ALIGNED(offset, SZ_16K) || !IS_ALIGNED(top, SZ_16K));
 
+	for (; p < ALIGN(p, SZ_512K) && p < top && !err; p += SZ_16K, v += SZ_16K)
+		err = __early_map_kernel_hugepage(v, p, prot, MMU_PAGE_16K, new);
 	for (; p < ALIGN(p, SZ_8M) && p < top && !err; p += SZ_512K, v += SZ_512K)
 		err = __early_map_kernel_hugepage(v, p, prot, MMU_PAGE_512K, new);
 	for (; p < ALIGN_DOWN(top, SZ_8M) && p < top && !err; p += SZ_8M, v += SZ_8M)
 		err = __early_map_kernel_hugepage(v, p, prot, MMU_PAGE_8M, new);
 	for (; p < ALIGN_DOWN(top, SZ_512K) && p < top && !err; p += SZ_512K, v += SZ_512K)
 		err = __early_map_kernel_hugepage(v, p, prot, MMU_PAGE_512K, new);
+	for (; p < ALIGN_DOWN(top, SZ_16K) && p < top && !err; p += SZ_16K, v += SZ_16K)
+		err = __early_map_kernel_hugepage(v, p, prot, MMU_PAGE_16K, new);
 
 	if (!new)
 		flush_tlb_kernel_range(PAGE_OFFSET + v, PAGE_OFFSET + top);
diff --git a/arch/powerpc/net/bpf_jit.h b/arch/powerpc/net/bpf_jit.h
index 6beacaec63d3..4c26912c2e3c 100644
--- a/arch/powerpc/net/bpf_jit.h
+++ b/arch/powerpc/net/bpf_jit.h
@@ -51,8 +51,16 @@
 		EMIT(PPC_INST_BRANCH_COND | (((cond) & 0x3ff) << 16) | (offset & 0xfffc));					\
 	} while (0)
 
-/* Sign-extended 32-bit immediate load */
+/*
+ * Sign-extended 32-bit immediate load
+ *
+ * If this is a dummy pass (!image), account for
+ * maximum possible instructions.
+ */
 #define PPC_LI32(d, i)		do {					      \
+	if (!image)							      \
+		ctx->idx += 2;						      \
+	else {								      \
 		if ((int)(uintptr_t)(i) >= -32768 &&			      \
 				(int)(uintptr_t)(i) < 32768)		      \
 			EMIT(PPC_RAW_LI(d, i));				      \
@@ -60,10 +68,15 @@
 			EMIT(PPC_RAW_LIS(d, IMM_H(i)));			      \
 			if (IMM_L(i))					      \
 				EMIT(PPC_RAW_ORI(d, d, IMM_L(i)));	      \
-		} } while(0)
+		}							      \
+	} } while (0)
 
 #ifdef CONFIG_PPC64
+/* If dummy pass (!image), account for maximum possible instructions */
 #define PPC_LI64(d, i)		do {					      \
+	if (!image)							      \
+		ctx->idx += 5;						      \
+	else {								      \
 		if ((long)(i) >= -2147483648 &&				      \
 				(long)(i) < 2147483648)			      \
 			PPC_LI32(d, i);					      \
@@ -84,7 +97,8 @@
 			if ((uintptr_t)(i) & 0x000000000000ffffULL)	      \
 				EMIT(PPC_RAW_ORI(d, d, (uintptr_t)(i) &       \
 							0xffff));             \
-		} } while (0)
+		}							      \
+	} } while (0)
 #define PPC_LI_ADDR	PPC_LI64
 
 #ifndef CONFIG_PPC_KERNEL_PCREL
diff --git a/arch/powerpc/net/bpf_jit_comp.c b/arch/powerpc/net/bpf_jit_comp.c
index 2991bb171a9b..c0684733e9d6 100644
--- a/arch/powerpc/net/bpf_jit_comp.c
+++ b/arch/powerpc/net/bpf_jit_comp.c
@@ -504,10 +504,11 @@ static int invoke_bpf_prog(u32 *image, u32 *ro_image, struct codegen_context *ct
 	EMIT(PPC_RAW_ADDI(_R3, _R1, regs_off));
 	if (!p->jited)
 		PPC_LI_ADDR(_R4, (unsigned long)p->insnsi);
-	if (!create_branch(&branch_insn, (u32 *)&ro_image[ctx->idx], (unsigned long)p->bpf_func,
-			   BRANCH_SET_LINK)) {
-		if (image)
-			image[ctx->idx] = ppc_inst_val(branch_insn);
+	/* Account for max possible instructions during dummy pass for size calculation */
+	if (image && !create_branch(&branch_insn, (u32 *)&ro_image[ctx->idx],
+				    (unsigned long)p->bpf_func,
+				    BRANCH_SET_LINK)) {
+		image[ctx->idx] = ppc_inst_val(branch_insn);
 		ctx->idx++;
 	} else {
 		EMIT(PPC_RAW_LL(_R12, _R25, offsetof(struct bpf_prog, bpf_func)));
@@ -889,7 +890,8 @@ static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *rw_im
 			bpf_trampoline_restore_tail_call_cnt(image, ctx, func_frame_offset, r4_off);
 
 		/* Reserve space to patch branch instruction to skip fexit progs */
-		im->ip_after_call = &((u32 *)ro_image)[ctx->idx];
+		if (ro_image) /* image is NULL for dummy pass */
+			im->ip_after_call = &((u32 *)ro_image)[ctx->idx];
 		EMIT(PPC_RAW_NOP());
 	}
 
@@ -912,7 +914,8 @@ static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *rw_im
 		}
 
 	if (flags & BPF_TRAMP_F_CALL_ORIG) {
-		im->ip_epilogue = &((u32 *)ro_image)[ctx->idx];
+		if (ro_image) /* image is NULL for dummy pass */
+			im->ip_epilogue = &((u32 *)ro_image)[ctx->idx];
 		PPC_LI_ADDR(_R3, im);
 		ret = bpf_jit_emit_func_call_rel(image, ro_image, ctx,
 						 (unsigned long)__bpf_tramp_exit);
@@ -973,25 +976,9 @@ int arch_bpf_trampoline_size(const struct btf_func_model *m, u32 flags,
 			     struct bpf_tramp_links *tlinks, void *func_addr)
 {
 	struct bpf_tramp_image im;
-	void *image;
 	int ret;
 
-	/*
-	 * Allocate a temporary buffer for __arch_prepare_bpf_trampoline().
-	 * This will NOT cause fragmentation in direct map, as we do not
-	 * call set_memory_*() on this buffer.
-	 *
-	 * We cannot use kvmalloc here, because we need image to be in
-	 * module memory range.
-	 */
-	image = bpf_jit_alloc_exec(PAGE_SIZE);
-	if (!image)
-		return -ENOMEM;
-
-	ret = __arch_prepare_bpf_trampoline(&im, image, image + PAGE_SIZE, image,
-					    m, flags, tlinks, func_addr);
-	bpf_jit_free_exec(image);
-
+	ret = __arch_prepare_bpf_trampoline(&im, NULL, NULL, NULL, m, flags, tlinks, func_addr);
 	return ret;
 }
 
diff --git a/arch/powerpc/net/bpf_jit_comp32.c b/arch/powerpc/net/bpf_jit_comp32.c
index c4db278dae36..0aace304dfe1 100644
--- a/arch/powerpc/net/bpf_jit_comp32.c
+++ b/arch/powerpc/net/bpf_jit_comp32.c
@@ -313,7 +313,6 @@ int bpf_jit_build_body(struct bpf_prog *fp, u32 *image, u32 *fimage, struct code
 		u64 func_addr;
 		u32 true_cond;
 		u32 tmp_idx;
-		int j;
 
 		if (i && (BPF_CLASS(code) == BPF_ALU64 || BPF_CLASS(code) == BPF_ALU) &&
 		    (BPF_CLASS(prevcode) == BPF_ALU64 || BPF_CLASS(prevcode) == BPF_ALU) &&
@@ -1099,13 +1098,8 @@ int bpf_jit_build_body(struct bpf_prog *fp, u32 *image, u32 *fimage, struct code
 		 * 16 byte instruction that uses two 'struct bpf_insn'
 		 */
 		case BPF_LD | BPF_IMM | BPF_DW: /* dst = (u64) imm */
-			tmp_idx = ctx->idx;
 			PPC_LI32(dst_reg_h, (u32)insn[i + 1].imm);
 			PPC_LI32(dst_reg, (u32)insn[i].imm);
-			/* padding to allow full 4 instructions for later patching */
-			if (!image)
-				for (j = ctx->idx - tmp_idx; j < 4; j++)
-					EMIT(PPC_RAW_NOP());
 			/* Adjust for two bpf instructions */
 			addrs[++i] = ctx->idx * 4;
 			break;
diff --git a/arch/powerpc/net/bpf_jit_comp64.c b/arch/powerpc/net/bpf_jit_comp64.c
index 233703b06d7c..5daa77aee7f7 100644
--- a/arch/powerpc/net/bpf_jit_comp64.c
+++ b/arch/powerpc/net/bpf_jit_comp64.c
@@ -227,7 +227,14 @@ int bpf_jit_emit_func_call_rel(u32 *image, u32 *fimage, struct codegen_context *
 #ifdef CONFIG_PPC_KERNEL_PCREL
 	reladdr = func_addr - local_paca->kernelbase;
 
-	if (reladdr < (long)SZ_8G && reladdr >= -(long)SZ_8G) {
+	/*
+	 * If fimage is NULL (the initial pass to find image size),
+	 * account for the maximum no. of instructions possible.
+	 */
+	if (!fimage) {
+		ctx->idx += 7;
+		return 0;
+	} else if (reladdr < (long)SZ_8G && reladdr >= -(long)SZ_8G) {
 		EMIT(PPC_RAW_LD(_R12, _R13, offsetof(struct paca_struct, kernelbase)));
 		/* Align for subsequent prefix instruction */
 		if (!IS_ALIGNED((unsigned long)fimage + CTX_NIA(ctx), 8))
@@ -412,7 +419,6 @@ int bpf_jit_build_body(struct bpf_prog *fp, u32 *image, u32 *fimage, struct code
 		u64 imm64;
 		u32 true_cond;
 		u32 tmp_idx;
-		int j;
 
 		/*
 		 * addrs[] maps a BPF bytecode address into a real offset from
@@ -1046,12 +1052,7 @@ emit_clear:
 		case BPF_LD | BPF_IMM | BPF_DW: /* dst = (u64) imm */
 			imm64 = ((u64)(u32) insn[i].imm) |
 				    (((u64)(u32) insn[i+1].imm) << 32);
-			tmp_idx = ctx->idx;
 			PPC_LI64(dst_reg, imm64);
-			/* padding to allow full 5 instructions for later patching */
-			if (!image)
-				for (j = ctx->idx - tmp_idx; j < 5; j++)
-					EMIT(PPC_RAW_NOP());
 			/* Adjust for two bpf instructions */
 			addrs[++i] = ctx->idx * 4;
 			break;
diff --git a/arch/powerpc/perf/Makefile b/arch/powerpc/perf/Makefile
index ac2cf58d62db..7f53fcb7495a 100644
--- a/arch/powerpc/perf/Makefile
+++ b/arch/powerpc/perf/Makefile
@@ -18,6 +18,8 @@ obj-$(CONFIG_HV_PERF_CTRS) += hv-24x7.o hv-gpci.o hv-common.o
 
 obj-$(CONFIG_VPA_PMU) += vpa-pmu.o
 
+obj-$(CONFIG_KVM_BOOK3S_HV_PMU) += kvm-hv-pmu.o
+
 obj-$(CONFIG_PPC_8xx) += 8xx-pmu.o
 
 obj-$(CONFIG_PPC64)		+= $(obj64-y)
diff --git a/arch/powerpc/perf/kvm-hv-pmu.c b/arch/powerpc/perf/kvm-hv-pmu.c
new file mode 100644
index 000000000000..ae264c9080ef
--- /dev/null
+++ b/arch/powerpc/perf/kvm-hv-pmu.c
@@ -0,0 +1,435 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Description: PMUs specific to running nested KVM-HV guests
+ * on Book3S processors (specifically POWER9 and later).
+ */
+
+#define pr_fmt(fmt)  "kvmppc-pmu: " fmt
+
+#include "asm-generic/local64.h"
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/ratelimit.h>
+#include <linux/kvm_host.h>
+#include <linux/gfp_types.h>
+#include <linux/pgtable.h>
+#include <linux/perf_event.h>
+#include <linux/spinlock_types.h>
+#include <linux/spinlock.h>
+
+#include <asm/types.h>
+#include <asm/kvm_ppc.h>
+#include <asm/kvm_book3s.h>
+#include <asm/mmu.h>
+#include <asm/pgalloc.h>
+#include <asm/pte-walk.h>
+#include <asm/reg.h>
+#include <asm/plpar_wrappers.h>
+#include <asm/firmware.h>
+
+#include "asm/guest-state-buffer.h"
+
+enum kvmppc_pmu_eventid {
+	KVMPPC_EVENT_HOST_HEAP,
+	KVMPPC_EVENT_HOST_HEAP_MAX,
+	KVMPPC_EVENT_HOST_PGTABLE,
+	KVMPPC_EVENT_HOST_PGTABLE_MAX,
+	KVMPPC_EVENT_HOST_PGTABLE_RECLAIM,
+	KVMPPC_EVENT_MAX,
+};
+
+#define KVMPPC_PMU_EVENT_ATTR(_name, _id) \
+	PMU_EVENT_ATTR_ID(_name, kvmppc_events_sysfs_show, _id)
+
+static ssize_t kvmppc_events_sysfs_show(struct device *dev,
+					struct device_attribute *attr,
+					char *page)
+{
+	struct perf_pmu_events_attr *pmu_attr = container_of(attr, typeof(*pmu_attr), attr);
+
+	/* Report "event=0x<id>"; sysfs_emit() bounds the write to the sysfs page */
+	return sysfs_emit(page, "event=0x%02llx\n", pmu_attr->id);
+}
+
+/* Holds the hostwide stats */
+static struct kvmppc_hostwide_stats {
+	u64 guest_heap;
+	u64 guest_heap_max;
+	u64 guest_pgtable_size;
+	u64 guest_pgtable_size_max;
+	u64 guest_pgtable_reclaim;
+} l0_stats;
+
+/* Protect access to l0_stats */
+static DEFINE_SPINLOCK(lock_l0_stats);
+
+/* GSB related structs needed to talk to L0 */
+static struct kvmppc_gs_msg *gsm_l0_stats;
+static struct kvmppc_gs_buff *gsb_l0_stats;
+static struct kvmppc_gs_parser gsp_l0_stats;
+
+static struct attribute *kvmppc_pmu_events_attr[] = {
+	KVMPPC_PMU_EVENT_ATTR(host_heap, KVMPPC_EVENT_HOST_HEAP),
+	KVMPPC_PMU_EVENT_ATTR(host_heap_max, KVMPPC_EVENT_HOST_HEAP_MAX),
+	KVMPPC_PMU_EVENT_ATTR(host_pagetable, KVMPPC_EVENT_HOST_PGTABLE),
+	KVMPPC_PMU_EVENT_ATTR(host_pagetable_max, KVMPPC_EVENT_HOST_PGTABLE_MAX),
+	KVMPPC_PMU_EVENT_ATTR(host_pagetable_reclaim, KVMPPC_EVENT_HOST_PGTABLE_RECLAIM),
+	NULL,
+};
+
+static const struct attribute_group kvmppc_pmu_events_group = {
+	.name = "events",
+	.attrs = kvmppc_pmu_events_attr,
+};
+
+PMU_FORMAT_ATTR(event, "config:0-5");
+static struct attribute *kvmppc_pmu_format_attr[] = {
+	&format_attr_event.attr,
+	NULL,
+};
+
+static struct attribute_group kvmppc_pmu_format_group = {
+	.name = "format",
+	.attrs = kvmppc_pmu_format_attr,
+};
+
+static const struct attribute_group *kvmppc_pmu_attr_groups[] = {
+	&kvmppc_pmu_events_group,
+	&kvmppc_pmu_format_group,
+	NULL,
+};
+
+/*
+ * Issue the hcall to get the L0-host stats; returns 0 or the first error hit.
+ * Must be called with lock_l0_stats held.
+ */
+static int kvmppc_update_l0_stats(void)
+{
+	int rc;
+
+	/* With HOST_WIDE flags guestid and vcpuid will be ignored */
+	rc = kvmppc_gsb_recv(gsb_l0_stats, KVMPPC_GS_FLAGS_HOST_WIDE);
+	if (rc)
+		goto out;
+
+	/* The receive succeeded, so parse the returned guest state buffer */
+	rc = kvmppc_gse_parse(&gsp_l0_stats, gsb_l0_stats);
+	if (rc)
+		goto out;
+
+	/* Clear the cached stats, then refresh them from the L0 reply */
+	memset(&l0_stats, 0, sizeof(l0_stats));
+	rc = kvmppc_gsm_refresh_info(gsm_l0_stats, gsb_l0_stats);
+
+out:
+	return rc;
+}
+
+/* Refresh the L0 stats and update the given perf_event; returns 0 or an error */
+static int kvmppc_pmu_event_update(struct perf_event *event)
+{
+	int rc;
+	u64 curr_val, prev_val;
+	unsigned long flags;
+	unsigned int config = event->attr.config;
+
+	/* Hold the lock while l0_stats is refreshed and read */
+	spin_lock_irqsave(&lock_l0_stats, flags);
+
+	rc = kvmppc_update_l0_stats();
+	if (!rc) {
+		switch (config) {
+		case KVMPPC_EVENT_HOST_HEAP:
+			curr_val = l0_stats.guest_heap;
+			break;
+		case KVMPPC_EVENT_HOST_HEAP_MAX:
+			curr_val = l0_stats.guest_heap_max;
+			break;
+		case KVMPPC_EVENT_HOST_PGTABLE:
+			curr_val = l0_stats.guest_pgtable_size;
+			break;
+		case KVMPPC_EVENT_HOST_PGTABLE_MAX:
+			curr_val = l0_stats.guest_pgtable_size_max;
+			break;
+		case KVMPPC_EVENT_HOST_PGTABLE_RECLAIM:
+			curr_val = l0_stats.guest_pgtable_reclaim;
+			break;
+		default:
+			rc = -ENOENT;
+			break;
+		}
+	}
+
+	spin_unlock_irqrestore(&lock_l0_stats, flags);
+
+	/* If no error then add any increase over the previous value to the count */
+	if (!rc) {
+		prev_val = local64_xchg(&event->hw.prev_count, curr_val);
+		if (curr_val > prev_val)
+			local64_add(curr_val - prev_val, &event->count);
+	}
+
+	return rc;
+}
+
+static int kvmppc_pmu_event_init(struct perf_event *event)
+{
+	u64 config = event->attr.config;
+
+	pr_debug("%s: Event(%p) id=%llu cpu=%x on_cpu=%x config=%llu",
+		 __func__, event, event->id, event->cpu,
+		 event->oncpu, config);
+
+	if (event->attr.type != event->pmu->type)
+		return -ENOENT;
+	/* Compare all 64 config bits so ids with stray upper bits are rejected */
+	if (config >= KVMPPC_EVENT_MAX)
+		return -EINVAL;
+
+	local64_set(&event->hw.prev_count, 0);
+	local64_set(&event->count, 0);
+
+	return 0;
+}
+
+static void kvmppc_pmu_del(struct perf_event *event, int flags)
+{
+	kvmppc_pmu_event_update(event);
+}
+
+static int kvmppc_pmu_add(struct perf_event *event, int flags)
+{
+	if (flags & PERF_EF_START)
+		return kvmppc_pmu_event_update(event);
+	return 0;
+}
+
+static void kvmppc_pmu_read(struct perf_event *event)
+{
+	kvmppc_pmu_event_update(event);
+}
+
+/* Return the size of the needed guest state buffer */
+static size_t hostwide_get_size(struct kvmppc_gs_msg *gsm)
+
+{
+	size_t size = 0;
+	const u16 ids[] = {
+		KVMPPC_GSID_L0_GUEST_HEAP,
+		KVMPPC_GSID_L0_GUEST_HEAP_MAX,
+		KVMPPC_GSID_L0_GUEST_PGTABLE_SIZE,
+		KVMPPC_GSID_L0_GUEST_PGTABLE_SIZE_MAX,
+		KVMPPC_GSID_L0_GUEST_PGTABLE_RECLAIM
+	};
+
+	for (int i = 0; i < ARRAY_SIZE(ids); i++)
+		size += kvmppc_gse_total_size(kvmppc_gsid_size(ids[i]));
+	return size;
+}
+
+/* Populate the request guest state buffer; returns 0 or the first put error */
+static int hostwide_fill_info(struct kvmppc_gs_buff *gsb,
+			      struct kvmppc_gs_msg *gsm)
+{
+	int rc = 0;
+	struct kvmppc_hostwide_stats  *stats = gsm->data;
+
+	/*
+	 * It doesn't matter what values are put into the request buffer as
+	 * they are going to be overwritten anyway. But for the sake of
+	 * testcode and symmetry, the contents of the existing stats are
+	 * populated into the request guest state buffer.
+	 */
+	if (kvmppc_gsm_includes(gsm, KVMPPC_GSID_L0_GUEST_HEAP))
+		rc = kvmppc_gse_put_u64(gsb,
+					KVMPPC_GSID_L0_GUEST_HEAP,
+					stats->guest_heap);
+
+	if (!rc && kvmppc_gsm_includes(gsm, KVMPPC_GSID_L0_GUEST_HEAP_MAX))
+		rc = kvmppc_gse_put_u64(gsb,
+					KVMPPC_GSID_L0_GUEST_HEAP_MAX,
+					stats->guest_heap_max);
+
+	if (!rc && kvmppc_gsm_includes(gsm, KVMPPC_GSID_L0_GUEST_PGTABLE_SIZE))
+		rc = kvmppc_gse_put_u64(gsb,
+					KVMPPC_GSID_L0_GUEST_PGTABLE_SIZE,
+					stats->guest_pgtable_size);
+	if (!rc &&
+	    kvmppc_gsm_includes(gsm, KVMPPC_GSID_L0_GUEST_PGTABLE_SIZE_MAX))
+		rc = kvmppc_gse_put_u64(gsb,
+					KVMPPC_GSID_L0_GUEST_PGTABLE_SIZE_MAX,
+					stats->guest_pgtable_size_max);
+	if (!rc &&
+	    kvmppc_gsm_includes(gsm, KVMPPC_GSID_L0_GUEST_PGTABLE_RECLAIM))
+		rc = kvmppc_gse_put_u64(gsb,
+					KVMPPC_GSID_L0_GUEST_PGTABLE_RECLAIM,
+					stats->guest_pgtable_reclaim);
+
+	return rc;
+}
+
+/* Parse the returned gsb and copy every stat element found to gsm->data */
+static int hostwide_refresh_info(struct kvmppc_gs_msg *gsm,
+				 struct kvmppc_gs_buff *gsb)
+{
+	struct kvmppc_gs_parser gsp = { 0 };
+	struct kvmppc_hostwide_stats *stats = gsm->data;
+	struct kvmppc_gs_elem *gse;
+	int rc;
+
+	rc = kvmppc_gse_parse(&gsp, gsb);
+	if (rc < 0)
+		return rc;
+
+	gse = kvmppc_gsp_lookup(&gsp, KVMPPC_GSID_L0_GUEST_HEAP);
+	if (gse)
+		stats->guest_heap = kvmppc_gse_get_u64(gse);
+
+	gse = kvmppc_gsp_lookup(&gsp, KVMPPC_GSID_L0_GUEST_HEAP_MAX);
+	if (gse)
+		stats->guest_heap_max = kvmppc_gse_get_u64(gse);
+
+	gse = kvmppc_gsp_lookup(&gsp, KVMPPC_GSID_L0_GUEST_PGTABLE_SIZE);
+	if (gse)
+		stats->guest_pgtable_size = kvmppc_gse_get_u64(gse);
+
+	gse = kvmppc_gsp_lookup(&gsp, KVMPPC_GSID_L0_GUEST_PGTABLE_SIZE_MAX);
+	if (gse)
+		stats->guest_pgtable_size_max = kvmppc_gse_get_u64(gse);
+
+	gse = kvmppc_gsp_lookup(&gsp, KVMPPC_GSID_L0_GUEST_PGTABLE_RECLAIM);
+	if (gse)
+		stats->guest_pgtable_reclaim = kvmppc_gse_get_u64(gse);
+
+	return 0;	/* idents missing from the reply keep their old value */
+}
+
+/* gsb-message ops for setting up/parsing */
+static struct kvmppc_gs_msg_ops gsb_ops_l0_stats = {
+	.get_size = hostwide_get_size,
+	.fill_info = hostwide_fill_info,
+	.refresh_info = hostwide_refresh_info,
+};
+
+/* Set up the gsm/gsb pair used to fetch L0 stats. Returns 0 or -errno. */
+static int kvmppc_init_hostwide(void)
+{
+	int rc = 0;
+	unsigned long flags;
+
+	spin_lock_irqsave(&lock_l0_stats, flags);
+
+	/* already registered ? Nothing to do, rc is still 0 */
+	if (gsm_l0_stats)
+		goto out;
+
+	/* setup the message to talk to L0; lock is held, so must not sleep */
+	gsm_l0_stats = kvmppc_gsm_new(&gsb_ops_l0_stats, &l0_stats,
+				      GSM_SEND, GFP_ATOMIC);
+	if (!gsm_l0_stats) {
+		rc = -ENOMEM;
+		goto out;
+	}
+
+	/* Populate the Idents */
+	kvmppc_gsm_include(gsm_l0_stats, KVMPPC_GSID_L0_GUEST_HEAP);
+	kvmppc_gsm_include(gsm_l0_stats, KVMPPC_GSID_L0_GUEST_HEAP_MAX);
+	kvmppc_gsm_include(gsm_l0_stats, KVMPPC_GSID_L0_GUEST_PGTABLE_SIZE);
+	kvmppc_gsm_include(gsm_l0_stats, KVMPPC_GSID_L0_GUEST_PGTABLE_SIZE_MAX);
+	kvmppc_gsm_include(gsm_l0_stats, KVMPPC_GSID_L0_GUEST_PGTABLE_RECLAIM);
+
+	/* allocate GSB. Guest/Vcpu Id is ignored */
+	gsb_l0_stats = kvmppc_gsb_new(kvmppc_gsm_size(gsm_l0_stats), 0, 0,
+				      GFP_ATOMIC);
+	if (!gsb_l0_stats) {
+		rc = -ENOMEM;
+		goto out;
+	}
+
+	/* ask the ops to fill in the info */
+	rc = kvmppc_gsm_fill_info(gsm_l0_stats, gsb_l0_stats);
+
+out:
+	/* On failure release whatever was allocated and reset both pointers */
+	if (rc) {
+		if (gsm_l0_stats)
+			kvmppc_gsm_free(gsm_l0_stats);
+		if (gsb_l0_stats)
+			kvmppc_gsb_free(gsb_l0_stats);
+		gsm_l0_stats = NULL;
+		gsb_l0_stats = NULL;
+	}
+	spin_unlock_irqrestore(&lock_l0_stats, flags);
+	return rc;
+}
+
+static void kvmppc_cleanup_hostwide(void)
+{
+	unsigned long flags;
+
+	/* Free the gsm/gsb if allocated and reset both pointers, under lock */
+	spin_lock_irqsave(&lock_l0_stats, flags);
+	if (gsm_l0_stats)
+		kvmppc_gsm_free(gsm_l0_stats);
+	if (gsb_l0_stats)
+		kvmppc_gsb_free(gsb_l0_stats);
+	gsm_l0_stats = NULL;
+	gsb_l0_stats = NULL;
+
+	spin_unlock_irqrestore(&lock_l0_stats, flags);
+}
+
+/* L1 wide counters PMU */
+static struct pmu kvmppc_pmu = {
+	.module = THIS_MODULE,
+	.task_ctx_nr = perf_sw_context,
+	.name = "kvm-hv",
+	.event_init = kvmppc_pmu_event_init,
+	.add = kvmppc_pmu_add,
+	.del = kvmppc_pmu_del,
+	.read = kvmppc_pmu_read,
+	.attr_groups = kvmppc_pmu_attr_groups,
+	.type = -1,
+	.scope = PERF_PMU_SCOPE_SYS_WIDE,
+	.capabilities = PERF_PMU_CAP_NO_EXCLUDE | PERF_PMU_CAP_NO_INTERRUPT,
+};
+
+/* Module init: set up L0 stats and register the PMU. 0 or -errno. */
+static int __init kvmppc_register_pmu(void)
+{
+	int rc;
+
+	/* only support events for nestedv2 right now */
+	if (!kvmhv_is_nestedv2())
+		return -EOPNOTSUPP;
+
+	rc = kvmppc_init_hostwide();
+	if (rc)
+		return rc;
+
+	/* Register the pmu; on failure free the gsm/gsb allocated above */
+	rc = perf_pmu_register(&kvmppc_pmu, kvmppc_pmu.name, -1);
+	if (rc)
+		kvmppc_cleanup_hostwide();
+	else
+		pr_info("Registered kvm-hv pmu\n");
+	return rc;
+}
+
+static void __exit kvmppc_unregister_pmu(void)
+{
+	if (kvmhv_is_nestedv2()) {
+		/* Unregister first so no event can run against freed gsm/gsb */
+		if (kvmppc_pmu.type != -1)
+			perf_pmu_unregister(&kvmppc_pmu);
+
+		kvmppc_cleanup_hostwide();
+		pr_info("kvmhv_pmu unregistered.\n");
+	}
+}
+
+module_init(kvmppc_register_pmu);
+module_exit(kvmppc_unregister_pmu);
+MODULE_DESCRIPTION("KVM PPC Book3s-hv PMU");
+MODULE_AUTHOR("Vaibhav Jain <vaibhav@linux.ibm.com>");
+MODULE_LICENSE("GPL");
diff --git a/arch/powerpc/platforms/44x/gpio.c b/arch/powerpc/platforms/44x/gpio.c
index e5f2319e5cbe..d540e261d85a 100644
--- a/arch/powerpc/platforms/44x/gpio.c
+++ b/arch/powerpc/platforms/44x/gpio.c
@@ -75,8 +75,7 @@ __ppc4xx_gpio_set(struct gpio_chip *gc, unsigned int gpio, int val)
 		clrbits32(&regs->or, GPIO_MASK(gpio));
 }
 
-static void
-ppc4xx_gpio_set(struct gpio_chip *gc, unsigned int gpio, int val)
+static int ppc4xx_gpio_set(struct gpio_chip *gc, unsigned int gpio, int val)
 {
 	struct ppc4xx_gpio_chip *chip = gpiochip_get_data(gc);
 	unsigned long flags;
@@ -88,6 +87,8 @@ ppc4xx_gpio_set(struct gpio_chip *gc, unsigned int gpio, int val)
 	spin_unlock_irqrestore(&chip->lock, flags);
 
 	pr_debug("%s: gpio: %d val: %d\n", __func__, gpio, val);
+
+	return 0;
 }
 
 static int ppc4xx_gpio_dir_in(struct gpio_chip *gc, unsigned int gpio)
@@ -179,7 +180,7 @@ static int __init ppc4xx_add_gpiochips(void)
 		gc->direction_input = ppc4xx_gpio_dir_in;
 		gc->direction_output = ppc4xx_gpio_dir_out;
 		gc->get = ppc4xx_gpio_get;
-		gc->set = ppc4xx_gpio_set;
+		gc->set_rv = ppc4xx_gpio_set;
 
 		ret = of_mm_gpiochip_add_data(np, mm_gc, ppc4xx_gc);
 		if (ret)
diff --git a/arch/powerpc/platforms/52xx/mpc52xx_gpt.c b/arch/powerpc/platforms/52xx/mpc52xx_gpt.c
index 1ea591ec6083..c96af6b0eab4 100644
--- a/arch/powerpc/platforms/52xx/mpc52xx_gpt.c
+++ b/arch/powerpc/platforms/52xx/mpc52xx_gpt.c
@@ -280,7 +280,7 @@ static int mpc52xx_gpt_gpio_get(struct gpio_chip *gc, unsigned int gpio)
 	return (in_be32(&gpt->regs->status) >> 8) & 1;
 }
 
-static void
+static int
 mpc52xx_gpt_gpio_set(struct gpio_chip *gc, unsigned int gpio, int v)
 {
 	struct mpc52xx_gpt_priv *gpt = gpiochip_get_data(gc);
@@ -293,6 +293,8 @@ mpc52xx_gpt_gpio_set(struct gpio_chip *gc, unsigned int gpio, int v)
 	raw_spin_lock_irqsave(&gpt->lock, flags);
 	clrsetbits_be32(&gpt->regs->mode, MPC52xx_GPT_MODE_GPIO_MASK, r);
 	raw_spin_unlock_irqrestore(&gpt->lock, flags);
+
+	return 0;
 }
 
 static int mpc52xx_gpt_gpio_dir_in(struct gpio_chip *gc, unsigned int gpio)
@@ -334,7 +336,7 @@ static void mpc52xx_gpt_gpio_setup(struct mpc52xx_gpt_priv *gpt)
 	gpt->gc.direction_input  = mpc52xx_gpt_gpio_dir_in;
 	gpt->gc.direction_output = mpc52xx_gpt_gpio_dir_out;
 	gpt->gc.get = mpc52xx_gpt_gpio_get;
-	gpt->gc.set = mpc52xx_gpt_gpio_set;
+	gpt->gc.set_rv = mpc52xx_gpt_gpio_set;
 	gpt->gc.base = -1;
 	gpt->gc.parent = gpt->dev;
 
diff --git a/arch/powerpc/platforms/83xx/mcu_mpc8349emitx.c b/arch/powerpc/platforms/83xx/mcu_mpc8349emitx.c
index 4d8fa9ed1a67..6e37dfc6c5c9 100644
--- a/arch/powerpc/platforms/83xx/mcu_mpc8349emitx.c
+++ b/arch/powerpc/platforms/83xx/mcu_mpc8349emitx.c
@@ -92,10 +92,11 @@ static void mcu_power_off(void)
 	mutex_unlock(&mcu->lock);
 }
 
-static void mcu_gpio_set(struct gpio_chip *gc, unsigned int gpio, int val)
+static int mcu_gpio_set(struct gpio_chip *gc, unsigned int gpio, int val)
 {
 	struct mcu *mcu = gpiochip_get_data(gc);
 	u8 bit = 1 << (4 + gpio);
+	int ret;
 
 	mutex_lock(&mcu->lock);
 	if (val)
@@ -103,14 +104,16 @@ static void mcu_gpio_set(struct gpio_chip *gc, unsigned int gpio, int val)
 	else
 		mcu->reg_ctrl |= bit;
 
-	i2c_smbus_write_byte_data(mcu->client, MCU_REG_CTRL, mcu->reg_ctrl);
+	ret = i2c_smbus_write_byte_data(mcu->client, MCU_REG_CTRL,
+					mcu->reg_ctrl);
 	mutex_unlock(&mcu->lock);
+
+	return ret;
 }
 
 static int mcu_gpio_dir_out(struct gpio_chip *gc, unsigned int gpio, int val)
 {
-	mcu_gpio_set(gc, gpio, val);
-	return 0;
+	return mcu_gpio_set(gc, gpio, val);
 }
 
 static int mcu_gpiochip_add(struct mcu *mcu)
@@ -123,7 +126,7 @@ static int mcu_gpiochip_add(struct mcu *mcu)
 	gc->can_sleep = 1;
 	gc->ngpio = MCU_NUM_GPIO;
 	gc->base = -1;
-	gc->set = mcu_gpio_set;
+	gc->set_rv = mcu_gpio_set;
 	gc->direction_output = mcu_gpio_dir_out;
 	gc->parent = dev;
 
diff --git a/arch/powerpc/platforms/8xx/cpm1.c b/arch/powerpc/platforms/8xx/cpm1.c
index 1dc095ad48fc..7462c221115c 100644
--- a/arch/powerpc/platforms/8xx/cpm1.c
+++ b/arch/powerpc/platforms/8xx/cpm1.c
@@ -417,7 +417,7 @@ static void __cpm1_gpio16_set(struct cpm1_gpio16_chip *cpm1_gc, u16 pin_mask, in
 	out_be16(&iop->dat, cpm1_gc->cpdata);
 }
 
-static void cpm1_gpio16_set(struct gpio_chip *gc, unsigned int gpio, int value)
+static int cpm1_gpio16_set(struct gpio_chip *gc, unsigned int gpio, int value)
 {
 	struct cpm1_gpio16_chip *cpm1_gc = gpiochip_get_data(gc);
 	unsigned long flags;
@@ -428,6 +428,8 @@ static void cpm1_gpio16_set(struct gpio_chip *gc, unsigned int gpio, int value)
 	__cpm1_gpio16_set(cpm1_gc, pin_mask, value);
 
 	spin_unlock_irqrestore(&cpm1_gc->lock, flags);
+
+	return 0;
 }
 
 static int cpm1_gpio16_to_irq(struct gpio_chip *gc, unsigned int gpio)
@@ -497,7 +499,7 @@ int cpm1_gpiochip_add16(struct device *dev)
 	gc->direction_input = cpm1_gpio16_dir_in;
 	gc->direction_output = cpm1_gpio16_dir_out;
 	gc->get = cpm1_gpio16_get;
-	gc->set = cpm1_gpio16_set;
+	gc->set_rv = cpm1_gpio16_set;
 	gc->to_irq = cpm1_gpio16_to_irq;
 	gc->parent = dev;
 	gc->owner = THIS_MODULE;
@@ -554,7 +556,7 @@ static void __cpm1_gpio32_set(struct cpm1_gpio32_chip *cpm1_gc, u32 pin_mask, in
 	out_be32(&iop->dat, cpm1_gc->cpdata);
 }
 
-static void cpm1_gpio32_set(struct gpio_chip *gc, unsigned int gpio, int value)
+static int cpm1_gpio32_set(struct gpio_chip *gc, unsigned int gpio, int value)
 {
 	struct cpm1_gpio32_chip *cpm1_gc = gpiochip_get_data(gc);
 	unsigned long flags;
@@ -565,6 +567,8 @@ static void cpm1_gpio32_set(struct gpio_chip *gc, unsigned int gpio, int value)
 	__cpm1_gpio32_set(cpm1_gc, pin_mask, value);
 
 	spin_unlock_irqrestore(&cpm1_gc->lock, flags);
+
+	return 0;
 }
 
 static int cpm1_gpio32_dir_out(struct gpio_chip *gc, unsigned int gpio, int val)
@@ -618,7 +622,7 @@ int cpm1_gpiochip_add32(struct device *dev)
 	gc->direction_input = cpm1_gpio32_dir_in;
 	gc->direction_output = cpm1_gpio32_dir_out;
 	gc->get = cpm1_gpio32_get;
-	gc->set = cpm1_gpio32_set;
+	gc->set_rv = cpm1_gpio32_set;
 	gc->parent = dev;
 	gc->owner = THIS_MODULE;
 
diff --git a/arch/powerpc/platforms/powermac/setup.c b/arch/powerpc/platforms/powermac/setup.c
index 6de1cd5d8a58..e119ced05d10 100644
--- a/arch/powerpc/platforms/powermac/setup.c
+++ b/arch/powerpc/platforms/powermac/setup.c
@@ -45,6 +45,7 @@
 #include <linux/root_dev.h>
 #include <linux/bitops.h>
 #include <linux/suspend.h>
+#include <linux/string_choices.h>
 #include <linux/of.h>
 #include <linux/of_platform.h>
 
@@ -238,8 +239,7 @@ static void __init l2cr_init(void)
 				_set_L2CR(0);
 				_set_L2CR(*l2cr);
 				pr_info("L2CR overridden (0x%x), backside cache is %s\n",
-					*l2cr, ((*l2cr) & 0x80000000) ?
-					"enabled" : "disabled");
+					*l2cr, str_enabled_disabled((*l2cr) & 0x80000000));
 			}
 			of_node_put(np);
 			break;
diff --git a/arch/powerpc/platforms/powermac/time.c b/arch/powerpc/platforms/powermac/time.c
index 8633891b7aa5..b4426a35aca3 100644
--- a/arch/powerpc/platforms/powermac/time.c
+++ b/arch/powerpc/platforms/powermac/time.c
@@ -15,6 +15,7 @@
 #include <linux/kernel.h>
 #include <linux/param.h>
 #include <linux/string.h>
+#include <linux/string_choices.h>
 #include <linux/mm.h>
 #include <linux/init.h>
 #include <linux/time.h>
@@ -77,7 +78,7 @@ long __init pmac_time_init(void)
 		delta |= 0xFF000000UL;
 	dst = ((pmac_xpram_read(PMAC_XPRAM_MACHINE_LOC + 0x8) & 0x80) != 0);
 	printk("GMT Delta read from XPRAM: %d minutes, DST: %s\n", delta/60,
-		dst ? "on" : "off");
+		str_on_off(dst));
 #endif
 	return delta;
 }
diff --git a/arch/powerpc/platforms/powernv/Kconfig b/arch/powerpc/platforms/powernv/Kconfig
index 3fbe0295ce14..95d7ba73d43d 100644
--- a/arch/powerpc/platforms/powernv/Kconfig
+++ b/arch/powerpc/platforms/powernv/Kconfig
@@ -17,7 +17,7 @@ config PPC_POWERNV
 	select MMU_NOTIFIER
 	select FORCE_SMP
 	select ARCH_SUPPORTS_PER_VMA_LOCK
-	select PPC_RADIX_BROADCAST_TLBIE
+	select PPC_RADIX_BROADCAST_TLBIE if PPC_RADIX_MMU
 	default y
 
 config OPAL_PRD
diff --git a/arch/powerpc/platforms/ps3/device-init.c b/arch/powerpc/platforms/ps3/device-init.c
index 61722133eb2d..22d91ac424dd 100644
--- a/arch/powerpc/platforms/ps3/device-init.c
+++ b/arch/powerpc/platforms/ps3/device-init.c
@@ -14,6 +14,7 @@
 #include <linux/slab.h>
 #include <linux/reboot.h>
 #include <linux/rcuwait.h>
+#include <linux/string_choices.h>
 
 #include <asm/firmware.h>
 #include <asm/lv1call.h>
@@ -724,7 +725,7 @@ static irqreturn_t ps3_notification_interrupt(int irq, void *data)
 static int ps3_notification_read_write(struct ps3_notification_device *dev,
 				       u64 lpar, int write)
 {
-	const char *op = write ? "write" : "read";
+	const char *op = str_write_read(write);
 	unsigned long flags;
 	int res;
 
diff --git a/arch/powerpc/platforms/pseries/Kconfig b/arch/powerpc/platforms/pseries/Kconfig
index a934c2a262f6..fa3c2fff082a 100644
--- a/arch/powerpc/platforms/pseries/Kconfig
+++ b/arch/powerpc/platforms/pseries/Kconfig
@@ -23,7 +23,7 @@ config PPC_PSERIES
 	select FORCE_SMP
 	select SWIOTLB
 	select ARCH_SUPPORTS_PER_VMA_LOCK
-	select PPC_RADIX_BROADCAST_TLBIE
+	select PPC_RADIX_BROADCAST_TLBIE if PPC_RADIX_MMU
 	default y
 
 config PARAVIRT
diff --git a/arch/powerpc/platforms/pseries/Makefile b/arch/powerpc/platforms/pseries/Makefile
index 3f3e3492e436..57222678bb3f 100644
--- a/arch/powerpc/platforms/pseries/Makefile
+++ b/arch/powerpc/platforms/pseries/Makefile
@@ -3,7 +3,8 @@ ccflags-$(CONFIG_PPC_PSERIES_DEBUG)	+= -DDEBUG
 
 obj-y			:= lpar.o hvCall.o nvram.o reconfig.o \
 			   of_helpers.o rtas-work-area.o papr-sysparm.o \
-			   papr-vpd.o \
+			   papr-rtas-common.o papr-vpd.o papr-indices.o \
+			   papr-platform-dump.o papr-phy-attest.o \
 			   setup.o iommu.o event_sources.o ras.o \
 			   firmware.o power.o dlpar.o mobility.o rng.o \
 			   pci.o pci_dlpar.o eeh_pseries.o msi.o \
diff --git a/arch/powerpc/platforms/pseries/htmdump.c b/arch/powerpc/platforms/pseries/htmdump.c
index 57fc1700f604..742ec52c9d4d 100644
--- a/arch/powerpc/platforms/pseries/htmdump.c
+++ b/arch/powerpc/platforms/pseries/htmdump.c
@@ -10,28 +10,40 @@
 #include <asm/io.h>
 #include <asm/machdep.h>
 #include <asm/plpar_wrappers.h>
+#include <asm/kvm_guest.h>
 
 static void *htm_buf;
+static void *htm_status_buf;
+static void *htm_info_buf;
+static void *htm_caps_buf;
 static u32 nodeindex;
 static u32 nodalchipindex;
 static u32 coreindexonchip;
 static u32 htmtype;
+static u32 htmconfigure;
+static u32 htmstart;
+static u32 htmsetup;
+static u64 htmflags;
+
 static struct dentry *htmdump_debugfs_dir;
+#define	HTM_ENABLE	1
+#define	HTM_DISABLE	0
+#define	HTM_NOWRAP	1
+#define	HTM_WRAP	0
 
-static ssize_t htmdump_read(struct file *filp, char __user *ubuf,
-			     size_t count, loff_t *ppos)
+/*
+ * Check the return code for H_HTM hcall.
+ * Return non-zero value (1) if either H_PARTIAL or H_SUCCESS
+ * is returned. For other return codes:
+ * Return zero if H_NOT_AVAILABLE.
+ * Return -EBUSY if hcall return busy.
+ * Return -EINVAL if any parameter or operation is not valid.
+ * Return -EPERM if HTM Virtualization Engine Technology code
+ * is not applied.
+ * Return -EIO if the HTM state is not valid.
+ */
+static ssize_t htm_return_check(long rc)
 {
-	void *htm_buf = filp->private_data;
-	unsigned long page, read_size, available;
-	loff_t offset;
-	long rc;
-
-	page = ALIGN_DOWN(*ppos, PAGE_SIZE);
-	offset = (*ppos) % PAGE_SIZE;
-
-	rc = htm_get_dump_hardware(nodeindex, nodalchipindex, coreindexonchip,
-				   htmtype, virt_to_phys(htm_buf), PAGE_SIZE, page);
-
 	switch (rc) {
 	case H_SUCCESS:
 	/* H_PARTIAL for the case where all available data can't be
@@ -65,6 +77,38 @@ static ssize_t htmdump_read(struct file *filp, char __user *ubuf,
 		return -EPERM;
 	}
 
+	/*
+	 * Return 1 for H_SUCCESS/H_PARTIAL
+	 */
+	return 1;
+}
+
+static ssize_t htmdump_read(struct file *filp, char __user *ubuf,
+			     size_t count, loff_t *ppos)
+{
+	void *htm_buf = filp->private_data;
+	unsigned long page, read_size, available;
+	loff_t offset;
+	long rc, ret;
+
+	page = ALIGN_DOWN(*ppos, PAGE_SIZE);
+	offset = (*ppos) % PAGE_SIZE;
+
+	/*
+	 * Invoke H_HTM call with:
+	 * - operation as htm dump (H_HTM_OP_DUMP_DATA)
+	 * - last three values are address, size and offset
+	 */
+	rc = htm_hcall_wrapper(htmflags, nodeindex, nodalchipindex, coreindexonchip,
+				   htmtype, H_HTM_OP_DUMP_DATA, virt_to_phys(htm_buf),
+				   PAGE_SIZE, page);
+
+	ret = htm_return_check(rc);
+	if (ret <= 0) {
+		pr_debug("H_HTM hcall failed for op: H_HTM_OP_DUMP_DATA, returning %ld\n", ret);
+		return ret;
+	}
+
 	available = PAGE_SIZE;
 	read_size = min(count, available);
 	*ppos += read_size;
@@ -77,6 +121,292 @@ static const struct file_operations htmdump_fops = {
 	.open	= simple_open,
 };
 
+static int  htmconfigure_set(void *data, u64 val)
+{
+	long rc, ret;
+	unsigned long param1 = -1, param2 = -1;
+
+	/*
+	 * val == 1: configure HTM; val == 0: deconfigure HTM; any other
+	 * value returns -EINVAL. htmconfigure is updated only when
+	 * htm_return_check() reports success (> 0).
+	 */
+	if (val == HTM_ENABLE) {
+		/*
+		 * Invoke H_HTM call with:
+		 * - operation as htm configure (H_HTM_OP_CONFIGURE)
+		 * - If htmflags is set, param1 and param2 will be -1
+		 *   which is an indicator to use default htm mode reg mask
+		 *   and htm mode reg value.
+		 * - last three values are unused, hence set to zero
+		 */
+		if (!htmflags) {
+			param1 = 0;
+			param2 = 0;
+		}
+
+		rc = htm_hcall_wrapper(htmflags, nodeindex, nodalchipindex, coreindexonchip,
+			   htmtype, H_HTM_OP_CONFIGURE, param1, param2, 0);
+	} else if (val == HTM_DISABLE) {
+		/*
+		 * Invoke H_HTM call with:
+		 * - operation as htm deconfigure (H_HTM_OP_DECONFIGURE)
+		 * - last three values are unused, hence set to zero
+		 */
+		rc = htm_hcall_wrapper(htmflags, nodeindex, nodalchipindex, coreindexonchip,
+				htmtype, H_HTM_OP_DECONFIGURE, 0, 0, 0);
+	} else
+		return -EINVAL;
+
+	ret = htm_return_check(rc);
+	if (ret <= 0) {
+		pr_debug("H_HTM hcall failed, returning %ld\n", ret);
+		return ret;
+	}
+
+	/* Set htmconfigure if operation succeeds */
+	htmconfigure = val;
+
+	return 0;
+}
+
+static int htmconfigure_get(void *data, u64 *val)
+{
+	*val = htmconfigure;
+	return 0;
+}
+
+static int  htmstart_set(void *data, u64 val)
+{
+	long rc, ret;
+
+	/*
+	 * val == 1: start HTM; val == 0: stop HTM; any other value
+	 * returns -EINVAL. htmstart is updated only when
+	 * htm_return_check() reports success (> 0).
+	 */
+	if (val == HTM_ENABLE) {
+		/*
+		 * Invoke H_HTM call with:
+		 * - operation as htm start (H_HTM_OP_START)
+		 * - last three values are unused, hence set to zero
+		 */
+		rc = htm_hcall_wrapper(htmflags, nodeindex, nodalchipindex, coreindexonchip,
+			   htmtype, H_HTM_OP_START, 0, 0, 0);
+
+	} else if (val == HTM_DISABLE) {
+		/*
+		 * Invoke H_HTM call with:
+		 * - operation as htm stop (H_HTM_OP_STOP)
+		 * - last three values are unused, hence set to zero
+		 */
+		rc = htm_hcall_wrapper(htmflags, nodeindex, nodalchipindex, coreindexonchip,
+				htmtype, H_HTM_OP_STOP, 0, 0, 0);
+	} else
+		return -EINVAL;
+
+	ret = htm_return_check(rc);
+	if (ret <= 0) {
+		pr_debug("H_HTM hcall failed, returning %ld\n", ret);
+		return ret;
+	}
+
+	/* Set htmstart if H_HTM_OP_START/H_HTM_OP_STOP operation succeeds */
+	htmstart = val;
+
+	return 0;
+}
+
+static int htmstart_get(void *data, u64 *val)
+{
+	*val = htmstart;
+	return 0;
+}
+
+/* debugfs read: fetch HTM status via H_HTM and copy it to user space */
+static ssize_t htmstatus_read(struct file *filp, char __user *ubuf,
+			     size_t count, loff_t *ppos)
+{
+	void *htm_status_buf = filp->private_data;
+	long rc, ret;
+	u64 *num_entries;
+	u64 to_copy;
+	int htmstatus_flag;
+
+	/*
+	 * Invoke H_HTM call with:
+	 * - operation as htm status (H_HTM_OP_STATUS)
+	 * - last three values as addr, size and offset
+	 */
+	rc = htm_hcall_wrapper(htmflags, nodeindex, nodalchipindex, coreindexonchip,
+				   htmtype, H_HTM_OP_STATUS, virt_to_phys(htm_status_buf),
+				   PAGE_SIZE, 0);
+
+	ret = htm_return_check(rc);
+	if (ret <= 0) {
+		pr_debug("H_HTM hcall failed for op: H_HTM_OP_STATUS, returning %ld\n", ret);
+		return ret;
+	}
+
+	/*
+	 * HTM status buffer, start of buffer + 0x10 gives the
+	 * number of HTM entries in the buffer. Each nest htm status
+	 * entry is 0x6 bytes where each core htm status entry is
+	 * 0x8 bytes.
+	 * So total count to copy is:
+	 * 32 bytes (for first 7 fields) + (number of HTM entries * entry size)
+	 */
+	num_entries = htm_status_buf + 0x10;
+	htmstatus_flag = (htmtype == 0x2) ? 0x8 : 0x6;
+	to_copy = 32 + (be64_to_cpu(*num_entries) * htmstatus_flag);
+	/* The entry count is hcall output: never copy past the PAGE_SIZE buffer */
+	to_copy = min_t(u64, to_copy, PAGE_SIZE);
+	return simple_read_from_buffer(ubuf, count, ppos, htm_status_buf, to_copy);
+}
+
+static const struct file_operations htmstatus_fops = {
+	.llseek = NULL,
+	.read	= htmstatus_read,
+	.open	= simple_open,
+};
+
+/* debugfs read: fetch the system processor configuration via H_HTM */
+static ssize_t htminfo_read(struct file *filp, char __user *ubuf,
+			     size_t count, loff_t *ppos)
+{
+	void *htm_info_buf = filp->private_data;
+	long rc, ret;
+	u64 *num_entries;
+	u64 to_copy;
+
+	/*
+	 * Invoke H_HTM call with:
+	 * - operation as dump sysproc conf (H_HTM_OP_DUMP_SYSPROC_CONF)
+	 * - last three values as addr, size and offset
+	 */
+	rc = htm_hcall_wrapper(htmflags, nodeindex, nodalchipindex, coreindexonchip,
+				   htmtype, H_HTM_OP_DUMP_SYSPROC_CONF, virt_to_phys(htm_info_buf),
+				   PAGE_SIZE, 0);
+
+	ret = htm_return_check(rc);
+	if (ret <= 0) {
+		pr_debug("H_HTM hcall failed for op: H_HTM_OP_DUMP_SYSPROC_CONF, returning %ld\n", ret);
+		return ret;
+	}
+
+	/*
+	 * htm_info_buf, start of buffer + 0x10 gives the number of
+	 * entries in the buffer. Each entry of processor is 16 bytes, so
+	 * total to copy is 32 bytes (for first 5 fields) + entries * 16,
+	 * clamped to the PAGE_SIZE buffer as the count is hcall output.
+	 */
+	num_entries = htm_info_buf + 0x10;
+	to_copy = 32 + (be64_to_cpu(*num_entries) * 16);
+	to_copy = min_t(u64, to_copy, PAGE_SIZE);
+	return simple_read_from_buffer(ubuf, count, ppos, htm_info_buf, to_copy);
+}
+
+static ssize_t htmcaps_read(struct file *filp, char __user *ubuf,
+			     size_t count, loff_t *ppos)
+{
+	void *htm_caps_buf = filp->private_data;
+	long rc, ret;
+
+	/*
+	 * Invoke H_HTM call with:
+	 * - operation as htm capabilities (H_HTM_OP_CAPABILITIES)
+	 * - last three values as addr, size (0x80 for the Capabilities
+	 *   Output Buffer) and zero
+	 */
+	rc = htm_hcall_wrapper(htmflags, nodeindex, nodalchipindex, coreindexonchip,
+				   htmtype, H_HTM_OP_CAPABILITIES, virt_to_phys(htm_caps_buf),
+				   0x80, 0);
+
+	ret = htm_return_check(rc);
+	if (ret <= 0) {
+		pr_debug("H_HTM hcall failed for op: H_HTM_OP_CAPABILITIES, returning %ld\n", ret);
+		return ret;
+	}
+
+	return simple_read_from_buffer(ubuf, count, ppos, htm_caps_buf, 0x80);
+}
+
+static const struct file_operations htminfo_fops = {
+	.llseek = NULL,
+	.read   = htminfo_read,
+	.open   = simple_open,
+};
+
+static const struct file_operations htmcaps_fops = {
+	.llseek = NULL,
+	.read   = htmcaps_read,
+	.open   = simple_open,
+};
+
+static int  htmsetup_set(void *data, u64 val)
+{
+	long rc, ret;
+
+	/*
+	 * Input value: HTM buffer size as a power of 2
+	 * example: hex value 0x21 ( decimal: 33 ) is for
+	 * 8GB
+	 * Invoke H_HTM call with:
+	 * - operation as htm setup (H_HTM_OP_SETUP)
+	 * - parameter 1 set to input value.
+	 * - last two values are unused, hence set to zero
+	 */
+	rc = htm_hcall_wrapper(htmflags, nodeindex, nodalchipindex, coreindexonchip,
+			htmtype, H_HTM_OP_SETUP, val, 0, 0);
+
+	ret = htm_return_check(rc);
+	if (ret <= 0) {
+		pr_debug("H_HTM hcall failed for op: H_HTM_OP_SETUP, returning %ld\n", ret);
+		return ret;
+	}
+
+	/* Set htmsetup if H_HTM_OP_SETUP operation succeeds */
+	htmsetup = val;
+
+	return 0;
+}
+
+static int htmsetup_get(void *data, u64 *val)
+{
+	*val = htmsetup;
+	return 0;
+}
+
+static int  htmflags_set(void *data, u64 val)
+{
+	/*
+	 * Input value:
+	 * The only supported flag controls HTM buffer wrap. It is
+	 * used along with "configure" to prevent the HTM buffer from
+	 * wrapping. Writing 1 sets noWrap for later hcalls; writing 0
+	 * clears it. Any other value returns -EINVAL.
+	 */
+	if (val == HTM_NOWRAP)
+		htmflags = H_HTM_FLAGS_NOWRAP;
+	else if (val == HTM_WRAP)
+		htmflags = 0;
+	else
+		return -EINVAL;
+
+	return 0;
+}
+
+static int htmflags_get(void *data, u64 *val)
+{
+	*val = htmflags;
+	return 0;
+}
+
+DEFINE_SIMPLE_ATTRIBUTE(htmconfigure_fops, htmconfigure_get, htmconfigure_set, "%llu\n");
+DEFINE_SIMPLE_ATTRIBUTE(htmstart_fops, htmstart_get, htmstart_set, "%llu\n");
+DEFINE_SIMPLE_ATTRIBUTE(htmsetup_fops, htmsetup_get, htmsetup_set, "%llu\n");
+DEFINE_SIMPLE_ATTRIBUTE(htmflags_fops, htmflags_get, htmflags_set, "%llu\n");
+
 static int htmdump_init_debugfs(void)
 {
 	htm_buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
@@ -98,11 +428,50 @@ static int htmdump_init_debugfs(void)
 			htmdump_debugfs_dir, &htmtype);
 	debugfs_create_file("trace", 0400, htmdump_debugfs_dir, htm_buf, &htmdump_fops);
 
+	/*
+	 * Debugfs interface files to control HTM operations:
+	 */
+	debugfs_create_file("htmconfigure", 0600, htmdump_debugfs_dir, NULL, &htmconfigure_fops);
+	debugfs_create_file("htmstart", 0600, htmdump_debugfs_dir, NULL, &htmstart_fops);
+	debugfs_create_file("htmsetup", 0600, htmdump_debugfs_dir, NULL, &htmsetup_fops);
+	debugfs_create_file("htmflags", 0600, htmdump_debugfs_dir, NULL, &htmflags_fops);
+
+	/* Debugfs interface file to present status of HTM */
+	htm_status_buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
+	if (!htm_status_buf) {
+		pr_err("Failed to allocate htmstatus buf\n");
+		return -ENOMEM;
+	}
+
+	/* Debugfs interface file to present System Processor Configuration */
+	htm_info_buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
+	if (!htm_info_buf) {
+		pr_err("Failed to allocate htm info buf\n");
+		return -ENOMEM;
+	}
+
+	/* Debugfs interface file to present HTM capabilities */
+	htm_caps_buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
+	if (!htm_caps_buf) {
+		pr_err("Failed to allocate htm caps buf\n");
+		return -ENOMEM;
+	}
+
+	debugfs_create_file("htmstatus", 0400, htmdump_debugfs_dir, htm_status_buf, &htmstatus_fops);
+	debugfs_create_file("htminfo", 0400, htmdump_debugfs_dir, htm_info_buf, &htminfo_fops);
+	debugfs_create_file("htmcaps", 0400, htmdump_debugfs_dir, htm_caps_buf, &htmcaps_fops);
+
 	return 0;
 }
 
 static int __init htmdump_init(void)
 {
+	/* Disable on kvm guest */
+	if (is_kvm_guest()) {
+		pr_info("htmdump not supported inside KVM guest\n");
+		return -EOPNOTSUPP;
+	}
+
 	if (htmdump_init_debugfs())
 		return -ENOMEM;
 
diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c
index d6ebc19fb99c..eec333dd2e59 100644
--- a/arch/powerpc/platforms/pseries/iommu.c
+++ b/arch/powerpc/platforms/pseries/iommu.c
@@ -197,7 +197,7 @@ static void tce_iommu_userspace_view_free(struct iommu_table *tbl)
 
 static void tce_free_pSeries(struct iommu_table *tbl)
 {
-	if (!tbl->it_userspace)
+	if (tbl->it_userspace)
 		tce_iommu_userspace_view_free(tbl);
 }
 
diff --git a/arch/powerpc/platforms/pseries/msi.c b/arch/powerpc/platforms/pseries/msi.c
index f9d80111c322..957c0c03d259 100644
--- a/arch/powerpc/platforms/pseries/msi.c
+++ b/arch/powerpc/platforms/pseries/msi.c
@@ -525,7 +525,12 @@ static struct msi_domain_info pseries_msi_domain_info = {
 
 static void pseries_msi_compose_msg(struct irq_data *data, struct msi_msg *msg)
 {
-	__pci_read_msi_msg(irq_data_get_msi_desc(data), msg);
+	struct pci_dev *dev = msi_desc_to_pci_dev(irq_data_get_msi_desc(data));
+
+	if (dev->current_state == PCI_D0)
+		__pci_read_msi_msg(irq_data_get_msi_desc(data), msg);
+	else
+		get_cached_msi_msg(data->irq, msg);
 }
 
 static struct irq_chip pseries_msi_irq_chip = {
diff --git a/arch/powerpc/platforms/pseries/papr-indices.c b/arch/powerpc/platforms/pseries/papr-indices.c
new file mode 100644
index 000000000000..3c7545591c45
--- /dev/null
+++ b/arch/powerpc/platforms/pseries/papr-indices.c
@@ -0,0 +1,488 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#define pr_fmt(fmt) "papr-indices: " fmt
+
+#include <linux/build_bug.h>
+#include <linux/file.h>
+#include <linux/fs.h>
+#include <linux/init.h>
+#include <linux/lockdep.h>
+#include <linux/kernel.h>
+#include <linux/miscdevice.h>
+#include <linux/signal.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+#include <linux/string_helpers.h>
+#include <linux/uaccess.h>
+#include <asm/machdep.h>
+#include <asm/rtas-work-area.h>
+#include <asm/rtas.h>
+#include <uapi/asm/papr-indices.h>
+#include "papr-rtas-common.h"
+
+/*
+ * Function-specific return values for ibm,set-dynamic-indicator and
+ * ibm,get-dynamic-sensor-state RTAS calls.
+ * PAPR+ v2.13 7.3.18 and 7.3.19.
+ */
+#define RTAS_IBM_DYNAMIC_INDICE_NO_INDICATOR	-3
+
+/**
+ * struct rtas_get_indices_params - Parameters (in and out) for
+ *                                      ibm,get-indices.
+ * @is_sensor:	In: Caller-provided whether sensor or indicator.
+ * @indice_type:In: Caller-provided indice (sensor or indicator) token
+ * @work_area:	In: Caller-provided work area buffer for results.
+ * @next:	In: Sequence number. Out: Next sequence number.
+ * @status:	Out: RTAS call status.
+ */
+struct rtas_get_indices_params {
+	u8 is_sensor;
+	u32 indice_type;
+	struct rtas_work_area *work_area;
+	u32 next;
+	s32 status;
+};
+
+/*
+ * rtas_ibm_get_indices() - Call ibm,get-indices to fill a work area buffer.
+ * @params: See &struct rtas_get_indices_params.
+ *
+ * Calls ibm,get-indices until it errors or successfully deposits data
+ * into the supplied work area. Handles RTAS retry statuses. Maps RTAS
+ * error statuses to reasonable errno values.
+ *
+ * The caller is expected to invoke rtas_ibm_get_indices() multiple times
+ * to retrieve all indices data for the provided indice type. Only one
+ * sequence should be in progress at any time; starting a new sequence
+ * will disrupt any sequence already in progress. Serialization of
+ * indices retrieval sequences is the responsibility of the caller.
+ *
+ * The caller should inspect @params.status to determine whether more
+ * calls are needed to complete the sequence.
+ *
+ * Context: May sleep.
+ * Return: -ve on error, 0 otherwise.
+ */
+static int rtas_ibm_get_indices(struct rtas_get_indices_params *params)
+{
+	struct rtas_work_area *work_area = params->work_area;
+	const s32 token = rtas_function_token(RTAS_FN_IBM_GET_INDICES);
+	u32 rets;
+	s32 fwrc;
+	int ret;
+
+	if (token == RTAS_UNKNOWN_SERVICE)
+		return -ENOENT;
+
+	lockdep_assert_held(&rtas_ibm_get_indices_lock);
+
+	do {
+		fwrc = rtas_call(token, 5, 2, &rets, params->is_sensor,
+				params->indice_type,
+				rtas_work_area_phys(work_area),
+				rtas_work_area_size(work_area),
+				params->next);
+	} while (rtas_busy_delay(fwrc));
+
+	switch (fwrc) {
+	case RTAS_HARDWARE_ERROR:
+		ret = -EIO;
+		break;
+	case RTAS_INVALID_PARAMETER: /* Indicator type is not supported */
+		ret = -EINVAL;
+		break;
+	case RTAS_SEQ_START_OVER:
+		ret = -EAGAIN;
+		pr_info_ratelimited("Indices changed during retrieval, retrying\n");
+		params->next = 1;
+		break;
+	case RTAS_SEQ_MORE_DATA:
+		params->next = rets;
+		ret = 0;
+		break;
+	case RTAS_SEQ_COMPLETE:
+		params->next = 0;
+		ret = 0;
+		break;
+	default:
+		ret = -EIO;
+		pr_err_ratelimited("unexpected ibm,get-indices status %d\n", fwrc);
+		break;
+	}
+
+	params->status = fwrc;
+	return ret;
+}
+
+/*
+ * Internal indices sequence APIs. A sequence is a series of calls to
+ * ibm,get-indices for a given indice type. The sequence ends when
+ * an error is encountered or all indices for the input have been
+ * returned.
+ */
+
+/*
+ * indices_sequence_begin() - Begin an indices retrieval sequence.
+ *
+ * Context: May sleep.
+ */
+static void indices_sequence_begin(struct papr_rtas_sequence *seq)
+{
+	struct rtas_get_indices_params  *param;
+
+	param = (struct rtas_get_indices_params *)seq->params;
+	/*
+	 * We could allocate the work area before acquiring the
+	 * function lock, but that would allow concurrent requests to
+	 * exhaust the limited work area pool for no benefit. So
+	 * allocate the work area under the lock.
+	 */
+	mutex_lock(&rtas_ibm_get_indices_lock);
+	param->work_area = rtas_work_area_alloc(RTAS_GET_INDICES_BUF_SIZE);
+	param->next = 1;
+	param->status = 0;
+}
+
+/*
+ * indices_sequence_end() - Finalize an indices retrieval sequence.
+ *
+ * Releases resources obtained by indices_sequence_begin().
+ */
+static void indices_sequence_end(struct papr_rtas_sequence *seq)
+{
+	struct rtas_get_indices_params *param;
+
+	param =  (struct rtas_get_indices_params *)seq->params;
+	rtas_work_area_free(param->work_area);
+	mutex_unlock(&rtas_ibm_get_indices_lock);
+}
+
+/*
+ * Work function to be passed to papr_rtas_blob_generate().
+ *
+ * ibm,get-indices RTAS call fills the work area with the certain
+ * format but does not return the bytes written in the buffer. So
+ * instead of kernel parsing this work area to determine the buffer
+ * length, copy the complete work area (RTAS_GET_INDICES_BUF_SIZE)
+ * to the blob and let the user space to obtain the data.
+ * Means RTAS_GET_INDICES_BUF_SIZE data will be returned for each
+ * read().
+ */
+
+static const char *indices_sequence_fill_work_area(struct papr_rtas_sequence *seq,
+						size_t *len)
+{
+	struct rtas_get_indices_params *params = seq->params;
+	const bool first_call = params->next == 1;
+	int err;
+
+	if (papr_rtas_sequence_should_stop(seq, params->status, first_call))
+		return NULL;
+	/* Make the next ibm,get-indices call; give up on any error. */
+	err = rtas_ibm_get_indices(params);
+	if (papr_rtas_sequence_set_err(seq, err))
+		return NULL;
+
+	*len = RTAS_GET_INDICES_BUF_SIZE;
+	return rtas_work_area_raw_buf(params->work_area);
+}
+
+/*
+ * papr_indices_handle_read - returns indices blob data to the user space
+ *
+ * ibm,get-indices fills the work area in a fixed format but does not
+ * report how many bytes it wrote, so the blob holds one full
+ * RTAS_GET_INDICES_BUF_SIZE chunk per RTAS call. Each read() must
+ * therefore supply at least RTAS_GET_INDICES_BUF_SIZE bytes (-EINVAL
+ * otherwise) and is handed at most one such chunk.
+ */
+static ssize_t papr_indices_handle_read(struct file *file,
+		char __user *buf, size_t size, loff_t *off)
+{
+	const struct papr_rtas_blob *blob = file->private_data;
+
+	/* we should not instantiate a handle without any data attached. */
+	if (!papr_rtas_blob_has_data(blob)) {
+		pr_err_once("handle without data\n");
+		return -EIO;
+	}
+
+	if (size < RTAS_GET_INDICES_BUF_SIZE) {
+		pr_err_once("Invalid buffer length %zu, expect %d\n",
+				size, RTAS_GET_INDICES_BUF_SIZE);
+		return -EINVAL;
+	}
+	size = RTAS_GET_INDICES_BUF_SIZE;
+
+	return simple_read_from_buffer(buf, size, off, blob->data, blob->len);
+}
+
+static const struct file_operations papr_indices_handle_ops = {
+	.read = papr_indices_handle_read,
+	.llseek = papr_rtas_common_handle_seek,
+	.release = papr_rtas_common_handle_release,
+};
+
+/*
+ * papr_indices_create_handle() - Create a fd-based handle for reading
+ *                                indices data
+ * @ubuf: Input parameters to RTAS call such as whether sensor or indicator
+ *        and indice type in user memory
+ *
+ * Handler for PAPR_INDICES_IOC_GET ioctl command. Validates @ubuf
+ * and instantiates an immutable indices "blob" for it. The blob is
+ * attached to a file descriptor for reading by user space. The memory
+ * backing the blob is freed when the file is released.
+ *
+ * The entire requested indices is retrieved by this call and all
+ * necessary RTAS interactions are performed before returning the fd
+ * to user space. This keeps the read handler simple and ensures that
+ * the kernel can prevent interleaving of ibm,get-indices call sequences.
+ *
+ * Return: The installed fd number if successful, -ve errno otherwise.
+ */
+static long papr_indices_create_handle(struct papr_indices_io_block __user *ubuf)
+{
+	struct papr_rtas_sequence seq = {};
+	struct rtas_get_indices_params params = {};
+	int fd;
+
+	if (get_user(params.is_sensor, &ubuf->indices.is_sensor))
+		return -EFAULT;
+
+	if (get_user(params.indice_type, &ubuf->indices.indice_type))
+		return -EFAULT;
+
+	seq = (struct papr_rtas_sequence) {
+		.begin = indices_sequence_begin,
+		.end = indices_sequence_end,
+		.work = indices_sequence_fill_work_area,
+	};
+
+	seq.params = &params;
+	fd = papr_rtas_setup_file_interface(&seq,
+			&papr_indices_handle_ops, "[papr-indices]");
+
+	return fd;
+}
+
+/*
+ * Create work area with the input parameters. This function is used
+ * for both ibm,set-dynamic-indicator and ibm,get-dynamic-sensor-state
+ * RTAS Calls.
+ */
+static struct rtas_work_area *
+papr_dynamic_indice_buf_from_user(struct papr_indices_io_block __user *ubuf,
+				struct papr_indices_io_block *kbuf)
+{
+	struct rtas_work_area *work_area;
+	u32 length;
+	__be32 len_be;
+
+	if (copy_from_user(kbuf, ubuf, sizeof(*kbuf)))
+		return ERR_PTR(-EFAULT);
+
+
+	if (!string_is_terminated(kbuf->dynamic_param.location_code_str,
+			ARRAY_SIZE(kbuf->dynamic_param.location_code_str)))
+		return ERR_PTR(-EINVAL);
+
+	/*
+	 * The input data in the work area should be as follows:
+	 * - 32-bit integer length of the location code string,
+	 *   including NULL.
+	 * - Location code string, NULL terminated, identifying the
+	 *   token (sensor or indicator).
+	 * PAPR 2.13 - R1–7.3.18–5 ibm,set-dynamic-indicator
+	 *           - R1–7.3.19–5 ibm,get-dynamic-sensor-state
+	 */
+	/*
+	 * Length that user space passed should also include NULL
+	 * terminator.
+	 */
+	length = strlen(kbuf->dynamic_param.location_code_str) + 1;
+	if (length > LOC_CODE_SIZE)
+		return ERR_PTR(-EINVAL);
+
+	len_be = cpu_to_be32(length);
+
+	work_area = rtas_work_area_alloc(LOC_CODE_SIZE + sizeof(u32));
+	memcpy(rtas_work_area_raw_buf(work_area), &len_be, sizeof(u32));
+	memcpy((rtas_work_area_raw_buf(work_area) + sizeof(u32)),
+			&kbuf->dynamic_param.location_code_str, length);
+
+	return work_area;
+}
+
+/**
+ * papr_dynamic_indicator_ioc_set - ibm,set-dynamic-indicator RTAS Call
+ * PAPR 2.13 7.3.18
+ *
+ * @ubuf: Input parameters to RTAS call such as indicator token and
+ *        new state.
+ *
+ * Returns success or -errno.
+ */
+static long papr_dynamic_indicator_ioc_set(struct papr_indices_io_block __user *ubuf)
+{
+	struct papr_indices_io_block kbuf;
+	struct rtas_work_area *work_area;
+	s32 fwrc, token, ret;
+
+	token = rtas_function_token(RTAS_FN_IBM_SET_DYNAMIC_INDICATOR);
+	if (token == RTAS_UNKNOWN_SERVICE)
+		return -ENOENT;
+
+	mutex_lock(&rtas_ibm_set_dynamic_indicator_lock);
+	work_area = papr_dynamic_indice_buf_from_user(ubuf, &kbuf);
+	if (IS_ERR(work_area)) {
+		ret = PTR_ERR(work_area);
+		goto out;
+	}
+
+	do {
+		fwrc = rtas_call(token, 3, 1, NULL,
+				kbuf.dynamic_param.token,
+				kbuf.dynamic_param.state,
+				rtas_work_area_phys(work_area));
+	} while (rtas_busy_delay(fwrc));
+
+	rtas_work_area_free(work_area);
+
+	switch (fwrc) {
+	case RTAS_SUCCESS:
+		ret = 0;
+		break;
+	case RTAS_IBM_DYNAMIC_INDICE_NO_INDICATOR:	/* No such indicator */
+		ret = -EOPNOTSUPP;
+		break;
+	default:
+		pr_err("unexpected ibm,set-dynamic-indicator result %d\n",
+			fwrc);
+		fallthrough;
+	case RTAS_HARDWARE_ERROR:	/* Hardware/platform error */
+		ret = -EIO;
+		break;
+	}
+
+out:
+	mutex_unlock(&rtas_ibm_set_dynamic_indicator_lock);
+	return ret;
+}
+
+/**
+ * papr_dynamic_sensor_ioc_get - ibm,get-dynamic-sensor-state RTAS Call
+ * PAPR 2.13 7.3.19
+ *
+ * @ubuf: Input parameters to RTAS call such as sensor token
+ *        Copies the state in user space buffer.
+ *
+ *
+ * Returns success or -errno.
+ */
+
+static long papr_dynamic_sensor_ioc_get(struct papr_indices_io_block __user *ubuf)
+{
+	struct papr_indices_io_block kbuf;
+	struct rtas_work_area *work_area;
+	s32 fwrc, token, ret;
+	u32 rets;
+
+	token = rtas_function_token(RTAS_FN_IBM_GET_DYNAMIC_SENSOR_STATE);
+	if (token == RTAS_UNKNOWN_SERVICE)
+		return -ENOENT;
+
+	mutex_lock(&rtas_ibm_get_dynamic_sensor_state_lock);
+	work_area = papr_dynamic_indice_buf_from_user(ubuf, &kbuf);
+	if (IS_ERR(work_area)) {
+		ret = PTR_ERR(work_area);
+		goto out;
+	}
+
+	do {
+		fwrc = rtas_call(token, 2, 2, &rets,
+				kbuf.dynamic_param.token,
+				rtas_work_area_phys(work_area));
+	} while (rtas_busy_delay(fwrc));
+
+	rtas_work_area_free(work_area);
+
+	switch (fwrc) {
+	case RTAS_SUCCESS:
+		if (put_user(rets, &ubuf->dynamic_param.state))
+			ret = -EFAULT;
+		else
+			ret = 0;
+		break;
+	case RTAS_IBM_DYNAMIC_INDICE_NO_INDICATOR:	/* No such indicator */
+		ret = -EOPNOTSUPP;
+		break;
+	default:
+		pr_err("unexpected ibm,get-dynamic-sensor result %d\n",
+				fwrc);
+		fallthrough;
+	case RTAS_HARDWARE_ERROR:	/* Hardware/platform error */
+		ret = -EIO;
+		break;
+	}
+
+out:
+	mutex_unlock(&rtas_ibm_get_dynamic_sensor_state_lock);
+	return ret;
+}
+
+/*
+ * Top-level ioctl handler for /dev/papr-indices.
+ */
+static long papr_indices_dev_ioctl(struct file *filp, unsigned int ioctl,
+				unsigned long arg)
+{
+	void __user *argp = (__force void __user *)arg;
+	long ret;
+
+	switch (ioctl) {
+	case PAPR_INDICES_IOC_GET:
+		ret = papr_indices_create_handle(argp);
+		break;
+	case PAPR_DYNAMIC_SENSOR_IOC_GET:
+		ret = papr_dynamic_sensor_ioc_get(argp);
+		break;
+	case PAPR_DYNAMIC_INDICATOR_IOC_SET:
+		if (filp->f_mode & FMODE_WRITE)
+			ret = papr_dynamic_indicator_ioc_set(argp);
+		else
+			ret = -EBADF;
+		break;
+	default:
+		ret = -ENOIOCTLCMD;
+		break;
+	}
+
+	return ret;
+}
+
+static const struct file_operations papr_indices_ops = {
+	.unlocked_ioctl = papr_indices_dev_ioctl,
+};
+
+static struct miscdevice papr_indices_dev = {
+	.minor = MISC_DYNAMIC_MINOR,
+	.name = "papr-indices",
+	.fops = &papr_indices_ops,
+};
+
+static __init int papr_indices_init(void)
+{
+	if (!rtas_function_implemented(RTAS_FN_IBM_GET_INDICES))
+		return -ENODEV;
+
+	if (!rtas_function_implemented(RTAS_FN_IBM_SET_DYNAMIC_INDICATOR))
+		return -ENODEV;
+
+	if (!rtas_function_implemented(RTAS_FN_IBM_GET_DYNAMIC_SENSOR_STATE))
+		return -ENODEV;
+
+	return misc_register(&papr_indices_dev);
+}
+machine_device_initcall(pseries, papr_indices_init);
diff --git a/arch/powerpc/platforms/pseries/papr-phy-attest.c b/arch/powerpc/platforms/pseries/papr-phy-attest.c
new file mode 100644
index 000000000000..1907f2411567
--- /dev/null
+++ b/arch/powerpc/platforms/pseries/papr-phy-attest.c
@@ -0,0 +1,288 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#define pr_fmt(fmt) "papr-phy-attest: " fmt
+
+#include <linux/build_bug.h>
+#include <linux/file.h>
+#include <linux/fs.h>
+#include <linux/init.h>
+#include <linux/lockdep.h>
+#include <linux/kernel.h>
+#include <linux/miscdevice.h>
+#include <linux/signal.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+#include <linux/string_helpers.h>
+#include <linux/uaccess.h>
+#include <asm/machdep.h>
+#include <asm/rtas-work-area.h>
+#include <asm/rtas.h>
+#include <uapi/asm/papr-physical-attestation.h>
+#include "papr-rtas-common.h"
+
+/**
+ * struct rtas_phy_attest_params - Parameters (in and out) for
+ * ibm,physical-attestation.
+ *
+ * @cmd:  In: Caller-provided attestation command buffer. Must be
+ *        RTAS-addressable.
+ * @work_area: In: Caller-provided work area buffer for attestation
+ *             command structure
+ *             Out: Caller-provided work area buffer for the response
+ * @cmd_len:   In: Caller-provided attestation command structure
+ *             length
+ * @sequence:  In: Sequence number. Out: Next sequence number.
+ * @written:   Out: Bytes written by ibm,physical-attestation to
+ *             @work_area.
+ * @status:    Out: RTAS call status.
+ */
+struct rtas_phy_attest_params {
+	struct papr_phy_attest_io_block cmd;
+	struct rtas_work_area *work_area;
+	u32 cmd_len;
+	u32 sequence;
+	u32 written;
+	s32 status;
+};
+
+/**
+ * rtas_physical_attestation() - Call ibm,physical-attestation to
+ * fill a work area buffer.
+ * @params: See &struct rtas_phy_attest_params.
+ *
+ * Calls ibm,physical-attestation until it errors or successfully
+ * deposits data into the supplied work area. Handles RTAS retry
+ * statuses. Maps RTAS error statuses to reasonable errno values.
+ *
+ * The caller is expected to invoke rtas_physical_attestation()
+ * multiple times to retrieve all the data for the provided
+ * attestation command. Only one sequence should be in progress at
+ * any time; starting a new sequence will disrupt any sequence
+ * already in progress. Serialization of attestation retrieval
+ * sequences is the responsibility of the caller.
+ *
+ * The caller should inspect @params.status to determine whether more
+ * calls are needed to complete the sequence.
+ *
+ * Context: May sleep.
+ * Return: -ve on error, 0 otherwise.
+ */
+static int rtas_physical_attestation(struct rtas_phy_attest_params *params)
+{
+	struct rtas_work_area *work_area;
+	s32 fwrc, token;
+	u32 rets[2];
+	int ret;
+
+	work_area = params->work_area;
+	token = rtas_function_token(RTAS_FN_IBM_PHYSICAL_ATTESTATION);
+	if (token == RTAS_UNKNOWN_SERVICE)
+		return -ENOENT;
+
+	lockdep_assert_held(&rtas_ibm_physical_attestation_lock);
+
+	do {
+		fwrc = rtas_call(token, 3, 3, rets,
+				 rtas_work_area_phys(work_area),
+				 params->cmd_len,
+				 params->sequence);
+	} while (rtas_busy_delay(fwrc));
+
+	switch (fwrc) {
+	case RTAS_HARDWARE_ERROR:
+		ret = -EIO;
+		break;
+	case RTAS_INVALID_PARAMETER:
+		ret = -EINVAL;
+		break;
+	case RTAS_SEQ_MORE_DATA:
+		params->sequence = rets[0];
+		fallthrough;
+	case RTAS_SEQ_COMPLETE:
+		params->written = rets[1];
+		/*
+		 * Kernel or firmware bug, do not continue.
+		 */
+		if (WARN(params->written > rtas_work_area_size(work_area),
+			 "possible write beyond end of work area"))
+			ret = -EFAULT;
+		else
+			ret = 0;
+		break;
+	default:
+		ret = -EIO;
+		pr_err_ratelimited("unexpected ibm,get-phy_attest status %d\n", fwrc);
+		break;
+	}
+
+	params->status = fwrc;
+	return ret;
+}
+
+/*
+ * Internal physical-attestation sequence APIs. A physical-attestation
+ * sequence is a series of calls to get ibm,physical-attestation
+ * for a given attestation command. The sequence ends when an error
+ * is encountered or all data for the attestation command has been
+ * returned.
+ */
+
+/**
+ * phy_attest_sequence_begin() - Begin a response data for attestation
+ * command retrieval sequence.
+ * @seq: user specified parameters for RTAS call from seq struct.
+ *
+ * Context: May sleep.
+ */
+static void phy_attest_sequence_begin(struct papr_rtas_sequence *seq)
+{
+	struct rtas_phy_attest_params *param;
+
+	/*
+	 * We could allocate the work area before acquiring the
+	 * function lock, but that would allow concurrent requests to
+	 * exhaust the limited work area pool for no benefit. So
+	 * allocate the work area under the lock.
+	 */
+	mutex_lock(&rtas_ibm_physical_attestation_lock);
+	param =  (struct rtas_phy_attest_params *)seq->params;
+	param->work_area = rtas_work_area_alloc(SZ_4K);
+	memcpy(rtas_work_area_raw_buf(param->work_area), &param->cmd,
+			param->cmd_len);
+	param->sequence = 1;
+	param->status = 0;
+}
+
+/**
+ * phy_attest_sequence_end() - Finalize a attestation command
+ * response retrieval sequence.
+ * @seq: Sequence state.
+ *
+ * Releases resources obtained by phy_attest_sequence_begin().
+ */
+static void phy_attest_sequence_end(struct papr_rtas_sequence *seq)
+{
+	struct rtas_phy_attest_params *param;
+
+	param =  (struct rtas_phy_attest_params *)seq->params;
+	rtas_work_area_free(param->work_area);
+	mutex_unlock(&rtas_ibm_physical_attestation_lock);
+	kfree(param);
+}
+
+/*
+ * Generator function to be passed to papr_rtas_blob_generate().
+ */
+static const char *phy_attest_sequence_fill_work_area(struct papr_rtas_sequence *seq,
+						size_t *len)
+{
+	struct rtas_phy_attest_params *params = seq->params;
+	const bool nothing_written = !params->written;
+	int err;
+
+	if (papr_rtas_sequence_should_stop(seq, params->status, nothing_written))
+		return NULL;
+	/* Make the next ibm,physical-attestation call; give up on any error. */
+	err = rtas_physical_attestation(params);
+	if (papr_rtas_sequence_set_err(seq, err))
+		return NULL;
+	*len = params->written;
+	return rtas_work_area_raw_buf(params->work_area);
+}
+
+static const struct file_operations papr_phy_attest_handle_ops = {
+	.read = papr_rtas_common_handle_read,
+	.llseek = papr_rtas_common_handle_seek,
+	.release = papr_rtas_common_handle_release,
+};
+
+/**
+ * papr_phy_attest_create_handle() - Create a fd-based handle for
+ * reading the response for the given attestation command.
+ * @ulc: Attestation command in user memory; defines the scope of
+ *       data for the attestation command to retrieve.
+ *
+ * Handler for PAPR_PHYSICAL_ATTESTATION_IOC_CREATE_HANDLE ioctl
+ * command. Validates @ulc and instantiates an immutable response
+ * "blob" for attestation command. The blob is attached to a file
+ * descriptor for reading by user space. The memory backing the blob
+ * is freed when the file is released.
+ *
+ * The entire requested response buffer for the attestation command
+ * retrieved by this call and all necessary RTAS interactions are
+ * performed before returning the fd to user space. This keeps the
+ * read handler simple and ensures that kernel can prevent
+ * interleaving ibm,physical-attestation call sequences.
+ *
+ * Return: The installed fd number if successful, -ve errno otherwise.
+ */
+static long papr_phy_attest_create_handle(struct papr_phy_attest_io_block __user *ulc)
+{
+	struct papr_rtas_sequence seq = {
+		.begin = phy_attest_sequence_begin,
+		.end = phy_attest_sequence_end,
+		.work = phy_attest_sequence_fill_work_area,
+	};
+	struct rtas_phy_attest_params *params;
+	long ret = -EFAULT;
+
+	/* Released by phy_attest_sequence_end(); freed below if we bail out early. */
+	params =  kzalloc(sizeof(*params), GFP_KERNEL_ACCOUNT);
+	if (!params)
+		return -ENOMEM;
+
+	if (copy_from_user(&params->cmd, ulc, sizeof(params->cmd)))
+		goto free_params;
+
+	/* cmd_len bounds the memcpy() out of @cmd; reject oversized values. */
+	params->cmd_len = be32_to_cpu(params->cmd.length);
+	ret = -EINVAL;
+	if (params->cmd_len > sizeof(params->cmd))
+		goto free_params;
+
+	seq.params = params;
+	return papr_rtas_setup_file_interface(&seq, &papr_phy_attest_handle_ops,
+					      "[papr-physical-attestation]");
+
+free_params:
+	kfree(params);
+	return ret;
+}
+
+/*
+ * Top-level ioctl handler for /dev/papr-physical-attestation.
+ */
+static long papr_phy_attest_dev_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg)
+{
+	void __user *argp = (__force void __user *)arg;
+	long ret;
+
+	switch (ioctl) {
+	case PAPR_PHY_ATTEST_IOC_HANDLE:
+		ret = papr_phy_attest_create_handle(argp);
+		break;
+	default:
+		ret = -ENOIOCTLCMD;
+		break;
+	}
+	return ret;
+}
+
+static const struct file_operations papr_phy_attest_ops = {
+	.unlocked_ioctl = papr_phy_attest_dev_ioctl,
+};
+
+static struct miscdevice papr_phy_attest_dev = {
+	.minor = MISC_DYNAMIC_MINOR,
+	.name = "papr-physical-attestation",
+	.fops = &papr_phy_attest_ops,
+};
+
+static __init int papr_phy_attest_init(void)
+{
+	if (!rtas_function_implemented(RTAS_FN_IBM_PHYSICAL_ATTESTATION))
+		return -ENODEV;
+
+	return misc_register(&papr_phy_attest_dev);
+}
+machine_device_initcall(pseries, papr_phy_attest_init);
diff --git a/arch/powerpc/platforms/pseries/papr-platform-dump.c b/arch/powerpc/platforms/pseries/papr-platform-dump.c
new file mode 100644
index 000000000000..f8d55eccdb6b
--- /dev/null
+++ b/arch/powerpc/platforms/pseries/papr-platform-dump.c
@@ -0,0 +1,411 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#define pr_fmt(fmt) "papr-platform-dump: " fmt
+
+#include <linux/anon_inodes.h>
+#include <linux/file.h>
+#include <linux/fs.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/miscdevice.h>
+#include <asm/machdep.h>
+#include <asm/rtas-work-area.h>
+#include <asm/rtas.h>
+#include <uapi/asm/papr-platform-dump.h>
+
+/*
+ * Function-specific return values for ibm,platform-dump, derived from
+ * PAPR+ v2.13 7.3.3.4.1 "ibm,platform-dump RTAS Call".
+ */
+#define	RTAS_IBM_PLATFORM_DUMP_COMPLETE	0	/* Complete dump retrieved. */
+#define	RTAS_IBM_PLATFORM_DUMP_CONTINUE	1	/* Continue dump */
+#define	RTAS_NOT_AUTHORIZED		-9002	/* Not Authorized */
+
+#define	RTAS_IBM_PLATFORM_DUMP_START	2 /* Linux status to start dump */
+
+/**
+ * struct ibm_platform_dump_params - Parameters (in and out) for
+ *                                              ibm,platform-dump
+ * @work_area:		In: work area buffer for results.
+ * @buf_length:		In: work area buffer length in bytes
+ * @dump_tag_hi:	In: Most-significant 32 bits of a Dump_Tag representing
+ *                      an id of the dump being processed.
+ * @dump_tag_lo:	In: Least-significant 32 bits of a Dump_Tag representing
+ *                      an id of the dump being processed.
+ * @sequence_hi:	In: Sequence number in most-significant 32 bits.
+ *                      Out: Next sequence number in most-significant 32 bits.
+ * @sequence_lo:	In: Sequence number in Least-significant 32 bits
+ *                      Out: Next sequence number in Least-significant 32 bits.
+ * @bytes_ret_hi:	Out: Bytes written in most-significant 32 bits.
+ * @bytes_ret_lo:	Out: Bytes written in Least-significant 32 bits.
+ * @status:		Out: RTAS call status.
+ * @list:		Entry in the list of dumps that are in progress. Multiple
+ *                      dumps with different dump IDs can be retrieved at the
+ *                      same time, but not with the same dump ID. This list
+ *                      is used to determine whether the dump for the same ID
+ *                      is in progress.
+ */
+struct ibm_platform_dump_params {
+	struct rtas_work_area	*work_area;
+	u32			buf_length;
+	u32			dump_tag_hi;
+	u32			dump_tag_lo;
+	u32			sequence_hi;
+	u32			sequence_lo;
+	u32			bytes_ret_hi;
+	u32			bytes_ret_lo;
+	s32			status;
+	struct list_head	list;
+};
+
+/*
+ * Multiple dumps with different dump IDs can be retrieved at the same
+ * time, but not with the same dump ID. platform_dump_list_mutex and
+ * platform_dump_list are used to prevent this behavior.
+ */
+static DEFINE_MUTEX(platform_dump_list_mutex);
+static LIST_HEAD(platform_dump_list);
+
+/**
+ * rtas_ibm_platform_dump() - Call ibm,platform-dump to fill a work area
+ * buffer.
+ * @params: See &struct ibm_platform_dump_params.
+ * @buf_addr: Address of dump buffer (work_area)
+ * @buf_length: Length of the buffer in bytes (min. 1024)
+ *
+ * Calls ibm,platform-dump until it errors or successfully deposits data
+ * into the supplied work area. Handles RTAS retry statuses. Maps RTAS
+ * error statuses to reasonable errno values.
+ *
+ * Can request multiple dumps with different dump IDs at the same time,
+ * but not with the same dump ID which is prevented with the check in
+ * the ioctl code (papr_platform_dump_create_handle()).
+ *
+ * The caller should inspect @params.status to determine whether more
+ * calls are needed to complete the sequence.
+ *
+ * Context: May sleep.
+ * Return: -ve on error, 0 otherwise (dump complete or continue; see @params.status)
+ */
+static int rtas_ibm_platform_dump(struct ibm_platform_dump_params *params,
+				phys_addr_t buf_addr, u32 buf_length)
+{
+	u32 rets[4];
+	s32 fwrc;
+	int ret = 0;
+
+	do {
+		fwrc = rtas_call(rtas_function_token(RTAS_FN_IBM_PLATFORM_DUMP),
+				6, 5,
+				rets,
+				params->dump_tag_hi,
+				params->dump_tag_lo,
+				params->sequence_hi,
+				params->sequence_lo,
+				buf_addr,
+				buf_length);
+	} while (rtas_busy_delay(fwrc));
+
+	switch (fwrc) {
+	case RTAS_HARDWARE_ERROR:
+		ret = -EIO;
+		break;
+	case RTAS_NOT_AUTHORIZED:
+		ret = -EPERM;
+		break;
+	case RTAS_IBM_PLATFORM_DUMP_CONTINUE:
+	case RTAS_IBM_PLATFORM_DUMP_COMPLETE:
+		params->sequence_hi = rets[0];
+		params->sequence_lo = rets[1];
+		params->bytes_ret_hi = rets[2];
+		params->bytes_ret_lo = rets[3];
+		break;
+	default:
+		ret = -EIO;
+		pr_err_ratelimited("unexpected ibm,platform-dump status %d\n",
+				fwrc);
+		break;
+	}
+
+	params->status = fwrc;
+	return ret;
+}
+
+/*
+ * Platform dump is used with multiple RTAS calls to retrieve the
+ * complete dump for the provided dump ID. Once the complete dump is
+ * retrieved, the hypervisor returns dump complete status (0) for the
+ * last RTAS call and expects the caller issues one more call with
+ * NULL buffer to invalidate the dump so that the hypervisor can remove
+ * the dump.
+ *
+ * After the specific dump is invalidated in the hypervisor, expect the
+ * dump complete status for the new sequence - the user space initiates
+ * new request for the same dump ID.
+ */
+static ssize_t papr_platform_dump_handle_read(struct file *file,
+		char __user *buf, size_t size, loff_t *off)
+{
+	struct ibm_platform_dump_params *params = file->private_data;
+	u64 total_bytes;
+	s32 fwrc;
+
+	/*
+	 * Dump already completed with the previous read calls.
+	 * In case if the user space issues further reads, returns
+	 * -EINVAL.
+	 */
+	if (!params->buf_length) {
+		pr_warn_once("Platform dump completed for dump ID %llu\n",
+			(u64) (((u64)params->dump_tag_hi << 32) |
+				params->dump_tag_lo));
+		return -EINVAL;
+	}
+
+	/*
+	 * The hypervisor returns status 0 if no more data available to
+	 * download. The dump will be invalidated with ioctl (see below).
+	 */
+	if (params->status == RTAS_IBM_PLATFORM_DUMP_COMPLETE) {
+		params->buf_length = 0;
+		/*
+		 * Returns 0 to the user space so that user
+		 * space read stops.
+		 */
+		return 0;
+	}
+
+	if (size < SZ_1K) {
+		pr_err_once("Buffer length should be minimum 1024 bytes\n");
+		return -EINVAL;
+	} else if (size > params->buf_length) {
+		/*
+		 * Allocate 4K work area. So if the user requests > 4K,
+		 * resize the buffer length.
+		 */
+		size = params->buf_length;
+	}
+
+	fwrc = rtas_ibm_platform_dump(params,
+			rtas_work_area_phys(params->work_area),
+			size);
+	if (fwrc < 0)
+		return fwrc;
+
+	total_bytes = (u64) (((u64)params->bytes_ret_hi << 32) |
+			params->bytes_ret_lo);
+
+	/*
+	 * Kernel or firmware bug, do not continue.
+	 */
+	if (WARN(total_bytes > size, "possible write beyond end of work area"))
+		return -EFAULT;
+
+	if (copy_to_user(buf, rtas_work_area_raw_buf(params->work_area),
+			total_bytes))
+		return -EFAULT;
+
+	return total_bytes;
+}
+
+static int papr_platform_dump_handle_release(struct inode *inode,
+					struct file *file)
+{
+	struct ibm_platform_dump_params *params = file->private_data;
+
+	if (params->work_area)
+		rtas_work_area_free(params->work_area);
+
+	mutex_lock(&platform_dump_list_mutex);
+	list_del(&params->list);
+	mutex_unlock(&platform_dump_list_mutex);
+
+	kfree(params);
+	file->private_data = NULL;
+	return 0;
+}
+
+/*
+ * This ioctl is used to invalidate the dump assuming the user space
+ * issue this ioctl after obtain the complete dump.
+ * Issue the last RTAS call with NULL buffer to invalidate the dump
+ * which means dump will be freed in the hypervisor.
+ */
+static long papr_platform_dump_invalidate_ioctl(struct file *file,
+				unsigned int ioctl, unsigned long arg)
+{
+	struct ibm_platform_dump_params *params;
+	u64 __user *argp = (void __user *)arg;
+	u64 param_dump_tag, dump_tag;
+
+	if (ioctl != PAPR_PLATFORM_DUMP_IOC_INVALIDATE)
+		return -ENOIOCTLCMD;
+
+	if (get_user(dump_tag, argp))
+		return -EFAULT;
+
+	/*
+	 * private_data is freed during release(), so this should not
+	 * happen.
+	 */
+	if (!file->private_data) {
+		pr_err("No valid FD to invalidate dump for the ID(%llu)\n",
+				dump_tag);
+		return -EINVAL;
+	}
+
+	params = file->private_data;
+	param_dump_tag = (u64) (((u64)params->dump_tag_hi << 32) |
+				params->dump_tag_lo);
+	if (dump_tag != param_dump_tag) {
+		pr_err("Invalid dump ID(%llu) to invalidate dump\n",
+				dump_tag);
+		return -EINVAL;
+	}
+
+	if (params->status != RTAS_IBM_PLATFORM_DUMP_COMPLETE) {
+		pr_err("Platform dump is not complete, but requested "
+			"to invalidate dump for ID(%llu)\n",
+			dump_tag);
+		return -EINPROGRESS;
+	}
+
+	return rtas_ibm_platform_dump(params, 0, 0);
+}
+
+static const struct file_operations papr_platform_dump_handle_ops = {
+	.read = papr_platform_dump_handle_read,
+	.release = papr_platform_dump_handle_release,
+	.unlocked_ioctl	= papr_platform_dump_invalidate_ioctl,
+};
+
+/**
+ * papr_platform_dump_create_handle() - Create a fd-based handle for
+ * reading platform dump
+ *
+ * Handler for PAPR_PLATFORM_DUMP_IOC_CREATE_HANDLE ioctl command
+ * Allocates RTAS parameter struct and work area and attached to the
+ * file descriptor for reading by user space with the multiple RTAS
+ * calls until the dump is completed. This memory allocation is freed
+ * when the file is released.
+ *
+ * Multiple dump requests with different IDs are allowed at the same
+ * time, but not with the same dump ID. So if the user space is
+ * already opened file descriptor for the specific dump ID, return
+ * -EALREADY for the next request.
+ *
+ * @dump_tag: Dump ID for the dump requested to retrieve from the
+ *		hypervisor
+ *
+ * Return: The installed fd number if successful, -ve errno otherwise.
+ */
+static long papr_platform_dump_create_handle(u64 dump_tag)
+{
+	struct ibm_platform_dump_params *params;
+	u64 param_dump_tag;
+	struct file *file;
+	long err;
+	int fd;
+
+	/*
+	 * Return failure if the user space is already opened FD for
+	 * the specific dump ID. This check will prevent multiple dump
+	 * requests for the same dump ID at the same time. Generally
+	 * should not expect this, but in case.
+	 */
+	list_for_each_entry(params, &platform_dump_list, list) {
+		param_dump_tag = (u64) (((u64)params->dump_tag_hi << 32) |
+					params->dump_tag_lo);
+		if (dump_tag == param_dump_tag) {
+			pr_err("Platform dump for ID(%llu) is already in progress\n",
+					dump_tag);
+			return -EALREADY;
+		}
+	}
+
+	params =  kzalloc(sizeof(struct ibm_platform_dump_params),
+			GFP_KERNEL_ACCOUNT);
+	if (!params)
+		return -ENOMEM;
+
+	params->work_area = rtas_work_area_alloc(SZ_4K);
+	params->buf_length = SZ_4K;
+	params->dump_tag_hi = (u32)(dump_tag >> 32);
+	params->dump_tag_lo = (u32)(dump_tag & 0x00000000ffffffffULL);
+	params->status = RTAS_IBM_PLATFORM_DUMP_START;
+
+	fd = get_unused_fd_flags(O_RDONLY | O_CLOEXEC);
+	if (fd < 0) {
+		err = fd;
+		goto free_area;
+	}
+
+	file = anon_inode_getfile_fmode("[papr-platform-dump]",
+				&papr_platform_dump_handle_ops,
+				(void *)params, O_RDONLY,
+				FMODE_LSEEK | FMODE_PREAD);
+	if (IS_ERR(file)) {
+		err = PTR_ERR(file);
+		goto put_fd;
+	}
+
+	fd_install(fd, file);
+
+	list_add(&params->list, &platform_dump_list);
+
+	pr_info("%s (%d) initiated platform dump for dump tag %llu\n",
+		current->comm, current->pid, dump_tag);
+	return fd;
+put_fd:
+	put_unused_fd(fd);
+free_area:
+	rtas_work_area_free(params->work_area);
+	kfree(params);
+	return err;
+}
+
+/*
+ * Top-level ioctl handler for /dev/papr-platform-dump.
+ */
+static long papr_platform_dump_dev_ioctl(struct file *filp,
+					unsigned int ioctl,
+					unsigned long arg)
+{
+	u64 __user *argp = (void __user *)arg;
+	u64 dump_tag;
+	long ret;
+
+	switch (ioctl) {
+	case PAPR_PLATFORM_DUMP_IOC_CREATE_HANDLE:
+		/* Only a recognised command reads the dump tag from @arg. */
+		if (get_user(dump_tag, argp))
+			return -EFAULT;
+		mutex_lock(&platform_dump_list_mutex);
+		ret = papr_platform_dump_create_handle(dump_tag);
+		mutex_unlock(&platform_dump_list_mutex);
+		break;
+	default:
+		ret = -ENOIOCTLCMD;
+		break;
+	}
+	return ret;
+}
+
+static const struct file_operations papr_platform_dump_ops = {
+	.unlocked_ioctl = papr_platform_dump_dev_ioctl,
+};
+
+static struct miscdevice papr_platform_dump_dev = {
+	.minor = MISC_DYNAMIC_MINOR,
+	.name = "papr-platform-dump",
+	.fops = &papr_platform_dump_ops,
+};
+
+static __init int papr_platform_dump_init(void)
+{
+	if (!rtas_function_implemented(RTAS_FN_IBM_PLATFORM_DUMP))
+		return -ENODEV;
+
+	return misc_register(&papr_platform_dump_dev);
+}
+machine_device_initcall(pseries, papr_platform_dump_init);
diff --git a/arch/powerpc/platforms/pseries/papr-rtas-common.c b/arch/powerpc/platforms/pseries/papr-rtas-common.c
new file mode 100644
index 000000000000..33c606e3378a
--- /dev/null
+++ b/arch/powerpc/platforms/pseries/papr-rtas-common.c
@@ -0,0 +1,311 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#define pr_fmt(fmt) "papr-common: " fmt
+
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/signal.h>
+#include <linux/slab.h>
+#include <linux/file.h>
+#include <linux/fs.h>
+#include <linux/anon_inodes.h>
+#include <linux/sched/signal.h>
+#include "papr-rtas-common.h"
+
+/*
+ * Sequence-based RTAS calls must be issued multiple times to retrieve
+ * the complete data from the hypervisor. For some of these RTAS calls,
+ * the OS should not interleave calls with different input until the
+ * sequence is completed. So the data is collected for these calls in
+ * the ioctl handler and exported to user space via the read() handler.
+ * This file provides the common functions needed for such
+ * sequence-based RTAS calls, e.g. ibm,get-vpd and ibm,get-indices.
+ */
+
+bool papr_rtas_blob_has_data(const struct papr_rtas_blob *blob)
+{
+	return blob->data && blob->len;
+}
+
+void papr_rtas_blob_free(const struct papr_rtas_blob *blob)
+{
+	if (blob) {
+		kvfree(blob->data);
+		kfree(blob);
+	}
+}
+
+/**
+ * papr_rtas_blob_extend() - Append data to a &struct papr_rtas_blob.
+ * @blob: The blob to extend.
+ * @data: The new data to append to @blob.
+ * @len:  The length of @data.
+ *
+ * Context: May sleep.
+ * Return: -ENOMEM on allocation failure, 0 otherwise.
+ */
+static int papr_rtas_blob_extend(struct papr_rtas_blob *blob,
+				const char *data, size_t len)
+{
+	const size_t new_len = blob->len + len;
+	const size_t old_len = blob->len;
+	const char *old_ptr = blob->data;
+	char *new_ptr;
+
+	new_ptr = kvrealloc(old_ptr, new_len, GFP_KERNEL_ACCOUNT);
+	if (!new_ptr)
+		return -ENOMEM;
+
+	memcpy(&new_ptr[old_len], data, len);
+	blob->data = new_ptr;
+	blob->len = new_len;
+	return 0;
+}
+
+/**
+ * papr_rtas_blob_generate() - Construct a new &struct papr_rtas_blob.
+ * @seq: Sequence state; its ->work callback is invoked repeatedly to
+ *       obtain data with the caller's RTAS call.
+ *
+ * The @seq->work callback is invoked until it returns NULL. @seq is
+ * passed to it in its first argument on each call. When it returns
+ * data, it should store the data length in its second argument.
+ *
+ * Context: May sleep.
+ * Return: A completely populated &struct papr_rtas_blob, NULL on error,
+ *         or ERR_PTR(-EINVAL) if @seq has no ->work callback.
+ */
+static const struct papr_rtas_blob *
+papr_rtas_blob_generate(struct papr_rtas_sequence *seq)
+{
+	struct papr_rtas_blob *blob;
+	const char *buf;
+	size_t len;
+	int err = 0;
+
+	/* Validate before allocating so this early return cannot leak @blob. */
+	if (!seq->work)
+		return ERR_PTR(-EINVAL);
+
+	blob = kzalloc(sizeof(*blob), GFP_KERNEL_ACCOUNT);
+	if (!blob)
+		return NULL;
+
+	while (err == 0 && (buf = seq->work(seq, &len)))
+		err = papr_rtas_blob_extend(blob, buf, len);
+
+	if (err != 0 || !papr_rtas_blob_has_data(blob))
+		goto free_blob;
+
+	return blob;
+free_blob:
+	papr_rtas_blob_free(blob);
+	return NULL;
+}
+
+int papr_rtas_sequence_set_err(struct papr_rtas_sequence *seq, int err)
+{
+	/* Preserve the first error recorded. */
+	if (seq->error == 0)
+		seq->error = err;
+
+	return seq->error;
+}
+
+/*
+ * Higher-level retrieval code below. These functions use the
+ * papr_rtas_blob_* and sequence_* APIs defined above to create fd-based
+ * handles for consumption by user space.
+ */
+
+/**
+ * papr_rtas_run_sequence() - Run a single retrieval sequence.
+ * @seq:	Functions of the caller to complete the sequence
+ *
+ * Context: May sleep. Holds a mutex and an RTAS work area for its
+ *          duration. Typically performs multiple sleepable slab
+ *          allocations.
+ *
+ * Return: A populated &struct papr_rtas_blob on success. Encoded error
+ * pointer otherwise.
+ */
+static const struct papr_rtas_blob *papr_rtas_run_sequence(struct papr_rtas_sequence *seq)
+{
+	const struct papr_rtas_blob *blob;
+
+	if (seq->begin)
+		seq->begin(seq);
+
+	blob = papr_rtas_blob_generate(seq);
+	if (!blob)
+		papr_rtas_sequence_set_err(seq, -ENOMEM);
+
+	if (seq->end)
+		seq->end(seq);
+
+
+	if (seq->error) {
+		papr_rtas_blob_free(blob);
+		return ERR_PTR(seq->error);
+	}
+
+	return blob;
+}
+
+/**
+ * papr_rtas_retrieve() - Return the data blob that is exposed to
+ * user space.
+ * @seq: RTAS call specific functions to be invoked until the
+ *       sequence is completed.
+ *
+ * Run sequences against @seq until a blob is successfully
+ * instantiated, or a hard error is encountered, or a fatal signal is
+ * pending.
+ *
+ * Context: May sleep.
+ * Return: A fully populated data blob when successful. Encoded error
+ * pointer otherwise.
+ */
+const struct papr_rtas_blob *papr_rtas_retrieve(struct papr_rtas_sequence *seq)
+{
+	const struct papr_rtas_blob *blob;
+
+	/*
+	 * EAGAIN means the sequence failed with a -4 status (data
+	 * changed, the sequence must be restarted) from the RTAS call,
+	 * and we should attempt a new sequence. PAPR+ (v2.13 R1–7.3.20–5
+	 * - ibm,get-vpd, R1–7.3.17–6 - ibm,get-indices) indicates that
+	 * this should be a transient condition, not something that
+	 * happens continuously. But we'll stop trying on a fatal signal.
+	 */
+	do {
+		blob = papr_rtas_run_sequence(seq);
+		if (!IS_ERR(blob)) /* Success. */
+			break;
+		if (PTR_ERR(blob) != -EAGAIN) /* Hard error. */
+			break;
+		cond_resched();
+	} while (!fatal_signal_pending(current));
+
+	return blob;
+}
+
+/**
+ * papr_rtas_setup_file_interface - Complete the sequence, obtain the
+ * data and export it to user space through an fd-based handle. User
+ * space then gets the data with read() on that handle.
+ * @seq: RTAS call specific functions to get the data.
+ * @fops: RTAS call specific file operations such as read().
+ * @name: Name passed to anon_inode_getfile_fmode() for the handle.
+ *
+ * Return: FD handle for consumption by user space, or a negative errno.
+ */
+long papr_rtas_setup_file_interface(struct papr_rtas_sequence *seq,
+				const struct file_operations *fops,
+				char *name)
+{
+	const struct papr_rtas_blob *blob;
+	struct file *file;
+	long ret;
+	int fd;
+
+	blob = papr_rtas_retrieve(seq);
+	if (IS_ERR(blob))
+		return PTR_ERR(blob);
+
+	fd = get_unused_fd_flags(O_RDONLY | O_CLOEXEC);
+	if (fd < 0) {
+		ret = fd;
+		goto free_blob;
+	}
+
+	file = anon_inode_getfile_fmode(name, fops, (void *)blob,
+			O_RDONLY, FMODE_LSEEK | FMODE_PREAD);
+	if (IS_ERR(file)) {
+		ret = PTR_ERR(file);
+		goto put_fd;
+	}
+
+	fd_install(fd, file);
+	return fd;
+
+put_fd:
+	put_unused_fd(fd);
+free_blob:
+	papr_rtas_blob_free(blob);
+	return ret;
+}
+
+/*
+ * papr_rtas_sequence_should_stop() - Determine whether RTAS retrieval
+ *                                    sequence should continue.
+ *
+ * Examines the sequence error state and outputs of the last call to
+ * the specific RTAS to determine whether the sequence in progress
+ * should continue or stop.
+ *
+ * Return: True if the sequence has encountered an error or if all data
+ *         for this sequence has been retrieved. False otherwise.
+ */
+bool papr_rtas_sequence_should_stop(const struct papr_rtas_sequence *seq,
+				s32 status, bool init_state)
+{
+	bool done;
+
+	if (seq->error)
+		return true;
+
+	switch (status) {
+	case RTAS_SEQ_COMPLETE:
+		if (init_state)
+			done = false; /* Initial state. */
+		else
+			done = true; /* All data consumed. */
+		break;
+	case RTAS_SEQ_MORE_DATA:
+		done = false; /* More data available. */
+		break;
+	default:
+		done = true; /* Error encountered. */
+		break;
+	}
+
+	return done;
+}
+
+/*
+ * User space read to retrieve data for the corresponding RTAS call.
+ * papr_rtas_blob is filled with the data using the corresponding RTAS
+ * call sequence API.
+ */
+ssize_t papr_rtas_common_handle_read(struct file *file,
+	       char __user *buf, size_t size, loff_t *off)
+{
+	const struct papr_rtas_blob *blob = file->private_data;
+
+	/* We should not instantiate a handle without any data attached. */
+	if (!papr_rtas_blob_has_data(blob)) {
+		pr_err_once("handle without data\n");
+		return -EIO;
+	}
+
+	return simple_read_from_buffer(buf, size, off, blob->data, blob->len);
+}
+
+int papr_rtas_common_handle_release(struct inode *inode,
+		struct file *file)
+{
+	const struct papr_rtas_blob *blob = file->private_data;
+
+	papr_rtas_blob_free(blob);
+
+	return 0;
+}
+
+loff_t papr_rtas_common_handle_seek(struct file *file, loff_t off,
+					int whence)
+{
+	const struct papr_rtas_blob *blob = file->private_data;
+
+	return fixed_size_llseek(file, off, whence, blob->len);
+}
diff --git a/arch/powerpc/platforms/pseries/papr-rtas-common.h b/arch/powerpc/platforms/pseries/papr-rtas-common.h
new file mode 100644
index 000000000000..4ceabcaf4905
--- /dev/null
+++ b/arch/powerpc/platforms/pseries/papr-rtas-common.h
@@ -0,0 +1,61 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+#ifndef _ASM_POWERPC_PAPR_RTAS_COMMON_H
+#define _ASM_POWERPC_PAPR_RTAS_COMMON_H
+
+#include <linux/types.h>
+
+/*
+ * Return codes for sequence based RTAS calls.
+ * Not listed under PAPR+ v2.13 7.2.8: "Return Codes".
+ * But defined in the specific section of each RTAS call.
+ */
+#define RTAS_SEQ_COMPLETE	0 /* All data has been retrieved. */
+#define RTAS_SEQ_MORE_DATA	1 /* More data is available */
+#define RTAS_SEQ_START_OVER	-4 /* Data changed, restart call sequence. */
+
+/*
+ * Internal "blob" APIs for accumulating RTAS call results into
+ * an immutable buffer to be attached to a file descriptor.
+ */
+struct papr_rtas_blob {
+	const char *data;
+	size_t len;
+};
+
+/**
+ * struct papr_rtas_sequence - State for managing a sequence of RTAS calls.
+ * @error:  Shall be zero as long as the sequence has not encountered an error,
+ *          -ve errno otherwise. Use papr_rtas_sequence_set_err() to update.
+ * @params: Parameter block to pass to rtas_*() calls.
+ * @begin: Work area allocation and initialize the needed parameter
+ *         values passed to RTAS call
+ * @end: Free the allocated work area
+ * @work: Obtain data with RTAS call and invoke it until the sequence is
+ *        completed.
+ *
+ */
+struct papr_rtas_sequence {
+	int error;
+	void *params;
+	void (*begin)(struct papr_rtas_sequence *seq);
+	void (*end)(struct papr_rtas_sequence *seq);
+	const char *(*work)(struct papr_rtas_sequence *seq, size_t *len);
+};
+
+extern bool papr_rtas_blob_has_data(const struct papr_rtas_blob *blob);
+extern void papr_rtas_blob_free(const struct papr_rtas_blob *blob);
+extern int papr_rtas_sequence_set_err(struct papr_rtas_sequence *seq,
+		int err);
+extern const struct papr_rtas_blob *papr_rtas_retrieve(struct papr_rtas_sequence *seq);
+extern long papr_rtas_setup_file_interface(struct papr_rtas_sequence *seq,
+			const struct file_operations *fops, char *name);
+extern bool papr_rtas_sequence_should_stop(const struct papr_rtas_sequence *seq,
+				s32 status, bool init_state);
+extern ssize_t papr_rtas_common_handle_read(struct file *file,
+			char __user *buf, size_t size, loff_t *off);
+extern int papr_rtas_common_handle_release(struct inode *inode,
+					struct file *file);
+extern loff_t papr_rtas_common_handle_seek(struct file *file, loff_t off,
+					int whence);
+#endif /* _ASM_POWERPC_PAPR_RTAS_COMMON_H */
+
diff --git a/arch/powerpc/platforms/pseries/papr-vpd.c b/arch/powerpc/platforms/pseries/papr-vpd.c
index c86950d7105a..f38c188fc4a1 100644
--- a/arch/powerpc/platforms/pseries/papr-vpd.c
+++ b/arch/powerpc/platforms/pseries/papr-vpd.c
@@ -2,7 +2,6 @@
 
 #define pr_fmt(fmt) "papr-vpd: " fmt
 
-#include <linux/anon_inodes.h>
 #include <linux/build_bug.h>
 #include <linux/file.h>
 #include <linux/fs.h>
@@ -20,14 +19,7 @@
 #include <asm/rtas-work-area.h>
 #include <asm/rtas.h>
 #include <uapi/asm/papr-vpd.h>
-
-/*
- * Function-specific return values for ibm,get-vpd, derived from PAPR+
- * v2.13 7.3.20 "ibm,get-vpd RTAS Call".
- */
-#define RTAS_IBM_GET_VPD_COMPLETE    0 /* All VPD has been retrieved. */
-#define RTAS_IBM_GET_VPD_MORE_DATA   1 /* More VPD is available. */
-#define RTAS_IBM_GET_VPD_START_OVER -4 /* VPD changed, restart call sequence. */
+#include "papr-rtas-common.h"
 
 /**
  * struct rtas_ibm_get_vpd_params - Parameters (in and out) for ibm,get-vpd.
@@ -91,13 +83,14 @@ static int rtas_ibm_get_vpd(struct rtas_ibm_get_vpd_params *params)
 	case RTAS_INVALID_PARAMETER:
 		ret = -EINVAL;
 		break;
-	case RTAS_IBM_GET_VPD_START_OVER:
+	case RTAS_SEQ_START_OVER:
 		ret = -EAGAIN;
+		pr_info_ratelimited("VPD changed during retrieval, retrying\n");
 		break;
-	case RTAS_IBM_GET_VPD_MORE_DATA:
+	case RTAS_SEQ_MORE_DATA:
 		params->sequence = rets[0];
 		fallthrough;
-	case RTAS_IBM_GET_VPD_COMPLETE:
+	case RTAS_SEQ_COMPLETE:
 		params->written = rets[1];
 		/*
 		 * Kernel or firmware bug, do not continue.
@@ -119,91 +112,6 @@ static int rtas_ibm_get_vpd(struct rtas_ibm_get_vpd_params *params)
 }
 
 /*
- * Internal VPD "blob" APIs for accumulating ibm,get-vpd results into
- * an immutable buffer to be attached to a file descriptor.
- */
-struct vpd_blob {
-	const char *data;
-	size_t len;
-};
-
-static bool vpd_blob_has_data(const struct vpd_blob *blob)
-{
-	return blob->data && blob->len;
-}
-
-static void vpd_blob_free(const struct vpd_blob *blob)
-{
-	if (blob) {
-		kvfree(blob->data);
-		kfree(blob);
-	}
-}
-
-/**
- * vpd_blob_extend() - Append data to a &struct vpd_blob.
- * @blob: The blob to extend.
- * @data: The new data to append to @blob.
- * @len:  The length of @data.
- *
- * Context: May sleep.
- * Return: -ENOMEM on allocation failure, 0 otherwise.
- */
-static int vpd_blob_extend(struct vpd_blob *blob, const char *data, size_t len)
-{
-	const size_t new_len = blob->len + len;
-	const size_t old_len = blob->len;
-	const char *old_ptr = blob->data;
-	char *new_ptr;
-
-	new_ptr = kvrealloc(old_ptr, new_len, GFP_KERNEL_ACCOUNT);
-	if (!new_ptr)
-		return -ENOMEM;
-
-	memcpy(&new_ptr[old_len], data, len);
-	blob->data = new_ptr;
-	blob->len = new_len;
-	return 0;
-}
-
-/**
- * vpd_blob_generate() - Construct a new &struct vpd_blob.
- * @generator: Function that supplies the blob data.
- * @arg:       Context pointer supplied by caller, passed to @generator.
- *
- * The @generator callback is invoked until it returns NULL. @arg is
- * passed to @generator in its first argument on each call. When
- * @generator returns data, it should store the data length in its
- * second argument.
- *
- * Context: May sleep.
- * Return: A completely populated &struct vpd_blob, or NULL on error.
- */
-static const struct vpd_blob *
-vpd_blob_generate(const char * (*generator)(void *, size_t *), void *arg)
-{
-	struct vpd_blob *blob;
-	const char *buf;
-	size_t len;
-	int err = 0;
-
-	blob  = kzalloc(sizeof(*blob), GFP_KERNEL_ACCOUNT);
-	if (!blob)
-		return NULL;
-
-	while (err == 0 && (buf = generator(arg, &len)))
-		err = vpd_blob_extend(blob, buf, len);
-
-	if (err != 0 || !vpd_blob_has_data(blob))
-		goto free_blob;
-
-	return blob;
-free_blob:
-	vpd_blob_free(blob);
-	return NULL;
-}
-
-/*
  * Internal VPD sequence APIs. A VPD sequence is a series of calls to
  * ibm,get-vpd for a given location code. The sequence ends when an
  * error is encountered or all VPD for the location code has been
@@ -211,30 +119,14 @@ free_blob:
  */
 
 /**
- * struct vpd_sequence - State for managing a VPD sequence.
- * @error:  Shall be zero as long as the sequence has not encountered an error,
- *          -ve errno otherwise. Use vpd_sequence_set_err() to update this.
- * @params: Parameter block to pass to rtas_ibm_get_vpd().
- */
-struct vpd_sequence {
-	int error;
-	struct rtas_ibm_get_vpd_params params;
-};
-
-/**
  * vpd_sequence_begin() - Begin a VPD retrieval sequence.
- * @seq:      Uninitialized sequence state.
- * @loc_code: Location code that defines the scope of the VPD to return.
- *
- * Initializes @seq with the resources necessary to carry out a VPD
- * sequence. Callers must pass @seq to vpd_sequence_end() regardless
- * of whether the sequence succeeds.
+ * @seq: vpd call parameters from sequence struct
  *
  * Context: May sleep.
  */
-static void vpd_sequence_begin(struct vpd_sequence *seq,
-			       const struct papr_location_code *loc_code)
+static void vpd_sequence_begin(struct papr_rtas_sequence *seq)
 {
+	struct rtas_ibm_get_vpd_params *vpd_params;
 	/*
 	 * Use a static data structure for the location code passed to
 	 * RTAS to ensure it's in the RMA and avoid a separate work
@@ -242,6 +134,7 @@ static void vpd_sequence_begin(struct vpd_sequence *seq,
 	 */
 	static struct papr_location_code static_loc_code;
 
+	vpd_params =  (struct rtas_ibm_get_vpd_params *)seq->params;
 	/*
 	 * We could allocate the work area before acquiring the
 	 * function lock, but that would allow concurrent requests to
@@ -249,14 +142,12 @@ static void vpd_sequence_begin(struct vpd_sequence *seq,
 	 * allocate the work area under the lock.
 	 */
 	mutex_lock(&rtas_ibm_get_vpd_lock);
-	static_loc_code = *loc_code;
-	*seq = (struct vpd_sequence) {
-		.params = {
-			.work_area = rtas_work_area_alloc(SZ_4K),
-			.loc_code = &static_loc_code,
-			.sequence = 1,
-		},
-	};
+	static_loc_code = *(struct papr_location_code *)vpd_params->loc_code;
+	vpd_params =  (struct rtas_ibm_get_vpd_params *)seq->params;
+	vpd_params->work_area = rtas_work_area_alloc(SZ_4K);
+	vpd_params->loc_code = &static_loc_code;
+	vpd_params->sequence = 1;
+	vpd_params->status = 0;
 }
 
 /**
@@ -265,180 +156,39 @@ static void vpd_sequence_begin(struct vpd_sequence *seq,
  *
  * Releases resources obtained by vpd_sequence_begin().
  */
-static void vpd_sequence_end(struct vpd_sequence *seq)
+static void vpd_sequence_end(struct papr_rtas_sequence *seq)
 {
-	rtas_work_area_free(seq->params.work_area);
-	mutex_unlock(&rtas_ibm_get_vpd_lock);
-}
-
-/**
- * vpd_sequence_should_stop() - Determine whether a VPD retrieval sequence
- *                              should continue.
- * @seq: VPD sequence state.
- *
- * Examines the sequence error state and outputs of the last call to
- * ibm,get-vpd to determine whether the sequence in progress should
- * continue or stop.
- *
- * Return: True if the sequence has encountered an error or if all VPD for
- *         this sequence has been retrieved. False otherwise.
- */
-static bool vpd_sequence_should_stop(const struct vpd_sequence *seq)
-{
-	bool done;
-
-	if (seq->error)
-		return true;
+	struct rtas_ibm_get_vpd_params *vpd_params;
 
-	switch (seq->params.status) {
-	case 0:
-		if (seq->params.written == 0)
-			done = false; /* Initial state. */
-		else
-			done = true; /* All data consumed. */
-		break;
-	case 1:
-		done = false; /* More data available. */
-		break;
-	default:
-		done = true; /* Error encountered. */
-		break;
-	}
-
-	return done;
-}
-
-static int vpd_sequence_set_err(struct vpd_sequence *seq, int err)
-{
-	/* Preserve the first error recorded. */
-	if (seq->error == 0)
-		seq->error = err;
-
-	return seq->error;
+	vpd_params =  (struct rtas_ibm_get_vpd_params *)seq->params;
+	rtas_work_area_free(vpd_params->work_area);
+	mutex_unlock(&rtas_ibm_get_vpd_lock);
 }
 
 /*
- * Generator function to be passed to vpd_blob_generate().
+ * Generator function to be passed to papr_rtas_blob_generate().
  */
-static const char *vpd_sequence_fill_work_area(void *arg, size_t *len)
+static const char *vpd_sequence_fill_work_area(struct papr_rtas_sequence *seq,
+						size_t *len)
 {
-	struct vpd_sequence *seq = arg;
-	struct rtas_ibm_get_vpd_params *p = &seq->params;
+	struct rtas_ibm_get_vpd_params *p;
+	bool init_state;
 
-	if (vpd_sequence_should_stop(seq))
+	p = (struct rtas_ibm_get_vpd_params *)seq->params;
+	init_state = (p->written == 0) ? true : false;
+
+	if (papr_rtas_sequence_should_stop(seq, p->status, init_state))
 		return NULL;
-	if (vpd_sequence_set_err(seq, rtas_ibm_get_vpd(p)))
+	if (papr_rtas_sequence_set_err(seq, rtas_ibm_get_vpd(p)))
 		return NULL;
 	*len = p->written;
 	return rtas_work_area_raw_buf(p->work_area);
 }
 
-/*
- * Higher-level VPD retrieval code below. These functions use the
- * vpd_blob_* and vpd_sequence_* APIs defined above to create fd-based
- * VPD handles for consumption by user space.
- */
-
-/**
- * papr_vpd_run_sequence() - Run a single VPD retrieval sequence.
- * @loc_code: Location code that defines the scope of VPD to return.
- *
- * Context: May sleep. Holds a mutex and an RTAS work area for its
- *          duration. Typically performs multiple sleepable slab
- *          allocations.
- *
- * Return: A populated &struct vpd_blob on success. Encoded error
- * pointer otherwise.
- */
-static const struct vpd_blob *papr_vpd_run_sequence(const struct papr_location_code *loc_code)
-{
-	const struct vpd_blob *blob;
-	struct vpd_sequence seq;
-
-	vpd_sequence_begin(&seq, loc_code);
-	blob = vpd_blob_generate(vpd_sequence_fill_work_area, &seq);
-	if (!blob)
-		vpd_sequence_set_err(&seq, -ENOMEM);
-	vpd_sequence_end(&seq);
-
-	if (seq.error) {
-		vpd_blob_free(blob);
-		return ERR_PTR(seq.error);
-	}
-
-	return blob;
-}
-
-/**
- * papr_vpd_retrieve() - Return the VPD for a location code.
- * @loc_code: Location code that defines the scope of VPD to return.
- *
- * Run VPD sequences against @loc_code until a blob is successfully
- * instantiated, or a hard error is encountered, or a fatal signal is
- * pending.
- *
- * Context: May sleep.
- * Return: A fully populated VPD blob when successful. Encoded error
- * pointer otherwise.
- */
-static const struct vpd_blob *papr_vpd_retrieve(const struct papr_location_code *loc_code)
-{
-	const struct vpd_blob *blob;
-
-	/*
-	 * EAGAIN means the sequence errored with a -4 (VPD changed)
-	 * status from ibm,get-vpd, and we should attempt a new
-	 * sequence. PAPR+ v2.13 R1–7.3.20–5 indicates that this
-	 * should be a transient condition, not something that happens
-	 * continuously. But we'll stop trying on a fatal signal.
-	 */
-	do {
-		blob = papr_vpd_run_sequence(loc_code);
-		if (!IS_ERR(blob)) /* Success. */
-			break;
-		if (PTR_ERR(blob) != -EAGAIN) /* Hard error. */
-			break;
-		pr_info_ratelimited("VPD changed during retrieval, retrying\n");
-		cond_resched();
-	} while (!fatal_signal_pending(current));
-
-	return blob;
-}
-
-static ssize_t papr_vpd_handle_read(struct file *file, char __user *buf, size_t size, loff_t *off)
-{
-	const struct vpd_blob *blob = file->private_data;
-
-	/* bug: we should not instantiate a handle without any data attached. */
-	if (!vpd_blob_has_data(blob)) {
-		pr_err_once("handle without data\n");
-		return -EIO;
-	}
-
-	return simple_read_from_buffer(buf, size, off, blob->data, blob->len);
-}
-
-static int papr_vpd_handle_release(struct inode *inode, struct file *file)
-{
-	const struct vpd_blob *blob = file->private_data;
-
-	vpd_blob_free(blob);
-
-	return 0;
-}
-
-static loff_t papr_vpd_handle_seek(struct file *file, loff_t off, int whence)
-{
-	const struct vpd_blob *blob = file->private_data;
-
-	return fixed_size_llseek(file, off, whence, blob->len);
-}
-
-
 static const struct file_operations papr_vpd_handle_ops = {
-	.read = papr_vpd_handle_read,
-	.llseek = papr_vpd_handle_seek,
-	.release = papr_vpd_handle_release,
+	.read = papr_rtas_common_handle_read,
+	.llseek = papr_rtas_common_handle_seek,
+	.release = papr_rtas_common_handle_release,
 };
 
 /**
@@ -460,10 +210,9 @@ static const struct file_operations papr_vpd_handle_ops = {
  */
 static long papr_vpd_create_handle(struct papr_location_code __user *ulc)
 {
+	struct rtas_ibm_get_vpd_params vpd_params = {};
+	struct papr_rtas_sequence seq = {};
 	struct papr_location_code klc;
-	const struct vpd_blob *blob;
-	struct file *file;
-	long err;
 	int fd;
 
 	if (copy_from_user(&klc, ulc, sizeof(klc)))
@@ -472,30 +221,19 @@ static long papr_vpd_create_handle(struct papr_location_code __user *ulc)
 	if (!string_is_terminated(klc.str, ARRAY_SIZE(klc.str)))
 		return -EINVAL;
 
-	blob = papr_vpd_retrieve(&klc);
-	if (IS_ERR(blob))
-		return PTR_ERR(blob);
+	seq = (struct papr_rtas_sequence) {
+		.begin = vpd_sequence_begin,
+		.end = vpd_sequence_end,
+		.work = vpd_sequence_fill_work_area,
+	};
 
-	fd = get_unused_fd_flags(O_RDONLY | O_CLOEXEC);
-	if (fd < 0) {
-		err = fd;
-		goto free_blob;
-	}
+	vpd_params.loc_code = &klc;
+	seq.params = (void *)&vpd_params;
+
+	fd = papr_rtas_setup_file_interface(&seq, &papr_vpd_handle_ops,
+			"[papr-vpd]");
 
-	file = anon_inode_getfile_fmode("[papr-vpd]", &papr_vpd_handle_ops,
-				  (void *)blob, O_RDONLY,
-				  FMODE_LSEEK | FMODE_PREAD);
-	if (IS_ERR(file)) {
-		err = PTR_ERR(file);
-		goto put_fd;
-	}
-	fd_install(fd, file);
 	return fd;
-put_fd:
-	put_unused_fd(fd);
-free_blob:
-	vpd_blob_free(blob);
-	return err;
 }
 
 /*
diff --git a/arch/powerpc/sysdev/cpm_common.c b/arch/powerpc/sysdev/cpm_common.c
index 47db732981a8..e22fc638dbc7 100644
--- a/arch/powerpc/sysdev/cpm_common.c
+++ b/arch/powerpc/sysdev/cpm_common.c
@@ -138,7 +138,7 @@ static void __cpm2_gpio32_set(struct of_mm_gpio_chip *mm_gc, u32 pin_mask,
 	out_be32(&iop->dat, cpm2_gc->cpdata);
 }
 
-static void cpm2_gpio32_set(struct gpio_chip *gc, unsigned int gpio, int value)
+static int cpm2_gpio32_set(struct gpio_chip *gc, unsigned int gpio, int value)
 {
 	struct of_mm_gpio_chip *mm_gc = to_of_mm_gpio_chip(gc);
 	struct cpm2_gpio32_chip *cpm2_gc = gpiochip_get_data(gc);
@@ -150,6 +150,8 @@ static void cpm2_gpio32_set(struct gpio_chip *gc, unsigned int gpio, int value)
 	__cpm2_gpio32_set(mm_gc, pin_mask, value);
 
 	spin_unlock_irqrestore(&cpm2_gc->lock, flags);
+
+	return 0;
 }
 
 static int cpm2_gpio32_dir_out(struct gpio_chip *gc, unsigned int gpio, int val)
@@ -208,7 +210,7 @@ int cpm2_gpiochip_add32(struct device *dev)
 	gc->direction_input = cpm2_gpio32_dir_in;
 	gc->direction_output = cpm2_gpio32_dir_out;
 	gc->get = cpm2_gpio32_get;
-	gc->set = cpm2_gpio32_set;
+	gc->set_rv = cpm2_gpio32_set;
 	gc->parent = dev;
 	gc->owner = THIS_MODULE;
 
diff --git a/arch/powerpc/sysdev/mpic.c b/arch/powerpc/sysdev/mpic.c
index 4afbab83a2e2..c706a08e9955 100644
--- a/arch/powerpc/sysdev/mpic.c
+++ b/arch/powerpc/sysdev/mpic.c
@@ -27,6 +27,7 @@
 #include <linux/spinlock.h>
 #include <linux/pci.h>
 #include <linux/slab.h>
+#include <linux/string_choices.h>
 #include <linux/syscore_ops.h>
 #include <linux/ratelimit.h>
 #include <linux/pgtable.h>
@@ -474,9 +475,9 @@ static void __init mpic_scan_ht_msi(struct mpic *mpic, u8 __iomem *devbase,
 		addr = addr | ((u64)readl(base + HT_MSI_ADDR_HI) << 32);
 	}
 
-	printk(KERN_DEBUG "mpic:   - HT:%02x.%x %s MSI mapping found @ 0x%llx\n",
-		PCI_SLOT(devfn), PCI_FUNC(devfn),
-		flags & HT_MSI_FLAGS_ENABLE ? "enabled" : "disabled", addr);
+	pr_debug("mpic:   - HT:%02x.%x %s MSI mapping found @ 0x%llx\n",
+		 PCI_SLOT(devfn), PCI_FUNC(devfn),
+		 str_enabled_disabled(flags & HT_MSI_FLAGS_ENABLE), addr);
 
 	if (!(flags & HT_MSI_FLAGS_ENABLE))
 		writeb(flags | HT_MSI_FLAGS_ENABLE, base + HT_MSI_FLAGS);
diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c
index 88abffa8b54c..cb3a3244ae6f 100644
--- a/arch/powerpc/xmon/xmon.c
+++ b/arch/powerpc/xmon/xmon.c
@@ -1770,7 +1770,7 @@ static void xmon_show_stack(unsigned long sp, unsigned long lr,
 				       sp + STACK_INT_FRAME_REGS);
 				break;
 			}
-			printf("--- Exception: %lx %s at ", regs.trap,
+			printf("---- Exception: %lx %s at ", regs.trap,
 			       getvecname(TRAP(&regs)));
 			pc = regs.nip;
 			lr = regs.link;
diff --git a/arch/riscv/boot/dts/sophgo/cv18xx.dtsi b/arch/riscv/boot/dts/sophgo/cv18xx.dtsi
index c18822ec849f..58cd546392e0 100644
--- a/arch/riscv/boot/dts/sophgo/cv18xx.dtsi
+++ b/arch/riscv/boot/dts/sophgo/cv18xx.dtsi
@@ -341,7 +341,7 @@
 					   1024 1024 1024 1024>;
 			snps,priority = <0 1 2 3 4 5 6 7>;
 			snps,dma-masters = <2>;
-			snps,data-width = <4>;
+			snps,data-width = <2>;
 			status = "disabled";
 		};
 
diff --git a/arch/riscv/crypto/Kconfig b/arch/riscv/crypto/Kconfig
index c67095a3d669..cd9b776602f8 100644
--- a/arch/riscv/crypto/Kconfig
+++ b/arch/riscv/crypto/Kconfig
@@ -18,16 +18,6 @@ config CRYPTO_AES_RISCV64
 	  - Zvkb vector crypto extension (CTR)
 	  - Zvkg vector crypto extension (XTS)
 
-config CRYPTO_CHACHA_RISCV64
-	tristate "Ciphers: ChaCha"
-	depends on 64BIT && RISCV_ISA_V && TOOLCHAIN_HAS_VECTOR_CRYPTO
-	select CRYPTO_SKCIPHER
-	help
-	  Length-preserving ciphers: ChaCha20 stream cipher algorithm
-
-	  Architecture: riscv64 using:
-	  - Zvkb vector crypto extension
-
 config CRYPTO_GHASH_RISCV64
 	tristate "Hash functions: GHASH"
 	depends on 64BIT && RISCV_ISA_V && TOOLCHAIN_HAS_VECTOR_CRYPTO
@@ -38,17 +28,6 @@ config CRYPTO_GHASH_RISCV64
 	  Architecture: riscv64 using:
 	  - Zvkg vector crypto extension
 
-config CRYPTO_SHA256_RISCV64
-	tristate "Hash functions: SHA-224 and SHA-256"
-	depends on 64BIT && RISCV_ISA_V && TOOLCHAIN_HAS_VECTOR_CRYPTO
-	select CRYPTO_SHA256
-	help
-	  SHA-224 and SHA-256 secure hash algorithm (FIPS 180)
-
-	  Architecture: riscv64 using:
-	  - Zvknha or Zvknhb vector crypto extensions
-	  - Zvkb vector crypto extension
-
 config CRYPTO_SHA512_RISCV64
 	tristate "Hash functions: SHA-384 and SHA-512"
 	depends on 64BIT && RISCV_ISA_V && TOOLCHAIN_HAS_VECTOR_CRYPTO
@@ -64,7 +43,7 @@ config CRYPTO_SM3_RISCV64
 	tristate "Hash functions: SM3 (ShangMi 3)"
 	depends on 64BIT && RISCV_ISA_V && TOOLCHAIN_HAS_VECTOR_CRYPTO
 	select CRYPTO_HASH
-	select CRYPTO_SM3
+	select CRYPTO_LIB_SM3
 	help
 	  SM3 (ShangMi 3) secure hash function (OSCCA GM/T 0004-2012)
 
diff --git a/arch/riscv/crypto/Makefile b/arch/riscv/crypto/Makefile
index 247c7bc7288c..e10e8257734e 100644
--- a/arch/riscv/crypto/Makefile
+++ b/arch/riscv/crypto/Makefile
@@ -4,15 +4,9 @@ obj-$(CONFIG_CRYPTO_AES_RISCV64) += aes-riscv64.o
 aes-riscv64-y := aes-riscv64-glue.o aes-riscv64-zvkned.o \
 		 aes-riscv64-zvkned-zvbb-zvkg.o aes-riscv64-zvkned-zvkb.o
 
-obj-$(CONFIG_CRYPTO_CHACHA_RISCV64) += chacha-riscv64.o
-chacha-riscv64-y := chacha-riscv64-glue.o chacha-riscv64-zvkb.o
-
 obj-$(CONFIG_CRYPTO_GHASH_RISCV64) += ghash-riscv64.o
 ghash-riscv64-y := ghash-riscv64-glue.o ghash-riscv64-zvkg.o
 
-obj-$(CONFIG_CRYPTO_SHA256_RISCV64) += sha256-riscv64.o
-sha256-riscv64-y := sha256-riscv64-glue.o sha256-riscv64-zvknha_or_zvknhb-zvkb.o
-
 obj-$(CONFIG_CRYPTO_SHA512_RISCV64) += sha512-riscv64.o
 sha512-riscv64-y := sha512-riscv64-glue.o sha512-riscv64-zvknhb-zvkb.o
 
diff --git a/arch/riscv/crypto/chacha-riscv64-glue.c b/arch/riscv/crypto/chacha-riscv64-glue.c
deleted file mode 100644
index 10b46f36375a..000000000000
--- a/arch/riscv/crypto/chacha-riscv64-glue.c
+++ /dev/null
@@ -1,101 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * ChaCha20 using the RISC-V vector crypto extensions
- *
- * Copyright (C) 2023 SiFive, Inc.
- * Author: Jerry Shih <jerry.shih@sifive.com>
- */
-
-#include <asm/simd.h>
-#include <asm/vector.h>
-#include <crypto/internal/chacha.h>
-#include <crypto/internal/skcipher.h>
-#include <linux/linkage.h>
-#include <linux/module.h>
-
-asmlinkage void chacha20_zvkb(const u32 key[8], const u8 *in, u8 *out,
-			      size_t len, const u32 iv[4]);
-
-static int riscv64_chacha20_crypt(struct skcipher_request *req)
-{
-	u32 iv[CHACHA_IV_SIZE / sizeof(u32)];
-	u8 block_buffer[CHACHA_BLOCK_SIZE];
-	struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
-	const struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm);
-	struct skcipher_walk walk;
-	unsigned int nbytes;
-	unsigned int tail_bytes;
-	int err;
-
-	iv[0] = get_unaligned_le32(req->iv);
-	iv[1] = get_unaligned_le32(req->iv + 4);
-	iv[2] = get_unaligned_le32(req->iv + 8);
-	iv[3] = get_unaligned_le32(req->iv + 12);
-
-	err = skcipher_walk_virt(&walk, req, false);
-	while (walk.nbytes) {
-		nbytes = walk.nbytes & ~(CHACHA_BLOCK_SIZE - 1);
-		tail_bytes = walk.nbytes & (CHACHA_BLOCK_SIZE - 1);
-		kernel_vector_begin();
-		if (nbytes) {
-			chacha20_zvkb(ctx->key, walk.src.virt.addr,
-				      walk.dst.virt.addr, nbytes, iv);
-			iv[0] += nbytes / CHACHA_BLOCK_SIZE;
-		}
-		if (walk.nbytes == walk.total && tail_bytes > 0) {
-			memcpy(block_buffer, walk.src.virt.addr + nbytes,
-			       tail_bytes);
-			chacha20_zvkb(ctx->key, block_buffer, block_buffer,
-				      CHACHA_BLOCK_SIZE, iv);
-			memcpy(walk.dst.virt.addr + nbytes, block_buffer,
-			       tail_bytes);
-			tail_bytes = 0;
-		}
-		kernel_vector_end();
-
-		err = skcipher_walk_done(&walk, tail_bytes);
-	}
-
-	return err;
-}
-
-static struct skcipher_alg riscv64_chacha_alg = {
-	.setkey = chacha20_setkey,
-	.encrypt = riscv64_chacha20_crypt,
-	.decrypt = riscv64_chacha20_crypt,
-	.min_keysize = CHACHA_KEY_SIZE,
-	.max_keysize = CHACHA_KEY_SIZE,
-	.ivsize = CHACHA_IV_SIZE,
-	.chunksize = CHACHA_BLOCK_SIZE,
-	.walksize = 4 * CHACHA_BLOCK_SIZE,
-	.base = {
-		.cra_blocksize = 1,
-		.cra_ctxsize = sizeof(struct chacha_ctx),
-		.cra_priority = 300,
-		.cra_name = "chacha20",
-		.cra_driver_name = "chacha20-riscv64-zvkb",
-		.cra_module = THIS_MODULE,
-	},
-};
-
-static int __init riscv64_chacha_mod_init(void)
-{
-	if (riscv_isa_extension_available(NULL, ZVKB) &&
-	    riscv_vector_vlen() >= 128)
-		return crypto_register_skcipher(&riscv64_chacha_alg);
-
-	return -ENODEV;
-}
-
-static void __exit riscv64_chacha_mod_exit(void)
-{
-	crypto_unregister_skcipher(&riscv64_chacha_alg);
-}
-
-module_init(riscv64_chacha_mod_init);
-module_exit(riscv64_chacha_mod_exit);
-
-MODULE_DESCRIPTION("ChaCha20 (RISC-V accelerated)");
-MODULE_AUTHOR("Jerry Shih <jerry.shih@sifive.com>");
-MODULE_LICENSE("GPL");
-MODULE_ALIAS_CRYPTO("chacha20");
diff --git a/arch/riscv/crypto/ghash-riscv64-glue.c b/arch/riscv/crypto/ghash-riscv64-glue.c
index 312e7891fd0a..d86073d25387 100644
--- a/arch/riscv/crypto/ghash-riscv64-glue.c
+++ b/arch/riscv/crypto/ghash-riscv64-glue.c
@@ -11,11 +11,16 @@
 
 #include <asm/simd.h>
 #include <asm/vector.h>
+#include <crypto/b128ops.h>
+#include <crypto/gf128mul.h>
 #include <crypto/ghash.h>
 #include <crypto/internal/hash.h>
 #include <crypto/internal/simd.h>
-#include <linux/linkage.h>
+#include <crypto/utils.h>
+#include <linux/errno.h>
+#include <linux/kernel.h>
 #include <linux/module.h>
+#include <linux/string.h>
 
 asmlinkage void ghash_zvkg(be128 *accumulator, const be128 *key, const u8 *data,
 			   size_t len);
@@ -26,8 +31,6 @@ struct riscv64_ghash_tfm_ctx {
 
 struct riscv64_ghash_desc_ctx {
 	be128 accumulator;
-	u8 buffer[GHASH_BLOCK_SIZE];
-	u32 bytes;
 };
 
 static int riscv64_ghash_setkey(struct crypto_shash *tfm, const u8 *key,
@@ -78,50 +81,24 @@ static int riscv64_ghash_update(struct shash_desc *desc, const u8 *src,
 {
 	const struct riscv64_ghash_tfm_ctx *tctx = crypto_shash_ctx(desc->tfm);
 	struct riscv64_ghash_desc_ctx *dctx = shash_desc_ctx(desc);
-	unsigned int len;
-
-	if (dctx->bytes) {
-		if (dctx->bytes + srclen < GHASH_BLOCK_SIZE) {
-			memcpy(dctx->buffer + dctx->bytes, src, srclen);
-			dctx->bytes += srclen;
-			return 0;
-		}
-		memcpy(dctx->buffer + dctx->bytes, src,
-		       GHASH_BLOCK_SIZE - dctx->bytes);
-		riscv64_ghash_blocks(tctx, dctx, dctx->buffer,
-				     GHASH_BLOCK_SIZE);
-		src += GHASH_BLOCK_SIZE - dctx->bytes;
-		srclen -= GHASH_BLOCK_SIZE - dctx->bytes;
-		dctx->bytes = 0;
-	}
-
-	len = round_down(srclen, GHASH_BLOCK_SIZE);
-	if (len) {
-		riscv64_ghash_blocks(tctx, dctx, src, len);
-		src += len;
-		srclen -= len;
-	}
 
-	if (srclen) {
-		memcpy(dctx->buffer, src, srclen);
-		dctx->bytes = srclen;
-	}
-
-	return 0;
+	riscv64_ghash_blocks(tctx, dctx, src,
+			     round_down(srclen, GHASH_BLOCK_SIZE));
+	return srclen - round_down(srclen, GHASH_BLOCK_SIZE);
 }
 
-static int riscv64_ghash_final(struct shash_desc *desc, u8 *out)
+static int riscv64_ghash_finup(struct shash_desc *desc, const u8 *src,
+			       unsigned int len, u8 *out)
 {
 	const struct riscv64_ghash_tfm_ctx *tctx = crypto_shash_ctx(desc->tfm);
 	struct riscv64_ghash_desc_ctx *dctx = shash_desc_ctx(desc);
-	int i;
 
-	if (dctx->bytes) {
-		for (i = dctx->bytes; i < GHASH_BLOCK_SIZE; i++)
-			dctx->buffer[i] = 0;
+	if (len) {
+		u8 buf[GHASH_BLOCK_SIZE] = {};
 
-		riscv64_ghash_blocks(tctx, dctx, dctx->buffer,
-				     GHASH_BLOCK_SIZE);
+		memcpy(buf, src, len);
+		riscv64_ghash_blocks(tctx, dctx, buf, GHASH_BLOCK_SIZE);
+		memzero_explicit(buf, sizeof(buf));
 	}
 
 	memcpy(out, &dctx->accumulator, GHASH_DIGEST_SIZE);
@@ -131,7 +108,7 @@ static int riscv64_ghash_final(struct shash_desc *desc, u8 *out)
 static struct shash_alg riscv64_ghash_alg = {
 	.init = riscv64_ghash_init,
 	.update = riscv64_ghash_update,
-	.final = riscv64_ghash_final,
+	.finup = riscv64_ghash_finup,
 	.setkey = riscv64_ghash_setkey,
 	.descsize = sizeof(struct riscv64_ghash_desc_ctx),
 	.digestsize = GHASH_DIGEST_SIZE,
@@ -139,6 +116,7 @@ static struct shash_alg riscv64_ghash_alg = {
 		.cra_blocksize = GHASH_BLOCK_SIZE,
 		.cra_ctxsize = sizeof(struct riscv64_ghash_tfm_ctx),
 		.cra_priority = 300,
+		.cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY,
 		.cra_name = "ghash",
 		.cra_driver_name = "ghash-riscv64-zvkg",
 		.cra_module = THIS_MODULE,
diff --git a/arch/riscv/crypto/sha256-riscv64-glue.c b/arch/riscv/crypto/sha256-riscv64-glue.c
deleted file mode 100644
index 71e051e40a64..000000000000
--- a/arch/riscv/crypto/sha256-riscv64-glue.c
+++ /dev/null
@@ -1,137 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * SHA-256 and SHA-224 using the RISC-V vector crypto extensions
- *
- * Copyright (C) 2022 VRULL GmbH
- * Author: Heiko Stuebner <heiko.stuebner@vrull.eu>
- *
- * Copyright (C) 2023 SiFive, Inc.
- * Author: Jerry Shih <jerry.shih@sifive.com>
- */
-
-#include <asm/simd.h>
-#include <asm/vector.h>
-#include <crypto/internal/hash.h>
-#include <crypto/internal/simd.h>
-#include <crypto/sha256_base.h>
-#include <linux/linkage.h>
-#include <linux/module.h>
-
-/*
- * Note: the asm function only uses the 'state' field of struct sha256_state.
- * It is assumed to be the first field.
- */
-asmlinkage void sha256_transform_zvknha_or_zvknhb_zvkb(
-	struct sha256_state *state, const u8 *data, int num_blocks);
-
-static int riscv64_sha256_update(struct shash_desc *desc, const u8 *data,
-				 unsigned int len)
-{
-	/*
-	 * Ensure struct sha256_state begins directly with the SHA-256
-	 * 256-bit internal state, as this is what the asm function expects.
-	 */
-	BUILD_BUG_ON(offsetof(struct sha256_state, state) != 0);
-
-	if (crypto_simd_usable()) {
-		kernel_vector_begin();
-		sha256_base_do_update(desc, data, len,
-				      sha256_transform_zvknha_or_zvknhb_zvkb);
-		kernel_vector_end();
-	} else {
-		crypto_sha256_update(desc, data, len);
-	}
-	return 0;
-}
-
-static int riscv64_sha256_finup(struct shash_desc *desc, const u8 *data,
-				unsigned int len, u8 *out)
-{
-	if (crypto_simd_usable()) {
-		kernel_vector_begin();
-		if (len)
-			sha256_base_do_update(
-				desc, data, len,
-				sha256_transform_zvknha_or_zvknhb_zvkb);
-		sha256_base_do_finalize(
-			desc, sha256_transform_zvknha_or_zvknhb_zvkb);
-		kernel_vector_end();
-
-		return sha256_base_finish(desc, out);
-	}
-
-	return crypto_sha256_finup(desc, data, len, out);
-}
-
-static int riscv64_sha256_final(struct shash_desc *desc, u8 *out)
-{
-	return riscv64_sha256_finup(desc, NULL, 0, out);
-}
-
-static int riscv64_sha256_digest(struct shash_desc *desc, const u8 *data,
-				 unsigned int len, u8 *out)
-{
-	return sha256_base_init(desc) ?:
-	       riscv64_sha256_finup(desc, data, len, out);
-}
-
-static struct shash_alg riscv64_sha256_algs[] = {
-	{
-		.init = sha256_base_init,
-		.update = riscv64_sha256_update,
-		.final = riscv64_sha256_final,
-		.finup = riscv64_sha256_finup,
-		.digest = riscv64_sha256_digest,
-		.descsize = sizeof(struct sha256_state),
-		.digestsize = SHA256_DIGEST_SIZE,
-		.base = {
-			.cra_blocksize = SHA256_BLOCK_SIZE,
-			.cra_priority = 300,
-			.cra_name = "sha256",
-			.cra_driver_name = "sha256-riscv64-zvknha_or_zvknhb-zvkb",
-			.cra_module = THIS_MODULE,
-		},
-	}, {
-		.init = sha224_base_init,
-		.update = riscv64_sha256_update,
-		.final = riscv64_sha256_final,
-		.finup = riscv64_sha256_finup,
-		.descsize = sizeof(struct sha256_state),
-		.digestsize = SHA224_DIGEST_SIZE,
-		.base = {
-			.cra_blocksize = SHA224_BLOCK_SIZE,
-			.cra_priority = 300,
-			.cra_name = "sha224",
-			.cra_driver_name = "sha224-riscv64-zvknha_or_zvknhb-zvkb",
-			.cra_module = THIS_MODULE,
-		},
-	},
-};
-
-static int __init riscv64_sha256_mod_init(void)
-{
-	/* Both zvknha and zvknhb provide the SHA-256 instructions. */
-	if ((riscv_isa_extension_available(NULL, ZVKNHA) ||
-	     riscv_isa_extension_available(NULL, ZVKNHB)) &&
-	    riscv_isa_extension_available(NULL, ZVKB) &&
-	    riscv_vector_vlen() >= 128)
-		return crypto_register_shashes(riscv64_sha256_algs,
-					       ARRAY_SIZE(riscv64_sha256_algs));
-
-	return -ENODEV;
-}
-
-static void __exit riscv64_sha256_mod_exit(void)
-{
-	crypto_unregister_shashes(riscv64_sha256_algs,
-				  ARRAY_SIZE(riscv64_sha256_algs));
-}
-
-module_init(riscv64_sha256_mod_init);
-module_exit(riscv64_sha256_mod_exit);
-
-MODULE_DESCRIPTION("SHA-256 (RISC-V accelerated)");
-MODULE_AUTHOR("Heiko Stuebner <heiko.stuebner@vrull.eu>");
-MODULE_LICENSE("GPL");
-MODULE_ALIAS_CRYPTO("sha256");
-MODULE_ALIAS_CRYPTO("sha224");
diff --git a/arch/riscv/crypto/sha512-riscv64-glue.c b/arch/riscv/crypto/sha512-riscv64-glue.c
index 43b56a08aeb5..4634fca78ae2 100644
--- a/arch/riscv/crypto/sha512-riscv64-glue.c
+++ b/arch/riscv/crypto/sha512-riscv64-glue.c
@@ -14,7 +14,7 @@
 #include <crypto/internal/hash.h>
 #include <crypto/internal/simd.h>
 #include <crypto/sha512_base.h>
-#include <linux/linkage.h>
+#include <linux/kernel.h>
 #include <linux/module.h>
 
 /*
@@ -24,8 +24,8 @@
 asmlinkage void sha512_transform_zvknhb_zvkb(
 	struct sha512_state *state, const u8 *data, int num_blocks);
 
-static int riscv64_sha512_update(struct shash_desc *desc, const u8 *data,
-				 unsigned int len)
+static void sha512_block(struct sha512_state *state, const u8 *data,
+			 int num_blocks)
 {
 	/*
 	 * Ensure struct sha512_state begins directly with the SHA-512
@@ -35,35 +35,24 @@ static int riscv64_sha512_update(struct shash_desc *desc, const u8 *data,
 
 	if (crypto_simd_usable()) {
 		kernel_vector_begin();
-		sha512_base_do_update(desc, data, len,
-				      sha512_transform_zvknhb_zvkb);
+		sha512_transform_zvknhb_zvkb(state, data, num_blocks);
 		kernel_vector_end();
 	} else {
-		crypto_sha512_update(desc, data, len);
+		sha512_generic_block_fn(state, data, num_blocks);
 	}
-	return 0;
 }
 
-static int riscv64_sha512_finup(struct shash_desc *desc, const u8 *data,
-				unsigned int len, u8 *out)
+static int riscv64_sha512_update(struct shash_desc *desc, const u8 *data,
+				 unsigned int len)
 {
-	if (crypto_simd_usable()) {
-		kernel_vector_begin();
-		if (len)
-			sha512_base_do_update(desc, data, len,
-					      sha512_transform_zvknhb_zvkb);
-		sha512_base_do_finalize(desc, sha512_transform_zvknhb_zvkb);
-		kernel_vector_end();
-
-		return sha512_base_finish(desc, out);
-	}
-
-	return crypto_sha512_finup(desc, data, len, out);
+	return sha512_base_do_update_blocks(desc, data, len, sha512_block);
 }
 
-static int riscv64_sha512_final(struct shash_desc *desc, u8 *out)
+static int riscv64_sha512_finup(struct shash_desc *desc, const u8 *data,
+				unsigned int len, u8 *out)
 {
-	return riscv64_sha512_finup(desc, NULL, 0, out);
+	sha512_base_do_finup(desc, data, len, sha512_block);
+	return sha512_base_finish(desc, out);
 }
 
 static int riscv64_sha512_digest(struct shash_desc *desc, const u8 *data,
@@ -77,14 +66,15 @@ static struct shash_alg riscv64_sha512_algs[] = {
 	{
 		.init = sha512_base_init,
 		.update = riscv64_sha512_update,
-		.final = riscv64_sha512_final,
 		.finup = riscv64_sha512_finup,
 		.digest = riscv64_sha512_digest,
-		.descsize = sizeof(struct sha512_state),
+		.descsize = SHA512_STATE_SIZE,
 		.digestsize = SHA512_DIGEST_SIZE,
 		.base = {
 			.cra_blocksize = SHA512_BLOCK_SIZE,
 			.cra_priority = 300,
+			.cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY |
+				     CRYPTO_AHASH_ALG_FINUP_MAX,
 			.cra_name = "sha512",
 			.cra_driver_name = "sha512-riscv64-zvknhb-zvkb",
 			.cra_module = THIS_MODULE,
@@ -92,13 +82,14 @@ static struct shash_alg riscv64_sha512_algs[] = {
 	}, {
 		.init = sha384_base_init,
 		.update = riscv64_sha512_update,
-		.final = riscv64_sha512_final,
 		.finup = riscv64_sha512_finup,
-		.descsize = sizeof(struct sha512_state),
+		.descsize = SHA512_STATE_SIZE,
 		.digestsize = SHA384_DIGEST_SIZE,
 		.base = {
 			.cra_blocksize = SHA384_BLOCK_SIZE,
 			.cra_priority = 300,
+			.cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY |
+				     CRYPTO_AHASH_ALG_FINUP_MAX,
 			.cra_name = "sha384",
 			.cra_driver_name = "sha384-riscv64-zvknhb-zvkb",
 			.cra_module = THIS_MODULE,
diff --git a/arch/riscv/crypto/sha512-riscv64-zvknhb-zvkb.S b/arch/riscv/crypto/sha512-riscv64-zvknhb-zvkb.S
index 3a9ae210f915..89f4a10d12dd 100644
--- a/arch/riscv/crypto/sha512-riscv64-zvknhb-zvkb.S
+++ b/arch/riscv/crypto/sha512-riscv64-zvknhb-zvkb.S
@@ -43,7 +43,7 @@
 // - RISC-V Vector SHA-2 Secure Hash extension ('Zvknhb')
 // - RISC-V Vector Cryptography Bit-manipulation extension ('Zvkb')
 
-#include <linux/cfi_types.h>
+#include <linux/linkage.h>
 
 .text
 .option arch, +zvknhb, +zvkb
@@ -95,7 +95,7 @@
 
 // void sha512_transform_zvknhb_zvkb(u64 state[8], const u8 *data,
 //				     int num_blocks);
-SYM_TYPED_FUNC_START(sha512_transform_zvknhb_zvkb)
+SYM_FUNC_START(sha512_transform_zvknhb_zvkb)
 
 	// Setup mask for the vmerge to replace the first word (idx==0) in
 	// message scheduling.  There are 4 words, so an 8-bit mask suffices.
diff --git a/arch/riscv/crypto/sm3-riscv64-glue.c b/arch/riscv/crypto/sm3-riscv64-glue.c
index e1737a970c7c..abdfe4a63a27 100644
--- a/arch/riscv/crypto/sm3-riscv64-glue.c
+++ b/arch/riscv/crypto/sm3-riscv64-glue.c
@@ -13,8 +13,9 @@
 #include <asm/vector.h>
 #include <crypto/internal/hash.h>
 #include <crypto/internal/simd.h>
+#include <crypto/sm3.h>
 #include <crypto/sm3_base.h>
-#include <linux/linkage.h>
+#include <linux/kernel.h>
 #include <linux/module.h>
 
 /*
@@ -24,8 +25,8 @@
 asmlinkage void sm3_transform_zvksh_zvkb(
 	struct sm3_state *state, const u8 *data, int num_blocks);
 
-static int riscv64_sm3_update(struct shash_desc *desc, const u8 *data,
-			      unsigned int len)
+static void sm3_block(struct sm3_state *state, const u8 *data,
+		      int num_blocks)
 {
 	/*
 	 * Ensure struct sm3_state begins directly with the SM3
@@ -35,52 +36,36 @@ static int riscv64_sm3_update(struct shash_desc *desc, const u8 *data,
 
 	if (crypto_simd_usable()) {
 		kernel_vector_begin();
-		sm3_base_do_update(desc, data, len, sm3_transform_zvksh_zvkb);
+		sm3_transform_zvksh_zvkb(state, data, num_blocks);
 		kernel_vector_end();
 	} else {
-		sm3_update(shash_desc_ctx(desc), data, len);
+		sm3_block_generic(state, data, num_blocks);
 	}
-	return 0;
 }
 
-static int riscv64_sm3_finup(struct shash_desc *desc, const u8 *data,
-			     unsigned int len, u8 *out)
+static int riscv64_sm3_update(struct shash_desc *desc, const u8 *data,
+			      unsigned int len)
 {
-	struct sm3_state *ctx;
-
-	if (crypto_simd_usable()) {
-		kernel_vector_begin();
-		if (len)
-			sm3_base_do_update(desc, data, len,
-					   sm3_transform_zvksh_zvkb);
-		sm3_base_do_finalize(desc, sm3_transform_zvksh_zvkb);
-		kernel_vector_end();
-
-		return sm3_base_finish(desc, out);
-	}
-
-	ctx = shash_desc_ctx(desc);
-	if (len)
-		sm3_update(ctx, data, len);
-	sm3_final(ctx, out);
-
-	return 0;
+	return sm3_base_do_update_blocks(desc, data, len, sm3_block);
 }
 
-static int riscv64_sm3_final(struct shash_desc *desc, u8 *out)
+static int riscv64_sm3_finup(struct shash_desc *desc, const u8 *data,
+			     unsigned int len, u8 *out)
 {
-	return riscv64_sm3_finup(desc, NULL, 0, out);
+	sm3_base_do_finup(desc, data, len, sm3_block);
+	return sm3_base_finish(desc, out);
 }
 
 static struct shash_alg riscv64_sm3_alg = {
 	.init = sm3_base_init,
 	.update = riscv64_sm3_update,
-	.final = riscv64_sm3_final,
 	.finup = riscv64_sm3_finup,
-	.descsize = sizeof(struct sm3_state),
+	.descsize = SM3_STATE_SIZE,
 	.digestsize = SM3_DIGEST_SIZE,
 	.base = {
 		.cra_blocksize = SM3_BLOCK_SIZE,
+		.cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY |
+			     CRYPTO_AHASH_ALG_FINUP_MAX,
 		.cra_priority = 300,
 		.cra_name = "sm3",
 		.cra_driver_name = "sm3-riscv64-zvksh-zvkb",
diff --git a/arch/riscv/crypto/sm3-riscv64-zvksh-zvkb.S b/arch/riscv/crypto/sm3-riscv64-zvksh-zvkb.S
index a2b65d961c04..4fe754846f65 100644
--- a/arch/riscv/crypto/sm3-riscv64-zvksh-zvkb.S
+++ b/arch/riscv/crypto/sm3-riscv64-zvksh-zvkb.S
@@ -43,7 +43,7 @@
 // - RISC-V Vector SM3 Secure Hash extension ('Zvksh')
 // - RISC-V Vector Cryptography Bit-manipulation extension ('Zvkb')
 
-#include <linux/cfi_types.h>
+#include <linux/linkage.h>
 
 .text
 .option arch, +zvksh, +zvkb
@@ -81,7 +81,7 @@
 .endm
 
 // void sm3_transform_zvksh_zvkb(u32 state[8], const u8 *data, int num_blocks);
-SYM_TYPED_FUNC_START(sm3_transform_zvksh_zvkb)
+SYM_FUNC_START(sm3_transform_zvksh_zvkb)
 
 	// Load the state and endian-swap each 32-bit word.
 	vsetivli	zero, 8, e32, m2, ta, ma
diff --git a/arch/riscv/include/asm/cacheflush.h b/arch/riscv/include/asm/cacheflush.h
index 8de73f91bfa3..b59ffeb668d6 100644
--- a/arch/riscv/include/asm/cacheflush.h
+++ b/arch/riscv/include/asm/cacheflush.h
@@ -34,11 +34,6 @@ static inline void flush_dcache_page(struct page *page)
 	flush_dcache_folio(page_folio(page));
 }
 
-/*
- * RISC-V doesn't have an instruction to flush parts of the instruction cache,
- * so instead we just flush the whole thing.
- */
-#define flush_icache_range(start, end) flush_icache_all()
 #define flush_icache_user_page(vma, pg, addr, len)	\
 do {							\
 	if (vma->vm_flags & VM_EXEC)			\
@@ -78,6 +73,16 @@ void flush_icache_mm(struct mm_struct *mm, bool local);
 
 #endif /* CONFIG_SMP */
 
+/*
+ * RISC-V doesn't have an instruction to flush parts of the instruction cache,
+ * so instead we just flush the whole thing.
+ */
+#define flush_icache_range flush_icache_range
+static inline void flush_icache_range(unsigned long start, unsigned long end)
+{
+	flush_icache_all();
+}
+
 extern unsigned int riscv_cbom_block_size;
 extern unsigned int riscv_cboz_block_size;
 void riscv_init_cbo_blocksizes(void);
diff --git a/arch/riscv/kernel/Makefile b/arch/riscv/kernel/Makefile
index 8d186bfced45..f7480c9c6f8d 100644
--- a/arch/riscv/kernel/Makefile
+++ b/arch/riscv/kernel/Makefile
@@ -9,8 +9,8 @@ CFLAGS_REMOVE_patch.o	= $(CC_FLAGS_FTRACE)
 CFLAGS_REMOVE_sbi.o	= $(CC_FLAGS_FTRACE)
 CFLAGS_REMOVE_return_address.o	= $(CC_FLAGS_FTRACE)
 endif
-CFLAGS_syscall_table.o	+= $(call cc-option,-Wno-override-init,)
-CFLAGS_compat_syscall_table.o += $(call cc-option,-Wno-override-init,)
+CFLAGS_syscall_table.o	+= $(call cc-disable-warning, override-init)
+CFLAGS_compat_syscall_table.o += $(call cc-disable-warning, override-init)
 
 ifdef CONFIG_KEXEC_CORE
 AFLAGS_kexec_relocate.o := -mcmodel=medany $(call cc-option,-mno-relax)
diff --git a/arch/riscv/kernel/probes/uprobes.c b/arch/riscv/kernel/probes/uprobes.c
index 4b3dc8beaf77..cc15f7ca6cc1 100644
--- a/arch/riscv/kernel/probes/uprobes.c
+++ b/arch/riscv/kernel/probes/uprobes.c
@@ -167,6 +167,7 @@ void arch_uprobe_copy_ixol(struct page *page, unsigned long vaddr,
 	/* Initialize the slot */
 	void *kaddr = kmap_atomic(page);
 	void *dst = kaddr + (vaddr & ~PAGE_MASK);
+	unsigned long start = (unsigned long)dst;
 
 	memcpy(dst, src, len);
 
@@ -176,13 +177,6 @@ void arch_uprobe_copy_ixol(struct page *page, unsigned long vaddr,
 		*(uprobe_opcode_t *)dst = __BUG_INSN_32;
 	}
 
+	flush_icache_range(start, start + len);
 	kunmap_atomic(kaddr);
-
-	/*
-	 * We probably need flush_icache_user_page() but it needs vma.
-	 * This should work on most of architectures by default. If
-	 * architecture needs to do something different it can define
-	 * its own version of the function.
-	 */
-	flush_dcache_page(page);
 }
diff --git a/arch/riscv/kernel/process.c b/arch/riscv/kernel/process.c
index 7c244de77180..15d8f75902f8 100644
--- a/arch/riscv/kernel/process.c
+++ b/arch/riscv/kernel/process.c
@@ -275,6 +275,9 @@ long set_tagged_addr_ctrl(struct task_struct *task, unsigned long arg)
 	unsigned long pmm;
 	u8 pmlen;
 
+	if (!riscv_has_extension_unlikely(RISCV_ISA_EXT_SUPM))
+		return -EINVAL;
+
 	if (is_compat_thread(ti))
 		return -EINVAL;
 
@@ -330,6 +333,9 @@ long get_tagged_addr_ctrl(struct task_struct *task)
 	struct thread_info *ti = task_thread_info(task);
 	long ret = 0;
 
+	if (!riscv_has_extension_unlikely(RISCV_ISA_EXT_SUPM))
+		return -EINVAL;
+
 	if (is_compat_thread(ti))
 		return -EINVAL;
 
diff --git a/arch/riscv/kernel/traps.c b/arch/riscv/kernel/traps.c
index 8ff8e8b36524..9c83848797a7 100644
--- a/arch/riscv/kernel/traps.c
+++ b/arch/riscv/kernel/traps.c
@@ -198,47 +198,57 @@ asmlinkage __visible __trap_section void do_trap_insn_illegal(struct pt_regs *re
 DO_ERROR_INFO(do_trap_load_fault,
 	SIGSEGV, SEGV_ACCERR, "load access fault");
 
-asmlinkage __visible __trap_section void do_trap_load_misaligned(struct pt_regs *regs)
+enum misaligned_access_type {
+	MISALIGNED_STORE,
+	MISALIGNED_LOAD,
+};
+static const struct {
+	const char *type_str;
+	int (*handler)(struct pt_regs *regs);
+} misaligned_handler[] = {
+	[MISALIGNED_STORE] = {
+		.type_str = "Oops - store (or AMO) address misaligned",
+		.handler = handle_misaligned_store,
+	},
+	[MISALIGNED_LOAD] = {
+		.type_str = "Oops - load address misaligned",
+		.handler = handle_misaligned_load,
+	},
+};
+
+static void do_trap_misaligned(struct pt_regs *regs, enum misaligned_access_type type)
 {
+	irqentry_state_t state;
+
 	if (user_mode(regs)) {
 		irqentry_enter_from_user_mode(regs);
+		local_irq_enable();
+	} else {
+		state = irqentry_nmi_enter(regs);
+	}
 
-		if (handle_misaligned_load(regs))
-			do_trap_error(regs, SIGBUS, BUS_ADRALN, regs->epc,
-			      "Oops - load address misaligned");
+	if (misaligned_handler[type].handler(regs))
+		do_trap_error(regs, SIGBUS, BUS_ADRALN, regs->epc,
+			      misaligned_handler[type].type_str);
 
+	if (user_mode(regs)) {
+		local_irq_disable();
 		irqentry_exit_to_user_mode(regs);
 	} else {
-		irqentry_state_t state = irqentry_nmi_enter(regs);
-
-		if (handle_misaligned_load(regs))
-			do_trap_error(regs, SIGBUS, BUS_ADRALN, regs->epc,
-			      "Oops - load address misaligned");
-
 		irqentry_nmi_exit(regs, state);
 	}
 }
 
-asmlinkage __visible __trap_section void do_trap_store_misaligned(struct pt_regs *regs)
+asmlinkage __visible __trap_section void do_trap_load_misaligned(struct pt_regs *regs)
 {
-	if (user_mode(regs)) {
-		irqentry_enter_from_user_mode(regs);
-
-		if (handle_misaligned_store(regs))
-			do_trap_error(regs, SIGBUS, BUS_ADRALN, regs->epc,
-				"Oops - store (or AMO) address misaligned");
-
-		irqentry_exit_to_user_mode(regs);
-	} else {
-		irqentry_state_t state = irqentry_nmi_enter(regs);
-
-		if (handle_misaligned_store(regs))
-			do_trap_error(regs, SIGBUS, BUS_ADRALN, regs->epc,
-				"Oops - store (or AMO) address misaligned");
+	do_trap_misaligned(regs, MISALIGNED_LOAD);
+}
 
-		irqentry_nmi_exit(regs, state);
-	}
+asmlinkage __visible __trap_section void do_trap_store_misaligned(struct pt_regs *regs)
+{
+	do_trap_misaligned(regs, MISALIGNED_STORE);
 }
+
 DO_ERROR_INFO(do_trap_store_fault,
 	SIGSEGV, SEGV_ACCERR, "store (or AMO) access fault");
 DO_ERROR_INFO(do_trap_ecall_s,
diff --git a/arch/riscv/kernel/traps_misaligned.c b/arch/riscv/kernel/traps_misaligned.c
index 4354c87c0376..77c788660223 100644
--- a/arch/riscv/kernel/traps_misaligned.c
+++ b/arch/riscv/kernel/traps_misaligned.c
@@ -88,6 +88,13 @@
 #define INSN_MATCH_C_FSWSP		0xe002
 #define INSN_MASK_C_FSWSP		0xe003
 
+#define INSN_MATCH_C_LHU		0x8400
+#define INSN_MASK_C_LHU			0xfc43
+#define INSN_MATCH_C_LH			0x8440
+#define INSN_MASK_C_LH			0xfc43
+#define INSN_MATCH_C_SH			0x8c00
+#define INSN_MASK_C_SH			0xfc43
+
 #define INSN_LEN(insn)			((((insn) & 0x3) < 0x3) ? 2 : 4)
 
 #if defined(CONFIG_64BIT)
@@ -268,7 +275,7 @@ static unsigned long get_f32_rs(unsigned long insn, u8 fp_reg_offset,
 	int __ret;					\
 							\
 	if (user_mode(regs)) {				\
-		__ret = __get_user(insn, (type __user *) insn_addr); \
+		__ret = get_user(insn, (type __user *) insn_addr); \
 	} else {					\
 		insn = *(type *)insn_addr;		\
 		__ret = 0;				\
@@ -431,6 +438,13 @@ static int handle_scalar_misaligned_load(struct pt_regs *regs)
 		fp = 1;
 		len = 4;
 #endif
+	} else if ((insn & INSN_MASK_C_LHU) == INSN_MATCH_C_LHU) {
+		len = 2;
+		insn = RVC_RS2S(insn) << SH_RD;
+	} else if ((insn & INSN_MASK_C_LH) == INSN_MATCH_C_LH) {
+		len = 2;
+		shift = 8 * (sizeof(ulong) - len);
+		insn = RVC_RS2S(insn) << SH_RD;
 	} else {
 		regs->epc = epc;
 		return -1;
@@ -530,6 +544,9 @@ static int handle_scalar_misaligned_store(struct pt_regs *regs)
 		len = 4;
 		val.data_ulong = GET_F32_RS2C(insn, regs);
 #endif
+	} else if ((insn & INSN_MASK_C_SH) == INSN_MATCH_C_SH) {
+		len = 2;
+		val.data_ulong = GET_RS2S(insn, regs);
 	} else {
 		regs->epc = epc;
 		return -1;
diff --git a/arch/riscv/kvm/vcpu.c b/arch/riscv/kvm/vcpu.c
index 60d684c76c58..02635bac91f1 100644
--- a/arch/riscv/kvm/vcpu.c
+++ b/arch/riscv/kvm/vcpu.c
@@ -77,6 +77,8 @@ static void kvm_riscv_reset_vcpu(struct kvm_vcpu *vcpu)
 	memcpy(cntx, reset_cntx, sizeof(*cntx));
 	spin_unlock(&vcpu->arch.reset_cntx_lock);
 
+	memset(&vcpu->arch.smstateen_csr, 0, sizeof(vcpu->arch.smstateen_csr));
+
 	kvm_riscv_vcpu_fp_reset(vcpu);
 
 	kvm_riscv_vcpu_vector_reset(vcpu);
diff --git a/arch/riscv/lib/Makefile b/arch/riscv/lib/Makefile
index b1c46153606a..0baec92d2f55 100644
--- a/arch/riscv/lib/Makefile
+++ b/arch/riscv/lib/Makefile
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: GPL-2.0-only
+obj-y			+= crypto/
 lib-y			+= delay.o
 lib-y			+= memcpy.o
 lib-y			+= memset.o
diff --git a/arch/riscv/lib/crypto/Kconfig b/arch/riscv/lib/crypto/Kconfig
new file mode 100644
index 000000000000..47c99ea97ce2
--- /dev/null
+++ b/arch/riscv/lib/crypto/Kconfig
@@ -0,0 +1,16 @@
+# SPDX-License-Identifier: GPL-2.0-only
+
+config CRYPTO_CHACHA_RISCV64
+	tristate
+	depends on 64BIT && RISCV_ISA_V && TOOLCHAIN_HAS_VECTOR_CRYPTO
+	default CRYPTO_LIB_CHACHA
+	select CRYPTO_ARCH_HAVE_LIB_CHACHA
+	select CRYPTO_LIB_CHACHA_GENERIC
+
+config CRYPTO_SHA256_RISCV64
+	tristate
+	depends on 64BIT && RISCV_ISA_V && TOOLCHAIN_HAS_VECTOR_CRYPTO
+	default CRYPTO_LIB_SHA256
+	select CRYPTO_ARCH_HAVE_LIB_SHA256
+	select CRYPTO_ARCH_HAVE_LIB_SHA256_SIMD
+	select CRYPTO_LIB_SHA256_GENERIC
diff --git a/arch/riscv/lib/crypto/Makefile b/arch/riscv/lib/crypto/Makefile
new file mode 100644
index 000000000000..b7cb877a2c07
--- /dev/null
+++ b/arch/riscv/lib/crypto/Makefile
@@ -0,0 +1,7 @@
+# SPDX-License-Identifier: GPL-2.0-only
+
+obj-$(CONFIG_CRYPTO_CHACHA_RISCV64) += chacha-riscv64.o
+chacha-riscv64-y := chacha-riscv64-glue.o chacha-riscv64-zvkb.o
+
+obj-$(CONFIG_CRYPTO_SHA256_RISCV64) += sha256-riscv64.o
+sha256-riscv64-y := sha256.o sha256-riscv64-zvknha_or_zvknhb-zvkb.o
diff --git a/arch/riscv/lib/crypto/chacha-riscv64-glue.c b/arch/riscv/lib/crypto/chacha-riscv64-glue.c
new file mode 100644
index 000000000000..8c3f11d79be3
--- /dev/null
+++ b/arch/riscv/lib/crypto/chacha-riscv64-glue.c
@@ -0,0 +1,75 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * ChaCha stream cipher (RISC-V optimized)
+ *
+ * Copyright (C) 2023 SiFive, Inc.
+ * Author: Jerry Shih <jerry.shih@sifive.com>
+ */
+
+#include <asm/simd.h>
+#include <asm/vector.h>
+#include <crypto/chacha.h>
+#include <crypto/internal/simd.h>
+#include <linux/linkage.h>
+#include <linux/module.h>
+
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(use_zvkb);
+
+asmlinkage void chacha_zvkb(struct chacha_state *state, const u8 *in, u8 *out,
+			    size_t nblocks, int nrounds);
+
+void hchacha_block_arch(const struct chacha_state *state,
+			u32 out[HCHACHA_OUT_WORDS], int nrounds)
+{
+	hchacha_block_generic(state, out, nrounds);
+}
+EXPORT_SYMBOL(hchacha_block_arch);
+
+void chacha_crypt_arch(struct chacha_state *state, u8 *dst, const u8 *src,
+		       unsigned int bytes, int nrounds)
+{
+	u8 block_buffer[CHACHA_BLOCK_SIZE];
+	unsigned int full_blocks = bytes / CHACHA_BLOCK_SIZE;
+	unsigned int tail_bytes = bytes % CHACHA_BLOCK_SIZE;
+
+	if (!static_branch_likely(&use_zvkb) || !crypto_simd_usable())
+		return chacha_crypt_generic(state, dst, src, bytes, nrounds);
+
+	kernel_vector_begin();
+	if (full_blocks) {
+		chacha_zvkb(state, src, dst, full_blocks, nrounds);
+		src += full_blocks * CHACHA_BLOCK_SIZE;
+		dst += full_blocks * CHACHA_BLOCK_SIZE;
+	}
+	if (tail_bytes) {
+		memcpy(block_buffer, src, tail_bytes);
+		chacha_zvkb(state, block_buffer, block_buffer, 1, nrounds);
+		memcpy(dst, block_buffer, tail_bytes);
+	}
+	kernel_vector_end();
+}
+EXPORT_SYMBOL(chacha_crypt_arch);
+
+bool chacha_is_arch_optimized(void)
+{
+	return static_key_enabled(&use_zvkb);
+}
+EXPORT_SYMBOL(chacha_is_arch_optimized);
+
+static int __init riscv64_chacha_mod_init(void)
+{
+	if (riscv_isa_extension_available(NULL, ZVKB) &&
+	    riscv_vector_vlen() >= 128)
+		static_branch_enable(&use_zvkb);
+	return 0;
+}
+subsys_initcall(riscv64_chacha_mod_init);
+
+static void __exit riscv64_chacha_mod_exit(void)
+{
+}
+module_exit(riscv64_chacha_mod_exit);
+
+MODULE_DESCRIPTION("ChaCha stream cipher (RISC-V optimized)");
+MODULE_AUTHOR("Jerry Shih <jerry.shih@sifive.com>");
+MODULE_LICENSE("GPL");
diff --git a/arch/riscv/crypto/chacha-riscv64-zvkb.S b/arch/riscv/lib/crypto/chacha-riscv64-zvkb.S
index bf057737ac69..b777d0b4e379 100644
--- a/arch/riscv/crypto/chacha-riscv64-zvkb.S
+++ b/arch/riscv/lib/crypto/chacha-riscv64-zvkb.S
@@ -46,11 +46,11 @@
 .text
 .option arch, +zvkb
 
-#define KEYP		a0
+#define STATEP		a0
 #define INP		a1
 #define OUTP		a2
-#define LEN		a3
-#define IVP		a4
+#define NBLOCKS		a3
+#define NROUNDS		a4
 
 #define CONSTS0		a5
 #define CONSTS1		a6
@@ -59,7 +59,7 @@
 #define TMP		t1
 #define VL		t2
 #define STRIDE		t3
-#define NROUNDS		t4
+#define ROUND_CTR	t4
 #define KEY0		s0
 #define KEY1		s1
 #define KEY2		s2
@@ -132,14 +132,16 @@
 	vror.vi		\b3, \b3, 32 - 7
 .endm
 
-// void chacha20_zvkb(const u32 key[8], const u8 *in, u8 *out, size_t len,
-//		      const u32 iv[4]);
+// void chacha_zvkb(struct chacha_state *state, const u8 *in, u8 *out,
+//		    size_t nblocks, int nrounds);
 //
-// |len| must be nonzero and a multiple of 64 (CHACHA_BLOCK_SIZE).
-// The counter is treated as 32-bit, following the RFC7539 convention.
-SYM_FUNC_START(chacha20_zvkb)
-	srli		LEN, LEN, 6	// Bytes to blocks
-
+// |nblocks| is the number of 64-byte blocks to process, and must be nonzero.
+//
+// |state| gives the ChaCha state matrix, including the 32-bit counter in
+// state->x[12] following the RFC7539 convention; note that this differs from
+// the original Salsa20 paper which uses a 64-bit counter in state->x[12..13].
+// The updated 32-bit counter is written back to state->x[12] before returning.
+SYM_FUNC_START(chacha_zvkb)
 	addi		sp, sp, -96
 	sd		s0, 0(sp)
 	sd		s1, 8(sp)
@@ -157,26 +159,26 @@ SYM_FUNC_START(chacha20_zvkb)
 	li		STRIDE, 64
 
 	// Set up the initial state matrix in scalar registers.
-	li		CONSTS0, 0x61707865	// "expa" little endian
-	li		CONSTS1, 0x3320646e	// "nd 3" little endian
-	li		CONSTS2, 0x79622d32	// "2-by" little endian
-	li		CONSTS3, 0x6b206574	// "te k" little endian
-	lw		KEY0, 0(KEYP)
-	lw		KEY1, 4(KEYP)
-	lw		KEY2, 8(KEYP)
-	lw		KEY3, 12(KEYP)
-	lw		KEY4, 16(KEYP)
-	lw		KEY5, 20(KEYP)
-	lw		KEY6, 24(KEYP)
-	lw		KEY7, 28(KEYP)
-	lw		COUNTER, 0(IVP)
-	lw		NONCE0, 4(IVP)
-	lw		NONCE1, 8(IVP)
-	lw		NONCE2, 12(IVP)
+	lw		CONSTS0, 0(STATEP)
+	lw		CONSTS1, 4(STATEP)
+	lw		CONSTS2, 8(STATEP)
+	lw		CONSTS3, 12(STATEP)
+	lw		KEY0, 16(STATEP)
+	lw		KEY1, 20(STATEP)
+	lw		KEY2, 24(STATEP)
+	lw		KEY3, 28(STATEP)
+	lw		KEY4, 32(STATEP)
+	lw		KEY5, 36(STATEP)
+	lw		KEY6, 40(STATEP)
+	lw		KEY7, 44(STATEP)
+	lw		COUNTER, 48(STATEP)
+	lw		NONCE0, 52(STATEP)
+	lw		NONCE1, 56(STATEP)
+	lw		NONCE2, 60(STATEP)
 
 .Lblock_loop:
 	// Set vl to the number of blocks to process in this iteration.
-	vsetvli		VL, LEN, e32, m1, ta, ma
+	vsetvli		VL, NBLOCKS, e32, m1, ta, ma
 
 	// Set up the initial state matrix for the next VL blocks in v0-v15.
 	// v{i} holds the i'th 32-bit word of the state matrix for all blocks.
@@ -203,16 +205,16 @@ SYM_FUNC_START(chacha20_zvkb)
 	// v{16+i} holds the i'th 32-bit word for all blocks.
 	vlsseg8e32.v	v16, (INP), STRIDE
 
-	li		NROUNDS, 20
+	mv		ROUND_CTR, NROUNDS
 .Lnext_doubleround:
-	addi		NROUNDS, NROUNDS, -2
+	addi		ROUND_CTR, ROUND_CTR, -2
 	// column round
 	chacha_round	v0, v4, v8, v12, v1, v5, v9, v13, \
 			v2, v6, v10, v14, v3, v7, v11, v15
 	// diagonal round
 	chacha_round	v0, v5, v10, v15, v1, v6, v11, v12, \
 			v2, v7, v8, v13, v3, v4, v9, v14
-	bnez		NROUNDS, .Lnext_doubleround
+	bnez		ROUND_CTR, .Lnext_doubleround
 
 	// Load the second half of the input data for each block into v24-v31.
 	// v{24+i} holds the {8+i}'th 32-bit word for all blocks.
@@ -271,12 +273,13 @@ SYM_FUNC_START(chacha20_zvkb)
 	// Update the counter, the remaining number of blocks, and the input and
 	// output pointers according to the number of blocks processed (VL).
 	add		COUNTER, COUNTER, VL
-	sub		LEN, LEN, VL
+	sub		NBLOCKS, NBLOCKS, VL
 	slli		TMP, VL, 6
 	add		OUTP, OUTP, TMP
 	add		INP, INP, TMP
-	bnez		LEN, .Lblock_loop
+	bnez		NBLOCKS, .Lblock_loop
 
+	sw		COUNTER, 48(STATEP)
 	ld		s0, 0(sp)
 	ld		s1, 8(sp)
 	ld		s2, 16(sp)
@@ -291,4 +294,4 @@ SYM_FUNC_START(chacha20_zvkb)
 	ld		s11, 88(sp)
 	addi		sp, sp, 96
 	ret
-SYM_FUNC_END(chacha20_zvkb)
+SYM_FUNC_END(chacha_zvkb)
diff --git a/arch/riscv/crypto/sha256-riscv64-zvknha_or_zvknhb-zvkb.S b/arch/riscv/lib/crypto/sha256-riscv64-zvknha_or_zvknhb-zvkb.S
index 8ebcc17de4dc..fad501ad0617 100644
--- a/arch/riscv/crypto/sha256-riscv64-zvknha_or_zvknhb-zvkb.S
+++ b/arch/riscv/lib/crypto/sha256-riscv64-zvknha_or_zvknhb-zvkb.S
@@ -43,7 +43,7 @@
 // - RISC-V Vector SHA-2 Secure Hash extension ('Zvknha' or 'Zvknhb')
 // - RISC-V Vector Cryptography Bit-manipulation extension ('Zvkb')
 
-#include <linux/cfi_types.h>
+#include <linux/linkage.h>
 
 .text
 .option arch, +zvknha, +zvkb
@@ -106,9 +106,9 @@
 	sha256_4rounds	\last, \k3, W3, W0, W1, W2
 .endm
 
-// void sha256_transform_zvknha_or_zvknhb_zvkb(u32 state[8], const u8 *data,
-//					       int num_blocks);
-SYM_TYPED_FUNC_START(sha256_transform_zvknha_or_zvknhb_zvkb)
+// void sha256_transform_zvknha_or_zvknhb_zvkb(u32 state[SHA256_STATE_WORDS],
+//					       const u8 *data, size_t nblocks);
+SYM_FUNC_START(sha256_transform_zvknha_or_zvknhb_zvkb)
 
 	// Load the round constants into K0-K15.
 	vsetivli	zero, 4, e32, m1, ta, ma
diff --git a/arch/riscv/lib/crypto/sha256.c b/arch/riscv/lib/crypto/sha256.c
new file mode 100644
index 000000000000..71808397dff4
--- /dev/null
+++ b/arch/riscv/lib/crypto/sha256.c
@@ -0,0 +1,86 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * SHA-256 (RISC-V accelerated)
+ *
+ * Copyright (C) 2022 VRULL GmbH
+ * Author: Heiko Stuebner <heiko.stuebner@vrull.eu>
+ *
+ * Copyright (C) 2023 SiFive, Inc.
+ * Author: Jerry Shih <jerry.shih@sifive.com>
+ */
+
+#include <asm/vector.h>
+#include <crypto/internal/sha2.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+
+/* Implemented in sha256-riscv64-zvknha_or_zvknhb-zvkb.S. */
+asmlinkage void sha256_transform_zvknha_or_zvknhb_zvkb(
+	u32 state[SHA256_STATE_WORDS], const u8 *data, size_t nblocks);
+
+/* Turned on at init time when the required vector extensions are present. */
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_extensions);
+
+/*
+ * Process nblocks blocks from data into state.  The vector assembly routine
+ * is used when have_extensions is enabled; otherwise this falls back to
+ * sha256_blocks_generic().
+ */
+void sha256_blocks_simd(u32 state[SHA256_STATE_WORDS],
+			const u8 *data, size_t nblocks)
+{
+	if (static_branch_likely(&have_extensions)) {
+		kernel_vector_begin();
+		sha256_transform_zvknha_or_zvknhb_zvkb(state, data, nblocks);
+		kernel_vector_end();
+		return;
+	}
+
+	/* Required extensions not enabled: use the generic code. */
+	sha256_blocks_generic(state, data, nblocks);
+}
+EXPORT_SYMBOL_GPL(sha256_blocks_simd);
+
+/* Process nblocks blocks without the vector unit: always the generic code. */
+void sha256_blocks_arch(u32 state[SHA256_STATE_WORDS],
+			const u8 *data, size_t nblocks)
+{
+	sha256_blocks_generic(state, data, nblocks);
+}
+EXPORT_SYMBOL_GPL(sha256_blocks_arch);
+
+/* Report whether the vector implementation was enabled at init time. */
+bool sha256_is_arch_optimized(void)
+{
+	return static_key_enabled(&have_extensions);
+}
+EXPORT_SYMBOL_GPL(sha256_is_arch_optimized);
+
+/*
+ * Enable the vector implementation when the CPU provides the SHA-256 vector
+ * instructions plus Zvkb, and riscv_vector_vlen() is at least 128.
+ */
+static int __init riscv64_sha256_mod_init(void)
+{
+	bool have_sha256;
+
+	/* Both zvknha and zvknhb provide the SHA-256 instructions. */
+	have_sha256 = riscv_isa_extension_available(NULL, ZVKNHA) ||
+		      riscv_isa_extension_available(NULL, ZVKNHB);
+
+	if (have_sha256 && riscv_isa_extension_available(NULL, ZVKB) &&
+	    riscv_vector_vlen() >= 128)
+		static_branch_enable(&have_extensions);
+	return 0;
+}
+subsys_initcall(riscv64_sha256_mod_init);
+
+/* Nothing to undo on exit. */
+static void __exit riscv64_sha256_mod_exit(void)
+{
+}
+module_exit(riscv64_sha256_mod_exit);
+
+MODULE_DESCRIPTION("SHA-256 (RISC-V accelerated)");
+MODULE_AUTHOR("Heiko Stuebner <heiko.stuebner@vrull.eu>");
+MODULE_LICENSE("GPL");
diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
index 99fb986fca6e..0c16dc443e2f 100644
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -146,6 +146,7 @@ config S390
 	select ARCH_WANTS_NO_INSTR
 	select ARCH_WANT_DEFAULT_BPF_JIT
 	select ARCH_WANT_IPC_PARSE_VERSION
+	select ARCH_WANT_IRQS_OFF_ACTIVATE_MM
 	select ARCH_WANT_KERNEL_PMD_MKWRITE
 	select ARCH_WANT_LD_ORPHAN_WARN
 	select ARCH_WANT_OPTIMIZE_HUGETLB_VMEMMAP
diff --git a/arch/s390/boot/ipl_parm.c b/arch/s390/boot/ipl_parm.c
index d04e9b89d14a..f584d7da29cb 100644
--- a/arch/s390/boot/ipl_parm.c
+++ b/arch/s390/boot/ipl_parm.c
@@ -179,7 +179,7 @@ void setup_boot_command_line(void)
 	if (has_ebcdic_char(parmarea.command_line))
 		EBCASC(parmarea.command_line, COMMAND_LINE_SIZE);
 	/* copy arch command line */
-	strcpy(early_command_line, strim(parmarea.command_line));
+	strscpy(early_command_line, strim(parmarea.command_line));
 
 	/* append IPL PARM data to the boot command line */
 	if (!is_prot_virt_guest() && ipl_block_valid)
@@ -253,7 +253,8 @@ void parse_boot_command_line(void)
 	int rc;
 
 	__kaslr_enabled = IS_ENABLED(CONFIG_RANDOMIZE_BASE);
-	args = strcpy(command_line_buf, early_command_line);
+	strscpy(command_line_buf, early_command_line);
+	args = command_line_buf;
 	while (*args) {
 		args = next_arg(args, &param, &val);
 
@@ -309,7 +310,7 @@ void parse_boot_command_line(void)
 		if (!strcmp(param, "bootdebug")) {
 			bootdebug = true;
 			if (val)
-				strncpy(bootdebug_filter, val, sizeof(bootdebug_filter) - 1);
+				strscpy(bootdebug_filter, val);
 		}
 		if (!strcmp(param, "quiet"))
 			boot_console_loglevel = CONSOLE_LOGLEVEL_QUIET;
diff --git a/arch/s390/boot/printk.c b/arch/s390/boot/printk.c
index 8cf6331bc060..4bb6bc95704e 100644
--- a/arch/s390/boot/printk.c
+++ b/arch/s390/boot/printk.c
@@ -29,7 +29,8 @@ static void boot_rb_add(const char *str, size_t len)
 	/* store strings separated by '\0' */
 	if (len + 1 > avail)
 		boot_rb_off = 0;
-	strcpy(boot_rb + boot_rb_off, str);
+	avail = sizeof(boot_rb) - boot_rb_off - 1;
+	strscpy(boot_rb + boot_rb_off, str, avail);
 	boot_rb_off += len + 1;
 }
 
@@ -158,10 +159,10 @@ static noinline char *strsym(char *buf, void *ip)
 
 	p = findsym((unsigned long)ip, &off, &len);
 	if (p) {
-		strncpy(buf, p, MAX_SYMLEN);
+		strscpy(buf, p, MAX_SYMLEN);
 		/* reserve 15 bytes for offset/len in symbol+0x1234/0x1234 */
 		p = buf + strnlen(buf, MAX_SYMLEN - 15);
-		strcpy(p, "+0x");
+		strscpy(p, "+0x", MAX_SYMLEN - (p - buf));
 		as_hex(p + 3, off, 0);
 		strcat(p, "/0x");
 		as_hex(p + strlen(p), len, 0);
diff --git a/arch/s390/boot/startup.c b/arch/s390/boot/startup.c
index 06316fb8e0fa..da8337e63a3e 100644
--- a/arch/s390/boot/startup.c
+++ b/arch/s390/boot/startup.c
@@ -6,6 +6,7 @@
 #include <asm/boot_data.h>
 #include <asm/extmem.h>
 #include <asm/sections.h>
+#include <asm/diag288.h>
 #include <asm/maccess.h>
 #include <asm/machine.h>
 #include <asm/sysinfo.h>
@@ -71,6 +72,33 @@ static void detect_machine_type(void)
 		set_machine_feature(MFEATURE_VM);
 }
 
+/*
+ * Probe for diag 288 by issuing a WDT_FUNC_INIT request with MIN_INTERVAL.
+ * If that call returns zero, the request is cancelled again right away and
+ * MFEATURE_DIAG288 is set; otherwise the feature is left unset.
+ *
+ * Must run after detect_machine_type(), since the arguments depend on
+ * machine_is_vm().
+ */
+static void detect_diag288(void)
+{
+	/* "BEGIN" in EBCDIC character set */
+	static const char cmd[] = "\xc2\xc5\xc7\xc9\xd5";
+	unsigned long action = LPARWDT_RESTART;
+	unsigned long len = 0;
+
+	/* Under VM, pass the command string and its size instead. */
+	if (machine_is_vm()) {
+		action = (unsigned long)cmd;
+		len = sizeof(cmd);
+	}
+
+	if (__diag288(WDT_FUNC_INIT, MIN_INTERVAL, action, len))
+		return;
+	__diag288(WDT_FUNC_CANCEL, 0, 0, 0);
+	set_machine_feature(MFEATURE_DIAG288);
+}
+
 static void detect_diag9c(void)
 {
 	unsigned int cpu;
@@ -519,6 +547,8 @@ void startup_kernel(void)
 	detect_facilities();
 	detect_diag9c();
 	detect_machine_type();
+	/* detect_diag288() needs machine type */
+	detect_diag288();
 	cmma_init();
 	sanitize_prot_virt_host();
 	max_physmem_end = detect_max_physmem_end();
diff --git a/arch/s390/boot/string.c b/arch/s390/boot/string.c
index f6b9b1df48a8..bd68161434a6 100644
--- a/arch/s390/boot/string.c
+++ b/arch/s390/boot/string.c
@@ -29,6 +29,29 @@ int strncmp(const char *cs, const char *ct, size_t count)
 	return 0;
 }
 
+/*
+ * Copy the NUL-terminated string src into dst, which is count bytes in size.
+ * At most count - 1 characters are copied and dst is always NUL-terminated
+ * when count is non-zero.
+ *
+ * Returns the number of characters copied (without the trailing NUL), or
+ * -E2BIG if count is zero or src had to be truncated.
+ */
+ssize_t sized_strscpy(char *dst, const char *src, size_t count)
+{
+	size_t copied;
+
+	if (!count)
+		return -E2BIG;
+
+	copied = strnlen(src, count - 1);
+	memcpy(dst, src, copied);
+	dst[copied] = '\0';
+	if (src[copied])
+		return -E2BIG;
+	return copied;
+}
+
 void *memset64(uint64_t *s, uint64_t v, size_t count)
 {
 	uint64_t *xs = s;
diff --git a/arch/s390/configs/debug_defconfig b/arch/s390/configs/debug_defconfig
index 6f2c9ce1b154..8ecad727497e 100644
--- a/arch/s390/configs/debug_defconfig
+++ b/arch/s390/configs/debug_defconfig
@@ -38,7 +38,6 @@ CONFIG_USER_NS=y
 CONFIG_CHECKPOINT_RESTORE=y
 CONFIG_SCHED_AUTOGROUP=y
 CONFIG_EXPERT=y
-# CONFIG_SYSFS_SYSCALL is not set
 CONFIG_PROFILING=y
 CONFIG_KEXEC=y
 CONFIG_KEXEC_FILE=y
@@ -92,7 +91,6 @@ CONFIG_UNIXWARE_DISKLABEL=y
 CONFIG_IOSCHED_BFQ=y
 CONFIG_BINFMT_MISC=m
 CONFIG_ZSWAP=y
-CONFIG_ZSMALLOC=y
 CONFIG_ZSMALLOC_STAT=y
 CONFIG_SLAB_BUCKETS=y
 CONFIG_SLUB_STATS=y
@@ -395,6 +393,9 @@ CONFIG_CLS_U32_MARK=y
 CONFIG_NET_CLS_FLOW=m
 CONFIG_NET_CLS_CGROUP=y
 CONFIG_NET_CLS_BPF=m
+CONFIG_NET_CLS_FLOWER=m
+CONFIG_NET_CLS_MATCHALL=m
+CONFIG_NET_EMATCH=y
 CONFIG_NET_CLS_ACT=y
 CONFIG_NET_ACT_POLICE=m
 CONFIG_NET_ACT_GACT=m
@@ -405,6 +406,9 @@ CONFIG_NET_ACT_PEDIT=m
 CONFIG_NET_ACT_SIMP=m
 CONFIG_NET_ACT_SKBEDIT=m
 CONFIG_NET_ACT_CSUM=m
+CONFIG_NET_ACT_VLAN=m
+CONFIG_NET_ACT_TUNNEL_KEY=m
+CONFIG_NET_ACT_CT=m
 CONFIG_NET_ACT_GATE=m
 CONFIG_NET_TC_SKB_EXT=y
 CONFIG_DNS_RESOLVER=y
@@ -628,8 +632,16 @@ CONFIG_VIRTIO_PCI=m
 CONFIG_VIRTIO_BALLOON=m
 CONFIG_VIRTIO_MEM=m
 CONFIG_VIRTIO_INPUT=y
+CONFIG_VDPA=m
+CONFIG_VDPA_SIM=m
+CONFIG_VDPA_SIM_NET=m
+CONFIG_VDPA_SIM_BLOCK=m
+CONFIG_VDPA_USER=m
+CONFIG_MLX5_VDPA_NET=m
+CONFIG_VP_VDPA=m
 CONFIG_VHOST_NET=m
 CONFIG_VHOST_VSOCK=m
+CONFIG_VHOST_VDPA=m
 CONFIG_EXT4_FS=y
 CONFIG_EXT4_FS_POSIX_ACL=y
 CONFIG_EXT4_FS_SECURITY=y
@@ -654,7 +666,6 @@ CONFIG_NILFS2_FS=m
 CONFIG_BCACHEFS_FS=y
 CONFIG_BCACHEFS_QUOTA=y
 CONFIG_BCACHEFS_POSIX_ACL=y
-CONFIG_FS_DAX=y
 CONFIG_EXPORTFS_BLOCK_OPS=y
 CONFIG_FS_ENCRYPTION=y
 CONFIG_FS_VERITY=y
@@ -724,11 +735,10 @@ CONFIG_NLS_UTF8=m
 CONFIG_DLM=m
 CONFIG_UNICODE=y
 CONFIG_PERSISTENT_KEYRINGS=y
+CONFIG_BIG_KEYS=y
 CONFIG_ENCRYPTED_KEYS=m
 CONFIG_KEY_NOTIFICATIONS=y
 CONFIG_SECURITY=y
-CONFIG_HARDENED_USERCOPY=y
-CONFIG_FORTIFY_SOURCE=y
 CONFIG_SECURITY_SELINUX=y
 CONFIG_SECURITY_SELINUX_BOOTPARAM=y
 CONFIG_SECURITY_LOCKDOWN_LSM=y
@@ -741,12 +751,14 @@ CONFIG_IMA=y
 CONFIG_IMA_DEFAULT_HASH_SHA256=y
 CONFIG_IMA_WRITE_POLICY=y
 CONFIG_IMA_APPRAISE=y
+CONFIG_FORTIFY_SOURCE=y
+CONFIG_HARDENED_USERCOPY=y
 CONFIG_BUG_ON_DATA_CORRUPTION=y
 CONFIG_CRYPTO_USER=m
-# CONFIG_CRYPTO_MANAGER_DISABLE_TESTS is not set
+CONFIG_CRYPTO_SELFTESTS=y
 CONFIG_CRYPTO_PCRYPT=m
 CONFIG_CRYPTO_CRYPTD=m
-CONFIG_CRYPTO_TEST=m
+CONFIG_CRYPTO_BENCHMARK=m
 CONFIG_CRYPTO_DH=m
 CONFIG_CRYPTO_ECDH=m
 CONFIG_CRYPTO_ECDSA=m
@@ -756,7 +768,6 @@ CONFIG_CRYPTO_AES_TI=m
 CONFIG_CRYPTO_ANUBIS=m
 CONFIG_CRYPTO_ARIA=m
 CONFIG_CRYPTO_BLOWFISH=m
-CONFIG_CRYPTO_CAMELLIA=m
 CONFIG_CRYPTO_CAST5=m
 CONFIG_CRYPTO_CAST6=m
 CONFIG_CRYPTO_DES=m
@@ -795,13 +806,11 @@ CONFIG_CRYPTO_USER_API_RNG=m
 CONFIG_CRYPTO_USER_API_AEAD=m
 CONFIG_CRYPTO_SHA512_S390=m
 CONFIG_CRYPTO_SHA1_S390=m
-CONFIG_CRYPTO_SHA256_S390=m
 CONFIG_CRYPTO_SHA3_256_S390=m
 CONFIG_CRYPTO_SHA3_512_S390=m
 CONFIG_CRYPTO_GHASH_S390=m
 CONFIG_CRYPTO_AES_S390=m
 CONFIG_CRYPTO_DES_S390=m
-CONFIG_CRYPTO_CHACHA_S390=m
 CONFIG_CRYPTO_HMAC_S390=m
 CONFIG_ZCRYPT=m
 CONFIG_PKEY=m
@@ -812,9 +821,9 @@ CONFIG_PKEY_UV=m
 CONFIG_CRYPTO_PAES_S390=m
 CONFIG_CRYPTO_DEV_VIRTIO=m
 CONFIG_SYSTEM_BLACKLIST_KEYRING=y
+CONFIG_CRYPTO_KRB5=m
+CONFIG_CRYPTO_KRB5_SELFTESTS=y
 CONFIG_CORDIC=m
-CONFIG_CRYPTO_LIB_CURVE25519=m
-CONFIG_CRYPTO_LIB_CHACHA20POLY1305=m
 CONFIG_RANDOM32_SELFTEST=y
 CONFIG_XZ_DEC_MICROLZMA=y
 CONFIG_DMA_CMA=y
diff --git a/arch/s390/configs/defconfig b/arch/s390/configs/defconfig
index f18a7d97ac21..c13a77765162 100644
--- a/arch/s390/configs/defconfig
+++ b/arch/s390/configs/defconfig
@@ -36,7 +36,6 @@ CONFIG_USER_NS=y
 CONFIG_CHECKPOINT_RESTORE=y
 CONFIG_SCHED_AUTOGROUP=y
 CONFIG_EXPERT=y
-# CONFIG_SYSFS_SYSCALL is not set
 CONFIG_PROFILING=y
 CONFIG_KEXEC=y
 CONFIG_KEXEC_FILE=y
@@ -86,7 +85,6 @@ CONFIG_UNIXWARE_DISKLABEL=y
 CONFIG_IOSCHED_BFQ=y
 CONFIG_BINFMT_MISC=m
 CONFIG_ZSWAP=y
-CONFIG_ZSMALLOC=y
 CONFIG_ZSMALLOC_STAT=y
 CONFIG_SLAB_BUCKETS=y
 # CONFIG_COMPAT_BRK is not set
@@ -385,6 +383,9 @@ CONFIG_CLS_U32_MARK=y
 CONFIG_NET_CLS_FLOW=m
 CONFIG_NET_CLS_CGROUP=y
 CONFIG_NET_CLS_BPF=m
+CONFIG_NET_CLS_FLOWER=m
+CONFIG_NET_CLS_MATCHALL=m
+CONFIG_NET_EMATCH=y
 CONFIG_NET_CLS_ACT=y
 CONFIG_NET_ACT_POLICE=m
 CONFIG_NET_ACT_GACT=m
@@ -395,6 +396,9 @@ CONFIG_NET_ACT_PEDIT=m
 CONFIG_NET_ACT_SIMP=m
 CONFIG_NET_ACT_SKBEDIT=m
 CONFIG_NET_ACT_CSUM=m
+CONFIG_NET_ACT_VLAN=m
+CONFIG_NET_ACT_TUNNEL_KEY=m
+CONFIG_NET_ACT_CT=m
 CONFIG_NET_ACT_GATE=m
 CONFIG_NET_TC_SKB_EXT=y
 CONFIG_DNS_RESOLVER=y
@@ -618,8 +622,16 @@ CONFIG_VIRTIO_PCI=m
 CONFIG_VIRTIO_BALLOON=m
 CONFIG_VIRTIO_MEM=m
 CONFIG_VIRTIO_INPUT=y
+CONFIG_VDPA=m
+CONFIG_VDPA_SIM=m
+CONFIG_VDPA_SIM_NET=m
+CONFIG_VDPA_SIM_BLOCK=m
+CONFIG_VDPA_USER=m
+CONFIG_MLX5_VDPA_NET=m
+CONFIG_VP_VDPA=m
 CONFIG_VHOST_NET=m
 CONFIG_VHOST_VSOCK=m
+CONFIG_VHOST_VDPA=m
 CONFIG_EXT4_FS=y
 CONFIG_EXT4_FS_POSIX_ACL=y
 CONFIG_EXT4_FS_SECURITY=y
@@ -641,7 +653,6 @@ CONFIG_NILFS2_FS=m
 CONFIG_BCACHEFS_FS=m
 CONFIG_BCACHEFS_QUOTA=y
 CONFIG_BCACHEFS_POSIX_ACL=y
-CONFIG_FS_DAX=y
 CONFIG_EXPORTFS_BLOCK_OPS=y
 CONFIG_FS_ENCRYPTION=y
 CONFIG_FS_VERITY=y
@@ -711,6 +722,7 @@ CONFIG_NLS_UTF8=m
 CONFIG_DLM=m
 CONFIG_UNICODE=y
 CONFIG_PERSISTENT_KEYRINGS=y
+CONFIG_BIG_KEYS=y
 CONFIG_ENCRYPTED_KEYS=m
 CONFIG_KEY_NOTIFICATIONS=y
 CONFIG_SECURITY=y
@@ -729,10 +741,10 @@ CONFIG_IMA_APPRAISE=y
 CONFIG_BUG_ON_DATA_CORRUPTION=y
 CONFIG_CRYPTO_FIPS=y
 CONFIG_CRYPTO_USER=m
-# CONFIG_CRYPTO_MANAGER_DISABLE_TESTS is not set
+CONFIG_CRYPTO_SELFTESTS=y
 CONFIG_CRYPTO_PCRYPT=m
 CONFIG_CRYPTO_CRYPTD=m
-CONFIG_CRYPTO_TEST=m
+CONFIG_CRYPTO_BENCHMARK=m
 CONFIG_CRYPTO_DH=m
 CONFIG_CRYPTO_ECDH=m
 CONFIG_CRYPTO_ECDSA=m
@@ -742,7 +754,6 @@ CONFIG_CRYPTO_AES_TI=m
 CONFIG_CRYPTO_ANUBIS=m
 CONFIG_CRYPTO_ARIA=m
 CONFIG_CRYPTO_BLOWFISH=m
-CONFIG_CRYPTO_CAMELLIA=m
 CONFIG_CRYPTO_CAST5=m
 CONFIG_CRYPTO_CAST6=m
 CONFIG_CRYPTO_DES=m
@@ -782,13 +793,11 @@ CONFIG_CRYPTO_USER_API_RNG=m
 CONFIG_CRYPTO_USER_API_AEAD=m
 CONFIG_CRYPTO_SHA512_S390=m
 CONFIG_CRYPTO_SHA1_S390=m
-CONFIG_CRYPTO_SHA256_S390=m
 CONFIG_CRYPTO_SHA3_256_S390=m
 CONFIG_CRYPTO_SHA3_512_S390=m
 CONFIG_CRYPTO_GHASH_S390=m
 CONFIG_CRYPTO_AES_S390=m
 CONFIG_CRYPTO_DES_S390=m
-CONFIG_CRYPTO_CHACHA_S390=m
 CONFIG_CRYPTO_HMAC_S390=m
 CONFIG_ZCRYPT=m
 CONFIG_PKEY=m
@@ -799,10 +808,10 @@ CONFIG_PKEY_UV=m
 CONFIG_CRYPTO_PAES_S390=m
 CONFIG_CRYPTO_DEV_VIRTIO=m
 CONFIG_SYSTEM_BLACKLIST_KEYRING=y
+CONFIG_CRYPTO_KRB5=m
+CONFIG_CRYPTO_KRB5_SELFTESTS=y
 CONFIG_CORDIC=m
 CONFIG_PRIME_NUMBERS=m
-CONFIG_CRYPTO_LIB_CURVE25519=m
-CONFIG_CRYPTO_LIB_CHACHA20POLY1305=m
 CONFIG_XZ_DEC_MICROLZMA=y
 CONFIG_DMA_CMA=y
 CONFIG_CMA_SIZE_MBYTES=0
diff --git a/arch/s390/configs/zfcpdump_defconfig b/arch/s390/configs/zfcpdump_defconfig
index 853b2326a171..8163c1702720 100644
--- a/arch/s390/configs/zfcpdump_defconfig
+++ b/arch/s390/configs/zfcpdump_defconfig
@@ -70,7 +70,6 @@ CONFIG_DEBUG_KERNEL=y
 CONFIG_DEBUG_INFO_DWARF4=y
 CONFIG_DEBUG_FS=y
 CONFIG_PANIC_ON_OOPS=y
-# CONFIG_SCHED_DEBUG is not set
 CONFIG_RCU_CPU_STALL_TIMEOUT=60
 # CONFIG_RCU_TRACE is not set
 # CONFIG_FTRACE is not set
diff --git a/arch/s390/crypto/Kconfig b/arch/s390/crypto/Kconfig
index 8c4db8b64fa2..e2c27588b21a 100644
--- a/arch/s390/crypto/Kconfig
+++ b/arch/s390/crypto/Kconfig
@@ -4,7 +4,6 @@ menu "Accelerated Cryptographic Algorithms for CPU (s390)"
 
 config CRYPTO_SHA512_S390
 	tristate "Hash functions: SHA-384 and SHA-512"
-	depends on S390
 	select CRYPTO_HASH
 	help
 	  SHA-384 and SHA-512 secure hash algorithms (FIPS 180)
@@ -15,7 +14,6 @@ config CRYPTO_SHA512_S390
 
 config CRYPTO_SHA1_S390
 	tristate "Hash functions: SHA-1"
-	depends on S390
 	select CRYPTO_HASH
 	help
 	  SHA-1 secure hash algorithm (FIPS 180)
@@ -24,20 +22,8 @@ config CRYPTO_SHA1_S390
 
 	  It is available as of z990.
 
-config CRYPTO_SHA256_S390
-	tristate "Hash functions: SHA-224 and SHA-256"
-	depends on S390
-	select CRYPTO_HASH
-	help
-	  SHA-224 and SHA-256 secure hash algorithms (FIPS 180)
-
-	  Architecture: s390
-
-	  It is available as of z9.
-
 config CRYPTO_SHA3_256_S390
 	tristate "Hash functions: SHA3-224 and SHA3-256"
-	depends on S390
 	select CRYPTO_HASH
 	help
 	  SHA3-224 and SHA3-256 secure hash algorithms (FIPS 202)
@@ -48,7 +34,6 @@ config CRYPTO_SHA3_256_S390
 
 config CRYPTO_SHA3_512_S390
 	tristate "Hash functions: SHA3-384 and SHA3-512"
-	depends on S390
 	select CRYPTO_HASH
 	help
 	  SHA3-384 and SHA3-512 secure hash algorithms (FIPS 202)
@@ -59,7 +44,6 @@ config CRYPTO_SHA3_512_S390
 
 config CRYPTO_GHASH_S390
 	tristate "Hash functions: GHASH"
-	depends on S390
 	select CRYPTO_HASH
 	help
 	  GCM GHASH hash function (NIST SP800-38D)
@@ -70,7 +54,6 @@ config CRYPTO_GHASH_S390
 
 config CRYPTO_AES_S390
 	tristate "Ciphers: AES, modes: ECB, CBC, CTR, XTS, GCM"
-	depends on S390
 	select CRYPTO_ALGAPI
 	select CRYPTO_SKCIPHER
 	help
@@ -92,7 +75,6 @@ config CRYPTO_AES_S390
 
 config CRYPTO_DES_S390
 	tristate "Ciphers: DES and Triple DES EDE, modes: ECB, CBC, CTR"
-	depends on S390
 	select CRYPTO_ALGAPI
 	select CRYPTO_SKCIPHER
 	select CRYPTO_LIB_DES
@@ -107,23 +89,8 @@ config CRYPTO_DES_S390
 	  As of z990 the ECB and CBC mode are hardware accelerated.
 	  As of z196 the CTR mode is hardware accelerated.
 
-config CRYPTO_CHACHA_S390
-	tristate
-	depends on S390
-	select CRYPTO_SKCIPHER
-	select CRYPTO_LIB_CHACHA_GENERIC
-	select CRYPTO_ARCH_HAVE_LIB_CHACHA
-	default CRYPTO_LIB_CHACHA_INTERNAL
-	help
-	  Length-preserving cipher: ChaCha20 stream cipher (RFC 7539)
-
-	  Architecture: s390
-
-	  It is available as of z13.
-
 config CRYPTO_HMAC_S390
 	tristate "Keyed-hash message authentication code: HMAC"
-	depends on S390
 	select CRYPTO_HASH
 	help
 	  s390 specific HMAC hardware support for SHA224, SHA256, SHA384 and
diff --git a/arch/s390/crypto/Makefile b/arch/s390/crypto/Makefile
index 14dafadbcbed..21757d86cd49 100644
--- a/arch/s390/crypto/Makefile
+++ b/arch/s390/crypto/Makefile
@@ -4,17 +4,13 @@
 #
 
 obj-$(CONFIG_CRYPTO_SHA1_S390) += sha1_s390.o sha_common.o
-obj-$(CONFIG_CRYPTO_SHA256_S390) += sha256_s390.o sha_common.o
 obj-$(CONFIG_CRYPTO_SHA512_S390) += sha512_s390.o sha_common.o
 obj-$(CONFIG_CRYPTO_SHA3_256_S390) += sha3_256_s390.o sha_common.o
 obj-$(CONFIG_CRYPTO_SHA3_512_S390) += sha3_512_s390.o sha_common.o
 obj-$(CONFIG_CRYPTO_DES_S390) += des_s390.o
 obj-$(CONFIG_CRYPTO_AES_S390) += aes_s390.o
 obj-$(CONFIG_CRYPTO_PAES_S390) += paes_s390.o
-obj-$(CONFIG_CRYPTO_CHACHA_S390) += chacha_s390.o
 obj-$(CONFIG_S390_PRNG) += prng.o
 obj-$(CONFIG_CRYPTO_GHASH_S390) += ghash_s390.o
 obj-$(CONFIG_CRYPTO_HMAC_S390) += hmac_s390.o
 obj-y += arch_random.o
-
-chacha_s390-y := chacha-glue.o chacha-s390.o
diff --git a/arch/s390/crypto/chacha-glue.c b/arch/s390/crypto/chacha-glue.c
deleted file mode 100644
index 920e9f0941e7..000000000000
--- a/arch/s390/crypto/chacha-glue.c
+++ /dev/null
@@ -1,124 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * s390 ChaCha stream cipher.
- *
- * Copyright IBM Corp. 2021
- */
-
-#define KMSG_COMPONENT "chacha_s390"
-#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
-
-#include <crypto/internal/chacha.h>
-#include <crypto/internal/skcipher.h>
-#include <crypto/algapi.h>
-#include <linux/cpufeature.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/sizes.h>
-#include <asm/fpu.h>
-#include "chacha-s390.h"
-
-static void chacha20_crypt_s390(u32 *state, u8 *dst, const u8 *src,
-				unsigned int nbytes, const u32 *key,
-				u32 *counter)
-{
-	DECLARE_KERNEL_FPU_ONSTACK32(vxstate);
-
-	kernel_fpu_begin(&vxstate, KERNEL_VXR);
-	chacha20_vx(dst, src, nbytes, key, counter);
-	kernel_fpu_end(&vxstate, KERNEL_VXR);
-
-	*counter += round_up(nbytes, CHACHA_BLOCK_SIZE) / CHACHA_BLOCK_SIZE;
-}
-
-static int chacha20_s390(struct skcipher_request *req)
-{
-	struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
-	struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm);
-	u32 state[CHACHA_STATE_WORDS] __aligned(16);
-	struct skcipher_walk walk;
-	unsigned int nbytes;
-	int rc;
-
-	rc = skcipher_walk_virt(&walk, req, false);
-	chacha_init(state, ctx->key, req->iv);
-
-	while (walk.nbytes > 0) {
-		nbytes = walk.nbytes;
-		if (nbytes < walk.total)
-			nbytes = round_down(nbytes, walk.stride);
-
-		if (nbytes <= CHACHA_BLOCK_SIZE) {
-			chacha_crypt_generic(state, walk.dst.virt.addr,
-					     walk.src.virt.addr, nbytes,
-					     ctx->nrounds);
-		} else {
-			chacha20_crypt_s390(state, walk.dst.virt.addr,
-					    walk.src.virt.addr, nbytes,
-					    &state[4], &state[12]);
-		}
-		rc = skcipher_walk_done(&walk, walk.nbytes - nbytes);
-	}
-	return rc;
-}
-
-void hchacha_block_arch(const u32 *state, u32 *stream, int nrounds)
-{
-	/* TODO: implement hchacha_block_arch() in assembly */
-	hchacha_block_generic(state, stream, nrounds);
-}
-EXPORT_SYMBOL(hchacha_block_arch);
-
-void chacha_crypt_arch(u32 *state, u8 *dst, const u8 *src,
-		       unsigned int bytes, int nrounds)
-{
-	/* s390 chacha20 implementation has 20 rounds hard-coded,
-	 * it cannot handle a block of data or less, but otherwise
-	 * it can handle data of arbitrary size
-	 */
-	if (bytes <= CHACHA_BLOCK_SIZE || nrounds != 20 || !cpu_has_vx())
-		chacha_crypt_generic(state, dst, src, bytes, nrounds);
-	else
-		chacha20_crypt_s390(state, dst, src, bytes,
-				    &state[4], &state[12]);
-}
-EXPORT_SYMBOL(chacha_crypt_arch);
-
-static struct skcipher_alg chacha_algs[] = {
-	{
-		.base.cra_name		= "chacha20",
-		.base.cra_driver_name	= "chacha20-s390",
-		.base.cra_priority	= 900,
-		.base.cra_blocksize	= 1,
-		.base.cra_ctxsize	= sizeof(struct chacha_ctx),
-		.base.cra_module	= THIS_MODULE,
-
-		.min_keysize		= CHACHA_KEY_SIZE,
-		.max_keysize		= CHACHA_KEY_SIZE,
-		.ivsize			= CHACHA_IV_SIZE,
-		.chunksize		= CHACHA_BLOCK_SIZE,
-		.setkey			= chacha20_setkey,
-		.encrypt		= chacha20_s390,
-		.decrypt		= chacha20_s390,
-	}
-};
-
-static int __init chacha_mod_init(void)
-{
-	return IS_REACHABLE(CONFIG_CRYPTO_SKCIPHER) ?
-		crypto_register_skciphers(chacha_algs, ARRAY_SIZE(chacha_algs)) : 0;
-}
-
-static void __exit chacha_mod_fini(void)
-{
-	if (IS_REACHABLE(CONFIG_CRYPTO_SKCIPHER))
-		crypto_unregister_skciphers(chacha_algs, ARRAY_SIZE(chacha_algs));
-}
-
-module_cpu_feature_match(S390_CPU_FEATURE_VXRS, chacha_mod_init);
-module_exit(chacha_mod_fini);
-
-MODULE_DESCRIPTION("ChaCha20 stream cipher");
-MODULE_LICENSE("GPL v2");
-
-MODULE_ALIAS_CRYPTO("chacha20");
diff --git a/arch/s390/crypto/ghash_s390.c b/arch/s390/crypto/ghash_s390.c
index 0800a2a5799f..dcbcee37cb63 100644
--- a/arch/s390/crypto/ghash_s390.c
+++ b/arch/s390/crypto/ghash_s390.c
@@ -8,29 +8,28 @@
  * Author(s): Gerald Schaefer <gerald.schaefer@de.ibm.com>
  */
 
+#include <asm/cpacf.h>
+#include <crypto/ghash.h>
 #include <crypto/internal/hash.h>
-#include <linux/module.h>
 #include <linux/cpufeature.h>
-#include <asm/cpacf.h>
-
-#define GHASH_BLOCK_SIZE	16
-#define GHASH_DIGEST_SIZE	16
+#include <linux/err.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/string.h>
 
-struct ghash_ctx {
+struct s390_ghash_ctx {
 	u8 key[GHASH_BLOCK_SIZE];
 };
 
-struct ghash_desc_ctx {
+struct s390_ghash_desc_ctx {
 	u8 icv[GHASH_BLOCK_SIZE];
 	u8 key[GHASH_BLOCK_SIZE];
-	u8 buffer[GHASH_BLOCK_SIZE];
-	u32 bytes;
 };
 
 static int ghash_init(struct shash_desc *desc)
 {
-	struct ghash_desc_ctx *dctx = shash_desc_ctx(desc);
-	struct ghash_ctx *ctx = crypto_shash_ctx(desc->tfm);
+	struct s390_ghash_ctx *ctx = crypto_shash_ctx(desc->tfm);
+	struct s390_ghash_desc_ctx *dctx = shash_desc_ctx(desc);
 
 	memset(dctx, 0, sizeof(*dctx));
 	memcpy(dctx->key, ctx->key, GHASH_BLOCK_SIZE);
@@ -41,7 +40,7 @@ static int ghash_init(struct shash_desc *desc)
 static int ghash_setkey(struct crypto_shash *tfm,
 			const u8 *key, unsigned int keylen)
 {
-	struct ghash_ctx *ctx = crypto_shash_ctx(tfm);
+	struct s390_ghash_ctx *ctx = crypto_shash_ctx(tfm);
 
 	if (keylen != GHASH_BLOCK_SIZE)
 		return -EINVAL;
@@ -54,80 +53,71 @@ static int ghash_setkey(struct crypto_shash *tfm,
 static int ghash_update(struct shash_desc *desc,
 			 const u8 *src, unsigned int srclen)
 {
-	struct ghash_desc_ctx *dctx = shash_desc_ctx(desc);
+	struct s390_ghash_desc_ctx *dctx = shash_desc_ctx(desc);
 	unsigned int n;
-	u8 *buf = dctx->buffer;
-
-	if (dctx->bytes) {
-		u8 *pos = buf + (GHASH_BLOCK_SIZE - dctx->bytes);
 
-		n = min(srclen, dctx->bytes);
-		dctx->bytes -= n;
-		srclen -= n;
-
-		memcpy(pos, src, n);
-		src += n;
+	n = srclen & ~(GHASH_BLOCK_SIZE - 1);
+	cpacf_kimd(CPACF_KIMD_GHASH, dctx, src, n);
+	return srclen - n;
+}
 
-		if (!dctx->bytes) {
-			cpacf_kimd(CPACF_KIMD_GHASH, dctx, buf,
-				   GHASH_BLOCK_SIZE);
-		}
-	}
+static void ghash_flush(struct s390_ghash_desc_ctx *dctx, const u8 *src,
+			unsigned int len)
+{
+	if (len) {
+		u8 buf[GHASH_BLOCK_SIZE] = {};
 
-	n = srclen & ~(GHASH_BLOCK_SIZE - 1);
-	if (n) {
-		cpacf_kimd(CPACF_KIMD_GHASH, dctx, src, n);
-		src += n;
-		srclen -= n;
+		memcpy(buf, src, len);
+		cpacf_kimd(CPACF_KIMD_GHASH, dctx, buf, GHASH_BLOCK_SIZE);
+		memzero_explicit(buf, sizeof(buf));
 	}
+}
 
-	if (srclen) {
-		dctx->bytes = GHASH_BLOCK_SIZE - srclen;
-		memcpy(buf, src, srclen);
-	}
+static int ghash_finup(struct shash_desc *desc, const u8 *src,
+		       unsigned int len, u8 *dst)
+{
+	struct s390_ghash_desc_ctx *dctx = shash_desc_ctx(desc);
 
+	ghash_flush(dctx, src, len);
+	memcpy(dst, dctx->icv, GHASH_BLOCK_SIZE);
 	return 0;
 }
 
-static int ghash_flush(struct ghash_desc_ctx *dctx)
+static int ghash_export(struct shash_desc *desc, void *out)
 {
-	u8 *buf = dctx->buffer;
-
-	if (dctx->bytes) {
-		u8 *pos = buf + (GHASH_BLOCK_SIZE - dctx->bytes);
-
-		memset(pos, 0, dctx->bytes);
-		cpacf_kimd(CPACF_KIMD_GHASH, dctx, buf, GHASH_BLOCK_SIZE);
-		dctx->bytes = 0;
-	}
+	struct s390_ghash_desc_ctx *dctx = shash_desc_ctx(desc);
 
+	memcpy(out, dctx->icv, GHASH_DIGEST_SIZE);
 	return 0;
 }
 
-static int ghash_final(struct shash_desc *desc, u8 *dst)
+static int ghash_import(struct shash_desc *desc, const void *in)
 {
-	struct ghash_desc_ctx *dctx = shash_desc_ctx(desc);
-	int ret;
+	struct s390_ghash_ctx *ctx = crypto_shash_ctx(desc->tfm);
+	struct s390_ghash_desc_ctx *dctx = shash_desc_ctx(desc);
 
-	ret = ghash_flush(dctx);
-	if (!ret)
-		memcpy(dst, dctx->icv, GHASH_BLOCK_SIZE);
-	return ret;
+	memcpy(dctx->icv, in, GHASH_DIGEST_SIZE);
+	memcpy(dctx->key, ctx->key, GHASH_BLOCK_SIZE);
+	return 0;
 }
 
 static struct shash_alg ghash_alg = {
 	.digestsize	= GHASH_DIGEST_SIZE,
 	.init		= ghash_init,
 	.update		= ghash_update,
-	.final		= ghash_final,
+	.finup		= ghash_finup,
 	.setkey		= ghash_setkey,
-	.descsize	= sizeof(struct ghash_desc_ctx),
+	.export		= ghash_export,
+	.import		= ghash_import,
+	.statesize	= sizeof(struct ghash_desc_ctx),
+	.descsize	= sizeof(struct s390_ghash_desc_ctx),
 	.base		= {
 		.cra_name		= "ghash",
 		.cra_driver_name	= "ghash-s390",
 		.cra_priority		= 300,
+		.cra_flags		= CRYPTO_AHASH_ALG_BLOCK_ONLY,
 		.cra_blocksize		= GHASH_BLOCK_SIZE,
-		.cra_ctxsize		= sizeof(struct ghash_ctx),
+		.cra_ctxsize		= sizeof(struct s390_ghash_ctx),
 		.cra_module		= THIS_MODULE,
 	},
 };
diff --git a/arch/s390/crypto/hmac_s390.c b/arch/s390/crypto/hmac_s390.c
index bba9a818dfdc..93a1098d9f8d 100644
--- a/arch/s390/crypto/hmac_s390.c
+++ b/arch/s390/crypto/hmac_s390.c
@@ -9,10 +9,14 @@
 #define pr_fmt(fmt)	KMSG_COMPONENT ": " fmt
 
 #include <asm/cpacf.h>
-#include <crypto/sha2.h>
 #include <crypto/internal/hash.h>
+#include <crypto/hmac.h>
+#include <crypto/sha2.h>
 #include <linux/cpufeature.h>
+#include <linux/errno.h>
+#include <linux/kernel.h>
 #include <linux/module.h>
+#include <linux/string.h>
 
 /*
  * KMAC param block layout for sha2 function codes:
@@ -71,32 +75,31 @@ union s390_kmac_gr0 {
 struct s390_kmac_sha2_ctx {
 	u8 param[MAX_DIGEST_SIZE + MAX_IMBL_SIZE + MAX_BLOCK_SIZE];
 	union s390_kmac_gr0 gr0;
-	u8 buf[MAX_BLOCK_SIZE];
-	unsigned int buflen;
+	u64 buflen[2];
 };
 
 /*
  * kmac_sha2_set_imbl - sets the input message bit-length based on the blocksize
  */
-static inline void kmac_sha2_set_imbl(u8 *param, unsigned int buflen,
-				      unsigned int blocksize)
+static inline void kmac_sha2_set_imbl(u8 *param, u64 buflen_lo,
+				      u64 buflen_hi, unsigned int blocksize)
 {
 	u8 *imbl = param + SHA2_IMBL_OFFSET(blocksize);
 
 	switch (blocksize) {
 	case SHA256_BLOCK_SIZE:
-		*(u64 *)imbl = (u64)buflen * BITS_PER_BYTE;
+		*(u64 *)imbl = buflen_lo * BITS_PER_BYTE;
 		break;
 	case SHA512_BLOCK_SIZE:
-		*(u128 *)imbl = (u128)buflen * BITS_PER_BYTE;
+		*(u128 *)imbl = (((u128)buflen_hi << 64) + buflen_lo) << 3;
 		break;
 	default:
 		break;
 	}
 }
 
-static int hash_key(const u8 *in, unsigned int inlen,
-		    u8 *digest, unsigned int digestsize)
+static int hash_data(const u8 *in, unsigned int inlen,
+		     u8 *digest, unsigned int digestsize, bool final)
 {
 	unsigned long func;
 	union {
@@ -123,19 +126,23 @@ static int hash_key(const u8 *in, unsigned int inlen,
 
 	switch (digestsize) {
 	case SHA224_DIGEST_SIZE:
-		func = CPACF_KLMD_SHA_256;
+		func = final ? CPACF_KLMD_SHA_256 : CPACF_KIMD_SHA_256;
 		PARAM_INIT(256, 224, inlen * 8);
+		if (!final)
+			digestsize = SHA256_DIGEST_SIZE;
 		break;
 	case SHA256_DIGEST_SIZE:
-		func = CPACF_KLMD_SHA_256;
+		func = final ? CPACF_KLMD_SHA_256 : CPACF_KIMD_SHA_256;
 		PARAM_INIT(256, 256, inlen * 8);
 		break;
 	case SHA384_DIGEST_SIZE:
-		func = CPACF_KLMD_SHA_512;
+		func = final ? CPACF_KLMD_SHA_512 : CPACF_KIMD_SHA_512;
 		PARAM_INIT(512, 384, inlen * 8);
+		if (!final)
+			digestsize = SHA512_DIGEST_SIZE;
 		break;
 	case SHA512_DIGEST_SIZE:
-		func = CPACF_KLMD_SHA_512;
+		func = final ? CPACF_KLMD_SHA_512 : CPACF_KIMD_SHA_512;
 		PARAM_INIT(512, 512, inlen * 8);
 		break;
 	default:
@@ -151,6 +158,12 @@ static int hash_key(const u8 *in, unsigned int inlen,
 	return 0;
 }
 
+static int hash_key(const u8 *in, unsigned int inlen,
+		    u8 *digest, unsigned int digestsize)
+{
+	return hash_data(in, inlen, digest, digestsize, true);
+}
+
 static int s390_hmac_sha2_setkey(struct crypto_shash *tfm,
 				 const u8 *key, unsigned int keylen)
 {
@@ -176,7 +189,8 @@ static int s390_hmac_sha2_init(struct shash_desc *desc)
 	memcpy(ctx->param + SHA2_KEY_OFFSET(bs),
 	       tfm_ctx->key, bs);
 
-	ctx->buflen = 0;
+	ctx->buflen[0] = 0;
+	ctx->buflen[1] = 0;
 	ctx->gr0.reg = 0;
 	switch (crypto_shash_digestsize(desc->tfm)) {
 	case SHA224_DIGEST_SIZE:
@@ -203,48 +217,31 @@ static int s390_hmac_sha2_update(struct shash_desc *desc,
 {
 	struct s390_kmac_sha2_ctx *ctx = shash_desc_ctx(desc);
 	unsigned int bs = crypto_shash_blocksize(desc->tfm);
-	unsigned int offset, n;
-
-	/* check current buffer */
-	offset = ctx->buflen % bs;
-	ctx->buflen += len;
-	if (offset + len < bs)
-		goto store;
-
-	/* process one stored block */
-	if (offset) {
-		n = bs - offset;
-		memcpy(ctx->buf + offset, data, n);
-		ctx->gr0.iimp = 1;
-		_cpacf_kmac(&ctx->gr0.reg, ctx->param, ctx->buf, bs);
-		data += n;
-		len -= n;
-		offset = 0;
-	}
-	/* process as many blocks as possible */
-	if (len >= bs) {
-		n = (len / bs) * bs;
-		ctx->gr0.iimp = 1;
-		_cpacf_kmac(&ctx->gr0.reg, ctx->param, data, n);
-		data += n;
-		len -= n;
-	}
-store:
-	/* store incomplete block in buffer */
-	if (len)
-		memcpy(ctx->buf + offset, data, len);
+	unsigned int n = round_down(len, bs);
 
-	return 0;
+	ctx->buflen[0] += n;
+	if (ctx->buflen[0] < n)
+		ctx->buflen[1]++;
+
+	/* process as many blocks as possible */
+	ctx->gr0.iimp = 1;
+	_cpacf_kmac(&ctx->gr0.reg, ctx->param, data, n);
+	return len - n;
 }
 
-static int s390_hmac_sha2_final(struct shash_desc *desc, u8 *out)
+static int s390_hmac_sha2_finup(struct shash_desc *desc, const u8 *src,
+				unsigned int len, u8 *out)
 {
 	struct s390_kmac_sha2_ctx *ctx = shash_desc_ctx(desc);
 	unsigned int bs = crypto_shash_blocksize(desc->tfm);
 
+	ctx->buflen[0] += len;
+	if (ctx->buflen[0] < len)
+		ctx->buflen[1]++;
+
 	ctx->gr0.iimp = 0;
-	kmac_sha2_set_imbl(ctx->param, ctx->buflen, bs);
-	_cpacf_kmac(&ctx->gr0.reg, ctx->param, ctx->buf, ctx->buflen % bs);
+	kmac_sha2_set_imbl(ctx->param, ctx->buflen[0], ctx->buflen[1], bs);
+	_cpacf_kmac(&ctx->gr0.reg, ctx->param, src, len);
 	memcpy(out, ctx->param, crypto_shash_digestsize(desc->tfm));
 
 	return 0;
@@ -262,7 +259,7 @@ static int s390_hmac_sha2_digest(struct shash_desc *desc,
 		return rc;
 
 	ctx->gr0.iimp = 0;
-	kmac_sha2_set_imbl(ctx->param, len,
+	kmac_sha2_set_imbl(ctx->param, len, 0,
 			   crypto_shash_blocksize(desc->tfm));
 	_cpacf_kmac(&ctx->gr0.reg, ctx->param, data, len);
 	memcpy(out, ctx->param, ds);
@@ -270,22 +267,89 @@ static int s390_hmac_sha2_digest(struct shash_desc *desc,
 	return 0;
 }
 
-#define S390_HMAC_SHA2_ALG(x) {						\
+static int s390_hmac_export_zero(struct shash_desc *desc, void *out)
+{
+	struct crypto_shash *tfm = desc->tfm;
+	u8 ipad[SHA512_BLOCK_SIZE];
+	struct s390_hmac_ctx *ctx;
+	unsigned int bs;
+	int err, i;
+
+	ctx = crypto_shash_ctx(tfm);
+	bs = crypto_shash_blocksize(tfm);
+	for (i = 0; i < bs; i++)
+		ipad[i] = ctx->key[i] ^ HMAC_IPAD_VALUE;
+
+	err = hash_data(ipad, bs, out, crypto_shash_digestsize(tfm), false);
+	memzero_explicit(ipad, sizeof(ipad));
+	return err;
+}
+
+static int s390_hmac_export(struct shash_desc *desc, void *out)
+{
+	struct s390_kmac_sha2_ctx *ctx = shash_desc_ctx(desc);
+	unsigned int bs = crypto_shash_blocksize(desc->tfm);
+	unsigned int ds = bs / 2;
+	union {
+		u8 *u8;
+		u64 *u64;
+	} p = { .u8 = out };
+	int err = 0;
+
+	if (!ctx->gr0.ikp)
+		err = s390_hmac_export_zero(desc, out);
+	else
+		memcpy(p.u8, ctx->param, ds);
+	p.u8 += ds;
+	put_unaligned(ctx->buflen[0], p.u64++);
+	if (ds == SHA512_DIGEST_SIZE)
+		put_unaligned(ctx->buflen[1], p.u64);
+	return err;
+}
+
+static int s390_hmac_import(struct shash_desc *desc, const void *in)
+{
+	struct s390_kmac_sha2_ctx *ctx = shash_desc_ctx(desc);
+	unsigned int bs = crypto_shash_blocksize(desc->tfm);
+	unsigned int ds = bs / 2;
+	union {
+		const u8 *u8;
+		const u64 *u64;
+	} p = { .u8 = in };
+	int err;
+
+	err = s390_hmac_sha2_init(desc);
+	memcpy(ctx->param, p.u8, ds);
+	p.u8 += ds;
+	ctx->buflen[0] = get_unaligned(p.u64++);
+	if (ds == SHA512_DIGEST_SIZE)
+		ctx->buflen[1] = get_unaligned(p.u64);
+	if (ctx->buflen[0] | ctx->buflen[1])
+		ctx->gr0.ikp = 1;
+	return err;
+}
+
+#define S390_HMAC_SHA2_ALG(x, ss) {					\
 	.fc = CPACF_KMAC_HMAC_SHA_##x,					\
 	.alg = {							\
 		.init = s390_hmac_sha2_init,				\
 		.update = s390_hmac_sha2_update,			\
-		.final = s390_hmac_sha2_final,				\
+		.finup = s390_hmac_sha2_finup,				\
 		.digest = s390_hmac_sha2_digest,			\
 		.setkey = s390_hmac_sha2_setkey,			\
+		.export = s390_hmac_export,				\
+		.import = s390_hmac_import,				\
 		.descsize = sizeof(struct s390_kmac_sha2_ctx),		\
 		.halg = {						\
+			.statesize = ss,				\
 			.digestsize = SHA##x##_DIGEST_SIZE,		\
 			.base = {					\
 				.cra_name = "hmac(sha" #x ")",		\
 				.cra_driver_name = "hmac_s390_sha" #x,	\
 				.cra_blocksize = SHA##x##_BLOCK_SIZE,	\
 				.cra_priority = 400,			\
+				.cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY | \
+					     CRYPTO_AHASH_ALG_FINUP_MAX, \
 				.cra_ctxsize = sizeof(struct s390_hmac_ctx), \
 				.cra_module = THIS_MODULE,		\
 			},						\
@@ -298,10 +362,10 @@ static struct s390_hmac_alg {
 	unsigned int fc;
 	struct shash_alg alg;
 } s390_hmac_algs[] = {
-	S390_HMAC_SHA2_ALG(224),
-	S390_HMAC_SHA2_ALG(256),
-	S390_HMAC_SHA2_ALG(384),
-	S390_HMAC_SHA2_ALG(512),
+	S390_HMAC_SHA2_ALG(224, sizeof(struct crypto_sha256_state)),
+	S390_HMAC_SHA2_ALG(256, sizeof(struct crypto_sha256_state)),
+	S390_HMAC_SHA2_ALG(384, SHA512_STATE_SIZE),
+	S390_HMAC_SHA2_ALG(512, SHA512_STATE_SIZE),
 };
 
 static __always_inline void _s390_hmac_algs_unregister(void)
diff --git a/arch/s390/crypto/paes_s390.c b/arch/s390/crypto/paes_s390.c
index 511093713a6f..8a340c16acb4 100644
--- a/arch/s390/crypto/paes_s390.c
+++ b/arch/s390/crypto/paes_s390.c
@@ -5,7 +5,7 @@
  * s390 implementation of the AES Cipher Algorithm with protected keys.
  *
  * s390 Version:
- *   Copyright IBM Corp. 2017, 2023
+ *   Copyright IBM Corp. 2017, 2025
  *   Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com>
  *		Harald Freudenberger <freude@de.ibm.com>
  */
@@ -13,16 +13,18 @@
 #define KMSG_COMPONENT "paes_s390"
 #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
 
-#include <crypto/aes.h>
-#include <crypto/algapi.h>
-#include <linux/bug.h>
-#include <linux/err.h>
-#include <linux/module.h>
+#include <linux/atomic.h>
 #include <linux/cpufeature.h>
+#include <linux/delay.h>
+#include <linux/err.h>
 #include <linux/init.h>
+#include <linux/miscdevice.h>
+#include <linux/module.h>
 #include <linux/mutex.h>
 #include <linux/spinlock.h>
-#include <linux/delay.h>
+#include <crypto/aes.h>
+#include <crypto/algapi.h>
+#include <crypto/engine.h>
 #include <crypto/internal/skcipher.h>
 #include <crypto/xts.h>
 #include <asm/cpacf.h>
@@ -44,23 +46,61 @@ static DEFINE_MUTEX(ctrblk_lock);
 
 static cpacf_mask_t km_functions, kmc_functions, kmctr_functions;
 
+static struct crypto_engine *paes_crypto_engine;
+#define MAX_QLEN 10
+
+/*
+ * protected key specific stuff
+ */
+
 struct paes_protkey {
 	u32 type;
 	u32 len;
 	u8 protkey[PXTS_256_PROTKEY_SIZE];
 };
 
-struct key_blob {
-	/*
-	 * Small keys will be stored in the keybuf. Larger keys are
-	 * stored in extra allocated memory. In both cases does
-	 * key point to the memory where the key is stored.
-	 * The code distinguishes by checking keylen against
-	 * sizeof(keybuf). See the two following helper functions.
-	 */
-	u8 *key;
-	u8 keybuf[128];
+#define PK_STATE_NO_KEY		     0
+#define PK_STATE_CONVERT_IN_PROGRESS 1
+#define PK_STATE_VALID		     2
+
+struct s390_paes_ctx {
+	/* source key material used to derive a protected key from */
+	u8 keybuf[PAES_MAX_KEYSIZE];
+	unsigned int keylen;
+
+	/* cpacf function code to use with this protected key type */
+	long fc;
+
+	/* nr of requests enqueued via crypto engine which use this tfm ctx */
+	atomic_t via_engine_ctr;
+
+	/* spinlock to atomically read/update all the following fields */
+	spinlock_t pk_lock;
+
+	/* see PK_STATE* defines above, < 0 holds convert failure rc  */
+	int pk_state;
+	/* if state is valid, pk holds the protected key */
+	struct paes_protkey pk;
+};
+
+struct s390_pxts_ctx {
+	/* source key material used to derive a protected key from */
+	u8 keybuf[2 * PAES_MAX_KEYSIZE];
 	unsigned int keylen;
+
+	/* cpacf function code to use with this protected key type */
+	long fc;
+
+	/* nr of requests enqueued via crypto engine which use this tfm ctx */
+	atomic_t via_engine_ctr;
+
+	/* spinlock to atomically read/update all the following fields */
+	spinlock_t pk_lock;
+
+	/* see PK_STATE* defines above, < 0 holds convert failure rc  */
+	int pk_state;
+	/* if state is valid, pk[] hold(s) the protected key(s) */
+	struct paes_protkey pk[2];
 };
 
 /*
@@ -89,214 +129,370 @@ static inline u32 make_clrkey_token(const u8 *ck, size_t cklen, u8 *dest)
 	return sizeof(*token) + cklen;
 }
 
-static inline int _key_to_kb(struct key_blob *kb,
-			     const u8 *key,
-			     unsigned int keylen)
+/*
+ * paes_ctx_setkey() - Set key value into context, maybe construct
+ * a clear key token digestible by pkey from a clear key value.
+ */
+static inline int paes_ctx_setkey(struct s390_paes_ctx *ctx,
+				  const u8 *key, unsigned int keylen)
 {
+	if (keylen > sizeof(ctx->keybuf))
+		return -EINVAL;
+
 	switch (keylen) {
 	case 16:
 	case 24:
 	case 32:
 		/* clear key value, prepare pkey clear key token in keybuf */
-		memset(kb->keybuf, 0, sizeof(kb->keybuf));
-		kb->keylen = make_clrkey_token(key, keylen, kb->keybuf);
-		kb->key = kb->keybuf;
+		memset(ctx->keybuf, 0, sizeof(ctx->keybuf));
+		ctx->keylen = make_clrkey_token(key, keylen, ctx->keybuf);
 		break;
 	default:
 		/* other key material, let pkey handle this */
-		if (keylen <= sizeof(kb->keybuf))
-			kb->key = kb->keybuf;
-		else {
-			kb->key = kmalloc(keylen, GFP_KERNEL);
-			if (!kb->key)
-				return -ENOMEM;
-		}
-		memcpy(kb->key, key, keylen);
-		kb->keylen = keylen;
+		memcpy(ctx->keybuf, key, keylen);
+		ctx->keylen = keylen;
 		break;
 	}
 
 	return 0;
 }
 
-static inline int _xts_key_to_kb(struct key_blob *kb,
-				 const u8 *key,
-				 unsigned int keylen)
+/*
+ * pxts_ctx_setkey() - Set key value into context, maybe construct
+ * a clear key token digestible by pkey from a clear key value.
+ */
+static inline int pxts_ctx_setkey(struct s390_pxts_ctx *ctx,
+				  const u8 *key, unsigned int keylen)
 {
 	size_t cklen = keylen / 2;
 
-	memset(kb->keybuf, 0, sizeof(kb->keybuf));
+	if (keylen > sizeof(ctx->keybuf))
+		return -EINVAL;
 
 	switch (keylen) {
 	case 32:
 	case 64:
 		/* clear key value, prepare pkey clear key tokens in keybuf */
-		kb->key = kb->keybuf;
-		kb->keylen  = make_clrkey_token(key, cklen, kb->key);
-		kb->keylen += make_clrkey_token(key + cklen, cklen,
-						kb->key + kb->keylen);
+		memset(ctx->keybuf, 0, sizeof(ctx->keybuf));
+		ctx->keylen = make_clrkey_token(key, cklen, ctx->keybuf);
+		ctx->keylen += make_clrkey_token(key + cklen, cklen,
+						 ctx->keybuf + ctx->keylen);
 		break;
 	default:
 		/* other key material, let pkey handle this */
-		if (keylen <= sizeof(kb->keybuf)) {
-			kb->key = kb->keybuf;
-		} else {
-			kb->key = kmalloc(keylen, GFP_KERNEL);
-			if (!kb->key)
-				return -ENOMEM;
-		}
-		memcpy(kb->key, key, keylen);
-		kb->keylen = keylen;
+		memcpy(ctx->keybuf, key, keylen);
+		ctx->keylen = keylen;
 		break;
 	}
 
 	return 0;
 }
 
-static inline void _free_kb_keybuf(struct key_blob *kb)
+/*
+ * Convert the raw key material into a protected key via PKEY api.
+ * This function may sleep - don't call in non-sleeping context.
+ */
+static inline int convert_key(const u8 *key, unsigned int keylen,
+			      struct paes_protkey *pk)
 {
-	if (kb->key && kb->key != kb->keybuf
-	    && kb->keylen > sizeof(kb->keybuf)) {
-		kfree_sensitive(kb->key);
-		kb->key = NULL;
+	int rc, i;
+
+	pk->len = sizeof(pk->protkey);
+
+	/*
+	 * In case of a busy card retry with increasing delay
+	 * of 200, 400, 800 and 1600 ms - in total 3 s.
+	 */
+	for (rc = -EIO, i = 0; rc && i < 5; i++) {
+		if (rc == -EBUSY && msleep_interruptible((1 << i) * 100)) {
+			rc = -EINTR;
+			goto out;
+		}
+		rc = pkey_key2protkey(key, keylen,
+				      pk->protkey, &pk->len, &pk->type,
+				      PKEY_XFLAG_NOMEMALLOC);
 	}
-	memzero_explicit(kb->keybuf, sizeof(kb->keybuf));
+
+out:
+	pr_debug("rc=%d\n", rc);
+	return rc;
 }
 
-struct s390_paes_ctx {
-	struct key_blob kb;
+/*
+ * (Re-)Convert the raw key material from the ctx into a protected key
+ * via convert_key() function. Update the pk_state, pk_type, pk_len
+ * and the protected key in the tfm context.
+ * Please note this function may be invoked concurrently with the very
+ * same tfm context. The pk_lock spinlock in the context ensures an
+ * atomic update of the pk and the pk state but does not guarantee any
+ * order of update. So a freshly converted valid protected key may get
+ * overwritten with an 'old' expired key value. This does no harm: the
+ * cpacf instructions detect this and refuse to operate with an invalid
+ * key, and the calling code then triggers a (re-)conversion. This may
+ * lead to unnecessary additional conversions but never to invalid data
+ * on en- or decrypt operations.
+ */
+static int paes_convert_key(struct s390_paes_ctx *ctx)
+{
 	struct paes_protkey pk;
-	spinlock_t pk_lock;
-	unsigned long fc;
-};
+	int rc;
 
-struct s390_pxts_ctx {
-	struct key_blob kb;
-	struct paes_protkey pk[2];
-	spinlock_t pk_lock;
-	unsigned long fc;
-};
+	spin_lock_bh(&ctx->pk_lock);
+	ctx->pk_state = PK_STATE_CONVERT_IN_PROGRESS;
+	spin_unlock_bh(&ctx->pk_lock);
 
-static inline int __paes_keyblob2pkey(const u8 *key, unsigned int keylen,
-				      struct paes_protkey *pk)
-{
-	int i, rc = -EIO;
+	rc = convert_key(ctx->keybuf, ctx->keylen, &pk);
 
-	/* try three times in case of busy card */
-	for (i = 0; rc && i < 3; i++) {
-		if (rc == -EBUSY && in_task()) {
-			if (msleep_interruptible(1000))
-				return -EINTR;
-		}
-		rc = pkey_key2protkey(key, keylen, pk->protkey, &pk->len,
-				      &pk->type);
+	/* update context */
+	spin_lock_bh(&ctx->pk_lock);
+	if (rc) {
+		ctx->pk_state = rc;
+	} else {
+		ctx->pk_state = PK_STATE_VALID;
+		ctx->pk = pk;
 	}
+	spin_unlock_bh(&ctx->pk_lock);
 
+	memzero_explicit(&pk, sizeof(pk));
+	pr_debug("rc=%d\n", rc);
 	return rc;
 }
 
-static inline int __paes_convert_key(struct s390_paes_ctx *ctx)
+/*
+ * (Re-)Convert the raw xts key material from the ctx into a
+ * protected key via convert_key() function. Update the pk_state,
+ * pk_type, pk_len and the protected key in the tfm context.
+ * See also comments on function paes_convert_key.
+ */
+static int pxts_convert_key(struct s390_pxts_ctx *ctx)
 {
-	struct paes_protkey pk;
+	struct paes_protkey pk0, pk1;
+	size_t split_keylen;
 	int rc;
 
-	pk.len = sizeof(pk.protkey);
-	rc = __paes_keyblob2pkey(ctx->kb.key, ctx->kb.keylen, &pk);
+	spin_lock_bh(&ctx->pk_lock);
+	ctx->pk_state = PK_STATE_CONVERT_IN_PROGRESS;
+	spin_unlock_bh(&ctx->pk_lock);
+
+	rc = convert_key(ctx->keybuf, ctx->keylen, &pk0);
 	if (rc)
-		return rc;
+		goto out;
+
+	switch (pk0.type) {
+	case PKEY_KEYTYPE_AES_128:
+	case PKEY_KEYTYPE_AES_256:
+		/* second keytoken required */
+		if (ctx->keylen % 2) {
+			rc = -EINVAL;
+			goto out;
+		}
+		split_keylen = ctx->keylen / 2;
+		rc = convert_key(ctx->keybuf + split_keylen,
+				 split_keylen, &pk1);
+		if (rc)
+			goto out;
+		if (pk0.type != pk1.type) {
+			rc = -EINVAL;
+			goto out;
+		}
+		break;
+	case PKEY_KEYTYPE_AES_XTS_128:
+	case PKEY_KEYTYPE_AES_XTS_256:
+		/* single key */
+		pk1.type = 0;
+		break;
+	default:
+		/* unsupported protected keytype */
+		rc = -EINVAL;
+		goto out;
+	}
 
+out:
+	/* update context */
 	spin_lock_bh(&ctx->pk_lock);
-	memcpy(&ctx->pk, &pk, sizeof(pk));
+	if (rc) {
+		ctx->pk_state = rc;
+	} else {
+		ctx->pk_state = PK_STATE_VALID;
+		ctx->pk[0] = pk0;
+		ctx->pk[1] = pk1;
+	}
 	spin_unlock_bh(&ctx->pk_lock);
 
-	return 0;
+	memzero_explicit(&pk0, sizeof(pk0));
+	memzero_explicit(&pk1, sizeof(pk1));
+	pr_debug("rc=%d\n", rc);
+	return rc;
 }
 
-static int ecb_paes_init(struct crypto_skcipher *tfm)
-{
-	struct s390_paes_ctx *ctx = crypto_skcipher_ctx(tfm);
+/*
+ * PAES ECB implementation
+ */
 
-	ctx->kb.key = NULL;
-	spin_lock_init(&ctx->pk_lock);
+struct ecb_param {
+	u8 key[PAES_256_PROTKEY_SIZE];
+} __packed;
 
-	return 0;
-}
+struct s390_pecb_req_ctx {
+	unsigned long modifier;
+	struct skcipher_walk walk;
+	bool param_init_done;
+	struct ecb_param param;
+};
 
-static void ecb_paes_exit(struct crypto_skcipher *tfm)
+static int ecb_paes_setkey(struct crypto_skcipher *tfm, const u8 *in_key,
+			   unsigned int key_len)
 {
 	struct s390_paes_ctx *ctx = crypto_skcipher_ctx(tfm);
-
-	_free_kb_keybuf(&ctx->kb);
-}
-
-static inline int __ecb_paes_set_key(struct s390_paes_ctx *ctx)
-{
-	unsigned long fc;
+	long fc;
 	int rc;
 
-	rc = __paes_convert_key(ctx);
+	/* set raw key into context */
+	rc = paes_ctx_setkey(ctx, in_key, key_len);
 	if (rc)
-		return rc;
+		goto out;
 
-	/* Pick the correct function code based on the protected key type */
-	fc = (ctx->pk.type == PKEY_KEYTYPE_AES_128) ? CPACF_KM_PAES_128 :
-		(ctx->pk.type == PKEY_KEYTYPE_AES_192) ? CPACF_KM_PAES_192 :
-		(ctx->pk.type == PKEY_KEYTYPE_AES_256) ? CPACF_KM_PAES_256 : 0;
+	/* convert key into protected key */
+	rc = paes_convert_key(ctx);
+	if (rc)
+		goto out;
 
-	/* Check if the function code is available */
+	/* Pick the correct function code based on the protected key type */
+	switch (ctx->pk.type) {
+	case PKEY_KEYTYPE_AES_128:
+		fc = CPACF_KM_PAES_128;
+		break;
+	case PKEY_KEYTYPE_AES_192:
+		fc = CPACF_KM_PAES_192;
+		break;
+	case PKEY_KEYTYPE_AES_256:
+		fc = CPACF_KM_PAES_256;
+		break;
+	default:
+		fc = 0;
+		break;
+	}
 	ctx->fc = (fc && cpacf_test_func(&km_functions, fc)) ? fc : 0;
 
-	return ctx->fc ? 0 : -EINVAL;
+	/* fail if the selected function code is not available (ctx->fc == 0) */
+	rc = ctx->fc ? 0 : -EINVAL;
+out:
+	pr_debug("rc=%d\n", rc);
+	return rc;
 }
 
-static int ecb_paes_set_key(struct crypto_skcipher *tfm, const u8 *in_key,
-			    unsigned int key_len)
+static int ecb_paes_do_crypt(struct s390_paes_ctx *ctx,
+			     struct s390_pecb_req_ctx *req_ctx,
+			     bool maysleep)
 {
-	struct s390_paes_ctx *ctx = crypto_skcipher_ctx(tfm);
-	int rc;
-
-	_free_kb_keybuf(&ctx->kb);
-	rc = _key_to_kb(&ctx->kb, in_key, key_len);
+	struct ecb_param *param = &req_ctx->param;
+	struct skcipher_walk *walk = &req_ctx->walk;
+	unsigned int nbytes, n, k;
+	int pk_state, rc = 0;
+
+	if (!req_ctx->param_init_done) {
+		/* fetch and check protected key state */
+		spin_lock_bh(&ctx->pk_lock);
+		pk_state = ctx->pk_state;
+		switch (pk_state) {
+		case PK_STATE_NO_KEY:
+			rc = -ENOKEY;
+			break;
+		case PK_STATE_CONVERT_IN_PROGRESS:
+			rc = -EKEYEXPIRED;
+			break;
+		case PK_STATE_VALID:
+			memcpy(param->key, ctx->pk.protkey, sizeof(param->key));
+			req_ctx->param_init_done = true;
+			break;
+		default:
+			rc = pk_state < 0 ? pk_state : -EIO;
+			break;
+		}
+		spin_unlock_bh(&ctx->pk_lock);
+	}
 	if (rc)
-		return rc;
+		goto out;
 
-	return __ecb_paes_set_key(ctx);
+	/*
+	 * Note that in case of partial processing or failure the walk
+	 * is NOT unmapped here. So a follow up task may reuse the walk
+	 * or in case of unrecoverable failure needs to unmap it.
+	 */
+	while ((nbytes = walk->nbytes) != 0) {
+		/* only use complete blocks */
+		n = nbytes & ~(AES_BLOCK_SIZE - 1);
+		k = cpacf_km(ctx->fc | req_ctx->modifier, param,
+			     walk->dst.virt.addr, walk->src.virt.addr, n);
+		if (k)
+			rc = skcipher_walk_done(walk, nbytes - k);
+		if (k < n) {
+			if (!maysleep) {
+				rc = -EKEYEXPIRED;
+				goto out;
+			}
+			rc = paes_convert_key(ctx);
+			if (rc)
+				goto out;
+			spin_lock_bh(&ctx->pk_lock);
+			memcpy(param->key, ctx->pk.protkey, sizeof(param->key));
+			spin_unlock_bh(&ctx->pk_lock);
+		}
+	}
+
+out:
+	pr_debug("rc=%d\n", rc);
+	return rc;
 }
 
 static int ecb_paes_crypt(struct skcipher_request *req, unsigned long modifier)
 {
+	struct s390_pecb_req_ctx *req_ctx = skcipher_request_ctx(req);
 	struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
 	struct s390_paes_ctx *ctx = crypto_skcipher_ctx(tfm);
-	struct {
-		u8 key[PAES_256_PROTKEY_SIZE];
-	} param;
-	struct skcipher_walk walk;
-	unsigned int nbytes, n, k;
+	struct skcipher_walk *walk = &req_ctx->walk;
 	int rc;
 
-	rc = skcipher_walk_virt(&walk, req, false);
+	/*
+	 * Attempt synchronous encryption first. If it fails, schedule the request
+	 * asynchronously via the crypto engine. To preserve execution order,
+	 * once a request is queued to the engine, further requests using the same
+	 * tfm will also be routed through the engine.
+	 */
+
+	rc = skcipher_walk_virt(walk, req, false);
 	if (rc)
-		return rc;
+		goto out;
 
-	spin_lock_bh(&ctx->pk_lock);
-	memcpy(param.key, ctx->pk.protkey, PAES_256_PROTKEY_SIZE);
-	spin_unlock_bh(&ctx->pk_lock);
+	req_ctx->modifier = modifier;
+	req_ctx->param_init_done = false;
 
-	while ((nbytes = walk.nbytes) != 0) {
-		/* only use complete blocks */
-		n = nbytes & ~(AES_BLOCK_SIZE - 1);
-		k = cpacf_km(ctx->fc | modifier, &param,
-			     walk.dst.virt.addr, walk.src.virt.addr, n);
-		if (k)
-			rc = skcipher_walk_done(&walk, nbytes - k);
-		if (k < n) {
-			if (__paes_convert_key(ctx))
-				return skcipher_walk_done(&walk, -EIO);
-			spin_lock_bh(&ctx->pk_lock);
-			memcpy(param.key, ctx->pk.protkey, PAES_256_PROTKEY_SIZE);
-			spin_unlock_bh(&ctx->pk_lock);
-		}
+	/* Try synchronous operation if no active engine usage */
+	if (!atomic_read(&ctx->via_engine_ctr)) {
+		rc = ecb_paes_do_crypt(ctx, req_ctx, false);
+		if (rc == 0)
+			goto out;
+	}
+
+	/*
+	 * If sync operation failed or key expired or there are already
+	 * requests enqueued via engine, fallback to async. Mark tfm as
+	 * using engine to serialize requests.
+	 */
+	if (rc == 0 || rc == -EKEYEXPIRED) {
+		atomic_inc(&ctx->via_engine_ctr);
+		rc = crypto_transfer_skcipher_request_to_engine(paes_crypto_engine, req);
+		if (rc != -EINPROGRESS)
+			atomic_dec(&ctx->via_engine_ctr);
 	}
+
+	if (rc != -EINPROGRESS)
+		skcipher_walk_done(walk, rc);
+
+out:
+	if (rc != -EINPROGRESS)
+		memzero_explicit(&req_ctx->param, sizeof(req_ctx->param));
+	pr_debug("rc=%d\n", rc);
 	return rc;
 }
 
@@ -310,112 +506,256 @@ static int ecb_paes_decrypt(struct skcipher_request *req)
 	return ecb_paes_crypt(req, CPACF_DECRYPT);
 }
 
-static struct skcipher_alg ecb_paes_alg = {
-	.base.cra_name		=	"ecb(paes)",
-	.base.cra_driver_name	=	"ecb-paes-s390",
-	.base.cra_priority	=	401,	/* combo: aes + ecb + 1 */
-	.base.cra_blocksize	=	AES_BLOCK_SIZE,
-	.base.cra_ctxsize	=	sizeof(struct s390_paes_ctx),
-	.base.cra_module	=	THIS_MODULE,
-	.base.cra_list		=	LIST_HEAD_INIT(ecb_paes_alg.base.cra_list),
-	.init			=	ecb_paes_init,
-	.exit			=	ecb_paes_exit,
-	.min_keysize		=	PAES_MIN_KEYSIZE,
-	.max_keysize		=	PAES_MAX_KEYSIZE,
-	.setkey			=	ecb_paes_set_key,
-	.encrypt		=	ecb_paes_encrypt,
-	.decrypt		=	ecb_paes_decrypt,
-};
-
-static int cbc_paes_init(struct crypto_skcipher *tfm)
+static int ecb_paes_init(struct crypto_skcipher *tfm)
 {
 	struct s390_paes_ctx *ctx = crypto_skcipher_ctx(tfm);
 
-	ctx->kb.key = NULL;
+	memset(ctx, 0, sizeof(*ctx));
 	spin_lock_init(&ctx->pk_lock);
 
+	crypto_skcipher_set_reqsize(tfm, sizeof(struct s390_pecb_req_ctx));
+
 	return 0;
 }
 
-static void cbc_paes_exit(struct crypto_skcipher *tfm)
+static void ecb_paes_exit(struct crypto_skcipher *tfm)
 {
 	struct s390_paes_ctx *ctx = crypto_skcipher_ctx(tfm);
 
-	_free_kb_keybuf(&ctx->kb);
+	memzero_explicit(ctx, sizeof(*ctx));
 }
 
-static inline int __cbc_paes_set_key(struct s390_paes_ctx *ctx)
+static int ecb_paes_do_one_request(struct crypto_engine *engine, void *areq)
 {
-	unsigned long fc;
+	struct skcipher_request *req = skcipher_request_cast(areq);
+	struct s390_pecb_req_ctx *req_ctx = skcipher_request_ctx(req);
+	struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+	struct s390_paes_ctx *ctx = crypto_skcipher_ctx(tfm);
+	struct skcipher_walk *walk = &req_ctx->walk;
 	int rc;
 
-	rc = __paes_convert_key(ctx);
-	if (rc)
-		return rc;
+	/* walk has already been prepared */
+
+	rc = ecb_paes_do_crypt(ctx, req_ctx, true);
+	if (rc == -EKEYEXPIRED) {
+		/*
+		 * Protected key expired, conversion is in process.
+		 * Trigger a re-schedule of this request by returning
+		 * -ENOSPC ("hardware queue is full") to the crypto engine.
+		 * To avoid immediate re-invocation of this callback,
+		 * tell the scheduler to voluntarily give up the CPU here.
+		 */
+		cond_resched();
+		pr_debug("rescheduling request\n");
+		return -ENOSPC;
+	} else if (rc) {
+		skcipher_walk_done(walk, rc);
+	}
 
-	/* Pick the correct function code based on the protected key type */
-	fc = (ctx->pk.type == PKEY_KEYTYPE_AES_128) ? CPACF_KMC_PAES_128 :
-		(ctx->pk.type == PKEY_KEYTYPE_AES_192) ? CPACF_KMC_PAES_192 :
-		(ctx->pk.type == PKEY_KEYTYPE_AES_256) ? CPACF_KMC_PAES_256 : 0;
+	memzero_explicit(&req_ctx->param, sizeof(req_ctx->param));
+	pr_debug("request complete with rc=%d\n", rc);
+	local_bh_disable();
+	atomic_dec(&ctx->via_engine_ctr);
+	crypto_finalize_skcipher_request(engine, req, rc);
+	local_bh_enable();
+	return rc;
+}
 
-	/* Check if the function code is available */
-	ctx->fc = (fc && cpacf_test_func(&kmc_functions, fc)) ? fc : 0;
+static struct skcipher_engine_alg ecb_paes_alg = {
+	.base = {
+		.base.cra_name	      = "ecb(paes)",
+		.base.cra_driver_name = "ecb-paes-s390",
+		.base.cra_priority    = 401,	/* combo: aes + ecb + 1 */
+		.base.cra_blocksize   = AES_BLOCK_SIZE,
+		.base.cra_ctxsize     = sizeof(struct s390_paes_ctx),
+		.base.cra_module      = THIS_MODULE,
+		.base.cra_list	      = LIST_HEAD_INIT(ecb_paes_alg.base.base.cra_list),
+		.init		      = ecb_paes_init,
+		.exit		      = ecb_paes_exit,
+		.min_keysize	      = PAES_MIN_KEYSIZE,
+		.max_keysize	      = PAES_MAX_KEYSIZE,
+		.setkey		      = ecb_paes_setkey,
+		.encrypt	      = ecb_paes_encrypt,
+		.decrypt	      = ecb_paes_decrypt,
+	},
+	.op = {
+		.do_one_request	      = ecb_paes_do_one_request,
+	},
+};
 
-	return ctx->fc ? 0 : -EINVAL;
-}
+/*
+ * PAES CBC implementation
+ */
+
+struct cbc_param {
+	u8 iv[AES_BLOCK_SIZE];
+	u8 key[PAES_256_PROTKEY_SIZE];
+} __packed;
+
+struct s390_pcbc_req_ctx {
+	unsigned long modifier;
+	struct skcipher_walk walk;
+	bool param_init_done;
+	struct cbc_param param;
+};
 
-static int cbc_paes_set_key(struct crypto_skcipher *tfm, const u8 *in_key,
-			    unsigned int key_len)
+static int cbc_paes_setkey(struct crypto_skcipher *tfm, const u8 *in_key,
+			   unsigned int key_len)
 {
 	struct s390_paes_ctx *ctx = crypto_skcipher_ctx(tfm);
+	long fc;
 	int rc;
 
-	_free_kb_keybuf(&ctx->kb);
-	rc = _key_to_kb(&ctx->kb, in_key, key_len);
+	/* set raw key into context */
+	rc = paes_ctx_setkey(ctx, in_key, key_len);
 	if (rc)
-		return rc;
+		goto out;
 
-	return __cbc_paes_set_key(ctx);
+	/* convert raw key into protected key */
+	rc = paes_convert_key(ctx);
+	if (rc)
+		goto out;
+
+	/* Pick the correct function code based on the protected key type */
+	switch (ctx->pk.type) {
+	case PKEY_KEYTYPE_AES_128:
+		fc = CPACF_KMC_PAES_128;
+		break;
+	case PKEY_KEYTYPE_AES_192:
+		fc = CPACF_KMC_PAES_192;
+		break;
+	case PKEY_KEYTYPE_AES_256:
+		fc = CPACF_KMC_PAES_256;
+		break;
+	default:
+		fc = 0;
+		break;
+	}
+	ctx->fc = (fc && cpacf_test_func(&kmc_functions, fc)) ? fc : 0;
+
+	/* fail if the selected function code is not available (ctx->fc == 0) */
+	rc = ctx->fc ? 0 : -EINVAL;
+out:
+	pr_debug("rc=%d\n", rc);
+	return rc;
 }
 
-static int cbc_paes_crypt(struct skcipher_request *req, unsigned long modifier)
+static int cbc_paes_do_crypt(struct s390_paes_ctx *ctx,
+			     struct s390_pcbc_req_ctx *req_ctx,
+			     bool maysleep)
 {
-	struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
-	struct s390_paes_ctx *ctx = crypto_skcipher_ctx(tfm);
-	struct {
-		u8 iv[AES_BLOCK_SIZE];
-		u8 key[PAES_256_PROTKEY_SIZE];
-	} param;
-	struct skcipher_walk walk;
+	struct cbc_param *param = &req_ctx->param;
+	struct skcipher_walk *walk = &req_ctx->walk;
 	unsigned int nbytes, n, k;
-	int rc;
-
-	rc = skcipher_walk_virt(&walk, req, false);
+	int pk_state, rc = 0;
+
+	if (!req_ctx->param_init_done) {
+		/* fetch and check protected key state */
+		spin_lock_bh(&ctx->pk_lock);
+		pk_state = ctx->pk_state;
+		switch (pk_state) {
+		case PK_STATE_NO_KEY:
+			rc = -ENOKEY;
+			break;
+		case PK_STATE_CONVERT_IN_PROGRESS:
+			rc = -EKEYEXPIRED;
+			break;
+		case PK_STATE_VALID:
+			memcpy(param->key, ctx->pk.protkey, sizeof(param->key));
+			req_ctx->param_init_done = true;
+			break;
+		default:
+			rc = pk_state < 0 ? pk_state : -EIO;
+			break;
+		}
+		spin_unlock_bh(&ctx->pk_lock);
+	}
 	if (rc)
-		return rc;
+		goto out;
 
-	memcpy(param.iv, walk.iv, AES_BLOCK_SIZE);
-	spin_lock_bh(&ctx->pk_lock);
-	memcpy(param.key, ctx->pk.protkey, PAES_256_PROTKEY_SIZE);
-	spin_unlock_bh(&ctx->pk_lock);
+	memcpy(param->iv, walk->iv, AES_BLOCK_SIZE);
 
-	while ((nbytes = walk.nbytes) != 0) {
+	/*
+	 * Note that in case of partial processing or failure the walk
+	 * is NOT unmapped here. So a follow up task may reuse the walk
+	 * or in case of unrecoverable failure needs to unmap it.
+	 */
+	while ((nbytes = walk->nbytes) != 0) {
 		/* only use complete blocks */
 		n = nbytes & ~(AES_BLOCK_SIZE - 1);
-		k = cpacf_kmc(ctx->fc | modifier, &param,
-			      walk.dst.virt.addr, walk.src.virt.addr, n);
+		k = cpacf_kmc(ctx->fc | req_ctx->modifier, param,
+			      walk->dst.virt.addr, walk->src.virt.addr, n);
 		if (k) {
-			memcpy(walk.iv, param.iv, AES_BLOCK_SIZE);
-			rc = skcipher_walk_done(&walk, nbytes - k);
+			memcpy(walk->iv, param->iv, AES_BLOCK_SIZE);
+			rc = skcipher_walk_done(walk, nbytes - k);
 		}
 		if (k < n) {
-			if (__paes_convert_key(ctx))
-				return skcipher_walk_done(&walk, -EIO);
+			if (!maysleep) {
+				rc = -EKEYEXPIRED;
+				goto out;
+			}
+			rc = paes_convert_key(ctx);
+			if (rc)
+				goto out;
 			spin_lock_bh(&ctx->pk_lock);
-			memcpy(param.key, ctx->pk.protkey, PAES_256_PROTKEY_SIZE);
+			memcpy(param->key, ctx->pk.protkey, sizeof(param->key));
 			spin_unlock_bh(&ctx->pk_lock);
 		}
 	}
+
+out:
+	pr_debug("rc=%d\n", rc);
+	return rc;
+}
+
+static int cbc_paes_crypt(struct skcipher_request *req, unsigned long modifier)
+{
+	struct s390_pcbc_req_ctx *req_ctx = skcipher_request_ctx(req);
+	struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+	struct s390_paes_ctx *ctx = crypto_skcipher_ctx(tfm);
+	struct skcipher_walk *walk = &req_ctx->walk;
+	int rc;
+
+	/*
+	 * Attempt synchronous encryption first. If it fails, schedule the request
+	 * asynchronously via the crypto engine. To preserve execution order,
+	 * once a request is queued to the engine, further requests using the same
+	 * tfm will also be routed through the engine.
+	 */
+
+	rc = skcipher_walk_virt(walk, req, false);
+	if (rc)
+		goto out;
+
+	req_ctx->modifier = modifier;
+	req_ctx->param_init_done = false;
+
+	/* Try synchronous operation if no active engine usage */
+	if (!atomic_read(&ctx->via_engine_ctr)) {
+		rc = cbc_paes_do_crypt(ctx, req_ctx, false);
+		if (rc == 0)
+			goto out;
+	}
+
+	/*
+	 * If sync operation failed or key expired or there are already
+	 * requests enqueued via engine, fallback to async. Mark tfm as
+	 * using engine to serialize requests.
+	 */
+	if (rc == 0 || rc == -EKEYEXPIRED) {
+		atomic_inc(&ctx->via_engine_ctr);
+		rc = crypto_transfer_skcipher_request_to_engine(paes_crypto_engine, req);
+		if (rc != -EINPROGRESS)
+			atomic_dec(&ctx->via_engine_ctr);
+	}
+
+	if (rc != -EINPROGRESS)
+		skcipher_walk_done(walk, rc);
+
+out:
+	if (rc != -EINPROGRESS)
+		memzero_explicit(&req_ctx->param, sizeof(req_ctx->param));
+	pr_debug("rc=%d\n", rc);
 	return rc;
 }
 
@@ -429,496 +769,882 @@ static int cbc_paes_decrypt(struct skcipher_request *req)
 	return cbc_paes_crypt(req, CPACF_DECRYPT);
 }
 
-static struct skcipher_alg cbc_paes_alg = {
-	.base.cra_name		=	"cbc(paes)",
-	.base.cra_driver_name	=	"cbc-paes-s390",
-	.base.cra_priority	=	402,	/* ecb-paes-s390 + 1 */
-	.base.cra_blocksize	=	AES_BLOCK_SIZE,
-	.base.cra_ctxsize	=	sizeof(struct s390_paes_ctx),
-	.base.cra_module	=	THIS_MODULE,
-	.base.cra_list		=	LIST_HEAD_INIT(cbc_paes_alg.base.cra_list),
-	.init			=	cbc_paes_init,
-	.exit			=	cbc_paes_exit,
-	.min_keysize		=	PAES_MIN_KEYSIZE,
-	.max_keysize		=	PAES_MAX_KEYSIZE,
-	.ivsize			=	AES_BLOCK_SIZE,
-	.setkey			=	cbc_paes_set_key,
-	.encrypt		=	cbc_paes_encrypt,
-	.decrypt		=	cbc_paes_decrypt,
-};
-
-static int xts_paes_init(struct crypto_skcipher *tfm)
+static int cbc_paes_init(struct crypto_skcipher *tfm)
 {
-	struct s390_pxts_ctx *ctx = crypto_skcipher_ctx(tfm);
+	struct s390_paes_ctx *ctx = crypto_skcipher_ctx(tfm);
 
-	ctx->kb.key = NULL;
+	memset(ctx, 0, sizeof(*ctx));
 	spin_lock_init(&ctx->pk_lock);
 
+	crypto_skcipher_set_reqsize(tfm, sizeof(struct s390_pcbc_req_ctx));
+
 	return 0;
 }
 
-static void xts_paes_exit(struct crypto_skcipher *tfm)
+static void cbc_paes_exit(struct crypto_skcipher *tfm)
 {
-	struct s390_pxts_ctx *ctx = crypto_skcipher_ctx(tfm);
+	struct s390_paes_ctx *ctx = crypto_skcipher_ctx(tfm);
 
-	_free_kb_keybuf(&ctx->kb);
+	memzero_explicit(ctx, sizeof(*ctx));
 }
 
-static inline int __xts_paes_convert_key(struct s390_pxts_ctx *ctx)
+static int cbc_paes_do_one_request(struct crypto_engine *engine, void *areq)
 {
-	struct paes_protkey pk0, pk1;
-	size_t split_keylen;
+	struct skcipher_request *req = skcipher_request_cast(areq);
+	struct s390_pcbc_req_ctx *req_ctx = skcipher_request_ctx(req);
+	struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+	struct s390_paes_ctx *ctx = crypto_skcipher_ctx(tfm);
+	struct skcipher_walk *walk = &req_ctx->walk;
 	int rc;
 
-	pk0.len = sizeof(pk0.protkey);
-	pk1.len = sizeof(pk1.protkey);
-
-	rc = __paes_keyblob2pkey(ctx->kb.key, ctx->kb.keylen, &pk0);
-	if (rc)
-		return rc;
+	/* walk has already been prepared */
+
+	rc = cbc_paes_do_crypt(ctx, req_ctx, true);
+	if (rc == -EKEYEXPIRED) {
+		/*
+		 * Protected key expired, conversion is in process.
+		 * Trigger a re-schedule of this request by returning
+		 * -ENOSPC ("hardware queue is full") to the crypto engine.
+		 * To avoid immediate re-invocation of this callback,
+		 * tell the scheduler to voluntarily give up the CPU here.
+		 */
+		cond_resched();
+		pr_debug("rescheduling request\n");
+		return -ENOSPC;
+	} else if (rc) {
+		skcipher_walk_done(walk, rc);
+	}
 
-	switch (pk0.type) {
-	case PKEY_KEYTYPE_AES_128:
-	case PKEY_KEYTYPE_AES_256:
-		/* second keytoken required */
-		if (ctx->kb.keylen % 2)
-			return -EINVAL;
-		split_keylen = ctx->kb.keylen / 2;
+	memzero_explicit(&req_ctx->param, sizeof(req_ctx->param));
+	pr_debug("request complete with rc=%d\n", rc);
+	local_bh_disable();
+	atomic_dec(&ctx->via_engine_ctr);
+	crypto_finalize_skcipher_request(engine, req, rc);
+	local_bh_enable();
+	return rc;
+}
 
-		rc = __paes_keyblob2pkey(ctx->kb.key + split_keylen,
-					 split_keylen, &pk1);
-		if (rc)
-			return rc;
+static struct skcipher_engine_alg cbc_paes_alg = {
+	.base = {
+		.base.cra_name	      = "cbc(paes)",
+		.base.cra_driver_name = "cbc-paes-s390",
+		.base.cra_priority    = 402,	/* ecb-paes-s390 + 1 */
+		.base.cra_blocksize   = AES_BLOCK_SIZE,
+		.base.cra_ctxsize     = sizeof(struct s390_paes_ctx),
+		.base.cra_module      = THIS_MODULE,
+		.base.cra_list	      = LIST_HEAD_INIT(cbc_paes_alg.base.base.cra_list),
+		.init		      = cbc_paes_init,
+		.exit		      = cbc_paes_exit,
+		.min_keysize	      = PAES_MIN_KEYSIZE,
+		.max_keysize	      = PAES_MAX_KEYSIZE,
+		.ivsize		      = AES_BLOCK_SIZE,
+		.setkey		      = cbc_paes_setkey,
+		.encrypt	      = cbc_paes_encrypt,
+		.decrypt	      = cbc_paes_decrypt,
+	},
+	.op = {
+		.do_one_request	      = cbc_paes_do_one_request,
+	},
+};
 
-		if (pk0.type != pk1.type)
-			return -EINVAL;
-		break;
-	case PKEY_KEYTYPE_AES_XTS_128:
-	case PKEY_KEYTYPE_AES_XTS_256:
-		/* single key */
-		pk1.type = 0;
-		break;
-	default:
-		/* unsupported protected keytype */
-		return -EINVAL;
-	}
+/*
+ * PAES CTR implementation
+ */
 
-	spin_lock_bh(&ctx->pk_lock);
-	ctx->pk[0] = pk0;
-	ctx->pk[1] = pk1;
-	spin_unlock_bh(&ctx->pk_lock);
+struct ctr_param {
+	u8 key[PAES_256_PROTKEY_SIZE];
+} __packed;
 
-	return 0;
-}
+struct s390_pctr_req_ctx {
+	unsigned long modifier;
+	struct skcipher_walk walk;
+	bool param_init_done;
+	struct ctr_param param;
+};
 
-static inline int __xts_paes_set_key(struct s390_pxts_ctx *ctx)
+static int ctr_paes_setkey(struct crypto_skcipher *tfm, const u8 *in_key,
+			   unsigned int key_len)
 {
-	unsigned long fc;
+	struct s390_paes_ctx *ctx = crypto_skcipher_ctx(tfm);
+	long fc;
 	int rc;
 
-	rc = __xts_paes_convert_key(ctx);
+	/* set raw key into context */
+	rc = paes_ctx_setkey(ctx, in_key, key_len);
 	if (rc)
-		return rc;
+		goto out;
+
+	/* convert raw key into protected key */
+	rc = paes_convert_key(ctx);
+	if (rc)
+		goto out;
 
 	/* Pick the correct function code based on the protected key type */
-	switch (ctx->pk[0].type) {
+	switch (ctx->pk.type) {
 	case PKEY_KEYTYPE_AES_128:
-		fc = CPACF_KM_PXTS_128;
-		break;
-	case PKEY_KEYTYPE_AES_256:
-		fc = CPACF_KM_PXTS_256;
+		fc = CPACF_KMCTR_PAES_128;
 		break;
-	case PKEY_KEYTYPE_AES_XTS_128:
-		fc = CPACF_KM_PXTS_128_FULL;
+	case PKEY_KEYTYPE_AES_192:
+		fc = CPACF_KMCTR_PAES_192;
 		break;
-	case PKEY_KEYTYPE_AES_XTS_256:
-		fc = CPACF_KM_PXTS_256_FULL;
+	case PKEY_KEYTYPE_AES_256:
+		fc = CPACF_KMCTR_PAES_256;
 		break;
 	default:
 		fc = 0;
 		break;
 	}
+	ctx->fc = (fc && cpacf_test_func(&kmctr_functions, fc)) ? fc : 0;
 
-	/* Check if the function code is available */
-	ctx->fc = (fc && cpacf_test_func(&km_functions, fc)) ? fc : 0;
+	rc = ctx->fc ? 0 : -EINVAL;	/* also fail if CPACF lacks this fc */
+
+out:
+	pr_debug("rc=%d\n", rc);
+	return rc;
+}
+
+static inline unsigned int __ctrblk_init(u8 *ctrptr, u8 *iv, unsigned int nbytes)
+{
+	unsigned int i, n;
+
+	/* only use complete blocks, max. PAGE_SIZE */
+	memcpy(ctrptr, iv, AES_BLOCK_SIZE);
+	n = (nbytes > PAGE_SIZE) ? PAGE_SIZE : nbytes & ~(AES_BLOCK_SIZE - 1);
+	for (i = (n / AES_BLOCK_SIZE) - 1; i > 0; i--) {
+		memcpy(ctrptr + AES_BLOCK_SIZE, ctrptr, AES_BLOCK_SIZE);
+		crypto_inc(ctrptr + AES_BLOCK_SIZE, AES_BLOCK_SIZE);
+		ctrptr += AES_BLOCK_SIZE;
+	}
+	return n;
+}
+
+static int ctr_paes_do_crypt(struct s390_paes_ctx *ctx,
+			     struct s390_pctr_req_ctx *req_ctx,
+			     bool maysleep)
+{
+	struct ctr_param *param = &req_ctx->param;
+	struct skcipher_walk *walk = &req_ctx->walk;
+	u8 buf[AES_BLOCK_SIZE], *ctrptr;
+	unsigned int nbytes, n, k;
+	int pk_state, locked, rc = 0;
+
+	if (!req_ctx->param_init_done) {
+		/* fetch and check protected key state */
+		spin_lock_bh(&ctx->pk_lock);
+		pk_state = ctx->pk_state;
+		switch (pk_state) {
+		case PK_STATE_NO_KEY:
+			rc = -ENOKEY;
+			break;
+		case PK_STATE_CONVERT_IN_PROGRESS:
+			rc = -EKEYEXPIRED;
+			break;
+		case PK_STATE_VALID:
+			memcpy(param->key, ctx->pk.protkey, sizeof(param->key));
+			req_ctx->param_init_done = true;
+			break;
+		default:
+			rc = pk_state < 0 ? pk_state : -EIO;
+			break;
+		}
+		spin_unlock_bh(&ctx->pk_lock);
+	}
+	if (rc)
+		goto out;
+
+	locked = mutex_trylock(&ctrblk_lock);
+
+	/*
+	 * Note that in case of partial processing or failure the walk
+	 * is NOT unmapped here. So a follow up task may reuse the walk
+	 * or in case of unrecoverable failure needs to unmap it.
+	 */
+	while ((nbytes = walk->nbytes) >= AES_BLOCK_SIZE) {
+		n = AES_BLOCK_SIZE;
+		if (nbytes >= 2 * AES_BLOCK_SIZE && locked)
+			n = __ctrblk_init(ctrblk, walk->iv, nbytes);
+		ctrptr = (n > AES_BLOCK_SIZE) ? ctrblk : walk->iv;
+		k = cpacf_kmctr(ctx->fc, param, walk->dst.virt.addr,
+				walk->src.virt.addr, n, ctrptr);
+		if (k) {
+			if (ctrptr == ctrblk)
+				memcpy(walk->iv, ctrptr + k - AES_BLOCK_SIZE,
+				       AES_BLOCK_SIZE);
+			crypto_inc(walk->iv, AES_BLOCK_SIZE);
+			rc = skcipher_walk_done(walk, nbytes - k);
+		}
+		if (k < n) {
+			if (!maysleep) {
+				if (locked)
+					mutex_unlock(&ctrblk_lock);
+				rc = -EKEYEXPIRED;
+				goto out;
+			}
+			rc = paes_convert_key(ctx);
+			if (rc) {
+				if (locked)
+					mutex_unlock(&ctrblk_lock);
+				goto out;
+			}
+			spin_lock_bh(&ctx->pk_lock);
+			memcpy(param->key, ctx->pk.protkey, sizeof(param->key));
+			spin_unlock_bh(&ctx->pk_lock);
+		}
+	}
+	if (locked)
+		mutex_unlock(&ctrblk_lock);
+
+	/* final block may be < AES_BLOCK_SIZE, copy only nbytes */
+	if (nbytes) {
+		memset(buf, 0, AES_BLOCK_SIZE);
+		memcpy(buf, walk->src.virt.addr, nbytes);
+		while (1) {
+			if (cpacf_kmctr(ctx->fc, param, buf,
+					buf, AES_BLOCK_SIZE,
+					walk->iv) == AES_BLOCK_SIZE)
+				break;
+			if (!maysleep) {
+				rc = -EKEYEXPIRED;
+				goto out;
+			}
+			rc = paes_convert_key(ctx);
+			if (rc)
+				goto out;
+			spin_lock_bh(&ctx->pk_lock);
+			memcpy(param->key, ctx->pk.protkey, sizeof(param->key));
+			spin_unlock_bh(&ctx->pk_lock);
+		}
+		memcpy(walk->dst.virt.addr, buf, nbytes);
+		crypto_inc(walk->iv, AES_BLOCK_SIZE);
+		rc = skcipher_walk_done(walk, 0);
+	}
+
+out:
+	pr_debug("rc=%d\n", rc);
+	return rc;
+}
+
+static int ctr_paes_crypt(struct skcipher_request *req)
+{
+	struct s390_pctr_req_ctx *req_ctx = skcipher_request_ctx(req);
+	struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+	struct s390_paes_ctx *ctx = crypto_skcipher_ctx(tfm);
+	struct skcipher_walk *walk = &req_ctx->walk;
+	int rc;
+
+	/*
+	 * Attempt synchronous encryption first. If it fails, schedule the request
+	 * asynchronously via the crypto engine. To preserve execution order,
+	 * once a request is queued to the engine, further requests using the same
+	 * tfm will also be routed through the engine.
+	 */
+
+	rc = skcipher_walk_virt(walk, req, false);
+	if (rc)
+		goto out;
+
+	req_ctx->param_init_done = false;
+
+	/* Try synchronous operation if no active engine usage */
+	if (!atomic_read(&ctx->via_engine_ctr)) {
+		rc = ctr_paes_do_crypt(ctx, req_ctx, false);
+		if (rc == 0)
+			goto out;
+	}
+
+	/*
+	 * If sync operation failed or key expired or there are already
+	 * requests enqueued via engine, fallback to async. Mark tfm as
+	 * using engine to serialize requests.
+	 */
+	if (rc == 0 || rc == -EKEYEXPIRED) {
+		atomic_inc(&ctx->via_engine_ctr);
+		rc = crypto_transfer_skcipher_request_to_engine(paes_crypto_engine, req);
+		if (rc != -EINPROGRESS)
+			atomic_dec(&ctx->via_engine_ctr);
+	}
+
+	if (rc != -EINPROGRESS)
+		skcipher_walk_done(walk, rc);
+
+out:
+	if (rc != -EINPROGRESS)
+		memzero_explicit(&req_ctx->param, sizeof(req_ctx->param));
+	pr_debug("rc=%d\n", rc);
+	return rc;
+}
+
+static int ctr_paes_init(struct crypto_skcipher *tfm)
+{
+	struct s390_paes_ctx *ctx = crypto_skcipher_ctx(tfm);
+
+	memset(ctx, 0, sizeof(*ctx));
+	spin_lock_init(&ctx->pk_lock);
+
+	crypto_skcipher_set_reqsize(tfm, sizeof(struct s390_pctr_req_ctx));
+
+	return 0;
+}
+
+static void ctr_paes_exit(struct crypto_skcipher *tfm)
+{
+	struct s390_paes_ctx *ctx = crypto_skcipher_ctx(tfm);
+
+	memzero_explicit(ctx, sizeof(*ctx));
+}
+
+static int ctr_paes_do_one_request(struct crypto_engine *engine, void *areq)
+{
+	struct skcipher_request *req = skcipher_request_cast(areq);
+	struct s390_pctr_req_ctx *req_ctx = skcipher_request_ctx(req);
+	struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+	struct s390_paes_ctx *ctx = crypto_skcipher_ctx(tfm);
+	struct skcipher_walk *walk = &req_ctx->walk;
+	int rc;
 
-	return ctx->fc ? 0 : -EINVAL;
+	/* walk has already been prepared */
+
+	rc = ctr_paes_do_crypt(ctx, req_ctx, true);
+	if (rc == -EKEYEXPIRED) {
+		/*
+		 * Protected key expired, conversion is in process.
+		 * Trigger a re-schedule of this request by returning
+		 * -ENOSPC ("hardware queue is full") to the crypto engine.
+		 * To avoid immediate re-invocation of this callback,
+		 * tell the scheduler to voluntarily give up the CPU here.
+		 */
+		cond_resched();
+		pr_debug("rescheduling request\n");
+		return -ENOSPC;
+	} else if (rc) {
+		skcipher_walk_done(walk, rc);
+	}
+
+	memzero_explicit(&req_ctx->param, sizeof(req_ctx->param));
+	pr_debug("request complete with rc=%d\n", rc);
+	local_bh_disable();
+	atomic_dec(&ctx->via_engine_ctr);
+	crypto_finalize_skcipher_request(engine, req, rc);
+	local_bh_enable();
+	return rc;
 }
 
-static int xts_paes_set_key(struct crypto_skcipher *tfm, const u8 *in_key,
-			    unsigned int in_keylen)
+static struct skcipher_engine_alg ctr_paes_alg = {
+	.base = {
+		.base.cra_name	      =	"ctr(paes)",
+		.base.cra_driver_name =	"ctr-paes-s390",
+		.base.cra_priority    =	402,	/* ecb-paes-s390 + 1 */
+		.base.cra_blocksize   =	1,
+		.base.cra_ctxsize     =	sizeof(struct s390_paes_ctx),
+		.base.cra_module      =	THIS_MODULE,
+		.base.cra_list	      =	LIST_HEAD_INIT(ctr_paes_alg.base.base.cra_list),
+		.init		      =	ctr_paes_init,
+		.exit		      =	ctr_paes_exit,
+		.min_keysize	      =	PAES_MIN_KEYSIZE,
+		.max_keysize	      =	PAES_MAX_KEYSIZE,
+		.ivsize		      =	AES_BLOCK_SIZE,
+		.setkey		      =	ctr_paes_setkey,
+		.encrypt	      =	ctr_paes_crypt,
+		.decrypt	      =	ctr_paes_crypt,
+		.chunksize	      =	AES_BLOCK_SIZE,
+	},
+	.op = {
+		.do_one_request	      = ctr_paes_do_one_request,
+	},
+};
+
+/*
+ * PAES XTS implementation
+ */
+
+struct xts_full_km_param {
+	u8 key[64];
+	u8 tweak[16];
+	u8 nap[16];
+	u8 wkvp[32];
+} __packed;
+
+struct xts_km_param {
+	u8 key[PAES_256_PROTKEY_SIZE];
+	u8 init[16];
+} __packed;
+
+struct xts_pcc_param {
+	u8 key[PAES_256_PROTKEY_SIZE];
+	u8 tweak[16];
+	u8 block[16];
+	u8 bit[16];
+	u8 xts[16];
+} __packed;
+
+struct s390_pxts_req_ctx {
+	unsigned long modifier;
+	struct skcipher_walk walk;
+	bool param_init_done;
+	union {
+		struct xts_full_km_param full_km_param;
+		struct xts_km_param km_param;
+	} param;
+};
+
+static int xts_paes_setkey(struct crypto_skcipher *tfm, const u8 *in_key,
+			   unsigned int in_keylen)
 {
 	struct s390_pxts_ctx *ctx = crypto_skcipher_ctx(tfm);
 	u8 ckey[2 * AES_MAX_KEY_SIZE];
 	unsigned int ckey_len;
+	long fc;
 	int rc;
 
 	if ((in_keylen == 32 || in_keylen == 64) &&
 	    xts_verify_key(tfm, in_key, in_keylen))
 		return -EINVAL;
 
-	_free_kb_keybuf(&ctx->kb);
-	rc = _xts_key_to_kb(&ctx->kb, in_key, in_keylen);
+	/* set raw key into context */
+	rc = pxts_ctx_setkey(ctx, in_key, in_keylen);
 	if (rc)
-		return rc;
+		goto out;
 
-	rc = __xts_paes_set_key(ctx);
+	/* convert raw key(s) into protected key(s) */
+	rc = pxts_convert_key(ctx);
 	if (rc)
-		return rc;
+		goto out;
 
 	/*
-	 * It is not possible on a single protected key (e.g. full AES-XTS) to
-	 * check, if k1 and k2 are the same.
-	 */
-	if (ctx->pk[0].type == PKEY_KEYTYPE_AES_XTS_128 ||
-	    ctx->pk[0].type == PKEY_KEYTYPE_AES_XTS_256)
-		return 0;
-	/*
 	 * xts_verify_key verifies the key length is not odd and makes
 	 * sure that the two keys are not the same. This can be done
-	 * on the two protected keys as well
+	 * on the two protected keys as well - but not for full xts keys.
 	 */
-	ckey_len = (ctx->pk[0].type == PKEY_KEYTYPE_AES_128) ?
-		AES_KEYSIZE_128 : AES_KEYSIZE_256;
-	memcpy(ckey, ctx->pk[0].protkey, ckey_len);
-	memcpy(ckey + ckey_len, ctx->pk[1].protkey, ckey_len);
-	return xts_verify_key(tfm, ckey, 2*ckey_len);
+	if (ctx->pk[0].type == PKEY_KEYTYPE_AES_128 ||
+	    ctx->pk[0].type == PKEY_KEYTYPE_AES_256) {
+		ckey_len = (ctx->pk[0].type == PKEY_KEYTYPE_AES_128) ?
+			AES_KEYSIZE_128 : AES_KEYSIZE_256;
+		memcpy(ckey, ctx->pk[0].protkey, ckey_len);
+		memcpy(ckey + ckey_len, ctx->pk[1].protkey, ckey_len);
+		rc = xts_verify_key(tfm, ckey, 2 * ckey_len);
+		memzero_explicit(ckey, sizeof(ckey));
+		if (rc)
+			goto out;
+	}
+
+	/* Pick the correct function code based on the protected key type */
+	switch (ctx->pk[0].type) {
+	case PKEY_KEYTYPE_AES_128:
+		fc = CPACF_KM_PXTS_128;
+		break;
+	case PKEY_KEYTYPE_AES_256:
+		fc = CPACF_KM_PXTS_256;
+		break;
+	case PKEY_KEYTYPE_AES_XTS_128:
+		fc = CPACF_KM_PXTS_128_FULL;
+		break;
+	case PKEY_KEYTYPE_AES_XTS_256:
+		fc = CPACF_KM_PXTS_256_FULL;
+		break;
+	default:
+		fc = 0;
+		break;
+	}
+	ctx->fc = (fc && cpacf_test_func(&km_functions, fc)) ? fc : 0;
+
+	rc = ctx->fc ? 0 : -EINVAL;	/* also fail if CPACF lacks this fc */
+
+out:
+	pr_debug("rc=%d\n", rc);
+	return rc;
 }
 
-static int paes_xts_crypt_full(struct skcipher_request *req,
-			       unsigned long modifier)
+static int xts_paes_do_crypt_fullkey(struct s390_pxts_ctx *ctx,
+				     struct s390_pxts_req_ctx *req_ctx,
+				     bool maysleep)
 {
-	struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
-	struct s390_pxts_ctx *ctx = crypto_skcipher_ctx(tfm);
+	struct xts_full_km_param *param = &req_ctx->param.full_km_param;
+	struct skcipher_walk *walk = &req_ctx->walk;
 	unsigned int keylen, offset, nbytes, n, k;
-	struct {
-		u8 key[64];
-		u8 tweak[16];
-		u8 nap[16];
-		u8 wkvp[32];
-	} fxts_param = {
-		.nap = {0},
-	};
-	struct skcipher_walk walk;
-	int rc;
+	int rc = 0;
 
-	rc = skcipher_walk_virt(&walk, req, false);
-	if (rc)
-		return rc;
+	/*
+	 * The calling function xts_paes_do_crypt() ensures the
+	 * protected key state is always PK_STATE_VALID when this
+	 * function is invoked.
+	 */
 
 	keylen = (ctx->pk[0].type == PKEY_KEYTYPE_AES_XTS_128) ? 32 : 64;
 	offset = (ctx->pk[0].type == PKEY_KEYTYPE_AES_XTS_128) ? 32 : 0;
 
-	spin_lock_bh(&ctx->pk_lock);
-	memcpy(fxts_param.key + offset, ctx->pk[0].protkey, keylen);
-	memcpy(fxts_param.wkvp, ctx->pk[0].protkey + keylen,
-	       sizeof(fxts_param.wkvp));
-	spin_unlock_bh(&ctx->pk_lock);
-	memcpy(fxts_param.tweak, walk.iv, sizeof(fxts_param.tweak));
-	fxts_param.nap[0] = 0x01; /* initial alpha power (1, little-endian) */
+	if (!req_ctx->param_init_done) {
+		memset(param, 0, sizeof(*param));
+		spin_lock_bh(&ctx->pk_lock);
+		memcpy(param->key + offset, ctx->pk[0].protkey, keylen);
+		memcpy(param->wkvp, ctx->pk[0].protkey + keylen, sizeof(param->wkvp));
+		spin_unlock_bh(&ctx->pk_lock);
+		memcpy(param->tweak, walk->iv, sizeof(param->tweak));
+		param->nap[0] = 0x01; /* initial alpha power (1, little-endian) */
+		req_ctx->param_init_done = true;
+	}
 
-	while ((nbytes = walk.nbytes) != 0) {
+	/*
+	 * Note that in case of partial processing or failure the walk
+	 * is NOT unmapped here. So a follow up task may reuse the walk
+	 * or in case of unrecoverable failure needs to unmap it.
+	 */
+	while ((nbytes = walk->nbytes) != 0) {
 		/* only use complete blocks */
 		n = nbytes & ~(AES_BLOCK_SIZE - 1);
-		k = cpacf_km(ctx->fc | modifier, fxts_param.key + offset,
-			     walk.dst.virt.addr, walk.src.virt.addr, n);
+		k = cpacf_km(ctx->fc | req_ctx->modifier, param->key + offset,
+			     walk->dst.virt.addr, walk->src.virt.addr, n);
 		if (k)
-			rc = skcipher_walk_done(&walk, nbytes - k);
+			rc = skcipher_walk_done(walk, nbytes - k);
 		if (k < n) {
-			if (__xts_paes_convert_key(ctx))
-				return skcipher_walk_done(&walk, -EIO);
+			if (!maysleep) {
+				rc = -EKEYEXPIRED;
+				goto out;
+			}
+			rc = pxts_convert_key(ctx);
+			if (rc)
+				goto out;
 			spin_lock_bh(&ctx->pk_lock);
-			memcpy(fxts_param.key + offset, ctx->pk[0].protkey,
-			       keylen);
-			memcpy(fxts_param.wkvp, ctx->pk[0].protkey + keylen,
-			       sizeof(fxts_param.wkvp));
+			memcpy(param->key + offset, ctx->pk[0].protkey, keylen);
+			memcpy(param->wkvp, ctx->pk[0].protkey + keylen, sizeof(param->wkvp));
 			spin_unlock_bh(&ctx->pk_lock);
 		}
 	}
 
+out:
+	pr_debug("rc=%d\n", rc);
 	return rc;
 }
 
-static int paes_xts_crypt(struct skcipher_request *req, unsigned long modifier)
+static inline int __xts_2keys_prep_param(struct s390_pxts_ctx *ctx,
+					 struct xts_km_param *param,
+					 struct skcipher_walk *walk,
+					 unsigned int keylen,
+					 unsigned int offset, bool maysleep)
 {
-	struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
-	struct s390_pxts_ctx *ctx = crypto_skcipher_ctx(tfm);
+	struct xts_pcc_param pcc_param;
+	unsigned long cc = 1;
+	int rc = 0;
+
+	while (cc) {
+		memset(&pcc_param, 0, sizeof(pcc_param));
+		memcpy(pcc_param.tweak, walk->iv, sizeof(pcc_param.tweak));
+		spin_lock_bh(&ctx->pk_lock);
+		memcpy(pcc_param.key + offset, ctx->pk[1].protkey, keylen);
+		memcpy(param->key + offset, ctx->pk[0].protkey, keylen);
+		spin_unlock_bh(&ctx->pk_lock);
+		cc = cpacf_pcc(ctx->fc, pcc_param.key + offset);
+		if (cc) {
+			if (!maysleep) {
+				rc = -EKEYEXPIRED;
+				break;
+			}
+			rc = pxts_convert_key(ctx);
+			if (rc)
+				break;
+			continue;
+		}
+		memcpy(param->init, pcc_param.xts, 16);
+	}
+
+	memzero_explicit(pcc_param.key, sizeof(pcc_param.key));
+	return rc;
+}
+
+static int xts_paes_do_crypt_2keys(struct s390_pxts_ctx *ctx,
+				   struct s390_pxts_req_ctx *req_ctx,
+				   bool maysleep)
+{
+	struct xts_km_param *param = &req_ctx->param.km_param;
+	struct skcipher_walk *walk = &req_ctx->walk;
 	unsigned int keylen, offset, nbytes, n, k;
-	struct {
-		u8 key[PAES_256_PROTKEY_SIZE];
-		u8 tweak[16];
-		u8 block[16];
-		u8 bit[16];
-		u8 xts[16];
-	} pcc_param;
-	struct {
-		u8 key[PAES_256_PROTKEY_SIZE];
-		u8 init[16];
-	} xts_param;
-	struct skcipher_walk walk;
-	int rc;
+	int rc = 0;
 
-	rc = skcipher_walk_virt(&walk, req, false);
-	if (rc)
-		return rc;
+	/*
+	 * The calling function xts_paes_do_crypt() ensures the
+	 * protected key state is always PK_STATE_VALID when this
+	 * function is invoked.
+	 */
 
 	keylen = (ctx->pk[0].type == PKEY_KEYTYPE_AES_128) ? 48 : 64;
 	offset = (ctx->pk[0].type == PKEY_KEYTYPE_AES_128) ? 16 : 0;
 
-	memset(&pcc_param, 0, sizeof(pcc_param));
-	memcpy(pcc_param.tweak, walk.iv, sizeof(pcc_param.tweak));
-	spin_lock_bh(&ctx->pk_lock);
-	memcpy(pcc_param.key + offset, ctx->pk[1].protkey, keylen);
-	memcpy(xts_param.key + offset, ctx->pk[0].protkey, keylen);
-	spin_unlock_bh(&ctx->pk_lock);
-	cpacf_pcc(ctx->fc, pcc_param.key + offset);
-	memcpy(xts_param.init, pcc_param.xts, 16);
+	if (!req_ctx->param_init_done) {
+		rc = __xts_2keys_prep_param(ctx, param, walk,
+					    keylen, offset, maysleep);
+		if (rc)
+			goto out;
+		req_ctx->param_init_done = true;
+	}
 
-	while ((nbytes = walk.nbytes) != 0) {
+	/*
+	 * Note that in case of partial processing or failure the walk
+	 * is NOT unmapped here. So a follow up task may reuse the walk
+	 * or in case of unrecoverable failure needs to unmap it.
+	 */
+	while ((nbytes = walk->nbytes) != 0) {
 		/* only use complete blocks */
 		n = nbytes & ~(AES_BLOCK_SIZE - 1);
-		k = cpacf_km(ctx->fc | modifier, xts_param.key + offset,
-			     walk.dst.virt.addr, walk.src.virt.addr, n);
+		k = cpacf_km(ctx->fc | req_ctx->modifier, param->key + offset,
+			     walk->dst.virt.addr, walk->src.virt.addr, n);
 		if (k)
-			rc = skcipher_walk_done(&walk, nbytes - k);
+			rc = skcipher_walk_done(walk, nbytes - k);
 		if (k < n) {
-			if (__xts_paes_convert_key(ctx))
-				return skcipher_walk_done(&walk, -EIO);
+			if (!maysleep) {
+				rc = -EKEYEXPIRED;
+				goto out;
+			}
+			rc = pxts_convert_key(ctx);
+			if (rc)
+				goto out;
 			spin_lock_bh(&ctx->pk_lock);
-			memcpy(xts_param.key + offset,
-			       ctx->pk[0].protkey, keylen);
+			memcpy(param->key + offset, ctx->pk[0].protkey, keylen);
 			spin_unlock_bh(&ctx->pk_lock);
 		}
 	}
 
+out:
+	pr_debug("rc=%d\n", rc);
 	return rc;
 }
 
-static inline int xts_paes_crypt(struct skcipher_request *req, unsigned long modifier)
+static int xts_paes_do_crypt(struct s390_pxts_ctx *ctx,
+			     struct s390_pxts_req_ctx *req_ctx,
+			     bool maysleep)
 {
-	struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
-	struct s390_pxts_ctx *ctx = crypto_skcipher_ctx(tfm);
+	int pk_state, rc = 0;
+
+	/* fetch and check protected key state */
+	spin_lock_bh(&ctx->pk_lock);
+	pk_state = ctx->pk_state;
+	switch (pk_state) {
+	case PK_STATE_NO_KEY:
+		rc = -ENOKEY;
+		break;
+	case PK_STATE_CONVERT_IN_PROGRESS:
+		rc = -EKEYEXPIRED;
+		break;
+	case PK_STATE_VALID:
+		break;
+	default:
+		rc = pk_state < 0 ? pk_state : -EIO;
+		break;
+	}
+	spin_unlock_bh(&ctx->pk_lock);
+	if (rc)
+		goto out;
 
+	/* Call the 'real' crypt function based on the xts prot key type. */
 	switch (ctx->fc) {
 	case CPACF_KM_PXTS_128:
 	case CPACF_KM_PXTS_256:
-		return paes_xts_crypt(req, modifier);
+		rc = xts_paes_do_crypt_2keys(ctx, req_ctx, maysleep);
+		break;
 	case CPACF_KM_PXTS_128_FULL:
 	case CPACF_KM_PXTS_256_FULL:
-		return paes_xts_crypt_full(req, modifier);
+		rc = xts_paes_do_crypt_fullkey(ctx, req_ctx, maysleep);
+		break;
 	default:
-		return -EINVAL;
+		rc = -EINVAL;
 	}
-}
 
-static int xts_paes_encrypt(struct skcipher_request *req)
-{
-	return xts_paes_crypt(req, 0);
+out:
+	pr_debug("rc=%d\n", rc);
+	return rc;
 }
 
-static int xts_paes_decrypt(struct skcipher_request *req)
+static inline int xts_paes_crypt(struct skcipher_request *req, unsigned long modifier)
 {
-	return xts_paes_crypt(req, CPACF_DECRYPT);
-}
+	struct s390_pxts_req_ctx *req_ctx = skcipher_request_ctx(req);
+	struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+	struct s390_pxts_ctx *ctx = crypto_skcipher_ctx(tfm);
+	struct skcipher_walk *walk = &req_ctx->walk;
+	int rc;
 
-static struct skcipher_alg xts_paes_alg = {
-	.base.cra_name		=	"xts(paes)",
-	.base.cra_driver_name	=	"xts-paes-s390",
-	.base.cra_priority	=	402,	/* ecb-paes-s390 + 1 */
-	.base.cra_blocksize	=	AES_BLOCK_SIZE,
-	.base.cra_ctxsize	=	sizeof(struct s390_pxts_ctx),
-	.base.cra_module	=	THIS_MODULE,
-	.base.cra_list		=	LIST_HEAD_INIT(xts_paes_alg.base.cra_list),
-	.init			=	xts_paes_init,
-	.exit			=	xts_paes_exit,
-	.min_keysize		=	2 * PAES_MIN_KEYSIZE,
-	.max_keysize		=	2 * PAES_MAX_KEYSIZE,
-	.ivsize			=	AES_BLOCK_SIZE,
-	.setkey			=	xts_paes_set_key,
-	.encrypt		=	xts_paes_encrypt,
-	.decrypt		=	xts_paes_decrypt,
-};
+	/*
+	 * Attempt synchronous encryption first. If it fails, schedule the request
+	 * asynchronously via the crypto engine. To preserve execution order,
+	 * once a request is queued to the engine, further requests using the same
+	 * tfm will also be routed through the engine.
+	 */
 
-static int ctr_paes_init(struct crypto_skcipher *tfm)
-{
-	struct s390_paes_ctx *ctx = crypto_skcipher_ctx(tfm);
+	rc = skcipher_walk_virt(walk, req, false);
+	if (rc)
+		goto out;
 
-	ctx->kb.key = NULL;
-	spin_lock_init(&ctx->pk_lock);
+	req_ctx->modifier = modifier;
+	req_ctx->param_init_done = false;
 
-	return 0;
-}
+	/* Try synchronous operation if no active engine usage */
+	if (!atomic_read(&ctx->via_engine_ctr)) {
+		rc = xts_paes_do_crypt(ctx, req_ctx, false);
+		if (rc == 0)
+			goto out;
+	}
 
-static void ctr_paes_exit(struct crypto_skcipher *tfm)
-{
-	struct s390_paes_ctx *ctx = crypto_skcipher_ctx(tfm);
+	/*
+	 * If sync operation failed or key expired or there are already
+	 * requests enqueued via engine, fallback to async. Mark tfm as
+	 * using engine to serialize requests.
+	 */
+	if (rc == 0 || rc == -EKEYEXPIRED) {
+		atomic_inc(&ctx->via_engine_ctr);
+		rc = crypto_transfer_skcipher_request_to_engine(paes_crypto_engine, req);
+		if (rc != -EINPROGRESS)
+			atomic_dec(&ctx->via_engine_ctr);
+	}
+
+	if (rc != -EINPROGRESS)
+		skcipher_walk_done(walk, rc);
 
-	_free_kb_keybuf(&ctx->kb);
+out:
+	if (rc != -EINPROGRESS)
+		memzero_explicit(&req_ctx->param, sizeof(req_ctx->param));
+	pr_debug("rc=%d\n", rc);
+	return rc;
 }
 
-static inline int __ctr_paes_set_key(struct s390_paes_ctx *ctx)
+static int xts_paes_encrypt(struct skcipher_request *req)
 {
-	unsigned long fc;
-	int rc;
-
-	rc = __paes_convert_key(ctx);
-	if (rc)
-		return rc;
-
-	/* Pick the correct function code based on the protected key type */
-	fc = (ctx->pk.type == PKEY_KEYTYPE_AES_128) ? CPACF_KMCTR_PAES_128 :
-		(ctx->pk.type == PKEY_KEYTYPE_AES_192) ? CPACF_KMCTR_PAES_192 :
-		(ctx->pk.type == PKEY_KEYTYPE_AES_256) ?
-		CPACF_KMCTR_PAES_256 : 0;
-
-	/* Check if the function code is available */
-	ctx->fc = (fc && cpacf_test_func(&kmctr_functions, fc)) ? fc : 0;
+	return xts_paes_crypt(req, 0);
+}
 
-	return ctx->fc ? 0 : -EINVAL;
+static int xts_paes_decrypt(struct skcipher_request *req)
+{
+	return xts_paes_crypt(req, CPACF_DECRYPT);
 }
 
-static int ctr_paes_set_key(struct crypto_skcipher *tfm, const u8 *in_key,
-			    unsigned int key_len)
+static int xts_paes_init(struct crypto_skcipher *tfm)
 {
-	struct s390_paes_ctx *ctx = crypto_skcipher_ctx(tfm);
-	int rc;
+	struct s390_pxts_ctx *ctx = crypto_skcipher_ctx(tfm);
 
-	_free_kb_keybuf(&ctx->kb);
-	rc = _key_to_kb(&ctx->kb, in_key, key_len);
-	if (rc)
-		return rc;
+	memset(ctx, 0, sizeof(*ctx));
+	spin_lock_init(&ctx->pk_lock);
 
-	return __ctr_paes_set_key(ctx);
+	crypto_skcipher_set_reqsize(tfm, sizeof(struct s390_pxts_req_ctx));
+
+	return 0;
 }
 
-static unsigned int __ctrblk_init(u8 *ctrptr, u8 *iv, unsigned int nbytes)
+static void xts_paes_exit(struct crypto_skcipher *tfm)
 {
-	unsigned int i, n;
+	struct s390_pxts_ctx *ctx = crypto_skcipher_ctx(tfm);
 
-	/* only use complete blocks, max. PAGE_SIZE */
-	memcpy(ctrptr, iv, AES_BLOCK_SIZE);
-	n = (nbytes > PAGE_SIZE) ? PAGE_SIZE : nbytes & ~(AES_BLOCK_SIZE - 1);
-	for (i = (n / AES_BLOCK_SIZE) - 1; i > 0; i--) {
-		memcpy(ctrptr + AES_BLOCK_SIZE, ctrptr, AES_BLOCK_SIZE);
-		crypto_inc(ctrptr + AES_BLOCK_SIZE, AES_BLOCK_SIZE);
-		ctrptr += AES_BLOCK_SIZE;
-	}
-	return n;
+	memzero_explicit(ctx, sizeof(*ctx));
 }
 
-static int ctr_paes_crypt(struct skcipher_request *req)
+static int xts_paes_do_one_request(struct crypto_engine *engine, void *areq)
 {
+	struct skcipher_request *req = skcipher_request_cast(areq);
+	struct s390_pxts_req_ctx *req_ctx = skcipher_request_ctx(req);
 	struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
-	struct s390_paes_ctx *ctx = crypto_skcipher_ctx(tfm);
-	u8 buf[AES_BLOCK_SIZE], *ctrptr;
-	struct {
-		u8 key[PAES_256_PROTKEY_SIZE];
-	} param;
-	struct skcipher_walk walk;
-	unsigned int nbytes, n, k;
-	int rc, locked;
-
-	rc = skcipher_walk_virt(&walk, req, false);
-	if (rc)
-		return rc;
-
-	spin_lock_bh(&ctx->pk_lock);
-	memcpy(param.key, ctx->pk.protkey, PAES_256_PROTKEY_SIZE);
-	spin_unlock_bh(&ctx->pk_lock);
-
-	locked = mutex_trylock(&ctrblk_lock);
+	struct s390_pxts_ctx *ctx = crypto_skcipher_ctx(tfm);
+	struct skcipher_walk *walk = &req_ctx->walk;
+	int rc;
 
-	while ((nbytes = walk.nbytes) >= AES_BLOCK_SIZE) {
-		n = AES_BLOCK_SIZE;
-		if (nbytes >= 2*AES_BLOCK_SIZE && locked)
-			n = __ctrblk_init(ctrblk, walk.iv, nbytes);
-		ctrptr = (n > AES_BLOCK_SIZE) ? ctrblk : walk.iv;
-		k = cpacf_kmctr(ctx->fc, &param, walk.dst.virt.addr,
-				walk.src.virt.addr, n, ctrptr);
-		if (k) {
-			if (ctrptr == ctrblk)
-				memcpy(walk.iv, ctrptr + k - AES_BLOCK_SIZE,
-				       AES_BLOCK_SIZE);
-			crypto_inc(walk.iv, AES_BLOCK_SIZE);
-			rc = skcipher_walk_done(&walk, nbytes - k);
-		}
-		if (k < n) {
-			if (__paes_convert_key(ctx)) {
-				if (locked)
-					mutex_unlock(&ctrblk_lock);
-				return skcipher_walk_done(&walk, -EIO);
-			}
-			spin_lock_bh(&ctx->pk_lock);
-			memcpy(param.key, ctx->pk.protkey, PAES_256_PROTKEY_SIZE);
-			spin_unlock_bh(&ctx->pk_lock);
-		}
-	}
-	if (locked)
-		mutex_unlock(&ctrblk_lock);
-	/*
-	 * final block may be < AES_BLOCK_SIZE, copy only nbytes
-	 */
-	if (nbytes) {
-		memset(buf, 0, AES_BLOCK_SIZE);
-		memcpy(buf, walk.src.virt.addr, nbytes);
-		while (1) {
-			if (cpacf_kmctr(ctx->fc, &param, buf,
-					buf, AES_BLOCK_SIZE,
-					walk.iv) == AES_BLOCK_SIZE)
-				break;
-			if (__paes_convert_key(ctx))
-				return skcipher_walk_done(&walk, -EIO);
-			spin_lock_bh(&ctx->pk_lock);
-			memcpy(param.key, ctx->pk.protkey, PAES_256_PROTKEY_SIZE);
-			spin_unlock_bh(&ctx->pk_lock);
-		}
-		memcpy(walk.dst.virt.addr, buf, nbytes);
-		crypto_inc(walk.iv, AES_BLOCK_SIZE);
-		rc = skcipher_walk_done(&walk, nbytes);
+	/* walk has already been prepared */
+
+	rc = xts_paes_do_crypt(ctx, req_ctx, true);
+	if (rc == -EKEYEXPIRED) {
+		/*
+		 * Protected key expired, conversion is in process.
+		 * Trigger a re-schedule of this request by returning
+		 * -ENOSPC ("hardware queue is full") to the crypto engine.
+		 * To avoid immediate re-invocation of this callback,
+		 * tell the scheduler to voluntarily give up the CPU here.
+		 */
+		cond_resched();
+		pr_debug("rescheduling request\n");
+		return -ENOSPC;
+	} else if (rc) {
+		skcipher_walk_done(walk, rc);
 	}
 
+	memzero_explicit(&req_ctx->param, sizeof(req_ctx->param));
+	pr_debug("request complete with rc=%d\n", rc);
+	local_bh_disable();
+	atomic_dec(&ctx->via_engine_ctr);
+	crypto_finalize_skcipher_request(engine, req, rc);
+	local_bh_enable();
 	return rc;
 }
 
-static struct skcipher_alg ctr_paes_alg = {
-	.base.cra_name		=	"ctr(paes)",
-	.base.cra_driver_name	=	"ctr-paes-s390",
-	.base.cra_priority	=	402,	/* ecb-paes-s390 + 1 */
-	.base.cra_blocksize	=	1,
-	.base.cra_ctxsize	=	sizeof(struct s390_paes_ctx),
-	.base.cra_module	=	THIS_MODULE,
-	.base.cra_list		=	LIST_HEAD_INIT(ctr_paes_alg.base.cra_list),
-	.init			=	ctr_paes_init,
-	.exit			=	ctr_paes_exit,
-	.min_keysize		=	PAES_MIN_KEYSIZE,
-	.max_keysize		=	PAES_MAX_KEYSIZE,
-	.ivsize			=	AES_BLOCK_SIZE,
-	.setkey			=	ctr_paes_set_key,
-	.encrypt		=	ctr_paes_crypt,
-	.decrypt		=	ctr_paes_crypt,
-	.chunksize		=	AES_BLOCK_SIZE,
+static struct skcipher_engine_alg xts_paes_alg = {
+	.base = {
+		.base.cra_name	      =	"xts(paes)",
+		.base.cra_driver_name =	"xts-paes-s390",
+		.base.cra_priority    =	402,	/* ecb-paes-s390 + 1 */
+		.base.cra_blocksize   =	AES_BLOCK_SIZE,
+		.base.cra_ctxsize     =	sizeof(struct s390_pxts_ctx),
+		.base.cra_module      =	THIS_MODULE,
+		.base.cra_list	      =	LIST_HEAD_INIT(xts_paes_alg.base.base.cra_list),
+		.init		      =	xts_paes_init,
+		.exit		      =	xts_paes_exit,
+		.min_keysize	      =	2 * PAES_MIN_KEYSIZE,
+		.max_keysize	      =	2 * PAES_MAX_KEYSIZE,
+		.ivsize		      =	AES_BLOCK_SIZE,
+		.setkey		      =	xts_paes_setkey,
+		.encrypt	      =	xts_paes_encrypt,
+		.decrypt	      =	xts_paes_decrypt,
+	},
+	.op = {
+		.do_one_request	      = xts_paes_do_one_request,
+	},
 };
 
-static inline void __crypto_unregister_skcipher(struct skcipher_alg *alg)
+/*
+ * alg register, unregister, module init, exit
+ */
+
+static struct miscdevice paes_dev = {
+	.name	= "paes",
+	.minor	= MISC_DYNAMIC_MINOR,
+};
+
+static inline void __crypto_unregister_skcipher(struct skcipher_engine_alg *alg)
 {
-	if (!list_empty(&alg->base.cra_list))
-		crypto_unregister_skcipher(alg);
+	if (!list_empty(&alg->base.base.cra_list))
+		crypto_engine_unregister_skcipher(alg);
 }
 
 static void paes_s390_fini(void)
 {
+	if (paes_crypto_engine) {
+		crypto_engine_stop(paes_crypto_engine);
+		crypto_engine_exit(paes_crypto_engine);
+	}
 	__crypto_unregister_skcipher(&ctr_paes_alg);
 	__crypto_unregister_skcipher(&xts_paes_alg);
 	__crypto_unregister_skcipher(&cbc_paes_alg);
 	__crypto_unregister_skcipher(&ecb_paes_alg);
 	if (ctrblk)
-		free_page((unsigned long) ctrblk);
+		free_page((unsigned long)ctrblk);
+	misc_deregister(&paes_dev);
 }
 
 static int __init paes_s390_init(void)
 {
 	int rc;
 
+	/* register a simple paes pseudo misc device */
+	rc = misc_register(&paes_dev);
+	if (rc)
+		return rc;
+
+	/* with this pseudo device alloc and start a crypto engine */
+	paes_crypto_engine =
+		crypto_engine_alloc_init_and_set(paes_dev.this_device,
+						 true, NULL, false, MAX_QLEN);
+	if (!paes_crypto_engine) {
+		rc = -ENOMEM;
+		goto out_err;
+	}
+	rc = crypto_engine_start(paes_crypto_engine);
+	if (rc) {
+		crypto_engine_exit(paes_crypto_engine);
+		paes_crypto_engine = NULL;
+		goto out_err;
+	}
+
 	/* Query available functions for KM, KMC and KMCTR */
 	cpacf_query(CPACF_KM, &km_functions);
 	cpacf_query(CPACF_KMC, &kmc_functions);
@@ -927,40 +1653,45 @@ static int __init paes_s390_init(void)
 	if (cpacf_test_func(&km_functions, CPACF_KM_PAES_128) ||
 	    cpacf_test_func(&km_functions, CPACF_KM_PAES_192) ||
 	    cpacf_test_func(&km_functions, CPACF_KM_PAES_256)) {
-		rc = crypto_register_skcipher(&ecb_paes_alg);
+		rc = crypto_engine_register_skcipher(&ecb_paes_alg);
 		if (rc)
 			goto out_err;
+		pr_debug("%s registered\n", ecb_paes_alg.base.base.cra_driver_name);
 	}
 
 	if (cpacf_test_func(&kmc_functions, CPACF_KMC_PAES_128) ||
 	    cpacf_test_func(&kmc_functions, CPACF_KMC_PAES_192) ||
 	    cpacf_test_func(&kmc_functions, CPACF_KMC_PAES_256)) {
-		rc = crypto_register_skcipher(&cbc_paes_alg);
+		rc = crypto_engine_register_skcipher(&cbc_paes_alg);
 		if (rc)
 			goto out_err;
+		pr_debug("%s registered\n", cbc_paes_alg.base.base.cra_driver_name);
 	}
 
 	if (cpacf_test_func(&km_functions, CPACF_KM_PXTS_128) ||
 	    cpacf_test_func(&km_functions, CPACF_KM_PXTS_256)) {
-		rc = crypto_register_skcipher(&xts_paes_alg);
+		rc = crypto_engine_register_skcipher(&xts_paes_alg);
 		if (rc)
 			goto out_err;
+		pr_debug("%s registered\n", xts_paes_alg.base.base.cra_driver_name);
 	}
 
 	if (cpacf_test_func(&kmctr_functions, CPACF_KMCTR_PAES_128) ||
 	    cpacf_test_func(&kmctr_functions, CPACF_KMCTR_PAES_192) ||
 	    cpacf_test_func(&kmctr_functions, CPACF_KMCTR_PAES_256)) {
-		ctrblk = (u8 *) __get_free_page(GFP_KERNEL);
+		ctrblk = (u8 *)__get_free_page(GFP_KERNEL);
 		if (!ctrblk) {
 			rc = -ENOMEM;
 			goto out_err;
 		}
-		rc = crypto_register_skcipher(&ctr_paes_alg);
+		rc = crypto_engine_register_skcipher(&ctr_paes_alg);
 		if (rc)
 			goto out_err;
+		pr_debug("%s registered\n", ctr_paes_alg.base.base.cra_driver_name);
 	}
 
 	return 0;
+
 out_err:
 	paes_s390_fini();
 	return rc;
diff --git a/arch/s390/crypto/sha.h b/arch/s390/crypto/sha.h
index 2bb22db54c31..d757ccbce2b4 100644
--- a/arch/s390/crypto/sha.h
+++ b/arch/s390/crypto/sha.h
@@ -10,27 +10,33 @@
 #ifndef _CRYPTO_ARCH_S390_SHA_H
 #define _CRYPTO_ARCH_S390_SHA_H
 
-#include <linux/crypto.h>
-#include <crypto/sha1.h>
 #include <crypto/sha2.h>
 #include <crypto/sha3.h>
+#include <linux/types.h>
 
 /* must be big enough for the largest SHA variant */
-#define SHA3_STATE_SIZE			200
 #define CPACF_MAX_PARMBLOCK_SIZE	SHA3_STATE_SIZE
 #define SHA_MAX_BLOCK_SIZE		SHA3_224_BLOCK_SIZE
+#define S390_SHA_CTX_SIZE		sizeof(struct s390_sha_ctx)
 
 struct s390_sha_ctx {
 	u64 count;		/* message length in bytes */
-	u32 state[CPACF_MAX_PARMBLOCK_SIZE / sizeof(u32)];
-	u8 buf[SHA_MAX_BLOCK_SIZE];
+	union {
+		u32 state[CPACF_MAX_PARMBLOCK_SIZE / sizeof(u32)];
+		struct {
+			u64 state[SHA512_DIGEST_SIZE / sizeof(u64)];
+			u64 count_hi;
+		} sha512;
+	};
 	int func;		/* KIMD function to use */
-	int first_message_part;
+	bool first_message_part;
 };
 
 struct shash_desc;
 
-int s390_sha_update(struct shash_desc *desc, const u8 *data, unsigned int len);
-int s390_sha_final(struct shash_desc *desc, u8 *out);
+int s390_sha_update_blocks(struct shash_desc *desc, const u8 *data,
+			   unsigned int len);
+int s390_sha_finup(struct shash_desc *desc, const u8 *src, unsigned int len,
+		   u8 *out);
 
 #endif
diff --git a/arch/s390/crypto/sha1_s390.c b/arch/s390/crypto/sha1_s390.c
index bc3a22704e09..d229cbd2ba22 100644
--- a/arch/s390/crypto/sha1_s390.c
+++ b/arch/s390/crypto/sha1_s390.c
@@ -18,12 +18,12 @@
  *   Copyright (c) Andrew McDonald <andrew@mcdonald.org.uk>
  *   Copyright (c) Jean-Francois Dive <jef@linuxbe.org>
  */
+#include <asm/cpacf.h>
 #include <crypto/internal/hash.h>
-#include <linux/init.h>
-#include <linux/module.h>
-#include <linux/cpufeature.h>
 #include <crypto/sha1.h>
-#include <asm/cpacf.h>
+#include <linux/cpufeature.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
 
 #include "sha.h"
 
@@ -49,7 +49,6 @@ static int s390_sha1_export(struct shash_desc *desc, void *out)
 
 	octx->count = sctx->count;
 	memcpy(octx->state, sctx->state, sizeof(octx->state));
-	memcpy(octx->buffer, sctx->buf, sizeof(octx->buffer));
 	return 0;
 }
 
@@ -60,7 +59,6 @@ static int s390_sha1_import(struct shash_desc *desc, const void *in)
 
 	sctx->count = ictx->count;
 	memcpy(sctx->state, ictx->state, sizeof(ictx->state));
-	memcpy(sctx->buf, ictx->buffer, sizeof(ictx->buffer));
 	sctx->func = CPACF_KIMD_SHA_1;
 	return 0;
 }
@@ -68,16 +66,18 @@ static int s390_sha1_import(struct shash_desc *desc, const void *in)
 static struct shash_alg alg = {
 	.digestsize	=	SHA1_DIGEST_SIZE,
 	.init		=	s390_sha1_init,
-	.update		=	s390_sha_update,
-	.final		=	s390_sha_final,
+	.update		=	s390_sha_update_blocks,
+	.finup		=	s390_sha_finup,
 	.export		=	s390_sha1_export,
 	.import		=	s390_sha1_import,
-	.descsize	=	sizeof(struct s390_sha_ctx),
-	.statesize	=	sizeof(struct sha1_state),
+	.descsize	=	S390_SHA_CTX_SIZE,
+	.statesize	=	SHA1_STATE_SIZE,
 	.base		=	{
 		.cra_name	=	"sha1",
 		.cra_driver_name=	"sha1-s390",
 		.cra_priority	=	300,
+		.cra_flags	=	CRYPTO_AHASH_ALG_BLOCK_ONLY |
+					CRYPTO_AHASH_ALG_FINUP_MAX,
 		.cra_blocksize	=	SHA1_BLOCK_SIZE,
 		.cra_module	=	THIS_MODULE,
 	}
diff --git a/arch/s390/crypto/sha256_s390.c b/arch/s390/crypto/sha256_s390.c
deleted file mode 100644
index 6f1ccdf93d3e..000000000000
--- a/arch/s390/crypto/sha256_s390.c
+++ /dev/null
@@ -1,143 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0+
-/*
- * Cryptographic API.
- *
- * s390 implementation of the SHA256 and SHA224 Secure Hash Algorithm.
- *
- * s390 Version:
- *   Copyright IBM Corp. 2005, 2011
- *   Author(s): Jan Glauber (jang@de.ibm.com)
- */
-#include <crypto/internal/hash.h>
-#include <linux/init.h>
-#include <linux/module.h>
-#include <linux/cpufeature.h>
-#include <crypto/sha2.h>
-#include <asm/cpacf.h>
-
-#include "sha.h"
-
-static int s390_sha256_init(struct shash_desc *desc)
-{
-	struct s390_sha_ctx *sctx = shash_desc_ctx(desc);
-
-	sctx->state[0] = SHA256_H0;
-	sctx->state[1] = SHA256_H1;
-	sctx->state[2] = SHA256_H2;
-	sctx->state[3] = SHA256_H3;
-	sctx->state[4] = SHA256_H4;
-	sctx->state[5] = SHA256_H5;
-	sctx->state[6] = SHA256_H6;
-	sctx->state[7] = SHA256_H7;
-	sctx->count = 0;
-	sctx->func = CPACF_KIMD_SHA_256;
-
-	return 0;
-}
-
-static int sha256_export(struct shash_desc *desc, void *out)
-{
-	struct s390_sha_ctx *sctx = shash_desc_ctx(desc);
-	struct sha256_state *octx = out;
-
-	octx->count = sctx->count;
-	memcpy(octx->state, sctx->state, sizeof(octx->state));
-	memcpy(octx->buf, sctx->buf, sizeof(octx->buf));
-	return 0;
-}
-
-static int sha256_import(struct shash_desc *desc, const void *in)
-{
-	struct s390_sha_ctx *sctx = shash_desc_ctx(desc);
-	const struct sha256_state *ictx = in;
-
-	sctx->count = ictx->count;
-	memcpy(sctx->state, ictx->state, sizeof(ictx->state));
-	memcpy(sctx->buf, ictx->buf, sizeof(ictx->buf));
-	sctx->func = CPACF_KIMD_SHA_256;
-	return 0;
-}
-
-static struct shash_alg sha256_alg = {
-	.digestsize	=	SHA256_DIGEST_SIZE,
-	.init		=	s390_sha256_init,
-	.update		=	s390_sha_update,
-	.final		=	s390_sha_final,
-	.export		=	sha256_export,
-	.import		=	sha256_import,
-	.descsize	=	sizeof(struct s390_sha_ctx),
-	.statesize	=	sizeof(struct sha256_state),
-	.base		=	{
-		.cra_name	=	"sha256",
-		.cra_driver_name=	"sha256-s390",
-		.cra_priority	=	300,
-		.cra_blocksize	=	SHA256_BLOCK_SIZE,
-		.cra_module	=	THIS_MODULE,
-	}
-};
-
-static int s390_sha224_init(struct shash_desc *desc)
-{
-	struct s390_sha_ctx *sctx = shash_desc_ctx(desc);
-
-	sctx->state[0] = SHA224_H0;
-	sctx->state[1] = SHA224_H1;
-	sctx->state[2] = SHA224_H2;
-	sctx->state[3] = SHA224_H3;
-	sctx->state[4] = SHA224_H4;
-	sctx->state[5] = SHA224_H5;
-	sctx->state[6] = SHA224_H6;
-	sctx->state[7] = SHA224_H7;
-	sctx->count = 0;
-	sctx->func = CPACF_KIMD_SHA_256;
-
-	return 0;
-}
-
-static struct shash_alg sha224_alg = {
-	.digestsize	=	SHA224_DIGEST_SIZE,
-	.init		=	s390_sha224_init,
-	.update		=	s390_sha_update,
-	.final		=	s390_sha_final,
-	.export		=	sha256_export,
-	.import		=	sha256_import,
-	.descsize	=	sizeof(struct s390_sha_ctx),
-	.statesize	=	sizeof(struct sha256_state),
-	.base		=	{
-		.cra_name	=	"sha224",
-		.cra_driver_name=	"sha224-s390",
-		.cra_priority	=	300,
-		.cra_blocksize	=	SHA224_BLOCK_SIZE,
-		.cra_module	=	THIS_MODULE,
-	}
-};
-
-static int __init sha256_s390_init(void)
-{
-	int ret;
-
-	if (!cpacf_query_func(CPACF_KIMD, CPACF_KIMD_SHA_256))
-		return -ENODEV;
-	ret = crypto_register_shash(&sha256_alg);
-	if (ret < 0)
-		goto out;
-	ret = crypto_register_shash(&sha224_alg);
-	if (ret < 0)
-		crypto_unregister_shash(&sha256_alg);
-out:
-	return ret;
-}
-
-static void __exit sha256_s390_fini(void)
-{
-	crypto_unregister_shash(&sha224_alg);
-	crypto_unregister_shash(&sha256_alg);
-}
-
-module_cpu_feature_match(S390_CPU_FEATURE_MSA, sha256_s390_init);
-module_exit(sha256_s390_fini);
-
-MODULE_ALIAS_CRYPTO("sha256");
-MODULE_ALIAS_CRYPTO("sha224");
-MODULE_LICENSE("GPL");
-MODULE_DESCRIPTION("SHA256 and SHA224 Secure Hash Algorithm");
diff --git a/arch/s390/crypto/sha3_256_s390.c b/arch/s390/crypto/sha3_256_s390.c
index a84ef692f572..4a7731ac6bcd 100644
--- a/arch/s390/crypto/sha3_256_s390.c
+++ b/arch/s390/crypto/sha3_256_s390.c
@@ -8,12 +8,14 @@
  *   Copyright IBM Corp. 2019
  *   Author(s): Joerg Schmidbauer (jschmidb@de.ibm.com)
  */
+#include <asm/cpacf.h>
 #include <crypto/internal/hash.h>
-#include <linux/init.h>
-#include <linux/module.h>
-#include <linux/cpufeature.h>
 #include <crypto/sha3.h>
-#include <asm/cpacf.h>
+#include <linux/cpufeature.h>
+#include <linux/errno.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/string.h>
 
 #include "sha.h"
 
@@ -21,11 +23,11 @@ static int sha3_256_init(struct shash_desc *desc)
 {
 	struct s390_sha_ctx *sctx = shash_desc_ctx(desc);
 
-	if (!test_facility(86)) /* msa 12 */
+	sctx->first_message_part = test_facility(86);
+	if (!sctx->first_message_part)
 		memset(sctx->state, 0, sizeof(sctx->state));
 	sctx->count = 0;
 	sctx->func = CPACF_KIMD_SHA3_256;
-	sctx->first_message_part = 1;
 
 	return 0;
 }
@@ -35,11 +37,11 @@ static int sha3_256_export(struct shash_desc *desc, void *out)
 	struct s390_sha_ctx *sctx = shash_desc_ctx(desc);
 	struct sha3_state *octx = out;
 
-	octx->rsiz = sctx->count;
+	if (sctx->first_message_part) {
+		memset(sctx->state, 0, sizeof(sctx->state));
+		sctx->first_message_part = 0;
+	}
 	memcpy(octx->st, sctx->state, sizeof(octx->st));
-	memcpy(octx->buf, sctx->buf, sizeof(octx->buf));
-	octx->partial = sctx->first_message_part;
-
 	return 0;
 }
 
@@ -48,10 +50,9 @@ static int sha3_256_import(struct shash_desc *desc, const void *in)
 	struct s390_sha_ctx *sctx = shash_desc_ctx(desc);
 	const struct sha3_state *ictx = in;
 
-	sctx->count = ictx->rsiz;
+	sctx->count = 0;
 	memcpy(sctx->state, ictx->st, sizeof(ictx->st));
-	memcpy(sctx->buf, ictx->buf, sizeof(ictx->buf));
-	sctx->first_message_part = ictx->partial;
+	sctx->first_message_part = 0;
 	sctx->func = CPACF_KIMD_SHA3_256;
 
 	return 0;
@@ -60,30 +61,26 @@ static int sha3_256_import(struct shash_desc *desc, const void *in)
 static int sha3_224_import(struct shash_desc *desc, const void *in)
 {
 	struct s390_sha_ctx *sctx = shash_desc_ctx(desc);
-	const struct sha3_state *ictx = in;
 
-	sctx->count = ictx->rsiz;
-	memcpy(sctx->state, ictx->st, sizeof(ictx->st));
-	memcpy(sctx->buf, ictx->buf, sizeof(ictx->buf));
-	sctx->first_message_part = ictx->partial;
+	sha3_256_import(desc, in);
 	sctx->func = CPACF_KIMD_SHA3_224;
-
 	return 0;
 }
 
 static struct shash_alg sha3_256_alg = {
 	.digestsize	=	SHA3_256_DIGEST_SIZE,	   /* = 32 */
 	.init		=	sha3_256_init,
-	.update		=	s390_sha_update,
-	.final		=	s390_sha_final,
+	.update		=	s390_sha_update_blocks,
+	.finup		=	s390_sha_finup,
 	.export		=	sha3_256_export,
 	.import		=	sha3_256_import,
-	.descsize	=	sizeof(struct s390_sha_ctx),
-	.statesize	=	sizeof(struct sha3_state),
+	.descsize	=	S390_SHA_CTX_SIZE,
+	.statesize	=	SHA3_STATE_SIZE,
 	.base		=	{
 		.cra_name	 =	"sha3-256",
 		.cra_driver_name =	"sha3-256-s390",
 		.cra_priority	 =	300,
+		.cra_flags	 =	CRYPTO_AHASH_ALG_BLOCK_ONLY,
 		.cra_blocksize	 =	SHA3_256_BLOCK_SIZE,
 		.cra_module	 =	THIS_MODULE,
 	}
@@ -93,28 +90,25 @@ static int sha3_224_init(struct shash_desc *desc)
 {
 	struct s390_sha_ctx *sctx = shash_desc_ctx(desc);
 
-	if (!test_facility(86)) /* msa 12 */
-		memset(sctx->state, 0, sizeof(sctx->state));
-	sctx->count = 0;
+	sha3_256_init(desc);
 	sctx->func = CPACF_KIMD_SHA3_224;
-	sctx->first_message_part = 1;
-
 	return 0;
 }
 
 static struct shash_alg sha3_224_alg = {
 	.digestsize	=	SHA3_224_DIGEST_SIZE,
 	.init		=	sha3_224_init,
-	.update		=	s390_sha_update,
-	.final		=	s390_sha_final,
+	.update		=	s390_sha_update_blocks,
+	.finup		=	s390_sha_finup,
 	.export		=	sha3_256_export, /* same as for 256 */
 	.import		=	sha3_224_import, /* function code different! */
-	.descsize	=	sizeof(struct s390_sha_ctx),
-	.statesize	=	sizeof(struct sha3_state),
+	.descsize	=	S390_SHA_CTX_SIZE,
+	.statesize	=	SHA3_STATE_SIZE,
 	.base		=	{
 		.cra_name	 =	"sha3-224",
 		.cra_driver_name =	"sha3-224-s390",
 		.cra_priority	 =	300,
+		.cra_flags	 =	CRYPTO_AHASH_ALG_BLOCK_ONLY,
 		.cra_blocksize	 =	SHA3_224_BLOCK_SIZE,
 		.cra_module	 =	THIS_MODULE,
 	}
diff --git a/arch/s390/crypto/sha3_512_s390.c b/arch/s390/crypto/sha3_512_s390.c
index 07528fc98ff7..018f02fff444 100644
--- a/arch/s390/crypto/sha3_512_s390.c
+++ b/arch/s390/crypto/sha3_512_s390.c
@@ -7,12 +7,14 @@
  * Copyright IBM Corp. 2019
  * Author(s): Joerg Schmidbauer (jschmidb@de.ibm.com)
  */
+#include <asm/cpacf.h>
 #include <crypto/internal/hash.h>
-#include <linux/init.h>
-#include <linux/module.h>
-#include <linux/cpufeature.h>
 #include <crypto/sha3.h>
-#include <asm/cpacf.h>
+#include <linux/cpufeature.h>
+#include <linux/errno.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/string.h>
 
 #include "sha.h"
 
@@ -20,11 +22,11 @@ static int sha3_512_init(struct shash_desc *desc)
 {
 	struct s390_sha_ctx *sctx = shash_desc_ctx(desc);
 
-	if (!test_facility(86)) /* msa 12 */
+	sctx->first_message_part = test_facility(86);
+	if (!sctx->first_message_part)
 		memset(sctx->state, 0, sizeof(sctx->state));
 	sctx->count = 0;
 	sctx->func = CPACF_KIMD_SHA3_512;
-	sctx->first_message_part = 1;
 
 	return 0;
 }
@@ -34,13 +36,12 @@ static int sha3_512_export(struct shash_desc *desc, void *out)
 	struct s390_sha_ctx *sctx = shash_desc_ctx(desc);
 	struct sha3_state *octx = out;
 
-	octx->rsiz = sctx->count;
-	octx->rsizw = sctx->count >> 32;
 
+	if (sctx->first_message_part) {
+		memset(sctx->state, 0, sizeof(sctx->state));
+		sctx->first_message_part = 0;
+	}
 	memcpy(octx->st, sctx->state, sizeof(octx->st));
-	memcpy(octx->buf, sctx->buf, sizeof(octx->buf));
-	octx->partial = sctx->first_message_part;
-
 	return 0;
 }
 
@@ -49,13 +50,9 @@ static int sha3_512_import(struct shash_desc *desc, const void *in)
 	struct s390_sha_ctx *sctx = shash_desc_ctx(desc);
 	const struct sha3_state *ictx = in;
 
-	if (unlikely(ictx->rsizw))
-		return -ERANGE;
-	sctx->count = ictx->rsiz;
-
+	sctx->count = 0;
 	memcpy(sctx->state, ictx->st, sizeof(ictx->st));
-	memcpy(sctx->buf, ictx->buf, sizeof(ictx->buf));
-	sctx->first_message_part = ictx->partial;
+	sctx->first_message_part = 0;
 	sctx->func = CPACF_KIMD_SHA3_512;
 
 	return 0;
@@ -64,33 +61,26 @@ static int sha3_512_import(struct shash_desc *desc, const void *in)
 static int sha3_384_import(struct shash_desc *desc, const void *in)
 {
 	struct s390_sha_ctx *sctx = shash_desc_ctx(desc);
-	const struct sha3_state *ictx = in;
 
-	if (unlikely(ictx->rsizw))
-		return -ERANGE;
-	sctx->count = ictx->rsiz;
-
-	memcpy(sctx->state, ictx->st, sizeof(ictx->st));
-	memcpy(sctx->buf, ictx->buf, sizeof(ictx->buf));
-	sctx->first_message_part = ictx->partial;
+	sha3_512_import(desc, in);
 	sctx->func = CPACF_KIMD_SHA3_384;
-
 	return 0;
 }
 
 static struct shash_alg sha3_512_alg = {
 	.digestsize	=	SHA3_512_DIGEST_SIZE,
 	.init		=	sha3_512_init,
-	.update		=	s390_sha_update,
-	.final		=	s390_sha_final,
+	.update		=	s390_sha_update_blocks,
+	.finup		=	s390_sha_finup,
 	.export		=	sha3_512_export,
 	.import		=	sha3_512_import,
-	.descsize	=	sizeof(struct s390_sha_ctx),
-	.statesize	=	sizeof(struct sha3_state),
+	.descsize	=	S390_SHA_CTX_SIZE,
+	.statesize	=	SHA3_STATE_SIZE,
 	.base		=	{
 		.cra_name	 =	"sha3-512",
 		.cra_driver_name =	"sha3-512-s390",
 		.cra_priority	 =	300,
+		.cra_flags	 =	CRYPTO_AHASH_ALG_BLOCK_ONLY,
 		.cra_blocksize	 =	SHA3_512_BLOCK_SIZE,
 		.cra_module	 =	THIS_MODULE,
 	}
@@ -102,28 +92,25 @@ static int sha3_384_init(struct shash_desc *desc)
 {
 	struct s390_sha_ctx *sctx = shash_desc_ctx(desc);
 
-	if (!test_facility(86)) /* msa 12 */
-		memset(sctx->state, 0, sizeof(sctx->state));
-	sctx->count = 0;
+	sha3_512_init(desc);
 	sctx->func = CPACF_KIMD_SHA3_384;
-	sctx->first_message_part = 1;
-
 	return 0;
 }
 
 static struct shash_alg sha3_384_alg = {
 	.digestsize	=	SHA3_384_DIGEST_SIZE,
 	.init		=	sha3_384_init,
-	.update		=	s390_sha_update,
-	.final		=	s390_sha_final,
+	.update		=	s390_sha_update_blocks,
+	.finup		=	s390_sha_finup,
 	.export		=	sha3_512_export, /* same as for 512 */
 	.import		=	sha3_384_import, /* function code different! */
-	.descsize	=	sizeof(struct s390_sha_ctx),
-	.statesize	=	sizeof(struct sha3_state),
+	.descsize	=	S390_SHA_CTX_SIZE,
+	.statesize	=	SHA3_STATE_SIZE,
 	.base		=	{
 		.cra_name	 =	"sha3-384",
 		.cra_driver_name =	"sha3-384-s390",
 		.cra_priority	 =	300,
+		.cra_flags	 =	CRYPTO_AHASH_ALG_BLOCK_ONLY,
 		.cra_blocksize	 =	SHA3_384_BLOCK_SIZE,
 		.cra_ctxsize	 =	sizeof(struct s390_sha_ctx),
 		.cra_module	 =	THIS_MODULE,
diff --git a/arch/s390/crypto/sha512_s390.c b/arch/s390/crypto/sha512_s390.c
index 04f11c407763..33711a29618c 100644
--- a/arch/s390/crypto/sha512_s390.c
+++ b/arch/s390/crypto/sha512_s390.c
@@ -7,14 +7,13 @@
  * Copyright IBM Corp. 2007
  * Author(s): Jan Glauber (jang@de.ibm.com)
  */
+#include <asm/cpacf.h>
 #include <crypto/internal/hash.h>
 #include <crypto/sha2.h>
+#include <linux/cpufeature.h>
 #include <linux/errno.h>
-#include <linux/init.h>
 #include <linux/kernel.h>
 #include <linux/module.h>
-#include <linux/cpufeature.h>
-#include <asm/cpacf.h>
 
 #include "sha.h"
 
@@ -22,15 +21,16 @@ static int sha512_init(struct shash_desc *desc)
 {
 	struct s390_sha_ctx *ctx = shash_desc_ctx(desc);
 
-	*(__u64 *)&ctx->state[0] = SHA512_H0;
-	*(__u64 *)&ctx->state[2] = SHA512_H1;
-	*(__u64 *)&ctx->state[4] = SHA512_H2;
-	*(__u64 *)&ctx->state[6] = SHA512_H3;
-	*(__u64 *)&ctx->state[8] = SHA512_H4;
-	*(__u64 *)&ctx->state[10] = SHA512_H5;
-	*(__u64 *)&ctx->state[12] = SHA512_H6;
-	*(__u64 *)&ctx->state[14] = SHA512_H7;
+	ctx->sha512.state[0] = SHA512_H0;
+	ctx->sha512.state[1] = SHA512_H1;
+	ctx->sha512.state[2] = SHA512_H2;
+	ctx->sha512.state[3] = SHA512_H3;
+	ctx->sha512.state[4] = SHA512_H4;
+	ctx->sha512.state[5] = SHA512_H5;
+	ctx->sha512.state[6] = SHA512_H6;
+	ctx->sha512.state[7] = SHA512_H7;
 	ctx->count = 0;
+	ctx->sha512.count_hi = 0;
 	ctx->func = CPACF_KIMD_SHA_512;
 
 	return 0;
@@ -42,9 +42,8 @@ static int sha512_export(struct shash_desc *desc, void *out)
 	struct sha512_state *octx = out;
 
 	octx->count[0] = sctx->count;
-	octx->count[1] = 0;
+	octx->count[1] = sctx->sha512.count_hi;
 	memcpy(octx->state, sctx->state, sizeof(octx->state));
-	memcpy(octx->buf, sctx->buf, sizeof(octx->buf));
 	return 0;
 }
 
@@ -53,12 +52,10 @@ static int sha512_import(struct shash_desc *desc, const void *in)
 	struct s390_sha_ctx *sctx = shash_desc_ctx(desc);
 	const struct sha512_state *ictx = in;
 
-	if (unlikely(ictx->count[1]))
-		return -ERANGE;
 	sctx->count = ictx->count[0];
+	sctx->sha512.count_hi = ictx->count[1];
 
 	memcpy(sctx->state, ictx->state, sizeof(ictx->state));
-	memcpy(sctx->buf, ictx->buf, sizeof(ictx->buf));
 	sctx->func = CPACF_KIMD_SHA_512;
 	return 0;
 }
@@ -66,16 +63,18 @@ static int sha512_import(struct shash_desc *desc, const void *in)
 static struct shash_alg sha512_alg = {
 	.digestsize	=	SHA512_DIGEST_SIZE,
 	.init		=	sha512_init,
-	.update		=	s390_sha_update,
-	.final		=	s390_sha_final,
+	.update		=	s390_sha_update_blocks,
+	.finup		=	s390_sha_finup,
 	.export		=	sha512_export,
 	.import		=	sha512_import,
 	.descsize	=	sizeof(struct s390_sha_ctx),
-	.statesize	=	sizeof(struct sha512_state),
+	.statesize	=	SHA512_STATE_SIZE,
 	.base		=	{
 		.cra_name	=	"sha512",
 		.cra_driver_name=	"sha512-s390",
 		.cra_priority	=	300,
+		.cra_flags	=	CRYPTO_AHASH_ALG_BLOCK_ONLY |
+					CRYPTO_AHASH_ALG_FINUP_MAX,
 		.cra_blocksize	=	SHA512_BLOCK_SIZE,
 		.cra_module	=	THIS_MODULE,
 	}
@@ -87,15 +86,16 @@ static int sha384_init(struct shash_desc *desc)
 {
 	struct s390_sha_ctx *ctx = shash_desc_ctx(desc);
 
-	*(__u64 *)&ctx->state[0] = SHA384_H0;
-	*(__u64 *)&ctx->state[2] = SHA384_H1;
-	*(__u64 *)&ctx->state[4] = SHA384_H2;
-	*(__u64 *)&ctx->state[6] = SHA384_H3;
-	*(__u64 *)&ctx->state[8] = SHA384_H4;
-	*(__u64 *)&ctx->state[10] = SHA384_H5;
-	*(__u64 *)&ctx->state[12] = SHA384_H6;
-	*(__u64 *)&ctx->state[14] = SHA384_H7;
+	ctx->sha512.state[0] = SHA384_H0;
+	ctx->sha512.state[1] = SHA384_H1;
+	ctx->sha512.state[2] = SHA384_H2;
+	ctx->sha512.state[3] = SHA384_H3;
+	ctx->sha512.state[4] = SHA384_H4;
+	ctx->sha512.state[5] = SHA384_H5;
+	ctx->sha512.state[6] = SHA384_H6;
+	ctx->sha512.state[7] = SHA384_H7;
 	ctx->count = 0;
+	ctx->sha512.count_hi = 0;
 	ctx->func = CPACF_KIMD_SHA_512;
 
 	return 0;
@@ -104,17 +104,19 @@ static int sha384_init(struct shash_desc *desc)
 static struct shash_alg sha384_alg = {
 	.digestsize	=	SHA384_DIGEST_SIZE,
 	.init		=	sha384_init,
-	.update		=	s390_sha_update,
-	.final		=	s390_sha_final,
+	.update		=	s390_sha_update_blocks,
+	.finup		=	s390_sha_finup,
 	.export		=	sha512_export,
 	.import		=	sha512_import,
 	.descsize	=	sizeof(struct s390_sha_ctx),
-	.statesize	=	sizeof(struct sha512_state),
+	.statesize	=	SHA512_STATE_SIZE,
 	.base		=	{
 		.cra_name	=	"sha384",
 		.cra_driver_name=	"sha384-s390",
 		.cra_priority	=	300,
 		.cra_blocksize	=	SHA384_BLOCK_SIZE,
+		.cra_flags	=	CRYPTO_AHASH_ALG_BLOCK_ONLY |
+					CRYPTO_AHASH_ALG_FINUP_MAX,
 		.cra_ctxsize	=	sizeof(struct s390_sha_ctx),
 		.cra_module	=	THIS_MODULE,
 	}
diff --git a/arch/s390/crypto/sha_common.c b/arch/s390/crypto/sha_common.c
index 961d7d522af1..b5e2c365ea05 100644
--- a/arch/s390/crypto/sha_common.c
+++ b/arch/s390/crypto/sha_common.c
@@ -13,50 +13,33 @@
 #include <asm/cpacf.h>
 #include "sha.h"
 
-int s390_sha_update(struct shash_desc *desc, const u8 *data, unsigned int len)
+int s390_sha_update_blocks(struct shash_desc *desc, const u8 *data,
+			   unsigned int len)
 {
-	struct s390_sha_ctx *ctx = shash_desc_ctx(desc);
 	unsigned int bsize = crypto_shash_blocksize(desc->tfm);
-	unsigned int index, n;
+	struct s390_sha_ctx *ctx = shash_desc_ctx(desc);
+	unsigned int n;
 	int fc;
 
-	/* how much is already in the buffer? */
-	index = ctx->count % bsize;
-	ctx->count += len;
-
-	if ((index + len) < bsize)
-		goto store;
-
 	fc = ctx->func;
 	if (ctx->first_message_part)
-		fc |= test_facility(86) ? CPACF_KIMD_NIP : 0;
-
-	/* process one stored block */
-	if (index) {
-		memcpy(ctx->buf + index, data, bsize - index);
-		cpacf_kimd(fc, ctx->state, ctx->buf, bsize);
-		ctx->first_message_part = 0;
-		fc &= ~CPACF_KIMD_NIP;
-		data += bsize - index;
-		len -= bsize - index;
-		index = 0;
-	}
+		fc |= CPACF_KIMD_NIP;
 
 	/* process as many blocks as possible */
-	if (len >= bsize) {
-		n = (len / bsize) * bsize;
-		cpacf_kimd(fc, ctx->state, data, n);
-		ctx->first_message_part = 0;
-		data += n;
-		len -= n;
+	n = (len / bsize) * bsize;
+	ctx->count += n;
+	switch (ctx->func) {
+	case CPACF_KLMD_SHA_512:
+	case CPACF_KLMD_SHA3_384:
+		if (ctx->count < n)
+			ctx->sha512.count_hi++;
+		break;
 	}
-store:
-	if (len)
-		memcpy(ctx->buf + index , data, len);
-
-	return 0;
+	cpacf_kimd(fc, ctx->state, data, n);
+	ctx->first_message_part = 0;
+	return len - n;
 }
-EXPORT_SYMBOL_GPL(s390_sha_update);
+EXPORT_SYMBOL_GPL(s390_sha_update_blocks);
 
 static int s390_crypto_shash_parmsize(int func)
 {
@@ -77,15 +60,15 @@ static int s390_crypto_shash_parmsize(int func)
 	}
 }
 
-int s390_sha_final(struct shash_desc *desc, u8 *out)
+int s390_sha_finup(struct shash_desc *desc, const u8 *src, unsigned int len,
+		   u8 *out)
 {
 	struct s390_sha_ctx *ctx = shash_desc_ctx(desc);
-	unsigned int bsize = crypto_shash_blocksize(desc->tfm);
-	u64 bits;
-	unsigned int n;
 	int mbl_offset, fc;
+	u64 bits;
+
+	ctx->count += len;
 
-	n = ctx->count % bsize;
 	bits = ctx->count * 8;
 	mbl_offset = s390_crypto_shash_parmsize(ctx->func);
 	if (mbl_offset < 0)
@@ -95,17 +78,16 @@ int s390_sha_final(struct shash_desc *desc, u8 *out)
 
 	/* set total msg bit length (mbl) in CPACF parmblock */
 	switch (ctx->func) {
-	case CPACF_KLMD_SHA_1:
-	case CPACF_KLMD_SHA_256:
-		memcpy(ctx->state + mbl_offset, &bits, sizeof(bits));
-		break;
 	case CPACF_KLMD_SHA_512:
-		/*
-		 * the SHA512 parmblock has a 128-bit mbl field, clear
-		 * high-order u64 field, copy bits to low-order u64 field
-		 */
-		memset(ctx->state + mbl_offset, 0x00, sizeof(bits));
+		/* The SHA512 parmblock has a 128-bit mbl field. */
+		if (ctx->count < len)
+			ctx->sha512.count_hi++;
+		ctx->sha512.count_hi <<= 3;
+		ctx->sha512.count_hi |= ctx->count >> 61;
 		mbl_offset += sizeof(u64) / sizeof(u32);
+		fallthrough;
+	case CPACF_KLMD_SHA_1:
+	case CPACF_KLMD_SHA_256:
 		memcpy(ctx->state + mbl_offset, &bits, sizeof(bits));
 		break;
 	case CPACF_KLMD_SHA3_224:
@@ -121,16 +103,14 @@ int s390_sha_final(struct shash_desc *desc, u8 *out)
 	fc |= test_facility(86) ? CPACF_KLMD_DUFOP : 0;
 	if (ctx->first_message_part)
 		fc |= CPACF_KLMD_NIP;
-	cpacf_klmd(fc, ctx->state, ctx->buf, n);
+	cpacf_klmd(fc, ctx->state, src, len);
 
 	/* copy digest to out */
 	memcpy(out, ctx->state, crypto_shash_digestsize(desc->tfm));
-	/* wipe context */
-	memset(ctx, 0, sizeof *ctx);
 
 	return 0;
 }
-EXPORT_SYMBOL_GPL(s390_sha_final);
+EXPORT_SYMBOL_GPL(s390_sha_finup);
 
 MODULE_LICENSE("GPL");
 MODULE_DESCRIPTION("s390 SHA cipher common functions");
diff --git a/arch/s390/hypfs/inode.c b/arch/s390/hypfs/inode.c
index 04ea1c03a5ff..96409573c75d 100644
--- a/arch/s390/hypfs/inode.c
+++ b/arch/s390/hypfs/inode.c
@@ -342,7 +342,7 @@ static struct dentry *hypfs_create_file(struct dentry *parent, const char *name,
 	struct inode *inode;
 
 	inode_lock(d_inode(parent));
-	dentry = lookup_one_len(name, parent, strlen(name));
+	dentry = lookup_noperm(&QSTR(name), parent);
 	if (IS_ERR(dentry)) {
 		dentry = ERR_PTR(-ENOMEM);
 		goto fail;
diff --git a/arch/s390/include/asm/asce.h b/arch/s390/include/asm/asce.h
new file mode 100644
index 000000000000..f6dfaaba735a
--- /dev/null
+++ b/arch/s390/include/asm/asce.h
@@ -0,0 +1,36 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef _ASM_S390_ASCE_H
+#define _ASM_S390_ASCE_H
+
+#include <linux/thread_info.h>
+#include <linux/irqflags.h>
+#include <asm/lowcore.h>
+#include <asm/ctlreg.h>
+
+static inline bool enable_sacf_uaccess(void)
+{
+	unsigned long flags;
+
+	if (test_thread_flag(TIF_ASCE_PRIMARY))
+		return true;
+	local_irq_save(flags);
+	local_ctl_load(1, &get_lowcore()->kernel_asce);
+	set_thread_flag(TIF_ASCE_PRIMARY);
+	local_irq_restore(flags);
+	return false;
+}
+
+static inline void disable_sacf_uaccess(bool previous)
+{
+	unsigned long flags;
+
+	if (previous)
+		return;
+	local_irq_save(flags);
+	local_ctl_load(1, &get_lowcore()->user_asce);
+	clear_thread_flag(TIF_ASCE_PRIMARY);
+	local_irq_restore(flags);
+}
+
+#endif /* _ASM_S390_ASCE_H */
diff --git a/arch/s390/include/asm/cpacf.h b/arch/s390/include/asm/cpacf.h
index 59ab1192e2d5..54cb97603ec0 100644
--- a/arch/s390/include/asm/cpacf.h
+++ b/arch/s390/include/asm/cpacf.h
@@ -649,18 +649,30 @@ static inline void cpacf_trng(u8 *ucbuf, unsigned long ucbuf_len,
  *		 instruction
  * @func: the function code passed to PCC; see CPACF_KM_xxx defines
  * @param: address of parameter block; see POP for details on each func
+ *
+ * Returns the condition code:
+ * 0 - cc 0 (normal completion)
+ * 1 - cc 1 (protected key wkvp mismatch or src operand out of range)
+ * 2 - cc 2 (something invalid, scalar multiply infinity, ...)
+ * Condition code 3 (partial completion) is handled within the asm code
+ * and never returned.
  */
-static inline void cpacf_pcc(unsigned long func, void *param)
+static inline int cpacf_pcc(unsigned long func, void *param)
 {
+	int cc;
+
 	asm volatile(
 		"	lgr	0,%[fc]\n"
 		"	lgr	1,%[pba]\n"
 		"0:	.insn	rre,%[opc] << 16,0,0\n" /* PCC opcode */
 		"	brc	1,0b\n" /* handle partial completion */
-		:
+		CC_IPM(cc)
+		: CC_OUT(cc, cc)
 		: [fc] "d" (func), [pba] "d" ((unsigned long)param),
 		  [opc] "i" (CPACF_PCC)
-		: "cc", "memory", "0", "1");
+		: CC_CLOBBER_LIST("memory", "0", "1"));
+
+	return CC_TRANSFORM(cc);
 }
 
 /**
diff --git a/arch/s390/include/asm/cpufeature.h b/arch/s390/include/asm/cpufeature.h
index e08169bd63a5..6c6a99660e78 100644
--- a/arch/s390/include/asm/cpufeature.h
+++ b/arch/s390/include/asm/cpufeature.h
@@ -15,6 +15,7 @@ enum {
 	S390_CPU_FEATURE_MSA,
 	S390_CPU_FEATURE_VXRS,
 	S390_CPU_FEATURE_UV,
+	S390_CPU_FEATURE_D288,
 	MAX_CPU_FEATURES
 };
 
diff --git a/arch/s390/include/asm/diag288.h b/arch/s390/include/asm/diag288.h
new file mode 100644
index 000000000000..5e1b43cea9d6
--- /dev/null
+++ b/arch/s390/include/asm/diag288.h
@@ -0,0 +1,41 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef _ASM_S390_DIAG288_H
+#define _ASM_S390_DIAG288_H
+
+#include <asm/asm-extable.h>
+#include <asm/types.h>
+
+#define MIN_INTERVAL 15	    /* Minimal time supported by diag288 */
+#define MAX_INTERVAL 3600   /* One hour should be enough - pure estimation */
+
+#define WDT_DEFAULT_TIMEOUT 30
+
+/* Function codes - init, change, cancel */
+#define WDT_FUNC_INIT 0
+#define WDT_FUNC_CHANGE 1
+#define WDT_FUNC_CANCEL 2
+#define WDT_FUNC_CONCEAL 0x80000000
+
+/* Action codes for LPAR watchdog */
+#define LPARWDT_RESTART 0
+
+static inline int __diag288(unsigned int func, unsigned int timeout,
+			    unsigned long action, unsigned int len)
+{
+	union register_pair r1 = { .even = func, .odd = timeout, };
+	union register_pair r3 = { .even = action, .odd = len, };
+	int rc = -EINVAL;
+
+	asm volatile(
+		"	diag	%[r1],%[r3],0x288\n"
+		"0:	lhi	%[rc],0\n"
+		"1:"
+		EX_TABLE(0b, 1b)
+		: [rc] "+d" (rc)
+		: [r1] "d" (r1.pair), [r3] "d" (r3.pair)
+		: "cc", "memory");
+	return rc;
+}
+
+#endif /* _ASM_S390_DIAG288_H */
diff --git a/arch/s390/include/asm/futex.h b/arch/s390/include/asm/futex.h
index f5781794356b..942f21c39697 100644
--- a/arch/s390/include/asm/futex.h
+++ b/arch/s390/include/asm/futex.h
@@ -13,9 +13,11 @@
 static uaccess_kmsan_or_inline int						\
 __futex_atomic_##name(int oparg, int *old, u32 __user *uaddr)			\
 {										\
+	bool sacf_flag;								\
 	int rc, new;								\
 										\
 	instrument_copy_from_user_before(old, uaddr, sizeof(*old));		\
+	sacf_flag = enable_sacf_uaccess();					\
 	asm_inline volatile(							\
 		"	sacf	256\n"						\
 		"0:	l	%[old],%[uaddr]\n"				\
@@ -32,6 +34,7 @@ __futex_atomic_##name(int oparg, int *old, u32 __user *uaddr)			\
 		  [new] "=&d" (new), [uaddr] "+Q" (*uaddr)			\
 		: [oparg] "d" (oparg)						\
 		: "cc");							\
+	disable_sacf_uaccess(sacf_flag);					\
 	if (!rc)								\
 		instrument_copy_from_user_after(old, uaddr, sizeof(*old), 0);	\
 	return rc;								\
@@ -75,9 +78,11 @@ int arch_futex_atomic_op_inuser(int op, int oparg, int *oval, u32 __user *uaddr)
 static uaccess_kmsan_or_inline
 int futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr, u32 oldval, u32 newval)
 {
+	bool sacf_flag;
 	int rc;
 
 	instrument_copy_from_user_before(uval, uaddr, sizeof(*uval));
+	sacf_flag = enable_sacf_uaccess();
 	asm_inline volatile(
 		"	sacf	256\n"
 		"0:	cs	%[old],%[new],%[uaddr]\n"
@@ -88,6 +93,7 @@ int futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr, u32 oldval, u32
 		: [rc] "=d" (rc), [old] "+d" (oldval), [uaddr] "+Q" (*uaddr)
 		: [new] "d" (newval)
 		: "cc", "memory");
+	disable_sacf_uaccess(sacf_flag);
 	*uval = oldval;
 	instrument_copy_from_user_after(uval, uaddr, sizeof(*uval), 0);
 	return rc;
diff --git a/arch/s390/include/asm/machine.h b/arch/s390/include/asm/machine.h
index 54478caa5237..8abe5afdbfc4 100644
--- a/arch/s390/include/asm/machine.h
+++ b/arch/s390/include/asm/machine.h
@@ -18,6 +18,7 @@
 #define MFEATURE_VM		7
 #define MFEATURE_KVM		8
 #define MFEATURE_LPAR		9
+#define MFEATURE_DIAG288	10
 
 #ifndef __ASSEMBLY__
 
diff --git a/arch/s390/include/asm/mmu_context.h b/arch/s390/include/asm/mmu_context.h
index 88f84beebb9e..d9b8501bc93d 100644
--- a/arch/s390/include/asm/mmu_context.h
+++ b/arch/s390/include/asm/mmu_context.h
@@ -13,6 +13,7 @@
 #include <linux/mm_types.h>
 #include <asm/tlbflush.h>
 #include <asm/ctlreg.h>
+#include <asm/asce.h>
 #include <asm-generic/mm_hooks.h>
 
 #define init_new_context init_new_context
@@ -77,7 +78,8 @@ static inline void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *
 	else
 		get_lowcore()->user_asce.val = next->context.asce;
 	cpumask_set_cpu(cpu, &next->context.cpu_attach_mask);
-	/* Clear previous user-ASCE from CR7 */
+	/* Clear previous user-ASCE from CR1 and CR7 */
+	local_ctl_load(1, &s390_invalid_asce);
 	local_ctl_load(7, &s390_invalid_asce);
 	if (prev != next)
 		cpumask_clear_cpu(cpu, &prev->context.cpu_attach_mask);
@@ -99,6 +101,7 @@ static inline void finish_arch_post_lock_switch(void)
 {
 	struct task_struct *tsk = current;
 	struct mm_struct *mm = tsk->mm;
+	unsigned long flags;
 
 	if (mm) {
 		preempt_disable();
@@ -108,15 +111,25 @@ static inline void finish_arch_post_lock_switch(void)
 		__tlb_flush_mm_lazy(mm);
 		preempt_enable();
 	}
+	local_irq_save(flags);
+	if (test_thread_flag(TIF_ASCE_PRIMARY))
+		local_ctl_load(1, &get_lowcore()->kernel_asce);
+	else
+		local_ctl_load(1, &get_lowcore()->user_asce);
 	local_ctl_load(7, &get_lowcore()->user_asce);
+	local_irq_restore(flags);
 }
 
 #define activate_mm activate_mm
 static inline void activate_mm(struct mm_struct *prev,
                                struct mm_struct *next)
 {
-	switch_mm(prev, next, current);
+	switch_mm_irqs_off(prev, next, current);
 	cpumask_set_cpu(smp_processor_id(), mm_cpumask(next));
+	if (test_thread_flag(TIF_ASCE_PRIMARY))
+		local_ctl_load(1, &get_lowcore()->kernel_asce);
+	else
+		local_ctl_load(1, &get_lowcore()->user_asce);
 	local_ctl_load(7, &get_lowcore()->user_asce);
 }
 
diff --git a/arch/s390/include/asm/pkey.h b/arch/s390/include/asm/pkey.h
index 5dca1a46a9f6..b7b59faf16f4 100644
--- a/arch/s390/include/asm/pkey.h
+++ b/arch/s390/include/asm/pkey.h
@@ -20,9 +20,22 @@
  * @param key pointer to a buffer containing the key blob
  * @param keylen size of the key blob in bytes
  * @param protkey pointer to buffer receiving the protected key
+ * @param xflags additional execution flags (see PKEY_XFLAG_* definitions below)
+ *	  As of now the only supported flag is PKEY_XFLAG_NOMEMALLOC.
  * @return 0 on success, negative errno value on failure
  */
 int pkey_key2protkey(const u8 *key, u32 keylen,
-		     u8 *protkey, u32 *protkeylen, u32 *protkeytype);
+		     u8 *protkey, u32 *protkeylen, u32 *protkeytype,
+		     u32 xflags);
+
+/*
+ * If this flag is given in the xflags parameter, the pkey implementation
+ * is not allowed to allocate memory but instead should fall back to use
+ * preallocated memory or simply fail with -ENOMEM.
+ * This flag is for protected key derive within a cipher or similar
+ * which must not allocate memory which would cause io operations - see
+ * also the CRYPTO_ALG_ALLOCATES_MEMORY flag in crypto.h.
+ */
+#define PKEY_XFLAG_NOMEMALLOC 0x0001
 
 #endif /* _KAPI_PKEY_H */
diff --git a/arch/s390/include/asm/ptrace.h b/arch/s390/include/asm/ptrace.h
index c66f3fc6daaf..62c0ab4a4b9d 100644
--- a/arch/s390/include/asm/ptrace.h
+++ b/arch/s390/include/asm/ptrace.h
@@ -9,6 +9,7 @@
 
 #include <linux/bits.h>
 #include <uapi/asm/ptrace.h>
+#include <asm/thread_info.h>
 #include <asm/tpi.h>
 
 #define PIF_SYSCALL			0	/* inside a system call */
@@ -126,7 +127,6 @@ struct pt_regs {
 		struct tpi_info tpi_info;
 	};
 	unsigned long flags;
-	unsigned long cr1;
 	unsigned long last_break;
 };
 
@@ -229,8 +229,44 @@ static inline void instruction_pointer_set(struct pt_regs *regs,
 
 int regs_query_register_offset(const char *name);
 const char *regs_query_register_name(unsigned int offset);
-unsigned long regs_get_register(struct pt_regs *regs, unsigned int offset);
-unsigned long regs_get_kernel_stack_nth(struct pt_regs *regs, unsigned int n);
+
+static __always_inline unsigned long kernel_stack_pointer(struct pt_regs *regs)
+{
+	return regs->gprs[15];
+}
+
+static __always_inline unsigned long regs_get_register(struct pt_regs *regs, unsigned int offset)
+{
+	if (offset >= NUM_GPRS)
+		return 0;
+	return regs->gprs[offset];
+}
+
+static __always_inline int regs_within_kernel_stack(struct pt_regs *regs, unsigned long addr)
+{
+	unsigned long ksp = kernel_stack_pointer(regs);
+
+	return (addr & ~(THREAD_SIZE - 1)) == (ksp & ~(THREAD_SIZE - 1));
+}
+
+/**
+ * regs_get_kernel_stack_nth() - get Nth entry of the stack
+ * @regs: pt_regs which contains kernel stack pointer.
+ * @n: stack entry number.
+ *
+ * regs_get_kernel_stack_nth() returns the content of the @n th entry of the
+ * kernel stack which is specified by @regs. If the @n th entry is NOT in the
+ * kernel stack, this returns 0.
+ */
+static __always_inline unsigned long regs_get_kernel_stack_nth(struct pt_regs *regs, unsigned int n)
+{
+	unsigned long addr;
+
+	addr = kernel_stack_pointer(regs) + n * sizeof(long);
+	if (!regs_within_kernel_stack(regs, addr))
+		return 0;
+	return READ_ONCE_NOCHECK(*(unsigned long *)addr);
+}
 
 /**
  * regs_get_kernel_argument() - get Nth function argument in kernel
@@ -251,11 +287,6 @@ static inline unsigned long regs_get_kernel_argument(struct pt_regs *regs,
 	return regs_get_kernel_stack_nth(regs, argoffset + n);
 }
 
-static inline unsigned long kernel_stack_pointer(struct pt_regs *regs)
-{
-	return regs->gprs[15];
-}
-
 static inline void regs_set_return_value(struct pt_regs *regs, unsigned long rc)
 {
 	regs->gprs[2] = rc;
diff --git a/arch/s390/include/asm/string.h b/arch/s390/include/asm/string.h
index 2ab868cbae6c..f8f68f4ef255 100644
--- a/arch/s390/include/asm/string.h
+++ b/arch/s390/include/asm/string.h
@@ -26,11 +26,9 @@ void *memmove(void *dest, const void *src, size_t n);
 #define __HAVE_ARCH_MEMSCAN	/* inline & arch function */
 #define __HAVE_ARCH_STRCAT	/* inline & arch function */
 #define __HAVE_ARCH_STRCMP	/* arch function */
-#define __HAVE_ARCH_STRCPY	/* inline & arch function */
 #define __HAVE_ARCH_STRLCAT	/* arch function */
 #define __HAVE_ARCH_STRLEN	/* inline & arch function */
 #define __HAVE_ARCH_STRNCAT	/* arch function */
-#define __HAVE_ARCH_STRNCPY	/* arch function */
 #define __HAVE_ARCH_STRNLEN	/* inline & arch function */
 #define __HAVE_ARCH_STRSTR	/* arch function */
 #define __HAVE_ARCH_MEMSET16	/* arch function */
@@ -42,7 +40,6 @@ int memcmp(const void *s1, const void *s2, size_t n);
 int strcmp(const char *s1, const char *s2);
 size_t strlcat(char *dest, const char *src, size_t n);
 char *strncat(char *dest, const char *src, size_t n);
-char *strncpy(char *dest, const char *src, size_t n);
 char *strstr(const char *s1, const char *s2);
 #endif /* !defined(CONFIG_KASAN) && !defined(CONFIG_KMSAN) */
 
@@ -155,22 +152,6 @@ static inline char *strcat(char *dst, const char *src)
 }
 #endif
 
-#ifdef __HAVE_ARCH_STRCPY
-static inline char *strcpy(char *dst, const char *src)
-{
-	char *ret = dst;
-
-	asm volatile(
-		"	lghi	0,0\n"
-		"0:	mvst	%[dst],%[src]\n"
-		"	jo	0b"
-		: [dst] "+&a" (dst), [src] "+&a" (src)
-		:
-		: "cc", "memory", "0");
-	return ret;
-}
-#endif
-
 #if defined(__HAVE_ARCH_STRLEN) || (defined(CONFIG_KASAN) && !defined(__SANITIZE_ADDRESS__))
 static inline size_t __no_sanitize_prefix_strfunc(strlen)(const char *s)
 {
@@ -208,7 +189,6 @@ static inline size_t strnlen(const char * s, size_t n)
 void *memchr(const void * s, int c, size_t n);
 void *memscan(void *s, int c, size_t n);
 char *strcat(char *dst, const char *src);
-char *strcpy(char *dst, const char *src);
 size_t strlen(const char *s);
 size_t strnlen(const char * s, size_t n);
 #endif /* !IN_ARCH_STRING_C */
diff --git a/arch/s390/include/asm/thread_info.h b/arch/s390/include/asm/thread_info.h
index 91f569cae1ce..391eb04d26d8 100644
--- a/arch/s390/include/asm/thread_info.h
+++ b/arch/s390/include/asm/thread_info.h
@@ -9,6 +9,7 @@
 #define _ASM_THREAD_INFO_H
 
 #include <linux/bits.h>
+#include <vdso/page.h>
 
 /*
  * General size of kernel stacks
@@ -24,8 +25,6 @@
 #define STACK_INIT_OFFSET (THREAD_SIZE - STACK_FRAME_OVERHEAD - __PT_SIZE)
 
 #ifndef __ASSEMBLY__
-#include <asm/lowcore.h>
-#include <asm/page.h>
 
 /*
  * low level task data that entry.S needs immediate access to
@@ -64,6 +63,7 @@ void arch_setup_new_exec(void);
 #define TIF_NEED_RESCHED_LAZY	3	/* lazy rescheduling needed */
 #define TIF_UPROBE		4	/* breakpointed or single-stepping */
 #define TIF_PATCH_PENDING	5	/* pending live patching update */
+#define TIF_ASCE_PRIMARY	6	/* primary asce is kernel asce */
 #define TIF_NOTIFY_SIGNAL	7	/* signal notifications exist */
 #define TIF_GUARDED_STORAGE	8	/* load guarded storage control block */
 #define TIF_ISOLATE_BP_GUEST	9	/* Run KVM guests with isolated BP */
@@ -85,6 +85,7 @@ void arch_setup_new_exec(void);
 #define _TIF_NEED_RESCHED_LAZY	BIT(TIF_NEED_RESCHED_LAZY)
 #define _TIF_UPROBE		BIT(TIF_UPROBE)
 #define _TIF_PATCH_PENDING	BIT(TIF_PATCH_PENDING)
+#define _TIF_ASCE_PRIMARY	BIT(TIF_ASCE_PRIMARY)
 #define _TIF_NOTIFY_SIGNAL	BIT(TIF_NOTIFY_SIGNAL)
 #define _TIF_GUARDED_STORAGE	BIT(TIF_GUARDED_STORAGE)
 #define _TIF_ISOLATE_BP_GUEST	BIT(TIF_ISOLATE_BP_GUEST)
diff --git a/arch/s390/include/asm/uaccess.h b/arch/s390/include/asm/uaccess.h
index 8629d70ec38b..a43fc88c0050 100644
--- a/arch/s390/include/asm/uaccess.h
+++ b/arch/s390/include/asm/uaccess.h
@@ -19,6 +19,7 @@
 #include <asm/extable.h>
 #include <asm/facility.h>
 #include <asm-generic/access_ok.h>
+#include <asm/asce.h>
 #include <linux/instrumented.h>
 
 void debug_user_asce(int exit);
@@ -478,6 +479,7 @@ static __always_inline int __cmpxchg_user_key(unsigned long address, void *uval,
 					      __uint128_t old, __uint128_t new,
 					      unsigned long key, int size)
 {
+	bool sacf_flag;
 	int rc = 0;
 
 	switch (size) {
@@ -490,6 +492,7 @@ static __always_inline int __cmpxchg_user_key(unsigned long address, void *uval,
 		_old = ((unsigned int)old & 0xff) << shift;
 		_new = ((unsigned int)new & 0xff) << shift;
 		mask = ~(0xff << shift);
+		sacf_flag = enable_sacf_uaccess();
 		asm_inline volatile(
 			"	spka	0(%[key])\n"
 			"	sacf	256\n"
@@ -524,6 +527,7 @@ static __always_inline int __cmpxchg_user_key(unsigned long address, void *uval,
 			  [default_key] "J" (PAGE_DEFAULT_KEY),
 			  [max_loops] "J" (CMPXCHG_USER_KEY_MAX_LOOPS)
 			: "memory", "cc");
+		disable_sacf_uaccess(sacf_flag);
 		*(unsigned char *)uval = prev >> shift;
 		if (!count)
 			rc = -EAGAIN;
@@ -538,6 +542,7 @@ static __always_inline int __cmpxchg_user_key(unsigned long address, void *uval,
 		_old = ((unsigned int)old & 0xffff) << shift;
 		_new = ((unsigned int)new & 0xffff) << shift;
 		mask = ~(0xffff << shift);
+		sacf_flag = enable_sacf_uaccess();
 		asm_inline volatile(
 			"	spka	0(%[key])\n"
 			"	sacf	256\n"
@@ -572,6 +577,7 @@ static __always_inline int __cmpxchg_user_key(unsigned long address, void *uval,
 			  [default_key] "J" (PAGE_DEFAULT_KEY),
 			  [max_loops] "J" (CMPXCHG_USER_KEY_MAX_LOOPS)
 			: "memory", "cc");
+		disable_sacf_uaccess(sacf_flag);
 		*(unsigned short *)uval = prev >> shift;
 		if (!count)
 			rc = -EAGAIN;
@@ -580,6 +586,7 @@ static __always_inline int __cmpxchg_user_key(unsigned long address, void *uval,
 	case 4:	{
 		unsigned int prev = old;
 
+		sacf_flag = enable_sacf_uaccess();
 		asm_inline volatile(
 			"	spka	0(%[key])\n"
 			"	sacf	256\n"
@@ -595,12 +602,14 @@ static __always_inline int __cmpxchg_user_key(unsigned long address, void *uval,
 			  [key] "a" (key << 4),
 			  [default_key] "J" (PAGE_DEFAULT_KEY)
 			: "memory", "cc");
+		disable_sacf_uaccess(sacf_flag);
 		*(unsigned int *)uval = prev;
 		return rc;
 	}
 	case 8: {
 		unsigned long prev = old;
 
+		sacf_flag = enable_sacf_uaccess();
 		asm_inline volatile(
 			"	spka	0(%[key])\n"
 			"	sacf	256\n"
@@ -616,12 +625,14 @@ static __always_inline int __cmpxchg_user_key(unsigned long address, void *uval,
 			  [key] "a" (key << 4),
 			  [default_key] "J" (PAGE_DEFAULT_KEY)
 			: "memory", "cc");
+		disable_sacf_uaccess(sacf_flag);
 		*(unsigned long *)uval = prev;
 		return rc;
 	}
 	case 16: {
 		__uint128_t prev = old;
 
+		sacf_flag = enable_sacf_uaccess();
 		asm_inline volatile(
 			"	spka	0(%[key])\n"
 			"	sacf	256\n"
@@ -637,6 +648,7 @@ static __always_inline int __cmpxchg_user_key(unsigned long address, void *uval,
 			  [key] "a" (key << 4),
 			  [default_key] "J" (PAGE_DEFAULT_KEY)
 			: "memory", "cc");
+		disable_sacf_uaccess(sacf_flag);
 		*(__uint128_t *)uval = prev;
 		return rc;
 	}
diff --git a/arch/s390/include/asm/uv.h b/arch/s390/include/asm/uv.h
index 46fb0ef6f984..b008402ec9aa 100644
--- a/arch/s390/include/asm/uv.h
+++ b/arch/s390/include/asm/uv.h
@@ -616,8 +616,9 @@ static inline int uv_remove_shared(unsigned long addr)
 	return share(addr, UVC_CMD_REMOVE_SHARED_ACCESS);
 }
 
-int uv_get_secret_metadata(const u8 secret_id[UV_SECRET_ID_LEN],
-			   struct uv_secret_list_item_hdr *secret);
+int uv_find_secret(const u8 secret_id[UV_SECRET_ID_LEN],
+		   struct uv_secret_list *list,
+		   struct uv_secret_list_item_hdr *secret);
 int uv_retrieve_secret(u16 secret_idx, u8 *buf, size_t buf_size);
 
 extern int prot_virt_host;
diff --git a/arch/s390/kernel/asm-offsets.c b/arch/s390/kernel/asm-offsets.c
index 841e05f7fa7e..95ecad9c7d7d 100644
--- a/arch/s390/kernel/asm-offsets.c
+++ b/arch/s390/kernel/asm-offsets.c
@@ -50,7 +50,6 @@ int main(void)
 	OFFSET(__PT_ORIG_GPR2, pt_regs, orig_gpr2);
 	OFFSET(__PT_INT_CODE, pt_regs, int_code);
 	OFFSET(__PT_FLAGS, pt_regs, flags);
-	OFFSET(__PT_CR1, pt_regs, cr1);
 	OFFSET(__PT_LAST_BREAK, pt_regs, last_break);
 	DEFINE(__PT_SIZE, sizeof(struct pt_regs));
 	BLANK();
diff --git a/arch/s390/kernel/cert_store.c b/arch/s390/kernel/cert_store.c
index 03f3a1e52430..c217a5e64094 100644
--- a/arch/s390/kernel/cert_store.c
+++ b/arch/s390/kernel/cert_store.c
@@ -138,7 +138,7 @@ static void cert_store_key_describe(const struct key *key, struct seq_file *m)
 	 * First 64 bytes of the key description is key name in EBCDIC CP 500.
 	 * Convert it to ASCII for displaying in /proc/keys.
 	 */
-	strscpy(ascii, key->description, sizeof(ascii));
+	strscpy(ascii, key->description);
 	EBCASC_500(ascii, VC_NAME_LEN_BYTES);
 	seq_puts(m, ascii);
 
diff --git a/arch/s390/kernel/cpufeature.c b/arch/s390/kernel/cpufeature.c
index 1b2ae42a0c15..76210f001028 100644
--- a/arch/s390/kernel/cpufeature.c
+++ b/arch/s390/kernel/cpufeature.c
@@ -5,11 +5,13 @@
 
 #include <linux/cpufeature.h>
 #include <linux/bug.h>
+#include <asm/machine.h>
 #include <asm/elf.h>
 
 enum {
 	TYPE_HWCAP,
 	TYPE_FACILITY,
+	TYPE_MACHINE,
 };
 
 struct s390_cpu_feature {
@@ -21,6 +23,7 @@ static struct s390_cpu_feature s390_cpu_features[MAX_CPU_FEATURES] = {
 	[S390_CPU_FEATURE_MSA]	= {.type = TYPE_HWCAP, .num = HWCAP_NR_MSA},
 	[S390_CPU_FEATURE_VXRS]	= {.type = TYPE_HWCAP, .num = HWCAP_NR_VXRS},
 	[S390_CPU_FEATURE_UV]	= {.type = TYPE_FACILITY, .num = 158},
+	[S390_CPU_FEATURE_D288]	= {.type = TYPE_MACHINE, .num = MFEATURE_DIAG288},
 };
 
 /*
@@ -38,6 +41,8 @@ int cpu_have_feature(unsigned int num)
 		return !!(elf_hwcap & BIT(feature->num));
 	case TYPE_FACILITY:
 		return test_facility(feature->num);
+	case TYPE_MACHINE:
+		return test_machine_feature(feature->num);
 	default:
 		WARN_ON_ONCE(1);
 		return 0;
diff --git a/arch/s390/kernel/crash_dump.c b/arch/s390/kernel/crash_dump.c
index 4a981266b483..adb164223f8c 100644
--- a/arch/s390/kernel/crash_dump.c
+++ b/arch/s390/kernel/crash_dump.c
@@ -354,7 +354,7 @@ static void *nt_prpsinfo(void *ptr)
 
 	memset(&prpsinfo, 0, sizeof(prpsinfo));
 	prpsinfo.pr_sname = 'R';
-	strcpy(prpsinfo.pr_fname, "vmlinux");
+	strscpy(prpsinfo.pr_fname, "vmlinux");
 	return nt_init(ptr, PRPSINFO, prpsinfo);
 }
 
diff --git a/arch/s390/kernel/debug.c b/arch/s390/kernel/debug.c
index ce038e9205f7..2a41be2f7925 100644
--- a/arch/s390/kernel/debug.c
+++ b/arch/s390/kernel/debug.c
@@ -251,7 +251,7 @@ static debug_info_t *debug_info_alloc(const char *name, int pages_per_area,
 	rc->level	   = level;
 	rc->buf_size	   = buf_size;
 	rc->entry_size	   = sizeof(debug_entry_t) + buf_size;
-	strscpy(rc->name, name, sizeof(rc->name));
+	strscpy(rc->name, name);
 	memset(rc->views, 0, DEBUG_MAX_VIEWS * sizeof(struct debug_view *));
 	memset(rc->debugfs_entries, 0, DEBUG_MAX_VIEWS * sizeof(struct dentry *));
 	refcount_set(&(rc->ref_count), 0);
diff --git a/arch/s390/kernel/entry.S b/arch/s390/kernel/entry.S
index dd291c9ad6a6..0f00f4b06d51 100644
--- a/arch/s390/kernel/entry.S
+++ b/arch/s390/kernel/entry.S
@@ -116,7 +116,7 @@ _LPP_OFFSET	= __LC_LPP
 	.macro SIEEXIT sie_control,lowcore
 	lg	%r9,\sie_control			# get control block pointer
 	ni	__SIE_PROG0C+3(%r9),0xfe		# no longer in SIE
-	lctlg	%c1,%c1,__LC_KERNEL_ASCE(\lowcore)	# load primary asce
+	lctlg	%c1,%c1,__LC_USER_ASCE(\lowcore)	# load primary asce
 	lg	%r9,__LC_CURRENT(\lowcore)
 	mvi	__TI_sie(%r9),0
 	larl	%r9,sie_exit			# skip forward to sie_exit
@@ -208,7 +208,7 @@ SYM_FUNC_START(__sie64a)
 	lg	%r14,__SF_SIE_CONTROL(%r15)	# get control block pointer
 	ni	__SIE_PROG0C+3(%r14),0xfe	# no longer in SIE
 	GET_LC	%r14
-	lctlg	%c1,%c1,__LC_KERNEL_ASCE(%r14)	# load primary asce
+	lctlg	%c1,%c1,__LC_USER_ASCE(%r14)	# load primary asce
 	lg	%r14,__LC_CURRENT(%r14)
 	mvi	__TI_sie(%r14),0
 SYM_INNER_LABEL(sie_exit, SYM_L_GLOBAL)
@@ -240,7 +240,6 @@ SYM_CODE_START(system_call)
 	lghi	%r14,0
 .Lsysc_per:
 	STBEAR	__LC_LAST_BREAK(%r13)
-	lctlg	%c1,%c1,__LC_KERNEL_ASCE(%r13)
 	lg	%r15,__LC_KERNEL_STACK(%r13)
 	xc	__SF_BACKCHAIN(8,%r15),__SF_BACKCHAIN(%r15)
 	stmg	%r0,%r7,STACK_FRAME_OVERHEAD+__PT_R0(%r15)
@@ -261,7 +260,6 @@ SYM_CODE_START(system_call)
 	lgr	%r3,%r14
 	brasl	%r14,__do_syscall
 	STACKLEAK_ERASE
-	lctlg	%c1,%c1,__LC_USER_ASCE(%r13)
 	mvc	__LC_RETURN_PSW(16,%r13),STACK_FRAME_OVERHEAD+__PT_PSW(%r15)
 	BPON
 	LBEAR	STACK_FRAME_OVERHEAD+__PT_LAST_BREAK(%r15)
@@ -278,7 +276,6 @@ SYM_CODE_START(ret_from_fork)
 	brasl	%r14,__ret_from_fork
 	STACKLEAK_ERASE
 	GET_LC	%r13
-	lctlg	%c1,%c1,__LC_USER_ASCE(%r13)
 	mvc	__LC_RETURN_PSW(16,%r13),STACK_FRAME_OVERHEAD+__PT_PSW(%r15)
 	BPON
 	LBEAR	STACK_FRAME_OVERHEAD+__PT_LAST_BREAK(%r15)
@@ -299,10 +296,7 @@ SYM_CODE_START(pgm_check_handler)
 	lmg	%r8,%r9,__LC_PGM_OLD_PSW(%r13)
 	xgr	%r10,%r10
 	tmhh	%r8,0x0001		# coming from user space?
-	jno	.Lpgm_skip_asce
-	lctlg	%c1,%c1,__LC_KERNEL_ASCE(%r13)
-	j	3f			# -> fault in user space
-.Lpgm_skip_asce:
+	jo	3f			# -> fault in user space
 #if IS_ENABLED(CONFIG_KVM)
 	lg	%r11,__LC_CURRENT(%r13)
 	tm	__TI_sie(%r11),0xff
@@ -340,7 +334,6 @@ SYM_CODE_START(pgm_check_handler)
 	tmhh	%r8,0x0001		# returning to user space?
 	jno	.Lpgm_exit_kernel
 	STACKLEAK_ERASE
-	lctlg	%c1,%c1,__LC_USER_ASCE(%r13)
 	BPON
 	stpt	__LC_EXIT_TIMER(%r13)
 .Lpgm_exit_kernel:
@@ -384,8 +377,7 @@ SYM_CODE_START(\name)
 #endif
 0:	aghi	%r15,-(STACK_FRAME_OVERHEAD + __PT_SIZE)
 	j	2f
-1:	lctlg	%c1,%c1,__LC_KERNEL_ASCE(%r13)
-	lg	%r15,__LC_KERNEL_STACK(%r13)
+1:	lg	%r15,__LC_KERNEL_STACK(%r13)
 2:	xc	__SF_BACKCHAIN(8,%r15),__SF_BACKCHAIN(%r15)
 	la	%r11,STACK_FRAME_OVERHEAD(%r15)
 	stmg	%r0,%r7,__PT_R0(%r11)
@@ -408,7 +400,6 @@ SYM_CODE_START(\name)
 	tmhh	%r8,0x0001		# returning to user ?
 	jno	2f
 	STACKLEAK_ERASE
-	lctlg	%c1,%c1,__LC_USER_ASCE(%r13)
 	BPON
 	stpt	__LC_EXIT_TIMER(%r13)
 2:	LBEAR	__PT_LAST_BREAK(%r11)
@@ -476,8 +467,6 @@ SYM_CODE_START(mcck_int_handler)
 .Lmcck_user:
 	lg	%r15,__LC_MCCK_STACK(%r13)
 	la	%r11,STACK_FRAME_OVERHEAD(%r15)
-	stctg	%c1,%c1,__PT_CR1(%r11)
-	lctlg	%c1,%c1,__LC_KERNEL_ASCE(%r13)
 	xc	__SF_BACKCHAIN(8,%r15),__SF_BACKCHAIN(%r15)
 	lay	%r14,__LC_GPREGS_SAVE_AREA(%r13)
 	mvc	__PT_R0(128,%r11),0(%r14)
@@ -495,7 +484,6 @@ SYM_CODE_START(mcck_int_handler)
 	xc	__SF_BACKCHAIN(8,%r15),__SF_BACKCHAIN(%r15)
 	lgr	%r2,%r11		# pass pointer to pt_regs
 	brasl	%r14,s390_do_machine_check
-	lctlg	%c1,%c1,__PT_CR1(%r11)
 	lmg	%r0,%r10,__PT_R0(%r11)
 	mvc	__LC_RETURN_MCCK_PSW(16,%r13),__PT_PSW(%r11) # move return PSW
 	tm	__LC_RETURN_MCCK_PSW+1(%r13),0x01 # returning to user ?
@@ -602,7 +590,8 @@ SYM_CODE_START(stack_invalid)
 	stmg	%r0,%r7,__PT_R0(%r11)
 	stmg	%r8,%r9,__PT_PSW(%r11)
 	mvc	__PT_R8(64,%r11),0(%r14)
-	stg	%r10,__PT_ORIG_GPR2(%r11) # store last break to orig_gpr2
+	GET_LC	%r2
+	mvc	__PT_ORIG_GPR2(8,%r11),__LC_PGM_LAST_BREAK(%r2)
 	xc	__SF_BACKCHAIN(8,%r15),__SF_BACKCHAIN(%r15)
 	lgr	%r2,%r11		# pass pointer to pt_regs
 	jg	kernel_stack_invalid
diff --git a/arch/s390/kernel/ipl.c b/arch/s390/kernel/ipl.c
index 3b9d9ccfad63..ff15f91affde 100644
--- a/arch/s390/kernel/ipl.c
+++ b/arch/s390/kernel/ipl.c
@@ -270,7 +270,7 @@ static ssize_t sys_##_prefix##_##_name##_store(struct kobject *kobj,	\
 {									\
 	if (len >= sizeof(_value))					\
 		return -E2BIG;						\
-	len = strscpy(_value, buf, sizeof(_value));			\
+	len = strscpy(_value, buf);					\
 	if ((ssize_t)len < 0)						\
 		return len;						\
 	strim(_value);							\
@@ -2249,26 +2249,28 @@ static int __init s390_ipl_init(void)
 
 __initcall(s390_ipl_init);
 
-static void __init strncpy_skip_quote(char *dst, char *src, int n)
+static void __init strscpy_skip_quote(char *dst, char *src, int n)
 {
 	int sx, dx;
 
-	dx = 0;
-	for (sx = 0; src[sx] != 0; sx++) {
+	if (!n)
+		return;
+	for (sx = 0, dx = 0; src[sx]; sx++) {
 		if (src[sx] == '"')
 			continue;
-		dst[dx++] = src[sx];
-		if (dx >= n)
+		dst[dx] = src[sx];
+		if (dx + 1 == n)
 			break;
+		dx++;
 	}
+	dst[dx] = '\0';
 }
 
 static int __init vmcmd_on_reboot_setup(char *str)
 {
 	if (!machine_is_vm())
 		return 1;
-	strncpy_skip_quote(vmcmd_on_reboot, str, VMCMD_MAX_SIZE);
-	vmcmd_on_reboot[VMCMD_MAX_SIZE] = 0;
+	strscpy_skip_quote(vmcmd_on_reboot, str, sizeof(vmcmd_on_reboot));
 	on_reboot_trigger.action = &vmcmd_action;
 	return 1;
 }
@@ -2278,8 +2280,7 @@ static int __init vmcmd_on_panic_setup(char *str)
 {
 	if (!machine_is_vm())
 		return 1;
-	strncpy_skip_quote(vmcmd_on_panic, str, VMCMD_MAX_SIZE);
-	vmcmd_on_panic[VMCMD_MAX_SIZE] = 0;
+	strscpy_skip_quote(vmcmd_on_panic, str, sizeof(vmcmd_on_panic));
 	on_panic_trigger.action = &vmcmd_action;
 	return 1;
 }
@@ -2289,8 +2290,7 @@ static int __init vmcmd_on_halt_setup(char *str)
 {
 	if (!machine_is_vm())
 		return 1;
-	strncpy_skip_quote(vmcmd_on_halt, str, VMCMD_MAX_SIZE);
-	vmcmd_on_halt[VMCMD_MAX_SIZE] = 0;
+	strscpy_skip_quote(vmcmd_on_halt, str, sizeof(vmcmd_on_halt));
 	on_halt_trigger.action = &vmcmd_action;
 	return 1;
 }
@@ -2300,8 +2300,7 @@ static int __init vmcmd_on_poff_setup(char *str)
 {
 	if (!machine_is_vm())
 		return 1;
-	strncpy_skip_quote(vmcmd_on_poff, str, VMCMD_MAX_SIZE);
-	vmcmd_on_poff[VMCMD_MAX_SIZE] = 0;
+	strscpy_skip_quote(vmcmd_on_poff, str, sizeof(vmcmd_on_poff));
 	on_poff_trigger.action = &vmcmd_action;
 	return 1;
 }
diff --git a/arch/s390/kernel/perf_cpum_cf_events.c b/arch/s390/kernel/perf_cpum_cf_events.c
index 690a293eb10d..7ace1f9e4ccf 100644
--- a/arch/s390/kernel/perf_cpum_cf_events.c
+++ b/arch/s390/kernel/perf_cpum_cf_events.c
@@ -290,8 +290,8 @@ CPUMF_EVENT_ATTR(cf_z15, TX_C_TABORT_NO_SPECIAL, 0x00f4);
 CPUMF_EVENT_ATTR(cf_z15, TX_C_TABORT_SPECIAL, 0x00f5);
 CPUMF_EVENT_ATTR(cf_z15, DFLT_ACCESS, 0x00f7);
 CPUMF_EVENT_ATTR(cf_z15, DFLT_CYCLES, 0x00fc);
-CPUMF_EVENT_ATTR(cf_z15, DFLT_CC, 0x00108);
-CPUMF_EVENT_ATTR(cf_z15, DFLT_CCFINISH, 0x00109);
+CPUMF_EVENT_ATTR(cf_z15, DFLT_CC, 0x0108);
+CPUMF_EVENT_ATTR(cf_z15, DFLT_CCFINISH, 0x0109);
 CPUMF_EVENT_ATTR(cf_z15, MT_DIAG_CYCLES_ONE_THR_ACTIVE, 0x01c0);
 CPUMF_EVENT_ATTR(cf_z15, MT_DIAG_CYCLES_TWO_THR_ACTIVE, 0x01c1);
 CPUMF_EVENT_ATTR(cf_z16, L1D_RO_EXCL_WRITES, 0x0080);
diff --git a/arch/s390/kernel/processor.c b/arch/s390/kernel/processor.c
index 80b1f7a29f11..11f70c1e2797 100644
--- a/arch/s390/kernel/processor.c
+++ b/arch/s390/kernel/processor.c
@@ -268,35 +268,35 @@ static int __init setup_elf_platform(void)
 	add_device_randomness(&cpu_id, sizeof(cpu_id));
 	switch (cpu_id.machine) {
 	default:	/* Use "z10" as default. */
-		strcpy(elf_platform, "z10");
+		strscpy(elf_platform, "z10");
 		break;
 	case 0x2817:
 	case 0x2818:
-		strcpy(elf_platform, "z196");
+		strscpy(elf_platform, "z196");
 		break;
 	case 0x2827:
 	case 0x2828:
-		strcpy(elf_platform, "zEC12");
+		strscpy(elf_platform, "zEC12");
 		break;
 	case 0x2964:
 	case 0x2965:
-		strcpy(elf_platform, "z13");
+		strscpy(elf_platform, "z13");
 		break;
 	case 0x3906:
 	case 0x3907:
-		strcpy(elf_platform, "z14");
+		strscpy(elf_platform, "z14");
 		break;
 	case 0x8561:
 	case 0x8562:
-		strcpy(elf_platform, "z15");
+		strscpy(elf_platform, "z15");
 		break;
 	case 0x3931:
 	case 0x3932:
-		strcpy(elf_platform, "z16");
+		strscpy(elf_platform, "z16");
 		break;
 	case 0x9175:
 	case 0x9176:
-		strcpy(elf_platform, "z17");
+		strscpy(elf_platform, "z17");
 		break;
 	}
 	return 0;
diff --git a/arch/s390/kernel/ptrace.c b/arch/s390/kernel/ptrace.c
index 34b8d9e745df..e1240f6b29fa 100644
--- a/arch/s390/kernel/ptrace.c
+++ b/arch/s390/kernel/ptrace.c
@@ -1524,13 +1524,6 @@ static const char *gpr_names[NUM_GPRS] = {
 	"r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
 };
 
-unsigned long regs_get_register(struct pt_regs *regs, unsigned int offset)
-{
-	if (offset >= NUM_GPRS)
-		return 0;
-	return regs->gprs[offset];
-}
-
 int regs_query_register_offset(const char *name)
 {
 	unsigned long offset;
@@ -1550,29 +1543,3 @@ const char *regs_query_register_name(unsigned int offset)
 		return NULL;
 	return gpr_names[offset];
 }
-
-static int regs_within_kernel_stack(struct pt_regs *regs, unsigned long addr)
-{
-	unsigned long ksp = kernel_stack_pointer(regs);
-
-	return (addr & ~(THREAD_SIZE - 1)) == (ksp & ~(THREAD_SIZE - 1));
-}
-
-/**
- * regs_get_kernel_stack_nth() - get Nth entry of the stack
- * @regs:pt_regs which contains kernel stack pointer.
- * @n:stack entry number.
- *
- * regs_get_kernel_stack_nth() returns @n th entry of the kernel stack which
- * is specifined by @regs. If the @n th entry is NOT in the kernel stack,
- * this returns 0.
- */
-unsigned long regs_get_kernel_stack_nth(struct pt_regs *regs, unsigned int n)
-{
-	unsigned long addr;
-
-	addr = kernel_stack_pointer(regs) + n * sizeof(long);
-	if (!regs_within_kernel_stack(regs, addr))
-		return 0;
-	return READ_ONCE_NOCHECK(addr);
-}
diff --git a/arch/s390/kernel/smp.c b/arch/s390/kernel/smp.c
index 63f41dfaba85..81f12bb77f62 100644
--- a/arch/s390/kernel/smp.c
+++ b/arch/s390/kernel/smp.c
@@ -263,7 +263,7 @@ static void pcpu_prepare_secondary(struct pcpu *pcpu, int cpu)
 	abs_lc = get_abs_lowcore();
 	memcpy(lc->cregs_save_area, abs_lc->cregs_save_area, sizeof(lc->cregs_save_area));
 	put_abs_lowcore(abs_lc);
-	lc->cregs_save_area[1] = lc->kernel_asce;
+	lc->cregs_save_area[1] = lc->user_asce;
 	lc->cregs_save_area[7] = lc->user_asce;
 	save_access_regs((unsigned int *) lc->access_regs_save_area);
 	arch_spin_lock_setup(cpu);
diff --git a/arch/s390/kernel/uv.c b/arch/s390/kernel/uv.c
index 9a5d5be8acf4..4ab0b6b4866e 100644
--- a/arch/s390/kernel/uv.c
+++ b/arch/s390/kernel/uv.c
@@ -782,7 +782,12 @@ out_kobj:
 device_initcall(uv_sysfs_init);
 
 /*
- * Find the secret with the secret_id in the provided list.
+ * Locate a secret in the list by its id.
+ * @secret_id: search pattern.
+ * @list: ephemeral buffer space
+ * @secret: output data, containing the secret's metadata.
+ *
+ * Search for a secret with the given secret_id in the Ultravisor secret store.
  *
  * Context: might sleep.
  */
@@ -803,12 +808,15 @@ static int find_secret_in_page(const u8 secret_id[UV_SECRET_ID_LEN],
 
 /*
  * Do the actual search for `uv_get_secret_metadata`.
+ * @secret_id: search pattern.
+ * @list: ephemeral buffer space
+ * @secret: output data, containing the secret's metadata.
  *
  * Context: might sleep.
  */
-static int find_secret(const u8 secret_id[UV_SECRET_ID_LEN],
-		       struct uv_secret_list *list,
-		       struct uv_secret_list_item_hdr *secret)
+int uv_find_secret(const u8 secret_id[UV_SECRET_ID_LEN],
+		   struct uv_secret_list *list,
+		   struct uv_secret_list_item_hdr *secret)
 {
 	u16 start_idx = 0;
 	u16 list_rc;
@@ -830,36 +838,7 @@ static int find_secret(const u8 secret_id[UV_SECRET_ID_LEN],
 
 	return -ENOENT;
 }
-
-/**
- * uv_get_secret_metadata() - get secret metadata for a given secret id.
- * @secret_id: search pattern.
- * @secret: output data, containing the secret's metadata.
- *
- * Search for a secret with the given secret_id in the Ultravisor secret store.
- *
- * Context: might sleep.
- *
- * Return:
- * * %0:	- Found entry; secret->idx and secret->type are valid.
- * * %ENOENT	- No entry found.
- * * %ENODEV:	- Not supported: UV not available or command not available.
- * * %EIO:	- Other unexpected UV error.
- */
-int uv_get_secret_metadata(const u8 secret_id[UV_SECRET_ID_LEN],
-			   struct uv_secret_list_item_hdr *secret)
-{
-	struct uv_secret_list *buf;
-	int rc;
-
-	buf = kzalloc(sizeof(*buf), GFP_KERNEL);
-	if (!buf)
-		return -ENOMEM;
-	rc = find_secret(secret_id, buf, secret);
-	kfree(buf);
-	return rc;
-}
-EXPORT_SYMBOL_GPL(uv_get_secret_metadata);
+EXPORT_SYMBOL_GPL(uv_find_secret);
 
 /**
  * uv_retrieve_secret() - get the secret value for the secret index.
diff --git a/arch/s390/lib/Makefile b/arch/s390/lib/Makefile
index 14bbfe50033c..cd35cdbfa871 100644
--- a/arch/s390/lib/Makefile
+++ b/arch/s390/lib/Makefile
@@ -3,6 +3,7 @@
 # Makefile for s390-specific library files..
 #
 
+obj-y += crypto/
 lib-y += delay.o string.o uaccess.o find.o spinlock.o tishift.o
 lib-y += csum-partial.o
 obj-y += mem.o xor.o
@@ -26,4 +27,4 @@ lib-$(CONFIG_FUNCTION_ERROR_INJECTION) += error-inject.o
 obj-$(CONFIG_EXPOLINE_EXTERN) += expoline.o
 
 obj-$(CONFIG_CRC32_ARCH) += crc32-s390.o
-crc32-s390-y := crc32-glue.o crc32le-vx.o crc32be-vx.o
+crc32-s390-y := crc32.o crc32le-vx.o crc32be-vx.o
diff --git a/arch/s390/lib/crc32-glue.c b/arch/s390/lib/crc32.c
index 124214a27340..3c4b344417c1 100644
--- a/arch/s390/lib/crc32-glue.c
+++ b/arch/s390/lib/crc32.c
@@ -18,8 +18,6 @@
 #define VX_ALIGNMENT		16L
 #define VX_ALIGN_MASK		(VX_ALIGNMENT - 1)
 
-static DEFINE_STATIC_KEY_FALSE(have_vxrs);
-
 /*
  * DEFINE_CRC32_VX() - Define a CRC-32 function using the vector extension
  *
@@ -34,8 +32,7 @@ static DEFINE_STATIC_KEY_FALSE(have_vxrs);
 		unsigned long prealign, aligned, remaining;		    \
 		DECLARE_KERNEL_FPU_ONSTACK16(vxstate);			    \
 									    \
-		if (datalen < VX_MIN_LEN + VX_ALIGN_MASK ||		    \
-		    !static_branch_likely(&have_vxrs))			    \
+		if (datalen < VX_MIN_LEN + VX_ALIGN_MASK || !cpu_has_vx())  \
 			return ___crc32_sw(crc, data, datalen);		    \
 									    \
 		if ((unsigned long)data & VX_ALIGN_MASK) {		    \
@@ -64,25 +61,13 @@ DEFINE_CRC32_VX(crc32_le_arch, crc32_le_vgfm_16, crc32_le_base)
 DEFINE_CRC32_VX(crc32_be_arch, crc32_be_vgfm_16, crc32_be_base)
 DEFINE_CRC32_VX(crc32c_arch, crc32c_le_vgfm_16, crc32c_base)
 
-static int __init crc32_s390_init(void)
-{
-	if (cpu_have_feature(S390_CPU_FEATURE_VXRS))
-		static_branch_enable(&have_vxrs);
-	return 0;
-}
-arch_initcall(crc32_s390_init);
-
-static void __exit crc32_s390_exit(void)
-{
-}
-module_exit(crc32_s390_exit);
-
 u32 crc32_optimizations(void)
 {
-	if (static_key_enabled(&have_vxrs))
+	if (cpu_has_vx()) {
 		return CRC32_LE_OPTIMIZATION |
 		       CRC32_BE_OPTIMIZATION |
 		       CRC32C_OPTIMIZATION;
+	}
 	return 0;
 }
 EXPORT_SYMBOL(crc32_optimizations);
diff --git a/arch/s390/lib/crypto/Kconfig b/arch/s390/lib/crypto/Kconfig
new file mode 100644
index 000000000000..e3f855ef4393
--- /dev/null
+++ b/arch/s390/lib/crypto/Kconfig
@@ -0,0 +1,13 @@
+# SPDX-License-Identifier: GPL-2.0-only
+
+config CRYPTO_CHACHA_S390
+	tristate
+	default CRYPTO_LIB_CHACHA
+	select CRYPTO_LIB_CHACHA_GENERIC
+	select CRYPTO_ARCH_HAVE_LIB_CHACHA
+
+config CRYPTO_SHA256_S390
+	tristate
+	default CRYPTO_LIB_SHA256
+	select CRYPTO_ARCH_HAVE_LIB_SHA256
+	select CRYPTO_LIB_SHA256_GENERIC
diff --git a/arch/s390/lib/crypto/Makefile b/arch/s390/lib/crypto/Makefile
new file mode 100644
index 000000000000..920197967f46
--- /dev/null
+++ b/arch/s390/lib/crypto/Makefile
@@ -0,0 +1,6 @@
+# SPDX-License-Identifier: GPL-2.0-only
+
+obj-$(CONFIG_CRYPTO_CHACHA_S390) += chacha_s390.o
+chacha_s390-y := chacha-glue.o chacha-s390.o
+
+obj-$(CONFIG_CRYPTO_SHA256_S390) += sha256.o
diff --git a/arch/s390/lib/crypto/chacha-glue.c b/arch/s390/lib/crypto/chacha-glue.c
new file mode 100644
index 000000000000..f95ba3483bbc
--- /dev/null
+++ b/arch/s390/lib/crypto/chacha-glue.c
@@ -0,0 +1,56 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * ChaCha stream cipher (s390 optimized)
+ *
+ * Copyright IBM Corp. 2021
+ */
+
+#define KMSG_COMPONENT "chacha_s390"
+#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+
+#include <crypto/chacha.h>
+#include <linux/cpufeature.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/sizes.h>
+#include <asm/fpu.h>
+#include "chacha-s390.h"
+
+void hchacha_block_arch(const struct chacha_state *state,
+			u32 out[HCHACHA_OUT_WORDS], int nrounds)
+{
+	/* TODO: no s390 assembly for HChaCha yet, so use the generic code */
+	hchacha_block_generic(state, out, nrounds);
+}
+EXPORT_SYMBOL(hchacha_block_arch);
+
+void chacha_crypt_arch(struct chacha_state *state, u8 *dst, const u8 *src,
+		       unsigned int bytes, int nrounds)
+{
+	/*
+	 * chacha20_vx() has 20 rounds hard-coded and cannot handle a single
+	 * block or less; such requests, and CPUs without VX, use generic code.
+	 */
+	if (bytes <= CHACHA_BLOCK_SIZE || nrounds != 20 || !cpu_has_vx()) {
+		chacha_crypt_generic(state, dst, src, bytes, nrounds);
+	} else {
+		DECLARE_KERNEL_FPU_ONSTACK32(vxstate);
+
+		kernel_fpu_begin(&vxstate, KERNEL_VXR);
+		chacha20_vx(dst, src, bytes, &state->x[4], &state->x[12]);
+		kernel_fpu_end(&vxstate, KERNEL_VXR);
+
+		state->x[12] += round_up(bytes, CHACHA_BLOCK_SIZE) /
+				CHACHA_BLOCK_SIZE;
+	}
+}
+EXPORT_SYMBOL(chacha_crypt_arch);
+
+bool chacha_is_arch_optimized(void)
+{
+	return cpu_has_vx(); /* chacha_crypt_arch() only accelerates with VX */
+}
+EXPORT_SYMBOL(chacha_is_arch_optimized);
+
+MODULE_DESCRIPTION("ChaCha stream cipher (s390 optimized)");
+MODULE_LICENSE("GPL v2");
diff --git a/arch/s390/crypto/chacha-s390.S b/arch/s390/lib/crypto/chacha-s390.S
index 63f3102678c0..63f3102678c0 100644
--- a/arch/s390/crypto/chacha-s390.S
+++ b/arch/s390/lib/crypto/chacha-s390.S
diff --git a/arch/s390/crypto/chacha-s390.h b/arch/s390/lib/crypto/chacha-s390.h
index 733744ce30f5..733744ce30f5 100644
--- a/arch/s390/crypto/chacha-s390.h
+++ b/arch/s390/lib/crypto/chacha-s390.h
diff --git a/arch/s390/lib/crypto/sha256.c b/arch/s390/lib/crypto/sha256.c
new file mode 100644
index 000000000000..7dfe120fafab
--- /dev/null
+++ b/arch/s390/lib/crypto/sha256.c
@@ -0,0 +1,47 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * SHA-256 optimized using the CP Assist for Cryptographic Functions (CPACF)
+ *
+ * Copyright 2025 Google LLC
+ */
+#include <asm/cpacf.h>
+#include <crypto/internal/sha2.h>
+#include <linux/cpufeature.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_cpacf_sha256);
+
+void sha256_blocks_arch(u32 state[SHA256_STATE_WORDS],
+			const u8 *data, size_t nblocks)
+{
+	if (static_branch_likely(&have_cpacf_sha256))
+		cpacf_kimd(CPACF_KIMD_SHA_256, state, data,
+			   nblocks * SHA256_BLOCK_SIZE); /* length in bytes */
+	else /* key is only enabled by sha256_s390_mod_init() */
+		sha256_blocks_generic(state, data, nblocks);
+}
+EXPORT_SYMBOL_GPL(sha256_blocks_arch);
+
+bool sha256_is_arch_optimized(void)
+{
+	return static_key_enabled(&have_cpacf_sha256); /* decided at init */
+}
+EXPORT_SYMBOL_GPL(sha256_is_arch_optimized);
+
+static int __init sha256_s390_mod_init(void)
+{
+	if (cpu_have_feature(S390_CPU_FEATURE_MSA) &&
+	    cpacf_query_func(CPACF_KIMD, CPACF_KIMD_SHA_256))
+		static_branch_enable(&have_cpacf_sha256); /* select KIMD path */
+	return 0; /* never fails: without CPACF the generic code is used */
+}
+subsys_initcall(sha256_s390_mod_init);
+
+static void __exit sha256_s390_mod_exit(void)
+{
+}
+module_exit(sha256_s390_mod_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("SHA-256 using the CP Assist for Cryptographic Functions (CPACF)");
diff --git a/arch/s390/lib/string.c b/arch/s390/lib/string.c
index 373fa1f01937..099de76e8b1a 100644
--- a/arch/s390/lib/string.c
+++ b/arch/s390/lib/string.c
@@ -78,50 +78,6 @@ EXPORT_SYMBOL(strnlen);
 #endif
 
 /**
- * strcpy - Copy a %NUL terminated string
- * @dest: Where to copy the string to
- * @src: Where to copy the string from
- *
- * returns a pointer to @dest
- */
-#ifdef __HAVE_ARCH_STRCPY
-char *strcpy(char *dest, const char *src)
-{
-	char *ret = dest;
-
-	asm volatile(
-		"	lghi	0,0\n"
-		"0:	mvst	%[dest],%[src]\n"
-		"	jo	0b\n"
-		: [dest] "+&a" (dest), [src] "+&a" (src)
-		:
-		: "cc", "memory", "0");
-	return ret;
-}
-EXPORT_SYMBOL(strcpy);
-#endif
-
-/**
- * strncpy - Copy a length-limited, %NUL-terminated string
- * @dest: Where to copy the string to
- * @src: Where to copy the string from
- * @n: The maximum number of bytes to copy
- *
- * The result is not %NUL-terminated if the source exceeds
- * @n bytes.
- */
-#ifdef __HAVE_ARCH_STRNCPY
-char *strncpy(char *dest, const char *src, size_t n)
-{
-	size_t len = __strnend(src, n) - src;
-	memset(dest + len, 0, n - len);
-	memcpy(dest, src, len);
-	return dest;
-}
-EXPORT_SYMBOL(strncpy);
-#endif
-
-/**
  * strcat - Append one %NUL-terminated string to another
  * @dest: The string to be appended to
  * @src: The string to append to it
@@ -181,9 +137,6 @@ EXPORT_SYMBOL(strlcat);
  * @n: The maximum numbers of bytes to copy
  *
  * returns a pointer to @dest
- *
- * Note that in contrast to strncpy, strncat ensures the result is
- * terminated.
  */
 #ifdef __HAVE_ARCH_STRNCAT
 char *strncat(char *dest, const char *src, size_t n)
diff --git a/arch/s390/lib/uaccess.c b/arch/s390/lib/uaccess.c
index cec20db88479..fa7d98fa1320 100644
--- a/arch/s390/lib/uaccess.c
+++ b/arch/s390/lib/uaccess.c
@@ -17,17 +17,18 @@
 #ifdef CONFIG_DEBUG_ENTRY
 void debug_user_asce(int exit)
 {
+	struct lowcore *lc = get_lowcore();
 	struct ctlreg cr1, cr7;
 
 	local_ctl_store(1, &cr1);
 	local_ctl_store(7, &cr7);
-	if (cr1.val == get_lowcore()->kernel_asce.val && cr7.val == get_lowcore()->user_asce.val)
+	if (cr1.val == lc->user_asce.val && cr7.val == lc->user_asce.val)
 		return;
 	panic("incorrect ASCE on kernel %s\n"
 	      "cr1:    %016lx cr7:  %016lx\n"
 	      "kernel: %016lx user: %016lx\n",
 	      exit ? "exit" : "entry", cr1.val, cr7.val,
-	      get_lowcore()->kernel_asce.val, get_lowcore()->user_asce.val);
+	      lc->kernel_asce.val, lc->user_asce.val);
 }
 #endif /*CONFIG_DEBUG_ENTRY */
 
diff --git a/arch/s390/mm/extmem.c b/arch/s390/mm/extmem.c
index a6b8b8ea9086..f7da53e212f5 100644
--- a/arch/s390/mm/extmem.c
+++ b/arch/s390/mm/extmem.c
@@ -530,6 +530,14 @@ segment_modify_shared (char *name, int do_nonshared)
 	return rc;
 }
 
+static void __dcss_diag_purge_on_cpu_0(void *data)
+{
+	struct dcss_segment *seg = (struct dcss_segment *)data;
+	unsigned long dummy; /* passed to dcss_diag() by address, never read */
+
+	dcss_diag(&purgeseg_scode, seg->dcss_name, &dummy, &dummy);
+}
+
 /*
  * Decrease the use count of a DCSS segment and remove
  * it from the address space if nobody is using it
@@ -538,7 +546,6 @@ segment_modify_shared (char *name, int do_nonshared)
 void
 segment_unload(char *name)
 {
-	unsigned long dummy;
 	struct dcss_segment *seg;
 
 	if (!machine_is_vm())
@@ -556,7 +563,14 @@ segment_unload(char *name)
 	kfree(seg->res);
 	vmem_remove_mapping(seg->start_addr, seg->end - seg->start_addr + 1);
 	list_del(&seg->list);
-	dcss_diag(&purgeseg_scode, seg->dcss_name, &dummy, &dummy);
+	/*
+	 * Workaround for z/VM issue, where calling the DCSS unload diag on
+	 * a non-IPL CPU would cause bogus sclp maximum memory detection on
+	 * next IPL.
+	 * IPL CPU 0 cannot be set offline, so the dcss_diag() call can
+	 * directly be scheduled to that CPU.
+	 */
+	smp_call_function_single(0, __dcss_diag_purge_on_cpu_0, seg, 1);
 	kfree(seg);
 out_unlock:
 	mutex_unlock(&dcss_lock);
diff --git a/arch/s390/mm/pgalloc.c b/arch/s390/mm/pgalloc.c
index e3a6f8ae156c..d177bea0bd73 100644
--- a/arch/s390/mm/pgalloc.c
+++ b/arch/s390/mm/pgalloc.c
@@ -38,11 +38,15 @@ void crst_table_free(struct mm_struct *mm, unsigned long *table)
 static void __crst_table_upgrade(void *arg)
 {
 	struct mm_struct *mm = arg;
+	struct ctlreg asce;
 
 	/* change all active ASCEs to avoid the creation of new TLBs */
 	if (current->active_mm == mm) {
-		get_lowcore()->user_asce.val = mm->context.asce;
-		local_ctl_load(7, &get_lowcore()->user_asce);
+		asce.val = mm->context.asce;
+		get_lowcore()->user_asce = asce;
+		local_ctl_load(7, &asce);
+		if (!test_thread_flag(TIF_ASCE_PRIMARY))
+			local_ctl_load(1, &asce);
 	}
 	__tlb_flush_local();
 }
@@ -52,6 +56,8 @@ int crst_table_upgrade(struct mm_struct *mm, unsigned long end)
 	unsigned long *pgd = NULL, *p4d = NULL, *__pgd;
 	unsigned long asce_limit = mm->context.asce_limit;
 
+	mmap_assert_write_locked(mm);
+
 	/* upgrade should only happen from 3 to 4, 3 to 5, or 4 to 5 levels */
 	VM_BUG_ON(asce_limit < _REGION2_SIZE);
 
@@ -75,13 +81,6 @@ int crst_table_upgrade(struct mm_struct *mm, unsigned long end)
 
 	spin_lock_bh(&mm->page_table_lock);
 
-	/*
-	 * This routine gets called with mmap_lock lock held and there is
-	 * no reason to optimize for the case of otherwise. However, if
-	 * that would ever change, the below check will let us know.
-	 */
-	VM_BUG_ON(asce_limit != mm->context.asce_limit);
-
 	if (p4d) {
 		__pgd = (unsigned long *) mm->pgd;
 		p4d_populate(mm, (p4d_t *) p4d, (pud_t *) __pgd);
diff --git a/arch/s390/pci/pci.c b/arch/s390/pci/pci.c
index 5bbdc4190b8b..cd6676c2d602 100644
--- a/arch/s390/pci/pci.c
+++ b/arch/s390/pci/pci.c
@@ -45,6 +45,7 @@
 /* list of all detected zpci devices */
 static LIST_HEAD(zpci_list);
 static DEFINE_SPINLOCK(zpci_list_lock);
+static DEFINE_MUTEX(zpci_add_remove_lock);
 
 static DECLARE_BITMAP(zpci_domain, ZPCI_DOMAIN_BITMAP_SIZE);
 static DEFINE_SPINLOCK(zpci_domain_lock);
@@ -70,6 +71,15 @@ EXPORT_SYMBOL_GPL(zpci_aipb);
 struct airq_iv *zpci_aif_sbv;
 EXPORT_SYMBOL_GPL(zpci_aif_sbv);
 
+void zpci_zdev_put(struct zpci_dev *zdev)
+{
+	if (!zdev)
+		return; /* NULL is accepted and ignored */
+	mutex_lock(&zpci_add_remove_lock); /* asserted in zpci_release_device() */
+	kref_put_lock(&zdev->kref, zpci_release_device, &zpci_list_lock);
+	mutex_unlock(&zpci_add_remove_lock);
+}
+
 struct zpci_dev *get_zdev_by_fid(u32 fid)
 {
 	struct zpci_dev *tmp, *zdev = NULL;
@@ -837,6 +847,7 @@ int zpci_add_device(struct zpci_dev *zdev)
 {
 	int rc;
 
+	mutex_lock(&zpci_add_remove_lock);
 	zpci_dbg(1, "add fid:%x, fh:%x, c:%d\n", zdev->fid, zdev->fh, zdev->state);
 	rc = zpci_init_iommu(zdev);
 	if (rc)
@@ -850,12 +861,14 @@ int zpci_add_device(struct zpci_dev *zdev)
 	spin_lock(&zpci_list_lock);
 	list_add_tail(&zdev->entry, &zpci_list);
 	spin_unlock(&zpci_list_lock);
+	mutex_unlock(&zpci_add_remove_lock);
 	return 0;
 
 error_destroy_iommu:
 	zpci_destroy_iommu(zdev);
 error:
 	zpci_dbg(0, "add fid:%x, rc:%d\n", zdev->fid, rc);
+	mutex_unlock(&zpci_add_remove_lock);
 	return rc;
 }
 
@@ -925,21 +938,20 @@ int zpci_deconfigure_device(struct zpci_dev *zdev)
  * @zdev: the zpci_dev that was reserved
  *
  * Handle the case that a given zPCI function was reserved by another system.
- * After a call to this function the zpci_dev can not be found via
- * get_zdev_by_fid() anymore but may still be accessible via existing
- * references though it will not be functional anymore.
  */
 void zpci_device_reserved(struct zpci_dev *zdev)
 {
-	/*
-	 * Remove device from zpci_list as it is going away. This also
-	 * makes sure we ignore subsequent zPCI events for this device.
-	 */
-	spin_lock(&zpci_list_lock);
-	list_del(&zdev->entry);
-	spin_unlock(&zpci_list_lock);
+	lockdep_assert_held(&zdev->state_lock);
+	/* We may declare the device reserved multiple times */
+	if (zdev->state == ZPCI_FN_STATE_RESERVED)
+		return;
 	zdev->state = ZPCI_FN_STATE_RESERVED;
 	zpci_dbg(3, "rsv fid:%x\n", zdev->fid);
+	/*
+	 * The underlying device is gone. Allow the zdev to be freed
+	 * as soon as all other references are gone by accounting for
+	 * the removal as a dropped reference.
+	 */
 	zpci_zdev_put(zdev);
 }
 
@@ -947,13 +959,14 @@ void zpci_release_device(struct kref *kref)
 {
 	struct zpci_dev *zdev = container_of(kref, struct zpci_dev, kref);
 
+	lockdep_assert_held(&zpci_add_remove_lock);
 	WARN_ON(zdev->state != ZPCI_FN_STATE_RESERVED);
-
-	if (zdev->zbus->bus)
-		zpci_bus_remove_device(zdev, false);
-
-	if (zdev_enabled(zdev))
-		zpci_disable_device(zdev);
+	/*
+	 * We already hold zpci_list_lock thanks to kref_put_lock().
+	 * This makes sure no new reference can be taken from the list.
+	 */
+	list_del(&zdev->entry);
+	spin_unlock(&zpci_list_lock);
 
 	if (zdev->has_hp_slot)
 		zpci_exit_slot(zdev);
diff --git a/arch/s390/pci/pci_bus.h b/arch/s390/pci/pci_bus.h
index e86a9419d233..ae3d7a9159bd 100644
--- a/arch/s390/pci/pci_bus.h
+++ b/arch/s390/pci/pci_bus.h
@@ -21,11 +21,8 @@ int zpci_bus_scan_device(struct zpci_dev *zdev);
 void zpci_bus_remove_device(struct zpci_dev *zdev, bool set_error);
 
 void zpci_release_device(struct kref *kref);
-static inline void zpci_zdev_put(struct zpci_dev *zdev)
-{
-	if (zdev)
-		kref_put(&zdev->kref, zpci_release_device);
-}
+
+void zpci_zdev_put(struct zpci_dev *zdev);
 
 static inline void zpci_zdev_get(struct zpci_dev *zdev)
 {
diff --git a/arch/s390/pci/pci_clp.c b/arch/s390/pci/pci_clp.c
index 9a929bbcc397..241f7251c873 100644
--- a/arch/s390/pci/pci_clp.c
+++ b/arch/s390/pci/pci_clp.c
@@ -428,6 +428,8 @@ static void __clp_add(struct clp_fh_list_entry *entry, void *data)
 		return;
 	}
 	zdev = zpci_create_device(entry->fid, entry->fh, entry->config_state);
+	if (IS_ERR(zdev))
+		return;
 	list_add_tail(&zdev->entry, scan_list);
 }
 
diff --git a/arch/s390/pci/pci_event.c b/arch/s390/pci/pci_event.c
index 7bd7721c1239..2fbee3887d13 100644
--- a/arch/s390/pci/pci_event.c
+++ b/arch/s390/pci/pci_event.c
@@ -335,6 +335,22 @@ static void zpci_event_hard_deconfigured(struct zpci_dev *zdev, u32 fh)
 	zdev->state = ZPCI_FN_STATE_STANDBY;
 }
 
+static void zpci_event_reappear(struct zpci_dev *zdev)
+{
+	lockdep_assert_held(&zdev->state_lock);
+	/*
+	 * The zdev is in the reserved state, i.e. it was presumed gone but
+	 * references to it were still outstanding. The platform has now
+	 * announced it as available again, so bring the lingering zdev back
+	 * to standby. This is safe because we hold a temporary reference, so
+	 * it cannot go away here. Account for the re-appearance by taking a
+	 * reference, mirroring the put done in zpci_device_reserved().
+	 */
+	zdev->state = ZPCI_FN_STATE_STANDBY;
+	zpci_zdev_get(zdev);
+	zpci_dbg(1, "rea fid:%x, fh:%x\n", zdev->fid, zdev->fh);
+}
+
 static void __zpci_event_availability(struct zpci_ccdf_avail *ccdf)
 {
 	struct zpci_dev *zdev = get_zdev_by_fid(ccdf->fid);
@@ -358,8 +374,10 @@ static void __zpci_event_availability(struct zpci_ccdf_avail *ccdf)
 				break;
 			}
 		} else {
+			if (zdev->state == ZPCI_FN_STATE_RESERVED)
+				zpci_event_reappear(zdev);
 			/* the configuration request may be stale */
-			if (zdev->state != ZPCI_FN_STATE_STANDBY)
+			else if (zdev->state != ZPCI_FN_STATE_STANDBY)
 				break;
 			zdev->state = ZPCI_FN_STATE_CONFIGURED;
 		}
@@ -375,6 +393,8 @@ static void __zpci_event_availability(struct zpci_ccdf_avail *ccdf)
 				break;
 			}
 		} else {
+			if (zdev->state == ZPCI_FN_STATE_RESERVED)
+				zpci_event_reappear(zdev);
 			zpci_update_fh(zdev, ccdf->fh);
 		}
 		break;
diff --git a/arch/s390/pci/pci_mmio.c b/arch/s390/pci/pci_mmio.c
index 5fcc1a3b04bd..51e7a28af899 100644
--- a/arch/s390/pci/pci_mmio.c
+++ b/arch/s390/pci/pci_mmio.c
@@ -32,8 +32,10 @@ static inline int __pcistb_mio_inuser(
 		u64 len, u8 *status)
 {
 	int cc, exception;
+	bool sacf_flag;
 
 	exception = 1;
+	sacf_flag = enable_sacf_uaccess();
 	asm_inline volatile (
 		"	sacf	256\n"
 		"0:	.insn	rsy,0xeb00000000d4,%[len],%[ioaddr],%[src]\n"
@@ -44,6 +46,7 @@ static inline int __pcistb_mio_inuser(
 		: CC_OUT(cc, cc), [len] "+d" (len), [exc] "+d" (exception)
 		: [ioaddr] "a" (ioaddr), [src] "Q" (*((u8 __force *)src))
 		: CC_CLOBBER_LIST("memory"));
+	disable_sacf_uaccess(sacf_flag);
 	*status = len >> 24 & 0xff;
 	return exception ? -ENXIO : CC_TRANSFORM(cc);
 }
@@ -54,6 +57,7 @@ static inline int __pcistg_mio_inuser(
 {
 	union register_pair ioaddr_len = {.even = (u64 __force)ioaddr, .odd = ulen};
 	int cc, exception;
+	bool sacf_flag;
 	u64 val = 0;
 	u64 cnt = ulen;
 	u8 tmp;
@@ -64,6 +68,7 @@ static inline int __pcistg_mio_inuser(
 	 * address space. pcistg then uses the user mappings.
 	 */
 	exception = 1;
+	sacf_flag = enable_sacf_uaccess();
 	asm_inline volatile (
 		"	sacf	256\n"
 		"0:	llgc	%[tmp],0(%[src])\n"
@@ -81,6 +86,7 @@ static inline int __pcistg_mio_inuser(
 		  CC_OUT(cc, cc), [ioaddr_len] "+&d" (ioaddr_len.pair)
 		:
 		: CC_CLOBBER_LIST("memory"));
+	disable_sacf_uaccess(sacf_flag);
 	*status = ioaddr_len.odd >> 24 & 0xff;
 
 	cc = exception ? -ENXIO : CC_TRANSFORM(cc);
@@ -204,6 +210,7 @@ static inline int __pcilg_mio_inuser(
 		u64 ulen, u8 *status)
 {
 	union register_pair ioaddr_len = {.even = (u64 __force)ioaddr, .odd = ulen};
+	bool sacf_flag;
 	u64 cnt = ulen;
 	int shift = ulen * 8;
 	int cc, exception;
@@ -215,6 +222,7 @@ static inline int __pcilg_mio_inuser(
 	 * user address @dst
 	 */
 	exception = 1;
+	sacf_flag = enable_sacf_uaccess();
 	asm_inline volatile (
 		"	sacf	256\n"
 		"0:	.insn	rre,0xb9d60000,%[val],%[ioaddr_len]\n"
@@ -236,10 +244,10 @@ static inline int __pcilg_mio_inuser(
 		: [ioaddr_len] "+&d" (ioaddr_len.pair), [exc] "+d" (exception),
 		  CC_OUT(cc, cc), [val] "=d" (val),
 		  [dst] "+a" (dst), [cnt] "+d" (cnt), [tmp] "=d" (tmp),
-		  [shift] "+d" (shift)
+		  [shift] "+a" (shift)
 		:
 		: CC_CLOBBER_LIST("memory"));
-
+	disable_sacf_uaccess(sacf_flag);
 	cc = exception ? -ENXIO : CC_TRANSFORM(cc);
 	/* did we write everything to the user space buffer? */
 	if (!cc && cnt != 0)
diff --git a/arch/sh/configs/migor_defconfig b/arch/sh/configs/migor_defconfig
index fc2010c241fb..31dbd8888aaa 100644
--- a/arch/sh/configs/migor_defconfig
+++ b/arch/sh/configs/migor_defconfig
@@ -87,6 +87,5 @@ CONFIG_TMPFS=y
 CONFIG_NFS_FS=y
 CONFIG_ROOT_NFS=y
 CONFIG_DEBUG_FS=y
-CONFIG_CRYPTO_MANAGER=y
 # CONFIG_CRYPTO_ANSI_CPRNG is not set
 # CONFIG_CRYPTO_HW is not set
diff --git a/arch/sparc/configs/sparc64_defconfig b/arch/sparc/configs/sparc64_defconfig
index f1ba0fefe1f9..7a7c4dec2925 100644
--- a/arch/sparc/configs/sparc64_defconfig
+++ b/arch/sparc/configs/sparc64_defconfig
@@ -205,7 +205,7 @@ CONFIG_BLK_DEV_IO_TRACE=y
 CONFIG_UPROBE_EVENTS=y
 CONFIG_KEYS=y
 CONFIG_CRYPTO_NULL=m
-CONFIG_CRYPTO_TEST=m
+CONFIG_CRYPTO_BENCHMARK=m
 CONFIG_CRYPTO_LRW=m
 CONFIG_CRYPTO_PCBC=m
 CONFIG_CRYPTO_XTS=m
diff --git a/arch/sparc/crypto/Kconfig b/arch/sparc/crypto/Kconfig
index e858597de89d..a6ba319c42dc 100644
--- a/arch/sparc/crypto/Kconfig
+++ b/arch/sparc/crypto/Kconfig
@@ -36,16 +36,6 @@ config CRYPTO_SHA1_SPARC64
 
 	  Architecture: sparc64
 
-config CRYPTO_SHA256_SPARC64
-	tristate "Hash functions: SHA-224 and SHA-256"
-	depends on SPARC64
-	select CRYPTO_SHA256
-	select CRYPTO_HASH
-	help
-	  SHA-224 and SHA-256 secure hash algorithms (FIPS 180)
-
-	  Architecture: sparc64 using crypto instructions, when available
-
 config CRYPTO_SHA512_SPARC64
 	tristate "Hash functions: SHA-384 and SHA-512"
 	depends on SPARC64
diff --git a/arch/sparc/crypto/Makefile b/arch/sparc/crypto/Makefile
index a2d7fca40cb4..701c39edb0d7 100644
--- a/arch/sparc/crypto/Makefile
+++ b/arch/sparc/crypto/Makefile
@@ -4,7 +4,6 @@
 #
 
 obj-$(CONFIG_CRYPTO_SHA1_SPARC64) += sha1-sparc64.o
-obj-$(CONFIG_CRYPTO_SHA256_SPARC64) += sha256-sparc64.o
 obj-$(CONFIG_CRYPTO_SHA512_SPARC64) += sha512-sparc64.o
 obj-$(CONFIG_CRYPTO_MD5_SPARC64) += md5-sparc64.o
 
@@ -13,7 +12,6 @@ obj-$(CONFIG_CRYPTO_DES_SPARC64) += des-sparc64.o
 obj-$(CONFIG_CRYPTO_CAMELLIA_SPARC64) += camellia-sparc64.o
 
 sha1-sparc64-y := sha1_asm.o sha1_glue.o
-sha256-sparc64-y := sha256_asm.o sha256_glue.o
 sha512-sparc64-y := sha512_asm.o sha512_glue.o
 md5-sparc64-y := md5_asm.o md5_glue.o
 
diff --git a/arch/sparc/crypto/aes_asm.S b/arch/sparc/crypto/aes_asm.S
index 155cefb98520..f291174a72a1 100644
--- a/arch/sparc/crypto/aes_asm.S
+++ b/arch/sparc/crypto/aes_asm.S
@@ -1,9 +1,8 @@
 /* SPDX-License-Identifier: GPL-2.0 */
 #include <linux/linkage.h>
+#include <asm/opcodes.h>
 #include <asm/visasm.h>
 
-#include "opcodes.h"
-
 #define ENCRYPT_TWO_ROUNDS(KEY_BASE, I0, I1, T0, T1) \
 	AES_EROUND01(KEY_BASE +  0, I0, I1, T0) \
 	AES_EROUND23(KEY_BASE +  2, I0, I1, T1) \
diff --git a/arch/sparc/crypto/aes_glue.c b/arch/sparc/crypto/aes_glue.c
index 683150830356..359f22643b05 100644
--- a/arch/sparc/crypto/aes_glue.c
+++ b/arch/sparc/crypto/aes_glue.c
@@ -27,11 +27,10 @@
 #include <crypto/internal/skcipher.h>
 
 #include <asm/fpumacro.h>
+#include <asm/opcodes.h>
 #include <asm/pstate.h>
 #include <asm/elf.h>
 
-#include "opcodes.h"
-
 struct aes_ops {
 	void (*encrypt)(const u64 *key, const u32 *input, u32 *output);
 	void (*decrypt)(const u64 *key, const u32 *input, u32 *output);
diff --git a/arch/sparc/crypto/camellia_asm.S b/arch/sparc/crypto/camellia_asm.S
index dcdc9193fcd7..8471b346ef54 100644
--- a/arch/sparc/crypto/camellia_asm.S
+++ b/arch/sparc/crypto/camellia_asm.S
@@ -1,9 +1,8 @@
 /* SPDX-License-Identifier: GPL-2.0 */
 #include <linux/linkage.h>
+#include <asm/opcodes.h>
 #include <asm/visasm.h>
 
-#include "opcodes.h"
-
 #define CAMELLIA_6ROUNDS(KEY_BASE, I0, I1) \
 	CAMELLIA_F(KEY_BASE +  0, I1, I0, I1) \
 	CAMELLIA_F(KEY_BASE +  2, I0, I1, I0) \
diff --git a/arch/sparc/crypto/camellia_glue.c b/arch/sparc/crypto/camellia_glue.c
index aaa9714378e6..e7a1e1c42b99 100644
--- a/arch/sparc/crypto/camellia_glue.c
+++ b/arch/sparc/crypto/camellia_glue.c
@@ -15,11 +15,10 @@
 #include <crypto/internal/skcipher.h>
 
 #include <asm/fpumacro.h>
+#include <asm/opcodes.h>
 #include <asm/pstate.h>
 #include <asm/elf.h>
 
-#include "opcodes.h"
-
 #define CAMELLIA_MIN_KEY_SIZE        16
 #define CAMELLIA_MAX_KEY_SIZE        32
 #define CAMELLIA_BLOCK_SIZE          16
diff --git a/arch/sparc/crypto/des_asm.S b/arch/sparc/crypto/des_asm.S
index 7157468a679d..d534446cbef9 100644
--- a/arch/sparc/crypto/des_asm.S
+++ b/arch/sparc/crypto/des_asm.S
@@ -1,9 +1,8 @@
 /* SPDX-License-Identifier: GPL-2.0 */
 #include <linux/linkage.h>
+#include <asm/opcodes.h>
 #include <asm/visasm.h>
 
-#include "opcodes.h"
-
 	.align	32
 ENTRY(des_sparc64_key_expand)
 	/* %o0=input_key, %o1=output_key */
diff --git a/arch/sparc/crypto/des_glue.c b/arch/sparc/crypto/des_glue.c
index a499102bf706..e50ec4cd57cd 100644
--- a/arch/sparc/crypto/des_glue.c
+++ b/arch/sparc/crypto/des_glue.c
@@ -16,11 +16,10 @@
 #include <crypto/internal/skcipher.h>
 
 #include <asm/fpumacro.h>
+#include <asm/opcodes.h>
 #include <asm/pstate.h>
 #include <asm/elf.h>
 
-#include "opcodes.h"
-
 struct des_sparc64_ctx {
 	u64 encrypt_expkey[DES_EXPKEY_WORDS / 2];
 	u64 decrypt_expkey[DES_EXPKEY_WORDS / 2];
diff --git a/arch/sparc/crypto/md5_asm.S b/arch/sparc/crypto/md5_asm.S
index 7a6637455f37..60b544e4d205 100644
--- a/arch/sparc/crypto/md5_asm.S
+++ b/arch/sparc/crypto/md5_asm.S
@@ -1,9 +1,8 @@
 /* SPDX-License-Identifier: GPL-2.0 */
 #include <linux/linkage.h>
+#include <asm/opcodes.h>
 #include <asm/visasm.h>
 
-#include "opcodes.h"
-
 ENTRY(md5_sparc64_transform)
 	/* %o0 = digest, %o1 = data, %o2 = rounds */
 	VISEntryHalf
diff --git a/arch/sparc/crypto/md5_glue.c b/arch/sparc/crypto/md5_glue.c
index 511db98d590a..b3615f0cdf62 100644
--- a/arch/sparc/crypto/md5_glue.c
+++ b/arch/sparc/crypto/md5_glue.c
@@ -14,121 +14,104 @@
 
 #define pr_fmt(fmt)	KBUILD_MODNAME ": " fmt
 
+#include <asm/elf.h>
+#include <asm/opcodes.h>
+#include <asm/pstate.h>
 #include <crypto/internal/hash.h>
-#include <linux/init.h>
-#include <linux/module.h>
-#include <linux/mm.h>
-#include <linux/types.h>
 #include <crypto/md5.h>
+#include <linux/errno.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/string.h>
+#include <linux/unaligned.h>
 
-#include <asm/pstate.h>
-#include <asm/elf.h>
-
-#include "opcodes.h"
+struct sparc_md5_state {
+	__le32 hash[MD5_HASH_WORDS];
+	u64 byte_count;
+};
 
-asmlinkage void md5_sparc64_transform(u32 *digest, const char *data,
+asmlinkage void md5_sparc64_transform(__le32 *digest, const char *data,
 				      unsigned int rounds);
 
 static int md5_sparc64_init(struct shash_desc *desc)
 {
-	struct md5_state *mctx = shash_desc_ctx(desc);
+	struct sparc_md5_state *mctx = shash_desc_ctx(desc);
 
-	mctx->hash[0] = MD5_H0;
-	mctx->hash[1] = MD5_H1;
-	mctx->hash[2] = MD5_H2;
-	mctx->hash[3] = MD5_H3;
-	le32_to_cpu_array(mctx->hash, 4);
+	mctx->hash[0] = cpu_to_le32(MD5_H0);
+	mctx->hash[1] = cpu_to_le32(MD5_H1);
+	mctx->hash[2] = cpu_to_le32(MD5_H2);
+	mctx->hash[3] = cpu_to_le32(MD5_H3);
 	mctx->byte_count = 0;
 
 	return 0;
 }
 
-static void __md5_sparc64_update(struct md5_state *sctx, const u8 *data,
-				 unsigned int len, unsigned int partial)
-{
-	unsigned int done = 0;
-
-	sctx->byte_count += len;
-	if (partial) {
-		done = MD5_HMAC_BLOCK_SIZE - partial;
-		memcpy((u8 *)sctx->block + partial, data, done);
-		md5_sparc64_transform(sctx->hash, (u8 *)sctx->block, 1);
-	}
-	if (len - done >= MD5_HMAC_BLOCK_SIZE) {
-		const unsigned int rounds = (len - done) / MD5_HMAC_BLOCK_SIZE;
-
-		md5_sparc64_transform(sctx->hash, data + done, rounds);
-		done += rounds * MD5_HMAC_BLOCK_SIZE;
-	}
-
-	memcpy(sctx->block, data + done, len - done);
-}
-
 static int md5_sparc64_update(struct shash_desc *desc, const u8 *data,
 			      unsigned int len)
 {
-	struct md5_state *sctx = shash_desc_ctx(desc);
-	unsigned int partial = sctx->byte_count % MD5_HMAC_BLOCK_SIZE;
-
-	/* Handle the fast case right here */
-	if (partial + len < MD5_HMAC_BLOCK_SIZE) {
-		sctx->byte_count += len;
-		memcpy((u8 *)sctx->block + partial, data, len);
-	} else
-		__md5_sparc64_update(sctx, data, len, partial);
+	struct sparc_md5_state *sctx = shash_desc_ctx(desc);
 
-	return 0;
+	sctx->byte_count += round_down(len, MD5_HMAC_BLOCK_SIZE);
+	md5_sparc64_transform(sctx->hash, data, len / MD5_HMAC_BLOCK_SIZE);
+	return len - round_down(len, MD5_HMAC_BLOCK_SIZE);
 }
 
 /* Add padding and return the message digest. */
-static int md5_sparc64_final(struct shash_desc *desc, u8 *out)
+static int md5_sparc64_finup(struct shash_desc *desc, const u8 *src,
+			     unsigned int offset, u8 *out)
 {
-	struct md5_state *sctx = shash_desc_ctx(desc);
-	unsigned int i, index, padlen;
-	u32 *dst = (u32 *)out;
-	__le64 bits;
-	static const u8 padding[MD5_HMAC_BLOCK_SIZE] = { 0x80, };
-
-	bits = cpu_to_le64(sctx->byte_count << 3);
-
-	/* Pad out to 56 mod 64 and append length */
-	index = sctx->byte_count % MD5_HMAC_BLOCK_SIZE;
-	padlen = (index < 56) ? (56 - index) : ((MD5_HMAC_BLOCK_SIZE+56) - index);
-
-	/* We need to fill a whole block for __md5_sparc64_update() */
-	if (padlen <= 56) {
-		sctx->byte_count += padlen;
-		memcpy((u8 *)sctx->block + index, padding, padlen);
-	} else {
-		__md5_sparc64_update(sctx, padding, padlen, index);
-	}
-	__md5_sparc64_update(sctx, (const u8 *)&bits, sizeof(bits), 56);
+	struct sparc_md5_state *sctx = shash_desc_ctx(desc);
+	__le64 block[MD5_BLOCK_WORDS] = {};
+	u8 *p = memcpy(block, src, offset);
+	__le32 *dst = (__le32 *)out;
+	__le64 *pbits;
+	int i;
+
+	src = p;
+	p += offset;
+	*p++ = 0x80;
+	sctx->byte_count += offset;
+	pbits = &block[(MD5_BLOCK_WORDS / (offset > 55 ? 1 : 2)) - 1];
+	*pbits = cpu_to_le64(sctx->byte_count << 3);
+	md5_sparc64_transform(sctx->hash, src, (pbits - block + 1) / 8);
+	memzero_explicit(block, sizeof(block));
 
 	/* Store state in digest */
 	for (i = 0; i < MD5_HASH_WORDS; i++)
 		dst[i] = sctx->hash[i];
 
-	/* Wipe context */
-	memset(sctx, 0, sizeof(*sctx));
-
 	return 0;
 }
 
 static int md5_sparc64_export(struct shash_desc *desc, void *out)
 {
-	struct md5_state *sctx = shash_desc_ctx(desc);
-
-	memcpy(out, sctx, sizeof(*sctx));
+	struct sparc_md5_state *sctx = shash_desc_ctx(desc);
+	union {
+		u8 *u8;
+		u32 *u32;
+		u64 *u64;
+	} p = { .u8 = out };
+	int i;
 
+	for (i = 0; i < MD5_HASH_WORDS; i++)
+		put_unaligned(le32_to_cpu(sctx->hash[i]), p.u32++);
+	put_unaligned(sctx->byte_count, p.u64);
 	return 0;
 }
 
 static int md5_sparc64_import(struct shash_desc *desc, const void *in)
 {
-	struct md5_state *sctx = shash_desc_ctx(desc);
-
-	memcpy(sctx, in, sizeof(*sctx));
+	struct sparc_md5_state *sctx = shash_desc_ctx(desc);
+	union {
+		const u8 *u8;
+		const u32 *u32;
+		const u64 *u64;
+	} p = { .u8 = in };
+	int i;
 
+	for (i = 0; i < MD5_HASH_WORDS; i++)
+		sctx->hash[i] = cpu_to_le32(get_unaligned(p.u32++));
+	sctx->byte_count = get_unaligned(p.u64);
 	return 0;
 }
 
@@ -136,15 +119,16 @@ static struct shash_alg alg = {
 	.digestsize	=	MD5_DIGEST_SIZE,
 	.init		=	md5_sparc64_init,
 	.update		=	md5_sparc64_update,
-	.final		=	md5_sparc64_final,
+	.finup		=	md5_sparc64_finup,
 	.export		=	md5_sparc64_export,
 	.import		=	md5_sparc64_import,
-	.descsize	=	sizeof(struct md5_state),
-	.statesize	=	sizeof(struct md5_state),
+	.descsize	=	sizeof(struct sparc_md5_state),
+	.statesize	=	sizeof(struct sparc_md5_state),
 	.base		=	{
 		.cra_name	=	"md5",
 		.cra_driver_name=	"md5-sparc64",
 		.cra_priority	=	SPARC_CR_OPCODE_PRIORITY,
+		.cra_flags	=	CRYPTO_AHASH_ALG_BLOCK_ONLY,
 		.cra_blocksize	=	MD5_HMAC_BLOCK_SIZE,
 		.cra_module	=	THIS_MODULE,
 	}
diff --git a/arch/sparc/crypto/sha1_asm.S b/arch/sparc/crypto/sha1_asm.S
index 7d8bf354f0e7..00b46bac1b08 100644
--- a/arch/sparc/crypto/sha1_asm.S
+++ b/arch/sparc/crypto/sha1_asm.S
@@ -1,9 +1,8 @@
 /* SPDX-License-Identifier: GPL-2.0 */
 #include <linux/linkage.h>
+#include <asm/opcodes.h>
 #include <asm/visasm.h>
 
-#include "opcodes.h"
-
 ENTRY(sha1_sparc64_transform)
 	/* %o0 = digest, %o1 = data, %o2 = rounds */
 	VISEntryHalf
diff --git a/arch/sparc/crypto/sha1_glue.c b/arch/sparc/crypto/sha1_glue.c
index 06b7becfcb21..ef19d5023b1b 100644
--- a/arch/sparc/crypto/sha1_glue.c
+++ b/arch/sparc/crypto/sha1_glue.c
@@ -11,124 +11,44 @@
 
 #define pr_fmt(fmt)	KBUILD_MODNAME ": " fmt
 
+#include <asm/elf.h>
+#include <asm/opcodes.h>
+#include <asm/pstate.h>
 #include <crypto/internal/hash.h>
-#include <linux/init.h>
-#include <linux/module.h>
-#include <linux/mm.h>
-#include <linux/types.h>
 #include <crypto/sha1.h>
 #include <crypto/sha1_base.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
 
-#include <asm/pstate.h>
-#include <asm/elf.h>
-
-#include "opcodes.h"
-
-asmlinkage void sha1_sparc64_transform(u32 *digest, const char *data,
-				       unsigned int rounds);
-
-static void __sha1_sparc64_update(struct sha1_state *sctx, const u8 *data,
-				  unsigned int len, unsigned int partial)
-{
-	unsigned int done = 0;
-
-	sctx->count += len;
-	if (partial) {
-		done = SHA1_BLOCK_SIZE - partial;
-		memcpy(sctx->buffer + partial, data, done);
-		sha1_sparc64_transform(sctx->state, sctx->buffer, 1);
-	}
-	if (len - done >= SHA1_BLOCK_SIZE) {
-		const unsigned int rounds = (len - done) / SHA1_BLOCK_SIZE;
-
-		sha1_sparc64_transform(sctx->state, data + done, rounds);
-		done += rounds * SHA1_BLOCK_SIZE;
-	}
-
-	memcpy(sctx->buffer, data + done, len - done);
-}
+asmlinkage void sha1_sparc64_transform(struct sha1_state *digest,
+				       const u8 *data, int rounds);
 
 static int sha1_sparc64_update(struct shash_desc *desc, const u8 *data,
 			       unsigned int len)
 {
-	struct sha1_state *sctx = shash_desc_ctx(desc);
-	unsigned int partial = sctx->count % SHA1_BLOCK_SIZE;
-
-	/* Handle the fast case right here */
-	if (partial + len < SHA1_BLOCK_SIZE) {
-		sctx->count += len;
-		memcpy(sctx->buffer + partial, data, len);
-	} else
-		__sha1_sparc64_update(sctx, data, len, partial);
-
-	return 0;
+	return sha1_base_do_update_blocks(desc, data, len,
+					  sha1_sparc64_transform);
 }
 
 /* Add padding and return the message digest. */
-static int sha1_sparc64_final(struct shash_desc *desc, u8 *out)
-{
-	struct sha1_state *sctx = shash_desc_ctx(desc);
-	unsigned int i, index, padlen;
-	__be32 *dst = (__be32 *)out;
-	__be64 bits;
-	static const u8 padding[SHA1_BLOCK_SIZE] = { 0x80, };
-
-	bits = cpu_to_be64(sctx->count << 3);
-
-	/* Pad out to 56 mod 64 and append length */
-	index = sctx->count % SHA1_BLOCK_SIZE;
-	padlen = (index < 56) ? (56 - index) : ((SHA1_BLOCK_SIZE+56) - index);
-
-	/* We need to fill a whole block for __sha1_sparc64_update() */
-	if (padlen <= 56) {
-		sctx->count += padlen;
-		memcpy(sctx->buffer + index, padding, padlen);
-	} else {
-		__sha1_sparc64_update(sctx, padding, padlen, index);
-	}
-	__sha1_sparc64_update(sctx, (const u8 *)&bits, sizeof(bits), 56);
-
-	/* Store state in digest */
-	for (i = 0; i < 5; i++)
-		dst[i] = cpu_to_be32(sctx->state[i]);
-
-	/* Wipe context */
-	memset(sctx, 0, sizeof(*sctx));
-
-	return 0;
-}
-
-static int sha1_sparc64_export(struct shash_desc *desc, void *out)
-{
-	struct sha1_state *sctx = shash_desc_ctx(desc);
-
-	memcpy(out, sctx, sizeof(*sctx));
-
-	return 0;
-}
-
-static int sha1_sparc64_import(struct shash_desc *desc, const void *in)
+static int sha1_sparc64_finup(struct shash_desc *desc, const u8 *src,
+			      unsigned int len, u8 *out)
 {
-	struct sha1_state *sctx = shash_desc_ctx(desc);
-
-	memcpy(sctx, in, sizeof(*sctx));
-
-	return 0;
+	sha1_base_do_finup(desc, src, len, sha1_sparc64_transform);
+	return sha1_base_finish(desc, out);
 }
 
 static struct shash_alg alg = {
 	.digestsize	=	SHA1_DIGEST_SIZE,
 	.init		=	sha1_base_init,
 	.update		=	sha1_sparc64_update,
-	.final		=	sha1_sparc64_final,
-	.export		=	sha1_sparc64_export,
-	.import		=	sha1_sparc64_import,
-	.descsize	=	sizeof(struct sha1_state),
-	.statesize	=	sizeof(struct sha1_state),
+	.finup		=	sha1_sparc64_finup,
+	.descsize	=	SHA1_STATE_SIZE,
 	.base		=	{
 		.cra_name	=	"sha1",
 		.cra_driver_name=	"sha1-sparc64",
 		.cra_priority	=	SPARC_CR_OPCODE_PRIORITY,
+		.cra_flags	=	CRYPTO_AHASH_ALG_BLOCK_ONLY,
 		.cra_blocksize	=	SHA1_BLOCK_SIZE,
 		.cra_module	=	THIS_MODULE,
 	}
diff --git a/arch/sparc/crypto/sha256_glue.c b/arch/sparc/crypto/sha256_glue.c
deleted file mode 100644
index 285561a1cde5..000000000000
--- a/arch/sparc/crypto/sha256_glue.c
+++ /dev/null
@@ -1,210 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/* Glue code for SHA256 hashing optimized for sparc64 crypto opcodes.
- *
- * This is based largely upon crypto/sha256_generic.c
- *
- * Copyright (c) Jean-Luc Cooke <jlcooke@certainkey.com>
- * Copyright (c) Andrew McDonald <andrew@mcdonald.org.uk>
- * Copyright (c) 2002 James Morris <jmorris@intercode.com.au>
- * SHA224 Support Copyright 2007 Intel Corporation <jonathan.lynch@intel.com>
- */
-
-#define pr_fmt(fmt)	KBUILD_MODNAME ": " fmt
-
-#include <crypto/internal/hash.h>
-#include <linux/init.h>
-#include <linux/module.h>
-#include <linux/mm.h>
-#include <linux/types.h>
-#include <crypto/sha2.h>
-#include <crypto/sha256_base.h>
-
-#include <asm/pstate.h>
-#include <asm/elf.h>
-
-#include "opcodes.h"
-
-asmlinkage void sha256_sparc64_transform(u32 *digest, const char *data,
-					 unsigned int rounds);
-
-static void __sha256_sparc64_update(struct sha256_state *sctx, const u8 *data,
-				    unsigned int len, unsigned int partial)
-{
-	unsigned int done = 0;
-
-	sctx->count += len;
-	if (partial) {
-		done = SHA256_BLOCK_SIZE - partial;
-		memcpy(sctx->buf + partial, data, done);
-		sha256_sparc64_transform(sctx->state, sctx->buf, 1);
-	}
-	if (len - done >= SHA256_BLOCK_SIZE) {
-		const unsigned int rounds = (len - done) / SHA256_BLOCK_SIZE;
-
-		sha256_sparc64_transform(sctx->state, data + done, rounds);
-		done += rounds * SHA256_BLOCK_SIZE;
-	}
-
-	memcpy(sctx->buf, data + done, len - done);
-}
-
-static int sha256_sparc64_update(struct shash_desc *desc, const u8 *data,
-				 unsigned int len)
-{
-	struct sha256_state *sctx = shash_desc_ctx(desc);
-	unsigned int partial = sctx->count % SHA256_BLOCK_SIZE;
-
-	/* Handle the fast case right here */
-	if (partial + len < SHA256_BLOCK_SIZE) {
-		sctx->count += len;
-		memcpy(sctx->buf + partial, data, len);
-	} else
-		__sha256_sparc64_update(sctx, data, len, partial);
-
-	return 0;
-}
-
-static int sha256_sparc64_final(struct shash_desc *desc, u8 *out)
-{
-	struct sha256_state *sctx = shash_desc_ctx(desc);
-	unsigned int i, index, padlen;
-	__be32 *dst = (__be32 *)out;
-	__be64 bits;
-	static const u8 padding[SHA256_BLOCK_SIZE] = { 0x80, };
-
-	bits = cpu_to_be64(sctx->count << 3);
-
-	/* Pad out to 56 mod 64 and append length */
-	index = sctx->count % SHA256_BLOCK_SIZE;
-	padlen = (index < 56) ? (56 - index) : ((SHA256_BLOCK_SIZE+56) - index);
-
-	/* We need to fill a whole block for __sha256_sparc64_update() */
-	if (padlen <= 56) {
-		sctx->count += padlen;
-		memcpy(sctx->buf + index, padding, padlen);
-	} else {
-		__sha256_sparc64_update(sctx, padding, padlen, index);
-	}
-	__sha256_sparc64_update(sctx, (const u8 *)&bits, sizeof(bits), 56);
-
-	/* Store state in digest */
-	for (i = 0; i < 8; i++)
-		dst[i] = cpu_to_be32(sctx->state[i]);
-
-	/* Wipe context */
-	memset(sctx, 0, sizeof(*sctx));
-
-	return 0;
-}
-
-static int sha224_sparc64_final(struct shash_desc *desc, u8 *hash)
-{
-	u8 D[SHA256_DIGEST_SIZE];
-
-	sha256_sparc64_final(desc, D);
-
-	memcpy(hash, D, SHA224_DIGEST_SIZE);
-	memzero_explicit(D, SHA256_DIGEST_SIZE);
-
-	return 0;
-}
-
-static int sha256_sparc64_export(struct shash_desc *desc, void *out)
-{
-	struct sha256_state *sctx = shash_desc_ctx(desc);
-
-	memcpy(out, sctx, sizeof(*sctx));
-	return 0;
-}
-
-static int sha256_sparc64_import(struct shash_desc *desc, const void *in)
-{
-	struct sha256_state *sctx = shash_desc_ctx(desc);
-
-	memcpy(sctx, in, sizeof(*sctx));
-	return 0;
-}
-
-static struct shash_alg sha256_alg = {
-	.digestsize	=	SHA256_DIGEST_SIZE,
-	.init		=	sha256_base_init,
-	.update		=	sha256_sparc64_update,
-	.final		=	sha256_sparc64_final,
-	.export		=	sha256_sparc64_export,
-	.import		=	sha256_sparc64_import,
-	.descsize	=	sizeof(struct sha256_state),
-	.statesize	=	sizeof(struct sha256_state),
-	.base		=	{
-		.cra_name	=	"sha256",
-		.cra_driver_name=	"sha256-sparc64",
-		.cra_priority	=	SPARC_CR_OPCODE_PRIORITY,
-		.cra_blocksize	=	SHA256_BLOCK_SIZE,
-		.cra_module	=	THIS_MODULE,
-	}
-};
-
-static struct shash_alg sha224_alg = {
-	.digestsize	=	SHA224_DIGEST_SIZE,
-	.init		=	sha224_base_init,
-	.update		=	sha256_sparc64_update,
-	.final		=	sha224_sparc64_final,
-	.descsize	=	sizeof(struct sha256_state),
-	.base		=	{
-		.cra_name	=	"sha224",
-		.cra_driver_name=	"sha224-sparc64",
-		.cra_priority	=	SPARC_CR_OPCODE_PRIORITY,
-		.cra_blocksize	=	SHA224_BLOCK_SIZE,
-		.cra_module	=	THIS_MODULE,
-	}
-};
-
-static bool __init sparc64_has_sha256_opcode(void)
-{
-	unsigned long cfr;
-
-	if (!(sparc64_elf_hwcap & HWCAP_SPARC_CRYPTO))
-		return false;
-
-	__asm__ __volatile__("rd %%asr26, %0" : "=r" (cfr));
-	if (!(cfr & CFR_SHA256))
-		return false;
-
-	return true;
-}
-
-static int __init sha256_sparc64_mod_init(void)
-{
-	if (sparc64_has_sha256_opcode()) {
-		int ret = crypto_register_shash(&sha224_alg);
-		if (ret < 0)
-			return ret;
-
-		ret = crypto_register_shash(&sha256_alg);
-		if (ret < 0) {
-			crypto_unregister_shash(&sha224_alg);
-			return ret;
-		}
-
-		pr_info("Using sparc64 sha256 opcode optimized SHA-256/SHA-224 implementation\n");
-		return 0;
-	}
-	pr_info("sparc64 sha256 opcode not available.\n");
-	return -ENODEV;
-}
-
-static void __exit sha256_sparc64_mod_fini(void)
-{
-	crypto_unregister_shash(&sha224_alg);
-	crypto_unregister_shash(&sha256_alg);
-}
-
-module_init(sha256_sparc64_mod_init);
-module_exit(sha256_sparc64_mod_fini);
-
-MODULE_LICENSE("GPL");
-MODULE_DESCRIPTION("SHA-224 and SHA-256 Secure Hash Algorithm, sparc64 sha256 opcode accelerated");
-
-MODULE_ALIAS_CRYPTO("sha224");
-MODULE_ALIAS_CRYPTO("sha256");
-
-#include "crop_devid.c"
diff --git a/arch/sparc/crypto/sha512_asm.S b/arch/sparc/crypto/sha512_asm.S
index b2f6e6728802..9932b4fe1b59 100644
--- a/arch/sparc/crypto/sha512_asm.S
+++ b/arch/sparc/crypto/sha512_asm.S
@@ -1,9 +1,8 @@
 /* SPDX-License-Identifier: GPL-2.0 */
 #include <linux/linkage.h>
+#include <asm/opcodes.h>
 #include <asm/visasm.h>
 
-#include "opcodes.h"
-
 ENTRY(sha512_sparc64_transform)
 	/* %o0 = digest, %o1 = data, %o2 = rounds */
 	VISEntry
diff --git a/arch/sparc/crypto/sha512_glue.c b/arch/sparc/crypto/sha512_glue.c
index d66efa4ec59a..47b9277b6877 100644
--- a/arch/sparc/crypto/sha512_glue.c
+++ b/arch/sparc/crypto/sha512_glue.c
@@ -10,115 +10,42 @@
 
 #define pr_fmt(fmt)	KBUILD_MODNAME ": " fmt
 
+#include <asm/elf.h>
+#include <asm/opcodes.h>
+#include <asm/pstate.h>
 #include <crypto/internal/hash.h>
-#include <linux/init.h>
-#include <linux/module.h>
-#include <linux/mm.h>
-#include <linux/types.h>
 #include <crypto/sha2.h>
 #include <crypto/sha512_base.h>
-
-#include <asm/pstate.h>
-#include <asm/elf.h>
-
-#include "opcodes.h"
+#include <linux/kernel.h>
+#include <linux/module.h>
 
 asmlinkage void sha512_sparc64_transform(u64 *digest, const char *data,
 					 unsigned int rounds);
 
-static void __sha512_sparc64_update(struct sha512_state *sctx, const u8 *data,
-				    unsigned int len, unsigned int partial)
+static void sha512_block(struct sha512_state *sctx, const u8 *src, int blocks)
 {
-	unsigned int done = 0;
-
-	if ((sctx->count[0] += len) < len)
-		sctx->count[1]++;
-	if (partial) {
-		done = SHA512_BLOCK_SIZE - partial;
-		memcpy(sctx->buf + partial, data, done);
-		sha512_sparc64_transform(sctx->state, sctx->buf, 1);
-	}
-	if (len - done >= SHA512_BLOCK_SIZE) {
-		const unsigned int rounds = (len - done) / SHA512_BLOCK_SIZE;
-
-		sha512_sparc64_transform(sctx->state, data + done, rounds);
-		done += rounds * SHA512_BLOCK_SIZE;
-	}
-
-	memcpy(sctx->buf, data + done, len - done);
+	sha512_sparc64_transform(sctx->state, src, blocks);
 }
 
 static int sha512_sparc64_update(struct shash_desc *desc, const u8 *data,
 				 unsigned int len)
 {
-	struct sha512_state *sctx = shash_desc_ctx(desc);
-	unsigned int partial = sctx->count[0] % SHA512_BLOCK_SIZE;
-
-	/* Handle the fast case right here */
-	if (partial + len < SHA512_BLOCK_SIZE) {
-		if ((sctx->count[0] += len) < len)
-			sctx->count[1]++;
-		memcpy(sctx->buf + partial, data, len);
-	} else
-		__sha512_sparc64_update(sctx, data, len, partial);
-
-	return 0;
+	return sha512_base_do_update_blocks(desc, data, len, sha512_block);
 }
 
-static int sha512_sparc64_final(struct shash_desc *desc, u8 *out)
+static int sha512_sparc64_finup(struct shash_desc *desc, const u8 *src,
+				unsigned int len, u8 *out)
 {
-	struct sha512_state *sctx = shash_desc_ctx(desc);
-	unsigned int i, index, padlen;
-	__be64 *dst = (__be64 *)out;
-	__be64 bits[2];
-	static const u8 padding[SHA512_BLOCK_SIZE] = { 0x80, };
-
-	/* Save number of bits */
-	bits[1] = cpu_to_be64(sctx->count[0] << 3);
-	bits[0] = cpu_to_be64(sctx->count[1] << 3 | sctx->count[0] >> 61);
-
-	/* Pad out to 112 mod 128 and append length */
-	index = sctx->count[0] % SHA512_BLOCK_SIZE;
-	padlen = (index < 112) ? (112 - index) : ((SHA512_BLOCK_SIZE+112) - index);
-
-	/* We need to fill a whole block for __sha512_sparc64_update() */
-	if (padlen <= 112) {
-		if ((sctx->count[0] += padlen) < padlen)
-			sctx->count[1]++;
-		memcpy(sctx->buf + index, padding, padlen);
-	} else {
-		__sha512_sparc64_update(sctx, padding, padlen, index);
-	}
-	__sha512_sparc64_update(sctx, (const u8 *)&bits, sizeof(bits), 112);
-
-	/* Store state in digest */
-	for (i = 0; i < 8; i++)
-		dst[i] = cpu_to_be64(sctx->state[i]);
-
-	/* Wipe context */
-	memset(sctx, 0, sizeof(*sctx));
-
-	return 0;
-}
-
-static int sha384_sparc64_final(struct shash_desc *desc, u8 *hash)
-{
-	u8 D[64];
-
-	sha512_sparc64_final(desc, D);
-
-	memcpy(hash, D, 48);
-	memzero_explicit(D, 64);
-
-	return 0;
+	sha512_base_do_finup(desc, src, len, sha512_block);
+	return sha512_base_finish(desc, out);
 }
 
 static struct shash_alg sha512 = {
 	.digestsize	=	SHA512_DIGEST_SIZE,
 	.init		=	sha512_base_init,
 	.update		=	sha512_sparc64_update,
-	.final		=	sha512_sparc64_final,
-	.descsize	=	sizeof(struct sha512_state),
+	.finup		=	sha512_sparc64_finup,
+	.descsize	=	SHA512_STATE_SIZE,
 	.base		=	{
 		.cra_name	=	"sha512",
 		.cra_driver_name=	"sha512-sparc64",
@@ -132,8 +59,8 @@ static struct shash_alg sha384 = {
 	.digestsize	=	SHA384_DIGEST_SIZE,
 	.init		=	sha384_base_init,
 	.update		=	sha512_sparc64_update,
-	.final		=	sha384_sparc64_final,
-	.descsize	=	sizeof(struct sha512_state),
+	.finup		=	sha512_sparc64_finup,
+	.descsize	=	SHA512_STATE_SIZE,
 	.base		=	{
 		.cra_name	=	"sha384",
 		.cra_driver_name=	"sha384-sparc64",
diff --git a/arch/sparc/crypto/opcodes.h b/arch/sparc/include/asm/opcodes.h
index 417b6a10a337..ebfda6eb49b2 100644
--- a/arch/sparc/crypto/opcodes.h
+++ b/arch/sparc/include/asm/opcodes.h
@@ -1,6 +1,6 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _OPCODES_H
-#define _OPCODES_H
+#ifndef _SPARC_ASM_OPCODES_H
+#define _SPARC_ASM_OPCODES_H
 
 #define SPARC_CR_OPCODE_PRIORITY	300
 
@@ -97,4 +97,4 @@
 #define MOVXTOD_G7_F62		\
 	.word	0xbfb02307;
 
-#endif /* _OPCODES_H */
+#endif /* _SPARC_ASM_OPCODES_H */
diff --git a/arch/sparc/lib/Makefile b/arch/sparc/lib/Makefile
index 5724d0f356eb..5cf9781d68b4 100644
--- a/arch/sparc/lib/Makefile
+++ b/arch/sparc/lib/Makefile
@@ -4,6 +4,7 @@
 
 asflags-y := -ansi -DST_DIV0=0x02
 
+obj-y                 += crypto/
 lib-$(CONFIG_SPARC32) += ashrdi3.o
 lib-$(CONFIG_SPARC32) += memcpy.o memset.o
 lib-y                 += strlen.o
@@ -54,4 +55,4 @@ obj-$(CONFIG_SPARC64) += iomap.o
 obj-$(CONFIG_SPARC32) += atomic32.o
 obj-$(CONFIG_SPARC64) += PeeCeeI.o
 obj-$(CONFIG_CRC32_ARCH) += crc32-sparc.o
-crc32-sparc-y := crc32_glue.o crc32c_asm.o
+crc32-sparc-y := crc32.o crc32c_asm.o
diff --git a/arch/sparc/lib/crc32_glue.c b/arch/sparc/lib/crc32.c
index a70752c729cf..40d4720a42a1 100644
--- a/arch/sparc/lib/crc32_glue.c
+++ b/arch/sparc/lib/crc32.c
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-only
-/* Glue code for CRC32C optimized for sparc64 crypto opcodes.
+/* CRC32c (Castagnoli), sparc64 crc32c opcode accelerated
  *
  * This is based largely upon arch/x86/crypto/crc32c-intel.c
  *
@@ -17,7 +17,7 @@
 #include <asm/pstate.h>
 #include <asm/elf.h>
 
-static DEFINE_STATIC_KEY_FALSE(have_crc32c_opcode);
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_crc32c_opcode);
 
 u32 crc32_le_arch(u32 crc, const u8 *data, size_t len)
 {
@@ -74,7 +74,7 @@ static int __init crc32_sparc_init(void)
 	pr_info("Using sparc64 crc32c opcode optimized CRC32C implementation\n");
 	return 0;
 }
-arch_initcall(crc32_sparc_init);
+subsys_initcall(crc32_sparc_init);
 
 static void __exit crc32_sparc_exit(void)
 {
diff --git a/arch/sparc/lib/crc32c_asm.S b/arch/sparc/lib/crc32c_asm.S
index ee454fa6aed6..4db873850f44 100644
--- a/arch/sparc/lib/crc32c_asm.S
+++ b/arch/sparc/lib/crc32c_asm.S
@@ -1,10 +1,9 @@
 /* SPDX-License-Identifier: GPL-2.0 */
 #include <linux/linkage.h>
+#include <asm/opcodes.h>
 #include <asm/visasm.h>
 #include <asm/asi.h>
 
-#include "../crypto/opcodes.h"
-
 ENTRY(crc32c_sparc64)
 	/* %o0=crc32p, %o1=data_ptr, %o2=len */
 	VISEntryHalf
diff --git a/arch/sparc/lib/crypto/Kconfig b/arch/sparc/lib/crypto/Kconfig
new file mode 100644
index 000000000000..e5c3e4d3dba6
--- /dev/null
+++ b/arch/sparc/lib/crypto/Kconfig
@@ -0,0 +1,8 @@
+# SPDX-License-Identifier: GPL-2.0-only
+
+config CRYPTO_SHA256_SPARC64
+	tristate
+	depends on SPARC64
+	default CRYPTO_LIB_SHA256
+	select CRYPTO_ARCH_HAVE_LIB_SHA256
+	select CRYPTO_LIB_SHA256_GENERIC
diff --git a/arch/sparc/lib/crypto/Makefile b/arch/sparc/lib/crypto/Makefile
new file mode 100644
index 000000000000..75ee244ad6f7
--- /dev/null
+++ b/arch/sparc/lib/crypto/Makefile
@@ -0,0 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
+
+obj-$(CONFIG_CRYPTO_SHA256_SPARC64) += sha256-sparc64.o
+sha256-sparc64-y := sha256.o sha256_asm.o
diff --git a/arch/sparc/lib/crypto/sha256.c b/arch/sparc/lib/crypto/sha256.c
new file mode 100644
index 000000000000..8bdec2db08b3
--- /dev/null
+++ b/arch/sparc/lib/crypto/sha256.c
@@ -0,0 +1,64 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * SHA-256 accelerated using the sparc64 sha256 opcodes
+ *
+ * Copyright (c) Jean-Luc Cooke <jlcooke@certainkey.com>
+ * Copyright (c) Andrew McDonald <andrew@mcdonald.org.uk>
+ * Copyright (c) 2002 James Morris <jmorris@intercode.com.au>
+ * SHA224 Support Copyright 2007 Intel Corporation <jonathan.lynch@intel.com>
+ */
+
+#define pr_fmt(fmt)	KBUILD_MODNAME ": " fmt
+
+#include <asm/elf.h>
+#include <asm/opcodes.h>
+#include <asm/pstate.h>
+#include <crypto/internal/sha2.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_sha256_opcodes);
+
+asmlinkage void sha256_sparc64_transform(u32 state[SHA256_STATE_WORDS],
+					 const u8 *data, size_t nblocks);
+
+void sha256_blocks_arch(u32 state[SHA256_STATE_WORDS],
+			const u8 *data, size_t nblocks)
+{
+	if (static_branch_likely(&have_sha256_opcodes))
+		sha256_sparc64_transform(state, data, nblocks);
+	else
+		sha256_blocks_generic(state, data, nblocks);
+}
+EXPORT_SYMBOL_GPL(sha256_blocks_arch);
+
+bool sha256_is_arch_optimized(void)
+{
+	return static_key_enabled(&have_sha256_opcodes);
+}
+EXPORT_SYMBOL_GPL(sha256_is_arch_optimized);
+
+static int __init sha256_sparc64_mod_init(void)
+{
+	unsigned long cfr;
+
+	if (!(sparc64_elf_hwcap & HWCAP_SPARC_CRYPTO))
+		return 0;
+
+	__asm__ __volatile__("rd %%asr26, %0" : "=r" (cfr));
+	if (!(cfr & CFR_SHA256))
+		return 0;
+
+	static_branch_enable(&have_sha256_opcodes);
+	pr_info("Using sparc64 sha256 opcode optimized SHA-256/SHA-224 implementation\n");
+	return 0;
+}
+subsys_initcall(sha256_sparc64_mod_init);
+
+static void __exit sha256_sparc64_mod_exit(void)
+{
+}
+module_exit(sha256_sparc64_mod_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("SHA-256 accelerated using the sparc64 sha256 opcodes");
diff --git a/arch/sparc/crypto/sha256_asm.S b/arch/sparc/lib/crypto/sha256_asm.S
index 0b39ec7d7ca2..ddcdd3daf31e 100644
--- a/arch/sparc/crypto/sha256_asm.S
+++ b/arch/sparc/lib/crypto/sha256_asm.S
@@ -1,11 +1,10 @@
 /* SPDX-License-Identifier: GPL-2.0 */
 #include <linux/linkage.h>
+#include <asm/opcodes.h>
 #include <asm/visasm.h>
 
-#include "opcodes.h"
-
 ENTRY(sha256_sparc64_transform)
-	/* %o0 = digest, %o1 = data, %o2 = rounds */
+	/* %o0 = state, %o1 = data, %o2 = nblocks */
 	VISEntryHalf
 	ld	[%o0 + 0x00], %f0
 	ld	[%o0 + 0x04], %f1
diff --git a/arch/um/Makefile b/arch/um/Makefile
index 1d36a613aad8..9ed792e565c9 100644
--- a/arch/um/Makefile
+++ b/arch/um/Makefile
@@ -154,5 +154,6 @@ MRPROPER_FILES += $(HOST_DIR)/include/generated
 archclean:
 	@find . \( -name '*.bb' -o -name '*.bbg' -o -name '*.da' \
 		-o -name '*.gcov' \) -type f -print | xargs rm -f
+	$(Q)$(MAKE) -f $(srctree)/Makefile ARCH=$(HEADER_ARCH) clean
 
 export HEADER_ARCH SUBARCH USER_CFLAGS CFLAGS_NO_HARDENING DEV_NULL_PATH
diff --git a/arch/um/include/asm/fpu/api.h b/arch/um/include/asm/fpu/api.h
index 71bfd9ef3938..3abf67c83c40 100644
--- a/arch/um/include/asm/fpu/api.h
+++ b/arch/um/include/asm/fpu/api.h
@@ -2,6 +2,8 @@
 #ifndef _ASM_UM_FPU_API_H
 #define _ASM_UM_FPU_API_H
 
+#include <linux/types.h>
+
 /* Copyright (c) 2020 Cambridge Greys Ltd
  * Copyright (c) 2020 Red Hat Inc.
  * A set of "dummy" defines to allow the direct inclusion
diff --git a/arch/um/include/asm/uaccess.h b/arch/um/include/asm/uaccess.h
index 3a08f9029a3f..1c6e0ae41b0c 100644
--- a/arch/um/include/asm/uaccess.h
+++ b/arch/um/include/asm/uaccess.h
@@ -55,6 +55,7 @@ do {									\
 		goto err_label;						\
 	}								\
 	*((type *)dst) = get_unaligned((type *)(src));			\
+	barrier();							\
 	current->thread.segv_continue = NULL;				\
 } while (0)
 
@@ -66,6 +67,7 @@ do {									\
 	if (__faulted)							\
 		goto err_label;						\
 	put_unaligned(*((type *)src), (type *)(dst));			\
+	barrier();							\
 	current->thread.segv_continue = NULL;				\
 } while (0)
 
diff --git a/arch/um/kernel/trap.c b/arch/um/kernel/trap.c
index ce073150dc20..ef2272e92a43 100644
--- a/arch/um/kernel/trap.c
+++ b/arch/um/kernel/trap.c
@@ -225,20 +225,20 @@ unsigned long segv(struct faultinfo fi, unsigned long ip, int is_user,
 			panic("Failed to sync kernel TLBs: %d", err);
 		goto out;
 	}
-	else if (current->mm == NULL) {
-		if (current->pagefault_disabled) {
-			if (!mc) {
-				show_regs(container_of(regs, struct pt_regs, regs));
-				panic("Segfault with pagefaults disabled but no mcontext");
-			}
-			if (!current->thread.segv_continue) {
-				show_regs(container_of(regs, struct pt_regs, regs));
-				panic("Segfault without recovery target");
-			}
-			mc_set_rip(mc, current->thread.segv_continue);
-			current->thread.segv_continue = NULL;
-			goto out;
+	else if (current->pagefault_disabled) {
+		if (!mc) {
+			show_regs(container_of(regs, struct pt_regs, regs));
+			panic("Segfault with pagefaults disabled but no mcontext");
 		}
+		if (!current->thread.segv_continue) {
+			show_regs(container_of(regs, struct pt_regs, regs));
+			panic("Segfault without recovery target");
+		}
+		mc_set_rip(mc, current->thread.segv_continue);
+		current->thread.segv_continue = NULL;
+		goto out;
+	}
+	else if (current->mm == NULL) {
 		show_regs(container_of(regs, struct pt_regs, regs));
 		panic("Segfault with no mm");
 	}
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 4b9f378e05f6..e21cca404943 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -2368,6 +2368,7 @@ config STRICT_SIGALTSTACK_SIZE
 config CFI_AUTO_DEFAULT
 	bool "Attempt to use FineIBT by default at boot time"
 	depends on FINEIBT
+	depends on !RUST || RUSTC_VERSION >= 108800
 	default y
 	help
 	  Attempt to use FineIBT by default at boot time. If enabled,
@@ -2710,6 +2711,18 @@ config MITIGATION_SSB
 	  of speculative execution in a similar way to the Meltdown and Spectre
 	  security vulnerabilities.
 
+config MITIGATION_ITS
+	bool "Enable Indirect Target Selection mitigation"
+	depends on CPU_SUP_INTEL && X86_64
+	depends on MITIGATION_RETPOLINE && MITIGATION_RETHUNK
+	select EXECMEM
+	default y
+	help
+	  Enable Indirect Target Selection (ITS) mitigation. ITS is a bug in
+	  BPU on some Intel CPUs that may allow Spectre V2 style attacks. If
+	  disabled, mitigation cannot be enabled via cmdline.
+	  See <file:Documentation/admin-guide/hw-vuln/indirect-target-selection.rst>
+
 endif
 
 config ARCH_HAS_ADD_PAGES
diff --git a/arch/x86/Kconfig.assembler b/arch/x86/Kconfig.assembler
index 6d20a6ce0507..4d06fd3c8dfe 100644
--- a/arch/x86/Kconfig.assembler
+++ b/arch/x86/Kconfig.assembler
@@ -6,15 +6,6 @@ config AS_AVX512
 	help
 	  Supported by binutils >= 2.25 and LLVM integrated assembler
 
-config AS_SHA1_NI
-	def_bool $(as-instr,sha1msg1 %xmm0$(comma)%xmm1)
-	help
-	  Supported by binutils >= 2.24 and LLVM integrated assembler
-
-config AS_SHA256_NI
-	def_bool $(as-instr,sha256msg1 %xmm0$(comma)%xmm1)
-	help
-	  Supported by binutils >= 2.24 and LLVM integrated assembler
 config AS_TPAUSE
 	def_bool $(as-instr,tpause %ecx)
 	help
diff --git a/arch/x86/boot/Makefile b/arch/x86/boot/Makefile
index 81f55da81967..640fcac3af74 100644
--- a/arch/x86/boot/Makefile
+++ b/arch/x86/boot/Makefile
@@ -59,7 +59,7 @@ KBUILD_CFLAGS	+= $(CONFIG_CC_IMPLICIT_FALLTHROUGH)
 $(obj)/bzImage: asflags-y  := $(SVGA_MODE)
 
 quiet_cmd_image = BUILD   $@
-      cmd_image = cp $< $@; truncate -s %4K $@; cat $(obj)/vmlinux.bin >>$@
+      cmd_image = (dd if=$< bs=4k conv=sync status=none; cat $(filter-out $<,$(real-prereqs))) >$@
 
 $(obj)/bzImage: $(obj)/setup.bin $(obj)/vmlinux.bin FORCE
 	$(call if_changed,image)
diff --git a/arch/x86/boot/compressed/mem.c b/arch/x86/boot/compressed/mem.c
index f676156d9f3d..0e9f84ab4bdc 100644
--- a/arch/x86/boot/compressed/mem.c
+++ b/arch/x86/boot/compressed/mem.c
@@ -34,14 +34,11 @@ static bool early_is_tdx_guest(void)
 
 void arch_accept_memory(phys_addr_t start, phys_addr_t end)
 {
-	static bool sevsnp;
-
 	/* Platform-specific memory-acceptance call goes here */
 	if (early_is_tdx_guest()) {
 		if (!tdx_accept_memory(start, end))
 			panic("TDX: Failed to accept memory\n");
-	} else if (sevsnp || (sev_get_status() & MSR_AMD64_SEV_SNP_ENABLED)) {
-		sevsnp = true;
+	} else if (early_is_sevsnp_guest()) {
 		snp_accept_memory(start, end);
 	} else {
 		error("Cannot accept memory: unknown platform\n");
diff --git a/arch/x86/boot/compressed/sev.c b/arch/x86/boot/compressed/sev.c
index 89ba168f4f0f..0003e4416efd 100644
--- a/arch/x86/boot/compressed/sev.c
+++ b/arch/x86/boot/compressed/sev.c
@@ -645,3 +645,43 @@ void sev_prep_identity_maps(unsigned long top_level_pgt)
 
 	sev_verify_cbit(top_level_pgt);
 }
+
+bool early_is_sevsnp_guest(void)
+{
+	static bool sevsnp;
+
+	if (sevsnp)
+		return true;
+
+	if (!(sev_get_status() & MSR_AMD64_SEV_SNP_ENABLED))
+		return false;
+
+	sevsnp = true;
+
+	if (!snp_vmpl) {
+		unsigned int eax, ebx, ecx, edx;
+
+		/*
+		 * CPUID Fn8000_001F_EAX[28] - SVSM support
+		 */
+		eax = 0x8000001f;
+		ecx = 0;
+		native_cpuid(&eax, &ebx, &ecx, &edx);
+		if (eax & BIT(28)) {
+			struct msr m;
+
+			/* Obtain the address of the calling area to use */
+			boot_rdmsr(MSR_SVSM_CAA, &m);
+			boot_svsm_caa = (void *)m.q;
+			boot_svsm_caa_pa = m.q;
+
+			/*
+			 * The real VMPL level cannot be discovered, but the
+			 * memory acceptance routines make no use of that so
+			 * any non-zero value suffices here.
+			 */
+			snp_vmpl = U8_MAX;
+		}
+	}
+	return true;
+}
diff --git a/arch/x86/boot/compressed/sev.h b/arch/x86/boot/compressed/sev.h
index 4e463f33186d..d3900384b8ab 100644
--- a/arch/x86/boot/compressed/sev.h
+++ b/arch/x86/boot/compressed/sev.h
@@ -13,12 +13,14 @@
 bool sev_snp_enabled(void);
 void snp_accept_memory(phys_addr_t start, phys_addr_t end);
 u64 sev_get_status(void);
+bool early_is_sevsnp_guest(void);
 
 #else
 
 static inline bool sev_snp_enabled(void) { return false; }
 static inline void snp_accept_memory(phys_addr_t start, phys_addr_t end) { }
 static inline u64 sev_get_status(void) { return 0; }
+static inline bool early_is_sevsnp_guest(void) { return false; }
 
 #endif
 
diff --git a/arch/x86/coco/sev/core.c b/arch/x86/coco/sev/core.c
index b0c1a7a57497..36beaac713c1 100644
--- a/arch/x86/coco/sev/core.c
+++ b/arch/x86/coco/sev/core.c
@@ -959,6 +959,102 @@ void snp_accept_memory(phys_addr_t start, phys_addr_t end)
 	set_pages_state(vaddr, npages, SNP_PAGE_STATE_PRIVATE);
 }
 
+static int vmgexit_ap_control(u64 event, struct sev_es_save_area *vmsa, u32 apic_id)
+{
+	bool create = event != SVM_VMGEXIT_AP_DESTROY;
+	struct ghcb_state state;
+	unsigned long flags;
+	struct ghcb *ghcb;
+	int ret = 0;
+
+	local_irq_save(flags);
+
+	ghcb = __sev_get_ghcb(&state);
+
+	vc_ghcb_invalidate(ghcb);
+
+	if (create)
+		ghcb_set_rax(ghcb, vmsa->sev_features);
+
+	ghcb_set_sw_exit_code(ghcb, SVM_VMGEXIT_AP_CREATION);
+	ghcb_set_sw_exit_info_1(ghcb,
+				((u64)apic_id << 32)	|
+				((u64)snp_vmpl << 16)	|
+				event);
+	ghcb_set_sw_exit_info_2(ghcb, __pa(vmsa));
+
+	sev_es_wr_ghcb_msr(__pa(ghcb));
+	VMGEXIT();
+
+	if (!ghcb_sw_exit_info_1_is_valid(ghcb) ||
+	    lower_32_bits(ghcb->save.sw_exit_info_1)) {
+		pr_err("SNP AP %s error\n", (create ? "CREATE" : "DESTROY"));
+		ret = -EINVAL;
+	}
+
+	__sev_put_ghcb(&state);
+
+	local_irq_restore(flags);
+
+	return ret;
+}
+
+static int snp_set_vmsa(void *va, void *caa, int apic_id, bool make_vmsa)
+{
+	int ret;
+
+	if (snp_vmpl) {
+		struct svsm_call call = {};
+		unsigned long flags;
+
+		local_irq_save(flags);
+
+		call.caa = this_cpu_read(svsm_caa);
+		call.rcx = __pa(va);
+
+		if (make_vmsa) {
+			/* Protocol 0, Call ID 2 */
+			call.rax = SVSM_CORE_CALL(SVSM_CORE_CREATE_VCPU);
+			call.rdx = __pa(caa);
+			call.r8  = apic_id;
+		} else {
+			/* Protocol 0, Call ID 3 */
+			call.rax = SVSM_CORE_CALL(SVSM_CORE_DELETE_VCPU);
+		}
+
+		ret = svsm_perform_call_protocol(&call);
+
+		local_irq_restore(flags);
+	} else {
+		/*
+		 * If the kernel runs at VMPL0, it can change the VMSA
+		 * bit for a page using the RMPADJUST instruction.
+		 * However, for the instruction to succeed it must
+		 * target the permissions of a lesser privileged (higher
+		 * numbered) VMPL level, so use VMPL1.
+		 */
+		u64 attrs = 1;
+
+		if (make_vmsa)
+			attrs |= RMPADJUST_VMSA_PAGE_BIT;
+
+		ret = rmpadjust((unsigned long)va, RMP_PG_SIZE_4K, attrs);
+	}
+
+	return ret;
+}
+
+static void snp_cleanup_vmsa(struct sev_es_save_area *vmsa, int apic_id)
+{
+	int err;
+
+	err = snp_set_vmsa(vmsa, NULL, apic_id, false);
+	if (err)
+		pr_err("clear VMSA page failed (%u), leaking page\n", err);
+	else
+		free_page((unsigned long)vmsa);
+}
+
 static void set_pte_enc(pte_t *kpte, int level, void *va)
 {
 	struct pte_enc_desc d = {
@@ -1005,7 +1101,8 @@ static void unshare_all_memory(void)
 			data = per_cpu(runtime_data, cpu);
 			ghcb = (unsigned long)&data->ghcb_page;
 
-			if (addr <= ghcb && ghcb <= addr + size) {
+			/* Handle the case of a huge page containing the GHCB page */
+			if (addr <= ghcb && ghcb < addr + size) {
 				skipped_addr = true;
 				break;
 			}
@@ -1055,11 +1152,70 @@ void snp_kexec_begin(void)
 		pr_warn("Failed to stop shared<->private conversions\n");
 }
 
+/*
+ * Shut down all APs except the one handling kexec/kdump, and clear
+ * the VMSA tag on the APs' VMSA pages as they are no longer being
+ * used as VMSA pages.
+ */
+static void shutdown_all_aps(void)
+{
+	struct sev_es_save_area *vmsa;
+	int apic_id, this_cpu, cpu;
+
+	this_cpu = get_cpu();
+
+	/*
+	 * APs are already in HLT loop when enc_kexec_finish() callback
+	 * is invoked.
+	 */
+	for_each_present_cpu(cpu) {
+		vmsa = per_cpu(sev_vmsa, cpu);
+
+		/*
+		 * The BSP or offlined APs do not have a guest-allocated VMSA
+		 * and there is no need to clear the VMSA tag for this page.
+		 */
+		if (!vmsa)
+			continue;
+
+		/*
+		 * Cannot clear the VMSA tag for the currently running vCPU.
+		 */
+		if (this_cpu == cpu) {
+			unsigned long pa;
+			struct page *p;
+
+			pa = __pa(vmsa);
+			/*
+			 * Mark the VMSA page of the running vCPU as offline so
+			 * that it is excluded and not touched by makedumpfile
+			 * while generating vmcore during kdump.
+			 */
+			p = pfn_to_online_page(pa >> PAGE_SHIFT);
+			if (p)
+				__SetPageOffline(p);
+			continue;
+		}
+
+		apic_id = cpuid_to_apicid[cpu];
+
+		/*
+		 * Issue AP destroy to ensure AP gets kicked out of guest mode
+		 * to allow using RMPADJUST to remove the VMSA tag on its
+		 * VMSA page.
+		 */
+		vmgexit_ap_control(SVM_VMGEXIT_AP_DESTROY, vmsa, apic_id);
+		snp_cleanup_vmsa(vmsa, apic_id);
+	}
+
+	put_cpu();
+}
+
 void snp_kexec_finish(void)
 {
 	struct sev_es_runtime_data *data;
+	unsigned long size, addr;
 	unsigned int level, cpu;
-	unsigned long size;
 	struct ghcb *ghcb;
 	pte_t *pte;
 
@@ -1069,6 +1225,8 @@ void snp_kexec_finish(void)
 	if (!IS_ENABLED(CONFIG_KEXEC_CORE))
 		return;
 
+	shutdown_all_aps();
+
 	unshare_all_memory();
 
 	/*
@@ -1085,54 +1243,11 @@ void snp_kexec_finish(void)
 		ghcb = &data->ghcb_page;
 		pte = lookup_address((unsigned long)ghcb, &level);
 		size = page_level_size(level);
-		set_pte_enc(pte, level, (void *)ghcb);
-		snp_set_memory_private((unsigned long)ghcb, (size / PAGE_SIZE));
-	}
-}
-
-static int snp_set_vmsa(void *va, void *caa, int apic_id, bool make_vmsa)
-{
-	int ret;
-
-	if (snp_vmpl) {
-		struct svsm_call call = {};
-		unsigned long flags;
-
-		local_irq_save(flags);
-
-		call.caa = this_cpu_read(svsm_caa);
-		call.rcx = __pa(va);
-
-		if (make_vmsa) {
-			/* Protocol 0, Call ID 2 */
-			call.rax = SVSM_CORE_CALL(SVSM_CORE_CREATE_VCPU);
-			call.rdx = __pa(caa);
-			call.r8  = apic_id;
-		} else {
-			/* Protocol 0, Call ID 3 */
-			call.rax = SVSM_CORE_CALL(SVSM_CORE_DELETE_VCPU);
-		}
-
-		ret = svsm_perform_call_protocol(&call);
-
-		local_irq_restore(flags);
-	} else {
-		/*
-		 * If the kernel runs at VMPL0, it can change the VMSA
-		 * bit for a page using the RMPADJUST instruction.
-		 * However, for the instruction to succeed it must
-		 * target the permissions of a lesser privileged (higher
-		 * numbered) VMPL level, so use VMPL1.
-		 */
-		u64 attrs = 1;
-
-		if (make_vmsa)
-			attrs |= RMPADJUST_VMSA_PAGE_BIT;
-
-		ret = rmpadjust((unsigned long)va, RMP_PG_SIZE_4K, attrs);
+		/* Handle the case of a huge page containing the GHCB page */
+		addr = (unsigned long)ghcb & page_level_mask(level);
+		set_pte_enc(pte, level, (void *)addr);
+		snp_set_memory_private(addr, (size / PAGE_SIZE));
 	}
-
-	return ret;
 }
 
 #define __ATTR_BASE		(SVM_SELECTOR_P_MASK | SVM_SELECTOR_S_MASK)
@@ -1166,24 +1281,10 @@ static void *snp_alloc_vmsa_page(int cpu)
 	return page_address(p + 1);
 }
 
-static void snp_cleanup_vmsa(struct sev_es_save_area *vmsa, int apic_id)
-{
-	int err;
-
-	err = snp_set_vmsa(vmsa, NULL, apic_id, false);
-	if (err)
-		pr_err("clear VMSA page failed (%u), leaking page\n", err);
-	else
-		free_page((unsigned long)vmsa);
-}
-
 static int wakeup_cpu_via_vmgexit(u32 apic_id, unsigned long start_ip)
 {
 	struct sev_es_save_area *cur_vmsa, *vmsa;
-	struct ghcb_state state;
 	struct svsm_ca *caa;
-	unsigned long flags;
-	struct ghcb *ghcb;
 	u8 sipi_vector;
 	int cpu, ret;
 	u64 cr4;
@@ -1297,33 +1398,7 @@ static int wakeup_cpu_via_vmgexit(u32 apic_id, unsigned long start_ip)
 	}
 
 	/* Issue VMGEXIT AP Creation NAE event */
-	local_irq_save(flags);
-
-	ghcb = __sev_get_ghcb(&state);
-
-	vc_ghcb_invalidate(ghcb);
-	ghcb_set_rax(ghcb, vmsa->sev_features);
-	ghcb_set_sw_exit_code(ghcb, SVM_VMGEXIT_AP_CREATION);
-	ghcb_set_sw_exit_info_1(ghcb,
-				((u64)apic_id << 32)	|
-				((u64)snp_vmpl << 16)	|
-				SVM_VMGEXIT_AP_CREATE);
-	ghcb_set_sw_exit_info_2(ghcb, __pa(vmsa));
-
-	sev_es_wr_ghcb_msr(__pa(ghcb));
-	VMGEXIT();
-
-	if (!ghcb_sw_exit_info_1_is_valid(ghcb) ||
-	    lower_32_bits(ghcb->save.sw_exit_info_1)) {
-		pr_err("SNP AP Creation error\n");
-		ret = -EINVAL;
-	}
-
-	__sev_put_ghcb(&state);
-
-	local_irq_restore(flags);
-
-	/* Perform cleanup if there was an error */
+	ret = vmgexit_ap_control(SVM_VMGEXIT_AP_CREATE, vmsa, apic_id);
 	if (ret) {
 		snp_cleanup_vmsa(vmsa, apic_id);
 		vmsa = NULL;
diff --git a/arch/x86/configs/i386_defconfig b/arch/x86/configs/i386_defconfig
index 91801138b10b..7cd2f395f301 100644
--- a/arch/x86/configs/i386_defconfig
+++ b/arch/x86/configs/i386_defconfig
@@ -1,7 +1,6 @@
 CONFIG_WERROR=y
 CONFIG_SYSVIPC=y
 CONFIG_POSIX_MQUEUE=y
-CONFIG_USELIB=y
 CONFIG_AUDIT=y
 CONFIG_NO_HZ=y
 CONFIG_HIGH_RES_TIMERS=y
diff --git a/arch/x86/crypto/Kconfig b/arch/x86/crypto/Kconfig
index 3d948f10c94c..56cfdc79e2c6 100644
--- a/arch/x86/crypto/Kconfig
+++ b/arch/x86/crypto/Kconfig
@@ -4,7 +4,7 @@ menu "Accelerated Cryptographic Algorithms for CPU (x86)"
 
 config CRYPTO_CURVE25519_X86
 	tristate
-	depends on X86 && 64BIT
+	depends on 64BIT
 	select CRYPTO_KPP
 	select CRYPTO_LIB_CURVE25519_GENERIC
 	select CRYPTO_ARCH_HAVE_LIB_CURVE25519
@@ -17,13 +17,11 @@ config CRYPTO_CURVE25519_X86
 
 config CRYPTO_AES_NI_INTEL
 	tristate "Ciphers: AES, modes: ECB, CBC, CTS, CTR, XCTR, XTS, GCM (AES-NI/VAES)"
-	depends on X86
 	select CRYPTO_AEAD
 	select CRYPTO_LIB_AES
 	select CRYPTO_LIB_GF128MUL
 	select CRYPTO_ALGAPI
 	select CRYPTO_SKCIPHER
-	select CRYPTO_SIMD
 	help
 	  Block cipher: AES cipher algorithms
 	  AEAD cipher: AES with GCM
@@ -38,7 +36,7 @@ config CRYPTO_AES_NI_INTEL
 
 config CRYPTO_BLOWFISH_X86_64
 	tristate "Ciphers: Blowfish, modes: ECB, CBC"
-	depends on X86 && 64BIT
+	depends on 64BIT
 	select CRYPTO_SKCIPHER
 	select CRYPTO_BLOWFISH_COMMON
 	imply CRYPTO_CTR
@@ -50,7 +48,7 @@ config CRYPTO_BLOWFISH_X86_64
 
 config CRYPTO_CAMELLIA_X86_64
 	tristate "Ciphers: Camellia with modes: ECB, CBC"
-	depends on X86 && 64BIT
+	depends on 64BIT
 	select CRYPTO_SKCIPHER
 	imply CRYPTO_CTR
 	help
@@ -61,10 +59,9 @@ config CRYPTO_CAMELLIA_X86_64
 
 config CRYPTO_CAMELLIA_AESNI_AVX_X86_64
 	tristate "Ciphers: Camellia with modes: ECB, CBC (AES-NI/AVX)"
-	depends on X86 && 64BIT
+	depends on 64BIT
 	select CRYPTO_SKCIPHER
 	select CRYPTO_CAMELLIA_X86_64
-	select CRYPTO_SIMD
 	imply CRYPTO_XTS
 	help
 	  Length-preserving ciphers: Camellia with ECB and CBC modes
@@ -75,7 +72,7 @@ config CRYPTO_CAMELLIA_AESNI_AVX_X86_64
 
 config CRYPTO_CAMELLIA_AESNI_AVX2_X86_64
 	tristate "Ciphers: Camellia with modes: ECB, CBC (AES-NI/AVX2)"
-	depends on X86 && 64BIT
+	depends on 64BIT
 	select CRYPTO_CAMELLIA_AESNI_AVX_X86_64
 	help
 	  Length-preserving ciphers: Camellia with ECB and CBC modes
@@ -86,11 +83,10 @@ config CRYPTO_CAMELLIA_AESNI_AVX2_X86_64
 
 config CRYPTO_CAST5_AVX_X86_64
 	tristate "Ciphers: CAST5 with modes: ECB, CBC (AVX)"
-	depends on X86 && 64BIT
+	depends on 64BIT
 	select CRYPTO_SKCIPHER
 	select CRYPTO_CAST5
 	select CRYPTO_CAST_COMMON
-	select CRYPTO_SIMD
 	imply CRYPTO_CTR
 	help
 	  Length-preserving ciphers: CAST5 (CAST-128) cipher algorithm
@@ -103,11 +99,10 @@ config CRYPTO_CAST5_AVX_X86_64
 
 config CRYPTO_CAST6_AVX_X86_64
 	tristate "Ciphers: CAST6 with modes: ECB, CBC (AVX)"
-	depends on X86 && 64BIT
+	depends on 64BIT
 	select CRYPTO_SKCIPHER
 	select CRYPTO_CAST6
 	select CRYPTO_CAST_COMMON
-	select CRYPTO_SIMD
 	imply CRYPTO_XTS
 	imply CRYPTO_CTR
 	help
@@ -121,7 +116,7 @@ config CRYPTO_CAST6_AVX_X86_64
 
 config CRYPTO_DES3_EDE_X86_64
 	tristate "Ciphers: Triple DES EDE with modes: ECB, CBC"
-	depends on X86 && 64BIT
+	depends on 64BIT
 	select CRYPTO_SKCIPHER
 	select CRYPTO_LIB_DES
 	imply CRYPTO_CTR
@@ -135,10 +130,9 @@ config CRYPTO_DES3_EDE_X86_64
 
 config CRYPTO_SERPENT_SSE2_X86_64
 	tristate "Ciphers: Serpent with modes: ECB, CBC (SSE2)"
-	depends on X86 && 64BIT
+	depends on 64BIT
 	select CRYPTO_SKCIPHER
 	select CRYPTO_SERPENT
-	select CRYPTO_SIMD
 	imply CRYPTO_CTR
 	help
 	  Length-preserving ciphers: Serpent cipher algorithm
@@ -151,10 +145,9 @@ config CRYPTO_SERPENT_SSE2_X86_64
 
 config CRYPTO_SERPENT_SSE2_586
 	tristate "Ciphers: Serpent with modes: ECB, CBC (32-bit with SSE2)"
-	depends on X86 && !64BIT
+	depends on !64BIT
 	select CRYPTO_SKCIPHER
 	select CRYPTO_SERPENT
-	select CRYPTO_SIMD
 	imply CRYPTO_CTR
 	help
 	  Length-preserving ciphers: Serpent cipher algorithm
@@ -167,10 +160,9 @@ config CRYPTO_SERPENT_SSE2_586
 
 config CRYPTO_SERPENT_AVX_X86_64
 	tristate "Ciphers: Serpent with modes: ECB, CBC (AVX)"
-	depends on X86 && 64BIT
+	depends on 64BIT
 	select CRYPTO_SKCIPHER
 	select CRYPTO_SERPENT
-	select CRYPTO_SIMD
 	imply CRYPTO_XTS
 	imply CRYPTO_CTR
 	help
@@ -184,7 +176,7 @@ config CRYPTO_SERPENT_AVX_X86_64
 
 config CRYPTO_SERPENT_AVX2_X86_64
 	tristate "Ciphers: Serpent with modes: ECB, CBC (AVX2)"
-	depends on X86 && 64BIT
+	depends on 64BIT
 	select CRYPTO_SERPENT_AVX_X86_64
 	help
 	  Length-preserving ciphers: Serpent cipher algorithm
@@ -197,9 +189,8 @@ config CRYPTO_SERPENT_AVX2_X86_64
 
 config CRYPTO_SM4_AESNI_AVX_X86_64
 	tristate "Ciphers: SM4 with modes: ECB, CBC, CTR (AES-NI/AVX)"
-	depends on X86 && 64BIT
+	depends on 64BIT
 	select CRYPTO_SKCIPHER
-	select CRYPTO_SIMD
 	select CRYPTO_ALGAPI
 	select CRYPTO_SM4
 	help
@@ -218,9 +209,8 @@ config CRYPTO_SM4_AESNI_AVX_X86_64
 
 config CRYPTO_SM4_AESNI_AVX2_X86_64
 	tristate "Ciphers: SM4 with modes: ECB, CBC, CTR (AES-NI/AVX2)"
-	depends on X86 && 64BIT
+	depends on 64BIT
 	select CRYPTO_SKCIPHER
-	select CRYPTO_SIMD
 	select CRYPTO_ALGAPI
 	select CRYPTO_SM4
 	select CRYPTO_SM4_AESNI_AVX_X86_64
@@ -240,7 +230,7 @@ config CRYPTO_SM4_AESNI_AVX2_X86_64
 
 config CRYPTO_TWOFISH_586
 	tristate "Ciphers: Twofish (32-bit)"
-	depends on (X86 || UML_X86) && !64BIT
+	depends on !64BIT
 	select CRYPTO_ALGAPI
 	select CRYPTO_TWOFISH_COMMON
 	imply CRYPTO_CTR
@@ -251,7 +241,7 @@ config CRYPTO_TWOFISH_586
 
 config CRYPTO_TWOFISH_X86_64
 	tristate "Ciphers: Twofish"
-	depends on (X86 || UML_X86) && 64BIT
+	depends on 64BIT
 	select CRYPTO_ALGAPI
 	select CRYPTO_TWOFISH_COMMON
 	imply CRYPTO_CTR
@@ -262,7 +252,7 @@ config CRYPTO_TWOFISH_X86_64
 
 config CRYPTO_TWOFISH_X86_64_3WAY
 	tristate "Ciphers: Twofish with modes: ECB, CBC (3-way parallel)"
-	depends on X86 && 64BIT
+	depends on 64BIT
 	select CRYPTO_SKCIPHER
 	select CRYPTO_TWOFISH_COMMON
 	select CRYPTO_TWOFISH_X86_64
@@ -277,9 +267,8 @@ config CRYPTO_TWOFISH_X86_64_3WAY
 
 config CRYPTO_TWOFISH_AVX_X86_64
 	tristate "Ciphers: Twofish with modes: ECB, CBC (AVX)"
-	depends on X86 && 64BIT
+	depends on 64BIT
 	select CRYPTO_SKCIPHER
-	select CRYPTO_SIMD
 	select CRYPTO_TWOFISH_COMMON
 	select CRYPTO_TWOFISH_X86_64
 	select CRYPTO_TWOFISH_X86_64_3WAY
@@ -295,9 +284,8 @@ config CRYPTO_TWOFISH_AVX_X86_64
 
 config CRYPTO_ARIA_AESNI_AVX_X86_64
 	tristate "Ciphers: ARIA with modes: ECB, CTR (AES-NI/AVX/GFNI)"
-	depends on X86 && 64BIT
+	depends on 64BIT
 	select CRYPTO_SKCIPHER
-	select CRYPTO_SIMD
 	select CRYPTO_ALGAPI
 	select CRYPTO_ARIA
 	help
@@ -313,9 +301,8 @@ config CRYPTO_ARIA_AESNI_AVX_X86_64
 
 config CRYPTO_ARIA_AESNI_AVX2_X86_64
 	tristate "Ciphers: ARIA with modes: ECB, CTR (AES-NI/AVX2/GFNI)"
-	depends on X86 && 64BIT
+	depends on 64BIT
 	select CRYPTO_SKCIPHER
-	select CRYPTO_SIMD
 	select CRYPTO_ALGAPI
 	select CRYPTO_ARIA
 	select CRYPTO_ARIA_AESNI_AVX_X86_64
@@ -332,9 +319,8 @@ config CRYPTO_ARIA_AESNI_AVX2_X86_64
 
 config CRYPTO_ARIA_GFNI_AVX512_X86_64
 	tristate "Ciphers: ARIA with modes: ECB, CTR (AVX512/GFNI)"
-	depends on X86 && 64BIT && AS_AVX512 && AS_GFNI
+	depends on 64BIT && AS_GFNI
 	select CRYPTO_SKCIPHER
-	select CRYPTO_SIMD
 	select CRYPTO_ALGAPI
 	select CRYPTO_ARIA
 	select CRYPTO_ARIA_AESNI_AVX_X86_64
@@ -349,27 +335,10 @@ config CRYPTO_ARIA_GFNI_AVX512_X86_64
 
 	  Processes 64 blocks in parallel.
 
-config CRYPTO_CHACHA20_X86_64
-	tristate
-	depends on X86 && 64BIT
-	select CRYPTO_SKCIPHER
-	select CRYPTO_LIB_CHACHA_GENERIC
-	select CRYPTO_ARCH_HAVE_LIB_CHACHA
-	default CRYPTO_LIB_CHACHA_INTERNAL
-	help
-	  Length-preserving ciphers: ChaCha20, XChaCha20, and XChaCha12
-	  stream cipher algorithms
-
-	  Architecture: x86_64 using:
-	  - SSSE3 (Supplemental SSE3)
-	  - AVX2 (Advanced Vector Extensions 2)
-	  - AVX-512VL (Advanced Vector Extensions-512VL)
-
 config CRYPTO_AEGIS128_AESNI_SSE2
 	tristate "AEAD ciphers: AEGIS-128 (AES-NI/SSE4.1)"
-	depends on X86 && 64BIT
+	depends on 64BIT
 	select CRYPTO_AEAD
-	select CRYPTO_SIMD
 	help
 	  AEGIS-128 AEAD algorithm
 
@@ -379,7 +348,7 @@ config CRYPTO_AEGIS128_AESNI_SSE2
 
 config CRYPTO_NHPOLY1305_SSE2
 	tristate "Hash functions: NHPoly1305 (SSE2)"
-	depends on X86 && 64BIT
+	depends on 64BIT
 	select CRYPTO_NHPOLY1305
 	help
 	  NHPoly1305 hash function for Adiantum
@@ -389,7 +358,7 @@ config CRYPTO_NHPOLY1305_SSE2
 
 config CRYPTO_NHPOLY1305_AVX2
 	tristate "Hash functions: NHPoly1305 (AVX2)"
-	depends on X86 && 64BIT
+	depends on 64BIT
 	select CRYPTO_NHPOLY1305
 	help
 	  NHPoly1305 hash function for Adiantum
@@ -397,21 +366,9 @@ config CRYPTO_NHPOLY1305_AVX2
 	  Architecture: x86_64 using:
 	  - AVX2 (Advanced Vector Extensions 2)
 
-config CRYPTO_BLAKE2S_X86
-	bool "Hash functions: BLAKE2s (SSSE3/AVX-512)"
-	depends on X86 && 64BIT
-	select CRYPTO_LIB_BLAKE2S_GENERIC
-	select CRYPTO_ARCH_HAVE_LIB_BLAKE2S
-	help
-	  BLAKE2s cryptographic hash function (RFC 7693)
-
-	  Architecture: x86_64 using:
-	  - SSSE3 (Supplemental SSE3)
-	  - AVX-512 (Advanced Vector Extensions-512)
-
 config CRYPTO_POLYVAL_CLMUL_NI
 	tristate "Hash functions: POLYVAL (CLMUL-NI)"
-	depends on X86 && 64BIT
+	depends on 64BIT
 	select CRYPTO_POLYVAL
 	help
 	  POLYVAL hash function for HCTR2
@@ -419,23 +376,9 @@ config CRYPTO_POLYVAL_CLMUL_NI
 	  Architecture: x86_64 using:
 	  - CLMUL-NI (carry-less multiplication new instructions)
 
-config CRYPTO_POLY1305_X86_64
-	tristate
-	depends on X86 && 64BIT
-	select CRYPTO_HASH
-	select CRYPTO_LIB_POLY1305_GENERIC
-	select CRYPTO_ARCH_HAVE_LIB_POLY1305
-	default CRYPTO_LIB_POLY1305_INTERNAL
-	help
-	  Poly1305 authenticator algorithm (RFC7539)
-
-	  Architecture: x86_64 using:
-	  - SSE2 (Streaming SIMD Extensions 2)
-	  - AVX2 (Advanced Vector Extensions 2)
-
 config CRYPTO_SHA1_SSSE3
 	tristate "Hash functions: SHA-1 (SSSE3/AVX/AVX2/SHA-NI)"
-	depends on X86 && 64BIT
+	depends on 64BIT
 	select CRYPTO_SHA1
 	select CRYPTO_HASH
 	help
@@ -447,23 +390,9 @@ config CRYPTO_SHA1_SSSE3
 	  - AVX2 (Advanced Vector Extensions 2)
 	  - SHA-NI (SHA Extensions New Instructions)
 
-config CRYPTO_SHA256_SSSE3
-	tristate "Hash functions: SHA-224 and SHA-256 (SSSE3/AVX/AVX2/SHA-NI)"
-	depends on X86 && 64BIT
-	select CRYPTO_SHA256
-	select CRYPTO_HASH
-	help
-	  SHA-224 and SHA-256 secure hash algorithms (FIPS 180)
-
-	  Architecture: x86_64 using:
-	  - SSSE3 (Supplemental SSE3)
-	  - AVX (Advanced Vector Extensions)
-	  - AVX2 (Advanced Vector Extensions 2)
-	  - SHA-NI (SHA Extensions New Instructions)
-
 config CRYPTO_SHA512_SSSE3
 	tristate "Hash functions: SHA-384 and SHA-512 (SSSE3/AVX/AVX2)"
-	depends on X86 && 64BIT
+	depends on 64BIT
 	select CRYPTO_SHA512
 	select CRYPTO_HASH
 	help
@@ -476,9 +405,9 @@ config CRYPTO_SHA512_SSSE3
 
 config CRYPTO_SM3_AVX_X86_64
 	tristate "Hash functions: SM3 (AVX)"
-	depends on X86 && 64BIT
+	depends on 64BIT
 	select CRYPTO_HASH
-	select CRYPTO_SM3
+	select CRYPTO_LIB_SM3
 	help
 	  SM3 secure hash function as defined by OSCCA GM/T 0004-2012 SM3
 
@@ -489,7 +418,7 @@ config CRYPTO_SM3_AVX_X86_64
 
 config CRYPTO_GHASH_CLMUL_NI_INTEL
 	tristate "Hash functions: GHASH (CLMUL-NI)"
-	depends on X86 && 64BIT
+	depends on 64BIT
 	select CRYPTO_CRYPTD
 	help
 	  GCM GHASH hash function (NIST SP800-38D)
diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile
index 5d19f41bde58..aa289a9e0153 100644
--- a/arch/x86/crypto/Makefile
+++ b/arch/x86/crypto/Makefile
@@ -42,10 +42,6 @@ cast6-avx-x86_64-y := cast6-avx-x86_64-asm_64.o cast6_avx_glue.o
 obj-$(CONFIG_CRYPTO_AEGIS128_AESNI_SSE2) += aegis128-aesni.o
 aegis128-aesni-y := aegis128-aesni-asm.o aegis128-aesni-glue.o
 
-obj-$(CONFIG_CRYPTO_CHACHA20_X86_64) += chacha-x86_64.o
-chacha-x86_64-y := chacha-avx2-x86_64.o chacha-ssse3-x86_64.o chacha_glue.o
-chacha-x86_64-$(CONFIG_AS_AVX512) += chacha-avx512vl-x86_64.o
-
 obj-$(CONFIG_CRYPTO_AES_NI_INTEL) += aesni-intel.o
 aesni-intel-y := aesni-intel_asm.o aesni-intel_glue.o
 aesni-intel-$(CONFIG_64BIT) += aes-ctr-avx-x86_64.o \
@@ -56,29 +52,17 @@ aesni-intel-$(CONFIG_64BIT) += aes-gcm-avx10-x86_64.o
 endif
 
 obj-$(CONFIG_CRYPTO_SHA1_SSSE3) += sha1-ssse3.o
-sha1-ssse3-y := sha1_avx2_x86_64_asm.o sha1_ssse3_asm.o sha1_ssse3_glue.o
-sha1-ssse3-$(CONFIG_AS_SHA1_NI) += sha1_ni_asm.o
-
-obj-$(CONFIG_CRYPTO_SHA256_SSSE3) += sha256-ssse3.o
-sha256-ssse3-y := sha256-ssse3-asm.o sha256-avx-asm.o sha256-avx2-asm.o sha256_ssse3_glue.o
-sha256-ssse3-$(CONFIG_AS_SHA256_NI) += sha256_ni_asm.o
+sha1-ssse3-y := sha1_avx2_x86_64_asm.o sha1_ssse3_asm.o sha1_ni_asm.o sha1_ssse3_glue.o
 
 obj-$(CONFIG_CRYPTO_SHA512_SSSE3) += sha512-ssse3.o
 sha512-ssse3-y := sha512-ssse3-asm.o sha512-avx-asm.o sha512-avx2-asm.o sha512_ssse3_glue.o
 
-obj-$(CONFIG_CRYPTO_BLAKE2S_X86) += libblake2s-x86_64.o
-libblake2s-x86_64-y := blake2s-core.o blake2s-glue.o
-
 obj-$(CONFIG_CRYPTO_GHASH_CLMUL_NI_INTEL) += ghash-clmulni-intel.o
 ghash-clmulni-intel-y := ghash-clmulni-intel_asm.o ghash-clmulni-intel_glue.o
 
 obj-$(CONFIG_CRYPTO_POLYVAL_CLMUL_NI) += polyval-clmulni.o
 polyval-clmulni-y := polyval-clmulni_asm.o polyval-clmulni_glue.o
 
-obj-$(CONFIG_CRYPTO_POLY1305_X86_64) += poly1305-x86_64.o
-poly1305-x86_64-y := poly1305-x86_64-cryptogams.o poly1305_glue.o
-targets += poly1305-x86_64-cryptogams.S
-
 obj-$(CONFIG_CRYPTO_NHPOLY1305_SSE2) += nhpoly1305-sse2.o
 nhpoly1305-sse2-y := nh-sse2-x86_64.o nhpoly1305-sse2-glue.o
 obj-$(CONFIG_CRYPTO_NHPOLY1305_AVX2) += nhpoly1305-avx2.o
@@ -104,10 +88,5 @@ aria-aesni-avx2-x86_64-y := aria-aesni-avx2-asm_64.o aria_aesni_avx2_glue.o
 obj-$(CONFIG_CRYPTO_ARIA_GFNI_AVX512_X86_64) += aria-gfni-avx512-x86_64.o
 aria-gfni-avx512-x86_64-y := aria-gfni-avx512-asm_64.o aria_gfni_avx512_glue.o
 
-quiet_cmd_perlasm = PERLASM $@
-      cmd_perlasm = $(PERL) $< > $@
-$(obj)/%.S: $(src)/%.pl FORCE
-	$(call if_changed,perlasm)
-
 # Disable GCOV in odd or sensitive code
 GCOV_PROFILE_curve25519-x86_64.o := n
diff --git a/arch/x86/crypto/aegis128-aesni-glue.c b/arch/x86/crypto/aegis128-aesni-glue.c
index 26786e15abac..f1b6d40154e3 100644
--- a/arch/x86/crypto/aegis128-aesni-glue.c
+++ b/arch/x86/crypto/aegis128-aesni-glue.c
@@ -8,7 +8,6 @@
  */
 
 #include <crypto/internal/aead.h>
-#include <crypto/internal/simd.h>
 #include <crypto/internal/skcipher.h>
 #include <crypto/scatterwalk.h>
 #include <linux/module.h>
@@ -233,21 +232,18 @@ static struct aead_alg crypto_aegis128_aesni_alg = {
 	.chunksize = AEGIS128_BLOCK_SIZE,
 
 	.base = {
-		.cra_flags = CRYPTO_ALG_INTERNAL,
 		.cra_blocksize = 1,
 		.cra_ctxsize = sizeof(struct aegis_ctx) +
 			       __alignof__(struct aegis_ctx),
 		.cra_priority = 400,
 
-		.cra_name = "__aegis128",
-		.cra_driver_name = "__aegis128-aesni",
+		.cra_name = "aegis128",
+		.cra_driver_name = "aegis128-aesni",
 
 		.cra_module = THIS_MODULE,
 	}
 };
 
-static struct simd_aead_alg *simd_alg;
-
 static int __init crypto_aegis128_aesni_module_init(void)
 {
 	if (!boot_cpu_has(X86_FEATURE_XMM4_1) ||
@@ -255,13 +251,12 @@ static int __init crypto_aegis128_aesni_module_init(void)
 	    !cpu_has_xfeatures(XFEATURE_MASK_SSE, NULL))
 		return -ENODEV;
 
-	return simd_register_aeads_compat(&crypto_aegis128_aesni_alg, 1,
-					  &simd_alg);
+	return crypto_register_aead(&crypto_aegis128_aesni_alg);
 }
 
 static void __exit crypto_aegis128_aesni_module_exit(void)
 {
-	simd_unregister_aeads(&crypto_aegis128_aesni_alg, 1, &simd_alg);
+	crypto_unregister_aead(&crypto_aegis128_aesni_alg);
 }
 
 module_init(crypto_aegis128_aesni_module_init);
diff --git a/arch/x86/crypto/aes-ctr-avx-x86_64.S b/arch/x86/crypto/aes-ctr-avx-x86_64.S
index 1685d8b24b2c..bbbfd80f5a50 100644
--- a/arch/x86/crypto/aes-ctr-avx-x86_64.S
+++ b/arch/x86/crypto/aes-ctr-avx-x86_64.S
@@ -48,8 +48,7 @@
 // using the following sets of CPU features:
 //	- AES-NI && AVX
 //	- VAES && AVX2
-//	- VAES && (AVX10/256 || (AVX512BW && AVX512VL)) && BMI2
-//	- VAES && (AVX10/512 || (AVX512BW && AVX512VL)) && BMI2
+//	- VAES && AVX512BW && AVX512VL && BMI2
 //
 // See the function definitions at the bottom of the file for more information.
 
@@ -76,7 +75,6 @@
 .text
 
 // Move a vector between memory and a register.
-// The register operand must be in the first 16 vector registers.
 .macro	_vmovdqu	src, dst
 .if VL < 64
 	vmovdqu		\src, \dst
@@ -86,7 +84,6 @@
 .endm
 
 // Move a vector between registers.
-// The registers must be in the first 16 vector registers.
 .macro	_vmovdqa	src, dst
 .if VL < 64
 	vmovdqa		\src, \dst
@@ -96,7 +93,7 @@
 .endm
 
 // Broadcast a 128-bit value from memory to all 128-bit lanes of a vector
-// register.  The register operand must be in the first 16 vector registers.
+// register.
 .macro	_vbroadcast128	src, dst
 .if VL == 16
 	vmovdqu		\src, \dst
@@ -108,7 +105,6 @@
 .endm
 
 // XOR two vectors together.
-// Any register operands must be in the first 16 vector registers.
 .macro	_vpxor	src1, src2, dst
 .if VL < 64
 	vpxor		\src1, \src2, \dst
@@ -199,8 +195,8 @@
 // XOR each with the zero-th round key.  Also update LE_CTR if !\final.
 .macro	_prepare_2_ctr_vecs	is_xctr, i0, i1, final=0
 .if \is_xctr
-  .if USE_AVX10
-	_vmovdqa	LE_CTR, AESDATA\i0
+  .if USE_AVX512
+	vmovdqa64	LE_CTR, AESDATA\i0
 	vpternlogd	$0x96, XCTR_IV, RNDKEY0, AESDATA\i0
   .else
 	vpxor		XCTR_IV, LE_CTR, AESDATA\i0
@@ -208,7 +204,7 @@
   .endif
 	vpaddq		LE_CTR_INC1, LE_CTR, AESDATA\i1
 
-  .if USE_AVX10
+  .if USE_AVX512
 	vpternlogd	$0x96, XCTR_IV, RNDKEY0, AESDATA\i1
   .else
 	vpxor		XCTR_IV, AESDATA\i1, AESDATA\i1
@@ -481,18 +477,12 @@
 .Lxor_tail_partial_vec_0\@:
 	// XOR the remaining 1 <= LEN < VL bytes.  It's easy if masked
 	// loads/stores are available; otherwise it's a bit harder...
-.if USE_AVX10
-  .if VL <= 32
-	mov		$-1, %eax
-	bzhi		LEN, %eax, %eax
-	kmovd		%eax, %k1
-  .else
+.if USE_AVX512
 	mov		$-1, %rax
 	bzhi		LEN64, %rax, %rax
 	kmovq		%rax, %k1
-  .endif
 	vmovdqu8	(SRC), AESDATA1{%k1}{z}
-	_vpxor		AESDATA1, AESDATA0, AESDATA0
+	vpxord		AESDATA1, AESDATA0, AESDATA0
 	vmovdqu8	AESDATA0, (DST){%k1}
 .else
   .if VL == 32
@@ -554,7 +544,7 @@
 // eliminates carries.  |ctr| is the per-message block counter starting at 1.
 
 .set	VL, 16
-.set	USE_AVX10, 0
+.set	USE_AVX512, 0
 SYM_TYPED_FUNC_START(aes_ctr64_crypt_aesni_avx)
 	_aes_ctr_crypt	0
 SYM_FUNC_END(aes_ctr64_crypt_aesni_avx)
@@ -564,7 +554,7 @@ SYM_FUNC_END(aes_xctr_crypt_aesni_avx)
 
 #if defined(CONFIG_AS_VAES) && defined(CONFIG_AS_VPCLMULQDQ)
 .set	VL, 32
-.set	USE_AVX10, 0
+.set	USE_AVX512, 0
 SYM_TYPED_FUNC_START(aes_ctr64_crypt_vaes_avx2)
 	_aes_ctr_crypt	0
 SYM_FUNC_END(aes_ctr64_crypt_vaes_avx2)
@@ -572,21 +562,12 @@ SYM_TYPED_FUNC_START(aes_xctr_crypt_vaes_avx2)
 	_aes_ctr_crypt	1
 SYM_FUNC_END(aes_xctr_crypt_vaes_avx2)
 
-.set	VL, 32
-.set	USE_AVX10, 1
-SYM_TYPED_FUNC_START(aes_ctr64_crypt_vaes_avx10_256)
-	_aes_ctr_crypt	0
-SYM_FUNC_END(aes_ctr64_crypt_vaes_avx10_256)
-SYM_TYPED_FUNC_START(aes_xctr_crypt_vaes_avx10_256)
-	_aes_ctr_crypt	1
-SYM_FUNC_END(aes_xctr_crypt_vaes_avx10_256)
-
 .set	VL, 64
-.set	USE_AVX10, 1
-SYM_TYPED_FUNC_START(aes_ctr64_crypt_vaes_avx10_512)
+.set	USE_AVX512, 1
+SYM_TYPED_FUNC_START(aes_ctr64_crypt_vaes_avx512)
 	_aes_ctr_crypt	0
-SYM_FUNC_END(aes_ctr64_crypt_vaes_avx10_512)
-SYM_TYPED_FUNC_START(aes_xctr_crypt_vaes_avx10_512)
+SYM_FUNC_END(aes_ctr64_crypt_vaes_avx512)
+SYM_TYPED_FUNC_START(aes_xctr_crypt_vaes_avx512)
 	_aes_ctr_crypt	1
-SYM_FUNC_END(aes_xctr_crypt_vaes_avx10_512)
+SYM_FUNC_END(aes_xctr_crypt_vaes_avx512)
 #endif // CONFIG_AS_VAES && CONFIG_AS_VPCLMULQDQ
diff --git a/arch/x86/crypto/aes-xts-avx-x86_64.S b/arch/x86/crypto/aes-xts-avx-x86_64.S
index 93ba0ddbe009..db79cdf81588 100644
--- a/arch/x86/crypto/aes-xts-avx-x86_64.S
+++ b/arch/x86/crypto/aes-xts-avx-x86_64.S
@@ -52,32 +52,25 @@
  * different code, it uses a macro to generate several implementations that
  * share similar source code but are targeted at different CPUs, listed below:
  *
- * AES-NI + AVX
+ * AES-NI && AVX
  *    - 128-bit vectors (1 AES block per vector)
  *    - VEX-coded instructions
  *    - xmm0-xmm15
  *    - This is for older CPUs that lack VAES but do have AVX.
  *
- * VAES + VPCLMULQDQ + AVX2
+ * VAES && VPCLMULQDQ && AVX2
  *    - 256-bit vectors (2 AES blocks per vector)
  *    - VEX-coded instructions
  *    - ymm0-ymm15
- *    - This is for CPUs that have VAES but lack AVX512 or AVX10,
- *      e.g. Intel's Alder Lake and AMD's Zen 3.
+ *    - This is for CPUs that have VAES but either lack AVX512 (e.g. Intel's
+ *      Alder Lake and AMD's Zen 3) or downclock too eagerly when using zmm
+ *      registers (e.g. Intel's Ice Lake).
  *
- * VAES + VPCLMULQDQ + AVX10/256 + BMI2
- *    - 256-bit vectors (2 AES blocks per vector)
+ * VAES && VPCLMULQDQ && AVX512BW && AVX512VL && BMI2
+ *    - 512-bit vectors (4 AES blocks per vector)
  *    - EVEX-coded instructions
- *    - ymm0-ymm31
- *    - This is for CPUs that have AVX512 but where using zmm registers causes
- *      downclocking, and for CPUs that have AVX10/256 but not AVX10/512.
- *    - By "AVX10/256" we really mean (AVX512BW + AVX512VL) || AVX10/256.
- *      To avoid confusion with 512-bit, we just write AVX10/256.
- *
- * VAES + VPCLMULQDQ + AVX10/512 + BMI2
- *    - Same as the previous one, but upgrades to 512-bit vectors
- *      (4 AES blocks per vector) in zmm0-zmm31.
- *    - This is for CPUs that have good AVX512 or AVX10/512 support.
+ *    - zmm0-zmm31
+ *    - This is for CPUs that have good AVX512 support.
  *
  * This file doesn't have an implementation for AES-NI alone (without AVX), as
  * the lack of VEX would make all the assembly code different.
@@ -107,9 +100,20 @@
 	// exists when there's a carry out of the low 64 bits of the tweak.
 	.quad	0x87, 1
 
+	// These are the shift amounts that are needed when multiplying by [x^0,
+	// x^1, x^2, x^3] to compute the first vector of tweaks when VL=64.
+	//
+	// The right shifts by 64 are expected to zeroize the destination.
+	// 'vpsrlvq' is indeed defined to do that; i.e. it doesn't truncate the
+	// amount to 64 & 63 = 0 like the 'shr' scalar shift instruction would.
+.Lrshift_amounts:
+	.byte	64, 64, 63, 63, 62, 62, 61, 61
+.Llshift_amounts:
+	.byte	0, 0, 1, 1, 2, 2, 3, 3
+
 	// This table contains constants for vpshufb and vpblendvb, used to
 	// handle variable byte shifts and blending during ciphertext stealing
-	// on CPUs that don't support AVX10-style masking.
+	// on CPUs that don't support AVX512-style masking.
 .Lcts_permute_table:
 	.byte	0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
 	.byte	0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
@@ -138,7 +142,7 @@
 .irp i, 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
 	_define_Vi	\i
 .endr
-.if USE_AVX10
+.if USE_AVX512
 .irp i, 16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31
 	_define_Vi	\i
 .endr
@@ -193,7 +197,7 @@
 	// keys to the *end* of this register range.  I.e., AES-128 uses
 	// KEY5-KEY14, AES-192 uses KEY3-KEY14, and AES-256 uses KEY1-KEY14.
 	// (All also use KEY0 for the XOR-only "round" at the beginning.)
-.if USE_AVX10
+.if USE_AVX512
 	.set	KEY1_XMM,	%xmm16
 	.set	KEY1,		V16
 	.set	KEY2_XMM,	%xmm17
@@ -227,7 +231,6 @@
 .endm
 
 // Move a vector between memory and a register.
-// The register operand must be in the first 16 vector registers.
 .macro	_vmovdqu	src, dst
 .if VL < 64
 	vmovdqu		\src, \dst
@@ -238,9 +241,9 @@
 
 // Broadcast a 128-bit value into a vector.
 .macro	_vbroadcast128	src, dst
-.if VL == 16 && !USE_AVX10
+.if VL == 16
 	vmovdqu		\src, \dst
-.elseif VL == 32 && !USE_AVX10
+.elseif VL == 32
 	vbroadcasti128	\src, \dst
 .else
 	vbroadcasti32x4	\src, \dst
@@ -248,7 +251,6 @@
 .endm
 
 // XOR two vectors together.
-// Any register operands must be in the first 16 vector registers.
 .macro	_vpxor	src1, src2, dst
 .if VL < 64
 	vpxor		\src1, \src2, \dst
@@ -259,7 +261,7 @@
 
 // XOR three vectors together.
 .macro	_xor3	src1, src2, src3_and_dst
-.if USE_AVX10
+.if USE_AVX512
 	// vpternlogd with immediate 0x96 is a three-argument XOR.
 	vpternlogd	$0x96, \src1, \src2, \src3_and_dst
 .else
@@ -274,7 +276,7 @@
 	vpshufd		$0x13, \src, \tmp
 	vpaddq		\src, \src, \dst
 	vpsrad		$31, \tmp, \tmp
-.if USE_AVX10
+.if USE_AVX512
 	vpternlogd	$0x78, GF_POLY_XMM, \tmp, \dst
 .else
 	vpand		GF_POLY_XMM, \tmp, \tmp
@@ -303,52 +305,75 @@
 // Given the first XTS tweak at (TWEAK), compute the first set of tweaks and
 // store them in the vector registers TWEAK0-TWEAK3.  Clobbers V0-V5.
 .macro	_compute_first_set_of_tweaks
-	vmovdqu		(TWEAK), TWEAK0_XMM
-	_vbroadcast128	.Lgf_poly(%rip), GF_POLY
 .if VL == 16
-	// With VL=16, multiplying by x serially is fastest.
+	vmovdqu		(TWEAK), TWEAK0_XMM
+	vmovdqu		.Lgf_poly(%rip), GF_POLY
 	_next_tweak	TWEAK0, %xmm0, TWEAK1
 	_next_tweak	TWEAK1, %xmm0, TWEAK2
 	_next_tweak	TWEAK2, %xmm0, TWEAK3
-.else
-.if VL == 32
-	// Compute the second block of TWEAK0.
+.elseif VL == 32
+	vmovdqu		(TWEAK), TWEAK0_XMM
+	vbroadcasti128	.Lgf_poly(%rip), GF_POLY
+
+	// Compute the first vector of tweaks.
 	_next_tweak	TWEAK0_XMM, %xmm0, %xmm1
 	vinserti128	$1, %xmm1, TWEAK0, TWEAK0
-.elseif VL == 64
-	// Compute the remaining blocks of TWEAK0.
-	_next_tweak	TWEAK0_XMM, %xmm0, %xmm1
-	_next_tweak	%xmm1, %xmm0, %xmm2
-	_next_tweak	%xmm2, %xmm0, %xmm3
-	vinserti32x4	$1, %xmm1, TWEAK0, TWEAK0
-	vinserti32x4	$2, %xmm2, TWEAK0, TWEAK0
-	vinserti32x4	$3, %xmm3, TWEAK0, TWEAK0
-.endif
-	// Compute TWEAK[1-3] from TWEAK0.
-	vpsrlq		$64 - 1*VL/16, TWEAK0, V0
-	vpsrlq		$64 - 2*VL/16, TWEAK0, V2
-	vpsrlq		$64 - 3*VL/16, TWEAK0, V4
+
+	// Compute the next three vectors of tweaks:
+	//	TWEAK1 = TWEAK0 * [x^2, x^2]
+	//	TWEAK2 = TWEAK0 * [x^4, x^4]
+	//	TWEAK3 = TWEAK0 * [x^6, x^6]
+	vpsrlq		$64 - 2, TWEAK0, V0
+	vpsrlq		$64 - 4, TWEAK0, V2
+	vpsrlq		$64 - 6, TWEAK0, V4
 	vpclmulqdq	$0x01, GF_POLY, V0, V1
 	vpclmulqdq	$0x01, GF_POLY, V2, V3
 	vpclmulqdq	$0x01, GF_POLY, V4, V5
 	vpslldq		$8, V0, V0
 	vpslldq		$8, V2, V2
 	vpslldq		$8, V4, V4
-	vpsllq		$1*VL/16, TWEAK0, TWEAK1
-	vpsllq		$2*VL/16, TWEAK0, TWEAK2
-	vpsllq		$3*VL/16, TWEAK0, TWEAK3
-.if USE_AVX10
-	vpternlogd	$0x96, V0, V1, TWEAK1
-	vpternlogd	$0x96, V2, V3, TWEAK2
-	vpternlogd	$0x96, V4, V5, TWEAK3
-.else
+	vpsllq		$2, TWEAK0, TWEAK1
+	vpsllq		$4, TWEAK0, TWEAK2
+	vpsllq		$6, TWEAK0, TWEAK3
 	vpxor		V0, TWEAK1, TWEAK1
 	vpxor		V2, TWEAK2, TWEAK2
 	vpxor		V4, TWEAK3, TWEAK3
 	vpxor		V1, TWEAK1, TWEAK1
 	vpxor		V3, TWEAK2, TWEAK2
 	vpxor		V5, TWEAK3, TWEAK3
-.endif
+.else
+	vbroadcasti32x4	(TWEAK), TWEAK0
+	vbroadcasti32x4	.Lgf_poly(%rip), GF_POLY
+
+	// Compute the first vector of tweaks:
+	//	TWEAK0 = broadcast128(TWEAK) * [x^0, x^1, x^2, x^3]
+	vpmovzxbq	.Lrshift_amounts(%rip), V4
+	vpsrlvq		V4, TWEAK0, V0
+	vpclmulqdq	$0x01, GF_POLY, V0, V1
+	vpmovzxbq	.Llshift_amounts(%rip), V4
+	vpslldq		$8, V0, V0
+	vpsllvq		V4, TWEAK0, TWEAK0
+	vpternlogd	$0x96, V0, V1, TWEAK0
+
+	// Compute the next three vectors of tweaks:
+	//	TWEAK1 = TWEAK0 * [x^4, x^4, x^4, x^4]
+	//	TWEAK2 = TWEAK0 * [x^8, x^8, x^8, x^8]
+	//	TWEAK3 = TWEAK0 * [x^12, x^12, x^12, x^12]
+	// x^8 only needs byte-aligned shifts, so optimize accordingly.
+	vpsrlq		$64 - 4, TWEAK0, V0
+	vpsrldq		$(64 - 8) / 8, TWEAK0, V2
+	vpsrlq		$64 - 12, TWEAK0, V4
+	vpclmulqdq	$0x01, GF_POLY, V0, V1
+	vpclmulqdq	$0x01, GF_POLY, V2, V3
+	vpclmulqdq	$0x01, GF_POLY, V4, V5
+	vpslldq		$8, V0, V0
+	vpslldq		$8, V4, V4
+	vpsllq		$4, TWEAK0, TWEAK1
+	vpslldq		$8 / 8, TWEAK0, TWEAK2
+	vpsllq		$12, TWEAK0, TWEAK3
+	vpternlogd	$0x96, V0, V1, TWEAK1
+	vpxord		V3, TWEAK2, TWEAK2
+	vpternlogd	$0x96, V4, V5, TWEAK3
 .endif
 .endm
 
@@ -474,26 +499,26 @@
 	lea		OFFS-16(KEY, KEYLEN64, 4), KEY
 
 	// If all 32 SIMD registers are available, cache all the round keys.
-.if USE_AVX10
+.if USE_AVX512
 	cmp		$24, KEYLEN
 	jl		.Laes128\@
 	je		.Laes192\@
-	_vbroadcast128	-6*16(KEY), KEY1
-	_vbroadcast128	-5*16(KEY), KEY2
+	vbroadcasti32x4	-6*16(KEY), KEY1
+	vbroadcasti32x4	-5*16(KEY), KEY2
 .Laes192\@:
-	_vbroadcast128	-4*16(KEY), KEY3
-	_vbroadcast128	-3*16(KEY), KEY4
+	vbroadcasti32x4	-4*16(KEY), KEY3
+	vbroadcasti32x4	-3*16(KEY), KEY4
 .Laes128\@:
-	_vbroadcast128	-2*16(KEY), KEY5
-	_vbroadcast128	-1*16(KEY), KEY6
-	_vbroadcast128	0*16(KEY), KEY7
-	_vbroadcast128	1*16(KEY), KEY8
-	_vbroadcast128	2*16(KEY), KEY9
-	_vbroadcast128	3*16(KEY), KEY10
-	_vbroadcast128	4*16(KEY), KEY11
-	_vbroadcast128	5*16(KEY), KEY12
-	_vbroadcast128	6*16(KEY), KEY13
-	_vbroadcast128	7*16(KEY), KEY14
+	vbroadcasti32x4	-2*16(KEY), KEY5
+	vbroadcasti32x4	-1*16(KEY), KEY6
+	vbroadcasti32x4	0*16(KEY), KEY7
+	vbroadcasti32x4	1*16(KEY), KEY8
+	vbroadcasti32x4	2*16(KEY), KEY9
+	vbroadcasti32x4	3*16(KEY), KEY10
+	vbroadcasti32x4	4*16(KEY), KEY11
+	vbroadcasti32x4	5*16(KEY), KEY12
+	vbroadcasti32x4	6*16(KEY), KEY13
+	vbroadcasti32x4	7*16(KEY), KEY14
 .endif
 .endm
 
@@ -521,7 +546,7 @@
 // using the same key for all block(s).  The round key is loaded from the
 // appropriate register or memory location for round \i.  May clobber \tmp.
 .macro _vaes_1x		enc, i, xmm_suffix, data, tmp
-.if USE_AVX10
+.if USE_AVX512
 	_vaes		\enc, KEY\i\xmm_suffix, \data
 .else
 .ifnb \xmm_suffix
@@ -538,7 +563,7 @@
 // appropriate register or memory location for round \i.  In addition, does two
 // steps of the computation of the next set of tweaks.  May clobber V4 and V5.
 .macro	_vaes_4x	enc, i
-.if USE_AVX10
+.if USE_AVX512
 	_tweak_step	(2*(\i-5))
 	_vaes		\enc, KEY\i, V0
 	_vaes		\enc, KEY\i, V1
@@ -574,7 +599,7 @@
 .irp i, 5,6,7,8,9,10,11,12,13
 	_vaes_1x	\enc, \i, \xmm_suffix, \data, tmp=\tmp
 .endr
-.if USE_AVX10
+.if USE_AVX512
 	vpxord		KEY14\xmm_suffix, \tweak, \tmp
 .else
 .ifnb \xmm_suffix
@@ -617,11 +642,11 @@
 	// This is the main loop, en/decrypting 4*VL bytes per iteration.
 
 	// XOR each source block with its tweak and the zero-th round key.
-.if USE_AVX10
-	_vmovdqu	0*VL(SRC), V0
-	_vmovdqu	1*VL(SRC), V1
-	_vmovdqu	2*VL(SRC), V2
-	_vmovdqu	3*VL(SRC), V3
+.if USE_AVX512
+	vmovdqu8	0*VL(SRC), V0
+	vmovdqu8	1*VL(SRC), V1
+	vmovdqu8	2*VL(SRC), V2
+	vmovdqu8	3*VL(SRC), V3
 	vpternlogd	$0x96, TWEAK0, KEY0, V0
 	vpternlogd	$0x96, TWEAK1, KEY0, V1
 	vpternlogd	$0x96, TWEAK2, KEY0, V2
@@ -654,7 +679,7 @@
 	// Reduce latency by doing the XOR before the vaesenclast, utilizing the
 	// property vaesenclast(key, a) ^ b == vaesenclast(key ^ b, a)
 	// (and likewise for vaesdeclast).
-.if USE_AVX10
+.if USE_AVX512
 	_tweak_step	18
 	_tweak_step	19
 	vpxord		TWEAK0, KEY14, V4
@@ -762,7 +787,7 @@
 	_aes_crypt	\enc, _XMM, TWEAK1_XMM, %xmm0, tmp=%xmm1
 .endif
 
-.if USE_AVX10
+.if USE_AVX512
 	// Create a mask that has the first LEN bits set.
 	mov		$-1, %r9d
 	bzhi		LEN, %r9d, %r9d
@@ -811,7 +836,7 @@
 //			   u8 iv[AES_BLOCK_SIZE]);
 //
 // Encrypt |iv| using the AES key |tweak_key| to get the first tweak.  Assumes
-// that the CPU supports AES-NI and AVX, but not necessarily VAES or AVX10.
+// that the CPU supports AES-NI and AVX, but not necessarily VAES or AVX512.
 SYM_TYPED_FUNC_START(aes_xts_encrypt_iv)
 	.set	TWEAK_KEY,	%rdi
 	.set	IV,		%rsi
@@ -853,7 +878,7 @@ SYM_FUNC_END(aes_xts_encrypt_iv)
 // multiple of 16, then this function updates |tweak| to contain the next tweak.
 
 .set	VL, 16
-.set	USE_AVX10, 0
+.set	USE_AVX512, 0
 SYM_TYPED_FUNC_START(aes_xts_encrypt_aesni_avx)
 	_aes_xts_crypt	1
 SYM_FUNC_END(aes_xts_encrypt_aesni_avx)
@@ -863,7 +888,7 @@ SYM_FUNC_END(aes_xts_decrypt_aesni_avx)
 
 #if defined(CONFIG_AS_VAES) && defined(CONFIG_AS_VPCLMULQDQ)
 .set	VL, 32
-.set	USE_AVX10, 0
+.set	USE_AVX512, 0
 SYM_TYPED_FUNC_START(aes_xts_encrypt_vaes_avx2)
 	_aes_xts_crypt	1
 SYM_FUNC_END(aes_xts_encrypt_vaes_avx2)
@@ -871,21 +896,12 @@ SYM_TYPED_FUNC_START(aes_xts_decrypt_vaes_avx2)
 	_aes_xts_crypt	0
 SYM_FUNC_END(aes_xts_decrypt_vaes_avx2)
 
-.set	VL, 32
-.set	USE_AVX10, 1
-SYM_TYPED_FUNC_START(aes_xts_encrypt_vaes_avx10_256)
-	_aes_xts_crypt	1
-SYM_FUNC_END(aes_xts_encrypt_vaes_avx10_256)
-SYM_TYPED_FUNC_START(aes_xts_decrypt_vaes_avx10_256)
-	_aes_xts_crypt	0
-SYM_FUNC_END(aes_xts_decrypt_vaes_avx10_256)
-
 .set	VL, 64
-.set	USE_AVX10, 1
-SYM_TYPED_FUNC_START(aes_xts_encrypt_vaes_avx10_512)
+.set	USE_AVX512, 1
+SYM_TYPED_FUNC_START(aes_xts_encrypt_vaes_avx512)
 	_aes_xts_crypt	1
-SYM_FUNC_END(aes_xts_encrypt_vaes_avx10_512)
-SYM_TYPED_FUNC_START(aes_xts_decrypt_vaes_avx10_512)
+SYM_FUNC_END(aes_xts_encrypt_vaes_avx512)
+SYM_TYPED_FUNC_START(aes_xts_decrypt_vaes_avx512)
 	_aes_xts_crypt	0
-SYM_FUNC_END(aes_xts_decrypt_vaes_avx10_512)
+SYM_FUNC_END(aes_xts_decrypt_vaes_avx512)
 #endif /* CONFIG_AS_VAES && CONFIG_AS_VPCLMULQDQ */
diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c
index bc655d794a95..061b1ced93c5 100644
--- a/arch/x86/crypto/aesni-intel_glue.c
+++ b/arch/x86/crypto/aesni-intel_glue.c
@@ -566,10 +566,9 @@ static struct crypto_alg aesni_cipher_alg = {
 static struct skcipher_alg aesni_skciphers[] = {
 	{
 		.base = {
-			.cra_name		= "__ecb(aes)",
-			.cra_driver_name	= "__ecb-aes-aesni",
+			.cra_name		= "ecb(aes)",
+			.cra_driver_name	= "ecb-aes-aesni",
 			.cra_priority		= 400,
-			.cra_flags		= CRYPTO_ALG_INTERNAL,
 			.cra_blocksize		= AES_BLOCK_SIZE,
 			.cra_ctxsize		= CRYPTO_AES_CTX_SIZE,
 			.cra_module		= THIS_MODULE,
@@ -581,10 +580,9 @@ static struct skcipher_alg aesni_skciphers[] = {
 		.decrypt	= ecb_decrypt,
 	}, {
 		.base = {
-			.cra_name		= "__cbc(aes)",
-			.cra_driver_name	= "__cbc-aes-aesni",
+			.cra_name		= "cbc(aes)",
+			.cra_driver_name	= "cbc-aes-aesni",
 			.cra_priority		= 400,
-			.cra_flags		= CRYPTO_ALG_INTERNAL,
 			.cra_blocksize		= AES_BLOCK_SIZE,
 			.cra_ctxsize		= CRYPTO_AES_CTX_SIZE,
 			.cra_module		= THIS_MODULE,
@@ -597,10 +595,9 @@ static struct skcipher_alg aesni_skciphers[] = {
 		.decrypt	= cbc_decrypt,
 	}, {
 		.base = {
-			.cra_name		= "__cts(cbc(aes))",
-			.cra_driver_name	= "__cts-cbc-aes-aesni",
+			.cra_name		= "cts(cbc(aes))",
+			.cra_driver_name	= "cts-cbc-aes-aesni",
 			.cra_priority		= 400,
-			.cra_flags		= CRYPTO_ALG_INTERNAL,
 			.cra_blocksize		= AES_BLOCK_SIZE,
 			.cra_ctxsize		= CRYPTO_AES_CTX_SIZE,
 			.cra_module		= THIS_MODULE,
@@ -615,10 +612,9 @@ static struct skcipher_alg aesni_skciphers[] = {
 #ifdef CONFIG_X86_64
 	}, {
 		.base = {
-			.cra_name		= "__ctr(aes)",
-			.cra_driver_name	= "__ctr-aes-aesni",
+			.cra_name		= "ctr(aes)",
+			.cra_driver_name	= "ctr-aes-aesni",
 			.cra_priority		= 400,
-			.cra_flags		= CRYPTO_ALG_INTERNAL,
 			.cra_blocksize		= 1,
 			.cra_ctxsize		= CRYPTO_AES_CTX_SIZE,
 			.cra_module		= THIS_MODULE,
@@ -633,10 +629,9 @@ static struct skcipher_alg aesni_skciphers[] = {
 #endif
 	}, {
 		.base = {
-			.cra_name		= "__xts(aes)",
-			.cra_driver_name	= "__xts-aes-aesni",
+			.cra_name		= "xts(aes)",
+			.cra_driver_name	= "xts-aes-aesni",
 			.cra_priority		= 401,
-			.cra_flags		= CRYPTO_ALG_INTERNAL,
 			.cra_blocksize		= AES_BLOCK_SIZE,
 			.cra_ctxsize		= XTS_AES_CTX_SIZE,
 			.cra_module		= THIS_MODULE,
@@ -651,9 +646,6 @@ static struct skcipher_alg aesni_skciphers[] = {
 	}
 };
 
-static
-struct simd_skcipher_alg *aesni_simd_skciphers[ARRAY_SIZE(aesni_skciphers)];
-
 #ifdef CONFIG_X86_64
 asmlinkage void aes_xts_encrypt_iv(const struct crypto_aes_ctx *tweak_key,
 				   u8 iv[AES_BLOCK_SIZE]);
@@ -792,10 +784,9 @@ static int xctr_crypt_##suffix(struct skcipher_request *req)		       \
 }									       \
 									       \
 static struct skcipher_alg skcipher_algs_##suffix[] = {{		       \
-	.base.cra_name		= "__xts(aes)",				       \
-	.base.cra_driver_name	= "__xts-aes-" driver_name_suffix,	       \
+	.base.cra_name		= "xts(aes)",				       \
+	.base.cra_driver_name	= "xts-aes-" driver_name_suffix,	       \
 	.base.cra_priority	= priority,				       \
-	.base.cra_flags		= CRYPTO_ALG_INTERNAL,			       \
 	.base.cra_blocksize	= AES_BLOCK_SIZE,			       \
 	.base.cra_ctxsize	= XTS_AES_CTX_SIZE,			       \
 	.base.cra_module	= THIS_MODULE,				       \
@@ -807,10 +798,9 @@ static struct skcipher_alg skcipher_algs_##suffix[] = {{		       \
 	.encrypt		= xts_encrypt_##suffix,			       \
 	.decrypt		= xts_decrypt_##suffix,			       \
 }, {									       \
-	.base.cra_name		= "__ctr(aes)",				       \
-	.base.cra_driver_name	= "__ctr-aes-" driver_name_suffix,	       \
+	.base.cra_name		= "ctr(aes)",				       \
+	.base.cra_driver_name	= "ctr-aes-" driver_name_suffix,	       \
 	.base.cra_priority	= priority,				       \
-	.base.cra_flags		= CRYPTO_ALG_INTERNAL,			       \
 	.base.cra_blocksize	= 1,					       \
 	.base.cra_ctxsize	= CRYPTO_AES_CTX_SIZE,			       \
 	.base.cra_module	= THIS_MODULE,				       \
@@ -822,10 +812,9 @@ static struct skcipher_alg skcipher_algs_##suffix[] = {{		       \
 	.encrypt		= ctr_crypt_##suffix,			       \
 	.decrypt		= ctr_crypt_##suffix,			       \
 }, {									       \
-	.base.cra_name		= "__xctr(aes)",			       \
-	.base.cra_driver_name	= "__xctr-aes-" driver_name_suffix,	       \
+	.base.cra_name		= "xctr(aes)",				       \
+	.base.cra_driver_name	= "xctr-aes-" driver_name_suffix,	       \
 	.base.cra_priority	= priority,				       \
-	.base.cra_flags		= CRYPTO_ALG_INTERNAL,			       \
 	.base.cra_blocksize	= 1,					       \
 	.base.cra_ctxsize	= CRYPTO_AES_CTX_SIZE,			       \
 	.base.cra_module	= THIS_MODULE,				       \
@@ -836,16 +825,12 @@ static struct skcipher_alg skcipher_algs_##suffix[] = {{		       \
 	.setkey			= aesni_skcipher_setkey,		       \
 	.encrypt		= xctr_crypt_##suffix,			       \
 	.decrypt		= xctr_crypt_##suffix,			       \
-}};									       \
-									       \
-static struct simd_skcipher_alg *					       \
-simd_skcipher_algs_##suffix[ARRAY_SIZE(skcipher_algs_##suffix)]
+}}
 
 DEFINE_AVX_SKCIPHER_ALGS(aesni_avx, "aesni-avx", 500);
 #if defined(CONFIG_AS_VAES) && defined(CONFIG_AS_VPCLMULQDQ)
 DEFINE_AVX_SKCIPHER_ALGS(vaes_avx2, "vaes-avx2", 600);
-DEFINE_AVX_SKCIPHER_ALGS(vaes_avx10_256, "vaes-avx10_256", 700);
-DEFINE_AVX_SKCIPHER_ALGS(vaes_avx10_512, "vaes-avx10_512", 800);
+DEFINE_AVX_SKCIPHER_ALGS(vaes_avx512, "vaes-avx512", 800);
 #endif
 
 /* The common part of the x86_64 AES-GCM key struct */
@@ -1499,10 +1484,9 @@ static struct aead_alg aes_gcm_algs_##suffix[] = { {			       \
 	.chunksize		= AES_BLOCK_SIZE,			       \
 	.maxauthsize		= 16,					       \
 	.base = {							       \
-		.cra_name		= "__gcm(aes)",			       \
-		.cra_driver_name	= "__" generic_driver_name,	       \
+		.cra_name		= "gcm(aes)",			       \
+		.cra_driver_name	= generic_driver_name,		       \
 		.cra_priority		= (priority),			       \
-		.cra_flags		= CRYPTO_ALG_INTERNAL,		       \
 		.cra_blocksize		= 1,				       \
 		.cra_ctxsize		= (ctxsize),			       \
 		.cra_module		= THIS_MODULE,			       \
@@ -1516,17 +1500,14 @@ static struct aead_alg aes_gcm_algs_##suffix[] = { {			       \
 	.chunksize		= AES_BLOCK_SIZE,			       \
 	.maxauthsize		= 16,					       \
 	.base = {							       \
-		.cra_name		= "__rfc4106(gcm(aes))",	       \
-		.cra_driver_name	= "__" rfc_driver_name,		       \
+		.cra_name		= "rfc4106(gcm(aes))",		       \
+		.cra_driver_name	= rfc_driver_name,		       \
 		.cra_priority		= (priority),			       \
-		.cra_flags		= CRYPTO_ALG_INTERNAL,		       \
 		.cra_blocksize		= 1,				       \
 		.cra_ctxsize		= (ctxsize),			       \
 		.cra_module		= THIS_MODULE,			       \
 	},								       \
-} };									       \
-									       \
-static struct simd_aead_alg *aes_gcm_simdalgs_##suffix[2]		       \
+} }
 
 /* aes_gcm_algs_aesni */
 DEFINE_GCM_ALGS(aesni, /* no flags */ 0,
@@ -1556,14 +1537,12 @@ static int __init register_avx_algs(void)
 
 	if (!boot_cpu_has(X86_FEATURE_AVX))
 		return 0;
-	err = simd_register_skciphers_compat(skcipher_algs_aesni_avx,
-					     ARRAY_SIZE(skcipher_algs_aesni_avx),
-					     simd_skcipher_algs_aesni_avx);
+	err = crypto_register_skciphers(skcipher_algs_aesni_avx,
+					ARRAY_SIZE(skcipher_algs_aesni_avx));
 	if (err)
 		return err;
-	err = simd_register_aeads_compat(aes_gcm_algs_aesni_avx,
-					 ARRAY_SIZE(aes_gcm_algs_aesni_avx),
-					 aes_gcm_simdalgs_aesni_avx);
+	err = crypto_register_aeads(aes_gcm_algs_aesni_avx,
+				    ARRAY_SIZE(aes_gcm_algs_aesni_avx));
 	if (err)
 		return err;
 	/*
@@ -1579,9 +1558,8 @@ static int __init register_avx_algs(void)
 	    !boot_cpu_has(X86_FEATURE_PCLMULQDQ) ||
 	    !cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL))
 		return 0;
-	err = simd_register_skciphers_compat(skcipher_algs_vaes_avx2,
-					     ARRAY_SIZE(skcipher_algs_vaes_avx2),
-					     simd_skcipher_algs_vaes_avx2);
+	err = crypto_register_skciphers(skcipher_algs_vaes_avx2,
+					ARRAY_SIZE(skcipher_algs_vaes_avx2));
 	if (err)
 		return err;
 
@@ -1592,76 +1570,52 @@ static int __init register_avx_algs(void)
 			       XFEATURE_MASK_AVX512, NULL))
 		return 0;
 
-	err = simd_register_skciphers_compat(skcipher_algs_vaes_avx10_256,
-					     ARRAY_SIZE(skcipher_algs_vaes_avx10_256),
-					     simd_skcipher_algs_vaes_avx10_256);
-	if (err)
-		return err;
-	err = simd_register_aeads_compat(aes_gcm_algs_vaes_avx10_256,
-					 ARRAY_SIZE(aes_gcm_algs_vaes_avx10_256),
-					 aes_gcm_simdalgs_vaes_avx10_256);
+	err = crypto_register_aeads(aes_gcm_algs_vaes_avx10_256,
+				    ARRAY_SIZE(aes_gcm_algs_vaes_avx10_256));
 	if (err)
 		return err;
 
 	if (boot_cpu_has(X86_FEATURE_PREFER_YMM)) {
 		int i;
 
-		for (i = 0; i < ARRAY_SIZE(skcipher_algs_vaes_avx10_512); i++)
-			skcipher_algs_vaes_avx10_512[i].base.cra_priority = 1;
+		for (i = 0; i < ARRAY_SIZE(skcipher_algs_vaes_avx512); i++)
+			skcipher_algs_vaes_avx512[i].base.cra_priority = 1;
 		for (i = 0; i < ARRAY_SIZE(aes_gcm_algs_vaes_avx10_512); i++)
 			aes_gcm_algs_vaes_avx10_512[i].base.cra_priority = 1;
 	}
 
-	err = simd_register_skciphers_compat(skcipher_algs_vaes_avx10_512,
-					     ARRAY_SIZE(skcipher_algs_vaes_avx10_512),
-					     simd_skcipher_algs_vaes_avx10_512);
+	err = crypto_register_skciphers(skcipher_algs_vaes_avx512,
+					ARRAY_SIZE(skcipher_algs_vaes_avx512));
 	if (err)
 		return err;
-	err = simd_register_aeads_compat(aes_gcm_algs_vaes_avx10_512,
-					 ARRAY_SIZE(aes_gcm_algs_vaes_avx10_512),
-					 aes_gcm_simdalgs_vaes_avx10_512);
+	err = crypto_register_aeads(aes_gcm_algs_vaes_avx10_512,
+				    ARRAY_SIZE(aes_gcm_algs_vaes_avx10_512));
 	if (err)
 		return err;
 #endif /* CONFIG_AS_VAES && CONFIG_AS_VPCLMULQDQ */
 	return 0;
 }
 
+#define unregister_skciphers(A) do { /* skip if first alg's refcnt is 0 */ \
+	if (refcount_read(&(A)[0].base.cra_refcnt) != 0) \
+		crypto_unregister_skciphers((A), ARRAY_SIZE(A)); } while (0)
+#define unregister_aeads(A) do { /* skip if first alg's refcnt is 0 */ \
+	if (refcount_read(&(A)[0].base.cra_refcnt) != 0) \
+		crypto_unregister_aeads((A), ARRAY_SIZE(A)); } while (0)
+
 static void unregister_avx_algs(void)
 {
-	if (simd_skcipher_algs_aesni_avx[0])
-		simd_unregister_skciphers(skcipher_algs_aesni_avx,
-					  ARRAY_SIZE(skcipher_algs_aesni_avx),
-					  simd_skcipher_algs_aesni_avx);
-	if (aes_gcm_simdalgs_aesni_avx[0])
-		simd_unregister_aeads(aes_gcm_algs_aesni_avx,
-				      ARRAY_SIZE(aes_gcm_algs_aesni_avx),
-				      aes_gcm_simdalgs_aesni_avx);
+	unregister_skciphers(skcipher_algs_aesni_avx);
+	unregister_aeads(aes_gcm_algs_aesni_avx);
 #if defined(CONFIG_AS_VAES) && defined(CONFIG_AS_VPCLMULQDQ)
-	if (simd_skcipher_algs_vaes_avx2[0])
-		simd_unregister_skciphers(skcipher_algs_vaes_avx2,
-					  ARRAY_SIZE(skcipher_algs_vaes_avx2),
-					  simd_skcipher_algs_vaes_avx2);
-	if (simd_skcipher_algs_vaes_avx10_256[0])
-		simd_unregister_skciphers(skcipher_algs_vaes_avx10_256,
-					  ARRAY_SIZE(skcipher_algs_vaes_avx10_256),
-					  simd_skcipher_algs_vaes_avx10_256);
-	if (aes_gcm_simdalgs_vaes_avx10_256[0])
-		simd_unregister_aeads(aes_gcm_algs_vaes_avx10_256,
-				      ARRAY_SIZE(aes_gcm_algs_vaes_avx10_256),
-				      aes_gcm_simdalgs_vaes_avx10_256);
-	if (simd_skcipher_algs_vaes_avx10_512[0])
-		simd_unregister_skciphers(skcipher_algs_vaes_avx10_512,
-					  ARRAY_SIZE(skcipher_algs_vaes_avx10_512),
-					  simd_skcipher_algs_vaes_avx10_512);
-	if (aes_gcm_simdalgs_vaes_avx10_512[0])
-		simd_unregister_aeads(aes_gcm_algs_vaes_avx10_512,
-				      ARRAY_SIZE(aes_gcm_algs_vaes_avx10_512),
-				      aes_gcm_simdalgs_vaes_avx10_512);
+	unregister_skciphers(skcipher_algs_vaes_avx2);
+	unregister_skciphers(skcipher_algs_vaes_avx512);
+	unregister_aeads(aes_gcm_algs_vaes_avx10_256);
+	unregister_aeads(aes_gcm_algs_vaes_avx10_512);
 #endif
 }
 #else /* CONFIG_X86_64 */
 static struct aead_alg aes_gcm_algs_aesni[0];
-static struct simd_aead_alg *aes_gcm_simdalgs_aesni[0];
 
 static int __init register_avx_algs(void)
 {
@@ -1690,15 +1644,13 @@ static int __init aesni_init(void)
 	if (err)
 		return err;
 
-	err = simd_register_skciphers_compat(aesni_skciphers,
-					     ARRAY_SIZE(aesni_skciphers),
-					     aesni_simd_skciphers);
+	err = crypto_register_skciphers(aesni_skciphers,
+					ARRAY_SIZE(aesni_skciphers));
 	if (err)
 		goto unregister_cipher;
 
-	err = simd_register_aeads_compat(aes_gcm_algs_aesni,
-					 ARRAY_SIZE(aes_gcm_algs_aesni),
-					 aes_gcm_simdalgs_aesni);
+	err = crypto_register_aeads(aes_gcm_algs_aesni,
+				    ARRAY_SIZE(aes_gcm_algs_aesni));
 	if (err)
 		goto unregister_skciphers;
 
@@ -1710,12 +1662,11 @@ static int __init aesni_init(void)
 
 unregister_avx:
 	unregister_avx_algs();
-	simd_unregister_aeads(aes_gcm_algs_aesni,
-			      ARRAY_SIZE(aes_gcm_algs_aesni),
-			      aes_gcm_simdalgs_aesni);
+	crypto_unregister_aeads(aes_gcm_algs_aesni,
+				ARRAY_SIZE(aes_gcm_algs_aesni));
 unregister_skciphers:
-	simd_unregister_skciphers(aesni_skciphers, ARRAY_SIZE(aesni_skciphers),
-				  aesni_simd_skciphers);
+	crypto_unregister_skciphers(aesni_skciphers,
+				    ARRAY_SIZE(aesni_skciphers));
 unregister_cipher:
 	crypto_unregister_alg(&aesni_cipher_alg);
 	return err;
@@ -1723,11 +1674,10 @@ unregister_cipher:
 
 static void __exit aesni_exit(void)
 {
-	simd_unregister_aeads(aes_gcm_algs_aesni,
-			      ARRAY_SIZE(aes_gcm_algs_aesni),
-			      aes_gcm_simdalgs_aesni);
-	simd_unregister_skciphers(aesni_skciphers, ARRAY_SIZE(aesni_skciphers),
-				  aesni_simd_skciphers);
+	crypto_unregister_aeads(aes_gcm_algs_aesni,
+				ARRAY_SIZE(aes_gcm_algs_aesni));
+	crypto_unregister_skciphers(aesni_skciphers,
+				    ARRAY_SIZE(aesni_skciphers));
 	crypto_unregister_alg(&aesni_cipher_alg);
 	unregister_avx_algs();
 }
diff --git a/arch/x86/crypto/aria_aesni_avx2_glue.c b/arch/x86/crypto/aria_aesni_avx2_glue.c
index 87a11804fc77..b4bddcd58457 100644
--- a/arch/x86/crypto/aria_aesni_avx2_glue.c
+++ b/arch/x86/crypto/aria_aesni_avx2_glue.c
@@ -6,7 +6,6 @@
  */
 
 #include <crypto/algapi.h>
-#include <crypto/internal/simd.h>
 #include <crypto/aria.h>
 #include <linux/crypto.h>
 #include <linux/err.h>
@@ -165,10 +164,9 @@ static int aria_avx2_init_tfm(struct crypto_skcipher *tfm)
 
 static struct skcipher_alg aria_algs[] = {
 	{
-		.base.cra_name		= "__ecb(aria)",
-		.base.cra_driver_name	= "__ecb-aria-avx2",
+		.base.cra_name		= "ecb(aria)",
+		.base.cra_driver_name	= "ecb-aria-avx2",
 		.base.cra_priority	= 500,
-		.base.cra_flags		= CRYPTO_ALG_INTERNAL,
 		.base.cra_blocksize	= ARIA_BLOCK_SIZE,
 		.base.cra_ctxsize	= sizeof(struct aria_ctx),
 		.base.cra_module	= THIS_MODULE,
@@ -178,11 +176,10 @@ static struct skcipher_alg aria_algs[] = {
 		.encrypt		= aria_avx2_ecb_encrypt,
 		.decrypt		= aria_avx2_ecb_decrypt,
 	}, {
-		.base.cra_name		= "__ctr(aria)",
-		.base.cra_driver_name	= "__ctr-aria-avx2",
+		.base.cra_name		= "ctr(aria)",
+		.base.cra_driver_name	= "ctr-aria-avx2",
 		.base.cra_priority	= 500,
-		.base.cra_flags		= CRYPTO_ALG_INTERNAL |
-					  CRYPTO_ALG_SKCIPHER_REQSIZE_LARGE,
+		.base.cra_flags		= CRYPTO_ALG_SKCIPHER_REQSIZE_LARGE,
 		.base.cra_blocksize	= 1,
 		.base.cra_ctxsize	= sizeof(struct aria_ctx),
 		.base.cra_module	= THIS_MODULE,
@@ -197,8 +194,6 @@ static struct skcipher_alg aria_algs[] = {
 	}
 };
 
-static struct simd_skcipher_alg *aria_simd_algs[ARRAY_SIZE(aria_algs)];
-
 static int __init aria_avx2_init(void)
 {
 	const char *feature_name;
@@ -233,15 +228,12 @@ static int __init aria_avx2_init(void)
 		aria_ops.aria_ctr_crypt_32way = aria_aesni_avx2_ctr_crypt_32way;
 	}
 
-	return simd_register_skciphers_compat(aria_algs,
-					      ARRAY_SIZE(aria_algs),
-					      aria_simd_algs);
+	return crypto_register_skciphers(aria_algs, ARRAY_SIZE(aria_algs));
 }
 
 static void __exit aria_avx2_exit(void)
 {
-	simd_unregister_skciphers(aria_algs, ARRAY_SIZE(aria_algs),
-				  aria_simd_algs);
+	crypto_unregister_skciphers(aria_algs, ARRAY_SIZE(aria_algs));
 }
 
 module_init(aria_avx2_init);
diff --git a/arch/x86/crypto/aria_aesni_avx_glue.c b/arch/x86/crypto/aria_aesni_avx_glue.c
index 4e1516b76669..ab9b38d05332 100644
--- a/arch/x86/crypto/aria_aesni_avx_glue.c
+++ b/arch/x86/crypto/aria_aesni_avx_glue.c
@@ -6,7 +6,6 @@
  */
 
 #include <crypto/algapi.h>
-#include <crypto/internal/simd.h>
 #include <crypto/aria.h>
 #include <linux/crypto.h>
 #include <linux/err.h>
@@ -152,10 +151,9 @@ static int aria_avx_init_tfm(struct crypto_skcipher *tfm)
 
 static struct skcipher_alg aria_algs[] = {
 	{
-		.base.cra_name		= "__ecb(aria)",
-		.base.cra_driver_name	= "__ecb-aria-avx",
+		.base.cra_name		= "ecb(aria)",
+		.base.cra_driver_name	= "ecb-aria-avx",
 		.base.cra_priority	= 400,
-		.base.cra_flags		= CRYPTO_ALG_INTERNAL,
 		.base.cra_blocksize	= ARIA_BLOCK_SIZE,
 		.base.cra_ctxsize	= sizeof(struct aria_ctx),
 		.base.cra_module	= THIS_MODULE,
@@ -165,10 +163,9 @@ static struct skcipher_alg aria_algs[] = {
 		.encrypt		= aria_avx_ecb_encrypt,
 		.decrypt		= aria_avx_ecb_decrypt,
 	}, {
-		.base.cra_name		= "__ctr(aria)",
-		.base.cra_driver_name	= "__ctr-aria-avx",
+		.base.cra_name		= "ctr(aria)",
+		.base.cra_driver_name	= "ctr-aria-avx",
 		.base.cra_priority	= 400,
-		.base.cra_flags		= CRYPTO_ALG_INTERNAL,
 		.base.cra_blocksize	= 1,
 		.base.cra_ctxsize	= sizeof(struct aria_ctx),
 		.base.cra_module	= THIS_MODULE,
@@ -184,8 +181,6 @@ static struct skcipher_alg aria_algs[] = {
 	}
 };
 
-static struct simd_skcipher_alg *aria_simd_algs[ARRAY_SIZE(aria_algs)];
-
 static int __init aria_avx_init(void)
 {
 	const char *feature_name;
@@ -213,15 +208,12 @@ static int __init aria_avx_init(void)
 		aria_ops.aria_ctr_crypt_16way = aria_aesni_avx_ctr_crypt_16way;
 	}
 
-	return simd_register_skciphers_compat(aria_algs,
-					      ARRAY_SIZE(aria_algs),
-					      aria_simd_algs);
+	return crypto_register_skciphers(aria_algs, ARRAY_SIZE(aria_algs));
 }
 
 static void __exit aria_avx_exit(void)
 {
-	simd_unregister_skciphers(aria_algs, ARRAY_SIZE(aria_algs),
-				  aria_simd_algs);
+	crypto_unregister_skciphers(aria_algs, ARRAY_SIZE(aria_algs));
 }
 
 module_init(aria_avx_init);
diff --git a/arch/x86/crypto/aria_gfni_avx512_glue.c b/arch/x86/crypto/aria_gfni_avx512_glue.c
index f4a2208d2638..363cbf4399cc 100644
--- a/arch/x86/crypto/aria_gfni_avx512_glue.c
+++ b/arch/x86/crypto/aria_gfni_avx512_glue.c
@@ -6,7 +6,6 @@
  */
 
 #include <crypto/algapi.h>
-#include <crypto/internal/simd.h>
 #include <crypto/aria.h>
 #include <linux/crypto.h>
 #include <linux/err.h>
@@ -165,10 +164,9 @@ static int aria_avx512_init_tfm(struct crypto_skcipher *tfm)
 
 static struct skcipher_alg aria_algs[] = {
 	{
-		.base.cra_name		= "__ecb(aria)",
-		.base.cra_driver_name	= "__ecb-aria-avx512",
+		.base.cra_name		= "ecb(aria)",
+		.base.cra_driver_name	= "ecb-aria-avx512",
 		.base.cra_priority	= 600,
-		.base.cra_flags		= CRYPTO_ALG_INTERNAL,
 		.base.cra_blocksize	= ARIA_BLOCK_SIZE,
 		.base.cra_ctxsize	= sizeof(struct aria_ctx),
 		.base.cra_module	= THIS_MODULE,
@@ -178,11 +176,10 @@ static struct skcipher_alg aria_algs[] = {
 		.encrypt		= aria_avx512_ecb_encrypt,
 		.decrypt		= aria_avx512_ecb_decrypt,
 	}, {
-		.base.cra_name		= "__ctr(aria)",
-		.base.cra_driver_name	= "__ctr-aria-avx512",
+		.base.cra_name		= "ctr(aria)",
+		.base.cra_driver_name	= "ctr-aria-avx512",
 		.base.cra_priority	= 600,
-		.base.cra_flags		= CRYPTO_ALG_INTERNAL |
-					  CRYPTO_ALG_SKCIPHER_REQSIZE_LARGE,
+		.base.cra_flags		= CRYPTO_ALG_SKCIPHER_REQSIZE_LARGE,
 		.base.cra_blocksize	= 1,
 		.base.cra_ctxsize	= sizeof(struct aria_ctx),
 		.base.cra_module	= THIS_MODULE,
@@ -197,8 +194,6 @@ static struct skcipher_alg aria_algs[] = {
 	}
 };
 
-static struct simd_skcipher_alg *aria_simd_algs[ARRAY_SIZE(aria_algs)];
-
 static int __init aria_avx512_init(void)
 {
 	const char *feature_name;
@@ -229,15 +224,12 @@ static int __init aria_avx512_init(void)
 	aria_ops.aria_decrypt_64way = aria_gfni_avx512_decrypt_64way;
 	aria_ops.aria_ctr_crypt_64way = aria_gfni_avx512_ctr_crypt_64way;
 
-	return simd_register_skciphers_compat(aria_algs,
-					      ARRAY_SIZE(aria_algs),
-					      aria_simd_algs);
+	return crypto_register_skciphers(aria_algs, ARRAY_SIZE(aria_algs));
 }
 
 static void __exit aria_avx512_exit(void)
 {
-	simd_unregister_skciphers(aria_algs, ARRAY_SIZE(aria_algs),
-				  aria_simd_algs);
+	crypto_unregister_skciphers(aria_algs, ARRAY_SIZE(aria_algs));
 }
 
 module_init(aria_avx512_init);
diff --git a/arch/x86/crypto/camellia_aesni_avx2_glue.c b/arch/x86/crypto/camellia_aesni_avx2_glue.c
index e7e4d64e9577..2d2f4e16537c 100644
--- a/arch/x86/crypto/camellia_aesni_avx2_glue.c
+++ b/arch/x86/crypto/camellia_aesni_avx2_glue.c
@@ -6,7 +6,6 @@
  */
 
 #include <crypto/algapi.h>
-#include <crypto/internal/simd.h>
 #include <linux/crypto.h>
 #include <linux/err.h>
 #include <linux/module.h>
@@ -69,10 +68,9 @@ static int cbc_decrypt(struct skcipher_request *req)
 
 static struct skcipher_alg camellia_algs[] = {
 	{
-		.base.cra_name		= "__ecb(camellia)",
-		.base.cra_driver_name	= "__ecb-camellia-aesni-avx2",
+		.base.cra_name		= "ecb(camellia)",
+		.base.cra_driver_name	= "ecb-camellia-aesni-avx2",
 		.base.cra_priority	= 500,
-		.base.cra_flags		= CRYPTO_ALG_INTERNAL,
 		.base.cra_blocksize	= CAMELLIA_BLOCK_SIZE,
 		.base.cra_ctxsize	= sizeof(struct camellia_ctx),
 		.base.cra_module	= THIS_MODULE,
@@ -82,10 +80,9 @@ static struct skcipher_alg camellia_algs[] = {
 		.encrypt		= ecb_encrypt,
 		.decrypt		= ecb_decrypt,
 	}, {
-		.base.cra_name		= "__cbc(camellia)",
-		.base.cra_driver_name	= "__cbc-camellia-aesni-avx2",
+		.base.cra_name		= "cbc(camellia)",
+		.base.cra_driver_name	= "cbc-camellia-aesni-avx2",
 		.base.cra_priority	= 500,
-		.base.cra_flags		= CRYPTO_ALG_INTERNAL,
 		.base.cra_blocksize	= CAMELLIA_BLOCK_SIZE,
 		.base.cra_ctxsize	= sizeof(struct camellia_ctx),
 		.base.cra_module	= THIS_MODULE,
@@ -98,8 +95,6 @@ static struct skcipher_alg camellia_algs[] = {
 	},
 };
 
-static struct simd_skcipher_alg *camellia_simd_algs[ARRAY_SIZE(camellia_algs)];
-
 static int __init camellia_aesni_init(void)
 {
 	const char *feature_name;
@@ -118,15 +113,13 @@ static int __init camellia_aesni_init(void)
 		return -ENODEV;
 	}
 
-	return simd_register_skciphers_compat(camellia_algs,
-					      ARRAY_SIZE(camellia_algs),
-					      camellia_simd_algs);
+	return crypto_register_skciphers(camellia_algs,
+					 ARRAY_SIZE(camellia_algs));
 }
 
 static void __exit camellia_aesni_fini(void)
 {
-	simd_unregister_skciphers(camellia_algs, ARRAY_SIZE(camellia_algs),
-				  camellia_simd_algs);
+	crypto_unregister_skciphers(camellia_algs, ARRAY_SIZE(camellia_algs));
 }
 
 module_init(camellia_aesni_init);
diff --git a/arch/x86/crypto/camellia_aesni_avx_glue.c b/arch/x86/crypto/camellia_aesni_avx_glue.c
index c7ccf63e741e..a7d162388142 100644
--- a/arch/x86/crypto/camellia_aesni_avx_glue.c
+++ b/arch/x86/crypto/camellia_aesni_avx_glue.c
@@ -6,7 +6,6 @@
  */
 
 #include <crypto/algapi.h>
-#include <crypto/internal/simd.h>
 #include <linux/crypto.h>
 #include <linux/err.h>
 #include <linux/module.h>
@@ -69,10 +68,9 @@ static int cbc_decrypt(struct skcipher_request *req)
 
 static struct skcipher_alg camellia_algs[] = {
 	{
-		.base.cra_name		= "__ecb(camellia)",
-		.base.cra_driver_name	= "__ecb-camellia-aesni",
+		.base.cra_name		= "ecb(camellia)",
+		.base.cra_driver_name	= "ecb-camellia-aesni",
 		.base.cra_priority	= 400,
-		.base.cra_flags		= CRYPTO_ALG_INTERNAL,
 		.base.cra_blocksize	= CAMELLIA_BLOCK_SIZE,
 		.base.cra_ctxsize	= sizeof(struct camellia_ctx),
 		.base.cra_module	= THIS_MODULE,
@@ -82,10 +80,9 @@ static struct skcipher_alg camellia_algs[] = {
 		.encrypt		= ecb_encrypt,
 		.decrypt		= ecb_decrypt,
 	}, {
-		.base.cra_name		= "__cbc(camellia)",
-		.base.cra_driver_name	= "__cbc-camellia-aesni",
+		.base.cra_name		= "cbc(camellia)",
+		.base.cra_driver_name	= "cbc-camellia-aesni",
 		.base.cra_priority	= 400,
-		.base.cra_flags		= CRYPTO_ALG_INTERNAL,
 		.base.cra_blocksize	= CAMELLIA_BLOCK_SIZE,
 		.base.cra_ctxsize	= sizeof(struct camellia_ctx),
 		.base.cra_module	= THIS_MODULE,
@@ -98,8 +95,6 @@ static struct skcipher_alg camellia_algs[] = {
 	}
 };
 
-static struct simd_skcipher_alg *camellia_simd_algs[ARRAY_SIZE(camellia_algs)];
-
 static int __init camellia_aesni_init(void)
 {
 	const char *feature_name;
@@ -117,15 +112,13 @@ static int __init camellia_aesni_init(void)
 		return -ENODEV;
 	}
 
-	return simd_register_skciphers_compat(camellia_algs,
-					      ARRAY_SIZE(camellia_algs),
-					      camellia_simd_algs);
+	return crypto_register_skciphers(camellia_algs,
+					 ARRAY_SIZE(camellia_algs));
 }
 
 static void __exit camellia_aesni_fini(void)
 {
-	simd_unregister_skciphers(camellia_algs, ARRAY_SIZE(camellia_algs),
-				  camellia_simd_algs);
+	crypto_unregister_skciphers(camellia_algs, ARRAY_SIZE(camellia_algs));
 }
 
 module_init(camellia_aesni_init);
diff --git a/arch/x86/crypto/cast5_avx_glue.c b/arch/x86/crypto/cast5_avx_glue.c
index 3976a87f92ad..3aca04d43b34 100644
--- a/arch/x86/crypto/cast5_avx_glue.c
+++ b/arch/x86/crypto/cast5_avx_glue.c
@@ -8,7 +8,6 @@
 
 #include <crypto/algapi.h>
 #include <crypto/cast5.h>
-#include <crypto/internal/simd.h>
 #include <linux/crypto.h>
 #include <linux/err.h>
 #include <linux/module.h>
@@ -64,10 +63,9 @@ static int cbc_decrypt(struct skcipher_request *req)
 
 static struct skcipher_alg cast5_algs[] = {
 	{
-		.base.cra_name		= "__ecb(cast5)",
-		.base.cra_driver_name	= "__ecb-cast5-avx",
+		.base.cra_name		= "ecb(cast5)",
+		.base.cra_driver_name	= "ecb-cast5-avx",
 		.base.cra_priority	= 200,
-		.base.cra_flags		= CRYPTO_ALG_INTERNAL,
 		.base.cra_blocksize	= CAST5_BLOCK_SIZE,
 		.base.cra_ctxsize	= sizeof(struct cast5_ctx),
 		.base.cra_module	= THIS_MODULE,
@@ -77,10 +75,9 @@ static struct skcipher_alg cast5_algs[] = {
 		.encrypt		= ecb_encrypt,
 		.decrypt		= ecb_decrypt,
 	}, {
-		.base.cra_name		= "__cbc(cast5)",
-		.base.cra_driver_name	= "__cbc-cast5-avx",
+		.base.cra_name		= "cbc(cast5)",
+		.base.cra_driver_name	= "cbc-cast5-avx",
 		.base.cra_priority	= 200,
-		.base.cra_flags		= CRYPTO_ALG_INTERNAL,
 		.base.cra_blocksize	= CAST5_BLOCK_SIZE,
 		.base.cra_ctxsize	= sizeof(struct cast5_ctx),
 		.base.cra_module	= THIS_MODULE,
@@ -93,8 +90,6 @@ static struct skcipher_alg cast5_algs[] = {
 	}
 };
 
-static struct simd_skcipher_alg *cast5_simd_algs[ARRAY_SIZE(cast5_algs)];
-
 static int __init cast5_init(void)
 {
 	const char *feature_name;
@@ -105,15 +100,13 @@ static int __init cast5_init(void)
 		return -ENODEV;
 	}
 
-	return simd_register_skciphers_compat(cast5_algs,
-					      ARRAY_SIZE(cast5_algs),
-					      cast5_simd_algs);
+	return crypto_register_skciphers(cast5_algs,
+					 ARRAY_SIZE(cast5_algs));
 }
 
 static void __exit cast5_exit(void)
 {
-	simd_unregister_skciphers(cast5_algs, ARRAY_SIZE(cast5_algs),
-				  cast5_simd_algs);
+	crypto_unregister_skciphers(cast5_algs, ARRAY_SIZE(cast5_algs));
 }
 
 module_init(cast5_init);
diff --git a/arch/x86/crypto/cast6_avx_glue.c b/arch/x86/crypto/cast6_avx_glue.c
index 7e2aea372349..c4dd28c30303 100644
--- a/arch/x86/crypto/cast6_avx_glue.c
+++ b/arch/x86/crypto/cast6_avx_glue.c
@@ -14,7 +14,6 @@
 #include <linux/err.h>
 #include <crypto/algapi.h>
 #include <crypto/cast6.h>
-#include <crypto/internal/simd.h>
 
 #include "ecb_cbc_helpers.h"
 
@@ -64,10 +63,9 @@ static int cbc_decrypt(struct skcipher_request *req)
 
 static struct skcipher_alg cast6_algs[] = {
 	{
-		.base.cra_name		= "__ecb(cast6)",
-		.base.cra_driver_name	= "__ecb-cast6-avx",
+		.base.cra_name		= "ecb(cast6)",
+		.base.cra_driver_name	= "ecb-cast6-avx",
 		.base.cra_priority	= 200,
-		.base.cra_flags		= CRYPTO_ALG_INTERNAL,
 		.base.cra_blocksize	= CAST6_BLOCK_SIZE,
 		.base.cra_ctxsize	= sizeof(struct cast6_ctx),
 		.base.cra_module	= THIS_MODULE,
@@ -77,10 +75,9 @@ static struct skcipher_alg cast6_algs[] = {
 		.encrypt		= ecb_encrypt,
 		.decrypt		= ecb_decrypt,
 	}, {
-		.base.cra_name		= "__cbc(cast6)",
-		.base.cra_driver_name	= "__cbc-cast6-avx",
+		.base.cra_name		= "cbc(cast6)",
+		.base.cra_driver_name	= "cbc-cast6-avx",
 		.base.cra_priority	= 200,
-		.base.cra_flags		= CRYPTO_ALG_INTERNAL,
 		.base.cra_blocksize	= CAST6_BLOCK_SIZE,
 		.base.cra_ctxsize	= sizeof(struct cast6_ctx),
 		.base.cra_module	= THIS_MODULE,
@@ -93,8 +90,6 @@ static struct skcipher_alg cast6_algs[] = {
 	},
 };
 
-static struct simd_skcipher_alg *cast6_simd_algs[ARRAY_SIZE(cast6_algs)];
-
 static int __init cast6_init(void)
 {
 	const char *feature_name;
@@ -105,15 +100,12 @@ static int __init cast6_init(void)
 		return -ENODEV;
 	}
 
-	return simd_register_skciphers_compat(cast6_algs,
-					      ARRAY_SIZE(cast6_algs),
-					      cast6_simd_algs);
+	return crypto_register_skciphers(cast6_algs, ARRAY_SIZE(cast6_algs));
 }
 
 static void __exit cast6_exit(void)
 {
-	simd_unregister_skciphers(cast6_algs, ARRAY_SIZE(cast6_algs),
-				  cast6_simd_algs);
+	crypto_unregister_skciphers(cast6_algs, ARRAY_SIZE(cast6_algs));
 }
 
 module_init(cast6_init);
diff --git a/arch/x86/crypto/chacha_glue.c b/arch/x86/crypto/chacha_glue.c
deleted file mode 100644
index 8bb74a272879..000000000000
--- a/arch/x86/crypto/chacha_glue.c
+++ /dev/null
@@ -1,311 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * x64 SIMD accelerated ChaCha and XChaCha stream ciphers,
- * including ChaCha20 (RFC7539)
- *
- * Copyright (C) 2015 Martin Willi
- */
-
-#include <crypto/algapi.h>
-#include <crypto/internal/chacha.h>
-#include <crypto/internal/simd.h>
-#include <crypto/internal/skcipher.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/sizes.h>
-#include <asm/simd.h>
-
-asmlinkage void chacha_block_xor_ssse3(u32 *state, u8 *dst, const u8 *src,
-				       unsigned int len, int nrounds);
-asmlinkage void chacha_4block_xor_ssse3(u32 *state, u8 *dst, const u8 *src,
-					unsigned int len, int nrounds);
-asmlinkage void hchacha_block_ssse3(const u32 *state, u32 *out, int nrounds);
-
-asmlinkage void chacha_2block_xor_avx2(u32 *state, u8 *dst, const u8 *src,
-				       unsigned int len, int nrounds);
-asmlinkage void chacha_4block_xor_avx2(u32 *state, u8 *dst, const u8 *src,
-				       unsigned int len, int nrounds);
-asmlinkage void chacha_8block_xor_avx2(u32 *state, u8 *dst, const u8 *src,
-				       unsigned int len, int nrounds);
-
-asmlinkage void chacha_2block_xor_avx512vl(u32 *state, u8 *dst, const u8 *src,
-					   unsigned int len, int nrounds);
-asmlinkage void chacha_4block_xor_avx512vl(u32 *state, u8 *dst, const u8 *src,
-					   unsigned int len, int nrounds);
-asmlinkage void chacha_8block_xor_avx512vl(u32 *state, u8 *dst, const u8 *src,
-					   unsigned int len, int nrounds);
-
-static __ro_after_init DEFINE_STATIC_KEY_FALSE(chacha_use_simd);
-static __ro_after_init DEFINE_STATIC_KEY_FALSE(chacha_use_avx2);
-static __ro_after_init DEFINE_STATIC_KEY_FALSE(chacha_use_avx512vl);
-
-static unsigned int chacha_advance(unsigned int len, unsigned int maxblocks)
-{
-	len = min(len, maxblocks * CHACHA_BLOCK_SIZE);
-	return round_up(len, CHACHA_BLOCK_SIZE) / CHACHA_BLOCK_SIZE;
-}
-
-static void chacha_dosimd(u32 *state, u8 *dst, const u8 *src,
-			  unsigned int bytes, int nrounds)
-{
-	if (IS_ENABLED(CONFIG_AS_AVX512) &&
-	    static_branch_likely(&chacha_use_avx512vl)) {
-		while (bytes >= CHACHA_BLOCK_SIZE * 8) {
-			chacha_8block_xor_avx512vl(state, dst, src, bytes,
-						   nrounds);
-			bytes -= CHACHA_BLOCK_SIZE * 8;
-			src += CHACHA_BLOCK_SIZE * 8;
-			dst += CHACHA_BLOCK_SIZE * 8;
-			state[12] += 8;
-		}
-		if (bytes > CHACHA_BLOCK_SIZE * 4) {
-			chacha_8block_xor_avx512vl(state, dst, src, bytes,
-						   nrounds);
-			state[12] += chacha_advance(bytes, 8);
-			return;
-		}
-		if (bytes > CHACHA_BLOCK_SIZE * 2) {
-			chacha_4block_xor_avx512vl(state, dst, src, bytes,
-						   nrounds);
-			state[12] += chacha_advance(bytes, 4);
-			return;
-		}
-		if (bytes) {
-			chacha_2block_xor_avx512vl(state, dst, src, bytes,
-						   nrounds);
-			state[12] += chacha_advance(bytes, 2);
-			return;
-		}
-	}
-
-	if (static_branch_likely(&chacha_use_avx2)) {
-		while (bytes >= CHACHA_BLOCK_SIZE * 8) {
-			chacha_8block_xor_avx2(state, dst, src, bytes, nrounds);
-			bytes -= CHACHA_BLOCK_SIZE * 8;
-			src += CHACHA_BLOCK_SIZE * 8;
-			dst += CHACHA_BLOCK_SIZE * 8;
-			state[12] += 8;
-		}
-		if (bytes > CHACHA_BLOCK_SIZE * 4) {
-			chacha_8block_xor_avx2(state, dst, src, bytes, nrounds);
-			state[12] += chacha_advance(bytes, 8);
-			return;
-		}
-		if (bytes > CHACHA_BLOCK_SIZE * 2) {
-			chacha_4block_xor_avx2(state, dst, src, bytes, nrounds);
-			state[12] += chacha_advance(bytes, 4);
-			return;
-		}
-		if (bytes > CHACHA_BLOCK_SIZE) {
-			chacha_2block_xor_avx2(state, dst, src, bytes, nrounds);
-			state[12] += chacha_advance(bytes, 2);
-			return;
-		}
-	}
-
-	while (bytes >= CHACHA_BLOCK_SIZE * 4) {
-		chacha_4block_xor_ssse3(state, dst, src, bytes, nrounds);
-		bytes -= CHACHA_BLOCK_SIZE * 4;
-		src += CHACHA_BLOCK_SIZE * 4;
-		dst += CHACHA_BLOCK_SIZE * 4;
-		state[12] += 4;
-	}
-	if (bytes > CHACHA_BLOCK_SIZE) {
-		chacha_4block_xor_ssse3(state, dst, src, bytes, nrounds);
-		state[12] += chacha_advance(bytes, 4);
-		return;
-	}
-	if (bytes) {
-		chacha_block_xor_ssse3(state, dst, src, bytes, nrounds);
-		state[12]++;
-	}
-}
-
-void hchacha_block_arch(const u32 *state, u32 *stream, int nrounds)
-{
-	if (!static_branch_likely(&chacha_use_simd) || !crypto_simd_usable()) {
-		hchacha_block_generic(state, stream, nrounds);
-	} else {
-		kernel_fpu_begin();
-		hchacha_block_ssse3(state, stream, nrounds);
-		kernel_fpu_end();
-	}
-}
-EXPORT_SYMBOL(hchacha_block_arch);
-
-void chacha_crypt_arch(u32 *state, u8 *dst, const u8 *src, unsigned int bytes,
-		       int nrounds)
-{
-	if (!static_branch_likely(&chacha_use_simd) || !crypto_simd_usable() ||
-	    bytes <= CHACHA_BLOCK_SIZE)
-		return chacha_crypt_generic(state, dst, src, bytes, nrounds);
-
-	do {
-		unsigned int todo = min_t(unsigned int, bytes, SZ_4K);
-
-		kernel_fpu_begin();
-		chacha_dosimd(state, dst, src, todo, nrounds);
-		kernel_fpu_end();
-
-		bytes -= todo;
-		src += todo;
-		dst += todo;
-	} while (bytes);
-}
-EXPORT_SYMBOL(chacha_crypt_arch);
-
-static int chacha_simd_stream_xor(struct skcipher_request *req,
-				  const struct chacha_ctx *ctx, const u8 *iv)
-{
-	u32 state[CHACHA_STATE_WORDS] __aligned(8);
-	struct skcipher_walk walk;
-	int err;
-
-	err = skcipher_walk_virt(&walk, req, false);
-
-	chacha_init(state, ctx->key, iv);
-
-	while (walk.nbytes > 0) {
-		unsigned int nbytes = walk.nbytes;
-
-		if (nbytes < walk.total)
-			nbytes = round_down(nbytes, walk.stride);
-
-		if (!static_branch_likely(&chacha_use_simd) ||
-		    !crypto_simd_usable()) {
-			chacha_crypt_generic(state, walk.dst.virt.addr,
-					     walk.src.virt.addr, nbytes,
-					     ctx->nrounds);
-		} else {
-			kernel_fpu_begin();
-			chacha_dosimd(state, walk.dst.virt.addr,
-				      walk.src.virt.addr, nbytes,
-				      ctx->nrounds);
-			kernel_fpu_end();
-		}
-		err = skcipher_walk_done(&walk, walk.nbytes - nbytes);
-	}
-
-	return err;
-}
-
-static int chacha_simd(struct skcipher_request *req)
-{
-	struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
-	struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm);
-
-	return chacha_simd_stream_xor(req, ctx, req->iv);
-}
-
-static int xchacha_simd(struct skcipher_request *req)
-{
-	struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
-	struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm);
-	u32 state[CHACHA_STATE_WORDS] __aligned(8);
-	struct chacha_ctx subctx;
-	u8 real_iv[16];
-
-	chacha_init(state, ctx->key, req->iv);
-
-	if (req->cryptlen > CHACHA_BLOCK_SIZE && crypto_simd_usable()) {
-		kernel_fpu_begin();
-		hchacha_block_ssse3(state, subctx.key, ctx->nrounds);
-		kernel_fpu_end();
-	} else {
-		hchacha_block_generic(state, subctx.key, ctx->nrounds);
-	}
-	subctx.nrounds = ctx->nrounds;
-
-	memcpy(&real_iv[0], req->iv + 24, 8);
-	memcpy(&real_iv[8], req->iv + 16, 8);
-	return chacha_simd_stream_xor(req, &subctx, real_iv);
-}
-
-static struct skcipher_alg algs[] = {
-	{
-		.base.cra_name		= "chacha20",
-		.base.cra_driver_name	= "chacha20-simd",
-		.base.cra_priority	= 300,
-		.base.cra_blocksize	= 1,
-		.base.cra_ctxsize	= sizeof(struct chacha_ctx),
-		.base.cra_module	= THIS_MODULE,
-
-		.min_keysize		= CHACHA_KEY_SIZE,
-		.max_keysize		= CHACHA_KEY_SIZE,
-		.ivsize			= CHACHA_IV_SIZE,
-		.chunksize		= CHACHA_BLOCK_SIZE,
-		.setkey			= chacha20_setkey,
-		.encrypt		= chacha_simd,
-		.decrypt		= chacha_simd,
-	}, {
-		.base.cra_name		= "xchacha20",
-		.base.cra_driver_name	= "xchacha20-simd",
-		.base.cra_priority	= 300,
-		.base.cra_blocksize	= 1,
-		.base.cra_ctxsize	= sizeof(struct chacha_ctx),
-		.base.cra_module	= THIS_MODULE,
-
-		.min_keysize		= CHACHA_KEY_SIZE,
-		.max_keysize		= CHACHA_KEY_SIZE,
-		.ivsize			= XCHACHA_IV_SIZE,
-		.chunksize		= CHACHA_BLOCK_SIZE,
-		.setkey			= chacha20_setkey,
-		.encrypt		= xchacha_simd,
-		.decrypt		= xchacha_simd,
-	}, {
-		.base.cra_name		= "xchacha12",
-		.base.cra_driver_name	= "xchacha12-simd",
-		.base.cra_priority	= 300,
-		.base.cra_blocksize	= 1,
-		.base.cra_ctxsize	= sizeof(struct chacha_ctx),
-		.base.cra_module	= THIS_MODULE,
-
-		.min_keysize		= CHACHA_KEY_SIZE,
-		.max_keysize		= CHACHA_KEY_SIZE,
-		.ivsize			= XCHACHA_IV_SIZE,
-		.chunksize		= CHACHA_BLOCK_SIZE,
-		.setkey			= chacha12_setkey,
-		.encrypt		= xchacha_simd,
-		.decrypt		= xchacha_simd,
-	},
-};
-
-static int __init chacha_simd_mod_init(void)
-{
-	if (!boot_cpu_has(X86_FEATURE_SSSE3))
-		return 0;
-
-	static_branch_enable(&chacha_use_simd);
-
-	if (boot_cpu_has(X86_FEATURE_AVX) &&
-	    boot_cpu_has(X86_FEATURE_AVX2) &&
-	    cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL)) {
-		static_branch_enable(&chacha_use_avx2);
-
-		if (IS_ENABLED(CONFIG_AS_AVX512) &&
-		    boot_cpu_has(X86_FEATURE_AVX512VL) &&
-		    boot_cpu_has(X86_FEATURE_AVX512BW)) /* kmovq */
-			static_branch_enable(&chacha_use_avx512vl);
-	}
-	return IS_REACHABLE(CONFIG_CRYPTO_SKCIPHER) ?
-		crypto_register_skciphers(algs, ARRAY_SIZE(algs)) : 0;
-}
-
-static void __exit chacha_simd_mod_fini(void)
-{
-	if (IS_REACHABLE(CONFIG_CRYPTO_SKCIPHER) && boot_cpu_has(X86_FEATURE_SSSE3))
-		crypto_unregister_skciphers(algs, ARRAY_SIZE(algs));
-}
-
-module_init(chacha_simd_mod_init);
-module_exit(chacha_simd_mod_fini);
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Martin Willi <martin@strongswan.org>");
-MODULE_DESCRIPTION("ChaCha and XChaCha stream ciphers (x64 SIMD accelerated)");
-MODULE_ALIAS_CRYPTO("chacha20");
-MODULE_ALIAS_CRYPTO("chacha20-simd");
-MODULE_ALIAS_CRYPTO("xchacha20");
-MODULE_ALIAS_CRYPTO("xchacha20-simd");
-MODULE_ALIAS_CRYPTO("xchacha12");
-MODULE_ALIAS_CRYPTO("xchacha12-simd");
diff --git a/arch/x86/crypto/ghash-clmulni-intel_asm.S b/arch/x86/crypto/ghash-clmulni-intel_asm.S
index 99cb983ded9e..c4fbaa82ed7a 100644
--- a/arch/x86/crypto/ghash-clmulni-intel_asm.S
+++ b/arch/x86/crypto/ghash-clmulni-intel_asm.S
@@ -103,8 +103,8 @@ SYM_FUNC_START(clmul_ghash_mul)
 SYM_FUNC_END(clmul_ghash_mul)
 
 /*
- * void clmul_ghash_update(char *dst, const char *src, unsigned int srclen,
- *			   const le128 *shash);
+ * int clmul_ghash_update(char *dst, const char *src, unsigned int srclen,
+ *			  const le128 *shash);
  */
 SYM_FUNC_START(clmul_ghash_update)
 	FRAME_BEGIN
@@ -127,6 +127,7 @@ SYM_FUNC_START(clmul_ghash_update)
 	pshufb BSWAP, DATA
 	movups DATA, (%rdi)
 .Lupdate_just_ret:
+	mov %rdx, %rax
 	FRAME_END
 	RET
 SYM_FUNC_END(clmul_ghash_update)
diff --git a/arch/x86/crypto/ghash-clmulni-intel_glue.c b/arch/x86/crypto/ghash-clmulni-intel_glue.c
index c759ec808bf1..aea5d4d06be7 100644
--- a/arch/x86/crypto/ghash-clmulni-intel_glue.c
+++ b/arch/x86/crypto/ghash-clmulni-intel_glue.c
@@ -7,41 +7,27 @@
  *   Author: Huang Ying <ying.huang@intel.com>
  */
 
-#include <linux/err.h>
-#include <linux/module.h>
-#include <linux/init.h>
-#include <linux/kernel.h>
-#include <linux/crypto.h>
-#include <crypto/algapi.h>
-#include <crypto/cryptd.h>
-#include <crypto/gf128mul.h>
-#include <crypto/internal/hash.h>
-#include <crypto/internal/simd.h>
 #include <asm/cpu_device_id.h>
 #include <asm/simd.h>
+#include <crypto/b128ops.h>
+#include <crypto/ghash.h>
+#include <crypto/internal/hash.h>
+#include <crypto/utils.h>
+#include <linux/errno.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/string.h>
 #include <linux/unaligned.h>
 
-#define GHASH_BLOCK_SIZE	16
-#define GHASH_DIGEST_SIZE	16
+asmlinkage void clmul_ghash_mul(char *dst, const le128 *shash);
 
-void clmul_ghash_mul(char *dst, const le128 *shash);
+asmlinkage int clmul_ghash_update(char *dst, const char *src,
+				  unsigned int srclen, const le128 *shash);
 
-void clmul_ghash_update(char *dst, const char *src, unsigned int srclen,
-			const le128 *shash);
-
-struct ghash_async_ctx {
-	struct cryptd_ahash *cryptd_tfm;
-};
-
-struct ghash_ctx {
+struct x86_ghash_ctx {
 	le128 shash;
 };
 
-struct ghash_desc_ctx {
-	u8 buffer[GHASH_BLOCK_SIZE];
-	u32 bytes;
-};
-
 static int ghash_init(struct shash_desc *desc)
 {
 	struct ghash_desc_ctx *dctx = shash_desc_ctx(desc);
@@ -54,7 +40,7 @@ static int ghash_init(struct shash_desc *desc)
 static int ghash_setkey(struct crypto_shash *tfm,
 			const u8 *key, unsigned int keylen)
 {
-	struct ghash_ctx *ctx = crypto_shash_ctx(tfm);
+	struct x86_ghash_ctx *ctx = crypto_shash_ctx(tfm);
 	u64 a, b;
 
 	if (keylen != GHASH_BLOCK_SIZE)
@@ -95,64 +81,38 @@ static int ghash_setkey(struct crypto_shash *tfm,
 static int ghash_update(struct shash_desc *desc,
 			 const u8 *src, unsigned int srclen)
 {
+	struct x86_ghash_ctx *ctx = crypto_shash_ctx(desc->tfm);
 	struct ghash_desc_ctx *dctx = shash_desc_ctx(desc);
-	struct ghash_ctx *ctx = crypto_shash_ctx(desc->tfm);
 	u8 *dst = dctx->buffer;
+	int remain;
 
 	kernel_fpu_begin();
-	if (dctx->bytes) {
-		int n = min(srclen, dctx->bytes);
-		u8 *pos = dst + (GHASH_BLOCK_SIZE - dctx->bytes);
-
-		dctx->bytes -= n;
-		srclen -= n;
-
-		while (n--)
-			*pos++ ^= *src++;
-
-		if (!dctx->bytes)
-			clmul_ghash_mul(dst, &ctx->shash);
-	}
-
-	clmul_ghash_update(dst, src, srclen, &ctx->shash);
+	remain = clmul_ghash_update(dst, src, srclen, &ctx->shash);
 	kernel_fpu_end();
-
-	if (srclen & 0xf) {
-		src += srclen - (srclen & 0xf);
-		srclen &= 0xf;
-		dctx->bytes = GHASH_BLOCK_SIZE - srclen;
-		while (srclen--)
-			*dst++ ^= *src++;
-	}
-
-	return 0;
+	return remain;
 }
 
-static void ghash_flush(struct ghash_ctx *ctx, struct ghash_desc_ctx *dctx)
+static void ghash_flush(struct x86_ghash_ctx *ctx, struct ghash_desc_ctx *dctx,
+			const u8 *src, unsigned int len)
 {
 	u8 *dst = dctx->buffer;
 
-	if (dctx->bytes) {
-		u8 *tmp = dst + (GHASH_BLOCK_SIZE - dctx->bytes);
-
-		while (dctx->bytes--)
-			*tmp++ ^= 0;
-
-		kernel_fpu_begin();
+	kernel_fpu_begin();
+	if (len) {
+		crypto_xor(dst, src, len);
 		clmul_ghash_mul(dst, &ctx->shash);
-		kernel_fpu_end();
 	}
-
-	dctx->bytes = 0;
+	kernel_fpu_end();
 }
 
-static int ghash_final(struct shash_desc *desc, u8 *dst)
+static int ghash_finup(struct shash_desc *desc, const u8 *src,
+		       unsigned int len, u8 *dst)
 {
+	struct x86_ghash_ctx *ctx = crypto_shash_ctx(desc->tfm);
 	struct ghash_desc_ctx *dctx = shash_desc_ctx(desc);
-	struct ghash_ctx *ctx = crypto_shash_ctx(desc->tfm);
 	u8 *buf = dctx->buffer;
 
-	ghash_flush(ctx, dctx);
+	ghash_flush(ctx, dctx, src, len);
 	memcpy(dst, buf, GHASH_BLOCK_SIZE);
 
 	return 0;
@@ -162,186 +122,20 @@ static struct shash_alg ghash_alg = {
 	.digestsize	= GHASH_DIGEST_SIZE,
 	.init		= ghash_init,
 	.update		= ghash_update,
-	.final		= ghash_final,
+	.finup		= ghash_finup,
 	.setkey		= ghash_setkey,
 	.descsize	= sizeof(struct ghash_desc_ctx),
 	.base		= {
-		.cra_name		= "__ghash",
-		.cra_driver_name	= "__ghash-pclmulqdqni",
-		.cra_priority		= 0,
-		.cra_flags		= CRYPTO_ALG_INTERNAL,
+		.cra_name		= "ghash",
+		.cra_driver_name	= "ghash-pclmulqdqni",
+		.cra_priority		= 400,
+		.cra_flags		= CRYPTO_AHASH_ALG_BLOCK_ONLY,
 		.cra_blocksize		= GHASH_BLOCK_SIZE,
-		.cra_ctxsize		= sizeof(struct ghash_ctx),
+		.cra_ctxsize		= sizeof(struct x86_ghash_ctx),
 		.cra_module		= THIS_MODULE,
 	},
 };
 
-static int ghash_async_init(struct ahash_request *req)
-{
-	struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
-	struct ghash_async_ctx *ctx = crypto_ahash_ctx(tfm);
-	struct ahash_request *cryptd_req = ahash_request_ctx(req);
-	struct cryptd_ahash *cryptd_tfm = ctx->cryptd_tfm;
-	struct shash_desc *desc = cryptd_shash_desc(cryptd_req);
-	struct crypto_shash *child = cryptd_ahash_child(cryptd_tfm);
-
-	desc->tfm = child;
-	return crypto_shash_init(desc);
-}
-
-static void ghash_init_cryptd_req(struct ahash_request *req)
-{
-	struct ahash_request *cryptd_req = ahash_request_ctx(req);
-	struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
-	struct ghash_async_ctx *ctx = crypto_ahash_ctx(tfm);
-	struct cryptd_ahash *cryptd_tfm = ctx->cryptd_tfm;
-
-	ahash_request_set_tfm(cryptd_req, &cryptd_tfm->base);
-	ahash_request_set_callback(cryptd_req, req->base.flags,
-				   req->base.complete, req->base.data);
-	ahash_request_set_crypt(cryptd_req, req->src, req->result,
-				req->nbytes);
-}
-
-static int ghash_async_update(struct ahash_request *req)
-{
-	struct ahash_request *cryptd_req = ahash_request_ctx(req);
-	struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
-	struct ghash_async_ctx *ctx = crypto_ahash_ctx(tfm);
-	struct cryptd_ahash *cryptd_tfm = ctx->cryptd_tfm;
-
-	if (!crypto_simd_usable() ||
-	    (in_atomic() && cryptd_ahash_queued(cryptd_tfm))) {
-		ghash_init_cryptd_req(req);
-		return crypto_ahash_update(cryptd_req);
-	} else {
-		struct shash_desc *desc = cryptd_shash_desc(cryptd_req);
-		return shash_ahash_update(req, desc);
-	}
-}
-
-static int ghash_async_final(struct ahash_request *req)
-{
-	struct ahash_request *cryptd_req = ahash_request_ctx(req);
-	struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
-	struct ghash_async_ctx *ctx = crypto_ahash_ctx(tfm);
-	struct cryptd_ahash *cryptd_tfm = ctx->cryptd_tfm;
-
-	if (!crypto_simd_usable() ||
-	    (in_atomic() && cryptd_ahash_queued(cryptd_tfm))) {
-		ghash_init_cryptd_req(req);
-		return crypto_ahash_final(cryptd_req);
-	} else {
-		struct shash_desc *desc = cryptd_shash_desc(cryptd_req);
-		return crypto_shash_final(desc, req->result);
-	}
-}
-
-static int ghash_async_import(struct ahash_request *req, const void *in)
-{
-	struct ahash_request *cryptd_req = ahash_request_ctx(req);
-	struct shash_desc *desc = cryptd_shash_desc(cryptd_req);
-	struct ghash_desc_ctx *dctx = shash_desc_ctx(desc);
-
-	ghash_async_init(req);
-	memcpy(dctx, in, sizeof(*dctx));
-	return 0;
-
-}
-
-static int ghash_async_export(struct ahash_request *req, void *out)
-{
-	struct ahash_request *cryptd_req = ahash_request_ctx(req);
-	struct shash_desc *desc = cryptd_shash_desc(cryptd_req);
-	struct ghash_desc_ctx *dctx = shash_desc_ctx(desc);
-
-	memcpy(out, dctx, sizeof(*dctx));
-	return 0;
-
-}
-
-static int ghash_async_digest(struct ahash_request *req)
-{
-	struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
-	struct ghash_async_ctx *ctx = crypto_ahash_ctx(tfm);
-	struct ahash_request *cryptd_req = ahash_request_ctx(req);
-	struct cryptd_ahash *cryptd_tfm = ctx->cryptd_tfm;
-
-	if (!crypto_simd_usable() ||
-	    (in_atomic() && cryptd_ahash_queued(cryptd_tfm))) {
-		ghash_init_cryptd_req(req);
-		return crypto_ahash_digest(cryptd_req);
-	} else {
-		struct shash_desc *desc = cryptd_shash_desc(cryptd_req);
-		struct crypto_shash *child = cryptd_ahash_child(cryptd_tfm);
-
-		desc->tfm = child;
-		return shash_ahash_digest(req, desc);
-	}
-}
-
-static int ghash_async_setkey(struct crypto_ahash *tfm, const u8 *key,
-			      unsigned int keylen)
-{
-	struct ghash_async_ctx *ctx = crypto_ahash_ctx(tfm);
-	struct crypto_ahash *child = &ctx->cryptd_tfm->base;
-
-	crypto_ahash_clear_flags(child, CRYPTO_TFM_REQ_MASK);
-	crypto_ahash_set_flags(child, crypto_ahash_get_flags(tfm)
-			       & CRYPTO_TFM_REQ_MASK);
-	return crypto_ahash_setkey(child, key, keylen);
-}
-
-static int ghash_async_init_tfm(struct crypto_tfm *tfm)
-{
-	struct cryptd_ahash *cryptd_tfm;
-	struct ghash_async_ctx *ctx = crypto_tfm_ctx(tfm);
-
-	cryptd_tfm = cryptd_alloc_ahash("__ghash-pclmulqdqni",
-					CRYPTO_ALG_INTERNAL,
-					CRYPTO_ALG_INTERNAL);
-	if (IS_ERR(cryptd_tfm))
-		return PTR_ERR(cryptd_tfm);
-	ctx->cryptd_tfm = cryptd_tfm;
-	crypto_ahash_set_reqsize(__crypto_ahash_cast(tfm),
-				 sizeof(struct ahash_request) +
-				 crypto_ahash_reqsize(&cryptd_tfm->base));
-
-	return 0;
-}
-
-static void ghash_async_exit_tfm(struct crypto_tfm *tfm)
-{
-	struct ghash_async_ctx *ctx = crypto_tfm_ctx(tfm);
-
-	cryptd_free_ahash(ctx->cryptd_tfm);
-}
-
-static struct ahash_alg ghash_async_alg = {
-	.init		= ghash_async_init,
-	.update		= ghash_async_update,
-	.final		= ghash_async_final,
-	.setkey		= ghash_async_setkey,
-	.digest		= ghash_async_digest,
-	.export		= ghash_async_export,
-	.import		= ghash_async_import,
-	.halg = {
-		.digestsize	= GHASH_DIGEST_SIZE,
-		.statesize = sizeof(struct ghash_desc_ctx),
-		.base = {
-			.cra_name		= "ghash",
-			.cra_driver_name	= "ghash-clmulni",
-			.cra_priority		= 400,
-			.cra_ctxsize		= sizeof(struct ghash_async_ctx),
-			.cra_flags		= CRYPTO_ALG_ASYNC,
-			.cra_blocksize		= GHASH_BLOCK_SIZE,
-			.cra_module		= THIS_MODULE,
-			.cra_init		= ghash_async_init_tfm,
-			.cra_exit		= ghash_async_exit_tfm,
-		},
-	},
-};
-
 static const struct x86_cpu_id pcmul_cpu_id[] = {
 	X86_MATCH_FEATURE(X86_FEATURE_PCLMULQDQ, NULL), /* Pickle-Mickle-Duck */
 	{}
@@ -350,29 +144,14 @@ MODULE_DEVICE_TABLE(x86cpu, pcmul_cpu_id);
 
 static int __init ghash_pclmulqdqni_mod_init(void)
 {
-	int err;
-
 	if (!x86_match_cpu(pcmul_cpu_id))
 		return -ENODEV;
 
-	err = crypto_register_shash(&ghash_alg);
-	if (err)
-		goto err_out;
-	err = crypto_register_ahash(&ghash_async_alg);
-	if (err)
-		goto err_shash;
-
-	return 0;
-
-err_shash:
-	crypto_unregister_shash(&ghash_alg);
-err_out:
-	return err;
+	return crypto_register_shash(&ghash_alg);
 }
 
 static void __exit ghash_pclmulqdqni_mod_exit(void)
 {
-	crypto_unregister_ahash(&ghash_async_alg);
 	crypto_unregister_shash(&ghash_alg);
 }
 
diff --git a/arch/x86/crypto/poly1305_glue.c b/arch/x86/crypto/poly1305_glue.c
deleted file mode 100644
index 08ff4b489f7e..000000000000
--- a/arch/x86/crypto/poly1305_glue.c
+++ /dev/null
@@ -1,290 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0 OR MIT
-/*
- * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
- */
-
-#include <crypto/algapi.h>
-#include <crypto/internal/hash.h>
-#include <crypto/internal/poly1305.h>
-#include <crypto/internal/simd.h>
-#include <linux/crypto.h>
-#include <linux/jump_label.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/sizes.h>
-#include <asm/cpu_device_id.h>
-#include <asm/simd.h>
-
-asmlinkage void poly1305_init_x86_64(void *ctx,
-				     const u8 key[POLY1305_BLOCK_SIZE]);
-asmlinkage void poly1305_blocks_x86_64(void *ctx, const u8 *inp,
-				       const size_t len, const u32 padbit);
-asmlinkage void poly1305_emit_x86_64(void *ctx, u8 mac[POLY1305_DIGEST_SIZE],
-				     const u32 nonce[4]);
-asmlinkage void poly1305_emit_avx(void *ctx, u8 mac[POLY1305_DIGEST_SIZE],
-				  const u32 nonce[4]);
-asmlinkage void poly1305_blocks_avx(void *ctx, const u8 *inp, const size_t len,
-				    const u32 padbit);
-asmlinkage void poly1305_blocks_avx2(void *ctx, const u8 *inp, const size_t len,
-				     const u32 padbit);
-asmlinkage void poly1305_blocks_avx512(void *ctx, const u8 *inp,
-				       const size_t len, const u32 padbit);
-
-static __ro_after_init DEFINE_STATIC_KEY_FALSE(poly1305_use_avx);
-static __ro_after_init DEFINE_STATIC_KEY_FALSE(poly1305_use_avx2);
-static __ro_after_init DEFINE_STATIC_KEY_FALSE(poly1305_use_avx512);
-
-struct poly1305_arch_internal {
-	union {
-		struct {
-			u32 h[5];
-			u32 is_base2_26;
-		};
-		u64 hs[3];
-	};
-	u64 r[2];
-	u64 pad;
-	struct { u32 r2, r1, r4, r3; } rn[9];
-};
-
-/* The AVX code uses base 2^26, while the scalar code uses base 2^64. If we hit
- * the unfortunate situation of using AVX and then having to go back to scalar
- * -- because the user is silly and has called the update function from two
- * separate contexts -- then we need to convert back to the original base before
- * proceeding. It is possible to reason that the initial reduction below is
- * sufficient given the implementation invariants. However, for an avoidance of
- * doubt and because this is not performance critical, we do the full reduction
- * anyway. Z3 proof of below function: https://xn--4db.cc/ltPtHCKN/py
- */
-static void convert_to_base2_64(void *ctx)
-{
-	struct poly1305_arch_internal *state = ctx;
-	u32 cy;
-
-	if (!state->is_base2_26)
-		return;
-
-	cy = state->h[0] >> 26; state->h[0] &= 0x3ffffff; state->h[1] += cy;
-	cy = state->h[1] >> 26; state->h[1] &= 0x3ffffff; state->h[2] += cy;
-	cy = state->h[2] >> 26; state->h[2] &= 0x3ffffff; state->h[3] += cy;
-	cy = state->h[3] >> 26; state->h[3] &= 0x3ffffff; state->h[4] += cy;
-	state->hs[0] = ((u64)state->h[2] << 52) | ((u64)state->h[1] << 26) | state->h[0];
-	state->hs[1] = ((u64)state->h[4] << 40) | ((u64)state->h[3] << 14) | (state->h[2] >> 12);
-	state->hs[2] = state->h[4] >> 24;
-#define ULT(a, b) ((a ^ ((a ^ b) | ((a - b) ^ b))) >> (sizeof(a) * 8 - 1))
-	cy = (state->hs[2] >> 2) + (state->hs[2] & ~3ULL);
-	state->hs[2] &= 3;
-	state->hs[0] += cy;
-	state->hs[1] += (cy = ULT(state->hs[0], cy));
-	state->hs[2] += ULT(state->hs[1], cy);
-#undef ULT
-	state->is_base2_26 = 0;
-}
-
-static void poly1305_simd_init(void *ctx, const u8 key[POLY1305_BLOCK_SIZE])
-{
-	poly1305_init_x86_64(ctx, key);
-}
-
-static void poly1305_simd_blocks(void *ctx, const u8 *inp, size_t len,
-				 const u32 padbit)
-{
-	struct poly1305_arch_internal *state = ctx;
-
-	/* SIMD disables preemption, so relax after processing each page. */
-	BUILD_BUG_ON(SZ_4K < POLY1305_BLOCK_SIZE ||
-		     SZ_4K % POLY1305_BLOCK_SIZE);
-
-	if (!static_branch_likely(&poly1305_use_avx) ||
-	    (len < (POLY1305_BLOCK_SIZE * 18) && !state->is_base2_26) ||
-	    !crypto_simd_usable()) {
-		convert_to_base2_64(ctx);
-		poly1305_blocks_x86_64(ctx, inp, len, padbit);
-		return;
-	}
-
-	do {
-		const size_t bytes = min_t(size_t, len, SZ_4K);
-
-		kernel_fpu_begin();
-		if (IS_ENABLED(CONFIG_AS_AVX512) && static_branch_likely(&poly1305_use_avx512))
-			poly1305_blocks_avx512(ctx, inp, bytes, padbit);
-		else if (static_branch_likely(&poly1305_use_avx2))
-			poly1305_blocks_avx2(ctx, inp, bytes, padbit);
-		else
-			poly1305_blocks_avx(ctx, inp, bytes, padbit);
-		kernel_fpu_end();
-
-		len -= bytes;
-		inp += bytes;
-	} while (len);
-}
-
-static void poly1305_simd_emit(void *ctx, u8 mac[POLY1305_DIGEST_SIZE],
-			       const u32 nonce[4])
-{
-	if (!static_branch_likely(&poly1305_use_avx))
-		poly1305_emit_x86_64(ctx, mac, nonce);
-	else
-		poly1305_emit_avx(ctx, mac, nonce);
-}
-
-void poly1305_init_arch(struct poly1305_desc_ctx *dctx, const u8 key[POLY1305_KEY_SIZE])
-{
-	poly1305_simd_init(&dctx->h, key);
-	dctx->s[0] = get_unaligned_le32(&key[16]);
-	dctx->s[1] = get_unaligned_le32(&key[20]);
-	dctx->s[2] = get_unaligned_le32(&key[24]);
-	dctx->s[3] = get_unaligned_le32(&key[28]);
-	dctx->buflen = 0;
-	dctx->sset = true;
-}
-EXPORT_SYMBOL(poly1305_init_arch);
-
-static unsigned int crypto_poly1305_setdctxkey(struct poly1305_desc_ctx *dctx,
-					       const u8 *inp, unsigned int len)
-{
-	unsigned int acc = 0;
-	if (unlikely(!dctx->sset)) {
-		if (!dctx->rset && len >= POLY1305_BLOCK_SIZE) {
-			poly1305_simd_init(&dctx->h, inp);
-			inp += POLY1305_BLOCK_SIZE;
-			len -= POLY1305_BLOCK_SIZE;
-			acc += POLY1305_BLOCK_SIZE;
-			dctx->rset = 1;
-		}
-		if (len >= POLY1305_BLOCK_SIZE) {
-			dctx->s[0] = get_unaligned_le32(&inp[0]);
-			dctx->s[1] = get_unaligned_le32(&inp[4]);
-			dctx->s[2] = get_unaligned_le32(&inp[8]);
-			dctx->s[3] = get_unaligned_le32(&inp[12]);
-			acc += POLY1305_BLOCK_SIZE;
-			dctx->sset = true;
-		}
-	}
-	return acc;
-}
-
-void poly1305_update_arch(struct poly1305_desc_ctx *dctx, const u8 *src,
-			  unsigned int srclen)
-{
-	unsigned int bytes, used;
-
-	if (unlikely(dctx->buflen)) {
-		bytes = min(srclen, POLY1305_BLOCK_SIZE - dctx->buflen);
-		memcpy(dctx->buf + dctx->buflen, src, bytes);
-		src += bytes;
-		srclen -= bytes;
-		dctx->buflen += bytes;
-
-		if (dctx->buflen == POLY1305_BLOCK_SIZE) {
-			if (likely(!crypto_poly1305_setdctxkey(dctx, dctx->buf, POLY1305_BLOCK_SIZE)))
-				poly1305_simd_blocks(&dctx->h, dctx->buf, POLY1305_BLOCK_SIZE, 1);
-			dctx->buflen = 0;
-		}
-	}
-
-	if (likely(srclen >= POLY1305_BLOCK_SIZE)) {
-		bytes = round_down(srclen, POLY1305_BLOCK_SIZE);
-		srclen -= bytes;
-		used = crypto_poly1305_setdctxkey(dctx, src, bytes);
-		if (likely(bytes - used))
-			poly1305_simd_blocks(&dctx->h, src + used, bytes - used, 1);
-		src += bytes;
-	}
-
-	if (unlikely(srclen)) {
-		dctx->buflen = srclen;
-		memcpy(dctx->buf, src, srclen);
-	}
-}
-EXPORT_SYMBOL(poly1305_update_arch);
-
-void poly1305_final_arch(struct poly1305_desc_ctx *dctx, u8 *dst)
-{
-	if (unlikely(dctx->buflen)) {
-		dctx->buf[dctx->buflen++] = 1;
-		memset(dctx->buf + dctx->buflen, 0,
-		       POLY1305_BLOCK_SIZE - dctx->buflen);
-		poly1305_simd_blocks(&dctx->h, dctx->buf, POLY1305_BLOCK_SIZE, 0);
-	}
-
-	poly1305_simd_emit(&dctx->h, dst, dctx->s);
-	memzero_explicit(dctx, sizeof(*dctx));
-}
-EXPORT_SYMBOL(poly1305_final_arch);
-
-static int crypto_poly1305_init(struct shash_desc *desc)
-{
-	struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
-
-	*dctx = (struct poly1305_desc_ctx){};
-	return 0;
-}
-
-static int crypto_poly1305_update(struct shash_desc *desc,
-				  const u8 *src, unsigned int srclen)
-{
-	struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
-
-	poly1305_update_arch(dctx, src, srclen);
-	return 0;
-}
-
-static int crypto_poly1305_final(struct shash_desc *desc, u8 *dst)
-{
-	struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
-
-	if (unlikely(!dctx->sset))
-		return -ENOKEY;
-
-	poly1305_final_arch(dctx, dst);
-	return 0;
-}
-
-static struct shash_alg alg = {
-	.digestsize	= POLY1305_DIGEST_SIZE,
-	.init		= crypto_poly1305_init,
-	.update		= crypto_poly1305_update,
-	.final		= crypto_poly1305_final,
-	.descsize	= sizeof(struct poly1305_desc_ctx),
-	.base		= {
-		.cra_name		= "poly1305",
-		.cra_driver_name	= "poly1305-simd",
-		.cra_priority		= 300,
-		.cra_blocksize		= POLY1305_BLOCK_SIZE,
-		.cra_module		= THIS_MODULE,
-	},
-};
-
-static int __init poly1305_simd_mod_init(void)
-{
-	if (boot_cpu_has(X86_FEATURE_AVX) &&
-	    cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL))
-		static_branch_enable(&poly1305_use_avx);
-	if (boot_cpu_has(X86_FEATURE_AVX) && boot_cpu_has(X86_FEATURE_AVX2) &&
-	    cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL))
-		static_branch_enable(&poly1305_use_avx2);
-	if (IS_ENABLED(CONFIG_AS_AVX512) && boot_cpu_has(X86_FEATURE_AVX) &&
-	    boot_cpu_has(X86_FEATURE_AVX2) && boot_cpu_has(X86_FEATURE_AVX512F) &&
-	    cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM | XFEATURE_MASK_AVX512, NULL) &&
-	    /* Skylake downclocks unacceptably much when using zmm, but later generations are fast. */
-	    boot_cpu_data.x86_vfm != INTEL_SKYLAKE_X)
-		static_branch_enable(&poly1305_use_avx512);
-	return IS_REACHABLE(CONFIG_CRYPTO_HASH) ? crypto_register_shash(&alg) : 0;
-}
-
-static void __exit poly1305_simd_mod_exit(void)
-{
-	if (IS_REACHABLE(CONFIG_CRYPTO_HASH))
-		crypto_unregister_shash(&alg);
-}
-
-module_init(poly1305_simd_mod_init);
-module_exit(poly1305_simd_mod_exit);
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Jason A. Donenfeld <Jason@zx2c4.com>");
-MODULE_DESCRIPTION("Poly1305 authenticator");
-MODULE_ALIAS_CRYPTO("poly1305");
-MODULE_ALIAS_CRYPTO("poly1305-simd");
diff --git a/arch/x86/crypto/polyval-clmulni_glue.c b/arch/x86/crypto/polyval-clmulni_glue.c
index 8fa58b0f3cb3..6b466867f91a 100644
--- a/arch/x86/crypto/polyval-clmulni_glue.c
+++ b/arch/x86/crypto/polyval-clmulni_glue.c
@@ -16,16 +16,15 @@
  * operations.
  */
 
-#include <crypto/algapi.h>
+#include <asm/cpu_device_id.h>
+#include <asm/fpu/api.h>
 #include <crypto/internal/hash.h>
-#include <crypto/internal/simd.h>
 #include <crypto/polyval.h>
-#include <linux/crypto.h>
-#include <linux/init.h>
+#include <crypto/utils.h>
+#include <linux/errno.h>
 #include <linux/kernel.h>
 #include <linux/module.h>
-#include <asm/cpu_device_id.h>
-#include <asm/simd.h>
+#include <linux/string.h>
 
 #define POLYVAL_ALIGN	16
 #define POLYVAL_ALIGN_ATTR __aligned(POLYVAL_ALIGN)
@@ -42,7 +41,6 @@ struct polyval_tfm_ctx {
 
 struct polyval_desc_ctx {
 	u8 buffer[POLYVAL_BLOCK_SIZE];
-	u32 bytes;
 };
 
 asmlinkage void clmul_polyval_update(const struct polyval_tfm_ctx *keys,
@@ -57,25 +55,16 @@ static inline struct polyval_tfm_ctx *polyval_tfm_ctx(struct crypto_shash *tfm)
 static void internal_polyval_update(const struct polyval_tfm_ctx *keys,
 	const u8 *in, size_t nblocks, u8 *accumulator)
 {
-	if (likely(crypto_simd_usable())) {
-		kernel_fpu_begin();
-		clmul_polyval_update(keys, in, nblocks, accumulator);
-		kernel_fpu_end();
-	} else {
-		polyval_update_non4k(keys->key_powers[NUM_KEY_POWERS-1], in,
-			nblocks, accumulator);
-	}
+	kernel_fpu_begin();
+	clmul_polyval_update(keys, in, nblocks, accumulator);
+	kernel_fpu_end();
 }
 
 static void internal_polyval_mul(u8 *op1, const u8 *op2)
 {
-	if (likely(crypto_simd_usable())) {
-		kernel_fpu_begin();
-		clmul_polyval_mul(op1, op2);
-		kernel_fpu_end();
-	} else {
-		polyval_mul_non4k(op1, op2);
-	}
+	kernel_fpu_begin();
+	clmul_polyval_mul(op1, op2);
+	kernel_fpu_end();
 }
 
 static int polyval_x86_setkey(struct crypto_shash *tfm,
@@ -112,49 +101,27 @@ static int polyval_x86_update(struct shash_desc *desc,
 {
 	struct polyval_desc_ctx *dctx = shash_desc_ctx(desc);
 	const struct polyval_tfm_ctx *tctx = polyval_tfm_ctx(desc->tfm);
-	u8 *pos;
 	unsigned int nblocks;
-	unsigned int n;
-
-	if (dctx->bytes) {
-		n = min(srclen, dctx->bytes);
-		pos = dctx->buffer + POLYVAL_BLOCK_SIZE - dctx->bytes;
-
-		dctx->bytes -= n;
-		srclen -= n;
-
-		while (n--)
-			*pos++ ^= *src++;
 
-		if (!dctx->bytes)
-			internal_polyval_mul(dctx->buffer,
-					    tctx->key_powers[NUM_KEY_POWERS-1]);
-	}
-
-	while (srclen >= POLYVAL_BLOCK_SIZE) {
+	do {
 		/* Allow rescheduling every 4K bytes. */
 		nblocks = min(srclen, 4096U) / POLYVAL_BLOCK_SIZE;
 		internal_polyval_update(tctx, src, nblocks, dctx->buffer);
 		srclen -= nblocks * POLYVAL_BLOCK_SIZE;
 		src += nblocks * POLYVAL_BLOCK_SIZE;
-	}
+	} while (srclen >= POLYVAL_BLOCK_SIZE);
 
-	if (srclen) {
-		dctx->bytes = POLYVAL_BLOCK_SIZE - srclen;
-		pos = dctx->buffer;
-		while (srclen--)
-			*pos++ ^= *src++;
-	}
-
-	return 0;
+	return srclen;
 }
 
-static int polyval_x86_final(struct shash_desc *desc, u8 *dst)
+static int polyval_x86_finup(struct shash_desc *desc, const u8 *src,
+			     unsigned int len, u8 *dst)
 {
 	struct polyval_desc_ctx *dctx = shash_desc_ctx(desc);
 	const struct polyval_tfm_ctx *tctx = polyval_tfm_ctx(desc->tfm);
 
-	if (dctx->bytes) {
+	if (len) {
+		crypto_xor(dctx->buffer, src, len);
 		internal_polyval_mul(dctx->buffer,
 				     tctx->key_powers[NUM_KEY_POWERS-1]);
 	}
@@ -168,13 +135,14 @@ static struct shash_alg polyval_alg = {
 	.digestsize	= POLYVAL_DIGEST_SIZE,
 	.init		= polyval_x86_init,
 	.update		= polyval_x86_update,
-	.final		= polyval_x86_final,
+	.finup		= polyval_x86_finup,
 	.setkey		= polyval_x86_setkey,
 	.descsize	= sizeof(struct polyval_desc_ctx),
 	.base		= {
 		.cra_name		= "polyval",
 		.cra_driver_name	= "polyval-clmulni",
 		.cra_priority		= 200,
+		.cra_flags		= CRYPTO_AHASH_ALG_BLOCK_ONLY,
 		.cra_blocksize		= POLYVAL_BLOCK_SIZE,
 		.cra_ctxsize		= POLYVAL_CTX_SIZE,
 		.cra_module		= THIS_MODULE,
diff --git a/arch/x86/crypto/serpent_avx2_glue.c b/arch/x86/crypto/serpent_avx2_glue.c
index 347e97f4b713..f5f2121b7956 100644
--- a/arch/x86/crypto/serpent_avx2_glue.c
+++ b/arch/x86/crypto/serpent_avx2_glue.c
@@ -10,7 +10,6 @@
 #include <linux/crypto.h>
 #include <linux/err.h>
 #include <crypto/algapi.h>
-#include <crypto/internal/simd.h>
 #include <crypto/serpent.h>
 
 #include "serpent-avx.h"
@@ -65,10 +64,9 @@ static int cbc_decrypt(struct skcipher_request *req)
 
 static struct skcipher_alg serpent_algs[] = {
 	{
-		.base.cra_name		= "__ecb(serpent)",
-		.base.cra_driver_name	= "__ecb-serpent-avx2",
+		.base.cra_name		= "ecb(serpent)",
+		.base.cra_driver_name	= "ecb-serpent-avx2",
 		.base.cra_priority	= 600,
-		.base.cra_flags		= CRYPTO_ALG_INTERNAL,
 		.base.cra_blocksize	= SERPENT_BLOCK_SIZE,
 		.base.cra_ctxsize	= sizeof(struct serpent_ctx),
 		.base.cra_module	= THIS_MODULE,
@@ -78,10 +76,9 @@ static struct skcipher_alg serpent_algs[] = {
 		.encrypt		= ecb_encrypt,
 		.decrypt		= ecb_decrypt,
 	}, {
-		.base.cra_name		= "__cbc(serpent)",
-		.base.cra_driver_name	= "__cbc-serpent-avx2",
+		.base.cra_name		= "cbc(serpent)",
+		.base.cra_driver_name	= "cbc-serpent-avx2",
 		.base.cra_priority	= 600,
-		.base.cra_flags		= CRYPTO_ALG_INTERNAL,
 		.base.cra_blocksize	= SERPENT_BLOCK_SIZE,
 		.base.cra_ctxsize	= sizeof(struct serpent_ctx),
 		.base.cra_module	= THIS_MODULE,
@@ -94,8 +91,6 @@ static struct skcipher_alg serpent_algs[] = {
 	},
 };
 
-static struct simd_skcipher_alg *serpent_simd_algs[ARRAY_SIZE(serpent_algs)];
-
 static int __init serpent_avx2_init(void)
 {
 	const char *feature_name;
@@ -110,15 +105,13 @@ static int __init serpent_avx2_init(void)
 		return -ENODEV;
 	}
 
-	return simd_register_skciphers_compat(serpent_algs,
-					      ARRAY_SIZE(serpent_algs),
-					      serpent_simd_algs);
+	return crypto_register_skciphers(serpent_algs,
+					 ARRAY_SIZE(serpent_algs));
 }
 
 static void __exit serpent_avx2_fini(void)
 {
-	simd_unregister_skciphers(serpent_algs, ARRAY_SIZE(serpent_algs),
-				  serpent_simd_algs);
+	crypto_unregister_skciphers(serpent_algs, ARRAY_SIZE(serpent_algs));
 }
 
 module_init(serpent_avx2_init);
diff --git a/arch/x86/crypto/serpent_avx_glue.c b/arch/x86/crypto/serpent_avx_glue.c
index 6c248e1ea4ef..e640abc1cb8a 100644
--- a/arch/x86/crypto/serpent_avx_glue.c
+++ b/arch/x86/crypto/serpent_avx_glue.c
@@ -13,7 +13,6 @@
 #include <linux/crypto.h>
 #include <linux/err.h>
 #include <crypto/algapi.h>
-#include <crypto/internal/simd.h>
 #include <crypto/serpent.h>
 
 #include "serpent-avx.h"
@@ -71,10 +70,9 @@ static int cbc_decrypt(struct skcipher_request *req)
 
 static struct skcipher_alg serpent_algs[] = {
 	{
-		.base.cra_name		= "__ecb(serpent)",
-		.base.cra_driver_name	= "__ecb-serpent-avx",
+		.base.cra_name		= "ecb(serpent)",
+		.base.cra_driver_name	= "ecb-serpent-avx",
 		.base.cra_priority	= 500,
-		.base.cra_flags		= CRYPTO_ALG_INTERNAL,
 		.base.cra_blocksize	= SERPENT_BLOCK_SIZE,
 		.base.cra_ctxsize	= sizeof(struct serpent_ctx),
 		.base.cra_module	= THIS_MODULE,
@@ -84,10 +82,9 @@ static struct skcipher_alg serpent_algs[] = {
 		.encrypt		= ecb_encrypt,
 		.decrypt		= ecb_decrypt,
 	}, {
-		.base.cra_name		= "__cbc(serpent)",
-		.base.cra_driver_name	= "__cbc-serpent-avx",
+		.base.cra_name		= "cbc(serpent)",
+		.base.cra_driver_name	= "cbc-serpent-avx",
 		.base.cra_priority	= 500,
-		.base.cra_flags		= CRYPTO_ALG_INTERNAL,
 		.base.cra_blocksize	= SERPENT_BLOCK_SIZE,
 		.base.cra_ctxsize	= sizeof(struct serpent_ctx),
 		.base.cra_module	= THIS_MODULE,
@@ -100,8 +97,6 @@ static struct skcipher_alg serpent_algs[] = {
 	},
 };
 
-static struct simd_skcipher_alg *serpent_simd_algs[ARRAY_SIZE(serpent_algs)];
-
 static int __init serpent_init(void)
 {
 	const char *feature_name;
@@ -112,15 +107,13 @@ static int __init serpent_init(void)
 		return -ENODEV;
 	}
 
-	return simd_register_skciphers_compat(serpent_algs,
-					      ARRAY_SIZE(serpent_algs),
-					      serpent_simd_algs);
+	return crypto_register_skciphers(serpent_algs,
+					 ARRAY_SIZE(serpent_algs));
 }
 
 static void __exit serpent_exit(void)
 {
-	simd_unregister_skciphers(serpent_algs, ARRAY_SIZE(serpent_algs),
-				  serpent_simd_algs);
+	crypto_unregister_skciphers(serpent_algs, ARRAY_SIZE(serpent_algs));
 }
 
 module_init(serpent_init);
diff --git a/arch/x86/crypto/serpent_sse2_glue.c b/arch/x86/crypto/serpent_sse2_glue.c
index d78f37e9b2cf..80ee17ec21b4 100644
--- a/arch/x86/crypto/serpent_sse2_glue.c
+++ b/arch/x86/crypto/serpent_sse2_glue.c
@@ -18,7 +18,6 @@
 #include <linux/err.h>
 #include <crypto/algapi.h>
 #include <crypto/b128ops.h>
-#include <crypto/internal/simd.h>
 #include <crypto/serpent.h>
 
 #include "serpent-sse2.h"
@@ -74,10 +73,9 @@ static int cbc_decrypt(struct skcipher_request *req)
 
 static struct skcipher_alg serpent_algs[] = {
 	{
-		.base.cra_name		= "__ecb(serpent)",
-		.base.cra_driver_name	= "__ecb-serpent-sse2",
+		.base.cra_name		= "ecb(serpent)",
+		.base.cra_driver_name	= "ecb-serpent-sse2",
 		.base.cra_priority	= 400,
-		.base.cra_flags		= CRYPTO_ALG_INTERNAL,
 		.base.cra_blocksize	= SERPENT_BLOCK_SIZE,
 		.base.cra_ctxsize	= sizeof(struct serpent_ctx),
 		.base.cra_module	= THIS_MODULE,
@@ -87,10 +85,9 @@ static struct skcipher_alg serpent_algs[] = {
 		.encrypt		= ecb_encrypt,
 		.decrypt		= ecb_decrypt,
 	}, {
-		.base.cra_name		= "__cbc(serpent)",
-		.base.cra_driver_name	= "__cbc-serpent-sse2",
+		.base.cra_name		= "cbc(serpent)",
+		.base.cra_driver_name	= "cbc-serpent-sse2",
 		.base.cra_priority	= 400,
-		.base.cra_flags		= CRYPTO_ALG_INTERNAL,
 		.base.cra_blocksize	= SERPENT_BLOCK_SIZE,
 		.base.cra_ctxsize	= sizeof(struct serpent_ctx),
 		.base.cra_module	= THIS_MODULE,
@@ -103,8 +100,6 @@ static struct skcipher_alg serpent_algs[] = {
 	},
 };
 
-static struct simd_skcipher_alg *serpent_simd_algs[ARRAY_SIZE(serpent_algs)];
-
 static int __init serpent_sse2_init(void)
 {
 	if (!boot_cpu_has(X86_FEATURE_XMM2)) {
@@ -112,15 +107,13 @@ static int __init serpent_sse2_init(void)
 		return -ENODEV;
 	}
 
-	return simd_register_skciphers_compat(serpent_algs,
-					      ARRAY_SIZE(serpent_algs),
-					      serpent_simd_algs);
+	return crypto_register_skciphers(serpent_algs,
+					 ARRAY_SIZE(serpent_algs));
 }
 
 static void __exit serpent_sse2_exit(void)
 {
-	simd_unregister_skciphers(serpent_algs, ARRAY_SIZE(serpent_algs),
-				  serpent_simd_algs);
+	crypto_unregister_skciphers(serpent_algs, ARRAY_SIZE(serpent_algs));
 }
 
 module_init(serpent_sse2_init);
diff --git a/arch/x86/crypto/sha1_ssse3_glue.c b/arch/x86/crypto/sha1_ssse3_glue.c
index ab8bc54f254d..0a912bfc86c5 100644
--- a/arch/x86/crypto/sha1_ssse3_glue.c
+++ b/arch/x86/crypto/sha1_ssse3_glue.c
@@ -16,21 +16,17 @@
 
 #define pr_fmt(fmt)	KBUILD_MODNAME ": " fmt
 
+#include <asm/cpu_device_id.h>
+#include <asm/simd.h>
 #include <crypto/internal/hash.h>
-#include <crypto/internal/simd.h>
-#include <linux/init.h>
-#include <linux/module.h>
-#include <linux/mm.h>
-#include <linux/types.h>
 #include <crypto/sha1.h>
 #include <crypto/sha1_base.h>
-#include <asm/cpu_device_id.h>
-#include <asm/simd.h>
+#include <linux/errno.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
 
 static const struct x86_cpu_id module_cpu_ids[] = {
-#ifdef CONFIG_AS_SHA1_NI
 	X86_MATCH_FEATURE(X86_FEATURE_SHA_NI, NULL),
-#endif
 	X86_MATCH_FEATURE(X86_FEATURE_AVX2, NULL),
 	X86_MATCH_FEATURE(X86_FEATURE_AVX, NULL),
 	X86_MATCH_FEATURE(X86_FEATURE_SSSE3, NULL),
@@ -38,14 +34,10 @@ static const struct x86_cpu_id module_cpu_ids[] = {
 };
 MODULE_DEVICE_TABLE(x86cpu, module_cpu_ids);
 
-static int sha1_update(struct shash_desc *desc, const u8 *data,
-			     unsigned int len, sha1_block_fn *sha1_xform)
+static inline int sha1_update(struct shash_desc *desc, const u8 *data,
+			      unsigned int len, sha1_block_fn *sha1_xform)
 {
-	struct sha1_state *sctx = shash_desc_ctx(desc);
-
-	if (!crypto_simd_usable() ||
-	    (sctx->count % SHA1_BLOCK_SIZE) + len < SHA1_BLOCK_SIZE)
-		return crypto_sha1_update(desc, data, len);
+	int remain;
 
 	/*
 	 * Make sure struct sha1_state begins directly with the SHA1
@@ -54,22 +46,18 @@ static int sha1_update(struct shash_desc *desc, const u8 *data,
 	BUILD_BUG_ON(offsetof(struct sha1_state, state) != 0);
 
 	kernel_fpu_begin();
-	sha1_base_do_update(desc, data, len, sha1_xform);
+	remain = sha1_base_do_update_blocks(desc, data, len, sha1_xform);
 	kernel_fpu_end();
 
-	return 0;
+	return remain;
 }
 
-static int sha1_finup(struct shash_desc *desc, const u8 *data,
-		      unsigned int len, u8 *out, sha1_block_fn *sha1_xform)
+static inline int sha1_finup(struct shash_desc *desc, const u8 *data,
+			     unsigned int len, u8 *out,
+			     sha1_block_fn *sha1_xform)
 {
-	if (!crypto_simd_usable())
-		return crypto_sha1_finup(desc, data, len, out);
-
 	kernel_fpu_begin();
-	if (len)
-		sha1_base_do_update(desc, data, len, sha1_xform);
-	sha1_base_do_finalize(desc, sha1_xform);
+	sha1_base_do_finup(desc, data, len, sha1_xform);
 	kernel_fpu_end();
 
 	return sha1_base_finish(desc, out);
@@ -90,23 +78,17 @@ static int sha1_ssse3_finup(struct shash_desc *desc, const u8 *data,
 	return sha1_finup(desc, data, len, out, sha1_transform_ssse3);
 }
 
-/* Add padding and return the message digest. */
-static int sha1_ssse3_final(struct shash_desc *desc, u8 *out)
-{
-	return sha1_ssse3_finup(desc, NULL, 0, out);
-}
-
 static struct shash_alg sha1_ssse3_alg = {
 	.digestsize	=	SHA1_DIGEST_SIZE,
 	.init		=	sha1_base_init,
 	.update		=	sha1_ssse3_update,
-	.final		=	sha1_ssse3_final,
 	.finup		=	sha1_ssse3_finup,
-	.descsize	=	sizeof(struct sha1_state),
+	.descsize	=	SHA1_STATE_SIZE,
 	.base		=	{
 		.cra_name	=	"sha1",
 		.cra_driver_name =	"sha1-ssse3",
 		.cra_priority	=	150,
+		.cra_flags	=	CRYPTO_AHASH_ALG_BLOCK_ONLY,
 		.cra_blocksize	=	SHA1_BLOCK_SIZE,
 		.cra_module	=	THIS_MODULE,
 	}
@@ -140,22 +122,17 @@ static int sha1_avx_finup(struct shash_desc *desc, const u8 *data,
 	return sha1_finup(desc, data, len, out, sha1_transform_avx);
 }
 
-static int sha1_avx_final(struct shash_desc *desc, u8 *out)
-{
-	return sha1_avx_finup(desc, NULL, 0, out);
-}
-
 static struct shash_alg sha1_avx_alg = {
 	.digestsize	=	SHA1_DIGEST_SIZE,
 	.init		=	sha1_base_init,
 	.update		=	sha1_avx_update,
-	.final		=	sha1_avx_final,
 	.finup		=	sha1_avx_finup,
-	.descsize	=	sizeof(struct sha1_state),
+	.descsize	=	SHA1_STATE_SIZE,
 	.base		=	{
 		.cra_name	=	"sha1",
 		.cra_driver_name =	"sha1-avx",
 		.cra_priority	=	160,
+		.cra_flags	=	CRYPTO_AHASH_ALG_BLOCK_ONLY,
 		.cra_blocksize	=	SHA1_BLOCK_SIZE,
 		.cra_module	=	THIS_MODULE,
 	}
@@ -200,8 +177,8 @@ static bool avx2_usable(void)
 	return false;
 }
 
-static void sha1_apply_transform_avx2(struct sha1_state *state,
-				      const u8 *data, int blocks)
+static inline void sha1_apply_transform_avx2(struct sha1_state *state,
+					     const u8 *data, int blocks)
 {
 	/* Select the optimal transform based on data block size */
 	if (blocks >= SHA1_AVX2_BLOCK_OPTSIZE)
@@ -222,22 +199,17 @@ static int sha1_avx2_finup(struct shash_desc *desc, const u8 *data,
 	return sha1_finup(desc, data, len, out, sha1_apply_transform_avx2);
 }
 
-static int sha1_avx2_final(struct shash_desc *desc, u8 *out)
-{
-	return sha1_avx2_finup(desc, NULL, 0, out);
-}
-
 static struct shash_alg sha1_avx2_alg = {
 	.digestsize	=	SHA1_DIGEST_SIZE,
 	.init		=	sha1_base_init,
 	.update		=	sha1_avx2_update,
-	.final		=	sha1_avx2_final,
 	.finup		=	sha1_avx2_finup,
-	.descsize	=	sizeof(struct sha1_state),
+	.descsize	=	SHA1_STATE_SIZE,
 	.base		=	{
 		.cra_name	=	"sha1",
 		.cra_driver_name =	"sha1-avx2",
 		.cra_priority	=	170,
+		.cra_flags	=	CRYPTO_AHASH_ALG_BLOCK_ONLY,
 		.cra_blocksize	=	SHA1_BLOCK_SIZE,
 		.cra_module	=	THIS_MODULE,
 	}
@@ -256,7 +228,6 @@ static void unregister_sha1_avx2(void)
 		crypto_unregister_shash(&sha1_avx2_alg);
 }
 
-#ifdef CONFIG_AS_SHA1_NI
 asmlinkage void sha1_ni_transform(struct sha1_state *digest, const u8 *data,
 				  int rounds);
 
@@ -272,22 +243,17 @@ static int sha1_ni_finup(struct shash_desc *desc, const u8 *data,
 	return sha1_finup(desc, data, len, out, sha1_ni_transform);
 }
 
-static int sha1_ni_final(struct shash_desc *desc, u8 *out)
-{
-	return sha1_ni_finup(desc, NULL, 0, out);
-}
-
 static struct shash_alg sha1_ni_alg = {
 	.digestsize	=	SHA1_DIGEST_SIZE,
 	.init		=	sha1_base_init,
 	.update		=	sha1_ni_update,
-	.final		=	sha1_ni_final,
 	.finup		=	sha1_ni_finup,
-	.descsize	=	sizeof(struct sha1_state),
+	.descsize	=	SHA1_STATE_SIZE,
 	.base		=	{
 		.cra_name	=	"sha1",
 		.cra_driver_name =	"sha1-ni",
 		.cra_priority	=	250,
+		.cra_flags	=	CRYPTO_AHASH_ALG_BLOCK_ONLY,
 		.cra_blocksize	=	SHA1_BLOCK_SIZE,
 		.cra_module	=	THIS_MODULE,
 	}
@@ -306,11 +272,6 @@ static void unregister_sha1_ni(void)
 		crypto_unregister_shash(&sha1_ni_alg);
 }
 
-#else
-static inline int register_sha1_ni(void) { return 0; }
-static inline void unregister_sha1_ni(void) { }
-#endif
-
 static int __init sha1_ssse3_mod_init(void)
 {
 	if (!x86_match_cpu(module_cpu_ids))
@@ -360,6 +321,4 @@ MODULE_ALIAS_CRYPTO("sha1");
 MODULE_ALIAS_CRYPTO("sha1-ssse3");
 MODULE_ALIAS_CRYPTO("sha1-avx");
 MODULE_ALIAS_CRYPTO("sha1-avx2");
-#ifdef CONFIG_AS_SHA1_NI
 MODULE_ALIAS_CRYPTO("sha1-ni");
-#endif
diff --git a/arch/x86/crypto/sha256_ssse3_glue.c b/arch/x86/crypto/sha256_ssse3_glue.c
deleted file mode 100644
index e04a43d9f7d5..000000000000
--- a/arch/x86/crypto/sha256_ssse3_glue.c
+++ /dev/null
@@ -1,467 +0,0 @@
-/*
- * Cryptographic API.
- *
- * Glue code for the SHA256 Secure Hash Algorithm assembler implementations
- * using SSSE3, AVX, AVX2, and SHA-NI instructions.
- *
- * This file is based on sha256_generic.c
- *
- * Copyright (C) 2013 Intel Corporation.
- *
- * Author:
- *     Tim Chen <tim.c.chen@linux.intel.com>
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License as published by the Free
- * Software Foundation; either version 2 of the License, or (at your option)
- * any later version.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-
-#define pr_fmt(fmt)	KBUILD_MODNAME ": " fmt
-
-#include <crypto/internal/hash.h>
-#include <crypto/internal/simd.h>
-#include <linux/init.h>
-#include <linux/module.h>
-#include <linux/mm.h>
-#include <linux/types.h>
-#include <crypto/sha2.h>
-#include <crypto/sha256_base.h>
-#include <linux/string.h>
-#include <asm/cpu_device_id.h>
-#include <asm/simd.h>
-
-asmlinkage void sha256_transform_ssse3(struct sha256_state *state,
-				       const u8 *data, int blocks);
-
-static const struct x86_cpu_id module_cpu_ids[] = {
-#ifdef CONFIG_AS_SHA256_NI
-	X86_MATCH_FEATURE(X86_FEATURE_SHA_NI, NULL),
-#endif
-	X86_MATCH_FEATURE(X86_FEATURE_AVX2, NULL),
-	X86_MATCH_FEATURE(X86_FEATURE_AVX, NULL),
-	X86_MATCH_FEATURE(X86_FEATURE_SSSE3, NULL),
-	{}
-};
-MODULE_DEVICE_TABLE(x86cpu, module_cpu_ids);
-
-static int _sha256_update(struct shash_desc *desc, const u8 *data,
-			  unsigned int len, sha256_block_fn *sha256_xform)
-{
-	struct sha256_state *sctx = shash_desc_ctx(desc);
-
-	if (!crypto_simd_usable() ||
-	    (sctx->count % SHA256_BLOCK_SIZE) + len < SHA256_BLOCK_SIZE)
-		return crypto_sha256_update(desc, data, len);
-
-	/*
-	 * Make sure struct sha256_state begins directly with the SHA256
-	 * 256-bit internal state, as this is what the asm functions expect.
-	 */
-	BUILD_BUG_ON(offsetof(struct sha256_state, state) != 0);
-
-	kernel_fpu_begin();
-	sha256_base_do_update(desc, data, len, sha256_xform);
-	kernel_fpu_end();
-
-	return 0;
-}
-
-static int sha256_finup(struct shash_desc *desc, const u8 *data,
-	      unsigned int len, u8 *out, sha256_block_fn *sha256_xform)
-{
-	if (!crypto_simd_usable())
-		return crypto_sha256_finup(desc, data, len, out);
-
-	kernel_fpu_begin();
-	if (len)
-		sha256_base_do_update(desc, data, len, sha256_xform);
-	sha256_base_do_finalize(desc, sha256_xform);
-	kernel_fpu_end();
-
-	return sha256_base_finish(desc, out);
-}
-
-static int sha256_ssse3_update(struct shash_desc *desc, const u8 *data,
-			 unsigned int len)
-{
-	return _sha256_update(desc, data, len, sha256_transform_ssse3);
-}
-
-static int sha256_ssse3_finup(struct shash_desc *desc, const u8 *data,
-	      unsigned int len, u8 *out)
-{
-	return sha256_finup(desc, data, len, out, sha256_transform_ssse3);
-}
-
-/* Add padding and return the message digest. */
-static int sha256_ssse3_final(struct shash_desc *desc, u8 *out)
-{
-	return sha256_ssse3_finup(desc, NULL, 0, out);
-}
-
-static int sha256_ssse3_digest(struct shash_desc *desc, const u8 *data,
-	      unsigned int len, u8 *out)
-{
-	return sha256_base_init(desc) ?:
-	       sha256_ssse3_finup(desc, data, len, out);
-}
-
-static struct shash_alg sha256_ssse3_algs[] = { {
-	.digestsize	=	SHA256_DIGEST_SIZE,
-	.init		=	sha256_base_init,
-	.update		=	sha256_ssse3_update,
-	.final		=	sha256_ssse3_final,
-	.finup		=	sha256_ssse3_finup,
-	.digest		=	sha256_ssse3_digest,
-	.descsize	=	sizeof(struct sha256_state),
-	.base		=	{
-		.cra_name	=	"sha256",
-		.cra_driver_name =	"sha256-ssse3",
-		.cra_priority	=	150,
-		.cra_blocksize	=	SHA256_BLOCK_SIZE,
-		.cra_module	=	THIS_MODULE,
-	}
-}, {
-	.digestsize	=	SHA224_DIGEST_SIZE,
-	.init		=	sha224_base_init,
-	.update		=	sha256_ssse3_update,
-	.final		=	sha256_ssse3_final,
-	.finup		=	sha256_ssse3_finup,
-	.descsize	=	sizeof(struct sha256_state),
-	.base		=	{
-		.cra_name	=	"sha224",
-		.cra_driver_name =	"sha224-ssse3",
-		.cra_priority	=	150,
-		.cra_blocksize	=	SHA224_BLOCK_SIZE,
-		.cra_module	=	THIS_MODULE,
-	}
-} };
-
-static int register_sha256_ssse3(void)
-{
-	if (boot_cpu_has(X86_FEATURE_SSSE3))
-		return crypto_register_shashes(sha256_ssse3_algs,
-				ARRAY_SIZE(sha256_ssse3_algs));
-	return 0;
-}
-
-static void unregister_sha256_ssse3(void)
-{
-	if (boot_cpu_has(X86_FEATURE_SSSE3))
-		crypto_unregister_shashes(sha256_ssse3_algs,
-				ARRAY_SIZE(sha256_ssse3_algs));
-}
-
-asmlinkage void sha256_transform_avx(struct sha256_state *state,
-				     const u8 *data, int blocks);
-
-static int sha256_avx_update(struct shash_desc *desc, const u8 *data,
-			 unsigned int len)
-{
-	return _sha256_update(desc, data, len, sha256_transform_avx);
-}
-
-static int sha256_avx_finup(struct shash_desc *desc, const u8 *data,
-		      unsigned int len, u8 *out)
-{
-	return sha256_finup(desc, data, len, out, sha256_transform_avx);
-}
-
-static int sha256_avx_final(struct shash_desc *desc, u8 *out)
-{
-	return sha256_avx_finup(desc, NULL, 0, out);
-}
-
-static int sha256_avx_digest(struct shash_desc *desc, const u8 *data,
-		      unsigned int len, u8 *out)
-{
-	return sha256_base_init(desc) ?:
-	       sha256_avx_finup(desc, data, len, out);
-}
-
-static struct shash_alg sha256_avx_algs[] = { {
-	.digestsize	=	SHA256_DIGEST_SIZE,
-	.init		=	sha256_base_init,
-	.update		=	sha256_avx_update,
-	.final		=	sha256_avx_final,
-	.finup		=	sha256_avx_finup,
-	.digest		=	sha256_avx_digest,
-	.descsize	=	sizeof(struct sha256_state),
-	.base		=	{
-		.cra_name	=	"sha256",
-		.cra_driver_name =	"sha256-avx",
-		.cra_priority	=	160,
-		.cra_blocksize	=	SHA256_BLOCK_SIZE,
-		.cra_module	=	THIS_MODULE,
-	}
-}, {
-	.digestsize	=	SHA224_DIGEST_SIZE,
-	.init		=	sha224_base_init,
-	.update		=	sha256_avx_update,
-	.final		=	sha256_avx_final,
-	.finup		=	sha256_avx_finup,
-	.descsize	=	sizeof(struct sha256_state),
-	.base		=	{
-		.cra_name	=	"sha224",
-		.cra_driver_name =	"sha224-avx",
-		.cra_priority	=	160,
-		.cra_blocksize	=	SHA224_BLOCK_SIZE,
-		.cra_module	=	THIS_MODULE,
-	}
-} };
-
-static bool avx_usable(void)
-{
-	if (!cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL)) {
-		if (boot_cpu_has(X86_FEATURE_AVX))
-			pr_info("AVX detected but unusable.\n");
-		return false;
-	}
-
-	return true;
-}
-
-static int register_sha256_avx(void)
-{
-	if (avx_usable())
-		return crypto_register_shashes(sha256_avx_algs,
-				ARRAY_SIZE(sha256_avx_algs));
-	return 0;
-}
-
-static void unregister_sha256_avx(void)
-{
-	if (avx_usable())
-		crypto_unregister_shashes(sha256_avx_algs,
-				ARRAY_SIZE(sha256_avx_algs));
-}
-
-asmlinkage void sha256_transform_rorx(struct sha256_state *state,
-				      const u8 *data, int blocks);
-
-static int sha256_avx2_update(struct shash_desc *desc, const u8 *data,
-			 unsigned int len)
-{
-	return _sha256_update(desc, data, len, sha256_transform_rorx);
-}
-
-static int sha256_avx2_finup(struct shash_desc *desc, const u8 *data,
-		      unsigned int len, u8 *out)
-{
-	return sha256_finup(desc, data, len, out, sha256_transform_rorx);
-}
-
-static int sha256_avx2_final(struct shash_desc *desc, u8 *out)
-{
-	return sha256_avx2_finup(desc, NULL, 0, out);
-}
-
-static int sha256_avx2_digest(struct shash_desc *desc, const u8 *data,
-		      unsigned int len, u8 *out)
-{
-	return sha256_base_init(desc) ?:
-	       sha256_avx2_finup(desc, data, len, out);
-}
-
-static struct shash_alg sha256_avx2_algs[] = { {
-	.digestsize	=	SHA256_DIGEST_SIZE,
-	.init		=	sha256_base_init,
-	.update		=	sha256_avx2_update,
-	.final		=	sha256_avx2_final,
-	.finup		=	sha256_avx2_finup,
-	.digest		=	sha256_avx2_digest,
-	.descsize	=	sizeof(struct sha256_state),
-	.base		=	{
-		.cra_name	=	"sha256",
-		.cra_driver_name =	"sha256-avx2",
-		.cra_priority	=	170,
-		.cra_blocksize	=	SHA256_BLOCK_SIZE,
-		.cra_module	=	THIS_MODULE,
-	}
-}, {
-	.digestsize	=	SHA224_DIGEST_SIZE,
-	.init		=	sha224_base_init,
-	.update		=	sha256_avx2_update,
-	.final		=	sha256_avx2_final,
-	.finup		=	sha256_avx2_finup,
-	.descsize	=	sizeof(struct sha256_state),
-	.base		=	{
-		.cra_name	=	"sha224",
-		.cra_driver_name =	"sha224-avx2",
-		.cra_priority	=	170,
-		.cra_blocksize	=	SHA224_BLOCK_SIZE,
-		.cra_module	=	THIS_MODULE,
-	}
-} };
-
-static bool avx2_usable(void)
-{
-	if (avx_usable() && boot_cpu_has(X86_FEATURE_AVX2) &&
-		    boot_cpu_has(X86_FEATURE_BMI2))
-		return true;
-
-	return false;
-}
-
-static int register_sha256_avx2(void)
-{
-	if (avx2_usable())
-		return crypto_register_shashes(sha256_avx2_algs,
-				ARRAY_SIZE(sha256_avx2_algs));
-	return 0;
-}
-
-static void unregister_sha256_avx2(void)
-{
-	if (avx2_usable())
-		crypto_unregister_shashes(sha256_avx2_algs,
-				ARRAY_SIZE(sha256_avx2_algs));
-}
-
-#ifdef CONFIG_AS_SHA256_NI
-asmlinkage void sha256_ni_transform(struct sha256_state *digest,
-				    const u8 *data, int rounds);
-
-static int sha256_ni_update(struct shash_desc *desc, const u8 *data,
-			 unsigned int len)
-{
-	return _sha256_update(desc, data, len, sha256_ni_transform);
-}
-
-static int sha256_ni_finup(struct shash_desc *desc, const u8 *data,
-		      unsigned int len, u8 *out)
-{
-	return sha256_finup(desc, data, len, out, sha256_ni_transform);
-}
-
-static int sha256_ni_final(struct shash_desc *desc, u8 *out)
-{
-	return sha256_ni_finup(desc, NULL, 0, out);
-}
-
-static int sha256_ni_digest(struct shash_desc *desc, const u8 *data,
-		      unsigned int len, u8 *out)
-{
-	return sha256_base_init(desc) ?:
-	       sha256_ni_finup(desc, data, len, out);
-}
-
-static struct shash_alg sha256_ni_algs[] = { {
-	.digestsize	=	SHA256_DIGEST_SIZE,
-	.init		=	sha256_base_init,
-	.update		=	sha256_ni_update,
-	.final		=	sha256_ni_final,
-	.finup		=	sha256_ni_finup,
-	.digest		=	sha256_ni_digest,
-	.descsize	=	sizeof(struct sha256_state),
-	.base		=	{
-		.cra_name	=	"sha256",
-		.cra_driver_name =	"sha256-ni",
-		.cra_priority	=	250,
-		.cra_blocksize	=	SHA256_BLOCK_SIZE,
-		.cra_module	=	THIS_MODULE,
-	}
-}, {
-	.digestsize	=	SHA224_DIGEST_SIZE,
-	.init		=	sha224_base_init,
-	.update		=	sha256_ni_update,
-	.final		=	sha256_ni_final,
-	.finup		=	sha256_ni_finup,
-	.descsize	=	sizeof(struct sha256_state),
-	.base		=	{
-		.cra_name	=	"sha224",
-		.cra_driver_name =	"sha224-ni",
-		.cra_priority	=	250,
-		.cra_blocksize	=	SHA224_BLOCK_SIZE,
-		.cra_module	=	THIS_MODULE,
-	}
-} };
-
-static int register_sha256_ni(void)
-{
-	if (boot_cpu_has(X86_FEATURE_SHA_NI))
-		return crypto_register_shashes(sha256_ni_algs,
-				ARRAY_SIZE(sha256_ni_algs));
-	return 0;
-}
-
-static void unregister_sha256_ni(void)
-{
-	if (boot_cpu_has(X86_FEATURE_SHA_NI))
-		crypto_unregister_shashes(sha256_ni_algs,
-				ARRAY_SIZE(sha256_ni_algs));
-}
-
-#else
-static inline int register_sha256_ni(void) { return 0; }
-static inline void unregister_sha256_ni(void) { }
-#endif
-
-static int __init sha256_ssse3_mod_init(void)
-{
-	if (!x86_match_cpu(module_cpu_ids))
-		return -ENODEV;
-
-	if (register_sha256_ssse3())
-		goto fail;
-
-	if (register_sha256_avx()) {
-		unregister_sha256_ssse3();
-		goto fail;
-	}
-
-	if (register_sha256_avx2()) {
-		unregister_sha256_avx();
-		unregister_sha256_ssse3();
-		goto fail;
-	}
-
-	if (register_sha256_ni()) {
-		unregister_sha256_avx2();
-		unregister_sha256_avx();
-		unregister_sha256_ssse3();
-		goto fail;
-	}
-
-	return 0;
-fail:
-	return -ENODEV;
-}
-
-static void __exit sha256_ssse3_mod_fini(void)
-{
-	unregister_sha256_ni();
-	unregister_sha256_avx2();
-	unregister_sha256_avx();
-	unregister_sha256_ssse3();
-}
-
-module_init(sha256_ssse3_mod_init);
-module_exit(sha256_ssse3_mod_fini);
-
-MODULE_LICENSE("GPL");
-MODULE_DESCRIPTION("SHA256 Secure Hash Algorithm, Supplemental SSE3 accelerated");
-
-MODULE_ALIAS_CRYPTO("sha256");
-MODULE_ALIAS_CRYPTO("sha256-ssse3");
-MODULE_ALIAS_CRYPTO("sha256-avx");
-MODULE_ALIAS_CRYPTO("sha256-avx2");
-MODULE_ALIAS_CRYPTO("sha224");
-MODULE_ALIAS_CRYPTO("sha224-ssse3");
-MODULE_ALIAS_CRYPTO("sha224-avx");
-MODULE_ALIAS_CRYPTO("sha224-avx2");
-#ifdef CONFIG_AS_SHA256_NI
-MODULE_ALIAS_CRYPTO("sha256-ni");
-MODULE_ALIAS_CRYPTO("sha224-ni");
-#endif
diff --git a/arch/x86/crypto/sha512_ssse3_glue.c b/arch/x86/crypto/sha512_ssse3_glue.c
index 6d3b85e53d0e..067684c54395 100644
--- a/arch/x86/crypto/sha512_ssse3_glue.c
+++ b/arch/x86/crypto/sha512_ssse3_glue.c
@@ -27,17 +27,13 @@
 
 #define pr_fmt(fmt)	KBUILD_MODNAME ": " fmt
 
+#include <asm/cpu_device_id.h>
+#include <asm/simd.h>
 #include <crypto/internal/hash.h>
-#include <crypto/internal/simd.h>
-#include <linux/init.h>
+#include <linux/kernel.h>
 #include <linux/module.h>
-#include <linux/mm.h>
-#include <linux/string.h>
-#include <linux/types.h>
 #include <crypto/sha2.h>
 #include <crypto/sha512_base.h>
-#include <asm/cpu_device_id.h>
-#include <asm/simd.h>
 
 asmlinkage void sha512_transform_ssse3(struct sha512_state *state,
 				       const u8 *data, int blocks);
@@ -45,11 +41,7 @@ asmlinkage void sha512_transform_ssse3(struct sha512_state *state,
 static int sha512_update(struct shash_desc *desc, const u8 *data,
 		       unsigned int len, sha512_block_fn *sha512_xform)
 {
-	struct sha512_state *sctx = shash_desc_ctx(desc);
-
-	if (!crypto_simd_usable() ||
-	    (sctx->count[0] % SHA512_BLOCK_SIZE) + len < SHA512_BLOCK_SIZE)
-		return crypto_sha512_update(desc, data, len);
+	int remain;
 
 	/*
 	 * Make sure struct sha512_state begins directly with the SHA512
@@ -58,22 +50,17 @@ static int sha512_update(struct shash_desc *desc, const u8 *data,
 	BUILD_BUG_ON(offsetof(struct sha512_state, state) != 0);
 
 	kernel_fpu_begin();
-	sha512_base_do_update(desc, data, len, sha512_xform);
+	remain = sha512_base_do_update_blocks(desc, data, len, sha512_xform);
 	kernel_fpu_end();
 
-	return 0;
+	return remain;
 }
 
 static int sha512_finup(struct shash_desc *desc, const u8 *data,
 	      unsigned int len, u8 *out, sha512_block_fn *sha512_xform)
 {
-	if (!crypto_simd_usable())
-		return crypto_sha512_finup(desc, data, len, out);
-
 	kernel_fpu_begin();
-	if (len)
-		sha512_base_do_update(desc, data, len, sha512_xform);
-	sha512_base_do_finalize(desc, sha512_xform);
+	sha512_base_do_finup(desc, data, len, sha512_xform);
 	kernel_fpu_end();
 
 	return sha512_base_finish(desc, out);
@@ -91,23 +78,18 @@ static int sha512_ssse3_finup(struct shash_desc *desc, const u8 *data,
 	return sha512_finup(desc, data, len, out, sha512_transform_ssse3);
 }
 
-/* Add padding and return the message digest. */
-static int sha512_ssse3_final(struct shash_desc *desc, u8 *out)
-{
-	return sha512_ssse3_finup(desc, NULL, 0, out);
-}
-
 static struct shash_alg sha512_ssse3_algs[] = { {
 	.digestsize	=	SHA512_DIGEST_SIZE,
 	.init		=	sha512_base_init,
 	.update		=	sha512_ssse3_update,
-	.final		=	sha512_ssse3_final,
 	.finup		=	sha512_ssse3_finup,
-	.descsize	=	sizeof(struct sha512_state),
+	.descsize	=	SHA512_STATE_SIZE,
 	.base		=	{
 		.cra_name	=	"sha512",
 		.cra_driver_name =	"sha512-ssse3",
 		.cra_priority	=	150,
+		.cra_flags	=	CRYPTO_AHASH_ALG_BLOCK_ONLY |
+					CRYPTO_AHASH_ALG_FINUP_MAX,
 		.cra_blocksize	=	SHA512_BLOCK_SIZE,
 		.cra_module	=	THIS_MODULE,
 	}
@@ -115,13 +97,14 @@ static struct shash_alg sha512_ssse3_algs[] = { {
 	.digestsize	=	SHA384_DIGEST_SIZE,
 	.init		=	sha384_base_init,
 	.update		=	sha512_ssse3_update,
-	.final		=	sha512_ssse3_final,
 	.finup		=	sha512_ssse3_finup,
-	.descsize	=	sizeof(struct sha512_state),
+	.descsize	=	SHA512_STATE_SIZE,
 	.base		=	{
 		.cra_name	=	"sha384",
 		.cra_driver_name =	"sha384-ssse3",
 		.cra_priority	=	150,
+		.cra_flags	=	CRYPTO_AHASH_ALG_BLOCK_ONLY |
+					CRYPTO_AHASH_ALG_FINUP_MAX,
 		.cra_blocksize	=	SHA384_BLOCK_SIZE,
 		.cra_module	=	THIS_MODULE,
 	}
@@ -167,23 +150,18 @@ static int sha512_avx_finup(struct shash_desc *desc, const u8 *data,
 	return sha512_finup(desc, data, len, out, sha512_transform_avx);
 }
 
-/* Add padding and return the message digest. */
-static int sha512_avx_final(struct shash_desc *desc, u8 *out)
-{
-	return sha512_avx_finup(desc, NULL, 0, out);
-}
-
 static struct shash_alg sha512_avx_algs[] = { {
 	.digestsize	=	SHA512_DIGEST_SIZE,
 	.init		=	sha512_base_init,
 	.update		=	sha512_avx_update,
-	.final		=	sha512_avx_final,
 	.finup		=	sha512_avx_finup,
-	.descsize	=	sizeof(struct sha512_state),
+	.descsize	=	SHA512_STATE_SIZE,
 	.base		=	{
 		.cra_name	=	"sha512",
 		.cra_driver_name =	"sha512-avx",
 		.cra_priority	=	160,
+		.cra_flags	=	CRYPTO_AHASH_ALG_BLOCK_ONLY |
+					CRYPTO_AHASH_ALG_FINUP_MAX,
 		.cra_blocksize	=	SHA512_BLOCK_SIZE,
 		.cra_module	=	THIS_MODULE,
 	}
@@ -191,13 +169,14 @@ static struct shash_alg sha512_avx_algs[] = { {
 	.digestsize	=	SHA384_DIGEST_SIZE,
 	.init		=	sha384_base_init,
 	.update		=	sha512_avx_update,
-	.final		=	sha512_avx_final,
 	.finup		=	sha512_avx_finup,
-	.descsize	=	sizeof(struct sha512_state),
+	.descsize	=	SHA512_STATE_SIZE,
 	.base		=	{
 		.cra_name	=	"sha384",
 		.cra_driver_name =	"sha384-avx",
 		.cra_priority	=	160,
+		.cra_flags	=	CRYPTO_AHASH_ALG_BLOCK_ONLY |
+					CRYPTO_AHASH_ALG_FINUP_MAX,
 		.cra_blocksize	=	SHA384_BLOCK_SIZE,
 		.cra_module	=	THIS_MODULE,
 	}
@@ -233,23 +212,18 @@ static int sha512_avx2_finup(struct shash_desc *desc, const u8 *data,
 	return sha512_finup(desc, data, len, out, sha512_transform_rorx);
 }
 
-/* Add padding and return the message digest. */
-static int sha512_avx2_final(struct shash_desc *desc, u8 *out)
-{
-	return sha512_avx2_finup(desc, NULL, 0, out);
-}
-
 static struct shash_alg sha512_avx2_algs[] = { {
 	.digestsize	=	SHA512_DIGEST_SIZE,
 	.init		=	sha512_base_init,
 	.update		=	sha512_avx2_update,
-	.final		=	sha512_avx2_final,
 	.finup		=	sha512_avx2_finup,
-	.descsize	=	sizeof(struct sha512_state),
+	.descsize	=	SHA512_STATE_SIZE,
 	.base		=	{
 		.cra_name	=	"sha512",
 		.cra_driver_name =	"sha512-avx2",
 		.cra_priority	=	170,
+		.cra_flags	=	CRYPTO_AHASH_ALG_BLOCK_ONLY |
+					CRYPTO_AHASH_ALG_FINUP_MAX,
 		.cra_blocksize	=	SHA512_BLOCK_SIZE,
 		.cra_module	=	THIS_MODULE,
 	}
@@ -257,13 +231,14 @@ static struct shash_alg sha512_avx2_algs[] = { {
 	.digestsize	=	SHA384_DIGEST_SIZE,
 	.init		=	sha384_base_init,
 	.update		=	sha512_avx2_update,
-	.final		=	sha512_avx2_final,
 	.finup		=	sha512_avx2_finup,
-	.descsize	=	sizeof(struct sha512_state),
+	.descsize	=	SHA512_STATE_SIZE,
 	.base		=	{
 		.cra_name	=	"sha384",
 		.cra_driver_name =	"sha384-avx2",
 		.cra_priority	=	170,
+		.cra_flags	=	CRYPTO_AHASH_ALG_BLOCK_ONLY |
+					CRYPTO_AHASH_ALG_FINUP_MAX,
 		.cra_blocksize	=	SHA384_BLOCK_SIZE,
 		.cra_module	=	THIS_MODULE,
 	}
diff --git a/arch/x86/crypto/sm3_avx_glue.c b/arch/x86/crypto/sm3_avx_glue.c
index 661b6f22ffcd..6e8c42b9dc8e 100644
--- a/arch/x86/crypto/sm3_avx_glue.c
+++ b/arch/x86/crypto/sm3_avx_glue.c
@@ -10,12 +10,11 @@
 
 #include <crypto/internal/hash.h>
 #include <crypto/internal/simd.h>
-#include <linux/init.h>
-#include <linux/module.h>
-#include <linux/types.h>
 #include <crypto/sm3.h>
 #include <crypto/sm3_base.h>
-#include <asm/simd.h>
+#include <linux/cpufeature.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
 
 asmlinkage void sm3_transform_avx(struct sm3_state *state,
 			const u8 *data, int nblocks);
@@ -23,13 +22,7 @@ asmlinkage void sm3_transform_avx(struct sm3_state *state,
 static int sm3_avx_update(struct shash_desc *desc, const u8 *data,
 			 unsigned int len)
 {
-	struct sm3_state *sctx = shash_desc_ctx(desc);
-
-	if (!crypto_simd_usable() ||
-			(sctx->count % SM3_BLOCK_SIZE) + len < SM3_BLOCK_SIZE) {
-		sm3_update(sctx, data, len);
-		return 0;
-	}
+	int remain;
 
 	/*
 	 * Make sure struct sm3_state begins directly with the SM3
@@ -38,45 +31,17 @@ static int sm3_avx_update(struct shash_desc *desc, const u8 *data,
 	BUILD_BUG_ON(offsetof(struct sm3_state, state) != 0);
 
 	kernel_fpu_begin();
-	sm3_base_do_update(desc, data, len, sm3_transform_avx);
+	remain = sm3_base_do_update_blocks(desc, data, len, sm3_transform_avx);
 	kernel_fpu_end();
-
-	return 0;
+	return remain;
 }
 
 static int sm3_avx_finup(struct shash_desc *desc, const u8 *data,
 		      unsigned int len, u8 *out)
 {
-	if (!crypto_simd_usable()) {
-		struct sm3_state *sctx = shash_desc_ctx(desc);
-
-		if (len)
-			sm3_update(sctx, data, len);
-
-		sm3_final(sctx, out);
-		return 0;
-	}
-
 	kernel_fpu_begin();
-	if (len)
-		sm3_base_do_update(desc, data, len, sm3_transform_avx);
-	sm3_base_do_finalize(desc, sm3_transform_avx);
+	sm3_base_do_finup(desc, data, len, sm3_transform_avx);
 	kernel_fpu_end();
-
-	return sm3_base_finish(desc, out);
-}
-
-static int sm3_avx_final(struct shash_desc *desc, u8 *out)
-{
-	if (!crypto_simd_usable()) {
-		sm3_final(shash_desc_ctx(desc), out);
-		return 0;
-	}
-
-	kernel_fpu_begin();
-	sm3_base_do_finalize(desc, sm3_transform_avx);
-	kernel_fpu_end();
-
 	return sm3_base_finish(desc, out);
 }
 
@@ -84,13 +49,14 @@ static struct shash_alg sm3_avx_alg = {
 	.digestsize	=	SM3_DIGEST_SIZE,
 	.init		=	sm3_base_init,
 	.update		=	sm3_avx_update,
-	.final		=	sm3_avx_final,
 	.finup		=	sm3_avx_finup,
-	.descsize	=	sizeof(struct sm3_state),
+	.descsize	=	SM3_STATE_SIZE,
 	.base		=	{
 		.cra_name	=	"sm3",
 		.cra_driver_name =	"sm3-avx",
 		.cra_priority	=	300,
+		.cra_flags	 =	CRYPTO_AHASH_ALG_BLOCK_ONLY |
+					CRYPTO_AHASH_ALG_FINUP_MAX,
 		.cra_blocksize	=	SM3_BLOCK_SIZE,
 		.cra_module	=	THIS_MODULE,
 	}
diff --git a/arch/x86/crypto/sm4_aesni_avx2_glue.c b/arch/x86/crypto/sm4_aesni_avx2_glue.c
index 1148fd4cd57f..fec0ab7a63dd 100644
--- a/arch/x86/crypto/sm4_aesni_avx2_glue.c
+++ b/arch/x86/crypto/sm4_aesni_avx2_glue.c
@@ -8,11 +8,10 @@
  * Copyright (c) 2021 Tianjia Zhang <tianjia.zhang@linux.alibaba.com>
  */
 
+#include <asm/fpu/api.h>
 #include <linux/module.h>
 #include <linux/crypto.h>
 #include <linux/kernel.h>
-#include <asm/simd.h>
-#include <crypto/internal/simd.h>
 #include <crypto/internal/skcipher.h>
 #include <crypto/sm4.h>
 #include "sm4-avx.h"
@@ -48,10 +47,9 @@ static int ctr_crypt(struct skcipher_request *req)
 static struct skcipher_alg sm4_aesni_avx2_skciphers[] = {
 	{
 		.base = {
-			.cra_name		= "__ecb(sm4)",
-			.cra_driver_name	= "__ecb-sm4-aesni-avx2",
+			.cra_name		= "ecb(sm4)",
+			.cra_driver_name	= "ecb-sm4-aesni-avx2",
 			.cra_priority		= 500,
-			.cra_flags		= CRYPTO_ALG_INTERNAL,
 			.cra_blocksize		= SM4_BLOCK_SIZE,
 			.cra_ctxsize		= sizeof(struct sm4_ctx),
 			.cra_module		= THIS_MODULE,
@@ -64,10 +62,9 @@ static struct skcipher_alg sm4_aesni_avx2_skciphers[] = {
 		.decrypt	= sm4_avx_ecb_decrypt,
 	}, {
 		.base = {
-			.cra_name		= "__cbc(sm4)",
-			.cra_driver_name	= "__cbc-sm4-aesni-avx2",
+			.cra_name		= "cbc(sm4)",
+			.cra_driver_name	= "cbc-sm4-aesni-avx2",
 			.cra_priority		= 500,
-			.cra_flags		= CRYPTO_ALG_INTERNAL,
 			.cra_blocksize		= SM4_BLOCK_SIZE,
 			.cra_ctxsize		= sizeof(struct sm4_ctx),
 			.cra_module		= THIS_MODULE,
@@ -81,10 +78,9 @@ static struct skcipher_alg sm4_aesni_avx2_skciphers[] = {
 		.decrypt	= cbc_decrypt,
 	}, {
 		.base = {
-			.cra_name		= "__ctr(sm4)",
-			.cra_driver_name	= "__ctr-sm4-aesni-avx2",
+			.cra_name		= "ctr(sm4)",
+			.cra_driver_name	= "ctr-sm4-aesni-avx2",
 			.cra_priority		= 500,
-			.cra_flags		= CRYPTO_ALG_INTERNAL,
 			.cra_blocksize		= 1,
 			.cra_ctxsize		= sizeof(struct sm4_ctx),
 			.cra_module		= THIS_MODULE,
@@ -100,9 +96,6 @@ static struct skcipher_alg sm4_aesni_avx2_skciphers[] = {
 	}
 };
 
-static struct simd_skcipher_alg *
-simd_sm4_aesni_avx2_skciphers[ARRAY_SIZE(sm4_aesni_avx2_skciphers)];
-
 static int __init sm4_init(void)
 {
 	const char *feature_name;
@@ -121,16 +114,14 @@ static int __init sm4_init(void)
 		return -ENODEV;
 	}
 
-	return simd_register_skciphers_compat(sm4_aesni_avx2_skciphers,
-					ARRAY_SIZE(sm4_aesni_avx2_skciphers),
-					simd_sm4_aesni_avx2_skciphers);
+	return crypto_register_skciphers(sm4_aesni_avx2_skciphers,
+					 ARRAY_SIZE(sm4_aesni_avx2_skciphers));
 }
 
 static void __exit sm4_exit(void)
 {
-	simd_unregister_skciphers(sm4_aesni_avx2_skciphers,
-				ARRAY_SIZE(sm4_aesni_avx2_skciphers),
-				simd_sm4_aesni_avx2_skciphers);
+	crypto_unregister_skciphers(sm4_aesni_avx2_skciphers,
+				    ARRAY_SIZE(sm4_aesni_avx2_skciphers));
 }
 
 module_init(sm4_init);
diff --git a/arch/x86/crypto/sm4_aesni_avx_glue.c b/arch/x86/crypto/sm4_aesni_avx_glue.c
index 85b4ca78b47b..72867fc49ce8 100644
--- a/arch/x86/crypto/sm4_aesni_avx_glue.c
+++ b/arch/x86/crypto/sm4_aesni_avx_glue.c
@@ -8,11 +8,10 @@
  * Copyright (c) 2021 Tianjia Zhang <tianjia.zhang@linux.alibaba.com>
  */
 
+#include <asm/fpu/api.h>
 #include <linux/module.h>
 #include <linux/crypto.h>
 #include <linux/kernel.h>
-#include <asm/simd.h>
-#include <crypto/internal/simd.h>
 #include <crypto/internal/skcipher.h>
 #include <crypto/sm4.h>
 #include "sm4-avx.h"
@@ -263,10 +262,9 @@ static int ctr_crypt(struct skcipher_request *req)
 static struct skcipher_alg sm4_aesni_avx_skciphers[] = {
 	{
 		.base = {
-			.cra_name		= "__ecb(sm4)",
-			.cra_driver_name	= "__ecb-sm4-aesni-avx",
+			.cra_name		= "ecb(sm4)",
+			.cra_driver_name	= "ecb-sm4-aesni-avx",
 			.cra_priority		= 400,
-			.cra_flags		= CRYPTO_ALG_INTERNAL,
 			.cra_blocksize		= SM4_BLOCK_SIZE,
 			.cra_ctxsize		= sizeof(struct sm4_ctx),
 			.cra_module		= THIS_MODULE,
@@ -279,10 +277,9 @@ static struct skcipher_alg sm4_aesni_avx_skciphers[] = {
 		.decrypt	= sm4_avx_ecb_decrypt,
 	}, {
 		.base = {
-			.cra_name		= "__cbc(sm4)",
-			.cra_driver_name	= "__cbc-sm4-aesni-avx",
+			.cra_name		= "cbc(sm4)",
+			.cra_driver_name	= "cbc-sm4-aesni-avx",
 			.cra_priority		= 400,
-			.cra_flags		= CRYPTO_ALG_INTERNAL,
 			.cra_blocksize		= SM4_BLOCK_SIZE,
 			.cra_ctxsize		= sizeof(struct sm4_ctx),
 			.cra_module		= THIS_MODULE,
@@ -296,10 +293,9 @@ static struct skcipher_alg sm4_aesni_avx_skciphers[] = {
 		.decrypt	= cbc_decrypt,
 	}, {
 		.base = {
-			.cra_name		= "__ctr(sm4)",
-			.cra_driver_name	= "__ctr-sm4-aesni-avx",
+			.cra_name		= "ctr(sm4)",
+			.cra_driver_name	= "ctr-sm4-aesni-avx",
 			.cra_priority		= 400,
-			.cra_flags		= CRYPTO_ALG_INTERNAL,
 			.cra_blocksize		= 1,
 			.cra_ctxsize		= sizeof(struct sm4_ctx),
 			.cra_module		= THIS_MODULE,
@@ -315,9 +311,6 @@ static struct skcipher_alg sm4_aesni_avx_skciphers[] = {
 	}
 };
 
-static struct simd_skcipher_alg *
-simd_sm4_aesni_avx_skciphers[ARRAY_SIZE(sm4_aesni_avx_skciphers)];
-
 static int __init sm4_init(void)
 {
 	const char *feature_name;
@@ -335,16 +328,14 @@ static int __init sm4_init(void)
 		return -ENODEV;
 	}
 
-	return simd_register_skciphers_compat(sm4_aesni_avx_skciphers,
-					ARRAY_SIZE(sm4_aesni_avx_skciphers),
-					simd_sm4_aesni_avx_skciphers);
+	return crypto_register_skciphers(sm4_aesni_avx_skciphers,
+					 ARRAY_SIZE(sm4_aesni_avx_skciphers));
 }
 
 static void __exit sm4_exit(void)
 {
-	simd_unregister_skciphers(sm4_aesni_avx_skciphers,
-					ARRAY_SIZE(sm4_aesni_avx_skciphers),
-					simd_sm4_aesni_avx_skciphers);
+	crypto_unregister_skciphers(sm4_aesni_avx_skciphers,
+				    ARRAY_SIZE(sm4_aesni_avx_skciphers));
 }
 
 module_init(sm4_init);
diff --git a/arch/x86/crypto/twofish_avx_glue.c b/arch/x86/crypto/twofish_avx_glue.c
index 3eb3440b477a..9e20db013750 100644
--- a/arch/x86/crypto/twofish_avx_glue.c
+++ b/arch/x86/crypto/twofish_avx_glue.c
@@ -13,7 +13,6 @@
 #include <linux/crypto.h>
 #include <linux/err.h>
 #include <crypto/algapi.h>
-#include <crypto/internal/simd.h>
 #include <crypto/twofish.h>
 
 #include "twofish.h"
@@ -74,10 +73,9 @@ static int cbc_decrypt(struct skcipher_request *req)
 
 static struct skcipher_alg twofish_algs[] = {
 	{
-		.base.cra_name		= "__ecb(twofish)",
-		.base.cra_driver_name	= "__ecb-twofish-avx",
+		.base.cra_name		= "ecb(twofish)",
+		.base.cra_driver_name	= "ecb-twofish-avx",
 		.base.cra_priority	= 400,
-		.base.cra_flags		= CRYPTO_ALG_INTERNAL,
 		.base.cra_blocksize	= TF_BLOCK_SIZE,
 		.base.cra_ctxsize	= sizeof(struct twofish_ctx),
 		.base.cra_module	= THIS_MODULE,
@@ -87,10 +85,9 @@ static struct skcipher_alg twofish_algs[] = {
 		.encrypt		= ecb_encrypt,
 		.decrypt		= ecb_decrypt,
 	}, {
-		.base.cra_name		= "__cbc(twofish)",
-		.base.cra_driver_name	= "__cbc-twofish-avx",
+		.base.cra_name		= "cbc(twofish)",
+		.base.cra_driver_name	= "cbc-twofish-avx",
 		.base.cra_priority	= 400,
-		.base.cra_flags		= CRYPTO_ALG_INTERNAL,
 		.base.cra_blocksize	= TF_BLOCK_SIZE,
 		.base.cra_ctxsize	= sizeof(struct twofish_ctx),
 		.base.cra_module	= THIS_MODULE,
@@ -103,8 +100,6 @@ static struct skcipher_alg twofish_algs[] = {
 	},
 };
 
-static struct simd_skcipher_alg *twofish_simd_algs[ARRAY_SIZE(twofish_algs)];
-
 static int __init twofish_init(void)
 {
 	const char *feature_name;
@@ -114,15 +109,13 @@ static int __init twofish_init(void)
 		return -ENODEV;
 	}
 
-	return simd_register_skciphers_compat(twofish_algs,
-					      ARRAY_SIZE(twofish_algs),
-					      twofish_simd_algs);
+	return crypto_register_skciphers(twofish_algs,
+					 ARRAY_SIZE(twofish_algs));
 }
 
 static void __exit twofish_exit(void)
 {
-	simd_unregister_skciphers(twofish_algs, ARRAY_SIZE(twofish_algs),
-				  twofish_simd_algs);
+	crypto_unregister_skciphers(twofish_algs, ARRAY_SIZE(twofish_algs));
 }
 
 module_init(twofish_init);
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index f40bdf97d390..ed04a968cc7d 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -1525,7 +1525,9 @@ SYM_CODE_END(rewind_stack_and_make_dead)
  * ORC to unwind properly.
  *
  * The alignment is for performance and not for safety, and may be safely
- * refactored in the future if needed.
+ * refactored in the future if needed. The .skips are for safety, to ensure
+ * that all RETs are in the second half of a cacheline to mitigate Indirect
+ * Target Selection, rather than taking the slowpath via its_return_thunk.
  */
 SYM_FUNC_START(clear_bhb_loop)
 	ANNOTATE_NOENDBR
@@ -1536,10 +1538,22 @@ SYM_FUNC_START(clear_bhb_loop)
 	call	1f
 	jmp	5f
 	.align 64, 0xcc
+	/*
+	 * Shift instructions so that the RET is in the upper half of the
+	 * cacheline and doesn't take the slowpath to its_return_thunk.
+	 */
+	.skip 32 - (.Lret1 - 1f), 0xcc
 	ANNOTATE_INTRA_FUNCTION_CALL
 1:	call	2f
-	RET
+.Lret1:	RET
 	.align 64, 0xcc
+	/*
+	 * As above, shift instructions for the RET at .Lret2 as well.
+	 *
+	 * This should ideally be: .skip 32 - (.Lret2 - 2f), 0xcc
+	 * but some Clang versions (e.g. 18) don't like this.
+	 */
+	.skip 32 - 18, 0xcc
 2:	movl	$5, %eax
 3:	jmp	4f
 	nop
@@ -1547,7 +1561,7 @@ SYM_FUNC_START(clear_bhb_loop)
 	jnz	3b
 	sub	$1, %ecx
 	jnz	1b
-	RET
+.Lret2:	RET
 5:	lfence
 	pop	%rbp
 	RET
diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index 3a319cf6d364..3204591249e8 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -4623,7 +4623,7 @@ static struct perf_guest_switch_msr *intel_guest_get_msrs(int *nr, void *data)
 	arr[pebs_enable] = (struct perf_guest_switch_msr){
 		.msr = MSR_IA32_PEBS_ENABLE,
 		.host = cpuc->pebs_enabled & ~cpuc->intel_ctrl_guest_mask,
-		.guest = pebs_mask & ~cpuc->intel_ctrl_host_mask,
+		.guest = pebs_mask & ~cpuc->intel_ctrl_host_mask & kvm_pmu->pebs_enable,
 	};
 
 	if (arr[pebs_enable].host) {
diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c
index fb02e434d072..e2ea4fdedea9 100644
--- a/arch/x86/events/intel/ds.c
+++ b/arch/x86/events/intel/ds.c
@@ -2464,8 +2464,9 @@ static void intel_pmu_drain_pebs_core(struct pt_regs *iregs, struct perf_sample_
 				setup_pebs_fixed_sample_data);
 }
 
-static void intel_pmu_pebs_event_update_no_drain(struct cpu_hw_events *cpuc, int size)
+static void intel_pmu_pebs_event_update_no_drain(struct cpu_hw_events *cpuc, u64 mask)
 {
+	u64 pebs_enabled = cpuc->pebs_enabled & mask;
 	struct perf_event *event;
 	int bit;
 
@@ -2476,7 +2477,7 @@ static void intel_pmu_pebs_event_update_no_drain(struct cpu_hw_events *cpuc, int
 	 * It needs to call intel_pmu_save_and_restart_reload() to
 	 * update the event->count for this case.
 	 */
-	for_each_set_bit(bit, (unsigned long *)&cpuc->pebs_enabled, size) {
+	for_each_set_bit(bit, (unsigned long *)&pebs_enabled, X86_PMC_IDX_MAX) {
 		event = cpuc->events[bit];
 		if (event->hw.flags & PERF_X86_EVENT_AUTO_RELOAD)
 			intel_pmu_save_and_restart_reload(event, 0);
@@ -2511,7 +2512,7 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs, struct perf_sample_d
 	}
 
 	if (unlikely(base >= top)) {
-		intel_pmu_pebs_event_update_no_drain(cpuc, size);
+		intel_pmu_pebs_event_update_no_drain(cpuc, mask);
 		return;
 	}
 
@@ -2625,7 +2626,7 @@ static void intel_pmu_drain_pebs_icl(struct pt_regs *iregs, struct perf_sample_d
 	       (hybrid(cpuc->pmu, fixed_cntr_mask64) << INTEL_PMC_IDX_FIXED);
 
 	if (unlikely(base >= top)) {
-		intel_pmu_pebs_event_update_no_drain(cpuc, X86_PMC_IDX_MAX);
+		intel_pmu_pebs_event_update_no_drain(cpuc, mask);
 		return;
 	}
 
diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h
index 4a37a8bd87fd..f2294784babc 100644
--- a/arch/x86/include/asm/alternative.h
+++ b/arch/x86/include/asm/alternative.h
@@ -6,6 +6,7 @@
 #include <linux/stringify.h>
 #include <linux/objtool.h>
 #include <asm/asm.h>
+#include <asm/bug.h>
 
 #define ALT_FLAGS_SHIFT		16
 
@@ -124,6 +125,37 @@ static __always_inline int x86_call_depth_emit_accounting(u8 **pprog,
 }
 #endif
 
+#ifdef CONFIG_MITIGATION_ITS
+extern void its_init_mod(struct module *mod);
+extern void its_fini_mod(struct module *mod);
+extern void its_free_mod(struct module *mod);
+extern u8 *its_static_thunk(int reg);
+#else /* CONFIG_MITIGATION_ITS */
+static inline void its_init_mod(struct module *mod) { }
+static inline void its_fini_mod(struct module *mod) { }
+static inline void its_free_mod(struct module *mod) { }
+static inline u8 *its_static_thunk(int reg)
+{
+	WARN_ONCE(1, "ITS not compiled in");
+
+	return NULL;
+}
+#endif
+
+#if defined(CONFIG_MITIGATION_RETHUNK) && defined(CONFIG_OBJTOOL)
+extern bool cpu_wants_rethunk(void);
+extern bool cpu_wants_rethunk_at(void *addr);
+#else
+static __always_inline bool cpu_wants_rethunk(void)
+{
+	return false;
+}
+static __always_inline bool cpu_wants_rethunk_at(void *addr)
+{
+	return false;
+}
+#endif
+
 #ifdef CONFIG_SMP
 extern void alternatives_smp_module_add(struct module *mod, char *name,
 					void *locks, void *locks_end,
diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
index 6c2c152d8a67..30144ef9ef02 100644
--- a/arch/x86/include/asm/cpufeatures.h
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -75,7 +75,7 @@
 #define X86_FEATURE_CENTAUR_MCR		( 3*32+ 3) /* "centaur_mcr" Centaur MCRs (= MTRRs) */
 #define X86_FEATURE_K8			( 3*32+ 4) /* Opteron, Athlon64 */
 #define X86_FEATURE_ZEN5		( 3*32+ 5) /* CPU based on Zen5 microarchitecture */
-/* Free                                 ( 3*32+ 6) */
+#define X86_FEATURE_ZEN6		( 3*32+ 6) /* CPU based on Zen6 microarchitecture */
 /* Free                                 ( 3*32+ 7) */
 #define X86_FEATURE_CONSTANT_TSC	( 3*32+ 8) /* "constant_tsc" TSC ticks at a constant rate */
 #define X86_FEATURE_UP			( 3*32+ 9) /* "up" SMP kernel running on UP */
@@ -481,6 +481,7 @@
 #define X86_FEATURE_AMD_HETEROGENEOUS_CORES (21*32 + 6) /* Heterogeneous Core Topology */
 #define X86_FEATURE_AMD_WORKLOAD_CLASS	(21*32 + 7) /* Workload Classification */
 #define X86_FEATURE_PREFER_YMM		(21*32 + 8) /* Avoid ZMM registers due to downclocking */
+#define X86_FEATURE_INDIRECT_THUNK_ITS	(21*32 + 9) /* Use thunk for indirect branches in lower half of cacheline */
 
 /*
  * BUG word(s)
@@ -533,4 +534,6 @@
 #define X86_BUG_BHI			X86_BUG(1*32 + 3) /* "bhi" CPU is affected by Branch History Injection */
 #define X86_BUG_IBPB_NO_RET	   	X86_BUG(1*32 + 4) /* "ibpb_no_ret" IBPB omits return target predictions */
 #define X86_BUG_SPECTRE_V2_USER		X86_BUG(1*32 + 5) /* "spectre_v2_user" CPU is affected by Spectre variant 2 attack between user processes */
+#define X86_BUG_ITS			X86_BUG(1*32 + 6) /* "its" CPU is affected by Indirect Target Selection */
+#define X86_BUG_ITS_NATIVE_ONLY		X86_BUG(1*32 + 7) /* "its_native_only" CPU is affected by ITS, VMX is not affected */
 #endif /* _ASM_X86_CPUFEATURES_H */
diff --git a/arch/x86/include/asm/fpu/api.h b/arch/x86/include/asm/fpu/api.h
index f42de5f05e7e..3ebeee2644de 100644
--- a/arch/x86/include/asm/fpu/api.h
+++ b/arch/x86/include/asm/fpu/api.h
@@ -126,6 +126,7 @@ static inline void fpstate_init_soft(struct swregs_state *soft) {}
 #endif
 
 /* State tracking */
+DECLARE_PER_CPU(bool, kernel_fpu_allowed);
 DECLARE_PER_CPU(struct fpu *, fpu_fpregs_owner_ctx);
 
 /* Process cleanup */
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 3bdae454a959..7bc174a1f1cb 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -35,6 +35,7 @@
 #include <asm/mtrr.h>
 #include <asm/msr-index.h>
 #include <asm/asm.h>
+#include <asm/irq_remapping.h>
 #include <asm/kvm_page_track.h>
 #include <asm/kvm_vcpu_regs.h>
 #include <asm/reboot.h>
@@ -2423,4 +2424,9 @@ int memslot_rmap_alloc(struct kvm_memory_slot *slot, unsigned long npages);
  */
 #define KVM_EXIT_HYPERCALL_MBZ		GENMASK_ULL(31, 1)
 
+static inline bool kvm_arch_has_irq_bypass(void)
+{
+	return enable_apicv && irq_remapping_cap(IRQ_POSTING_CAP);
+}
+
 #endif /* _ASM_X86_KVM_HOST_H */
diff --git a/arch/x86/include/asm/microcode.h b/arch/x86/include/asm/microcode.h
index 695e569159c1..be7cddc414e4 100644
--- a/arch/x86/include/asm/microcode.h
+++ b/arch/x86/include/asm/microcode.h
@@ -17,10 +17,12 @@ struct ucode_cpu_info {
 void load_ucode_bsp(void);
 void load_ucode_ap(void);
 void microcode_bsp_resume(void);
+bool __init microcode_loader_disabled(void);
 #else
 static inline void load_ucode_bsp(void)	{ }
 static inline void load_ucode_ap(void) { }
 static inline void microcode_bsp_resume(void) { }
+static inline bool __init microcode_loader_disabled(void) { return false; }
 #endif
 
 extern unsigned long initrd_start_early;
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index 53da787b9326..7c3f5ef1a169 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -211,6 +211,14 @@
 						 * VERW clears CPU Register
 						 * File.
 						 */
+#define ARCH_CAP_ITS_NO			BIT_ULL(62) /*
+						     * Not susceptible to
+						     * Indirect Target Selection.
+						     * This bit is not set by
+						     * HW, but is synthesized by
+						     * VMMs for guests to know
+						     * their affected status.
+						     */
 
 #define MSR_IA32_FLUSH_CMD		0x0000010b
 #define L1D_FLUSH			BIT(0)	/*
diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h
index 5c43f145454d..7d04ade33541 100644
--- a/arch/x86/include/asm/nospec-branch.h
+++ b/arch/x86/include/asm/nospec-branch.h
@@ -336,10 +336,14 @@
 
 #else /* __ASSEMBLER__ */
 
+#define ITS_THUNK_SIZE	64
+
 typedef u8 retpoline_thunk_t[RETPOLINE_THUNK_SIZE];
+typedef u8 its_thunk_t[ITS_THUNK_SIZE];
 extern retpoline_thunk_t __x86_indirect_thunk_array[];
 extern retpoline_thunk_t __x86_indirect_call_thunk_array[];
 extern retpoline_thunk_t __x86_indirect_jump_thunk_array[];
+extern its_thunk_t	 __x86_indirect_its_thunk_array[];
 
 #ifdef CONFIG_MITIGATION_RETHUNK
 extern void __x86_return_thunk(void);
@@ -363,6 +367,12 @@ static inline void srso_return_thunk(void) {}
 static inline void srso_alias_return_thunk(void) {}
 #endif
 
+#ifdef CONFIG_MITIGATION_ITS
+extern void its_return_thunk(void);
+#else
+static inline void its_return_thunk(void) {}
+#endif
+
 extern void retbleed_return_thunk(void);
 extern void srso_return_thunk(void);
 extern void srso_alias_return_thunk(void);
diff --git a/arch/x86/include/asm/pgalloc.h b/arch/x86/include/asm/pgalloc.h
index a33147520044..c88691b15f3c 100644
--- a/arch/x86/include/asm/pgalloc.h
+++ b/arch/x86/include/asm/pgalloc.h
@@ -6,6 +6,8 @@
 #include <linux/mm.h>		/* for struct page */
 #include <linux/pagemap.h>
 
+#include <asm/cpufeature.h>
+
 #define __HAVE_ARCH_PTE_ALLOC_ONE
 #define __HAVE_ARCH_PGD_FREE
 #include <asm-generic/pgalloc.h>
@@ -29,16 +31,17 @@ static inline void paravirt_release_pud(unsigned long pfn) {}
 static inline void paravirt_release_p4d(unsigned long pfn) {}
 #endif
 
-#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION
 /*
- * Instead of one PGD, we acquire two PGDs.  Being order-1, it is
- * both 8k in size and 8k-aligned.  That lets us just flip bit 12
- * in a pointer to swap between the two 4k halves.
+ * When Page Table Isolation is active, we acquire two PGDs instead of one.
+ * Being order-1, it is both 8k in size and 8k-aligned.  That lets us just
+ * flip bit 12 in a pointer to swap between the two 4k halves.
  */
-#define PGD_ALLOCATION_ORDER 1
-#else
-#define PGD_ALLOCATION_ORDER 0
-#endif
+static inline unsigned int pgd_allocation_order(void)
+{
+	if (cpu_feature_enabled(X86_FEATURE_PTI))
+		return 1;
+	return 0;
+}
 
 /*
  * Allocate and free page tables.
diff --git a/arch/x86/include/asm/sev-common.h b/arch/x86/include/asm/sev-common.h
index acb85b9346d8..0020d77a0800 100644
--- a/arch/x86/include/asm/sev-common.h
+++ b/arch/x86/include/asm/sev-common.h
@@ -116,7 +116,7 @@ enum psc_op {
 #define GHCB_MSR_VMPL_REQ		0x016
 #define GHCB_MSR_VMPL_REQ_LEVEL(v)			\
 	/* GHCBData[39:32] */				\
-	(((u64)(v) & GENMASK_ULL(7, 0) << 32) |		\
+	((((u64)(v) & GENMASK_ULL(7, 0)) << 32) |	\
 	/* GHCBDdata[11:0] */				\
 	GHCB_MSR_VMPL_REQ)
 
diff --git a/arch/x86/include/asm/simd.h b/arch/x86/include/asm/simd.h
index a341c878e977..b8027b63cd7a 100644
--- a/arch/x86/include/asm/simd.h
+++ b/arch/x86/include/asm/simd.h
@@ -1,6 +1,10 @@
 /* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_SIMD_H
+#define _ASM_SIMD_H
 
 #include <asm/fpu/api.h>
+#include <linux/compiler_attributes.h>
+#include <linux/types.h>
 
 /*
  * may_use_simd - whether it is allowable at this time to issue SIMD
@@ -10,3 +14,5 @@ static __must_check inline bool may_use_simd(void)
 {
 	return irq_fpu_usable();
 }
+
+#endif	/* _ASM_SIMD_H */
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index bf82c6f7d690..45bcff181cba 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -18,6 +18,7 @@
 #include <linux/mmu_context.h>
 #include <linux/bsearch.h>
 #include <linux/sync_core.h>
+#include <linux/execmem.h>
 #include <asm/text-patching.h>
 #include <asm/alternative.h>
 #include <asm/sections.h>
@@ -31,6 +32,8 @@
 #include <asm/paravirt.h>
 #include <asm/asm-prototypes.h>
 #include <asm/cfi.h>
+#include <asm/ibt.h>
+#include <asm/set_memory.h>
 
 int __read_mostly alternatives_patched;
 
@@ -124,6 +127,171 @@ const unsigned char * const x86_nops[ASM_NOP_MAX+1] =
 #endif
 };
 
+#ifdef CONFIG_FINEIBT
+static bool cfi_paranoid __ro_after_init;
+#endif
+
+#ifdef CONFIG_MITIGATION_ITS
+
+#ifdef CONFIG_MODULES
+static struct module *its_mod;
+#endif
+static void *its_page;
+static unsigned int its_offset;
+
+/* Initialize a thunk with the "jmp *reg; int3" instructions. */
+static void *its_init_thunk(void *thunk, int reg)
+{
+	u8 *bytes = thunk;
+	int offset = 0;
+	int i = 0;
+
+#ifdef CONFIG_FINEIBT
+	if (cfi_paranoid) {
+		/*
+		 * When ITS uses indirect branch thunk the fineibt_paranoid
+		 * caller sequence doesn't fit in the caller site. So put the
+		 * remaining part of the sequence (<ea> + JNE) into the ITS
+		 * thunk.
+		 */
+		bytes[i++] = 0xea; /* invalid instruction */
+		bytes[i++] = 0x75; /* JNE */
+		bytes[i++] = 0xfd;
+
+		offset = 1;
+	}
+#endif
+
+	if (reg >= 8) {
+		bytes[i++] = 0x41; /* REX.B prefix */
+		reg -= 8;
+	}
+	bytes[i++] = 0xff;
+	bytes[i++] = 0xe0 + reg; /* jmp *reg */
+	bytes[i++] = 0xcc;
+
+	return thunk + offset;
+}
+
+#ifdef CONFIG_MODULES
+void its_init_mod(struct module *mod)
+{
+	if (!cpu_feature_enabled(X86_FEATURE_INDIRECT_THUNK_ITS))
+		return;
+
+	mutex_lock(&text_mutex);
+	its_mod = mod;
+	its_page = NULL;
+}
+
+void its_fini_mod(struct module *mod)
+{
+	if (!cpu_feature_enabled(X86_FEATURE_INDIRECT_THUNK_ITS))
+		return;
+
+	WARN_ON_ONCE(its_mod != mod);
+
+	its_mod = NULL;
+	its_page = NULL;
+	mutex_unlock(&text_mutex);
+
+	for (int i = 0; i < mod->its_num_pages; i++) {
+		void *page = mod->its_page_array[i];
+		execmem_restore_rox(page, PAGE_SIZE);
+	}
+}
+
+void its_free_mod(struct module *mod)
+{
+	if (!cpu_feature_enabled(X86_FEATURE_INDIRECT_THUNK_ITS))
+		return;
+
+	for (int i = 0; i < mod->its_num_pages; i++) {
+		void *page = mod->its_page_array[i];
+		execmem_free(page);
+	}
+	kfree(mod->its_page_array);
+}
+#endif /* CONFIG_MODULES */
+
+static void *its_alloc(void)
+{
+	void *page __free(execmem) = execmem_alloc(EXECMEM_MODULE_TEXT, PAGE_SIZE);
+
+	if (!page)
+		return NULL;
+
+#ifdef CONFIG_MODULES
+	if (its_mod) {
+		void *tmp = krealloc(its_mod->its_page_array,
+				     (its_mod->its_num_pages+1) * sizeof(void *),
+				     GFP_KERNEL);
+		if (!tmp)
+			return NULL;
+
+		its_mod->its_page_array = tmp;
+		its_mod->its_page_array[its_mod->its_num_pages++] = page;
+
+		execmem_make_temp_rw(page, PAGE_SIZE);
+	}
+#endif /* CONFIG_MODULES */
+
+	return no_free_ptr(page);
+}
+
+static void *its_allocate_thunk(int reg)
+{
+	int size = 3 + (reg / 8);
+	void *thunk;
+
+#ifdef CONFIG_FINEIBT
+	/*
+	 * The ITS thunk contains an indirect jump and an int3 instruction so
+	 * its size is 3 or 4 bytes depending on the register used. If CFI
+	 * paranoid is used then 3 extra bytes are added in the ITS thunk to
+	 * complete the fineibt_paranoid caller sequence.
+	 */
+	if (cfi_paranoid)
+		size += 3;
+#endif
+
+	if (!its_page || (its_offset + size - 1) >= PAGE_SIZE) {
+		its_page = its_alloc();
+		if (!its_page) {
+			pr_err("ITS page allocation failed\n");
+			return NULL;
+		}
+		memset(its_page, INT3_INSN_OPCODE, PAGE_SIZE);
+		its_offset = 32;
+	}
+
+	/*
+	 * If the indirect branch instruction will be in the lower half
+	 * of a cacheline, then update the offset to reach the upper half.
+	 */
+	if ((its_offset + size - 1) % 64 < 32)
+		its_offset = ((its_offset - 1) | 0x3F) + 33;
+
+	thunk = its_page + its_offset;
+	its_offset += size;
+
+	return its_init_thunk(thunk, reg);
+}
+
+u8 *its_static_thunk(int reg)
+{
+	u8 *thunk = __x86_indirect_its_thunk_array[reg];
+
+#ifdef CONFIG_FINEIBT
+	/* Paranoid thunk starts 2 bytes before */
+	if (cfi_paranoid)
+		return thunk - 2;
+#endif
+	return thunk;
+}
+
+#endif
+
 /*
  * Nomenclature for variable names to simplify and clarify this code and ease
  * any potential staring at it:
@@ -581,7 +749,8 @@ static int emit_indirect(int op, int reg, u8 *bytes)
 	return i;
 }
 
-static int emit_call_track_retpoline(void *addr, struct insn *insn, int reg, u8 *bytes)
+static int __emit_trampoline(void *addr, struct insn *insn, u8 *bytes,
+			     void *call_dest, void *jmp_dest)
 {
 	u8 op = insn->opcode.bytes[0];
 	int i = 0;
@@ -602,7 +771,7 @@ static int emit_call_track_retpoline(void *addr, struct insn *insn, int reg, u8
 	switch (op) {
 	case CALL_INSN_OPCODE:
 		__text_gen_insn(bytes+i, op, addr+i,
-				__x86_indirect_call_thunk_array[reg],
+				call_dest,
 				CALL_INSN_SIZE);
 		i += CALL_INSN_SIZE;
 		break;
@@ -610,7 +779,7 @@ static int emit_call_track_retpoline(void *addr, struct insn *insn, int reg, u8
 	case JMP32_INSN_OPCODE:
 clang_jcc:
 		__text_gen_insn(bytes+i, op, addr+i,
-				__x86_indirect_jump_thunk_array[reg],
+				jmp_dest,
 				JMP32_INSN_SIZE);
 		i += JMP32_INSN_SIZE;
 		break;
@@ -625,6 +794,48 @@ clang_jcc:
 	return i;
 }
 
+static int emit_call_track_retpoline(void *addr, struct insn *insn, int reg, u8 *bytes)
+{
+	return __emit_trampoline(addr, insn, bytes,
+				 __x86_indirect_call_thunk_array[reg],
+				 __x86_indirect_jump_thunk_array[reg]);
+}
+
+#ifdef CONFIG_MITIGATION_ITS
+static int emit_its_trampoline(void *addr, struct insn *insn, int reg, u8 *bytes)
+{
+	u8 *thunk = __x86_indirect_its_thunk_array[reg];
+	u8 *tmp = its_allocate_thunk(reg);
+
+	if (tmp)
+		thunk = tmp;
+
+	return __emit_trampoline(addr, insn, bytes, thunk, thunk);
+}
+
+/* Check if an indirect branch is at ITS-unsafe address */
+static bool cpu_wants_indirect_its_thunk_at(unsigned long addr, int reg)
+{
+	if (!cpu_feature_enabled(X86_FEATURE_INDIRECT_THUNK_ITS))
+		return false;
+
+	/* Indirect branch opcode is 2 or 3 bytes depending on reg */
+	addr += 1 + reg / 8;
+
+	/* Lower-half of the cacheline? */
+	return !(addr & 0x20);
+}
+#else /* CONFIG_MITIGATION_ITS */
+
+#ifdef CONFIG_FINEIBT
+static bool cpu_wants_indirect_its_thunk_at(unsigned long addr, int reg)
+{
+	return false;
+}
+#endif
+
+#endif /* CONFIG_MITIGATION_ITS */
+
 /*
  * Rewrite the compiler generated retpoline thunk calls.
  *
@@ -699,6 +910,15 @@ static int patch_retpoline(void *addr, struct insn *insn, u8 *bytes)
 		bytes[i++] = 0xe8; /* LFENCE */
 	}
 
+#ifdef CONFIG_MITIGATION_ITS
+	/*
+	 * Check if the address of last byte of emitted-indirect is in
+	 * lower-half of the cacheline. Such branches need ITS mitigation.
+	 */
+	if (cpu_wants_indirect_its_thunk_at((unsigned long)addr + i, reg))
+		return emit_its_trampoline(addr, insn, reg, bytes);
+#endif
+
 	ret = emit_indirect(op, reg, bytes + i);
 	if (ret < 0)
 		return ret;
@@ -732,6 +952,7 @@ void __init_or_module noinline apply_retpolines(s32 *start, s32 *end)
 		int len, ret;
 		u8 bytes[16];
 		u8 op1, op2;
+		u8 *dest;
 
 		ret = insn_decode_kernel(&insn, addr);
 		if (WARN_ON_ONCE(ret < 0))
@@ -748,6 +969,12 @@ void __init_or_module noinline apply_retpolines(s32 *start, s32 *end)
 
 		case CALL_INSN_OPCODE:
 		case JMP32_INSN_OPCODE:
+			/* Check for cfi_paranoid + ITS */
+			dest = addr + insn.length + insn.immediate.value;
+			if (dest[-1] == 0xea && (dest[0] & 0xf0) == 0x70) {
+				WARN_ON_ONCE(cfi_mode != CFI_FINEIBT);
+				continue;
+			}
 			break;
 
 		case 0x0f: /* escape */
@@ -775,6 +1002,21 @@ void __init_or_module noinline apply_retpolines(s32 *start, s32 *end)
 
 #ifdef CONFIG_MITIGATION_RETHUNK
 
+bool cpu_wants_rethunk(void)
+{
+	return cpu_feature_enabled(X86_FEATURE_RETHUNK);
+}
+
+bool cpu_wants_rethunk_at(void *addr)
+{
+	if (!cpu_feature_enabled(X86_FEATURE_RETHUNK))
+		return false;
+	if (x86_return_thunk != its_return_thunk)
+		return true;
+
+	return !((unsigned long)addr & 0x20);
+}
+
 /*
  * Rewrite the compiler generated return thunk tail-calls.
  *
@@ -791,7 +1033,7 @@ static int patch_return(void *addr, struct insn *insn, u8 *bytes)
 	int i = 0;
 
 	/* Patch the custom return thunks... */
-	if (cpu_feature_enabled(X86_FEATURE_RETHUNK)) {
+	if (cpu_wants_rethunk_at(addr)) {
 		i = JMP32_INSN_SIZE;
 		__text_gen_insn(bytes, JMP32_INSN_OPCODE, addr, x86_return_thunk, i);
 	} else {
@@ -808,7 +1050,7 @@ void __init_or_module noinline apply_returns(s32 *start, s32 *end)
 {
 	s32 *s;
 
-	if (cpu_feature_enabled(X86_FEATURE_RETHUNK))
+	if (cpu_wants_rethunk())
 		static_call_force_reinit();
 
 	for (s = start; s < end; s++) {
@@ -1022,8 +1264,6 @@ int cfi_get_func_arity(void *func)
 static bool cfi_rand __ro_after_init = true;
 static u32  cfi_seed __ro_after_init;
 
-static bool cfi_paranoid __ro_after_init = false;
-
 /*
  * Re-hash the CFI hash with a boot-time seed while making sure the result is
  * not a valid ENDBR instruction.
@@ -1436,6 +1676,19 @@ static int cfi_rand_callers(s32 *start, s32 *end)
 	return 0;
 }
 
+static int emit_paranoid_trampoline(void *addr, struct insn *insn, int reg, u8 *bytes)
+{
+	u8 *thunk = (void *)__x86_indirect_its_thunk_array[reg] - 2;
+
+#ifdef CONFIG_MITIGATION_ITS
+	u8 *tmp = its_allocate_thunk(reg);
+	if (tmp)
+		thunk = tmp;
+#endif
+
+	return __emit_trampoline(addr, insn, bytes, thunk, thunk);
+}
+
 static int cfi_rewrite_callers(s32 *start, s32 *end)
 {
 	s32 *s;
@@ -1477,9 +1730,14 @@ static int cfi_rewrite_callers(s32 *start, s32 *end)
 		memcpy(bytes, fineibt_paranoid_start, fineibt_paranoid_size);
 		memcpy(bytes + fineibt_caller_hash, &hash, 4);
 
-		ret = emit_indirect(op, 11, bytes + fineibt_paranoid_ind);
-		if (WARN_ON_ONCE(ret != 3))
-			continue;
+		if (cpu_wants_indirect_its_thunk_at((unsigned long)addr + fineibt_paranoid_ind, 11)) {
+			emit_paranoid_trampoline(addr + fineibt_caller_size,
+						 &insn, 11, bytes + fineibt_caller_size);
+		} else {
+			ret = emit_indirect(op, 11, bytes + fineibt_paranoid_ind);
+			if (WARN_ON_ONCE(ret != 3))
+				continue;
+		}
 
 		text_poke_early(addr, bytes, fineibt_paranoid_size);
 	}
@@ -1706,29 +1964,66 @@ Efault:
 	return false;
 }
 
+static bool is_paranoid_thunk(unsigned long addr)
+{
+	u32 thunk;
+
+	__get_kernel_nofault(&thunk, (u32 *)addr, u32, Efault);
+	return (thunk & 0x00FFFFFF) == 0xfd75ea;
+
+Efault:
+	return false;
+}
+
 /*
  * regs->ip points to a LOCK Jcc.d8 instruction from the fineibt_paranoid_start[]
- * sequence.
+ * sequence, or to an invalid instruction (0xea) + Jcc.d8 for cfi_paranoid + ITS
+ * thunk.
  */
 static bool decode_fineibt_paranoid(struct pt_regs *regs, unsigned long *target, u32 *type)
 {
 	unsigned long addr = regs->ip - fineibt_paranoid_ud;
-	u32 hash;
 
-	if (!cfi_paranoid || !is_cfi_trap(addr + fineibt_caller_size - LEN_UD2))
+	if (!cfi_paranoid)
 		return false;
 
-	__get_kernel_nofault(&hash, addr + fineibt_caller_hash, u32, Efault);
-	*target = regs->r11 + fineibt_preamble_size;
-	*type = regs->r10;
+	if (is_cfi_trap(addr + fineibt_caller_size - LEN_UD2)) {
+		*target = regs->r11 + fineibt_preamble_size;
+		*type = regs->r10;
+
+		/*
+		 * Since the trapping instruction is the exact, but LOCK prefixed,
+		 * Jcc.d8 that got us here, the normal fixup will work.
+		 */
+		return true;
+	}
 
 	/*
-	 * Since the trapping instruction is the exact, but LOCK prefixed,
-	 * Jcc.d8 that got us here, the normal fixup will work.
+	 * The cfi_paranoid + ITS thunk combination results in:
+	 *
+	 *  0:   41 ba 78 56 34 12       mov    $0x12345678, %r10d
+	 *  6:   45 3b 53 f7             cmp    -0x9(%r11), %r10d
+	 *  a:   4d 8d 5b f0             lea    -0x10(%r11), %r11
+	 *  e:   2e e8 XX XX XX XX	 cs call __x86_indirect_paranoid_thunk_r11
+	 *
+	 * Where the paranoid_thunk looks like:
+	 *
+	 *  1d:  <ea>                    (bad)
+	 *  __x86_indirect_paranoid_thunk_r11:
+	 *  1e:  75 fd                   jne 1d
+	 *  __x86_indirect_its_thunk_r11:
+	 *  20:  41 ff eb                jmp *%r11
+	 *  23:  cc                      int3
+	 *
 	 */
-	return true;
+	if (is_paranoid_thunk(regs->ip)) {
+		*target = regs->r11 + fineibt_preamble_size;
+		*type = regs->r10;
+
+		regs->ip = *target;
+		return true;
+	}
 
-Efault:
 	return false;
 }
 
@@ -2031,6 +2326,8 @@ static noinline void __init alt_reloc_selftest(void)
 
 void __init alternative_instructions(void)
 {
+	u64 ibt;
+
 	int3_selftest();
 
 	/*
@@ -2057,6 +2354,9 @@ void __init alternative_instructions(void)
 	 */
 	paravirt_set_cap();
 
+	/* Keep CET-IBT disabled until caller/callee are patched */
+	ibt = ibt_save(/*disable*/ true);
+
 	__apply_fineibt(__retpoline_sites, __retpoline_sites_end,
 			__cfi_sites, __cfi_sites_end, true);
 
@@ -2080,6 +2380,8 @@ void __init alternative_instructions(void)
 	 */
 	apply_seal_endbr(__ibt_endbr_seal, __ibt_endbr_seal_end);
 
+	ibt_restore(ibt);
+
 #ifdef CONFIG_SMP
 	/* Patch to UP if other cpus not imminent. */
 	if (!noreplace_smp && (num_present_cpus() == 1 || setup_max_cpus <= 1)) {
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 2b36379ff675..4e06baab40bb 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -472,6 +472,11 @@ static void bsp_init_amd(struct cpuinfo_x86 *c)
 		case 0x60 ... 0x7f:
 			setup_force_cpu_cap(X86_FEATURE_ZEN5);
 			break;
+		case 0x50 ... 0x5f:
+		case 0x90 ... 0xaf:
+		case 0xc0 ... 0xcf:
+			setup_force_cpu_cap(X86_FEATURE_ZEN6);
+			break;
 		default:
 			goto warn;
 		}
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index 362602b705cc..8596ce85026c 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -49,6 +49,7 @@ static void __init srbds_select_mitigation(void);
 static void __init l1d_flush_select_mitigation(void);
 static void __init srso_select_mitigation(void);
 static void __init gds_select_mitigation(void);
+static void __init its_select_mitigation(void);
 
 /* The base value of the SPEC_CTRL MSR without task-specific bits set */
 u64 x86_spec_ctrl_base;
@@ -66,6 +67,14 @@ static DEFINE_MUTEX(spec_ctrl_mutex);
 
 void (*x86_return_thunk)(void) __ro_after_init = __x86_return_thunk;
 
+static void __init set_return_thunk(void *thunk)
+{
+	if (x86_return_thunk != __x86_return_thunk)
+		pr_warn("x86/bugs: return thunk changed\n");
+
+	x86_return_thunk = thunk;
+}
+
 /* Update SPEC_CTRL MSR and its cached copy unconditionally */
 static void update_spec_ctrl(u64 val)
 {
@@ -178,6 +187,7 @@ void __init cpu_select_mitigations(void)
 	 */
 	srso_select_mitigation();
 	gds_select_mitigation();
+	its_select_mitigation();
 }
 
 /*
@@ -1118,7 +1128,7 @@ do_cmd_auto:
 		setup_force_cpu_cap(X86_FEATURE_RETHUNK);
 		setup_force_cpu_cap(X86_FEATURE_UNRET);
 
-		x86_return_thunk = retbleed_return_thunk;
+		set_return_thunk(retbleed_return_thunk);
 
 		if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD &&
 		    boot_cpu_data.x86_vendor != X86_VENDOR_HYGON)
@@ -1153,7 +1163,7 @@ do_cmd_auto:
 		setup_force_cpu_cap(X86_FEATURE_RETHUNK);
 		setup_force_cpu_cap(X86_FEATURE_CALL_DEPTH);
 
-		x86_return_thunk = call_depth_return_thunk;
+		set_return_thunk(call_depth_return_thunk);
 		break;
 
 	default:
@@ -1188,6 +1198,145 @@ do_cmd_auto:
 }
 
 #undef pr_fmt
+#define pr_fmt(fmt)     "ITS: " fmt
+
+enum its_mitigation_cmd {
+	ITS_CMD_OFF,
+	ITS_CMD_ON,
+	ITS_CMD_VMEXIT,
+	ITS_CMD_RSB_STUFF,
+};
+
+enum its_mitigation {
+	ITS_MITIGATION_OFF,
+	ITS_MITIGATION_VMEXIT_ONLY,
+	ITS_MITIGATION_ALIGNED_THUNKS,
+	ITS_MITIGATION_RETPOLINE_STUFF,
+};
+
+static const char * const its_strings[] = {
+	[ITS_MITIGATION_OFF]			= "Vulnerable",
+	[ITS_MITIGATION_VMEXIT_ONLY]		= "Mitigation: Vulnerable, KVM: Not affected",
+	[ITS_MITIGATION_ALIGNED_THUNKS]		= "Mitigation: Aligned branch/return thunks",
+	[ITS_MITIGATION_RETPOLINE_STUFF]	= "Mitigation: Retpolines, Stuffing RSB",
+};
+
+static enum its_mitigation its_mitigation __ro_after_init = ITS_MITIGATION_ALIGNED_THUNKS;
+
+static enum its_mitigation_cmd its_cmd __ro_after_init =
+	IS_ENABLED(CONFIG_MITIGATION_ITS) ? ITS_CMD_ON : ITS_CMD_OFF;
+
+static int __init its_parse_cmdline(char *str)
+{
+	if (!str)
+		return -EINVAL;
+
+	if (!IS_ENABLED(CONFIG_MITIGATION_ITS)) {
+		pr_err("Mitigation disabled at compile time, ignoring option (%s)", str);
+		return 0;
+	}
+
+	if (!strcmp(str, "off")) {
+		its_cmd = ITS_CMD_OFF;
+	} else if (!strcmp(str, "on")) {
+		its_cmd = ITS_CMD_ON;
+	} else if (!strcmp(str, "force")) {
+		its_cmd = ITS_CMD_ON;
+		setup_force_cpu_bug(X86_BUG_ITS);
+	} else if (!strcmp(str, "vmexit")) {
+		its_cmd = ITS_CMD_VMEXIT;
+	} else if (!strcmp(str, "stuff")) {
+		its_cmd = ITS_CMD_RSB_STUFF;
+	} else {
+		pr_err("Ignoring unknown indirect_target_selection option (%s).", str);
+	}
+
+	return 0;
+}
+early_param("indirect_target_selection", its_parse_cmdline);
+
+static void __init its_select_mitigation(void)
+{
+	enum its_mitigation_cmd cmd = its_cmd;
+
+	if (!boot_cpu_has_bug(X86_BUG_ITS) || cpu_mitigations_off()) {
+		its_mitigation = ITS_MITIGATION_OFF;
+		return;
+	}
+
+	/* Retpoline+CDT mitigates ITS, bail out */
+	if (boot_cpu_has(X86_FEATURE_RETPOLINE) &&
+	    boot_cpu_has(X86_FEATURE_CALL_DEPTH)) {
+		its_mitigation = ITS_MITIGATION_RETPOLINE_STUFF;
+		goto out;
+	}
+
+	/* Exit early to avoid irrelevant warnings */
+	if (cmd == ITS_CMD_OFF) {
+		its_mitigation = ITS_MITIGATION_OFF;
+		goto out;
+	}
+	if (spectre_v2_enabled == SPECTRE_V2_NONE) {
+		pr_err("WARNING: Spectre-v2 mitigation is off, disabling ITS\n");
+		its_mitigation = ITS_MITIGATION_OFF;
+		goto out;
+	}
+	if (!IS_ENABLED(CONFIG_MITIGATION_RETPOLINE) ||
+	    !IS_ENABLED(CONFIG_MITIGATION_RETHUNK)) {
+		pr_err("WARNING: ITS mitigation depends on retpoline and rethunk support\n");
+		its_mitigation = ITS_MITIGATION_OFF;
+		goto out;
+	}
+	if (IS_ENABLED(CONFIG_DEBUG_FORCE_FUNCTION_ALIGN_64B)) {
+		pr_err("WARNING: ITS mitigation is not compatible with CONFIG_DEBUG_FORCE_FUNCTION_ALIGN_64B\n");
+		its_mitigation = ITS_MITIGATION_OFF;
+		goto out;
+	}
+	if (boot_cpu_has(X86_FEATURE_RETPOLINE_LFENCE)) {
+		pr_err("WARNING: ITS mitigation is not compatible with lfence mitigation\n");
+		its_mitigation = ITS_MITIGATION_OFF;
+		goto out;
+	}
+
+	if (cmd == ITS_CMD_RSB_STUFF &&
+	    (!boot_cpu_has(X86_FEATURE_RETPOLINE) || !IS_ENABLED(CONFIG_MITIGATION_CALL_DEPTH_TRACKING))) {
+		pr_err("RSB stuff mitigation not supported, using default\n");
+		cmd = ITS_CMD_ON;
+	}
+
+	switch (cmd) {
+	case ITS_CMD_OFF:
+		its_mitigation = ITS_MITIGATION_OFF;
+		break;
+	case ITS_CMD_VMEXIT:
+		if (boot_cpu_has_bug(X86_BUG_ITS_NATIVE_ONLY)) {
+			its_mitigation = ITS_MITIGATION_VMEXIT_ONLY;
+			goto out;
+		}
+		fallthrough;
+	case ITS_CMD_ON:
+		its_mitigation = ITS_MITIGATION_ALIGNED_THUNKS;
+		if (!boot_cpu_has(X86_FEATURE_RETPOLINE))
+			setup_force_cpu_cap(X86_FEATURE_INDIRECT_THUNK_ITS);
+		setup_force_cpu_cap(X86_FEATURE_RETHUNK);
+		set_return_thunk(its_return_thunk);
+		break;
+	case ITS_CMD_RSB_STUFF:
+		its_mitigation = ITS_MITIGATION_RETPOLINE_STUFF;
+		setup_force_cpu_cap(X86_FEATURE_RETHUNK);
+		setup_force_cpu_cap(X86_FEATURE_CALL_DEPTH);
+		set_return_thunk(call_depth_return_thunk);
+		if (retbleed_mitigation == RETBLEED_MITIGATION_NONE) {
+			retbleed_mitigation = RETBLEED_MITIGATION_STUFF;
+			pr_info("Retbleed mitigation updated to stuffing\n");
+		}
+		break;
+	}
+out:
+	pr_info("%s\n", its_strings[its_mitigation]);
+}
+
+#undef pr_fmt
 #define pr_fmt(fmt)     "Spectre V2 : " fmt
 
 static enum spectre_v2_user_mitigation spectre_v2_user_stibp __ro_after_init =
@@ -1697,11 +1846,11 @@ static void __init bhi_select_mitigation(void)
 			return;
 	}
 
-	/* Mitigate in hardware if supported */
-	if (spec_ctrl_bhi_dis())
+	if (!IS_ENABLED(CONFIG_X86_64))
 		return;
 
-	if (!IS_ENABLED(CONFIG_X86_64))
+	/* Mitigate in hardware if supported */
+	if (spec_ctrl_bhi_dis())
 		return;
 
 	if (bhi_mitigation == BHI_MITIGATION_VMEXIT_ONLY) {
@@ -2607,10 +2756,10 @@ static void __init srso_select_mitigation(void)
 
 			if (boot_cpu_data.x86 == 0x19) {
 				setup_force_cpu_cap(X86_FEATURE_SRSO_ALIAS);
-				x86_return_thunk = srso_alias_return_thunk;
+				set_return_thunk(srso_alias_return_thunk);
 			} else {
 				setup_force_cpu_cap(X86_FEATURE_SRSO);
-				x86_return_thunk = srso_return_thunk;
+				set_return_thunk(srso_return_thunk);
 			}
 			if (has_microcode)
 				srso_mitigation = SRSO_MITIGATION_SAFE_RET;
@@ -2800,6 +2949,11 @@ static ssize_t rfds_show_state(char *buf)
 	return sysfs_emit(buf, "%s\n", rfds_strings[rfds_mitigation]);
 }
 
+static ssize_t its_show_state(char *buf)
+{
+	return sysfs_emit(buf, "%s\n", its_strings[its_mitigation]);
+}
+
 static char *stibp_state(void)
 {
 	if (spectre_v2_in_eibrs_mode(spectre_v2_enabled) &&
@@ -2982,6 +3136,9 @@ static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr
 	case X86_BUG_RFDS:
 		return rfds_show_state(buf);
 
+	case X86_BUG_ITS:
+		return its_show_state(buf);
+
 	default:
 		break;
 	}
@@ -3061,6 +3218,11 @@ ssize_t cpu_show_reg_file_data_sampling(struct device *dev, struct device_attrib
 {
 	return cpu_show_common(dev, attr, buf, X86_BUG_RFDS);
 }
+
+ssize_t cpu_show_indirect_target_selection(struct device *dev, struct device_attribute *attr, char *buf)
+{
+	return cpu_show_common(dev, attr, buf, X86_BUG_ITS);
+}
 #endif
 
 void __warn_thunk(void)
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 12126adbc3a9..0ff057ff11ce 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -1227,6 +1227,10 @@ static const __initconst struct x86_cpu_id cpu_vuln_whitelist[] = {
 #define GDS		BIT(6)
 /* CPU is affected by Register File Data Sampling */
 #define RFDS		BIT(7)
+/* CPU is affected by Indirect Target Selection */
+#define ITS		BIT(8)
+/* CPU is affected by Indirect Target Selection, but guest-host isolation is not affected */
+#define ITS_NATIVE_ONLY	BIT(9)
 
 static const struct x86_cpu_id cpu_vuln_blacklist[] __initconst = {
 	VULNBL_INTEL_STEPS(INTEL_IVYBRIDGE,	     X86_STEP_MAX,	SRBDS),
@@ -1238,22 +1242,25 @@ static const struct x86_cpu_id cpu_vuln_blacklist[] __initconst = {
 	VULNBL_INTEL_STEPS(INTEL_BROADWELL_G,	     X86_STEP_MAX,	SRBDS),
 	VULNBL_INTEL_STEPS(INTEL_BROADWELL_X,	     X86_STEP_MAX,	MMIO),
 	VULNBL_INTEL_STEPS(INTEL_BROADWELL,	     X86_STEP_MAX,	SRBDS),
-	VULNBL_INTEL_STEPS(INTEL_SKYLAKE_X,	     X86_STEP_MAX,	MMIO | RETBLEED | GDS),
+	VULNBL_INTEL_STEPS(INTEL_SKYLAKE_X,		      0x5,	MMIO | RETBLEED | GDS),
+	VULNBL_INTEL_STEPS(INTEL_SKYLAKE_X,	     X86_STEP_MAX,	MMIO | RETBLEED | GDS | ITS),
 	VULNBL_INTEL_STEPS(INTEL_SKYLAKE_L,	     X86_STEP_MAX,	MMIO | RETBLEED | GDS | SRBDS),
 	VULNBL_INTEL_STEPS(INTEL_SKYLAKE,	     X86_STEP_MAX,	MMIO | RETBLEED | GDS | SRBDS),
-	VULNBL_INTEL_STEPS(INTEL_KABYLAKE_L,	     X86_STEP_MAX,	MMIO | RETBLEED | GDS | SRBDS),
-	VULNBL_INTEL_STEPS(INTEL_KABYLAKE,	     X86_STEP_MAX,	MMIO | RETBLEED | GDS | SRBDS),
+	VULNBL_INTEL_STEPS(INTEL_KABYLAKE_L,		      0xb,	MMIO | RETBLEED | GDS | SRBDS),
+	VULNBL_INTEL_STEPS(INTEL_KABYLAKE_L,	     X86_STEP_MAX,	MMIO | RETBLEED | GDS | SRBDS | ITS),
+	VULNBL_INTEL_STEPS(INTEL_KABYLAKE,		      0xc,	MMIO | RETBLEED | GDS | SRBDS),
+	VULNBL_INTEL_STEPS(INTEL_KABYLAKE,	     X86_STEP_MAX,	MMIO | RETBLEED | GDS | SRBDS | ITS),
 	VULNBL_INTEL_STEPS(INTEL_CANNONLAKE_L,	     X86_STEP_MAX,	RETBLEED),
-	VULNBL_INTEL_STEPS(INTEL_ICELAKE_L,	     X86_STEP_MAX,	MMIO | MMIO_SBDS | RETBLEED | GDS),
-	VULNBL_INTEL_STEPS(INTEL_ICELAKE_D,	     X86_STEP_MAX,	MMIO | GDS),
-	VULNBL_INTEL_STEPS(INTEL_ICELAKE_X,	     X86_STEP_MAX,	MMIO | GDS),
-	VULNBL_INTEL_STEPS(INTEL_COMETLAKE,	     X86_STEP_MAX,	MMIO | MMIO_SBDS | RETBLEED | GDS),
-	VULNBL_INTEL_STEPS(INTEL_COMETLAKE_L,		      0x0,	MMIO | RETBLEED),
-	VULNBL_INTEL_STEPS(INTEL_COMETLAKE_L,	     X86_STEP_MAX,	MMIO | MMIO_SBDS | RETBLEED | GDS),
-	VULNBL_INTEL_STEPS(INTEL_TIGERLAKE_L,	     X86_STEP_MAX,	GDS),
-	VULNBL_INTEL_STEPS(INTEL_TIGERLAKE,	     X86_STEP_MAX,	GDS),
+	VULNBL_INTEL_STEPS(INTEL_ICELAKE_L,	     X86_STEP_MAX,	MMIO | MMIO_SBDS | RETBLEED | GDS | ITS | ITS_NATIVE_ONLY),
+	VULNBL_INTEL_STEPS(INTEL_ICELAKE_D,	     X86_STEP_MAX,	MMIO | GDS | ITS | ITS_NATIVE_ONLY),
+	VULNBL_INTEL_STEPS(INTEL_ICELAKE_X,	     X86_STEP_MAX,	MMIO | GDS | ITS | ITS_NATIVE_ONLY),
+	VULNBL_INTEL_STEPS(INTEL_COMETLAKE,	     X86_STEP_MAX,	MMIO | MMIO_SBDS | RETBLEED | GDS | ITS),
+	VULNBL_INTEL_STEPS(INTEL_COMETLAKE_L,		      0x0,	MMIO | RETBLEED | ITS),
+	VULNBL_INTEL_STEPS(INTEL_COMETLAKE_L,	     X86_STEP_MAX,	MMIO | MMIO_SBDS | RETBLEED | GDS | ITS),
+	VULNBL_INTEL_STEPS(INTEL_TIGERLAKE_L,	     X86_STEP_MAX,	GDS | ITS | ITS_NATIVE_ONLY),
+	VULNBL_INTEL_STEPS(INTEL_TIGERLAKE,	     X86_STEP_MAX,	GDS | ITS | ITS_NATIVE_ONLY),
 	VULNBL_INTEL_STEPS(INTEL_LAKEFIELD,	     X86_STEP_MAX,	MMIO | MMIO_SBDS | RETBLEED),
-	VULNBL_INTEL_STEPS(INTEL_ROCKETLAKE,	     X86_STEP_MAX,	MMIO | RETBLEED | GDS),
+	VULNBL_INTEL_STEPS(INTEL_ROCKETLAKE,	     X86_STEP_MAX,	MMIO | RETBLEED | GDS | ITS | ITS_NATIVE_ONLY),
 	VULNBL_INTEL_TYPE(INTEL_ALDERLAKE,		     ATOM,	RFDS),
 	VULNBL_INTEL_STEPS(INTEL_ALDERLAKE_L,	     X86_STEP_MAX,	RFDS),
 	VULNBL_INTEL_TYPE(INTEL_RAPTORLAKE,		     ATOM,	RFDS),
@@ -1318,6 +1325,32 @@ static bool __init vulnerable_to_rfds(u64 x86_arch_cap_msr)
 	return cpu_matches(cpu_vuln_blacklist, RFDS);
 }
 
+static bool __init vulnerable_to_its(u64 x86_arch_cap_msr)
+{
+	/* The "immunity" bit trumps everything else: */
+	if (x86_arch_cap_msr & ARCH_CAP_ITS_NO)
+		return false;
+	if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
+		return false;
+
+	/* None of the affected CPUs have BHI_CTRL */
+	if (boot_cpu_has(X86_FEATURE_BHI_CTRL))
+		return false;
+
+	/*
+	 * If a VMM did not expose ITS_NO, assume that a guest could
+	 * be running on vulnerable hardware or may migrate to such
+	 * hardware.
+	 */
+	if (boot_cpu_has(X86_FEATURE_HYPERVISOR))
+		return true;
+
+	if (cpu_matches(cpu_vuln_blacklist, ITS))
+		return true;
+
+	return false;
+}
+
 static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c)
 {
 	u64 x86_arch_cap_msr = x86_read_arch_cap_msr();
@@ -1439,9 +1472,12 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c)
 	if (vulnerable_to_rfds(x86_arch_cap_msr))
 		setup_force_cpu_bug(X86_BUG_RFDS);
 
-	/* When virtualized, eIBRS could be hidden, assume vulnerable */
-	if (!(x86_arch_cap_msr & ARCH_CAP_BHI_NO) &&
-	    !cpu_matches(cpu_vuln_whitelist, NO_BHI) &&
+	/*
+	 * Intel parts with eIBRS are vulnerable to BHI attacks. Parts with
+	 * BHI_NO still need to use the BHI mitigation to prevent Intra-mode
+	 * attacks.  When virtualized, eIBRS could be hidden, assume vulnerable.
+	 */
+	if (!cpu_matches(cpu_vuln_whitelist, NO_BHI) &&
 	    (boot_cpu_has(X86_FEATURE_IBRS_ENHANCED) ||
 	     boot_cpu_has(X86_FEATURE_HYPERVISOR)))
 		setup_force_cpu_bug(X86_BUG_BHI);
@@ -1449,6 +1485,12 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c)
 	if (cpu_has(c, X86_FEATURE_AMD_IBPB) && !cpu_has(c, X86_FEATURE_AMD_IBPB_RET))
 		setup_force_cpu_bug(X86_BUG_IBPB_NO_RET);
 
+	if (vulnerable_to_its(x86_arch_cap_msr)) {
+		setup_force_cpu_bug(X86_BUG_ITS);
+		if (cpu_matches(cpu_vuln_blacklist, ITS_NATIVE_ONLY))
+			setup_force_cpu_bug(X86_BUG_ITS_NATIVE_ONLY);
+	}
+
 	if (cpu_matches(cpu_vuln_whitelist, NO_MELTDOWN))
 		return;
 
diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c
index 4a10d35e70aa..96cb992d50ef 100644
--- a/arch/x86/kernel/cpu/microcode/amd.c
+++ b/arch/x86/kernel/cpu/microcode/amd.c
@@ -1098,15 +1098,17 @@ static enum ucode_state load_microcode_amd(u8 family, const u8 *data, size_t siz
 
 static int __init save_microcode_in_initrd(void)
 {
-	unsigned int cpuid_1_eax = native_cpuid_eax(1);
 	struct cpuinfo_x86 *c = &boot_cpu_data;
 	struct cont_desc desc = { 0 };
+	unsigned int cpuid_1_eax;
 	enum ucode_state ret;
 	struct cpio_data cp;
 
-	if (dis_ucode_ldr || c->x86_vendor != X86_VENDOR_AMD || c->x86 < 0x10)
+	if (microcode_loader_disabled() || c->x86_vendor != X86_VENDOR_AMD || c->x86 < 0x10)
 		return 0;
 
+	cpuid_1_eax = native_cpuid_eax(1);
+
 	if (!find_blobs_in_containers(&cp))
 		return -EINVAL;
 
diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c
index b3658d11e7b6..079f046ee26d 100644
--- a/arch/x86/kernel/cpu/microcode/core.c
+++ b/arch/x86/kernel/cpu/microcode/core.c
@@ -41,8 +41,8 @@
 
 #include "internal.h"
 
-static struct microcode_ops	*microcode_ops;
-bool dis_ucode_ldr = true;
+static struct microcode_ops *microcode_ops;
+static bool dis_ucode_ldr = false;
 
 bool force_minrev = IS_ENABLED(CONFIG_MICROCODE_LATE_FORCE_MINREV);
 module_param(force_minrev, bool, S_IRUSR | S_IWUSR);
@@ -84,6 +84,9 @@ static bool amd_check_current_patch_level(void)
 	u32 lvl, dummy, i;
 	u32 *levels;
 
+	if (x86_cpuid_vendor() != X86_VENDOR_AMD)
+		return false;
+
 	native_rdmsr(MSR_AMD64_PATCH_LEVEL, lvl, dummy);
 
 	levels = final_levels;
@@ -95,27 +98,29 @@ static bool amd_check_current_patch_level(void)
 	return false;
 }
 
-static bool __init check_loader_disabled_bsp(void)
+bool __init microcode_loader_disabled(void)
 {
-	static const char *__dis_opt_str = "dis_ucode_ldr";
-	const char *cmdline = boot_command_line;
-	const char *option  = __dis_opt_str;
+	if (dis_ucode_ldr)
+		return true;
 
 	/*
-	 * CPUID(1).ECX[31]: reserved for hypervisor use. This is still not
-	 * completely accurate as xen pv guests don't see that CPUID bit set but
-	 * that's good enough as they don't land on the BSP path anyway.
+	 * Disable when:
+	 *
+	 * 1) The CPU does not support CPUID.
+	 *
+	 * 2) Bit 31 in CPUID[1]:ECX is set
+	 *    The bit is reserved for hypervisor use. This is still not
+	 *    completely accurate as XEN PV guests don't see that CPUID bit
+	 *    set, but that's good enough as they don't land on the BSP
+	 *    path anyway.
+	 *
+	 * 3) Certain AMD patch levels are not allowed to be
+	 *    overwritten.
 	 */
-	if (native_cpuid_ecx(1) & BIT(31))
-		return true;
-
-	if (x86_cpuid_vendor() == X86_VENDOR_AMD) {
-		if (amd_check_current_patch_level())
-			return true;
-	}
-
-	if (cmdline_find_option_bool(cmdline, option) <= 0)
-		dis_ucode_ldr = false;
+	if (!have_cpuid_p() ||
+	    native_cpuid_ecx(1) & BIT(31) ||
+	    amd_check_current_patch_level())
+		dis_ucode_ldr = true;
 
 	return dis_ucode_ldr;
 }
@@ -125,7 +130,10 @@ void __init load_ucode_bsp(void)
 	unsigned int cpuid_1_eax;
 	bool intel = true;
 
-	if (!have_cpuid_p())
+	if (cmdline_find_option_bool(boot_command_line, "dis_ucode_ldr") > 0)
+		dis_ucode_ldr = true;
+
+	if (microcode_loader_disabled())
 		return;
 
 	cpuid_1_eax = native_cpuid_eax(1);
@@ -146,9 +154,6 @@ void __init load_ucode_bsp(void)
 		return;
 	}
 
-	if (check_loader_disabled_bsp())
-		return;
-
 	if (intel)
 		load_ucode_intel_bsp(&early_data);
 	else
@@ -159,6 +164,11 @@ void load_ucode_ap(void)
 {
 	unsigned int cpuid_1_eax;
 
+	/*
+	 * Can't use microcode_loader_disabled() here - .init section
+	 * hell. It doesn't have to either - the BSP variant must've
+	 * parsed cmdline already anyway.
+	 */
 	if (dis_ucode_ldr)
 		return;
 
@@ -810,7 +820,7 @@ static int __init microcode_init(void)
 	struct cpuinfo_x86 *c = &boot_cpu_data;
 	int error;
 
-	if (dis_ucode_ldr)
+	if (microcode_loader_disabled())
 		return -EINVAL;
 
 	if (c->x86_vendor == X86_VENDOR_INTEL)
diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c
index 819199bc0119..2a397da43923 100644
--- a/arch/x86/kernel/cpu/microcode/intel.c
+++ b/arch/x86/kernel/cpu/microcode/intel.c
@@ -389,7 +389,7 @@ static int __init save_builtin_microcode(void)
 	if (xchg(&ucode_patch_va, NULL) != UCODE_BSP_LOADED)
 		return 0;
 
-	if (dis_ucode_ldr || boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
+	if (microcode_loader_disabled() || boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
 		return 0;
 
 	uci.mc = get_microcode_blob(&uci, true);
diff --git a/arch/x86/kernel/cpu/microcode/internal.h b/arch/x86/kernel/cpu/microcode/internal.h
index 5df621752fef..50a9702ae4e2 100644
--- a/arch/x86/kernel/cpu/microcode/internal.h
+++ b/arch/x86/kernel/cpu/microcode/internal.h
@@ -94,7 +94,6 @@ static inline unsigned int x86_cpuid_family(void)
 	return x86_family(eax);
 }
 
-extern bool dis_ucode_ldr;
 extern bool force_minrev;
 
 #ifdef CONFIG_CPU_SUP_AMD
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 9d8dd8deb2a7..9920122018a0 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -1299,6 +1299,14 @@ void __init e820__memblock_setup(void)
 		memblock_add(entry->addr, entry->size);
 	}
 
+	/*
+	 * 32-bit systems are limited to 4GB of memory even with HIGHMEM and
+	 * to even less without it.
+	 * Discard memory after max_pfn - the actual limit detected at runtime.
+	 */
+	if (IS_ENABLED(CONFIG_X86_32))
+		memblock_remove(PFN_PHYS(max_pfn), -1);
+
 	/* Throw away partial pages: */
 	memblock_trim_memory(PAGE_SIZE);
 
diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c
index 91d6341f281f..399f43aa78d5 100644
--- a/arch/x86/kernel/fpu/core.c
+++ b/arch/x86/kernel/fpu/core.c
@@ -43,8 +43,11 @@ struct fpu_state_config fpu_user_cfg __ro_after_init;
  */
 struct fpstate init_fpstate __ro_after_init;
 
-/* Track in-kernel FPU usage */
-static DEFINE_PER_CPU(bool, in_kernel_fpu);
+/*
+ * Track FPU initialization and kernel-mode usage. 'true' means the FPU is
+ * initialized and is not currently being used by the kernel:
+ */
+DEFINE_PER_CPU(bool, kernel_fpu_allowed);
 
 /*
  * Track which context is using the FPU on the CPU:
@@ -61,15 +64,18 @@ bool irq_fpu_usable(void)
 		return false;
 
 	/*
-	 * In kernel FPU usage already active?  This detects any explicitly
-	 * nested usage in task or softirq context, which is unsupported.  It
-	 * also detects attempted usage in a hardirq that has interrupted a
-	 * kernel-mode FPU section.
+	 * Return false in the following cases:
+	 *
+	 * - FPU is not yet initialized. This can happen only when the call is
+	 *   coming from CPU onlining, for example for microcode checksumming.
+	 * - The kernel is already using the FPU, either because of explicit
+	 *   nesting (which should never be done), or because of implicit
+	 *   nesting when a hardirq interrupted a kernel-mode FPU section.
+	 *
+	 * The single boolean check below handles both cases:
 	 */
-	if (this_cpu_read(in_kernel_fpu)) {
-		WARN_ON_FPU(!in_hardirq());
+	if (!this_cpu_read(kernel_fpu_allowed))
 		return false;
-	}
 
 	/*
 	 * When not in NMI or hard interrupt context, FPU can be used in:
@@ -431,9 +437,10 @@ void kernel_fpu_begin_mask(unsigned int kfpu_mask)
 		fpregs_lock();
 
 	WARN_ON_FPU(!irq_fpu_usable());
-	WARN_ON_FPU(this_cpu_read(in_kernel_fpu));
 
-	this_cpu_write(in_kernel_fpu, true);
+	/* Toggle kernel_fpu_allowed to false: */
+	WARN_ON_FPU(!this_cpu_read(kernel_fpu_allowed));
+	this_cpu_write(kernel_fpu_allowed, false);
 
 	if (!(current->flags & (PF_KTHREAD | PF_USER_WORKER)) &&
 	    !test_thread_flag(TIF_NEED_FPU_LOAD)) {
@@ -453,9 +460,10 @@ EXPORT_SYMBOL_GPL(kernel_fpu_begin_mask);
 
 void kernel_fpu_end(void)
 {
-	WARN_ON_FPU(!this_cpu_read(in_kernel_fpu));
+	/* Toggle kernel_fpu_allowed back to true: */
+	WARN_ON_FPU(this_cpu_read(kernel_fpu_allowed));
+	this_cpu_write(kernel_fpu_allowed, true);
 
-	this_cpu_write(in_kernel_fpu, false);
 	if (!irqs_disabled())
 		fpregs_unlock();
 }
diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c
index 998a08f17e33..1975c37c3668 100644
--- a/arch/x86/kernel/fpu/init.c
+++ b/arch/x86/kernel/fpu/init.c
@@ -51,6 +51,9 @@ void fpu__init_cpu(void)
 {
 	fpu__init_cpu_generic();
 	fpu__init_cpu_xstate();
+
+	/* Start allowing kernel-mode FPU: */
+	this_cpu_write(kernel_fpu_allowed, true);
 }
 
 static bool __init fpu__probe_without_cpuid(void)
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index cace6e8d7cc7..5eb1514af559 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -354,7 +354,7 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size)
 		goto fail;
 
 	ip = trampoline + size;
-	if (cpu_feature_enabled(X86_FEATURE_RETHUNK))
+	if (cpu_wants_rethunk_at(ip))
 		__text_gen_insn(ip, JMP32_INSN_OPCODE, ip, x86_return_thunk, JMP32_INSN_SIZE);
 	else
 		memcpy(ip, retq, sizeof(retq));
diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c
index de001b2146ab..375f2d7f1762 100644
--- a/arch/x86/kernel/head32.c
+++ b/arch/x86/kernel/head32.c
@@ -145,10 +145,6 @@ void __init __no_stack_protector mk_early_pgtbl_32(void)
 	*ptr = (unsigned long)ptep + PAGE_OFFSET;
 
 #ifdef CONFIG_MICROCODE_INITRD32
-	/* Running on a hypervisor? */
-	if (native_cpuid_ecx(1) & BIT(31))
-		return;
-
 	params = (struct boot_params *)__pa_nodebug(&boot_params);
 	if (!params->hdr.ramdisk_size || !params->hdr.ramdisk_image)
 		return;
diff --git a/arch/x86/kernel/machine_kexec_32.c b/arch/x86/kernel/machine_kexec_32.c
index 80265162aeff..1f325304c4a8 100644
--- a/arch/x86/kernel/machine_kexec_32.c
+++ b/arch/x86/kernel/machine_kexec_32.c
@@ -42,7 +42,7 @@ static void load_segments(void)
 
 static void machine_kexec_free_page_tables(struct kimage *image)
 {
-	free_pages((unsigned long)image->arch.pgd, PGD_ALLOCATION_ORDER);
+	free_pages((unsigned long)image->arch.pgd, pgd_allocation_order());
 	image->arch.pgd = NULL;
 #ifdef CONFIG_X86_PAE
 	free_page((unsigned long)image->arch.pmd0);
@@ -59,7 +59,7 @@ static void machine_kexec_free_page_tables(struct kimage *image)
 static int machine_kexec_alloc_page_tables(struct kimage *image)
 {
 	image->arch.pgd = (pgd_t *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
-						    PGD_ALLOCATION_ORDER);
+						    pgd_allocation_order());
 #ifdef CONFIG_X86_PAE
 	image->arch.pmd0 = (pmd_t *)get_zeroed_page(GFP_KERNEL);
 	image->arch.pmd1 = (pmd_t *)get_zeroed_page(GFP_KERNEL);
diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c
index a7998f351701..ff07558b7ebc 100644
--- a/arch/x86/kernel/module.c
+++ b/arch/x86/kernel/module.c
@@ -266,6 +266,8 @@ int module_finalize(const Elf_Ehdr *hdr,
 			ibt_endbr = s;
 	}
 
+	its_init_mod(me);
+
 	if (retpolines || cfi) {
 		void *rseg = NULL, *cseg = NULL;
 		unsigned int rsize = 0, csize = 0;
@@ -286,6 +288,9 @@ int module_finalize(const Elf_Ehdr *hdr,
 		void *rseg = (void *)retpolines->sh_addr;
 		apply_retpolines(rseg, rseg + retpolines->sh_size);
 	}
+
+	its_fini_mod(me);
+
 	if (returns) {
 		void *rseg = (void *)returns->sh_addr;
 		apply_returns(rseg, rseg + returns->sh_size);
@@ -326,4 +331,5 @@ int module_finalize(const Elf_Ehdr *hdr,
 void module_arch_cleanup(struct module *mod)
 {
 	alternatives_smp_module_del(mod);
+	its_free_mod(mod);
 }
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index d6cf1e23c2a3..2901f5cfd825 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -1188,6 +1188,12 @@ void cpu_disable_common(void)
 
 	remove_siblinginfo(cpu);
 
+	/*
+	 * Stop allowing kernel-mode FPU. This is needed so that if the CPU is
+	 * brought online again, the initial state is not allowed:
+	 */
+	this_cpu_write(kernel_fpu_allowed, false);
+
 	/* It's now safe to remove this processor from the online map */
 	lock_vector_lock();
 	remove_cpu_from_maps(cpu);
diff --git a/arch/x86/kernel/static_call.c b/arch/x86/kernel/static_call.c
index a59c72e77645..c3d7ff44b29a 100644
--- a/arch/x86/kernel/static_call.c
+++ b/arch/x86/kernel/static_call.c
@@ -81,7 +81,7 @@ static void __ref __static_call_transform(void *insn, enum insn_type type,
 		break;
 
 	case RET:
-		if (cpu_feature_enabled(X86_FEATURE_RETHUNK))
+		if (cpu_wants_rethunk_at(insn))
 			code = text_gen_insn(JMP32_INSN_OPCODE, insn, x86_return_thunk);
 		else
 			code = &retinsn;
@@ -90,7 +90,7 @@ static void __ref __static_call_transform(void *insn, enum insn_type type,
 	case JCC:
 		if (!func) {
 			func = __static_call_return;
-			if (cpu_feature_enabled(X86_FEATURE_RETHUNK))
+			if (cpu_wants_rethunk())
 				func = x86_return_thunk;
 		}
 
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index ccdc45e5b759..cda5f8362e9d 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -466,10 +466,18 @@ SECTIONS
 }
 
 /*
- * The ASSERT() sink to . is intentional, for binutils 2.14 compatibility:
+ * COMPILE_TEST kernels can be large - CONFIG_KASAN, for example, can cause
+ * this.  Let's assume that nobody will be running a COMPILE_TEST kernel and
+ * let's assert that fuller build coverage is more valuable than being able to
+ * run a COMPILE_TEST kernel.
+ */
+#ifndef CONFIG_COMPILE_TEST
+/*
+ * The ASSERT() sink to . is intentional, for binutils 2.14 compatibility:
  */
 . = ASSERT((_end - LOAD_OFFSET <= KERNEL_IMAGE_SIZE),
 	   "kernel image bigger than KERNEL_IMAGE_SIZE");
+#endif
 
 /* needed for Clang - see arch/x86/entry/entry.S */
 PROVIDE(__ref_stack_chk_guard = __stack_chk_guard);
@@ -497,6 +505,16 @@ PROVIDE(__ref_stack_chk_guard = __stack_chk_guard);
 		"SRSO function pair won't alias");
 #endif
 
+#if defined(CONFIG_MITIGATION_ITS) && !defined(CONFIG_DEBUG_FORCE_FUNCTION_ALIGN_64B)
+. = ASSERT(__x86_indirect_its_thunk_rax & 0x20, "__x86_indirect_thunk_rax not in second half of cacheline");
+. = ASSERT(((__x86_indirect_its_thunk_rcx - __x86_indirect_its_thunk_rax) % 64) == 0, "Indirect thunks are not cacheline apart");
+. = ASSERT(__x86_indirect_its_thunk_array == __x86_indirect_its_thunk_rax, "Gap in ITS thunk array");
+#endif
+
+#if defined(CONFIG_MITIGATION_ITS) && !defined(CONFIG_DEBUG_FORCE_FUNCTION_ALIGN_64B)
+. = ASSERT(its_return_thunk & 0x20, "its_return_thunk not in second half of cacheline");
+#endif
+
 #endif /* CONFIG_X86_64 */
 
 /*
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index 050a0e229a4d..f2b36d32ef40 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -104,6 +104,9 @@ void kvm_mmu_track_write(struct kvm_vcpu *vcpu, gpa_t gpa, const u8 *new,
 
 static inline int kvm_mmu_reload(struct kvm_vcpu *vcpu)
 {
+	if (kvm_check_request(KVM_REQ_MMU_FREE_OBSOLETE_ROOTS, vcpu))
+		kvm_mmu_free_obsolete_roots(vcpu);
+
 	/*
 	 * Checking root.hpa is sufficient even when KVM has mirror root.
 	 * We can have either:
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 63bb77ee1bb1..8d1b632e33d2 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -5974,6 +5974,7 @@ void kvm_mmu_free_obsolete_roots(struct kvm_vcpu *vcpu)
 	__kvm_mmu_free_obsolete_roots(vcpu->kvm, &vcpu->arch.root_mmu);
 	__kvm_mmu_free_obsolete_roots(vcpu->kvm, &vcpu->arch.guest_mmu);
 }
+EXPORT_SYMBOL_GPL(kvm_mmu_free_obsolete_roots);
 
 static u64 mmu_pte_write_fetch_gpte(struct kvm_vcpu *vcpu, gpa_t *gpa,
 				    int *bytes)
@@ -7669,9 +7670,30 @@ void kvm_mmu_pre_destroy_vm(struct kvm *kvm)
 }
 
 #ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
+static bool hugepage_test_mixed(struct kvm_memory_slot *slot, gfn_t gfn,
+				int level)
+{
+	return lpage_info_slot(gfn, slot, level)->disallow_lpage & KVM_LPAGE_MIXED_FLAG;
+}
+
+static void hugepage_clear_mixed(struct kvm_memory_slot *slot, gfn_t gfn,
+				 int level)
+{
+	lpage_info_slot(gfn, slot, level)->disallow_lpage &= ~KVM_LPAGE_MIXED_FLAG;
+}
+
+static void hugepage_set_mixed(struct kvm_memory_slot *slot, gfn_t gfn,
+			       int level)
+{
+	lpage_info_slot(gfn, slot, level)->disallow_lpage |= KVM_LPAGE_MIXED_FLAG;
+}
+
 bool kvm_arch_pre_set_memory_attributes(struct kvm *kvm,
 					struct kvm_gfn_range *range)
 {
+	struct kvm_memory_slot *slot = range->slot;
+	int level;
+
 	/*
 	 * Zap SPTEs even if the slot can't be mapped PRIVATE.  KVM x86 only
 	 * supports KVM_MEMORY_ATTRIBUTE_PRIVATE, and so it *seems* like KVM
@@ -7686,6 +7708,38 @@ bool kvm_arch_pre_set_memory_attributes(struct kvm *kvm,
 	if (WARN_ON_ONCE(!kvm_arch_has_private_mem(kvm)))
 		return false;
 
+	if (WARN_ON_ONCE(range->end <= range->start))
+		return false;
+
+	/*
+	 * If the head and tail pages of the range currently allow a hugepage,
+	 * i.e. reside fully in the slot and don't have mixed attributes, then
+	 * add each corresponding hugepage range to the ongoing invalidation,
+	 * e.g. to prevent KVM from creating a hugepage in response to a fault
+	 * for a gfn whose attributes aren't changing.  Note, only the range
+	 * of gfns whose attributes are being modified needs to be explicitly
+	 * unmapped, as that will unmap any existing hugepages.
+	 */
+	for (level = PG_LEVEL_2M; level <= KVM_MAX_HUGEPAGE_LEVEL; level++) {
+		gfn_t start = gfn_round_for_level(range->start, level);
+		gfn_t end = gfn_round_for_level(range->end - 1, level);
+		gfn_t nr_pages = KVM_PAGES_PER_HPAGE(level);
+
+		if ((start != range->start || start + nr_pages > range->end) &&
+		    start >= slot->base_gfn &&
+		    start + nr_pages <= slot->base_gfn + slot->npages &&
+		    !hugepage_test_mixed(slot, start, level))
+			kvm_mmu_invalidate_range_add(kvm, start, start + nr_pages);
+
+		if (end == start)
+			continue;
+
+		if ((end + nr_pages) > range->end &&
+		    (end + nr_pages) <= (slot->base_gfn + slot->npages) &&
+		    !hugepage_test_mixed(slot, end, level))
+			kvm_mmu_invalidate_range_add(kvm, end, end + nr_pages);
+	}
+
 	/* Unmap the old attribute page. */
 	if (range->arg.attributes & KVM_MEMORY_ATTRIBUTE_PRIVATE)
 		range->attr_filter = KVM_FILTER_SHARED;
@@ -7695,23 +7749,7 @@ bool kvm_arch_pre_set_memory_attributes(struct kvm *kvm,
 	return kvm_unmap_gfn_range(kvm, range);
 }
 
-static bool hugepage_test_mixed(struct kvm_memory_slot *slot, gfn_t gfn,
-				int level)
-{
-	return lpage_info_slot(gfn, slot, level)->disallow_lpage & KVM_LPAGE_MIXED_FLAG;
-}
-
-static void hugepage_clear_mixed(struct kvm_memory_slot *slot, gfn_t gfn,
-				 int level)
-{
-	lpage_info_slot(gfn, slot, level)->disallow_lpage &= ~KVM_LPAGE_MIXED_FLAG;
-}
 
-static void hugepage_set_mixed(struct kvm_memory_slot *slot, gfn_t gfn,
-			       int level)
-{
-	lpage_info_slot(gfn, slot, level)->disallow_lpage |= KVM_LPAGE_MIXED_FLAG;
-}
 
 static bool hugepage_has_attrs(struct kvm *kvm, struct kvm_memory_slot *slot,
 			       gfn_t gfn, int level, unsigned long attrs)
diff --git a/arch/x86/kvm/smm.c b/arch/x86/kvm/smm.c
index 699e551ec93b..9864c057187d 100644
--- a/arch/x86/kvm/smm.c
+++ b/arch/x86/kvm/smm.c
@@ -131,6 +131,7 @@ void kvm_smm_changed(struct kvm_vcpu *vcpu, bool entering_smm)
 
 	kvm_mmu_reset_context(vcpu);
 }
+EXPORT_SYMBOL_GPL(kvm_smm_changed);
 
 void process_smi(struct kvm_vcpu *vcpu)
 {
diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c
index 65fd245a9953..7338879d1c0c 100644
--- a/arch/x86/kvm/svm/avic.c
+++ b/arch/x86/kvm/svm/avic.c
@@ -796,12 +796,15 @@ static int svm_ir_list_add(struct vcpu_svm *svm, struct amd_iommu_pi_data *pi)
 	struct amd_svm_iommu_ir *ir;
 	u64 entry;
 
+	if (WARN_ON_ONCE(!pi->ir_data))
+		return -EINVAL;
+
 	/**
 	 * In some cases, the existing irte is updated and re-set,
 	 * so we need to check here if it's already been * added
 	 * to the ir_list.
 	 */
-	if (pi->ir_data && (pi->prev_ga_tag != 0)) {
+	if (pi->prev_ga_tag) {
 		struct kvm *kvm = svm->vcpu.kvm;
 		u32 vcpu_id = AVIC_GATAG_TO_VCPUID(pi->prev_ga_tag);
 		struct kvm_vcpu *prev_vcpu = kvm_get_vcpu_by_id(kvm, vcpu_id);
@@ -820,7 +823,7 @@ static int svm_ir_list_add(struct vcpu_svm *svm, struct amd_iommu_pi_data *pi)
 	 * Allocating new amd_iommu_pi_data, which will get
 	 * add to the per-vcpu ir_list.
 	 */
-	ir = kzalloc(sizeof(struct amd_svm_iommu_ir), GFP_KERNEL_ACCOUNT);
+	ir = kzalloc(sizeof(struct amd_svm_iommu_ir), GFP_ATOMIC | __GFP_ACCOUNT);
 	if (!ir) {
 		ret = -ENOMEM;
 		goto out;
@@ -896,10 +899,10 @@ int avic_pi_update_irte(struct kvm *kvm, unsigned int host_irq,
 {
 	struct kvm_kernel_irq_routing_entry *e;
 	struct kvm_irq_routing_table *irq_rt;
+	bool enable_remapped_mode = true;
 	int idx, ret = 0;
 
-	if (!kvm_arch_has_assigned_device(kvm) ||
-	    !irq_remapping_cap(IRQ_POSTING_CAP))
+	if (!kvm_arch_has_assigned_device(kvm) || !kvm_arch_has_irq_bypass())
 		return 0;
 
 	pr_debug("SVM: %s: host_irq=%#x, guest_irq=%#x, set=%#x\n",
@@ -933,6 +936,8 @@ int avic_pi_update_irte(struct kvm *kvm, unsigned int host_irq,
 		    kvm_vcpu_apicv_active(&svm->vcpu)) {
 			struct amd_iommu_pi_data pi;
 
+			enable_remapped_mode = false;
+
 			/* Try to enable guest_mode in IRTE */
 			pi.base = __sme_set(page_to_phys(svm->avic_backing_page) &
 					    AVIC_HPA_MASK);
@@ -951,33 +956,6 @@ int avic_pi_update_irte(struct kvm *kvm, unsigned int host_irq,
 			 */
 			if (!ret && pi.is_guest_mode)
 				svm_ir_list_add(svm, &pi);
-		} else {
-			/* Use legacy mode in IRTE */
-			struct amd_iommu_pi_data pi;
-
-			/**
-			 * Here, pi is used to:
-			 * - Tell IOMMU to use legacy mode for this interrupt.
-			 * - Retrieve ga_tag of prior interrupt remapping data.
-			 */
-			pi.prev_ga_tag = 0;
-			pi.is_guest_mode = false;
-			ret = irq_set_vcpu_affinity(host_irq, &pi);
-
-			/**
-			 * Check if the posted interrupt was previously
-			 * setup with the guest_mode by checking if the ga_tag
-			 * was cached. If so, we need to clean up the per-vcpu
-			 * ir_list.
-			 */
-			if (!ret && pi.prev_ga_tag) {
-				int id = AVIC_GATAG_TO_VCPUID(pi.prev_ga_tag);
-				struct kvm_vcpu *vcpu;
-
-				vcpu = kvm_get_vcpu_by_id(kvm, id);
-				if (vcpu)
-					svm_ir_list_del(to_svm(vcpu), &pi);
-			}
 		}
 
 		if (!ret && svm) {
@@ -993,6 +971,34 @@ int avic_pi_update_irte(struct kvm *kvm, unsigned int host_irq,
 	}
 
 	ret = 0;
+	if (enable_remapped_mode) {
+		/* Use legacy mode in IRTE */
+		struct amd_iommu_pi_data pi;
+
+		/**
+		 * Here, pi is used to:
+		 * - Tell IOMMU to use legacy mode for this interrupt.
+		 * - Retrieve ga_tag of prior interrupt remapping data.
+		 */
+		pi.prev_ga_tag = 0;
+		pi.is_guest_mode = false;
+		ret = irq_set_vcpu_affinity(host_irq, &pi);
+
+		/**
+		 * Check if the posted interrupt was previously
+		 * setup with the guest_mode by checking if the ga_tag
+		 * was cached. If so, we need to clean up the per-vcpu
+		 * ir_list.
+		 */
+		if (!ret && pi.prev_ga_tag) {
+			int id = AVIC_GATAG_TO_VCPUID(pi.prev_ga_tag);
+			struct kvm_vcpu *vcpu;
+
+			vcpu = kvm_get_vcpu_by_id(kvm, id);
+			if (vcpu)
+				svm_ir_list_del(to_svm(vcpu), &pi);
+		}
+	}
 out:
 	srcu_read_unlock(&kvm->irq_srcu, idx);
 	return ret;
diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c
index 0bc708ee2788..70420720c728 100644
--- a/arch/x86/kvm/svm/sev.c
+++ b/arch/x86/kvm/svm/sev.c
@@ -2933,6 +2933,7 @@ void __init sev_set_cpu_caps(void)
 void __init sev_hardware_setup(void)
 {
 	unsigned int eax, ebx, ecx, edx, sev_asid_count, sev_es_asid_count;
+	struct sev_platform_init_args init_args = {0};
 	bool sev_snp_supported = false;
 	bool sev_es_supported = false;
 	bool sev_supported = false;
@@ -3059,6 +3060,15 @@ out:
 	sev_supported_vmsa_features = 0;
 	if (sev_es_debug_swap_enabled)
 		sev_supported_vmsa_features |= SVM_SEV_FEAT_DEBUG_SWAP;
+
+	if (!sev_enabled)
+		return;
+
+	/*
+	 * Do both SNP and SEV initialization at KVM module load.
+	 */
+	init_args.probe = true;
+	sev_platform_init(&init_args);
 }
 
 void sev_hardware_unsetup(void)
@@ -3074,6 +3084,8 @@ void sev_hardware_unsetup(void)
 
 	misc_cg_set_capacity(MISC_CG_RES_SEV, 0);
 	misc_cg_set_capacity(MISC_CG_RES_SEV_ES, 0);
+
+	sev_platform_shutdown();
 }
 
 int sev_cpu_init(struct svm_cpu_data *sd)
@@ -3173,9 +3185,14 @@ skip_vmsa_free:
 		kvfree(svm->sev_es.ghcb_sa);
 }
 
+static u64 kvm_ghcb_get_sw_exit_code(struct vmcb_control_area *control)
+{
+	return (((u64)control->exit_code_hi) << 32) | control->exit_code;
+}
+
 static void dump_ghcb(struct vcpu_svm *svm)
 {
-	struct ghcb *ghcb = svm->sev_es.ghcb;
+	struct vmcb_control_area *control = &svm->vmcb->control;
 	unsigned int nbits;
 
 	/* Re-use the dump_invalid_vmcb module parameter */
@@ -3184,18 +3201,24 @@ static void dump_ghcb(struct vcpu_svm *svm)
 		return;
 	}
 
-	nbits = sizeof(ghcb->save.valid_bitmap) * 8;
+	nbits = sizeof(svm->sev_es.valid_bitmap) * 8;
 
-	pr_err("GHCB (GPA=%016llx):\n", svm->vmcb->control.ghcb_gpa);
+	/*
+	 * Print KVM's snapshot of the GHCB values that were (unsuccessfully)
+	 * used to handle the exit.  If the guest has since modified the GHCB
+	 * itself, dumping the raw GHCB won't help debug why KVM was unable to
+	 * handle the VMGEXIT that KVM observed.
+	 */
+	pr_err("GHCB (GPA=%016llx) snapshot:\n", svm->vmcb->control.ghcb_gpa);
 	pr_err("%-20s%016llx is_valid: %u\n", "sw_exit_code",
-	       ghcb->save.sw_exit_code, ghcb_sw_exit_code_is_valid(ghcb));
+	       kvm_ghcb_get_sw_exit_code(control), kvm_ghcb_sw_exit_code_is_valid(svm));
 	pr_err("%-20s%016llx is_valid: %u\n", "sw_exit_info_1",
-	       ghcb->save.sw_exit_info_1, ghcb_sw_exit_info_1_is_valid(ghcb));
+	       control->exit_info_1, kvm_ghcb_sw_exit_info_1_is_valid(svm));
 	pr_err("%-20s%016llx is_valid: %u\n", "sw_exit_info_2",
-	       ghcb->save.sw_exit_info_2, ghcb_sw_exit_info_2_is_valid(ghcb));
+	       control->exit_info_2, kvm_ghcb_sw_exit_info_2_is_valid(svm));
 	pr_err("%-20s%016llx is_valid: %u\n", "sw_scratch",
-	       ghcb->save.sw_scratch, ghcb_sw_scratch_is_valid(ghcb));
-	pr_err("%-20s%*pb\n", "valid_bitmap", nbits, ghcb->save.valid_bitmap);
+	       svm->sev_es.sw_scratch, kvm_ghcb_sw_scratch_is_valid(svm));
+	pr_err("%-20s%*pb\n", "valid_bitmap", nbits, svm->sev_es.valid_bitmap);
 }
 
 static void sev_es_sync_to_ghcb(struct vcpu_svm *svm)
@@ -3266,11 +3289,6 @@ static void sev_es_sync_from_ghcb(struct vcpu_svm *svm)
 	memset(ghcb->save.valid_bitmap, 0, sizeof(ghcb->save.valid_bitmap));
 }
 
-static u64 kvm_ghcb_get_sw_exit_code(struct vmcb_control_area *control)
-{
-	return (((u64)control->exit_code_hi) << 32) | control->exit_code;
-}
-
 static int sev_es_validate_vmgexit(struct vcpu_svm *svm)
 {
 	struct vmcb_control_area *control = &svm->vmcb->control;
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index d5d0c5c3300b..a89c271a1951 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -607,9 +607,6 @@ static void svm_disable_virtualization_cpu(void)
 	kvm_cpu_svm_disable();
 
 	amd_pmu_disable_virt();
-
-	if (cpu_feature_enabled(X86_FEATURE_SRSO_BP_SPEC_REDUCE))
-		msr_clear_bit(MSR_ZEN4_BP_CFG, MSR_ZEN4_BP_CFG_BP_SPEC_REDUCE_BIT);
 }
 
 static int svm_enable_virtualization_cpu(void)
@@ -687,9 +684,6 @@ static int svm_enable_virtualization_cpu(void)
 		rdmsr(MSR_TSC_AUX, sev_es_host_save_area(sd)->tsc_aux, msr_hi);
 	}
 
-	if (cpu_feature_enabled(X86_FEATURE_SRSO_BP_SPEC_REDUCE))
-		msr_set_bit(MSR_ZEN4_BP_CFG, MSR_ZEN4_BP_CFG_BP_SPEC_REDUCE_BIT);
-
 	return 0;
 }
 
@@ -1518,6 +1512,63 @@ static void svm_vcpu_free(struct kvm_vcpu *vcpu)
 	__free_pages(virt_to_page(svm->msrpm), get_order(MSRPM_SIZE));
 }
 
+#ifdef CONFIG_CPU_MITIGATIONS
+static DEFINE_SPINLOCK(srso_lock);
+static atomic_t srso_nr_vms;
+
+static void svm_srso_clear_bp_spec_reduce(void *ign)
+{
+	struct svm_cpu_data *sd = this_cpu_ptr(&svm_data);
+
+	if (!sd->bp_spec_reduce_set)
+		return;
+
+	msr_clear_bit(MSR_ZEN4_BP_CFG, MSR_ZEN4_BP_CFG_BP_SPEC_REDUCE_BIT);
+	sd->bp_spec_reduce_set = false;
+}
+
+static void svm_srso_vm_destroy(void)
+{
+	if (!cpu_feature_enabled(X86_FEATURE_SRSO_BP_SPEC_REDUCE))
+		return;
+
+	if (atomic_dec_return(&srso_nr_vms))
+		return;
+
+	guard(spinlock)(&srso_lock);
+
+	/*
+	 * Verify a new VM didn't come along, acquire the lock, and increment
+	 * the count before this task acquired the lock.
+	 */
+	if (atomic_read(&srso_nr_vms))
+		return;
+
+	on_each_cpu(svm_srso_clear_bp_spec_reduce, NULL, 1);
+}
+
+static void svm_srso_vm_init(void)
+{
+	if (!cpu_feature_enabled(X86_FEATURE_SRSO_BP_SPEC_REDUCE))
+		return;
+
+	/*
+	 * Acquire the lock on 0 => 1 transitions to ensure a potential 1 => 0
+	 * transition, i.e. destroying the last VM, is fully complete, e.g. so
+	 * that a delayed IPI doesn't clear BP_SPEC_REDUCE after a vCPU runs.
+	 */
+	if (atomic_inc_not_zero(&srso_nr_vms))
+		return;
+
+	guard(spinlock)(&srso_lock);
+
+	atomic_inc(&srso_nr_vms);
+}
+#else
+static void svm_srso_vm_init(void) { }
+static void svm_srso_vm_destroy(void) { }
+#endif
+
 static void svm_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_svm *svm = to_svm(vcpu);
@@ -1550,6 +1601,11 @@ static void svm_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
 	    (!boot_cpu_has(X86_FEATURE_V_TSC_AUX) || !sev_es_guest(vcpu->kvm)))
 		kvm_set_user_return_msr(tsc_aux_uret_slot, svm->tsc_aux, -1ull);
 
+	if (cpu_feature_enabled(X86_FEATURE_SRSO_BP_SPEC_REDUCE) &&
+	    !sd->bp_spec_reduce_set) {
+		sd->bp_spec_reduce_set = true;
+		msr_set_bit(MSR_ZEN4_BP_CFG, MSR_ZEN4_BP_CFG_BP_SPEC_REDUCE_BIT);
+	}
 	svm->guest_state_loaded = true;
 }
 
@@ -2231,6 +2287,10 @@ static int shutdown_interception(struct kvm_vcpu *vcpu)
 	 */
 	if (!sev_es_guest(vcpu->kvm)) {
 		clear_page(svm->vmcb);
+#ifdef CONFIG_KVM_SMM
+		if (is_smm(vcpu))
+			kvm_smm_changed(vcpu, false);
+#endif
 		kvm_vcpu_reset(vcpu, true);
 	}
 
@@ -5036,6 +5096,8 @@ static void svm_vm_destroy(struct kvm *kvm)
 {
 	avic_vm_destroy(kvm);
 	sev_vm_destroy(kvm);
+
+	svm_srso_vm_destroy();
 }
 
 static int svm_vm_init(struct kvm *kvm)
@@ -5061,6 +5123,7 @@ static int svm_vm_init(struct kvm *kvm)
 			return ret;
 	}
 
+	svm_srso_vm_init();
 	return 0;
 }
 
diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
index d4490eaed55d..f16b068c4228 100644
--- a/arch/x86/kvm/svm/svm.h
+++ b/arch/x86/kvm/svm/svm.h
@@ -335,6 +335,8 @@ struct svm_cpu_data {
 	u32 next_asid;
 	u32 min_asid;
 
+	bool bp_spec_reduce_set;
+
 	struct vmcb *save_area;
 	unsigned long save_area_pa;
 
diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
index ccda95e53f62..ba736cbb0587 100644
--- a/arch/x86/kvm/trace.h
+++ b/arch/x86/kvm/trace.h
@@ -11,6 +11,13 @@
 #undef TRACE_SYSTEM
 #define TRACE_SYSTEM kvm
 
+#ifdef CREATE_TRACE_POINTS
+#define tracing_kvm_rip_read(vcpu) ({					\
+	typeof(vcpu) __vcpu = vcpu;					\
+	__vcpu->arch.guest_state_protected ? 0 : kvm_rip_read(__vcpu);	\
+	})
+#endif
+
 /*
  * Tracepoint for guest mode entry.
  */
@@ -28,7 +35,7 @@ TRACE_EVENT(kvm_entry,
 
 	TP_fast_assign(
 		__entry->vcpu_id        = vcpu->vcpu_id;
-		__entry->rip		= kvm_rip_read(vcpu);
+		__entry->rip		= tracing_kvm_rip_read(vcpu);
 		__entry->immediate_exit	= force_immediate_exit;
 
 		kvm_x86_call(get_entry_info)(vcpu, &__entry->intr_info,
@@ -319,7 +326,7 @@ TRACE_EVENT(name,							     \
 	),								     \
 									     \
 	TP_fast_assign(							     \
-		__entry->guest_rip	= kvm_rip_read(vcpu);		     \
+		__entry->guest_rip	= tracing_kvm_rip_read(vcpu);		     \
 		__entry->isa            = isa;				     \
 		__entry->vcpu_id        = vcpu->vcpu_id;		     \
 		__entry->requests       = READ_ONCE(vcpu->requests);	     \
@@ -423,7 +430,7 @@ TRACE_EVENT(kvm_page_fault,
 
 	TP_fast_assign(
 		__entry->vcpu_id	= vcpu->vcpu_id;
-		__entry->guest_rip	= kvm_rip_read(vcpu);
+		__entry->guest_rip	= tracing_kvm_rip_read(vcpu);
 		__entry->fault_address	= fault_address;
 		__entry->error_code	= error_code;
 	),
diff --git a/arch/x86/kvm/vmx/posted_intr.c b/arch/x86/kvm/vmx/posted_intr.c
index 51116fe69a50..d70e5b90087d 100644
--- a/arch/x86/kvm/vmx/posted_intr.c
+++ b/arch/x86/kvm/vmx/posted_intr.c
@@ -297,6 +297,7 @@ int vmx_pi_update_irte(struct kvm *kvm, unsigned int host_irq,
 {
 	struct kvm_kernel_irq_routing_entry *e;
 	struct kvm_irq_routing_table *irq_rt;
+	bool enable_remapped_mode = true;
 	struct kvm_lapic_irq irq;
 	struct kvm_vcpu *vcpu;
 	struct vcpu_data vcpu_info;
@@ -335,21 +336,8 @@ int vmx_pi_update_irte(struct kvm *kvm, unsigned int host_irq,
 
 		kvm_set_msi_irq(kvm, e, &irq);
 		if (!kvm_intr_is_single_vcpu(kvm, &irq, &vcpu) ||
-		    !kvm_irq_is_postable(&irq)) {
-			/*
-			 * Make sure the IRTE is in remapped mode if
-			 * we don't handle it in posted mode.
-			 */
-			ret = irq_set_vcpu_affinity(host_irq, NULL);
-			if (ret < 0) {
-				printk(KERN_INFO
-				   "failed to back to remapped mode, irq: %u\n",
-				   host_irq);
-				goto out;
-			}
-
+		    !kvm_irq_is_postable(&irq))
 			continue;
-		}
 
 		vcpu_info.pi_desc_addr = __pa(vcpu_to_pi_desc(vcpu));
 		vcpu_info.vector = irq.vector;
@@ -357,11 +345,12 @@ int vmx_pi_update_irte(struct kvm *kvm, unsigned int host_irq,
 		trace_kvm_pi_irte_update(host_irq, vcpu->vcpu_id, e->gsi,
 				vcpu_info.vector, vcpu_info.pi_desc_addr, set);
 
-		if (set)
-			ret = irq_set_vcpu_affinity(host_irq, &vcpu_info);
-		else
-			ret = irq_set_vcpu_affinity(host_irq, NULL);
+		if (!set)
+			continue;
 
+		enable_remapped_mode = false;
+
+		ret = irq_set_vcpu_affinity(host_irq, &vcpu_info);
 		if (ret < 0) {
 			printk(KERN_INFO "%s: failed to update PI IRTE\n",
 					__func__);
@@ -369,6 +358,9 @@ int vmx_pi_update_irte(struct kvm *kvm, unsigned int host_irq,
 		}
 	}
 
+	ret = enable_remapped_mode ? irq_set_vcpu_affinity(host_irq, NULL) : 0;
+	if (ret < 0)
+		goto out;
 	ret = 0;
 out:
 	srcu_read_unlock(&kvm->irq_srcu, idx);
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 3712dde0bf9d..be7bb6d20129 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1584,7 +1584,7 @@ EXPORT_SYMBOL_GPL(kvm_emulate_rdpmc);
 	 ARCH_CAP_PSCHANGE_MC_NO | ARCH_CAP_TSX_CTRL_MSR | ARCH_CAP_TAA_NO | \
 	 ARCH_CAP_SBDR_SSDP_NO | ARCH_CAP_FBSDP_NO | ARCH_CAP_PSDP_NO | \
 	 ARCH_CAP_FB_CLEAR | ARCH_CAP_RRSBA | ARCH_CAP_PBRSB_NO | ARCH_CAP_GDS_NO | \
-	 ARCH_CAP_RFDS_NO | ARCH_CAP_RFDS_CLEAR | ARCH_CAP_BHI_NO)
+	 ARCH_CAP_RFDS_NO | ARCH_CAP_RFDS_CLEAR | ARCH_CAP_BHI_NO | ARCH_CAP_ITS_NO)
 
 static u64 kvm_get_arch_capabilities(void)
 {
@@ -1618,6 +1618,8 @@ static u64 kvm_get_arch_capabilities(void)
 		data |= ARCH_CAP_MDS_NO;
 	if (!boot_cpu_has_bug(X86_BUG_RFDS))
 		data |= ARCH_CAP_RFDS_NO;
+	if (!boot_cpu_has_bug(X86_BUG_ITS))
+		data |= ARCH_CAP_ITS_NO;
 
 	if (!boot_cpu_has(X86_FEATURE_RTM)) {
 		/*
@@ -4597,7 +4599,7 @@ static bool kvm_is_vm_type_supported(unsigned long type)
 	return type < 32 && (kvm_caps.supported_vm_types & BIT(type));
 }
 
-static inline u32 kvm_sync_valid_fields(struct kvm *kvm)
+static inline u64 kvm_sync_valid_fields(struct kvm *kvm)
 {
 	return kvm && kvm->arch.has_protected_state ? 0 : KVM_SYNC_X86_VALID_FIELDS;
 }
@@ -11098,7 +11100,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 	/*
 	 * Profile KVM exit RIPs:
 	 */
-	if (unlikely(prof_on == KVM_PROFILING)) {
+	if (unlikely(prof_on == KVM_PROFILING &&
+		     !vcpu->arch.guest_state_protected)) {
 		unsigned long rip = kvm_rip_read(vcpu);
 		profile_hit(KVM_PROFILING, (void *)rip);
 	}
@@ -11492,7 +11495,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
 {
 	struct kvm_queued_exception *ex = &vcpu->arch.exception;
 	struct kvm_run *kvm_run = vcpu->run;
-	u32 sync_valid_fields;
+	u64 sync_valid_fields;
 	int r;
 
 	r = kvm_mmu_post_init_vm(vcpu->kvm);
@@ -13556,25 +13559,27 @@ bool kvm_arch_has_noncoherent_dma(struct kvm *kvm)
 }
 EXPORT_SYMBOL_GPL(kvm_arch_has_noncoherent_dma);
 
-bool kvm_arch_has_irq_bypass(void)
-{
-	return enable_apicv && irq_remapping_cap(IRQ_POSTING_CAP);
-}
-
 int kvm_arch_irq_bypass_add_producer(struct irq_bypass_consumer *cons,
 				      struct irq_bypass_producer *prod)
 {
 	struct kvm_kernel_irqfd *irqfd =
 		container_of(cons, struct kvm_kernel_irqfd, consumer);
+	struct kvm *kvm = irqfd->kvm;
 	int ret;
 
-	irqfd->producer = prod;
 	kvm_arch_start_assignment(irqfd->kvm);
+
+	spin_lock_irq(&kvm->irqfds.lock);
+	irqfd->producer = prod;
+
 	ret = kvm_x86_call(pi_update_irte)(irqfd->kvm,
 					   prod->irq, irqfd->gsi, 1);
 	if (ret)
 		kvm_arch_end_assignment(irqfd->kvm);
 
+	spin_unlock_irq(&kvm->irqfds.lock);
+
+
 	return ret;
 }
 
@@ -13584,9 +13589,9 @@ void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons,
 	int ret;
 	struct kvm_kernel_irqfd *irqfd =
 		container_of(cons, struct kvm_kernel_irqfd, consumer);
+	struct kvm *kvm = irqfd->kvm;
 
 	WARN_ON(irqfd->producer != prod);
-	irqfd->producer = NULL;
 
 	/*
 	 * When producer of consumer is unregistered, we change back to
@@ -13594,12 +13599,18 @@ void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons,
 	 * when the irq is masked/disabled or the consumer side (KVM
 	 * int this case doesn't want to receive the interrupts.
 	*/
+	spin_lock_irq(&kvm->irqfds.lock);
+	irqfd->producer = NULL;
+
 	ret = kvm_x86_call(pi_update_irte)(irqfd->kvm,
 					   prod->irq, irqfd->gsi, 0);
 	if (ret)
 		printk(KERN_INFO "irq bypass consumer (token %p) unregistration"
 		       " fails: %d\n", irqfd->consumer.token, ret);
 
+	spin_unlock_irq(&kvm->irqfds.lock);
+
+
 	kvm_arch_end_assignment(irqfd->kvm);
 }
 
@@ -13612,7 +13623,8 @@ int kvm_arch_update_irqfd_routing(struct kvm *kvm, unsigned int host_irq,
 bool kvm_arch_irqfd_route_changed(struct kvm_kernel_irq_routing_entry *old,
 				  struct kvm_kernel_irq_routing_entry *new)
 {
-	if (new->type != KVM_IRQ_ROUTING_MSI)
+	if (old->type != KVM_IRQ_ROUTING_MSI ||
+	    new->type != KVM_IRQ_ROUTING_MSI)
 		return true;
 
 	return !!memcmp(&old->msi, &new->msi, sizeof(new->msi));
diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile
index 1c50352eb49f..4fa5c4e1ba8a 100644
--- a/arch/x86/lib/Makefile
+++ b/arch/x86/lib/Makefile
@@ -3,6 +3,8 @@
 # Makefile for x86 specific library files.
 #
 
+obj-y += crypto/
+
 # Produces uninteresting flaky coverage.
 KCOV_INSTRUMENT_delay.o	:= n
 
@@ -39,14 +41,14 @@ lib-$(CONFIG_FUNCTION_ERROR_INJECTION)	+= error-inject.o
 lib-$(CONFIG_MITIGATION_RETPOLINE) += retpoline.o
 
 obj-$(CONFIG_CRC32_ARCH) += crc32-x86.o
-crc32-x86-y := crc32-glue.o crc32-pclmul.o
+crc32-x86-y := crc32.o crc32-pclmul.o
 crc32-x86-$(CONFIG_64BIT) += crc32c-3way.o
 
 obj-$(CONFIG_CRC64_ARCH) += crc64-x86.o
-crc64-x86-y := crc64-glue.o crc64-pclmul.o
+crc64-x86-y := crc64.o crc64-pclmul.o
 
 obj-$(CONFIG_CRC_T10DIF_ARCH) += crc-t10dif-x86.o
-crc-t10dif-x86-y := crc-t10dif-glue.o crc16-msb-pclmul.o
+crc-t10dif-x86-y := crc-t10dif.o crc16-msb-pclmul.o
 
 obj-y += msr.o msr-reg.o msr-reg-export.o hweight.o
 obj-y += iomem.o
diff --git a/arch/x86/lib/crc-t10dif-glue.c b/arch/x86/lib/crc-t10dif.c
index f89c335cde3c..db7ce59c31ac 100644
--- a/arch/x86/lib/crc-t10dif-glue.c
+++ b/arch/x86/lib/crc-t10dif.c
@@ -9,7 +9,7 @@
 #include <linux/module.h>
 #include "crc-pclmul-template.h"
 
-static DEFINE_STATIC_KEY_FALSE(have_pclmulqdq);
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_pclmulqdq);
 
 DECLARE_CRC_PCLMUL_FUNCS(crc16_msb, u16);
 
@@ -29,7 +29,7 @@ static int __init crc_t10dif_x86_init(void)
 	}
 	return 0;
 }
-arch_initcall(crc_t10dif_x86_init);
+subsys_initcall(crc_t10dif_x86_init);
 
 static void __exit crc_t10dif_x86_exit(void)
 {
diff --git a/arch/x86/lib/crc32-glue.c b/arch/x86/lib/crc32.c
index e3f93b17ac3f..d09343e2cea9 100644
--- a/arch/x86/lib/crc32-glue.c
+++ b/arch/x86/lib/crc32.c
@@ -11,8 +11,8 @@
 #include <linux/module.h>
 #include "crc-pclmul-template.h"
 
-static DEFINE_STATIC_KEY_FALSE(have_crc32);
-static DEFINE_STATIC_KEY_FALSE(have_pclmulqdq);
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_crc32);
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_pclmulqdq);
 
 DECLARE_CRC_PCLMUL_FUNCS(crc32_lsb, u32);
 
@@ -88,7 +88,7 @@ static int __init crc32_x86_init(void)
 	}
 	return 0;
 }
-arch_initcall(crc32_x86_init);
+subsys_initcall(crc32_x86_init);
 
 static void __exit crc32_x86_exit(void)
 {
diff --git a/arch/x86/lib/crc64-glue.c b/arch/x86/lib/crc64.c
index b0e1b719ecbf..351a09f5813e 100644
--- a/arch/x86/lib/crc64-glue.c
+++ b/arch/x86/lib/crc64.c
@@ -9,7 +9,7 @@
 #include <linux/module.h>
 #include "crc-pclmul-template.h"
 
-static DEFINE_STATIC_KEY_FALSE(have_pclmulqdq);
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_pclmulqdq);
 
 DECLARE_CRC_PCLMUL_FUNCS(crc64_msb, u64);
 DECLARE_CRC_PCLMUL_FUNCS(crc64_lsb, u64);
@@ -39,7 +39,7 @@ static int __init crc64_x86_init(void)
 	}
 	return 0;
 }
-arch_initcall(crc64_x86_init);
+subsys_initcall(crc64_x86_init);
 
 static void __exit crc64_x86_exit(void)
 {
diff --git a/arch/x86/lib/crypto/.gitignore b/arch/x86/lib/crypto/.gitignore
new file mode 100644
index 000000000000..580c839bb177
--- /dev/null
+++ b/arch/x86/lib/crypto/.gitignore
@@ -0,0 +1,2 @@
+# SPDX-License-Identifier: GPL-2.0-only
+poly1305-x86_64-cryptogams.S
diff --git a/arch/x86/lib/crypto/Kconfig b/arch/x86/lib/crypto/Kconfig
new file mode 100644
index 000000000000..5e94cdee492c
--- /dev/null
+++ b/arch/x86/lib/crypto/Kconfig
@@ -0,0 +1,34 @@
+# SPDX-License-Identifier: GPL-2.0-only
+
+config CRYPTO_BLAKE2S_X86
+	bool "Hash functions: BLAKE2s (SSSE3/AVX-512)"
+	depends on 64BIT
+	select CRYPTO_LIB_BLAKE2S_GENERIC
+	select CRYPTO_ARCH_HAVE_LIB_BLAKE2S
+	help
+	  BLAKE2s cryptographic hash function (RFC 7693)
+
+	  Architecture: x86_64 using:
+	  - SSSE3 (Supplemental SSE3)
+	  - AVX-512 (Advanced Vector Extensions-512)
+
+config CRYPTO_CHACHA20_X86_64
+	tristate
+	depends on 64BIT
+	default CRYPTO_LIB_CHACHA
+	select CRYPTO_LIB_CHACHA_GENERIC
+	select CRYPTO_ARCH_HAVE_LIB_CHACHA
+
+config CRYPTO_POLY1305_X86_64
+	tristate
+	depends on 64BIT
+	default CRYPTO_LIB_POLY1305
+	select CRYPTO_ARCH_HAVE_LIB_POLY1305
+
+config CRYPTO_SHA256_X86_64
+	tristate
+	depends on 64BIT
+	default CRYPTO_LIB_SHA256
+	select CRYPTO_ARCH_HAVE_LIB_SHA256
+	select CRYPTO_ARCH_HAVE_LIB_SHA256_SIMD
+	select CRYPTO_LIB_SHA256_GENERIC
diff --git a/arch/x86/lib/crypto/Makefile b/arch/x86/lib/crypto/Makefile
new file mode 100644
index 000000000000..abceca3d31c0
--- /dev/null
+++ b/arch/x86/lib/crypto/Makefile
@@ -0,0 +1,20 @@
+# SPDX-License-Identifier: GPL-2.0-only
+
+obj-$(CONFIG_CRYPTO_BLAKE2S_X86) += libblake2s-x86_64.o
+libblake2s-x86_64-y := blake2s-core.o blake2s-glue.o
+
+obj-$(CONFIG_CRYPTO_CHACHA20_X86_64) += chacha-x86_64.o
+chacha-x86_64-y := chacha-avx2-x86_64.o chacha-ssse3-x86_64.o chacha-avx512vl-x86_64.o chacha_glue.o
+
+obj-$(CONFIG_CRYPTO_POLY1305_X86_64) += poly1305-x86_64.o
+poly1305-x86_64-y := poly1305-x86_64-cryptogams.o poly1305_glue.o
+targets += poly1305-x86_64-cryptogams.S
+
+obj-$(CONFIG_CRYPTO_SHA256_X86_64) += sha256-x86_64.o
+sha256-x86_64-y := sha256.o sha256-ssse3-asm.o sha256-avx-asm.o sha256-avx2-asm.o sha256-ni-asm.o
+
+quiet_cmd_perlasm = PERLASM $@
+      cmd_perlasm = $(PERL) $< > $@
+
+$(obj)/%.S: $(src)/%.pl FORCE
+	$(call if_changed,perlasm)
diff --git a/arch/x86/crypto/blake2s-core.S b/arch/x86/lib/crypto/blake2s-core.S
index b50b35ff1fdb..ac1c845445a4 100644
--- a/arch/x86/crypto/blake2s-core.S
+++ b/arch/x86/lib/crypto/blake2s-core.S
@@ -29,7 +29,6 @@ SIGMA:
 .byte 13,  7, 12,  3, 11, 14,  1,  9,  2,  5, 15,  8, 10,  0,  4,  6
 .byte  6, 14, 11,  0, 15,  9,  3,  8, 10, 12, 13,  1,  5,  2,  7,  4
 .byte 10,  8,  7,  1,  2,  4,  6,  5, 13, 15,  9,  3,  0, 11, 14, 12
-#ifdef CONFIG_AS_AVX512
 .section .rodata.cst64.BLAKE2S_SIGMA2, "aM", @progbits, 640
 .align 64
 SIGMA2:
@@ -43,7 +42,6 @@ SIGMA2:
 .long  6, 13,  0, 14, 12,  2,  1, 11, 15,  4,  5,  8,  7,  9,  3, 10
 .long 15,  5,  4, 13, 10,  7,  3, 11, 12,  2,  0,  6,  9,  8,  1, 14
 .long  8,  7, 14, 11, 13, 15,  0, 12, 10,  4,  5,  6,  3,  2,  1,  9
-#endif /* CONFIG_AS_AVX512 */
 
 .text
 SYM_FUNC_START(blake2s_compress_ssse3)
@@ -174,7 +172,6 @@ SYM_FUNC_START(blake2s_compress_ssse3)
 	RET
 SYM_FUNC_END(blake2s_compress_ssse3)
 
-#ifdef CONFIG_AS_AVX512
 SYM_FUNC_START(blake2s_compress_avx512)
 	vmovdqu		(%rdi),%xmm0
 	vmovdqu		0x10(%rdi),%xmm1
@@ -253,4 +250,3 @@ SYM_FUNC_START(blake2s_compress_avx512)
 	vzeroupper
 	RET
 SYM_FUNC_END(blake2s_compress_avx512)
-#endif /* CONFIG_AS_AVX512 */
diff --git a/arch/x86/crypto/blake2s-glue.c b/arch/x86/lib/crypto/blake2s-glue.c
index 0313f9673f56..adc296cd17c9 100644
--- a/arch/x86/crypto/blake2s-glue.c
+++ b/arch/x86/lib/crypto/blake2s-glue.c
@@ -3,17 +3,15 @@
  * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
  */
 
-#include <crypto/internal/blake2s.h>
-
-#include <linux/types.h>
-#include <linux/jump_label.h>
-#include <linux/kernel.h>
-#include <linux/sizes.h>
-
 #include <asm/cpufeature.h>
 #include <asm/fpu/api.h>
 #include <asm/processor.h>
 #include <asm/simd.h>
+#include <crypto/internal/blake2s.h>
+#include <linux/init.h>
+#include <linux/jump_label.h>
+#include <linux/kernel.h>
+#include <linux/sizes.h>
 
 asmlinkage void blake2s_compress_ssse3(struct blake2s_state *state,
 				       const u8 *block, const size_t nblocks,
@@ -41,8 +39,7 @@ void blake2s_compress(struct blake2s_state *state, const u8 *block,
 					    SZ_4K / BLAKE2S_BLOCK_SIZE);
 
 		kernel_fpu_begin();
-		if (IS_ENABLED(CONFIG_AS_AVX512) &&
-		    static_branch_likely(&blake2s_use_avx512))
+		if (static_branch_likely(&blake2s_use_avx512))
 			blake2s_compress_avx512(state, block, blocks, inc);
 		else
 			blake2s_compress_ssse3(state, block, blocks, inc);
@@ -59,8 +56,7 @@ static int __init blake2s_mod_init(void)
 	if (boot_cpu_has(X86_FEATURE_SSSE3))
 		static_branch_enable(&blake2s_use_ssse3);
 
-	if (IS_ENABLED(CONFIG_AS_AVX512) &&
-	    boot_cpu_has(X86_FEATURE_AVX) &&
+	if (boot_cpu_has(X86_FEATURE_AVX) &&
 	    boot_cpu_has(X86_FEATURE_AVX2) &&
 	    boot_cpu_has(X86_FEATURE_AVX512F) &&
 	    boot_cpu_has(X86_FEATURE_AVX512VL) &&
diff --git a/arch/x86/crypto/chacha-avx2-x86_64.S b/arch/x86/lib/crypto/chacha-avx2-x86_64.S
index f3d8fc018249..f3d8fc018249 100644
--- a/arch/x86/crypto/chacha-avx2-x86_64.S
+++ b/arch/x86/lib/crypto/chacha-avx2-x86_64.S
diff --git a/arch/x86/crypto/chacha-avx512vl-x86_64.S b/arch/x86/lib/crypto/chacha-avx512vl-x86_64.S
index 259383e1ad44..259383e1ad44 100644
--- a/arch/x86/crypto/chacha-avx512vl-x86_64.S
+++ b/arch/x86/lib/crypto/chacha-avx512vl-x86_64.S
diff --git a/arch/x86/crypto/chacha-ssse3-x86_64.S b/arch/x86/lib/crypto/chacha-ssse3-x86_64.S
index 7111949cd5b9..7111949cd5b9 100644
--- a/arch/x86/crypto/chacha-ssse3-x86_64.S
+++ b/arch/x86/lib/crypto/chacha-ssse3-x86_64.S
diff --git a/arch/x86/lib/crypto/chacha_glue.c b/arch/x86/lib/crypto/chacha_glue.c
new file mode 100644
index 000000000000..10b2c945f541
--- /dev/null
+++ b/arch/x86/lib/crypto/chacha_glue.c
@@ -0,0 +1,196 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * ChaCha and HChaCha functions (x86_64 optimized)
+ *
+ * Copyright (C) 2015 Martin Willi
+ */
+
+#include <asm/simd.h>
+#include <crypto/chacha.h>
+#include <linux/jump_label.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/sizes.h>
+
+asmlinkage void chacha_block_xor_ssse3(const struct chacha_state *state,
+				       u8 *dst, const u8 *src,
+				       unsigned int len, int nrounds);
+asmlinkage void chacha_4block_xor_ssse3(const struct chacha_state *state,
+					u8 *dst, const u8 *src,
+					unsigned int len, int nrounds);
+asmlinkage void hchacha_block_ssse3(const struct chacha_state *state,
+				    u32 out[HCHACHA_OUT_WORDS], int nrounds);
+
+asmlinkage void chacha_2block_xor_avx2(const struct chacha_state *state,
+				       u8 *dst, const u8 *src,
+				       unsigned int len, int nrounds);
+asmlinkage void chacha_4block_xor_avx2(const struct chacha_state *state,
+				       u8 *dst, const u8 *src,
+				       unsigned int len, int nrounds);
+asmlinkage void chacha_8block_xor_avx2(const struct chacha_state *state,
+				       u8 *dst, const u8 *src,
+				       unsigned int len, int nrounds);
+
+asmlinkage void chacha_2block_xor_avx512vl(const struct chacha_state *state,
+					   u8 *dst, const u8 *src,
+					   unsigned int len, int nrounds);
+asmlinkage void chacha_4block_xor_avx512vl(const struct chacha_state *state,
+					   u8 *dst, const u8 *src,
+					   unsigned int len, int nrounds);
+asmlinkage void chacha_8block_xor_avx512vl(const struct chacha_state *state,
+					   u8 *dst, const u8 *src,
+					   unsigned int len, int nrounds);
+
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(chacha_use_simd);
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(chacha_use_avx2);
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(chacha_use_avx512vl);
+
+static unsigned int chacha_advance(unsigned int len, unsigned int maxblocks)
+{
+	/* Blocks covered by @len bytes (a partial block counts), at most @maxblocks. */
+	return DIV_ROUND_UP(min(len, maxblocks * CHACHA_BLOCK_SIZE), CHACHA_BLOCK_SIZE);
+}
+/* Called inside kernel_fpu_begin()/end(); tries AVX-512VL, then AVX2, then SSSE3 routines. */
+static void chacha_dosimd(struct chacha_state *state, u8 *dst, const u8 *src,
+			  unsigned int bytes, int nrounds)
+{
+	if (static_branch_likely(&chacha_use_avx512vl)) {
+		while (bytes >= CHACHA_BLOCK_SIZE * 8) {
+			chacha_8block_xor_avx512vl(state, dst, src, bytes,
+						   nrounds);
+			bytes -= CHACHA_BLOCK_SIZE * 8;
+			src += CHACHA_BLOCK_SIZE * 8;
+			dst += CHACHA_BLOCK_SIZE * 8;
+			state->x[12] += 8;	/* x[12] advances by the blocks just consumed */
+		}
+		if (bytes > CHACHA_BLOCK_SIZE * 4) {	/* tail longer than 4 blocks */
+			chacha_8block_xor_avx512vl(state, dst, src, bytes,
+						   nrounds);
+			state->x[12] += chacha_advance(bytes, 8);
+			return;
+		}
+		if (bytes > CHACHA_BLOCK_SIZE * 2) {	/* tail longer than 2 blocks */
+			chacha_4block_xor_avx512vl(state, dst, src, bytes,
+						   nrounds);
+			state->x[12] += chacha_advance(bytes, 4);
+			return;
+		}
+		if (bytes) {	/* any shorter non-empty tail */
+			chacha_2block_xor_avx512vl(state, dst, src, bytes,
+						   nrounds);
+			state->x[12] += chacha_advance(bytes, 2);
+			return;
+		}
+	}	/* reached with bytes == 0 only; no test below matches then */
+
+	if (static_branch_likely(&chacha_use_avx2)) {
+		while (bytes >= CHACHA_BLOCK_SIZE * 8) {
+			chacha_8block_xor_avx2(state, dst, src, bytes, nrounds);
+			bytes -= CHACHA_BLOCK_SIZE * 8;
+			src += CHACHA_BLOCK_SIZE * 8;
+			dst += CHACHA_BLOCK_SIZE * 8;
+			state->x[12] += 8;
+		}
+		if (bytes > CHACHA_BLOCK_SIZE * 4) {
+			chacha_8block_xor_avx2(state, dst, src, bytes, nrounds);
+			state->x[12] += chacha_advance(bytes, 8);
+			return;
+		}
+		if (bytes > CHACHA_BLOCK_SIZE * 2) {
+			chacha_4block_xor_avx2(state, dst, src, bytes, nrounds);
+			state->x[12] += chacha_advance(bytes, 4);
+			return;
+		}
+		if (bytes > CHACHA_BLOCK_SIZE) {	/* a tail of one block or less is left to SSSE3 */
+			chacha_2block_xor_avx2(state, dst, src, bytes, nrounds);
+			state->x[12] += chacha_advance(bytes, 2);
+			return;
+		}
+	}
+
+	while (bytes >= CHACHA_BLOCK_SIZE * 4) {
+		chacha_4block_xor_ssse3(state, dst, src, bytes, nrounds);
+		bytes -= CHACHA_BLOCK_SIZE * 4;
+		src += CHACHA_BLOCK_SIZE * 4;
+		dst += CHACHA_BLOCK_SIZE * 4;
+		state->x[12] += 4;
+	}
+	if (bytes > CHACHA_BLOCK_SIZE) {	/* tail longer than one block */
+		chacha_4block_xor_ssse3(state, dst, src, bytes, nrounds);
+		state->x[12] += chacha_advance(bytes, 4);
+		return;
+	}
+	if (bytes) {	/* at most one block left */
+		chacha_block_xor_ssse3(state, dst, src, bytes, nrounds);
+		state->x[12]++;
+	}
+}
+
+void hchacha_block_arch(const struct chacha_state *state,
+			u32 out[HCHACHA_OUT_WORDS], int nrounds)
+{
+	if (static_branch_likely(&chacha_use_simd)) {
+		kernel_fpu_begin();	/* the SSSE3 routine runs inside an FPU section */
+		hchacha_block_ssse3(state, out, nrounds);
+		kernel_fpu_end();
+	} else {	/* key left off at init (no SSSE3): portable code */
+		hchacha_block_generic(state, out, nrounds);
+	}
+}
+EXPORT_SYMBOL(hchacha_block_arch);
+
+void chacha_crypt_arch(struct chacha_state *state, u8 *dst, const u8 *src,
+		       unsigned int bytes, int nrounds)
+{
+	unsigned int chunk;
+
+	/* CPUs without SSSE3 and inputs of at most one block go to the generic code. */
+	if (!static_branch_likely(&chacha_use_simd) ||
+	    bytes <= CHACHA_BLOCK_SIZE)
+		return chacha_crypt_generic(state, dst, src, bytes, nrounds);
+
+	/* bytes is non-zero here; each FPU section handles at most 4 KiB of it. */
+	for (; bytes; bytes -= chunk, src += chunk, dst += chunk) {
+		chunk = min_t(unsigned int, bytes, SZ_4K);
+
+		kernel_fpu_begin();
+		chacha_dosimd(state, dst, src, chunk, nrounds);
+		kernel_fpu_end();
+	}
+}
+EXPORT_SYMBOL(chacha_crypt_arch);
+
+bool chacha_is_arch_optimized(void)
+{
+	return static_key_enabled(&chacha_use_simd);	/* enabled at init when SSSE3 is present */
+}
+EXPORT_SYMBOL(chacha_is_arch_optimized);
+
+static int __init chacha_simd_mod_init(void)
+{
+	/* SSSE3 is the baseline: without it every key stays off. */
+	if (!boot_cpu_has(X86_FEATURE_SSSE3))
+		return 0;
+	static_branch_enable(&chacha_use_simd);
+
+	/* The AVX2 key also needs the SSE and YMM xfeatures check to pass. */
+	if (!boot_cpu_has(X86_FEATURE_AVX) || !boot_cpu_has(X86_FEATURE_AVX2) ||
+	    !cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL))
+		return 0;
+	static_branch_enable(&chacha_use_avx2);
+
+	/* AVX-512VL is only considered on top of AVX2; AVX512BW is needed for kmovq. */
+	if (boot_cpu_has(X86_FEATURE_AVX512VL) && boot_cpu_has(X86_FEATURE_AVX512BW))
+		static_branch_enable(&chacha_use_avx512vl);
+	return 0;
+}
+subsys_initcall(chacha_simd_mod_init);
+
+static void __exit chacha_simd_mod_exit(void)
+{
+}
+module_exit(chacha_simd_mod_exit);	/* empty body: init only enables static keys */
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Martin Willi <martin@strongswan.org>");
+MODULE_DESCRIPTION("ChaCha and HChaCha functions (x86_64 optimized)");
diff --git a/arch/x86/crypto/poly1305-x86_64-cryptogams.pl b/arch/x86/lib/crypto/poly1305-x86_64-cryptogams.pl
index b9abcd79c1f4..501827254fed 100644
--- a/arch/x86/crypto/poly1305-x86_64-cryptogams.pl
+++ b/arch/x86/lib/crypto/poly1305-x86_64-cryptogams.pl
@@ -118,6 +118,19 @@ sub declare_function() {
 	}
 }
 
+sub declare_typed_function() {
+	my ($name, $align, $nargs) = @_;
+	if($kernel) {
+		$code .= "SYM_TYPED_FUNC_START($name)\n";
+		$code .= ".L$name:\n";
+	} else {
+		$code .= ".globl	$name\n";
+		$code .= ".type	$name,\@function,$nargs\n";
+		$code .= ".align	$align\n";
+		$code .= "$name:\n";
+	}
+}
+
 sub end_function() {
 	my ($name) = @_;
 	if($kernel) {
@@ -128,7 +141,7 @@ sub end_function() {
 }
 
 $code.=<<___ if $kernel;
-#include <linux/linkage.h>
+#include <linux/cfi_types.h>
 ___
 
 if ($avx) {
@@ -236,14 +249,14 @@ ___
 $code.=<<___ if (!$kernel);
 .extern	OPENSSL_ia32cap_P
 
-.globl	poly1305_init_x86_64
-.hidden	poly1305_init_x86_64
+.globl	poly1305_block_init_arch
+.hidden	poly1305_block_init_arch
 .globl	poly1305_blocks_x86_64
 .hidden	poly1305_blocks_x86_64
 .globl	poly1305_emit_x86_64
 .hidden	poly1305_emit_x86_64
 ___
-&declare_function("poly1305_init_x86_64", 32, 3);
+&declare_typed_function("poly1305_block_init_arch", 32, 3);
 $code.=<<___;
 	xor	%eax,%eax
 	mov	%rax,0($ctx)		# initialize hash value
@@ -298,7 +311,7 @@ $code.=<<___;
 .Lno_key:
 	RET
 ___
-&end_function("poly1305_init_x86_64");
+&end_function("poly1305_block_init_arch");
 
 &declare_function("poly1305_blocks_x86_64", 32, 4);
 $code.=<<___;
@@ -2811,18 +2824,10 @@ if ($avx>2) {
 # reason stack layout is kept identical to poly1305_blocks_avx2. If not
 # for this tail, we wouldn't have to even allocate stack frame...
 
-if($kernel) {
-	$code .= "#ifdef CONFIG_AS_AVX512\n";
-}
-
 &declare_function("poly1305_blocks_avx512", 32, 4);
 poly1305_blocks_avxN(1);
 &end_function("poly1305_blocks_avx512");
 
-if ($kernel) {
-	$code .= "#endif\n";
-}
-
 if (!$kernel && $avx>3) {
 ########################################################################
 # VPMADD52 version using 2^44 radix.
@@ -4113,9 +4118,9 @@ avx_handler:
 
 .section	.pdata
 .align	4
-	.rva	.LSEH_begin_poly1305_init_x86_64
-	.rva	.LSEH_end_poly1305_init_x86_64
-	.rva	.LSEH_info_poly1305_init_x86_64
+	.rva	.LSEH_begin_poly1305_block_init_arch
+	.rva	.LSEH_end_poly1305_block_init_arch
+	.rva	.LSEH_info_poly1305_block_init_arch
 
 	.rva	.LSEH_begin_poly1305_blocks_x86_64
 	.rva	.LSEH_end_poly1305_blocks_x86_64
@@ -4163,10 +4168,10 @@ ___
 $code.=<<___;
 .section	.xdata
 .align	8
-.LSEH_info_poly1305_init_x86_64:
+.LSEH_info_poly1305_block_init_arch:
 	.byte	9,0,0,0
 	.rva	se_handler
-	.rva	.LSEH_begin_poly1305_init_x86_64,.LSEH_begin_poly1305_init_x86_64
+	.rva	.LSEH_begin_poly1305_block_init_arch,.LSEH_begin_poly1305_block_init_arch
 
 .LSEH_info_poly1305_blocks_x86_64:
 	.byte	9,0,0,0
diff --git a/arch/x86/lib/crypto/poly1305_glue.c b/arch/x86/lib/crypto/poly1305_glue.c
new file mode 100644
index 000000000000..b7e78a583e07
--- /dev/null
+++ b/arch/x86/lib/crypto/poly1305_glue.c
@@ -0,0 +1,129 @@
+// SPDX-License-Identifier: GPL-2.0 OR MIT
+/*
+ * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+ */
+
+#include <asm/cpu_device_id.h>
+#include <asm/fpu/api.h>
+#include <crypto/internal/poly1305.h>
+#include <linux/jump_label.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/sizes.h>
+#include <linux/unaligned.h>
+
+struct poly1305_arch_internal {	/* state layout handed to the asm routines as their ctx */
+	union {
+		struct {
+			u32 h[5];
+			u32 is_base2_26;	/* presumably: h[] is in radix 2^26 - confirm */
+		};
+		u64 hs[3];	/* same 24 bytes as h[] + is_base2_26, viewed as u64 words */
+	};
+	u64 r[2];
+	u64 pad;
+	struct { u32 r2, r1, r4, r3; } rn[9];
+};
+
+asmlinkage void poly1305_block_init_arch(
+	struct poly1305_block_state *state,
+	const u8 raw_key[POLY1305_BLOCK_SIZE]);
+EXPORT_SYMBOL_GPL(poly1305_block_init_arch);
+asmlinkage void poly1305_blocks_x86_64(struct poly1305_arch_internal *ctx,
+				       const u8 *inp,
+				       const size_t len, const u32 padbit);
+asmlinkage void poly1305_emit_x86_64(const struct poly1305_state *ctx,
+				     u8 mac[POLY1305_DIGEST_SIZE],
+				     const u32 nonce[4]);
+asmlinkage void poly1305_emit_avx(const struct poly1305_state *ctx,
+				  u8 mac[POLY1305_DIGEST_SIZE],
+				  const u32 nonce[4]);
+asmlinkage void poly1305_blocks_avx(struct poly1305_arch_internal *ctx,
+				    const u8 *inp, const size_t len,
+				    const u32 padbit);
+asmlinkage void poly1305_blocks_avx2(struct poly1305_arch_internal *ctx,
+				     const u8 *inp, const size_t len,
+				     const u32 padbit);
+asmlinkage void poly1305_blocks_avx512(struct poly1305_arch_internal *ctx,
+				       const u8 *inp,
+				       const size_t len, const u32 padbit);
+
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(poly1305_use_avx);
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(poly1305_use_avx2);
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(poly1305_use_avx512);
+
+void poly1305_blocks_arch(struct poly1305_block_state *state, const u8 *inp,
+			  unsigned int len, u32 padbit)
+{
+	struct poly1305_arch_internal *ctx =
+		container_of(&state->h.h, struct poly1305_arch_internal, h);
+	unsigned int chunk;
+
+	/* FPU sections run with preemption off, so each one covers at most a page. */
+	BUILD_BUG_ON(SZ_4K < POLY1305_BLOCK_SIZE || SZ_4K % POLY1305_BLOCK_SIZE);
+
+	if (!static_branch_likely(&poly1305_use_avx)) {	/* no FPU section on this path */
+		poly1305_blocks_x86_64(ctx, inp, len, padbit);
+		return;
+	}
+
+	do {	/* body runs at least once, also for len == 0 */
+		chunk = min(len, SZ_4K);
+
+		kernel_fpu_begin();
+		if (static_branch_likely(&poly1305_use_avx512))
+			poly1305_blocks_avx512(ctx, inp, chunk, padbit);
+		else if (static_branch_likely(&poly1305_use_avx2))
+			poly1305_blocks_avx2(ctx, inp, chunk, padbit);
+		else
+			poly1305_blocks_avx(ctx, inp, chunk, padbit);
+		kernel_fpu_end();
+
+		inp += chunk;
+		len -= chunk;
+	} while (len);
+}
+EXPORT_SYMBOL_GPL(poly1305_blocks_arch);
+
+void poly1305_emit_arch(const struct poly1305_state *ctx,
+			u8 mac[POLY1305_DIGEST_SIZE], const u32 nonce[4])
+{
+	if (static_branch_likely(&poly1305_use_avx))
+		poly1305_emit_avx(ctx, mac, nonce);
+	else	/* AVX key left off at init: take the x86_64 routine */
+		poly1305_emit_x86_64(ctx, mac, nonce);
+}
+EXPORT_SYMBOL_GPL(poly1305_emit_arch);
+
+bool poly1305_is_arch_optimized(void)
+{
+	return static_key_enabled(&poly1305_use_avx);	/* AVX key: set at init, or never */
+}
+EXPORT_SYMBOL(poly1305_is_arch_optimized);
+
+static int __init poly1305_simd_mod_init(void)
+{
+	if (boot_cpu_has(X86_FEATURE_AVX) &&
+	    cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL)) {
+		static_branch_enable(&poly1305_use_avx);
+		if (boot_cpu_has(X86_FEATURE_AVX2))	/* same prerequisites as the AVX key */
+			static_branch_enable(&poly1305_use_avx2);
+	}
+	if (boot_cpu_has(X86_FEATURE_AVX) && boot_cpu_has(X86_FEATURE_AVX2) &&
+	    boot_cpu_has(X86_FEATURE_AVX512F) &&
+	    cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM | XFEATURE_MASK_AVX512, NULL) &&
+	    /* Skylake downclocks unacceptably much when using zmm, but later generations are fast. */
+	    boot_cpu_data.x86_vfm != INTEL_SKYLAKE_X)
+		static_branch_enable(&poly1305_use_avx512);
+	return 0;
+}
+subsys_initcall(poly1305_simd_mod_init);
+
+static void __exit poly1305_simd_mod_exit(void)
+{
+}
+module_exit(poly1305_simd_mod_exit);	/* empty body: init only enables static keys */
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Jason A. Donenfeld <Jason@zx2c4.com>");
+MODULE_DESCRIPTION("Poly1305 authenticator");
diff --git a/arch/x86/crypto/sha256-avx-asm.S b/arch/x86/lib/crypto/sha256-avx-asm.S
index 53de72bdd851..0d7b2c3e45d9 100644
--- a/arch/x86/crypto/sha256-avx-asm.S
+++ b/arch/x86/lib/crypto/sha256-avx-asm.S
@@ -48,7 +48,7 @@
 ########################################################################
 
 #include <linux/linkage.h>
-#include <linux/cfi_types.h>
+#include <linux/objtool.h>
 
 ## assume buffers not aligned
 #define    VMOVDQ vmovdqu
@@ -341,13 +341,13 @@ a = TMP_
 .endm
 
 ########################################################################
-## void sha256_transform_avx(state sha256_state *state, const u8 *data, int blocks)
-## arg 1 : pointer to state
-## arg 2 : pointer to input data
-## arg 3 : Num blocks
+## void sha256_transform_avx(u32 state[SHA256_STATE_WORDS],
+##			     const u8 *data, size_t nblocks);
 ########################################################################
 .text
-SYM_TYPED_FUNC_START(sha256_transform_avx)
+SYM_FUNC_START(sha256_transform_avx)
+	ANNOTATE_NOENDBR	# since this is called only via static_call
+
 	pushq   %rbx
 	pushq   %r12
 	pushq   %r13
diff --git a/arch/x86/crypto/sha256-avx2-asm.S b/arch/x86/lib/crypto/sha256-avx2-asm.S
index 0bbec1c75cd0..25d3380321ec 100644
--- a/arch/x86/crypto/sha256-avx2-asm.S
+++ b/arch/x86/lib/crypto/sha256-avx2-asm.S
@@ -49,7 +49,7 @@
 ########################################################################
 
 #include <linux/linkage.h>
-#include <linux/cfi_types.h>
+#include <linux/objtool.h>
 
 ## assume buffers not aligned
 #define	VMOVDQ vmovdqu
@@ -518,13 +518,13 @@ STACK_SIZE	= _CTX      + _CTX_SIZE
 .endm
 
 ########################################################################
-## void sha256_transform_rorx(struct sha256_state *state, const u8 *data, int blocks)
-## arg 1 : pointer to state
-## arg 2 : pointer to input data
-## arg 3 : Num blocks
+## void sha256_transform_rorx(u32 state[SHA256_STATE_WORDS],
+##			      const u8 *data, size_t nblocks);
 ########################################################################
 .text
-SYM_TYPED_FUNC_START(sha256_transform_rorx)
+SYM_FUNC_START(sha256_transform_rorx)
+	ANNOTATE_NOENDBR	# since this is called only via static_call
+
 	pushq	%rbx
 	pushq	%r12
 	pushq	%r13
diff --git a/arch/x86/crypto/sha256_ni_asm.S b/arch/x86/lib/crypto/sha256-ni-asm.S
index d515a55a3bc1..d3548206cf3d 100644
--- a/arch/x86/crypto/sha256_ni_asm.S
+++ b/arch/x86/lib/crypto/sha256-ni-asm.S
@@ -54,9 +54,9 @@
  */
 
 #include <linux/linkage.h>
-#include <linux/cfi_types.h>
+#include <linux/objtool.h>
 
-#define DIGEST_PTR	%rdi	/* 1st arg */
+#define STATE_PTR	%rdi	/* 1st arg */
 #define DATA_PTR	%rsi	/* 2nd arg */
 #define NUM_BLKS	%rdx	/* 3rd arg */
 
@@ -98,24 +98,20 @@
 .endm
 
 /*
- * Intel SHA Extensions optimized implementation of a SHA-256 update function
+ * Intel SHA Extensions optimized implementation of a SHA-256 block function
  *
- * The function takes a pointer to the current hash values, a pointer to the
- * input data, and a number of 64 byte blocks to process.  Once all blocks have
- * been processed, the digest pointer is  updated with the resulting hash value.
- * The function only processes complete blocks, there is no functionality to
- * store partial blocks.  All message padding and hash value initialization must
- * be done outside the update function.
+ * This function takes a pointer to the current SHA-256 state, a pointer to the
+ * input data, and the number of 64-byte blocks to process.  Once all blocks
+ * have been processed, the state is updated with the new state.  This function
+ * only processes complete blocks.  State initialization, buffering of partial
+ * blocks, and digest finalization are expected to be handled elsewhere.
  *
- * void sha256_ni_transform(uint32_t *digest, const void *data,
-		uint32_t numBlocks);
- * digest : pointer to digest
- * data: pointer to input data
- * numBlocks: Number of blocks to process
+ * void sha256_ni_transform(u32 state[SHA256_STATE_WORDS],
+ *			    const u8 *data, size_t nblocks);
  */
-
 .text
-SYM_TYPED_FUNC_START(sha256_ni_transform)
+SYM_FUNC_START(sha256_ni_transform)
+	ANNOTATE_NOENDBR	# since this is called only via static_call
 
 	shl		$6, NUM_BLKS		/*  convert to bytes */
 	jz		.Ldone_hash
@@ -126,8 +122,8 @@ SYM_TYPED_FUNC_START(sha256_ni_transform)
 	 * Need to reorder these appropriately
 	 * DCBA, HGFE -> ABEF, CDGH
 	 */
-	movdqu		0*16(DIGEST_PTR), STATE0	/* DCBA */
-	movdqu		1*16(DIGEST_PTR), STATE1	/* HGFE */
+	movdqu		0*16(STATE_PTR), STATE0		/* DCBA */
+	movdqu		1*16(STATE_PTR), STATE1		/* HGFE */
 
 	movdqa		STATE0, TMP
 	punpcklqdq	STATE1, STATE0			/* FEBA */
@@ -166,8 +162,8 @@ SYM_TYPED_FUNC_START(sha256_ni_transform)
 	pshufd		$0xB1, STATE0, STATE0		/* HGFE */
 	pshufd		$0x1B, STATE1, STATE1		/* DCBA */
 
-	movdqu		STATE1, 0*16(DIGEST_PTR)
-	movdqu		STATE0, 1*16(DIGEST_PTR)
+	movdqu		STATE1, 0*16(STATE_PTR)
+	movdqu		STATE0, 1*16(STATE_PTR)
 
 .Ldone_hash:
 
diff --git a/arch/x86/crypto/sha256-ssse3-asm.S b/arch/x86/lib/crypto/sha256-ssse3-asm.S
index 93264ee44543..7f24a4cdcb25 100644
--- a/arch/x86/crypto/sha256-ssse3-asm.S
+++ b/arch/x86/lib/crypto/sha256-ssse3-asm.S
@@ -47,7 +47,7 @@
 ########################################################################
 
 #include <linux/linkage.h>
-#include <linux/cfi_types.h>
+#include <linux/objtool.h>
 
 ## assume buffers not aligned
 #define    MOVDQ movdqu
@@ -348,15 +348,13 @@ a = TMP_
 .endm
 
 ########################################################################
-## void sha256_transform_ssse3(struct sha256_state *state, const u8 *data,
-##			       int blocks);
-## arg 1 : pointer to state
-##	   (struct sha256_state is assumed to begin with u32 state[8])
-## arg 2 : pointer to input data
-## arg 3 : Num blocks
+## void sha256_transform_ssse3(u32 state[SHA256_STATE_WORDS],
+##			       const u8 *data, size_t nblocks);
 ########################################################################
 .text
-SYM_TYPED_FUNC_START(sha256_transform_ssse3)
+SYM_FUNC_START(sha256_transform_ssse3)
+	ANNOTATE_NOENDBR	# since this is called only via static_call
+
 	pushq   %rbx
 	pushq   %r12
 	pushq   %r13
diff --git a/arch/x86/lib/crypto/sha256.c b/arch/x86/lib/crypto/sha256.c
new file mode 100644
index 000000000000..80380f8fdcee
--- /dev/null
+++ b/arch/x86/lib/crypto/sha256.c
@@ -0,0 +1,80 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * SHA-256 optimized for x86_64
+ *
+ * Copyright 2025 Google LLC
+ */
+#include <asm/fpu/api.h>
+#include <crypto/internal/sha2.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/static_call.h>
+
+asmlinkage void sha256_transform_ssse3(u32 state[SHA256_STATE_WORDS],
+				       const u8 *data, size_t nblocks);
+asmlinkage void sha256_transform_avx(u32 state[SHA256_STATE_WORDS],
+				     const u8 *data, size_t nblocks);
+asmlinkage void sha256_transform_rorx(u32 state[SHA256_STATE_WORDS],
+				      const u8 *data, size_t nblocks);
+asmlinkage void sha256_ni_transform(u32 state[SHA256_STATE_WORDS],
+				    const u8 *data, size_t nblocks);
+
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_sha256_x86);
+
+DEFINE_STATIC_CALL(sha256_blocks_x86, sha256_transform_ssse3);
+
+void sha256_blocks_simd(u32 state[SHA256_STATE_WORDS],
+			const u8 *data, size_t nblocks)
+{
+	if (!static_branch_likely(&have_sha256_x86)) {	/* init selected no SIMD routine */
+		sha256_blocks_generic(state, data, nblocks);
+		return;
+	}
+	kernel_fpu_begin();	/* the static call target runs inside an FPU section */
+	static_call(sha256_blocks_x86)(state, data, nblocks);
+	kernel_fpu_end();
+}
+EXPORT_SYMBOL_GPL(sha256_blocks_simd);
+
+void sha256_blocks_arch(u32 state[SHA256_STATE_WORDS],
+			const u8 *data, size_t nblocks)
+{
+	sha256_blocks_generic(state, data, nblocks);	/* plain pass-through, no FPU use */
+}
+EXPORT_SYMBOL_GPL(sha256_blocks_arch);
+
+bool sha256_is_arch_optimized(void)
+{
+	return static_key_enabled(&have_sha256_x86);	/* set at init if SIMD code is usable */
+}
+EXPORT_SYMBOL_GPL(sha256_is_arch_optimized);
+
+static int __init sha256_x86_mod_init(void)
+{
+	/* Preference: SHA-NI, then AVX2+BMI2 (rorx), then AVX; SSSE3 is the initial target. */
+	if (boot_cpu_has(X86_FEATURE_SHA_NI)) {
+		static_call_update(sha256_blocks_x86, sha256_ni_transform);
+	} else if (cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL) &&
+		   boot_cpu_has(X86_FEATURE_AVX)) {
+		if (!boot_cpu_has(X86_FEATURE_AVX2) || !boot_cpu_has(X86_FEATURE_BMI2))
+			static_call_update(sha256_blocks_x86, sha256_transform_avx);
+		else
+			static_call_update(sha256_blocks_x86, sha256_transform_rorx);
+	} else if (!boot_cpu_has(X86_FEATURE_SSSE3)) {
+		/* No usable routine: the key stays off and callers get the generic code. */
+		return 0;
+	}
+
+	/* A SIMD routine is in place: let sha256_blocks_simd() use it. */
+	static_branch_enable(&have_sha256_x86);
+	return 0;
+}
+subsys_initcall(sha256_x86_mod_init);
+
+static void __exit sha256_x86_mod_exit(void)
+{
+}
+module_exit(sha256_x86_mod_exit);	/* empty body: no teardown is performed */
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("SHA-256 optimized for x86_64");
diff --git a/arch/x86/lib/retpoline.S b/arch/x86/lib/retpoline.S
index a26c43abd47d..39374949daa2 100644
--- a/arch/x86/lib/retpoline.S
+++ b/arch/x86/lib/retpoline.S
@@ -367,6 +367,54 @@ SYM_FUNC_END(call_depth_return_thunk)
 
 #endif /* CONFIG_MITIGATION_CALL_DEPTH_TRACKING */
 
+#ifdef CONFIG_MITIGATION_ITS
+
+.macro ITS_THUNK reg
+
+/*
+ * If CFI paranoid is used then the ITS thunk starts with opcodes (0xea; jne 1b)
+ * that complete the fineibt_paranoid caller sequence.
+ */
+1:	.byte 0xea
+SYM_INNER_LABEL(__x86_indirect_paranoid_thunk_\reg, SYM_L_GLOBAL)
+	UNWIND_HINT_UNDEFINED
+	ANNOTATE_NOENDBR
+	jne 1b
+SYM_INNER_LABEL(__x86_indirect_its_thunk_\reg, SYM_L_GLOBAL)
+	UNWIND_HINT_UNDEFINED
+	ANNOTATE_NOENDBR
+	ANNOTATE_RETPOLINE_SAFE
+	jmp *%\reg
+	int3
+	.align 32, 0xcc		/* fill to the end of the line */
+	.skip  32 - (__x86_indirect_its_thunk_\reg - 1b), 0xcc /* skip to the next upper half */
+.endm
+
+/* ITS mitigation requires thunks be aligned to upper half of cacheline */
+.align 64, 0xcc
+.skip 29, 0xcc
+
+#define GEN(reg) ITS_THUNK reg
+#include <asm/GEN-for-each-reg.h>
+#undef GEN
+
+	.align 64, 0xcc
+SYM_FUNC_ALIAS(__x86_indirect_its_thunk_array, __x86_indirect_its_thunk_rax)
+SYM_CODE_END(__x86_indirect_its_thunk_array)
+
+.align 64, 0xcc
+.skip 32, 0xcc
+SYM_CODE_START(its_return_thunk)
+	UNWIND_HINT_FUNC
+	ANNOTATE_NOENDBR
+	ANNOTATE_UNRET_SAFE
+	ret
+	int3
+SYM_CODE_END(its_return_thunk)
+EXPORT_SYMBOL(its_return_thunk)
+
+#endif /* CONFIG_MITIGATION_ITS */
+
 /*
  * This function name is magical and is used by -mfunction-return=thunk-extern
  * for the compiler to generate JMPs to it.
diff --git a/arch/x86/lib/x86-opcode-map.txt b/arch/x86/lib/x86-opcode-map.txt
index caedb3ef6688..f5dd84eb55dc 100644
--- a/arch/x86/lib/x86-opcode-map.txt
+++ b/arch/x86/lib/x86-opcode-map.txt
@@ -996,8 +996,8 @@ AVXcode: 4
 83: Grp1 Ev,Ib (1A),(es)
 # CTESTSCC instructions are: CTESTB, CTESTBE, CTESTF, CTESTL, CTESTLE, CTESTNB, CTESTNBE, CTESTNL,
 #			     CTESTNLE, CTESTNO, CTESTNS, CTESTNZ, CTESTO, CTESTS, CTESTT, CTESTZ
-84: CTESTSCC (ev)
-85: CTESTSCC (es) | CTESTSCC (66),(es)
+84: CTESTSCC Eb,Gb (ev)
+85: CTESTSCC Ev,Gv (es) | CTESTSCC Ev,Gv (66),(es)
 88: POPCNT Gv,Ev (es) | POPCNT Gv,Ev (66),(es)
 8f: POP2 Bq,Rq (000),(11B),(ev)
 a5: SHLD Ev,Gv,CL (es) | SHLD Ev,Gv,CL (66),(es)
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index ad662cc4605c..bb8d99e717b9 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -30,6 +30,7 @@
 #include <linux/initrd.h>
 #include <linux/cpumask.h>
 #include <linux/gfp.h>
+#include <linux/execmem.h>
 
 #include <asm/asm.h>
 #include <asm/bios_ebda.h>
@@ -565,7 +566,7 @@ static void __init lowmem_pfn_init(void)
 	"only %luMB highmem pages available, ignoring highmem size of %luMB!\n"
 
 #define MSG_HIGHMEM_TRIMMED \
-	"Warning: only 4GB will be used. Support for for CONFIG_HIGHMEM64G was removed!\n"
+	"Warning: only 4GB will be used. Support for CONFIG_HIGHMEM64G was removed!\n"
 /*
  * We have more RAM than fits into lowmem - we try to put it into
  * highmem, also taking the highmem=x boot parameter into account:
@@ -755,6 +756,8 @@ void mark_rodata_ro(void)
 	pr_info("Write protecting kernel text and read-only data: %luk\n",
 		size >> 10);
 
+	execmem_cache_make_ro();
+
 	kernel_set_to_readonly = 1;
 
 #ifdef CONFIG_CPA_DEBUG
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 7c4f6f591f2b..949a447f75ec 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -34,6 +34,7 @@
 #include <linux/gfp.h>
 #include <linux/kcore.h>
 #include <linux/bootmem_info.h>
+#include <linux/execmem.h>
 
 #include <asm/processor.h>
 #include <asm/bios_ebda.h>
@@ -1391,6 +1392,8 @@ void mark_rodata_ro(void)
 	       (end - start) >> 10);
 	set_memory_ro(start, (end - start) >> PAGE_SHIFT);
 
+	execmem_cache_make_ro();
+
 	kernel_set_to_readonly = 1;
 
 	/*
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index a05fcddfc811..f7ae44d3dd9e 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -360,7 +360,7 @@ static inline pgd_t *_pgd_alloc(struct mm_struct *mm)
 	 * We allocate one page for pgd.
 	 */
 	if (!SHARED_KERNEL_PMD)
-		return __pgd_alloc(mm, PGD_ALLOCATION_ORDER);
+		return __pgd_alloc(mm, pgd_allocation_order());
 
 	/*
 	 * Now PAE kernel is not running as a Xen domain. We can allocate
@@ -380,7 +380,7 @@ static inline void _pgd_free(struct mm_struct *mm, pgd_t *pgd)
 
 static inline pgd_t *_pgd_alloc(struct mm_struct *mm)
 {
-	return __pgd_alloc(mm, PGD_ALLOCATION_ORDER);
+	return __pgd_alloc(mm, pgd_allocation_order());
 }
 
 static inline void _pgd_free(struct mm_struct *mm, pgd_t *pgd)
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index eb83348f9305..b6d6750e4bd1 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -899,8 +899,9 @@ void switch_mm_irqs_off(struct mm_struct *unused, struct mm_struct *next,
 		cond_mitigation(tsk);
 
 		/*
-		 * Let nmi_uaccess_okay() and finish_asid_transition()
-		 * know that CR3 is changing.
+		 * Indicate that CR3 is about to change. nmi_uaccess_okay()
+		 * and others are sensitive to the window where mm_cpumask(),
+		 * CR3 and cpu_tlbstate.loaded_mm are not all in sync.
 		 */
 		this_cpu_write(cpu_tlbstate.loaded_mm, LOADED_MM_SWITCHING);
 		barrier();
@@ -1204,8 +1205,16 @@ done:
 
 static bool should_flush_tlb(int cpu, void *data)
 {
+	struct mm_struct *loaded_mm = per_cpu(cpu_tlbstate.loaded_mm, cpu);
 	struct flush_tlb_info *info = data;
 
+	/*
+	 * Order the 'loaded_mm' and 'is_lazy' against their
+	 * write ordering in switch_mm_irqs_off(). Ensure
+	 * 'is_lazy' is at least as new as 'loaded_mm'.
+	 */
+	smp_rmb();
+
 	/* Lazy TLB will get flushed at the next context switch. */
 	if (per_cpu(cpu_tlbstate_shared.is_lazy, cpu))
 		return false;
@@ -1214,8 +1223,15 @@ static bool should_flush_tlb(int cpu, void *data)
 	if (!info->mm)
 		return true;
 
+	/*
+	 * While switching, the remote CPU could have state from
+	 * either the prev or next mm. Assume the worst and flush.
+	 */
+	if (loaded_mm == LOADED_MM_SWITCHING)
+		return true;
+
 	/* The target mm is loaded, and the CPU is not lazy. */
-	if (per_cpu(cpu_tlbstate.loaded_mm, cpu) == info->mm)
+	if (loaded_mm == info->mm)
 		return true;
 
 	/* In cpumask, but not the loaded mm? Periodically remove by flushing. */
diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index 9e5fe2ba858f..ea4dd5b393aa 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -41,6 +41,8 @@ static u8 *emit_code(u8 *ptr, u32 bytes, unsigned int len)
 #define EMIT2(b1, b2)		EMIT((b1) + ((b2) << 8), 2)
 #define EMIT3(b1, b2, b3)	EMIT((b1) + ((b2) << 8) + ((b3) << 16), 3)
 #define EMIT4(b1, b2, b3, b4)   EMIT((b1) + ((b2) << 8) + ((b3) << 16) + ((b4) << 24), 4)
+#define EMIT5(b1, b2, b3, b4, b5) \
+	do { EMIT1(b1); EMIT4(b2, b3, b4, b5); } while (0)
 
 #define EMIT1_off32(b1, off) \
 	do { EMIT1(b1); EMIT(off, 4); } while (0)
@@ -661,7 +663,10 @@ static void emit_indirect_jump(u8 **pprog, int reg, u8 *ip)
 {
 	u8 *prog = *pprog;
 
-	if (cpu_feature_enabled(X86_FEATURE_RETPOLINE_LFENCE)) {
+	if (cpu_feature_enabled(X86_FEATURE_INDIRECT_THUNK_ITS)) {
+		OPTIMIZER_HIDE_VAR(reg);
+		emit_jump(&prog, its_static_thunk(reg), ip);
+	} else if (cpu_feature_enabled(X86_FEATURE_RETPOLINE_LFENCE)) {
 		EMIT_LFENCE();
 		EMIT2(0xFF, 0xE0 + reg);
 	} else if (cpu_feature_enabled(X86_FEATURE_RETPOLINE)) {
@@ -683,7 +688,7 @@ static void emit_return(u8 **pprog, u8 *ip)
 {
 	u8 *prog = *pprog;
 
-	if (cpu_feature_enabled(X86_FEATURE_RETHUNK)) {
+	if (cpu_wants_rethunk()) {
 		emit_jump(&prog, x86_return_thunk, ip);
 	} else {
 		EMIT1(0xC3);		/* ret */
@@ -1502,6 +1507,48 @@ static void emit_priv_frame_ptr(u8 **pprog, void __percpu *priv_frame_ptr)
 #define PRIV_STACK_GUARD_SZ    8
 #define PRIV_STACK_GUARD_VAL   0xEB9F12345678eb9fULL
 
+static int emit_spectre_bhb_barrier(u8 **pprog, u8 *ip,
+				    struct bpf_prog *bpf_prog)
+{
+	u8 *prog = *pprog;
+	u8 *func;
+
+	if (cpu_feature_enabled(X86_FEATURE_CLEAR_BHB_LOOP)) {
+		/* The clearing sequence clobbers eax and ecx. */
+		EMIT1(0x50); /* push rax */
+		EMIT1(0x51); /* push rcx */
+		ip += 2;
+
+		func = (u8 *)clear_bhb_loop;
+		ip += x86_call_depth_emit_accounting(&prog, func, ip);
+
+		if (emit_call(&prog, func, ip))
+			return -EINVAL;
+		EMIT1(0x59); /* pop rcx */
+		EMIT1(0x58); /* pop rax */
+	}
+	/* Insert IBHF instruction */
+	if ((cpu_feature_enabled(X86_FEATURE_CLEAR_BHB_LOOP) &&
+	     cpu_feature_enabled(X86_FEATURE_HYPERVISOR)) ||
+	    cpu_feature_enabled(X86_FEATURE_CLEAR_BHB_HW)) {
+		/*
+		 * Add an Indirect Branch History Fence (IBHF). IBHF acts as a
+		 * fence preventing branch history from before the fence from
+		 * affecting indirect branches after the fence. This is
+		 * specifically used in cBPF jitted code to prevent Intra-mode
+		 * BHI attacks. The IBHF instruction is designed to be a NOP on
+		 * hardware that doesn't need or support it.  The REP and REX.W
+		 * prefixes are required by the microcode, and they also ensure
+		 * that the NOP is unlikely to be used in existing code.
+		 *
+		 * IBHF is not a valid instruction in 32-bit mode.
+		 */
+		EMIT5(0xF3, 0x48, 0x0F, 0x1E, 0xF8); /* ibhf */
+	}
+	*pprog = prog;
+	return 0;
+}
+
 static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image,
 		  int oldproglen, struct jit_context *ctx, bool jmp_padding)
 {
@@ -2544,6 +2591,13 @@ emit_jmp:
 			seen_exit = true;
 			/* Update cleanup_addr */
 			ctx->cleanup_addr = proglen;
+			if (bpf_prog_was_classic(bpf_prog) &&
+			    !capable(CAP_SYS_ADMIN)) {
+				u8 *ip = image + addrs[i - 1];
+
+				if (emit_spectre_bhb_barrier(&prog, ip, bpf_prog))
+					return -EINVAL;
+			}
 			if (bpf_prog->aux->exception_boundary) {
 				pop_callee_regs(&prog, all_callee_regs_used);
 				pop_r12(&prog);
diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c
index ac57259a432b..a4b4ebd41b8f 100644
--- a/arch/x86/platform/efi/efi_64.c
+++ b/arch/x86/platform/efi/efi_64.c
@@ -73,7 +73,7 @@ int __init efi_alloc_page_tables(void)
 	gfp_t gfp_mask;
 
 	gfp_mask = GFP_KERNEL | __GFP_ZERO;
-	efi_pgd = (pgd_t *)__get_free_pages(gfp_mask, PGD_ALLOCATION_ORDER);
+	efi_pgd = (pgd_t *)__get_free_pages(gfp_mask, pgd_allocation_order());
 	if (!efi_pgd)
 		goto fail;
 
@@ -96,7 +96,7 @@ free_p4d:
 	if (pgtable_l5_enabled())
 		free_page((unsigned long)pgd_page_vaddr(*pgd));
 free_pgd:
-	free_pages((unsigned long)efi_pgd, PGD_ALLOCATION_ORDER);
+	free_pages((unsigned long)efi_pgd, pgd_allocation_order());
 fail:
 	return -ENOMEM;
 }
diff --git a/arch/x86/um/shared/sysdep/faultinfo_32.h b/arch/x86/um/shared/sysdep/faultinfo_32.h
index ab5c8e47049c..9193a7790a71 100644
--- a/arch/x86/um/shared/sysdep/faultinfo_32.h
+++ b/arch/x86/um/shared/sysdep/faultinfo_32.h
@@ -31,8 +31,8 @@ struct faultinfo {
 
 #define ___backtrack_faulted(_faulted)					\
 	asm volatile (							\
-		"mov $0, %0\n"						\
 		"movl $__get_kernel_nofault_faulted_%=,%1\n"		\
+		"mov $0, %0\n"						\
 		"jmp _end_%=\n"						\
 		"__get_kernel_nofault_faulted_%=:\n"			\
 		"mov $1, %0;"						\
diff --git a/arch/x86/um/shared/sysdep/faultinfo_64.h b/arch/x86/um/shared/sysdep/faultinfo_64.h
index 26fb4835d3e9..61e4ca1e0ab5 100644
--- a/arch/x86/um/shared/sysdep/faultinfo_64.h
+++ b/arch/x86/um/shared/sysdep/faultinfo_64.h
@@ -31,8 +31,8 @@ struct faultinfo {
 
 #define ___backtrack_faulted(_faulted)					\
 	asm volatile (							\
-		"mov $0, %0\n"						\
 		"movq $__get_kernel_nofault_faulted_%=,%1\n"		\
+		"mov $0, %0\n"						\
 		"jmp _end_%=\n"						\
 		"__get_kernel_nofault_faulted_%=:\n"			\
 		"mov $1, %0;"						\
diff --git a/arch/xtensa/configs/cadence_csp_defconfig b/arch/xtensa/configs/cadence_csp_defconfig
index 91c4c4cae8a7..49f50d1bd724 100644
--- a/arch/xtensa/configs/cadence_csp_defconfig
+++ b/arch/xtensa/configs/cadence_csp_defconfig
@@ -1,6 +1,5 @@
 CONFIG_SYSVIPC=y
 CONFIG_POSIX_MQUEUE=y
-CONFIG_USELIB=y
 CONFIG_NO_HZ_IDLE=y
 CONFIG_HIGH_RES_TIMERS=y
 CONFIG_IRQ_TIME_ACCOUNTING=y