diff options
Diffstat (limited to 'arch')
260 files changed, 5574 insertions, 1792 deletions
diff --git a/arch/alpha/include/uapi/asm/socket.h b/arch/alpha/include/uapi/asm/socket.h index e94f621903fe..251b73c5481e 100644 --- a/arch/alpha/include/uapi/asm/socket.h +++ b/arch/alpha/include/uapi/asm/socket.h @@ -140,6 +140,12 @@ #define SO_PASSPIDFD 76 #define SO_PEERPIDFD 77 +#define SO_DEVMEM_LINEAR 78 +#define SCM_DEVMEM_LINEAR SO_DEVMEM_LINEAR +#define SO_DEVMEM_DMABUF 79 +#define SCM_DEVMEM_DMABUF SO_DEVMEM_DMABUF +#define SO_DEVMEM_DONTNEED 80 + #if !defined(__KERNEL__) #if __BITS_PER_LONG == 64 diff --git a/arch/arc/Kconfig b/arch/arc/Kconfig index fd0b0a0d4686..163608fd49d1 100644 --- a/arch/arc/Kconfig +++ b/arch/arc/Kconfig @@ -13,6 +13,7 @@ config ARC select ARCH_HAS_SETUP_DMA_OPS select ARCH_HAS_SYNC_DMA_FOR_CPU select ARCH_HAS_SYNC_DMA_FOR_DEVICE + select ARCH_NEED_CMPXCHG_1_EMU select ARCH_SUPPORTS_ATOMIC_RMW if ARC_HAS_LLSC select ARCH_32BIT_OFF_T select BUILDTIME_TABLE_SORT diff --git a/arch/arc/include/asm/cmpxchg.h b/arch/arc/include/asm/cmpxchg.h index e138fde067de..58045c898340 100644 --- a/arch/arc/include/asm/cmpxchg.h +++ b/arch/arc/include/asm/cmpxchg.h @@ -8,6 +8,7 @@ #include <linux/build_bug.h> #include <linux/types.h> +#include <linux/cmpxchg-emu.h> #include <asm/barrier.h> #include <asm/smp.h> @@ -46,6 +47,9 @@ __typeof__(*(ptr)) _prev_; \ \ switch(sizeof((_p_))) { \ + case 1: \ + _prev_ = (__typeof__(*(ptr)))cmpxchg_emu_u8((volatile u8 *)_p_, (uintptr_t)_o_, (uintptr_t)_n_); \ + break; \ case 4: \ _prev_ = __cmpxchg(_p_, _o_, _n_); \ break; \ @@ -65,8 +69,6 @@ __typeof__(*(ptr)) _prev_; \ unsigned long __flags; \ \ - BUILD_BUG_ON(sizeof(_p_) != 4); \ - \ /* \ * spin lock/unlock provide the needed smp_mb() before/after \ */ \ diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index 54b2bb817a7f..0ec034933cae 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -64,6 +64,7 @@ config ARM select GENERIC_CLOCKEVENTS_BROADCAST if SMP select GENERIC_IRQ_IPI if SMP select GENERIC_CPU_AUTOPROBE + select GENERIC_CPU_DEVICES select GENERIC_EARLY_IOREMAP select GENERIC_IDLE_POLL_SETUP select GENERIC_IRQ_MULTI_HANDLER @@ -117,7 +118,7 @@ config ARM select HAVE_KERNEL_XZ select HAVE_KPROBES if !XIP_KERNEL && !CPU_ENDIAN_BE32 && !CPU_V7M select HAVE_KRETPROBES if HAVE_KPROBES - select HAVE_LD_DEAD_CODE_DATA_ELIMINATION + select HAVE_LD_DEAD_CODE_DATA_ELIMINATION if (LD_VERSION >= 23600 || LD_IS_LLD) select HAVE_MOD_ARCH_SPECIFIC select HAVE_NMI select HAVE_OPTPROBES if !THUMB2_KERNEL diff --git a/arch/arm/boot/dts/nxp/imx/imx6dl-yapp43-common.dtsi b/arch/arm/boot/dts/nxp/imx/imx6dl-yapp43-common.dtsi index 52a0f6ee426f..bcf4d9c870ec 100644 --- a/arch/arm/boot/dts/nxp/imx/imx6dl-yapp43-common.dtsi +++ b/arch/arm/boot/dts/nxp/imx/imx6dl-yapp43-common.dtsi @@ -274,24 +274,24 @@ led@0 { chan-name = "R"; - led-cur = /bits/ 8 <0x20>; - max-cur = /bits/ 8 <0x60>; + led-cur = /bits/ 8 <0x6e>; + max-cur = /bits/ 8 <0xc8>; reg = <0>; color = <LED_COLOR_ID_RED>; }; led@1 { chan-name = "G"; - led-cur = /bits/ 8 <0x20>; - max-cur = /bits/ 8 <0x60>; + led-cur = /bits/ 8 <0xbe>; + max-cur = /bits/ 8 <0xc8>; reg = <1>; color = <LED_COLOR_ID_GREEN>; }; led@2 { chan-name = "B"; - led-cur = /bits/ 8 <0x20>; - max-cur = /bits/ 8 <0x60>; + led-cur = /bits/ 8 <0xbe>; + max-cur = /bits/ 8 <0xc8>; reg = <2>; color = <LED_COLOR_ID_BLUE>; }; diff --git a/arch/arm/boot/dts/ti/omap/omap3-n900.dts b/arch/arm/boot/dts/ti/omap/omap3-n900.dts index 07c5b963af78..4bde3342bb95 100644 --- a/arch/arm/boot/dts/ti/omap/omap3-n900.dts +++ b/arch/arm/boot/dts/ti/omap/omap3-n900.dts @@ -781,7 +781,7 @@ mount-matrix = "-1", "0", "0", "0", "1", "0", - "0", "0", "1"; + "0", "0", "-1"; }; cam1: camera@3e { diff --git a/arch/arm/crypto/Kconfig b/arch/arm/crypto/Kconfig index 847b7a003356..5ff49a5e9afc 100644 --- a/arch/arm/crypto/Kconfig +++ b/arch/arm/crypto/Kconfig @@ -166,10 +166,9 @@ config CRYPTO_AES_ARM config CRYPTO_AES_ARM_BS tristate "Ciphers: AES, modes: ECB/CBC/CTR/XTS (bit-sliced NEON)" depends on KERNEL_MODE_NEON + select CRYPTO_AES_ARM select CRYPTO_SKCIPHER select CRYPTO_LIB_AES - select CRYPTO_AES - select CRYPTO_CBC select CRYPTO_SIMD help Length-preserving ciphers: AES cipher algorithms (FIPS-197) @@ -183,8 +182,15 @@ config CRYPTO_AES_ARM_BS Bit sliced AES gives around 45% speedup on Cortex-A15 for CTR mode and for XTS mode encryption, CBC and XTS mode decryption speedup is around 25%. (CBC encryption speed is not affected by this driver.) - This implementation does not rely on any lookup tables so it is - believed to be invulnerable to cache timing attacks. + + The bit sliced AES code does not use lookup tables, so it is believed + to be invulnerable to cache timing attacks. However, since the bit + sliced AES code cannot process single blocks efficiently, in certain + cases table-based code with some countermeasures against cache timing + attacks will still be used as a fallback method; specifically CBC + encryption (not CBC decryption), the encryption of XTS tweaks, XTS + ciphertext stealing when the message isn't a multiple of 16 bytes, and + CTR when invoked in a context in which NEON instructions are unusable. config CRYPTO_AES_ARM_CE tristate "Ciphers: AES, modes: ECB/CBC/CTS/CTR/XTS (ARMv8 Crypto Extensions)" diff --git a/arch/arm/crypto/aes-ce-glue.c b/arch/arm/crypto/aes-ce-glue.c index b668c97663ec..f5b66f4cf45d 100644 --- a/arch/arm/crypto/aes-ce-glue.c +++ b/arch/arm/crypto/aes-ce-glue.c @@ -711,7 +711,7 @@ static int __init aes_init(void) algname = aes_algs[i].base.cra_name + 2; drvname = aes_algs[i].base.cra_driver_name + 2; basename = aes_algs[i].base.cra_driver_name; - simd = simd_skcipher_create_compat(algname, drvname, basename); + simd = simd_skcipher_create_compat(aes_algs + i, algname, drvname, basename); err = PTR_ERR(simd); if (IS_ERR(simd)) goto unregister_simds; diff --git a/arch/arm/crypto/aes-cipher-glue.c b/arch/arm/crypto/aes-cipher-glue.c index 6dfaef2d8f91..29efb7833960 100644 --- a/arch/arm/crypto/aes-cipher-glue.c +++ b/arch/arm/crypto/aes-cipher-glue.c @@ -9,9 +9,10 @@ #include <crypto/aes.h> #include <crypto/algapi.h> #include <linux/module.h> +#include "aes-cipher.h" -asmlinkage void __aes_arm_encrypt(u32 *rk, int rounds, const u8 *in, u8 *out); -asmlinkage void __aes_arm_decrypt(u32 *rk, int rounds, const u8 *in, u8 *out); +EXPORT_SYMBOL_GPL(__aes_arm_encrypt); +EXPORT_SYMBOL_GPL(__aes_arm_decrypt); static void aes_arm_encrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in) { diff --git a/arch/arm/crypto/aes-cipher.h b/arch/arm/crypto/aes-cipher.h new file mode 100644 index 000000000000..d5db2b87eb69 --- /dev/null +++ b/arch/arm/crypto/aes-cipher.h @@ -0,0 +1,13 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +#ifndef ARM_CRYPTO_AES_CIPHER_H +#define ARM_CRYPTO_AES_CIPHER_H + +#include <linux/linkage.h> +#include <linux/types.h> + +asmlinkage void __aes_arm_encrypt(const u32 rk[], int rounds, + const u8 *in, u8 *out); +asmlinkage void __aes_arm_decrypt(const u32 rk[], int rounds, + const u8 *in, u8 *out); + +#endif /* ARM_CRYPTO_AES_CIPHER_H */ diff --git a/arch/arm/crypto/aes-neonbs-glue.c b/arch/arm/crypto/aes-neonbs-glue.c index 201eb35dde37..f6be80b5938b 100644 --- a/arch/arm/crypto/aes-neonbs-glue.c +++ b/arch/arm/crypto/aes-neonbs-glue.c @@ -9,24 +9,22 @@ #include <asm/simd.h> #include <crypto/aes.h> #include <crypto/ctr.h> -#include <crypto/internal/cipher.h> #include <crypto/internal/simd.h> #include <crypto/internal/skcipher.h> #include <crypto/scatterwalk.h> #include <crypto/xts.h> #include <linux/module.h> +#include "aes-cipher.h" MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>"); MODULE_DESCRIPTION("Bit sliced AES using NEON instructions"); MODULE_LICENSE("GPL v2"); MODULE_ALIAS_CRYPTO("ecb(aes)"); -MODULE_ALIAS_CRYPTO("cbc(aes)-all"); +MODULE_ALIAS_CRYPTO("cbc(aes)"); MODULE_ALIAS_CRYPTO("ctr(aes)"); MODULE_ALIAS_CRYPTO("xts(aes)"); -MODULE_IMPORT_NS(CRYPTO_INTERNAL); - asmlinkage void aesbs_convert_key(u8 out[], u32 const rk[], int rounds); asmlinkage void aesbs_ecb_encrypt(u8 out[], u8 const in[], u8 const rk[], @@ -52,13 +50,13 @@ struct aesbs_ctx { struct aesbs_cbc_ctx { struct aesbs_ctx key; - struct crypto_skcipher *enc_tfm; + struct crypto_aes_ctx fallback; }; struct aesbs_xts_ctx { struct aesbs_ctx key; - struct crypto_cipher *cts_tfm; - struct crypto_cipher *tweak_tfm; + struct crypto_aes_ctx fallback; + struct crypto_aes_ctx tweak_key; }; struct aesbs_ctr_ctx { @@ -129,37 +127,49 @@ static int aesbs_cbc_setkey(struct crypto_skcipher *tfm, const u8 *in_key, unsigned int key_len) { struct aesbs_cbc_ctx *ctx = crypto_skcipher_ctx(tfm); - struct crypto_aes_ctx rk; int err; - err = aes_expandkey(&rk, in_key, key_len); + err = aes_expandkey(&ctx->fallback, in_key, key_len); if (err) return err; ctx->key.rounds = 6 + key_len / 4; kernel_neon_begin(); - aesbs_convert_key(ctx->key.rk, rk.key_enc, ctx->key.rounds); + aesbs_convert_key(ctx->key.rk, ctx->fallback.key_enc, ctx->key.rounds); kernel_neon_end(); - memzero_explicit(&rk, sizeof(rk)); - return crypto_skcipher_setkey(ctx->enc_tfm, in_key, key_len); + return 0; } static int cbc_encrypt(struct skcipher_request *req) { - struct skcipher_request *subreq = skcipher_request_ctx(req); struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); - struct aesbs_cbc_ctx *ctx = crypto_skcipher_ctx(tfm); + const struct aesbs_cbc_ctx *ctx = crypto_skcipher_ctx(tfm); + struct skcipher_walk walk; + unsigned int nbytes; + int err; - skcipher_request_set_tfm(subreq, ctx->enc_tfm); - skcipher_request_set_callback(subreq, - skcipher_request_flags(req), - NULL, NULL); - skcipher_request_set_crypt(subreq, req->src, req->dst, - req->cryptlen, req->iv); + err = skcipher_walk_virt(&walk, req, false); - return crypto_skcipher_encrypt(subreq); + while ((nbytes = walk.nbytes) >= AES_BLOCK_SIZE) { + const u8 *src = walk.src.virt.addr; + u8 *dst = walk.dst.virt.addr; + u8 *prev = walk.iv; + + do { + crypto_xor_cpy(dst, src, prev, AES_BLOCK_SIZE); + __aes_arm_encrypt(ctx->fallback.key_enc, + ctx->key.rounds, dst, dst); + prev = dst; + src += AES_BLOCK_SIZE; + dst += AES_BLOCK_SIZE; + nbytes -= AES_BLOCK_SIZE; + } while (nbytes >= AES_BLOCK_SIZE); + memcpy(walk.iv, prev, AES_BLOCK_SIZE); + err = skcipher_walk_done(&walk, nbytes); + } + return err; } static int cbc_decrypt(struct skcipher_request *req) @@ -190,30 +200,6 @@ static int cbc_decrypt(struct skcipher_request *req) return err; } -static int cbc_init(struct crypto_skcipher *tfm) -{ - struct aesbs_cbc_ctx *ctx = crypto_skcipher_ctx(tfm); - unsigned int reqsize; - - ctx->enc_tfm = crypto_alloc_skcipher("cbc(aes)", 0, CRYPTO_ALG_ASYNC | - CRYPTO_ALG_NEED_FALLBACK); - if (IS_ERR(ctx->enc_tfm)) - return PTR_ERR(ctx->enc_tfm); - - reqsize = sizeof(struct skcipher_request); - reqsize += crypto_skcipher_reqsize(ctx->enc_tfm); - crypto_skcipher_set_reqsize(tfm, reqsize); - - return 0; -} - -static void cbc_exit(struct crypto_skcipher *tfm) -{ - struct aesbs_cbc_ctx *ctx = crypto_skcipher_ctx(tfm); - - crypto_free_skcipher(ctx->enc_tfm); -} - static int aesbs_ctr_setkey_sync(struct crypto_skcipher *tfm, const u8 *in_key, unsigned int key_len) { @@ -271,16 +257,8 @@ static int ctr_encrypt(struct skcipher_request *req) static void ctr_encrypt_one(struct crypto_skcipher *tfm, const u8 *src, u8 *dst) { struct aesbs_ctr_ctx *ctx = crypto_skcipher_ctx(tfm); - unsigned long flags; - - /* - * Temporarily disable interrupts to avoid races where - * cachelines are evicted when the CPU is interrupted - * to do something else. - */ - local_irq_save(flags); - aes_encrypt(&ctx->fallback, dst, src); - local_irq_restore(flags); + + __aes_arm_encrypt(ctx->fallback.key_enc, ctx->key.rounds, src, dst); } static int ctr_encrypt_sync(struct skcipher_request *req) @@ -302,45 +280,23 @@ static int aesbs_xts_setkey(struct crypto_skcipher *tfm, const u8 *in_key, return err; key_len /= 2; - err = crypto_cipher_setkey(ctx->cts_tfm, in_key, key_len); + err = aes_expandkey(&ctx->fallback, in_key, key_len); if (err) return err; - err = crypto_cipher_setkey(ctx->tweak_tfm, in_key + key_len, key_len); + err = aes_expandkey(&ctx->tweak_key, in_key + key_len, key_len); if (err) return err; return aesbs_setkey(tfm, in_key, key_len); } -static int xts_init(struct crypto_skcipher *tfm) -{ - struct aesbs_xts_ctx *ctx = crypto_skcipher_ctx(tfm); - - ctx->cts_tfm = crypto_alloc_cipher("aes", 0, 0); - if (IS_ERR(ctx->cts_tfm)) - return PTR_ERR(ctx->cts_tfm); - - ctx->tweak_tfm = crypto_alloc_cipher("aes", 0, 0); - if (IS_ERR(ctx->tweak_tfm)) - crypto_free_cipher(ctx->cts_tfm); - - return PTR_ERR_OR_ZERO(ctx->tweak_tfm); -} - -static void xts_exit(struct crypto_skcipher *tfm) -{ - struct aesbs_xts_ctx *ctx = crypto_skcipher_ctx(tfm); - - crypto_free_cipher(ctx->tweak_tfm); - crypto_free_cipher(ctx->cts_tfm); -} - static int __xts_crypt(struct skcipher_request *req, bool encrypt, void (*fn)(u8 out[], u8 const in[], u8 const rk[], int rounds, int blocks, u8 iv[], int)) { struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); struct aesbs_xts_ctx *ctx = crypto_skcipher_ctx(tfm); + const int rounds = ctx->key.rounds; int tail = req->cryptlen % AES_BLOCK_SIZE; struct skcipher_request subreq; u8 buf[2 * AES_BLOCK_SIZE]; @@ -364,7 +320,7 @@ static int __xts_crypt(struct skcipher_request *req, bool encrypt, if (err) return err; - crypto_cipher_encrypt_one(ctx->tweak_tfm, walk.iv, walk.iv); + __aes_arm_encrypt(ctx->tweak_key.key_enc, rounds, walk.iv, walk.iv); while (walk.nbytes >= AES_BLOCK_SIZE) { unsigned int blocks = walk.nbytes / AES_BLOCK_SIZE; @@ -378,7 +334,7 @@ static int __xts_crypt(struct skcipher_request *req, bool encrypt, kernel_neon_begin(); fn(walk.dst.virt.addr, walk.src.virt.addr, ctx->key.rk, - ctx->key.rounds, blocks, walk.iv, reorder_last_tweak); + rounds, blocks, walk.iv, reorder_last_tweak); kernel_neon_end(); err = skcipher_walk_done(&walk, walk.nbytes - blocks * AES_BLOCK_SIZE); @@ -396,9 +352,9 @@ static int __xts_crypt(struct skcipher_request *req, bool encrypt, crypto_xor(buf, req->iv, AES_BLOCK_SIZE); if (encrypt) - crypto_cipher_encrypt_one(ctx->cts_tfm, buf, buf); + __aes_arm_encrypt(ctx->fallback.key_enc, rounds, buf, buf); else - crypto_cipher_decrypt_one(ctx->cts_tfm, buf, buf); + __aes_arm_decrypt(ctx->fallback.key_dec, rounds, buf, buf); crypto_xor(buf, req->iv, AES_BLOCK_SIZE); @@ -439,8 +395,7 @@ static struct skcipher_alg aes_algs[] = { { .base.cra_blocksize = AES_BLOCK_SIZE, .base.cra_ctxsize = sizeof(struct aesbs_cbc_ctx), .base.cra_module = THIS_MODULE, - .base.cra_flags = CRYPTO_ALG_INTERNAL | - CRYPTO_ALG_NEED_FALLBACK, + .base.cra_flags = CRYPTO_ALG_INTERNAL, .min_keysize = AES_MIN_KEY_SIZE, .max_keysize = AES_MAX_KEY_SIZE, @@ -449,8 +404,6 @@ static struct skcipher_alg aes_algs[] = { { .setkey = aesbs_cbc_setkey, .encrypt = cbc_encrypt, .decrypt = cbc_decrypt, - .init = cbc_init, - .exit = cbc_exit, }, { .base.cra_name = "__ctr(aes)", .base.cra_driver_name = "__ctr-aes-neonbs", @@ -500,8 +453,6 @@ static struct skcipher_alg aes_algs[] = { { .setkey = aesbs_xts_setkey, .encrypt = xts_encrypt, .decrypt = xts_decrypt, - .init = xts_init, - .exit = xts_exit, } }; static struct simd_skcipher_alg *aes_simd_algs[ARRAY_SIZE(aes_algs)]; @@ -540,7 +491,7 @@ static int __init aes_init(void) algname = aes_algs[i].base.cra_name + 2; drvname = aes_algs[i].base.cra_driver_name + 2; basename = aes_algs[i].base.cra_driver_name; - simd = simd_skcipher_create_compat(algname, drvname, basename); + simd = simd_skcipher_create_compat(aes_algs + i, algname, drvname, basename); err = PTR_ERR(simd); if (IS_ERR(simd)) goto unregister_simds; diff --git a/arch/arm/include/asm/arm_pmuv3.h b/arch/arm/include/asm/arm_pmuv3.h index a41b503b7dcd..f63ba8986b24 100644 --- a/arch/arm/include/asm/arm_pmuv3.h +++ b/arch/arm/include/asm/arm_pmuv3.h @@ -127,6 +127,12 @@ static inline u32 read_pmuver(void) return (dfr0 >> 24) & 0xf; } +static inline bool pmuv3_has_icntr(void) +{ + /* FEAT_PMUv3_ICNTR not accessible for 32-bit */ + return false; +} + static inline void write_pmcr(u32 val) { write_sysreg(val, PMCR); @@ -152,6 +158,13 @@ static inline u64 read_pmccntr(void) return read_sysreg(PMCCNTR); } +static inline void write_pmicntr(u64 val) {} + +static inline u64 read_pmicntr(void) +{ + return 0; +} + static inline void write_pmcntenset(u32 val) { write_sysreg(val, PMCNTENSET); @@ -177,6 +190,13 @@ static inline void write_pmccfiltr(u32 val) write_sysreg(val, PMCCFILTR); } +static inline void write_pmicfiltr(u64 val) {} + +static inline u64 read_pmicfiltr(void) +{ + return 0; +} + static inline void write_pmovsclr(u32 val) { write_sysreg(val, PMOVSR); diff --git a/arch/arm/include/asm/cpu.h b/arch/arm/include/asm/cpu.h index bd6fdb4b922d..9d8863537aa5 100644 --- a/arch/arm/include/asm/cpu.h +++ b/arch/arm/include/asm/cpu.h @@ -11,7 +11,6 @@ #include <linux/cpu.h> struct cpuinfo_arm { - struct cpu cpu; u32 cpuid; #ifdef CONFIG_SMP unsigned int loops_per_jiffy; diff --git a/arch/arm/include/asm/dma-iommu.h b/arch/arm/include/asm/dma-iommu.h index 82ec1ccf1fee..2ce4c5683e6d 100644 --- a/arch/arm/include/asm/dma-iommu.h +++ b/arch/arm/include/asm/dma-iommu.h @@ -24,7 +24,7 @@ struct dma_iommu_mapping { }; struct dma_iommu_mapping * -arm_iommu_create_mapping(const struct bus_type *bus, dma_addr_t base, u64 size); +arm_iommu_create_mapping(struct device *dev, dma_addr_t base, u64 size); void arm_iommu_release_mapping(struct dma_iommu_mapping *mapping); diff --git a/arch/arm/include/asm/hypervisor.h b/arch/arm/include/asm/hypervisor.h index bd61502b9715..8a648e506540 100644 --- a/arch/arm/include/asm/hypervisor.h +++ b/arch/arm/include/asm/hypervisor.h @@ -7,4 +7,6 @@ void kvm_init_hyp_services(void); bool kvm_arm_hyp_service_available(u32 func_id); +static inline void kvm_arch_init_hyp_services(void) { }; + #endif diff --git a/arch/arm/include/asm/pgtable-3level-hwdef.h b/arch/arm/include/asm/pgtable-3level-hwdef.h index dfab3e982cbf..944fc9955528 100644 --- a/arch/arm/include/asm/pgtable-3level-hwdef.h +++ b/arch/arm/include/asm/pgtable-3level-hwdef.h @@ -106,6 +106,11 @@ /* * TTBCR register bits. + * + * The ORGN0 and IRGN0 bits enables different forms of caching when + * walking the translation table. Clearing these bits (which is claimed + * to be the reset default) means "normal memory, [outer|inner] + * non-cacheable" */ #define TTBCR_EAE (1 << 31) #define TTBCR_IMP (1 << 30) diff --git a/arch/arm/kernel/entry-armv.S b/arch/arm/kernel/entry-armv.S index f01d23a220e6..1dfae1af8e31 100644 --- a/arch/arm/kernel/entry-armv.S +++ b/arch/arm/kernel/entry-armv.S @@ -29,6 +29,12 @@ #include "entry-header.S" #include <asm/probes.h> +#ifdef CONFIG_HAVE_LD_DEAD_CODE_DATA_ELIMINATION +#define RELOC_TEXT_NONE .reloc .text, R_ARM_NONE, . +#else +#define RELOC_TEXT_NONE +#endif + /* * Interrupt handling. */ @@ -1065,7 +1071,7 @@ vector_addrexcptn: .globl vector_fiq .section .vectors, "ax", %progbits - .reloc .text, R_ARM_NONE, . + RELOC_TEXT_NONE W(b) vector_rst W(b) vector_und ARM( .reloc ., R_ARM_LDR_PC_G0, .L__vector_swi ) @@ -1079,7 +1085,7 @@ THUMB( .reloc ., R_ARM_THM_PC12, .L__vector_swi ) #ifdef CONFIG_HARDEN_BRANCH_HISTORY .section .vectors.bhb.loop8, "ax", %progbits - .reloc .text, R_ARM_NONE, . + RELOC_TEXT_NONE W(b) vector_rst W(b) vector_bhb_loop8_und ARM( .reloc ., R_ARM_LDR_PC_G0, .L__vector_bhb_loop8_swi ) @@ -1092,7 +1098,7 @@ THUMB( .reloc ., R_ARM_THM_PC12, .L__vector_bhb_loop8_swi ) W(b) vector_bhb_loop8_fiq .section .vectors.bhb.bpiall, "ax", %progbits - .reloc .text, R_ARM_NONE, . + RELOC_TEXT_NONE W(b) vector_rst W(b) vector_bhb_bpiall_und ARM( .reloc ., R_ARM_LDR_PC_G0, .L__vector_bhb_bpiall_swi ) diff --git a/arch/arm/kernel/setup.c b/arch/arm/kernel/setup.c index 7b33b157fca0..e6a857bf0ce6 100644 --- a/arch/arm/kernel/setup.c +++ b/arch/arm/kernel/setup.c @@ -1201,20 +1201,10 @@ void __init setup_arch(char **cmdline_p) mdesc->init_early(); } - -static int __init topology_init(void) +bool arch_cpu_is_hotpluggable(int num) { - int cpu; - - for_each_possible_cpu(cpu) { - struct cpuinfo_arm *cpuinfo = &per_cpu(cpu_data, cpu); - cpuinfo->cpu.hotpluggable = platform_can_hotplug_cpu(cpu); - register_cpu(&cpuinfo->cpu, cpu); - } - - return 0; + return platform_can_hotplug_cpu(num); } -subsys_initcall(topology_init); #ifdef CONFIG_HAVE_PROC_CPU static int __init proc_cpu_init(void) diff --git a/arch/arm/mm/dma-mapping.c b/arch/arm/mm/dma-mapping.c index 5adf1769eee4..88c2d68a69c9 100644 --- a/arch/arm/mm/dma-mapping.c +++ b/arch/arm/mm/dma-mapping.c @@ -1532,7 +1532,7 @@ static const struct dma_map_ops iommu_ops = { /** * arm_iommu_create_mapping - * @bus: pointer to the bus holding the client device (for IOMMU calls) + * @dev: pointer to the client device (for IOMMU calls) * @base: start address of the valid IO address space * @size: maximum size of the valid IO address space * @@ -1544,7 +1544,7 @@ static const struct dma_map_ops iommu_ops = { * arm_iommu_attach_device function. */ struct dma_iommu_mapping * -arm_iommu_create_mapping(const struct bus_type *bus, dma_addr_t base, u64 size) +arm_iommu_create_mapping(struct device *dev, dma_addr_t base, u64 size) { unsigned int bits = size >> PAGE_SHIFT; unsigned int bitmap_size = BITS_TO_LONGS(bits) * sizeof(long); @@ -1585,9 +1585,11 @@ arm_iommu_create_mapping(const struct bus_type *bus, dma_addr_t base, u64 size) spin_lock_init(&mapping->lock); - mapping->domain = iommu_domain_alloc(bus); - if (!mapping->domain) + mapping->domain = iommu_paging_domain_alloc(dev); + if (IS_ERR(mapping->domain)) { + err = PTR_ERR(mapping->domain); goto err4; + } kref_init(&mapping->kref); return mapping; @@ -1718,7 +1720,7 @@ static void arm_setup_iommu_dma_ops(struct device *dev) dma_base = dma_range_map_min(dev->dma_range_map); size = dma_range_map_max(dev->dma_range_map) - dma_base; } - mapping = arm_iommu_create_mapping(dev->bus, dma_base, size); + mapping = arm_iommu_create_mapping(dev, dma_base, size); if (IS_ERR(mapping)) { pr_warn("Failed to create %llu-byte IOMMU mapping for device %s\n", size, dev_name(dev)); diff --git a/arch/arm/mm/mmu.c b/arch/arm/mm/mmu.c index 3f774856ca67..f85c177cdf8d 100644 --- a/arch/arm/mm/mmu.c +++ b/arch/arm/mm/mmu.c @@ -1638,7 +1638,7 @@ static void __init early_paging_init(const struct machine_desc *mdesc) { pgtables_remap *lpae_pgtables_remap; unsigned long pa_pgd; - unsigned int cr, ttbcr; + u32 cr, ttbcr, tmp; long long offset; if (!mdesc->pv_fixup) @@ -1688,7 +1688,9 @@ static void __init early_paging_init(const struct machine_desc *mdesc) cr = get_cr(); set_cr(cr & ~(CR_I | CR_C)); ttbcr = cpu_get_ttbcr(); - cpu_set_ttbcr(ttbcr & ~(3 << 8 | 3 << 10)); + /* Disable all kind of caching of the translation table */ + tmp = ttbcr & ~(TTBCR_ORGN0_MASK | TTBCR_IRGN0_MASK); + cpu_set_ttbcr(tmp); flush_cache_all(); /* diff --git a/arch/arm/vfp/vfpinstr.h b/arch/arm/vfp/vfpinstr.h index 3c7938fd40aa..32090b0fb250 100644 --- a/arch/arm/vfp/vfpinstr.h +++ b/arch/arm/vfp/vfpinstr.h @@ -64,33 +64,37 @@ #ifdef CONFIG_AS_VFP_VMRS_FPINST -#define fmrx(_vfp_) ({ \ - u32 __v; \ - asm(".fpu vfpv2\n" \ - "vmrs %0, " #_vfp_ \ - : "=r" (__v) : : "cc"); \ - __v; \ - }) - -#define fmxr(_vfp_,_var_) \ - asm(".fpu vfpv2\n" \ - "vmsr " #_vfp_ ", %0" \ - : : "r" (_var_) : "cc") +#define fmrx(_vfp_) ({ \ + u32 __v; \ + asm volatile (".fpu vfpv2\n" \ + "vmrs %0, " #_vfp_ \ + : "=r" (__v) : : "cc"); \ + __v; \ +}) + +#define fmxr(_vfp_, _var_) ({ \ + asm volatile (".fpu vfpv2\n" \ + "vmsr " #_vfp_ ", %0" \ + : : "r" (_var_) : "cc"); \ +}) #else #define vfpreg(_vfp_) #_vfp_ -#define fmrx(_vfp_) ({ \ - u32 __v; \ - asm("mrc p10, 7, %0, " vfpreg(_vfp_) ", cr0, 0 @ fmrx %0, " #_vfp_ \ - : "=r" (__v) : : "cc"); \ - __v; \ - }) - -#define fmxr(_vfp_,_var_) \ - asm("mcr p10, 7, %0, " vfpreg(_vfp_) ", cr0, 0 @ fmxr " #_vfp_ ", %0" \ - : : "r" (_var_) : "cc") +#define fmrx(_vfp_) ({ \ + u32 __v; \ + asm volatile ("mrc p10, 7, %0, " vfpreg(_vfp_) "," \ + "cr0, 0 @ fmrx %0, " #_vfp_ \ + : "=r" (__v) : : "cc"); \ + __v; \ +}) + +#define fmxr(_vfp_, _var_) ({ \ + asm volatile ("mcr p10, 7, %0, " vfpreg(_vfp_) "," \ + "cr0, 0 @ fmxr " #_vfp_ ", %0" \ + : : "r" (_var_) : "cc"); \ +}) #endif diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index a2f8ff354ca6..ed15b876fa74 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -34,6 +34,7 @@ config ARM64 select ARCH_HAS_KERNEL_FPU_SUPPORT if KERNEL_MODE_NEON select ARCH_HAS_KEEPINITRD select ARCH_HAS_MEMBARRIER_SYNC_CORE + select ARCH_HAS_MEM_ENCRYPT select ARCH_HAS_NMI_SAFE_THIS_CPU_OPS select ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE select ARCH_HAS_PTE_DEVMAP @@ -423,7 +424,7 @@ config AMPERE_ERRATUM_AC03_CPU_38 default y help This option adds an alternative code sequence to work around Ampere - erratum AC03_CPU_38 on AmpereOne. + errata AC03_CPU_38 and AC04_CPU_10 on AmpereOne. The affected design reports FEAT_HAFDBS as not implemented in ID_AA64MMFR1_EL1.HAFDBS, but (V)TCR_ELx.{HA,HD} are not RES0 @@ -2137,6 +2138,29 @@ config ARM64_EPAN if the cpu does not implement the feature. endmenu # "ARMv8.7 architectural features" +menu "ARMv8.9 architectural features" + +config ARM64_POE + prompt "Permission Overlay Extension" + def_bool y + select ARCH_USES_HIGH_VMA_FLAGS + select ARCH_HAS_PKEYS + help + The Permission Overlay Extension is used to implement Memory + Protection Keys. Memory Protection Keys provides a mechanism for + enforcing page-based protections, but without requiring modification + of the page tables when an application changes protection domains. + + For details, see Documentation/core-api/protection-keys.rst + + If unsure, say y. + +config ARCH_PKEY_BITS + int + default 3 + +endmenu # "ARMv8.9 architectural features" + config ARM64_SVE bool "ARM Scalable Vector Extension support" default y diff --git a/arch/arm64/boot/dts/freescale/fsl-ls1028a.dtsi b/arch/arm64/boot/dts/freescale/fsl-ls1028a.dtsi index 6b6e3ee950e5..acf293310f7a 100644 --- a/arch/arm64/boot/dts/freescale/fsl-ls1028a.dtsi +++ b/arch/arm64/boot/dts/freescale/fsl-ls1028a.dtsi @@ -175,7 +175,7 @@ }; }; - core-cluster-thermal { + cluster-thermal { polling-delay-passive = <1000>; polling-delay = <5000>; thermal-sensors = <&tmu 1>; diff --git a/arch/arm64/boot/dts/freescale/fsl-ls1043a.dtsi b/arch/arm64/boot/dts/freescale/fsl-ls1043a.dtsi index 17f4e3171120..ab4c919e3e16 100644 --- a/arch/arm64/boot/dts/freescale/fsl-ls1043a.dtsi +++ b/arch/arm64/boot/dts/freescale/fsl-ls1043a.dtsi @@ -214,7 +214,7 @@ }; }; - core-cluster-thermal { + cluster-thermal { polling-delay-passive = <1000>; polling-delay = <5000>; thermal-sensors = <&tmu 3>; diff --git a/arch/arm64/boot/dts/freescale/fsl-ls1046a.dtsi b/arch/arm64/boot/dts/freescale/fsl-ls1046a.dtsi index 200e52622f99..55019866d6a2 100644 --- a/arch/arm64/boot/dts/freescale/fsl-ls1046a.dtsi +++ b/arch/arm64/boot/dts/freescale/fsl-ls1046a.dtsi @@ -182,7 +182,7 @@ }; }; - core-cluster-thermal { + cluster-thermal { polling-delay-passive = <1000>; polling-delay = <5000>; thermal-sensors = <&tmu 3>; diff --git a/arch/arm64/boot/dts/freescale/fsl-ls1088a.dtsi b/arch/arm64/boot/dts/freescale/fsl-ls1088a.dtsi index 8ce4b6aae79d..e3a7db21fe29 100644 --- a/arch/arm64/boot/dts/freescale/fsl-ls1088a.dtsi +++ b/arch/arm64/boot/dts/freescale/fsl-ls1088a.dtsi @@ -131,7 +131,7 @@ }; thermal-zones { - core-cluster-thermal { + cluster-thermal { polling-delay-passive = <1000>; polling-delay = <5000>; thermal-sensors = <&tmu 0>; diff --git a/arch/arm64/boot/dts/freescale/fsl-ls208xa.dtsi b/arch/arm64/boot/dts/freescale/fsl-ls208xa.dtsi index bde89de2576e..1b306d6802ce 100644 --- a/arch/arm64/boot/dts/freescale/fsl-ls208xa.dtsi +++ b/arch/arm64/boot/dts/freescale/fsl-ls208xa.dtsi @@ -122,7 +122,7 @@ }; }; - core-cluster1-thermal { + cluster1-thermal { polling-delay-passive = <1000>; polling-delay = <5000>; thermal-sensors = <&tmu 4>; @@ -151,7 +151,7 @@ }; }; - core-cluster2-thermal { + cluster2-thermal { polling-delay-passive = <1000>; polling-delay = <5000>; thermal-sensors = <&tmu 5>; @@ -180,7 +180,7 @@ }; }; - core-cluster3-thermal { + cluster3-thermal { polling-delay-passive = <1000>; polling-delay = <5000>; thermal-sensors = <&tmu 6>; @@ -209,7 +209,7 @@ }; }; - core-cluster4-thermal { + cluster4-thermal { polling-delay-passive = <1000>; polling-delay = <5000>; thermal-sensors = <&tmu 7>; diff --git a/arch/arm64/boot/dts/freescale/fsl-lx2160a.dtsi b/arch/arm64/boot/dts/freescale/fsl-lx2160a.dtsi index 26c7ca31e22e..bd75a658767d 100644 --- a/arch/arm64/boot/dts/freescale/fsl-lx2160a.dtsi +++ b/arch/arm64/boot/dts/freescale/fsl-lx2160a.dtsi @@ -492,7 +492,7 @@ }; }; - ddr-cluster5-thermal { + ddr-ctrl5-thermal { polling-delay-passive = <1000>; polling-delay = <5000>; thermal-sensors = <&tmu 1>; diff --git a/arch/arm64/boot/dts/freescale/imx8mm-phygate-tauri-l-rs232-rs232.dtso b/arch/arm64/boot/dts/freescale/imx8mm-phygate-tauri-l-rs232-rs232.dtso index bf3e04651ba0..353ace3601dc 100644 --- a/arch/arm64/boot/dts/freescale/imx8mm-phygate-tauri-l-rs232-rs232.dtso +++ b/arch/arm64/boot/dts/freescale/imx8mm-phygate-tauri-l-rs232-rs232.dtso @@ -21,7 +21,7 @@ &gpio3 { pinctrl-names = "default"; - pinctrcl-0 = <&pinctrl_gpio3_hog>; + pinctrl-0 = <&pinctrl_gpio3_hog>; uart4_rs485_en { gpio-hog; diff --git a/arch/arm64/boot/dts/freescale/imx8mm-phygate-tauri-l-rs232-rs485.dtso b/arch/arm64/boot/dts/freescale/imx8mm-phygate-tauri-l-rs232-rs485.dtso index f4448cde0407..8a75d6783ad2 100644 --- a/arch/arm64/boot/dts/freescale/imx8mm-phygate-tauri-l-rs232-rs485.dtso +++ b/arch/arm64/boot/dts/freescale/imx8mm-phygate-tauri-l-rs232-rs485.dtso @@ -22,7 +22,7 @@ &gpio3 { pinctrl-names = "default"; - pinctrcl-0 = <&pinctrl_gpio3_hog>; + pinctrl-0 = <&pinctrl_gpio3_hog>; uart4_rs485_en { gpio-hog; diff --git a/arch/arm64/boot/dts/freescale/imx8mp-beacon-kit.dts b/arch/arm64/boot/dts/freescale/imx8mp-beacon-kit.dts index 17e2c19d8455..cc9b81d46188 100644 --- a/arch/arm64/boot/dts/freescale/imx8mp-beacon-kit.dts +++ b/arch/arm64/boot/dts/freescale/imx8mp-beacon-kit.dts @@ -211,13 +211,12 @@ simple-audio-card,cpu { sound-dai = <&sai3>; + frame-master; + bitclock-master; }; simple-audio-card,codec { sound-dai = <&wm8962>; - clocks = <&clk IMX8MP_CLK_IPP_DO_CLKO1>; - frame-master; - bitclock-master; }; }; }; @@ -507,10 +506,9 @@ &sai3 { pinctrl-names = "default"; pinctrl-0 = <&pinctrl_sai3>; - assigned-clocks = <&clk IMX8MP_CLK_SAI3>, - <&clk IMX8MP_AUDIO_PLL2> ; - assigned-clock-parents = <&clk IMX8MP_AUDIO_PLL2_OUT>; - assigned-clock-rates = <12288000>, <361267200>; + assigned-clocks = <&clk IMX8MP_CLK_SAI3>; + assigned-clock-parents = <&clk IMX8MP_AUDIO_PLL1_OUT>; + assigned-clock-rates = <12288000>; fsl,sai-mclk-direction-output; status = "okay"; }; diff --git a/arch/arm64/boot/dts/freescale/imx93-tqma9352-mba93xxla.dts b/arch/arm64/boot/dts/freescale/imx93-tqma9352-mba93xxla.dts index da8f19a646a9..e2ee9f5a042c 100644 --- a/arch/arm64/boot/dts/freescale/imx93-tqma9352-mba93xxla.dts +++ b/arch/arm64/boot/dts/freescale/imx93-tqma9352-mba93xxla.dts @@ -499,7 +499,7 @@ pinctrl-0 = <&pinctrl_usdhc2_hs>, <&pinctrl_usdhc2_gpio>; pinctrl-1 = <&pinctrl_usdhc2_uhs>, <&pinctrl_usdhc2_gpio>; pinctrl-2 = <&pinctrl_usdhc2_uhs>, <&pinctrl_usdhc2_gpio>; - cd-gpios = <&gpio3 00 GPIO_ACTIVE_LOW>; + cd-gpios = <&gpio3 0 GPIO_ACTIVE_LOW>; vmmc-supply = <®_usdhc2_vmmc>; bus-width = <4>; no-sdio; diff --git a/arch/arm64/boot/dts/freescale/imx93-tqma9352.dtsi b/arch/arm64/boot/dts/freescale/imx93-tqma9352.dtsi index edbd8cad35bc..72a9a5d4e27a 100644 --- a/arch/arm64/boot/dts/freescale/imx93-tqma9352.dtsi +++ b/arch/arm64/boot/dts/freescale/imx93-tqma9352.dtsi @@ -19,7 +19,7 @@ linux,cma { compatible = "shared-dma-pool"; reusable; - alloc-ranges = <0 0x60000000 0 0x40000000>; + alloc-ranges = <0 0x80000000 0 0x40000000>; size = <0 0x10000000>; linux,cma-default; }; @@ -156,6 +156,7 @@ &wdog3 { pinctrl-names = "default"; pinctrl-0 = <&pinctrl_wdog>; + fsl,ext-reset-output; status = "okay"; }; diff --git a/arch/arm64/boot/dts/freescale/imx93.dtsi b/arch/arm64/boot/dts/freescale/imx93.dtsi index 4a3f42355cb8..a0993022c102 100644 --- a/arch/arm64/boot/dts/freescale/imx93.dtsi +++ b/arch/arm64/boot/dts/freescale/imx93.dtsi @@ -1105,7 +1105,7 @@ <&clk IMX93_CLK_SYS_PLL_PFD0_DIV2>; assigned-clock-rates = <100000000>, <250000000>; intf_mode = <&wakeupmix_gpr 0x28>; - snps,clk-csr = <0>; + snps,clk-csr = <6>; nvmem-cells = <ð_mac2>; nvmem-cell-names = "mac-address"; status = "disabled"; diff --git a/arch/arm64/boot/dts/freescale/imx95.dtsi b/arch/arm64/boot/dts/freescale/imx95.dtsi index 1bbf9a0468f6..425272aa5a81 100644 --- a/arch/arm64/boot/dts/freescale/imx95.dtsi +++ b/arch/arm64/boot/dts/freescale/imx95.dtsi @@ -27,7 +27,7 @@ reg = <0x0>; enable-method = "psci"; #cooling-cells = <2>; - power-domains = <&scmi_devpd IMX95_PERF_A55>; + power-domains = <&scmi_perf IMX95_PERF_A55>; power-domain-names = "perf"; i-cache-size = <32768>; i-cache-line-size = <64>; @@ -44,7 +44,7 @@ reg = <0x100>; enable-method = "psci"; #cooling-cells = <2>; - power-domains = <&scmi_devpd IMX95_PERF_A55>; + power-domains = <&scmi_perf IMX95_PERF_A55>; power-domain-names = "perf"; i-cache-size = <32768>; i-cache-line-size = <64>; @@ -61,7 +61,7 @@ reg = <0x200>; enable-method = "psci"; #cooling-cells = <2>; - power-domains = <&scmi_devpd IMX95_PERF_A55>; + power-domains = <&scmi_perf IMX95_PERF_A55>; power-domain-names = "perf"; i-cache-size = <32768>; i-cache-line-size = <64>; @@ -78,7 +78,7 @@ reg = <0x300>; enable-method = "psci"; #cooling-cells = <2>; - power-domains = <&scmi_devpd IMX95_PERF_A55>; + power-domains = <&scmi_perf IMX95_PERF_A55>; power-domain-names = "perf"; i-cache-size = <32768>; i-cache-line-size = <64>; @@ -93,7 +93,7 @@ device_type = "cpu"; compatible = "arm,cortex-a55"; reg = <0x400>; - power-domains = <&scmi_devpd IMX95_PERF_A55>; + power-domains = <&scmi_perf IMX95_PERF_A55>; power-domain-names = "perf"; enable-method = "psci"; #cooling-cells = <2>; @@ -110,7 +110,7 @@ device_type = "cpu"; compatible = "arm,cortex-a55"; reg = <0x500>; - power-domains = <&scmi_devpd IMX95_PERF_A55>; + power-domains = <&scmi_perf IMX95_PERF_A55>; power-domain-names = "perf"; enable-method = "psci"; #cooling-cells = <2>; @@ -187,7 +187,7 @@ compatible = "cache"; cache-size = <524288>; cache-line-size = <64>; - cache-sets = <1024>; + cache-sets = <512>; cache-level = <3>; cache-unified; }; diff --git a/arch/arm64/boot/dts/qcom/ipq5332.dtsi b/arch/arm64/boot/dts/qcom/ipq5332.dtsi index 573656587c0d..0a74ed4f72cc 100644 --- a/arch/arm64/boot/dts/qcom/ipq5332.dtsi +++ b/arch/arm64/boot/dts/qcom/ipq5332.dtsi @@ -320,8 +320,8 @@ reg = <0x08af8800 0x400>; interrupts = <GIC_SPI 62 IRQ_TYPE_LEVEL_HIGH>, - <GIC_SPI 53 IRQ_TYPE_EDGE_BOTH>, - <GIC_SPI 52 IRQ_TYPE_EDGE_BOTH>; + <GIC_SPI 53 IRQ_TYPE_LEVEL_HIGH>, + <GIC_SPI 52 IRQ_TYPE_LEVEL_HIGH>; interrupt-names = "pwr_event", "dp_hs_phy_irq", "dm_hs_phy_irq"; diff --git a/arch/arm64/boot/dts/qcom/x1e80100-asus-vivobook-s15.dts b/arch/arm64/boot/dts/qcom/x1e80100-asus-vivobook-s15.dts index 7fb980fcb307..9caa14dda585 100644 --- a/arch/arm64/boot/dts/qcom/x1e80100-asus-vivobook-s15.dts +++ b/arch/arm64/boot/dts/qcom/x1e80100-asus-vivobook-s15.dts @@ -278,6 +278,13 @@ vdd-l3-supply = <&vreg_s1f_0p7>; vdd-s1-supply = <&vph_pwr>; vdd-s2-supply = <&vph_pwr>; + + vreg_l3i_0p8: ldo3 { + regulator-name = "vreg_l3i_0p8"; + regulator-min-microvolt = <880000>; + regulator-max-microvolt = <920000>; + regulator-initial-mode = <RPMH_REGULATOR_MODE_HPM>; + }; }; regulators-7 { @@ -423,11 +430,17 @@ }; &pcie4 { + perst-gpios = <&tlmm 146 GPIO_ACTIVE_LOW>; + wake-gpios = <&tlmm 148 GPIO_ACTIVE_LOW>; + + pinctrl-0 = <&pcie4_default>; + pinctrl-names = "default"; + status = "okay"; }; &pcie4_phy { - vdda-phy-supply = <&vreg_l3j_0p8>; + vdda-phy-supply = <&vreg_l3i_0p8>; vdda-pll-supply = <&vreg_l3e_1p2>; status = "okay"; @@ -517,7 +530,30 @@ bias-disable; }; - pcie6a_default: pcie2a-default-state { + pcie4_default: pcie4-default-state { + clkreq-n-pins { + pins = "gpio147"; + function = "pcie4_clk"; + drive-strength = <2>; + bias-pull-up; + }; + + perst-n-pins { + pins = "gpio146"; + function = "gpio"; + drive-strength = <2>; + bias-disable; + }; + + wake-n-pins { + pins = "gpio148"; + function = "gpio"; + drive-strength = <2>; + bias-pull-up; + }; + }; + + pcie6a_default: pcie6a-default-state { clkreq-n-pins { pins = "gpio153"; function = "pcie6a_clk"; @@ -529,7 +565,7 @@ pins = "gpio152"; function = "gpio"; drive-strength = <2>; - bias-pull-down; + bias-disable; }; wake-n-pins { diff --git a/arch/arm64/boot/dts/qcom/x1e80100-crd.dts b/arch/arm64/boot/dts/qcom/x1e80100-crd.dts index 6152bcd0bc1f..e17ab8251e2a 100644 --- a/arch/arm64/boot/dts/qcom/x1e80100-crd.dts +++ b/arch/arm64/boot/dts/qcom/x1e80100-crd.dts @@ -268,7 +268,6 @@ pinctrl-0 = <&edp_reg_en>; pinctrl-names = "default"; - regulator-always-on; regulator-boot-on; }; @@ -637,6 +636,14 @@ }; }; +&gpu { + status = "okay"; + + zap-shader { + firmware-name = "qcom/x1e80100/gen70500_zap.mbn"; + }; +}; + &i2c0 { clock-frequency = <400000>; @@ -724,9 +731,13 @@ aux-bus { panel { - compatible = "edp-panel"; + compatible = "samsung,atna45af01", "samsung,atna33xc20"; + enable-gpios = <&pmc8380_3_gpios 4 GPIO_ACTIVE_HIGH>; power-supply = <&vreg_edp_3p3>; + pinctrl-0 = <&edp_bl_en>; + pinctrl-names = "default"; + port { edp_panel_in: endpoint { remote-endpoint = <&mdss_dp3_out>; @@ -756,11 +767,17 @@ }; &pcie4 { + perst-gpios = <&tlmm 146 GPIO_ACTIVE_LOW>; + wake-gpios = <&tlmm 148 GPIO_ACTIVE_LOW>; + + pinctrl-0 = <&pcie4_default>; + pinctrl-names = "default"; + status = "okay"; }; &pcie4_phy { - vdda-phy-supply = <&vreg_l3j_0p8>; + vdda-phy-supply = <&vreg_l3i_0p8>; vdda-pll-supply = <&vreg_l3e_1p2>; status = "okay"; @@ -785,6 +802,16 @@ status = "okay"; }; +&pmc8380_3_gpios { + edp_bl_en: edp-bl-en-state { + pins = "gpio4"; + function = "normal"; + power-source = <1>; /* 1.8V */ + input-disable; + output-enable; + }; +}; + &qupv3_0 { status = "okay"; }; @@ -931,7 +958,30 @@ bias-disable; }; - pcie6a_default: pcie2a-default-state { + pcie4_default: pcie4-default-state { + clkreq-n-pins { + pins = "gpio147"; + function = "pcie4_clk"; + drive-strength = <2>; + bias-pull-up; + }; + + perst-n-pins { + pins = "gpio146"; + function = "gpio"; + drive-strength = <2>; + bias-disable; + }; + + wake-n-pins { + pins = "gpio148"; + function = "gpio"; + drive-strength = <2>; + bias-pull-up; + }; + }; + + pcie6a_default: pcie6a-default-state { clkreq-n-pins { pins = "gpio153"; function = "pcie6a_clk"; @@ -943,15 +993,15 @@ pins = "gpio152"; function = "gpio"; drive-strength = <2>; - bias-pull-down; + bias-disable; }; wake-n-pins { - pins = "gpio154"; - function = "gpio"; - drive-strength = <2>; - bias-pull-up; - }; + pins = "gpio154"; + function = "gpio"; + drive-strength = <2>; + bias-pull-up; + }; }; tpad_default: tpad-default-state { diff --git a/arch/arm64/boot/dts/qcom/x1e80100-lenovo-yoga-slim7x.dts b/arch/arm64/boot/dts/qcom/x1e80100-lenovo-yoga-slim7x.dts index fbff558f5b07..1943bdbfb8c0 100644 --- a/arch/arm64/boot/dts/qcom/x1e80100-lenovo-yoga-slim7x.dts +++ b/arch/arm64/boot/dts/qcom/x1e80100-lenovo-yoga-slim7x.dts @@ -625,16 +625,31 @@ }; &pcie4 { + perst-gpios = <&tlmm 146 GPIO_ACTIVE_LOW>; + wake-gpios = <&tlmm 148 GPIO_ACTIVE_LOW>; + + pinctrl-0 = <&pcie4_default>; + pinctrl-names = "default"; + status = "okay"; }; &pcie4_phy { - vdda-phy-supply = <&vreg_l3j_0p8>; + vdda-phy-supply = <&vreg_l3i_0p8>; vdda-pll-supply = <&vreg_l3e_1p2>; status = "okay"; }; +&pcie4_port0 { + wifi@0 { + compatible = "pci17cb,1107"; + reg = <0x10000 0x0 0x0 0x0 0x0>; + + qcom,ath12k-calibration-variant = "LES790"; + }; +}; + &pcie6a { perst-gpios = <&tlmm 152 GPIO_ACTIVE_LOW>; wake-gpios = <&tlmm 154 GPIO_ACTIVE_LOW>; @@ -782,7 +797,30 @@ bias-disable; }; - pcie6a_default: pcie2a-default-state { + pcie4_default: pcie4-default-state { + clkreq-n-pins { + pins = "gpio147"; + function = "pcie4_clk"; + drive-strength = <2>; + bias-pull-up; + }; + + perst-n-pins { + pins = "gpio146"; + function = "gpio"; + drive-strength = <2>; + bias-disable; + }; + + wake-n-pins { + pins = "gpio148"; + function = "gpio"; + drive-strength = <2>; + bias-pull-up; + }; + }; + + pcie6a_default: pcie6a-default-state { clkreq-n-pins { pins = "gpio153"; function = "pcie6a_clk"; @@ -794,15 +832,15 @@ pins = "gpio152"; function = "gpio"; drive-strength = <2>; - bias-pull-down; + bias-disable; }; wake-n-pins { - pins = "gpio154"; - function = "gpio"; - drive-strength = <2>; - bias-pull-up; - }; + pins = "gpio154"; + function = "gpio"; + drive-strength = <2>; + bias-pull-up; + }; }; tpad_default: tpad-default-state { diff --git a/arch/arm64/boot/dts/qcom/x1e80100-qcp.dts b/arch/arm64/boot/dts/qcom/x1e80100-qcp.dts index 72a4f4138616..8098e6730ae5 100644 --- a/arch/arm64/boot/dts/qcom/x1e80100-qcp.dts +++ b/arch/arm64/boot/dts/qcom/x1e80100-qcp.dts @@ -606,6 +606,14 @@ }; }; +&gpu { + status = "okay"; + + zap-shader { + firmware-name = "qcom/x1e80100/gen70500_zap.mbn"; + }; +}; + &lpass_tlmm { spkr_01_sd_n_active: spkr-01-sd-n-active-state { pins = "gpio12"; @@ -660,11 +668,17 @@ }; &pcie4 { + perst-gpios = <&tlmm 146 GPIO_ACTIVE_LOW>; + wake-gpios = <&tlmm 148 GPIO_ACTIVE_LOW>; + + pinctrl-0 = <&pcie4_default>; + pinctrl-names = "default"; + status = "okay"; }; &pcie4_phy { - vdda-phy-supply = <&vreg_l3j_0p8>; + vdda-phy-supply = <&vreg_l3i_0p8>; vdda-pll-supply = <&vreg_l3e_1p2>; status = "okay"; @@ -804,7 +818,30 @@ bias-disable; }; - pcie6a_default: pcie2a-default-state { + pcie4_default: pcie4-default-state { + clkreq-n-pins { + pins = "gpio147"; + function = "pcie4_clk"; + drive-strength = <2>; + bias-pull-up; + }; + + perst-n-pins { + pins = "gpio146"; + function = "gpio"; + drive-strength = <2>; + bias-disable; + }; + + wake-n-pins { + pins = "gpio148"; + function = "gpio"; + drive-strength = <2>; + bias-pull-up; + }; + }; + + pcie6a_default: pcie6a-default-state { clkreq-n-pins { pins = "gpio153"; function = "pcie6a_clk"; @@ -816,15 +853,15 @@ pins = "gpio152"; function = "gpio"; drive-strength = <2>; - bias-pull-down; + bias-disable; }; wake-n-pins { - pins = "gpio154"; - function = "gpio"; - drive-strength = <2>; - bias-pull-up; - }; + pins = "gpio154"; + function = "gpio"; + drive-strength = <2>; + bias-pull-up; + }; }; wcd_default: wcd-reset-n-active-state { diff --git a/arch/arm64/boot/dts/qcom/x1e80100.dtsi b/arch/arm64/boot/dts/qcom/x1e80100.dtsi index 7bca5fcd7d52..cd732ef88cd8 100644 --- a/arch/arm64/boot/dts/qcom/x1e80100.dtsi +++ b/arch/arm64/boot/dts/qcom/x1e80100.dtsi @@ -2901,7 +2901,7 @@ dma-coherent; - linux,pci-domain = <7>; + linux,pci-domain = <6>; num-lanes = <2>; interrupts = <GIC_SPI 773 IRQ_TYPE_LEVEL_HIGH>, @@ -2959,6 +2959,7 @@ "link_down"; power-domains = <&gcc GCC_PCIE_6A_GDSC>; + required-opps = <&rpmhpd_opp_nom>; phys = <&pcie6a_phy>; phy-names = "pciephy"; @@ -3022,7 +3023,7 @@ dma-coherent; - linux,pci-domain = <5>; + linux,pci-domain = <4>; num-lanes = <2>; interrupts = <GIC_SPI 141 IRQ_TYPE_LEVEL_HIGH>, @@ -3080,11 +3081,22 @@ "link_down"; power-domains = <&gcc GCC_PCIE_4_GDSC>; + required-opps = <&rpmhpd_opp_nom>; phys = <&pcie4_phy>; phy-names = "pciephy"; status = "disabled"; + + pcie4_port0: pcie@0 { + device_type = "pci"; + reg = <0x0 0x0 0x0 0x0 0x0>; + bus-range = <0x01 0xff>; + + #address-cells = <3>; + #size-cells = <2>; + ranges; + }; }; pcie4_phy: phy@1c0e000 { @@ -3155,9 +3167,10 @@ interconnects = <&gem_noc MASTER_GFX3D 0 &mc_virt SLAVE_EBI1 0>; interconnect-names = "gfx-mem"; + status = "disabled"; + zap-shader { memory-region = <&gpu_microcode_mem>; - firmware-name = "qcom/gen70500_zap.mbn"; }; gpu_opp_table: opp-table { @@ -3288,7 +3301,7 @@ reg = <0x0 0x03da0000 0x0 0x40000>; #iommu-cells = <2>; #global-interrupts = <1>; - interrupts = <GIC_SPI 673 IRQ_TYPE_LEVEL_HIGH>, + interrupts = <GIC_SPI 674 IRQ_TYPE_LEVEL_HIGH>, <GIC_SPI 678 IRQ_TYPE_LEVEL_HIGH>, <GIC_SPI 679 IRQ_TYPE_LEVEL_HIGH>, <GIC_SPI 680 IRQ_TYPE_LEVEL_HIGH>, diff --git a/arch/arm64/boot/dts/rockchip/rk3328-rock-pi-e.dts b/arch/arm64/boot/dts/rockchip/rk3328-rock-pi-e.dts index a608a219543e..3e08e2fd0a78 100644 --- a/arch/arm64/boot/dts/rockchip/rk3328-rock-pi-e.dts +++ b/arch/arm64/boot/dts/rockchip/rk3328-rock-pi-e.dts @@ -387,7 +387,7 @@ pmic { pmic_int_l: pmic-int-l { - rockchip,pins = <2 RK_PA6 RK_FUNC_GPIO &pcfg_pull_up>; + rockchip,pins = <0 RK_PA2 RK_FUNC_GPIO &pcfg_pull_up>; }; }; diff --git a/arch/arm64/boot/dts/rockchip/rk3399-puma.dtsi b/arch/arm64/boot/dts/rockchip/rk3399-puma.dtsi index ccbe3a7a1d2c..d24444cdf54a 100644 --- a/arch/arm64/boot/dts/rockchip/rk3399-puma.dtsi +++ b/arch/arm64/boot/dts/rockchip/rk3399-puma.dtsi @@ -154,6 +154,22 @@ }; }; +&gpio3 { + /* + * The Qseven BIOS_DISABLE signal on the RK3399-Q7 keeps the on-module + * eMMC and SPI flash powered-down initially (in fact it keeps the + * reset signal asserted). BIOS_DISABLE_OVERRIDE pin allows to override + * that signal so that eMMC and SPI can be used regardless of the state + * of the signal. + */ + bios-disable-override-hog { + gpios = <RK_PD5 GPIO_ACTIVE_LOW>; + gpio-hog; + line-name = "bios_disable_override"; + output-high; + }; +}; + &gmac { assigned-clocks = <&cru SCLK_RMII_SRC>; assigned-clock-parents = <&clkin_gmac>; @@ -409,6 +425,7 @@ &i2s0 { pinctrl-0 = <&i2s0_2ch_bus>; + pinctrl-1 = <&i2s0_2ch_bus_bclk_off>; rockchip,playback-channels = <2>; rockchip,capture-channels = <2>; status = "okay"; @@ -417,8 +434,8 @@ /* * As Q7 does not specify neither a global nor a RX clock for I2S these * signals are not used. Furthermore I2S0_LRCK_RX is used as GPIO. - * Therefore we have to redefine the i2s0_2ch_bus definition to prevent - * conflicts. + * Therefore we have to redefine the i2s0_2ch_bus and i2s0_2ch_bus_bclk_off + * definitions to prevent conflicts. */ &i2s0_2ch_bus { rockchip,pins = @@ -428,6 +445,14 @@ <3 RK_PD7 1 &pcfg_pull_none>; }; +&i2s0_2ch_bus_bclk_off { + rockchip,pins = + <3 RK_PD0 RK_FUNC_GPIO &pcfg_pull_none>, + <3 RK_PD2 1 &pcfg_pull_none>, + <3 RK_PD3 1 &pcfg_pull_none>, + <3 RK_PD7 1 &pcfg_pull_none>; +}; + &io_domains { status = "okay"; bt656-supply = <&vcc_1v8>; @@ -449,9 +474,14 @@ &pinctrl { pinctrl-names = "default"; - pinctrl-0 = <&q7_thermal_pin>; + pinctrl-0 = <&q7_thermal_pin &bios_disable_override_hog_pin>; gpios { + bios_disable_override_hog_pin: bios-disable-override-hog-pin { + rockchip,pins = + <3 RK_PD5 RK_FUNC_GPIO &pcfg_pull_down>; + }; + q7_thermal_pin: q7-thermal-pin { rockchip,pins = <0 RK_PA3 RK_FUNC_GPIO &pcfg_pull_up>; diff --git a/arch/arm64/boot/dts/rockchip/rk356x.dtsi b/arch/arm64/boot/dts/rockchip/rk356x.dtsi index 4690be841a1c..c72b3a608edd 100644 --- a/arch/arm64/boot/dts/rockchip/rk356x.dtsi +++ b/arch/arm64/boot/dts/rockchip/rk356x.dtsi @@ -1592,10 +1592,9 @@ <&cru SRST_TSADCPHY>; rockchip,grf = <&grf>; rockchip,hw-tshut-temp = <95000>; - pinctrl-names = "init", "default", "sleep"; - pinctrl-0 = <&tsadc_pin>; - pinctrl-1 = <&tsadc_shutorg>; - pinctrl-2 = <&tsadc_pin>; + pinctrl-names = "default", "sleep"; + pinctrl-0 = <&tsadc_shutorg>; + pinctrl-1 = <&tsadc_pin>; #thermal-sensor-cells = <1>; status = "disabled"; }; diff --git a/arch/arm64/boot/dts/rockchip/rk3588-base.dtsi b/arch/arm64/boot/dts/rockchip/rk3588-base.dtsi index b6e4df180f0b..ee99166ebd46 100644 --- a/arch/arm64/boot/dts/rockchip/rk3588-base.dtsi +++ b/arch/arm64/boot/dts/rockchip/rk3588-base.dtsi @@ -582,14 +582,14 @@ }; vo0_grf: syscon@fd5a6000 { - compatible = "rockchip,rk3588-vo-grf", "syscon"; + compatible = "rockchip,rk3588-vo0-grf", "syscon"; reg = <0x0 0xfd5a6000 0x0 0x2000>; clocks = <&cru PCLK_VO0GRF>; }; vo1_grf: syscon@fd5a8000 { - compatible = "rockchip,rk3588-vo-grf", "syscon"; - reg = <0x0 0xfd5a8000 0x0 0x100>; + compatible = "rockchip,rk3588-vo1-grf", "syscon"; + reg = <0x0 0xfd5a8000 0x0 0x4000>; clocks = <&cru PCLK_VO1GRF>; }; diff --git a/arch/arm64/configs/defconfig b/arch/arm64/configs/defconfig index 7d32fca64996..362df9390263 100644 --- a/arch/arm64/configs/defconfig +++ b/arch/arm64/configs/defconfig @@ -887,6 +887,7 @@ CONFIG_DRM_PANEL_KHADAS_TS050=m CONFIG_DRM_PANEL_MANTIX_MLAF057WE51=m CONFIG_DRM_PANEL_NOVATEK_NT36672E=m CONFIG_DRM_PANEL_RAYDIUM_RM67191=m +CONFIG_DRM_PANEL_SAMSUNG_ATNA33XC20=m CONFIG_DRM_PANEL_SITRONIX_ST7703=m CONFIG_DRM_PANEL_TRULY_NT35597_WQXGA=m CONFIG_DRM_PANEL_VISIONOX_VTDR6130=m diff --git a/arch/arm64/crypto/poly1305-armv8.pl b/arch/arm64/crypto/poly1305-armv8.pl index cbc980fb02e3..22c9069c0650 100644 --- a/arch/arm64/crypto/poly1305-armv8.pl +++ b/arch/arm64/crypto/poly1305-armv8.pl @@ -473,7 +473,8 @@ poly1305_blocks_neon: subs $len,$len,#64 ldp x9,x13,[$inp,#48] add $in2,$inp,#96 - adr $zeros,.Lzeros + adrp $zeros,.Lzeros + add $zeros,$zeros,#:lo12:.Lzeros lsl $padbit,$padbit,#24 add x15,$ctx,#48 @@ -885,10 +886,13 @@ poly1305_blocks_neon: ret .size poly1305_blocks_neon,.-poly1305_blocks_neon +.pushsection .rodata .align 5 .Lzeros: .long 0,0,0,0,0,0,0,0 .asciz "Poly1305 for ARMv8, CRYPTOGAMS by \@dot-asm" +.popsection + .align 2 #if !defined(__KERNEL__) && !defined(_WIN64) .comm OPENSSL_armcap_P,4,4 diff --git a/arch/arm64/include/asm/arm_pmuv3.h b/arch/arm64/include/asm/arm_pmuv3.h index a4697a0b6835..468a049bc63b 100644 --- a/arch/arm64/include/asm/arm_pmuv3.h +++ b/arch/arm64/include/asm/arm_pmuv3.h @@ -33,6 +33,14 @@ static inline void write_pmevtypern(int n, unsigned long val) PMEVN_SWITCH(n, WRITE_PMEVTYPERN); } +#define RETURN_READ_PMEVTYPERN(n) \ + return read_sysreg(pmevtyper##n##_el0) +static inline unsigned long read_pmevtypern(int n) +{ + PMEVN_SWITCH(n, RETURN_READ_PMEVTYPERN); + return 0; +} + static inline unsigned long read_pmmir(void) { return read_cpuid(PMMIR_EL1); @@ -46,6 +54,14 @@ static inline u32 read_pmuver(void) ID_AA64DFR0_EL1_PMUVer_SHIFT); } +static inline bool pmuv3_has_icntr(void) +{ + u64 dfr1 = read_sysreg(id_aa64dfr1_el1); + + return !!cpuid_feature_extract_unsigned_field(dfr1, + ID_AA64DFR1_EL1_PMICNTR_SHIFT); +} + static inline void write_pmcr(u64 val) { write_sysreg(val, pmcr_el0); @@ -71,22 +87,32 @@ static inline u64 read_pmccntr(void) return read_sysreg(pmccntr_el0); } -static inline void write_pmcntenset(u32 val) +static inline void write_pmicntr(u64 val) +{ + write_sysreg_s(val, SYS_PMICNTR_EL0); +} + +static inline u64 read_pmicntr(void) +{ + return read_sysreg_s(SYS_PMICNTR_EL0); +} + +static inline void write_pmcntenset(u64 val) { write_sysreg(val, pmcntenset_el0); } -static inline void write_pmcntenclr(u32 val) +static inline void write_pmcntenclr(u64 val) { write_sysreg(val, pmcntenclr_el0); } -static inline void write_pmintenset(u32 val) +static inline void write_pmintenset(u64 val) { write_sysreg(val, pmintenset_el1); } -static inline void write_pmintenclr(u32 val) +static inline void write_pmintenclr(u64 val) { write_sysreg(val, pmintenclr_el1); } @@ -96,12 +122,27 @@ static inline void write_pmccfiltr(u64 val) write_sysreg(val, pmccfiltr_el0); } -static inline void write_pmovsclr(u32 val) +static inline u64 read_pmccfiltr(void) +{ + return read_sysreg(pmccfiltr_el0); +} + +static inline void write_pmicfiltr(u64 val) +{ + write_sysreg_s(val, SYS_PMICFILTR_EL0); +} + +static inline u64 read_pmicfiltr(void) +{ + return read_sysreg_s(SYS_PMICFILTR_EL0); +} + +static inline void write_pmovsclr(u64 val) { write_sysreg(val, pmovsclr_el0); } -static inline u32 read_pmovsclr(void) +static inline u64 read_pmovsclr(void) { return read_sysreg(pmovsclr_el0); } diff --git a/arch/arm64/include/asm/cpufeature.h b/arch/arm64/include/asm/cpufeature.h index 558434267271..3d261cc123c1 100644 --- a/arch/arm64/include/asm/cpufeature.h +++ b/arch/arm64/include/asm/cpufeature.h @@ -832,6 +832,12 @@ static inline bool system_supports_lpa2(void) return cpus_have_final_cap(ARM64_HAS_LPA2); } +static inline bool system_supports_poe(void) +{ + return IS_ENABLED(CONFIG_ARM64_POE) && + alternative_has_cap_unlikely(ARM64_HAS_S1POE); +} + int do_emulate_mrs(struct pt_regs *regs, u32 sys_reg, u32 rt); bool try_emulate_mrs(struct pt_regs *regs, u32 isn); diff --git a/arch/arm64/include/asm/cputype.h b/arch/arm64/include/asm/cputype.h index 5fd7caea4419..5a7dfeb8e8eb 100644 --- a/arch/arm64/include/asm/cputype.h +++ b/arch/arm64/include/asm/cputype.h @@ -143,6 +143,7 @@ #define APPLE_CPU_PART_M2_AVALANCHE_MAX 0x039 #define AMPERE_CPU_PART_AMPERE1 0xAC3 +#define AMPERE_CPU_PART_AMPERE1A 0xAC4 #define MICROSOFT_CPU_PART_AZURE_COBALT_100 0xD49 /* Based on r0p0 of ARM Neoverse N2 */ @@ -212,6 +213,7 @@ #define MIDR_APPLE_M2_BLIZZARD_MAX MIDR_CPU_MODEL(ARM_CPU_IMP_APPLE, APPLE_CPU_PART_M2_BLIZZARD_MAX) #define MIDR_APPLE_M2_AVALANCHE_MAX MIDR_CPU_MODEL(ARM_CPU_IMP_APPLE, APPLE_CPU_PART_M2_AVALANCHE_MAX) #define MIDR_AMPERE1 MIDR_CPU_MODEL(ARM_CPU_IMP_AMPERE, AMPERE_CPU_PART_AMPERE1) +#define MIDR_AMPERE1A MIDR_CPU_MODEL(ARM_CPU_IMP_AMPERE, AMPERE_CPU_PART_AMPERE1A) #define MIDR_MICROSOFT_AZURE_COBALT_100 MIDR_CPU_MODEL(ARM_CPU_IMP_MICROSOFT, MICROSOFT_CPU_PART_AZURE_COBALT_100) /* Fujitsu Erratum 010001 affects A64FX 1.0 and 1.1, (v0r0 and v1r0) */ diff --git a/arch/arm64/include/asm/el2_setup.h b/arch/arm64/include/asm/el2_setup.h index fd87c4b8f984..e0ffdf13a18b 100644 --- a/arch/arm64/include/asm/el2_setup.h +++ b/arch/arm64/include/asm/el2_setup.h @@ -165,42 +165,53 @@ mrs x1, id_aa64dfr0_el1 ubfx x1, x1, #ID_AA64DFR0_EL1_PMSVer_SHIFT, #4 cmp x1, #3 - b.lt .Lset_debug_fgt_\@ + b.lt .Lskip_spe_fgt_\@ /* Disable PMSNEVFR_EL1 read and write traps */ orr x0, x0, #(1 << 62) -.Lset_debug_fgt_\@: +.Lskip_spe_fgt_\@: msr_s SYS_HDFGRTR_EL2, x0 msr_s SYS_HDFGWTR_EL2, x0 mov x0, xzr mrs x1, id_aa64pfr1_el1 ubfx x1, x1, #ID_AA64PFR1_EL1_SME_SHIFT, #4 - cbz x1, .Lset_pie_fgt_\@ + cbz x1, .Lskip_debug_fgt_\@ /* Disable nVHE traps of TPIDR2 and SMPRI */ orr x0, x0, #HFGxTR_EL2_nSMPRI_EL1_MASK orr x0, x0, #HFGxTR_EL2_nTPIDR2_EL0_MASK -.Lset_pie_fgt_\@: +.Lskip_debug_fgt_\@: mrs_s x1, SYS_ID_AA64MMFR3_EL1 ubfx x1, x1, #ID_AA64MMFR3_EL1_S1PIE_SHIFT, #4 - cbz x1, .Lset_fgt_\@ + cbz x1, .Lskip_pie_fgt_\@ /* Disable trapping of PIR_EL1 / PIRE0_EL1 */ orr x0, x0, #HFGxTR_EL2_nPIR_EL1 orr x0, x0, #HFGxTR_EL2_nPIRE0_EL1 -.Lset_fgt_\@: +.Lskip_pie_fgt_\@: + mrs_s x1, SYS_ID_AA64MMFR3_EL1 + ubfx x1, x1, #ID_AA64MMFR3_EL1_S1POE_SHIFT, #4 + cbz x1, .Lskip_poe_fgt_\@ + + /* Disable trapping of POR_EL0 */ + orr x0, x0, #HFGxTR_EL2_nPOR_EL0 + +.Lskip_poe_fgt_\@: msr_s SYS_HFGRTR_EL2, x0 msr_s SYS_HFGWTR_EL2, x0 msr_s SYS_HFGITR_EL2, xzr mrs x1, id_aa64pfr0_el1 // AMU traps UNDEF without AMU ubfx x1, x1, #ID_AA64PFR0_EL1_AMU_SHIFT, #4 - cbz x1, .Lskip_fgt_\@ + cbz x1, .Lskip_amu_fgt_\@ msr_s SYS_HAFGRTR_EL2, xzr + +.Lskip_amu_fgt_\@: + .Lskip_fgt_\@: .endm diff --git a/arch/arm64/include/asm/esr.h b/arch/arm64/include/asm/esr.h index 56c148890daf..da6d2c1c0b03 100644 --- a/arch/arm64/include/asm/esr.h +++ b/arch/arm64/include/asm/esr.h @@ -10,63 +10,63 @@ #include <asm/memory.h> #include <asm/sysreg.h> -#define ESR_ELx_EC_UNKNOWN (0x00) -#define ESR_ELx_EC_WFx (0x01) +#define ESR_ELx_EC_UNKNOWN UL(0x00) +#define ESR_ELx_EC_WFx UL(0x01) /* Unallocated EC: 0x02 */ -#define ESR_ELx_EC_CP15_32 (0x03) -#define ESR_ELx_EC_CP15_64 (0x04) -#define ESR_ELx_EC_CP14_MR (0x05) -#define ESR_ELx_EC_CP14_LS (0x06) -#define ESR_ELx_EC_FP_ASIMD (0x07) -#define ESR_ELx_EC_CP10_ID (0x08) /* EL2 only */ -#define ESR_ELx_EC_PAC (0x09) /* EL2 and above */ +#define ESR_ELx_EC_CP15_32 UL(0x03) +#define ESR_ELx_EC_CP15_64 UL(0x04) +#define ESR_ELx_EC_CP14_MR UL(0x05) +#define ESR_ELx_EC_CP14_LS UL(0x06) +#define ESR_ELx_EC_FP_ASIMD UL(0x07) +#define ESR_ELx_EC_CP10_ID UL(0x08) /* EL2 only */ +#define ESR_ELx_EC_PAC UL(0x09) /* EL2 and above */ /* Unallocated EC: 0x0A - 0x0B */ -#define ESR_ELx_EC_CP14_64 (0x0C) -#define ESR_ELx_EC_BTI (0x0D) -#define ESR_ELx_EC_ILL (0x0E) +#define ESR_ELx_EC_CP14_64 UL(0x0C) +#define ESR_ELx_EC_BTI UL(0x0D) +#define ESR_ELx_EC_ILL UL(0x0E) /* Unallocated EC: 0x0F - 0x10 */ -#define ESR_ELx_EC_SVC32 (0x11) -#define ESR_ELx_EC_HVC32 (0x12) /* EL2 only */ -#define ESR_ELx_EC_SMC32 (0x13) /* EL2 and above */ +#define ESR_ELx_EC_SVC32 UL(0x11) +#define ESR_ELx_EC_HVC32 UL(0x12) /* EL2 only */ +#define ESR_ELx_EC_SMC32 UL(0x13) /* EL2 and above */ /* Unallocated EC: 0x14 */ -#define ESR_ELx_EC_SVC64 (0x15) -#define ESR_ELx_EC_HVC64 (0x16) /* EL2 and above */ -#define ESR_ELx_EC_SMC64 (0x17) /* EL2 and above */ -#define ESR_ELx_EC_SYS64 (0x18) -#define ESR_ELx_EC_SVE (0x19) -#define ESR_ELx_EC_ERET (0x1a) /* EL2 only */ +#define ESR_ELx_EC_SVC64 UL(0x15) +#define ESR_ELx_EC_HVC64 UL(0x16) /* EL2 and above */ +#define ESR_ELx_EC_SMC64 UL(0x17) /* EL2 and above */ +#define ESR_ELx_EC_SYS64 UL(0x18) +#define ESR_ELx_EC_SVE UL(0x19) +#define ESR_ELx_EC_ERET UL(0x1a) /* EL2 only */ /* Unallocated EC: 0x1B */ -#define ESR_ELx_EC_FPAC (0x1C) /* EL1 and above */ -#define ESR_ELx_EC_SME (0x1D) +#define ESR_ELx_EC_FPAC UL(0x1C) /* EL1 and above */ +#define ESR_ELx_EC_SME UL(0x1D) /* Unallocated EC: 0x1E */ -#define ESR_ELx_EC_IMP_DEF (0x1f) /* EL3 only */ -#define ESR_ELx_EC_IABT_LOW (0x20) -#define ESR_ELx_EC_IABT_CUR (0x21) -#define ESR_ELx_EC_PC_ALIGN (0x22) +#define ESR_ELx_EC_IMP_DEF UL(0x1f) /* EL3 only */ +#define ESR_ELx_EC_IABT_LOW UL(0x20) +#define ESR_ELx_EC_IABT_CUR UL(0x21) +#define ESR_ELx_EC_PC_ALIGN UL(0x22) /* Unallocated EC: 0x23 */ -#define ESR_ELx_EC_DABT_LOW (0x24) -#define ESR_ELx_EC_DABT_CUR (0x25) -#define ESR_ELx_EC_SP_ALIGN (0x26) -#define ESR_ELx_EC_MOPS (0x27) -#define ESR_ELx_EC_FP_EXC32 (0x28) +#define ESR_ELx_EC_DABT_LOW UL(0x24) +#define ESR_ELx_EC_DABT_CUR UL(0x25) +#define ESR_ELx_EC_SP_ALIGN UL(0x26) +#define ESR_ELx_EC_MOPS UL(0x27) +#define ESR_ELx_EC_FP_EXC32 UL(0x28) /* Unallocated EC: 0x29 - 0x2B */ -#define ESR_ELx_EC_FP_EXC64 (0x2C) +#define ESR_ELx_EC_FP_EXC64 UL(0x2C) /* Unallocated EC: 0x2D - 0x2E */ -#define ESR_ELx_EC_SERROR (0x2F) -#define ESR_ELx_EC_BREAKPT_LOW (0x30) -#define ESR_ELx_EC_BREAKPT_CUR (0x31) -#define ESR_ELx_EC_SOFTSTP_LOW (0x32) -#define ESR_ELx_EC_SOFTSTP_CUR (0x33) -#define ESR_ELx_EC_WATCHPT_LOW (0x34) -#define ESR_ELx_EC_WATCHPT_CUR (0x35) +#define ESR_ELx_EC_SERROR UL(0x2F) +#define ESR_ELx_EC_BREAKPT_LOW UL(0x30) +#define ESR_ELx_EC_BREAKPT_CUR UL(0x31) +#define ESR_ELx_EC_SOFTSTP_LOW UL(0x32) +#define ESR_ELx_EC_SOFTSTP_CUR UL(0x33) +#define ESR_ELx_EC_WATCHPT_LOW UL(0x34) +#define ESR_ELx_EC_WATCHPT_CUR UL(0x35) /* Unallocated EC: 0x36 - 0x37 */ -#define ESR_ELx_EC_BKPT32 (0x38) +#define ESR_ELx_EC_BKPT32 UL(0x38) /* Unallocated EC: 0x39 */ -#define ESR_ELx_EC_VECTOR32 (0x3A) /* EL2 only */ +#define ESR_ELx_EC_VECTOR32 UL(0x3A) /* EL2 only */ /* Unallocated EC: 0x3B */ -#define ESR_ELx_EC_BRK64 (0x3C) +#define ESR_ELx_EC_BRK64 UL(0x3C) /* Unallocated EC: 0x3D - 0x3F */ -#define ESR_ELx_EC_MAX (0x3F) +#define ESR_ELx_EC_MAX UL(0x3F) #define ESR_ELx_EC_SHIFT (26) #define ESR_ELx_EC_WIDTH (6) @@ -122,8 +122,8 @@ #define ESR_ELx_FSC_SECC_TTW(n) (0x1c + (n)) /* Status codes for individual page table levels */ -#define ESR_ELx_FSC_ACCESS_L(n) (ESR_ELx_FSC_ACCESS + n) -#define ESR_ELx_FSC_PERM_L(n) (ESR_ELx_FSC_PERM + n) +#define ESR_ELx_FSC_ACCESS_L(n) (ESR_ELx_FSC_ACCESS + (n)) +#define ESR_ELx_FSC_PERM_L(n) (ESR_ELx_FSC_PERM + (n)) #define ESR_ELx_FSC_FAULT_nL (0x2C) #define ESR_ELx_FSC_FAULT_L(n) (((n) < 0 ? ESR_ELx_FSC_FAULT_nL : \ @@ -161,6 +161,7 @@ /* ISS field definitions for exceptions taken in to Hyp */ #define ESR_ELx_FSC_ADDRSZ (0x00) +#define ESR_ELx_FSC_ADDRSZ_L(n) (ESR_ELx_FSC_ADDRSZ + (n)) #define ESR_ELx_CV (UL(1) << 24) #define ESR_ELx_COND_SHIFT (20) #define ESR_ELx_COND_MASK (UL(0xF) << ESR_ELx_COND_SHIFT) diff --git a/arch/arm64/include/asm/fpsimd.h b/arch/arm64/include/asm/fpsimd.h index bc69ac368d73..f2a84efc3618 100644 --- a/arch/arm64/include/asm/fpsimd.h +++ b/arch/arm64/include/asm/fpsimd.h @@ -155,8 +155,6 @@ extern void cpu_enable_sme2(const struct arm64_cpu_capabilities *__unused); extern void cpu_enable_fa64(const struct arm64_cpu_capabilities *__unused); extern void cpu_enable_fpmr(const struct arm64_cpu_capabilities *__unused); -extern u64 read_smcr_features(void); - /* * Helpers to translate bit indices in sve_vq_map to VQ values (and * vice versa). This allows find_next_bit() to be used to find the diff --git a/arch/arm64/include/asm/hwcap.h b/arch/arm64/include/asm/hwcap.h index 4edd3b61df11..a775adddecf2 100644 --- a/arch/arm64/include/asm/hwcap.h +++ b/arch/arm64/include/asm/hwcap.h @@ -157,6 +157,7 @@ #define KERNEL_HWCAP_SME_SF8FMA __khwcap2_feature(SME_SF8FMA) #define KERNEL_HWCAP_SME_SF8DP4 __khwcap2_feature(SME_SF8DP4) #define KERNEL_HWCAP_SME_SF8DP2 __khwcap2_feature(SME_SF8DP2) +#define KERNEL_HWCAP_POE __khwcap2_feature(POE) /* * This yields a mask that user programs can use to figure out what diff --git a/arch/arm64/include/asm/hypervisor.h b/arch/arm64/include/asm/hypervisor.h index 0ae427f352c8..409e239834d1 100644 --- a/arch/arm64/include/asm/hypervisor.h +++ b/arch/arm64/include/asm/hypervisor.h @@ -7,4 +7,15 @@ void kvm_init_hyp_services(void); bool kvm_arm_hyp_service_available(u32 func_id); +#ifdef CONFIG_ARM_PKVM_GUEST +void pkvm_init_hyp_services(void); +#else +static inline void pkvm_init_hyp_services(void) { }; +#endif + +static inline void kvm_arch_init_hyp_services(void) +{ + pkvm_init_hyp_services(); +}; + #endif diff --git a/arch/arm64/include/asm/io.h b/arch/arm64/include/asm/io.h index 41fd90895dfc..1ada23a6ec19 100644 --- a/arch/arm64/include/asm/io.h +++ b/arch/arm64/include/asm/io.h @@ -271,6 +271,10 @@ __iowrite64_copy(void __iomem *to, const void *from, size_t count) * I/O memory mapping functions. */ +typedef int (*ioremap_prot_hook_t)(phys_addr_t phys_addr, size_t size, + pgprot_t *prot); +int arm64_ioremap_prot_hook_register(const ioremap_prot_hook_t hook); + #define ioremap_prot ioremap_prot #define _PAGE_IOREMAP PROT_DEVICE_nGnRE diff --git a/arch/arm64/include/asm/kvm_arm.h b/arch/arm64/include/asm/kvm_arm.h index d81cc746e0eb..109a85ee6910 100644 --- a/arch/arm64/include/asm/kvm_arm.h +++ b/arch/arm64/include/asm/kvm_arm.h @@ -107,6 +107,7 @@ /* TCR_EL2 Registers bits */ #define TCR_EL2_DS (1UL << 32) #define TCR_EL2_RES1 ((1U << 31) | (1 << 23)) +#define TCR_EL2_HPD (1 << 24) #define TCR_EL2_TBI (1 << 20) #define TCR_EL2_PS_SHIFT 16 #define TCR_EL2_PS_MASK (7 << TCR_EL2_PS_SHIFT) diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h index 2181a11b9d92..b36a3b6cc011 100644 --- a/arch/arm64/include/asm/kvm_asm.h +++ b/arch/arm64/include/asm/kvm_asm.h @@ -10,6 +10,7 @@ #include <asm/hyp_image.h> #include <asm/insn.h> #include <asm/virt.h> +#include <asm/sysreg.h> #define ARM_EXIT_WITH_SERROR_BIT 31 #define ARM_EXCEPTION_CODE(x) ((x) & ~(1U << ARM_EXIT_WITH_SERROR_BIT)) @@ -235,6 +236,9 @@ extern void __kvm_tlb_flush_vmid(struct kvm_s2_mmu *mmu); extern int __kvm_tlbi_s1e2(struct kvm_s2_mmu *mmu, u64 va, u64 sys_encoding); extern void __kvm_timer_set_cntvoff(u64 cntvoff); +extern void __kvm_at_s1e01(struct kvm_vcpu *vcpu, u32 op, u64 vaddr); +extern void __kvm_at_s1e2(struct kvm_vcpu *vcpu, u32 op, u64 vaddr); +extern void __kvm_at_s12(struct kvm_vcpu *vcpu, u32 op, u64 vaddr); extern int __kvm_vcpu_run(struct kvm_vcpu *vcpu); @@ -259,7 +263,7 @@ extern u64 __kvm_get_mdcr_el2(void); asm volatile( \ " mrs %1, spsr_el2\n" \ " mrs %2, elr_el2\n" \ - "1: at "at_op", %3\n" \ + "1: " __msr_s(at_op, "%3") "\n" \ " isb\n" \ " b 9f\n" \ "2: msr spsr_el2, %1\n" \ diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index a33f5996ca9f..329619c6fa96 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -446,6 +446,12 @@ enum vcpu_sysreg { GCR_EL1, /* Tag Control Register */ TFSRE0_EL1, /* Tag Fault Status Register (EL0) */ + POR_EL0, /* Permission Overlay Register 0 (EL0) */ + + /* FP/SIMD/SVE */ + SVCR, + FPMR, + /* 32bit specific registers. */ DACR32_EL2, /* Domain Access Control Register */ IFSR32_EL2, /* Instruction Fault Status Register */ @@ -517,6 +523,8 @@ enum vcpu_sysreg { VNCR(PIR_EL1), /* Permission Indirection Register 1 (EL1) */ VNCR(PIRE0_EL1), /* Permission Indirection Register 0 (EL1) */ + VNCR(POR_EL1), /* Permission Overlay Register 1 (EL1) */ + VNCR(HFGRTR_EL2), VNCR(HFGWTR_EL2), VNCR(HFGITR_EL2), @@ -530,6 +538,8 @@ enum vcpu_sysreg { VNCR(CNTP_CVAL_EL0), VNCR(CNTP_CTL_EL0), + VNCR(ICH_HCR_EL2), + NR_SYS_REGS /* Nothing after this line! */ }; @@ -595,6 +605,16 @@ struct kvm_host_data { struct cpu_sve_state *sve_state; }; + union { + /* HYP VA pointer to the host storage for FPMR */ + u64 *fpmr_ptr; + /* + * Used by pKVM only, as it needs to provide storage + * for the host + */ + u64 fpmr; + }; + /* Ownership of the FP regs */ enum { FP_STATE_FREE, @@ -664,8 +684,6 @@ struct kvm_vcpu_arch { void *sve_state; enum fp_type fp_type; unsigned int sve_max_vl; - u64 svcr; - u64 fpmr; /* Stage 2 paging state used by the hardware on next switch */ struct kvm_s2_mmu *hw_mmu; @@ -1330,12 +1348,12 @@ void kvm_arch_vcpu_load_debug_state_flags(struct kvm_vcpu *vcpu); void kvm_arch_vcpu_put_debug_state_flags(struct kvm_vcpu *vcpu); #ifdef CONFIG_KVM -void kvm_set_pmu_events(u32 set, struct perf_event_attr *attr); -void kvm_clr_pmu_events(u32 clr); +void kvm_set_pmu_events(u64 set, struct perf_event_attr *attr); +void kvm_clr_pmu_events(u64 clr); bool kvm_set_pmuserenr(u64 val); #else -static inline void kvm_set_pmu_events(u32 set, struct perf_event_attr *attr) {} -static inline void kvm_clr_pmu_events(u32 clr) {} +static inline void kvm_set_pmu_events(u64 set, struct perf_event_attr *attr) {} +static inline void kvm_clr_pmu_events(u64 clr) {} static inline bool kvm_set_pmuserenr(u64 val) { return false; @@ -1473,4 +1491,8 @@ void kvm_set_vm_id_reg(struct kvm *kvm, u32 reg, u64 val); (pa + pi + pa3) == 1; \ }) +#define kvm_has_fpmr(k) \ + (system_supports_fpmr() && \ + kvm_has_feat((k), ID_AA64PFR2_EL1, FPMR, IMP)) + #endif /* __ARM64_KVM_HOST_H__ */ diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h index 216ca424bb16..cd4087fbda9a 100644 --- a/arch/arm64/include/asm/kvm_mmu.h +++ b/arch/arm64/include/asm/kvm_mmu.h @@ -352,5 +352,11 @@ static inline bool kvm_is_nested_s2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu) return &kvm->arch.mmu != mmu; } +#ifdef CONFIG_PTDUMP_STAGE2_DEBUGFS +void kvm_s2_ptdump_create_debugfs(struct kvm *kvm); +#else +static inline void kvm_s2_ptdump_create_debugfs(struct kvm *kvm) {} +#endif /* CONFIG_PTDUMP_STAGE2_DEBUGFS */ + #endif /* __ASSEMBLY__ */ #endif /* __ARM64_KVM_MMU_H__ */ diff --git a/arch/arm64/include/asm/kvm_nested.h b/arch/arm64/include/asm/kvm_nested.h index 5b06c31035a2..e8bc6d67aba2 100644 --- a/arch/arm64/include/asm/kvm_nested.h +++ b/arch/arm64/include/asm/kvm_nested.h @@ -85,7 +85,7 @@ struct kvm_s2_trans { bool readable; int level; u32 esr; - u64 upper_attr; + u64 desc; }; static inline phys_addr_t kvm_s2_trans_output(struct kvm_s2_trans *trans) @@ -115,7 +115,7 @@ static inline bool kvm_s2_trans_writable(struct kvm_s2_trans *trans) static inline bool kvm_s2_trans_executable(struct kvm_s2_trans *trans) { - return !(trans->upper_attr & BIT(54)); + return !(trans->desc & BIT(54)); } extern int kvm_walk_nested_s2(struct kvm_vcpu *vcpu, phys_addr_t gipa, @@ -205,4 +205,40 @@ static inline u64 kvm_encode_nested_level(struct kvm_s2_trans *trans) return FIELD_PREP(KVM_NV_GUEST_MAP_SZ, trans->level); } +/* Adjust alignment for the contiguous bit as per StageOA() */ +#define contiguous_bit_shift(d, wi, l) \ + ({ \ + u8 shift = 0; \ + \ + if ((d) & PTE_CONT) { \ + switch (BIT((wi)->pgshift)) { \ + case SZ_4K: \ + shift = 4; \ + break; \ + case SZ_16K: \ + shift = (l) == 2 ? 5 : 7; \ + break; \ + case SZ_64K: \ + shift = 5; \ + break; \ + } \ + } \ + \ + shift; \ + }) + +static inline unsigned int ps_to_output_size(unsigned int ps) +{ + switch (ps) { + case 0: return 32; + case 1: return 36; + case 2: return 40; + case 3: return 42; + case 4: return 44; + case 5: + default: + return 48; + } +} + #endif /* __ARM64_KVM_NESTED_H */ diff --git a/arch/arm64/include/asm/kvm_pgtable.h b/arch/arm64/include/asm/kvm_pgtable.h index 19278dfe7978..03f4c3d7839c 100644 --- a/arch/arm64/include/asm/kvm_pgtable.h +++ b/arch/arm64/include/asm/kvm_pgtable.h @@ -59,6 +59,48 @@ typedef u64 kvm_pte_t; #define KVM_PHYS_INVALID (-1ULL) +#define KVM_PTE_LEAF_ATTR_LO GENMASK(11, 2) + +#define KVM_PTE_LEAF_ATTR_LO_S1_ATTRIDX GENMASK(4, 2) +#define KVM_PTE_LEAF_ATTR_LO_S1_AP GENMASK(7, 6) +#define KVM_PTE_LEAF_ATTR_LO_S1_AP_RO \ + ({ cpus_have_final_cap(ARM64_KVM_HVHE) ? 2 : 3; }) +#define KVM_PTE_LEAF_ATTR_LO_S1_AP_RW \ + ({ cpus_have_final_cap(ARM64_KVM_HVHE) ? 0 : 1; }) +#define KVM_PTE_LEAF_ATTR_LO_S1_SH GENMASK(9, 8) +#define KVM_PTE_LEAF_ATTR_LO_S1_SH_IS 3 +#define KVM_PTE_LEAF_ATTR_LO_S1_AF BIT(10) + +#define KVM_PTE_LEAF_ATTR_LO_S2_MEMATTR GENMASK(5, 2) +#define KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R BIT(6) +#define KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W BIT(7) +#define KVM_PTE_LEAF_ATTR_LO_S2_SH GENMASK(9, 8) +#define KVM_PTE_LEAF_ATTR_LO_S2_SH_IS 3 +#define KVM_PTE_LEAF_ATTR_LO_S2_AF BIT(10) + +#define KVM_PTE_LEAF_ATTR_HI GENMASK(63, 50) + +#define KVM_PTE_LEAF_ATTR_HI_SW GENMASK(58, 55) + +#define KVM_PTE_LEAF_ATTR_HI_S1_XN BIT(54) + +#define KVM_PTE_LEAF_ATTR_HI_S2_XN BIT(54) + +#define KVM_PTE_LEAF_ATTR_HI_S1_GP BIT(50) + +#define KVM_PTE_LEAF_ATTR_S2_PERMS (KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R | \ + KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W | \ + KVM_PTE_LEAF_ATTR_HI_S2_XN) + +#define KVM_INVALID_PTE_OWNER_MASK GENMASK(9, 2) +#define KVM_MAX_OWNER_ID 1 + +/* + * Used to indicate a pte for which a 'break-before-make' sequence is in + * progress. + */ +#define KVM_INVALID_PTE_LOCKED BIT(10) + static inline bool kvm_pte_valid(kvm_pte_t pte) { return pte & KVM_PTE_VALID; diff --git a/arch/arm64/include/asm/mem_encrypt.h b/arch/arm64/include/asm/mem_encrypt.h new file mode 100644 index 000000000000..b0c9a86b13a4 --- /dev/null +++ b/arch/arm64/include/asm/mem_encrypt.h @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +#ifndef __ASM_MEM_ENCRYPT_H +#define __ASM_MEM_ENCRYPT_H + +struct arm64_mem_crypt_ops { + int (*encrypt)(unsigned long addr, int numpages); + int (*decrypt)(unsigned long addr, int numpages); +}; + +int arm64_mem_crypt_ops_register(const struct arm64_mem_crypt_ops *ops); + +int set_memory_encrypted(unsigned long addr, int numpages); +int set_memory_decrypted(unsigned long addr, int numpages); + +#endif /* __ASM_MEM_ENCRYPT_H */ diff --git a/arch/arm64/include/asm/memory.h b/arch/arm64/include/asm/memory.h index 54fb014eba05..0480c61dbb4f 100644 --- a/arch/arm64/include/asm/memory.h +++ b/arch/arm64/include/asm/memory.h @@ -110,6 +110,8 @@ #define PAGE_END (_PAGE_END(VA_BITS_MIN)) #endif /* CONFIG_KASAN */ +#define PHYSMEM_END __pa(PAGE_END - 1) + #define MIN_THREAD_SHIFT (14 + KASAN_THREAD_SHIFT) /* diff --git a/arch/arm64/include/asm/mman.h b/arch/arm64/include/asm/mman.h index 5966ee4a6154..52791715f6e6 100644 --- a/arch/arm64/include/asm/mman.h +++ b/arch/arm64/include/asm/mman.h @@ -7,7 +7,7 @@ #include <uapi/asm/mman.h> static inline unsigned long arch_calc_vm_prot_bits(unsigned long prot, - unsigned long pkey __always_unused) + unsigned long pkey) { unsigned long ret = 0; @@ -17,6 +17,14 @@ static inline unsigned long arch_calc_vm_prot_bits(unsigned long prot, if (system_supports_mte() && (prot & PROT_MTE)) ret |= VM_MTE; +#ifdef CONFIG_ARCH_HAS_PKEYS + if (system_supports_poe()) { + ret |= pkey & BIT(0) ? VM_PKEY_BIT0 : 0; + ret |= pkey & BIT(1) ? VM_PKEY_BIT1 : 0; + ret |= pkey & BIT(2) ? VM_PKEY_BIT2 : 0; + } +#endif + return ret; } #define arch_calc_vm_prot_bits(prot, pkey) arch_calc_vm_prot_bits(prot, pkey) diff --git a/arch/arm64/include/asm/mmu.h b/arch/arm64/include/asm/mmu.h index 65977c7783c5..2ec96d91acc6 100644 --- a/arch/arm64/include/asm/mmu.h +++ b/arch/arm64/include/asm/mmu.h @@ -25,6 +25,7 @@ typedef struct { refcount_t pinned; void *vdso; unsigned long flags; + u8 pkey_allocation_map; } mm_context_t; /* @@ -63,7 +64,6 @@ static inline bool arm64_kernel_unmapped_at_el0(void) extern void arm64_memblock_init(void); extern void paging_init(void); extern void bootmem_init(void); -extern void __iomem *early_io_map(phys_addr_t phys, unsigned long virt); extern void create_mapping_noalloc(phys_addr_t phys, unsigned long virt, phys_addr_t size, pgprot_t prot); extern void create_pgd_mapping(struct mm_struct *mm, phys_addr_t phys, diff --git a/arch/arm64/include/asm/mmu_context.h b/arch/arm64/include/asm/mmu_context.h index bd19f4c758b7..7c09d47e09cb 100644 --- a/arch/arm64/include/asm/mmu_context.h +++ b/arch/arm64/include/asm/mmu_context.h @@ -15,12 +15,12 @@ #include <linux/sched/hotplug.h> #include <linux/mm_types.h> #include <linux/pgtable.h> +#include <linux/pkeys.h> #include <asm/cacheflush.h> #include <asm/cpufeature.h> #include <asm/daifflags.h> #include <asm/proc-fns.h> -#include <asm-generic/mm_hooks.h> #include <asm/cputype.h> #include <asm/sysreg.h> #include <asm/tlbflush.h> @@ -175,9 +175,36 @@ init_new_context(struct task_struct *tsk, struct mm_struct *mm) { atomic64_set(&mm->context.id, 0); refcount_set(&mm->context.pinned, 0); + + /* pkey 0 is the default, so always reserve it. */ + mm->context.pkey_allocation_map = BIT(0); + + return 0; +} + +static inline void arch_dup_pkeys(struct mm_struct *oldmm, + struct mm_struct *mm) +{ + /* Duplicate the oldmm pkey state in mm: */ + mm->context.pkey_allocation_map = oldmm->context.pkey_allocation_map; +} + +static inline int arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm) +{ + arch_dup_pkeys(oldmm, mm); + return 0; } +static inline void arch_exit_mmap(struct mm_struct *mm) +{ +} + +static inline void arch_unmap(struct mm_struct *mm, + unsigned long start, unsigned long end) +{ +} + #ifdef CONFIG_ARM64_SW_TTBR0_PAN static inline void update_saved_ttbr0(struct task_struct *tsk, struct mm_struct *mm) @@ -267,6 +294,23 @@ static inline unsigned long mm_untag_mask(struct mm_struct *mm) return -1UL >> 8; } +/* + * Only enforce protection keys on the current process, because there is no + * user context to access POR_EL0 for another address space. + */ +static inline bool arch_vma_access_permitted(struct vm_area_struct *vma, + bool write, bool execute, bool foreign) +{ + if (!system_supports_poe()) + return true; + + /* allow access if the VMA is not one from this process */ + if (foreign || vma_is_foreign(vma)) + return true; + + return por_el0_allows_pkey(vma_pkey(vma), write, execute); +} + #include <asm-generic/mmu_context.h> #endif /* !__ASSEMBLY__ */ diff --git a/arch/arm64/include/asm/pgtable-hwdef.h b/arch/arm64/include/asm/pgtable-hwdef.h index 1f60aa1bc750..fd330c1db289 100644 --- a/arch/arm64/include/asm/pgtable-hwdef.h +++ b/arch/arm64/include/asm/pgtable-hwdef.h @@ -135,7 +135,6 @@ /* * Section */ -#define PMD_SECT_VALID (_AT(pmdval_t, 1) << 0) #define PMD_SECT_USER (_AT(pmdval_t, 1) << 6) /* AP[1] */ #define PMD_SECT_RDONLY (_AT(pmdval_t, 1) << 7) /* AP[2] */ #define PMD_SECT_S (_AT(pmdval_t, 3) << 8) @@ -200,11 +199,26 @@ #define PTE_PI_IDX_3 54 /* UXN */ /* + * POIndex[2:0] encoding (Permission Overlay Extension) + */ +#define PTE_PO_IDX_0 (_AT(pteval_t, 1) << 60) +#define PTE_PO_IDX_1 (_AT(pteval_t, 1) << 61) +#define PTE_PO_IDX_2 (_AT(pteval_t, 1) << 62) + +#define PTE_PO_IDX_MASK GENMASK_ULL(62, 60) + + +/* * Memory Attribute override for Stage-2 (MemAttr[3:0]) */ #define PTE_S2_MEMATTR(t) (_AT(pteval_t, (t)) << 2) /* + * Hierarchical permission for Stage-1 tables + */ +#define S1_TABLE_AP (_AT(pmdval_t, 3) << 61) + +/* * Highest possible physical address supported. */ #define PHYS_MASK_SHIFT (CONFIG_ARM64_PA_BITS) @@ -298,6 +312,10 @@ #define TCR_TBI1 (UL(1) << 38) #define TCR_HA (UL(1) << 39) #define TCR_HD (UL(1) << 40) +#define TCR_HPD0_SHIFT 41 +#define TCR_HPD0 (UL(1) << TCR_HPD0_SHIFT) +#define TCR_HPD1_SHIFT 42 +#define TCR_HPD1 (UL(1) << TCR_HPD1_SHIFT) #define TCR_TBID0 (UL(1) << 51) #define TCR_TBID1 (UL(1) << 52) #define TCR_NFD0 (UL(1) << 53) diff --git a/arch/arm64/include/asm/pgtable-prot.h b/arch/arm64/include/asm/pgtable-prot.h index b11cfb9fdd37..2a11d0c10760 100644 --- a/arch/arm64/include/asm/pgtable-prot.h +++ b/arch/arm64/include/asm/pgtable-prot.h @@ -154,10 +154,10 @@ static inline bool __pure lpa2_is_enabled(void) #define PIE_E0 ( \ PIRx_ELx_PERM(pte_pi_index(_PAGE_EXECONLY), PIE_X_O) | \ - PIRx_ELx_PERM(pte_pi_index(_PAGE_READONLY_EXEC), PIE_RX) | \ - PIRx_ELx_PERM(pte_pi_index(_PAGE_SHARED_EXEC), PIE_RWX) | \ - PIRx_ELx_PERM(pte_pi_index(_PAGE_READONLY), PIE_R) | \ - PIRx_ELx_PERM(pte_pi_index(_PAGE_SHARED), PIE_RW)) + PIRx_ELx_PERM(pte_pi_index(_PAGE_READONLY_EXEC), PIE_RX_O) | \ + PIRx_ELx_PERM(pte_pi_index(_PAGE_SHARED_EXEC), PIE_RWX_O) | \ + PIRx_ELx_PERM(pte_pi_index(_PAGE_READONLY), PIE_R_O) | \ + PIRx_ELx_PERM(pte_pi_index(_PAGE_SHARED), PIE_RW_O)) #define PIE_E1 ( \ PIRx_ELx_PERM(pte_pi_index(_PAGE_EXECONLY), PIE_NONE_O) | \ diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 7a4f5604be3f..96c2b0b07c4c 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -34,6 +34,7 @@ #include <asm/cmpxchg.h> #include <asm/fixmap.h> +#include <asm/por.h> #include <linux/mmdebug.h> #include <linux/mm_types.h> #include <linux/sched.h> @@ -149,6 +150,24 @@ static inline pteval_t __phys_to_pte_val(phys_addr_t phys) #define pte_accessible(mm, pte) \ (mm_tlb_flush_pending(mm) ? pte_present(pte) : pte_valid(pte)) +static inline bool por_el0_allows_pkey(u8 pkey, bool write, bool execute) +{ + u64 por; + + if (!system_supports_poe()) + return true; + + por = read_sysreg_s(SYS_POR_EL0); + + if (write) + return por_elx_allows_write(por, pkey); + + if (execute) + return por_elx_allows_exec(por, pkey); + + return por_elx_allows_read(por, pkey); +} + /* * p??_access_permitted() is true for valid user mappings (PTE_USER * bit set, subject to the write permission check). For execute-only @@ -156,8 +175,11 @@ static inline pteval_t __phys_to_pte_val(phys_addr_t phys) * not set) must return false. PROT_NONE mappings do not have the * PTE_VALID bit set. */ -#define pte_access_permitted(pte, write) \ +#define pte_access_permitted_no_overlay(pte, write) \ (((pte_val(pte) & (PTE_VALID | PTE_USER)) == (PTE_VALID | PTE_USER)) && (!(write) || pte_write(pte))) +#define pte_access_permitted(pte, write) \ + (pte_access_permitted_no_overlay(pte, write) && \ + por_el0_allows_pkey(FIELD_GET(PTE_PO_IDX_MASK, pte_val(pte)), write, false)) #define pmd_access_permitted(pmd, write) \ (pte_access_permitted(pmd_pte(pmd), (write))) #define pud_access_permitted(pud, write) \ @@ -373,10 +395,11 @@ static inline void __sync_cache_and_tags(pte_t pte, unsigned int nr_pages) /* * If the PTE would provide user space access to the tags associated * with it then ensure that the MTE tags are synchronised. Although - * pte_access_permitted() returns false for exec only mappings, they - * don't expose tags (instruction fetches don't check tags). + * pte_access_permitted_no_overlay() returns false for exec only + * mappings, they don't expose tags (instruction fetches don't check + * tags). */ - if (system_supports_mte() && pte_access_permitted(pte, false) && + if (system_supports_mte() && pte_access_permitted_no_overlay(pte, false) && !pte_special(pte) && pte_tagged(pte)) mte_sync_tags(pte, nr_pages); } @@ -1103,7 +1126,8 @@ static inline pte_t pte_modify(pte_t pte, pgprot_t newprot) */ const pteval_t mask = PTE_USER | PTE_PXN | PTE_UXN | PTE_RDONLY | PTE_PRESENT_INVALID | PTE_VALID | PTE_WRITE | - PTE_GP | PTE_ATTRINDX_MASK; + PTE_GP | PTE_ATTRINDX_MASK | PTE_PO_IDX_MASK; + /* preserve the hardware dirty information */ if (pte_hw_dirty(pte)) pte = set_pte_bit(pte, __pgprot(PTE_DIRTY)); diff --git a/arch/arm64/include/asm/pkeys.h b/arch/arm64/include/asm/pkeys.h new file mode 100644 index 000000000000..0ca5f83ce148 --- /dev/null +++ b/arch/arm64/include/asm/pkeys.h @@ -0,0 +1,106 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2023 Arm Ltd. + * + * Based on arch/x86/include/asm/pkeys.h + */ + +#ifndef _ASM_ARM64_PKEYS_H +#define _ASM_ARM64_PKEYS_H + +#define ARCH_VM_PKEY_FLAGS (VM_PKEY_BIT0 | VM_PKEY_BIT1 | VM_PKEY_BIT2) + +#define arch_max_pkey() 8 + +int arch_set_user_pkey_access(struct task_struct *tsk, int pkey, + unsigned long init_val); + +static inline bool arch_pkeys_enabled(void) +{ + return system_supports_poe(); +} + +static inline int vma_pkey(struct vm_area_struct *vma) +{ + return (vma->vm_flags & ARCH_VM_PKEY_FLAGS) >> VM_PKEY_SHIFT; +} + +static inline int arch_override_mprotect_pkey(struct vm_area_struct *vma, + int prot, int pkey) +{ + if (pkey != -1) + return pkey; + + return vma_pkey(vma); +} + +static inline int execute_only_pkey(struct mm_struct *mm) +{ + // Execute-only mappings are handled by EPAN/FEAT_PAN3. + return -1; +} + +#define mm_pkey_allocation_map(mm) (mm)->context.pkey_allocation_map +#define mm_set_pkey_allocated(mm, pkey) do { \ + mm_pkey_allocation_map(mm) |= (1U << pkey); \ +} while (0) +#define mm_set_pkey_free(mm, pkey) do { \ + mm_pkey_allocation_map(mm) &= ~(1U << pkey); \ +} while (0) + +static inline bool mm_pkey_is_allocated(struct mm_struct *mm, int pkey) +{ + /* + * "Allocated" pkeys are those that have been returned + * from pkey_alloc() or pkey 0 which is allocated + * implicitly when the mm is created. + */ + if (pkey < 0 || pkey >= arch_max_pkey()) + return false; + + return mm_pkey_allocation_map(mm) & (1U << pkey); +} + +/* + * Returns a positive, 3-bit key on success, or -1 on failure. + */ +static inline int mm_pkey_alloc(struct mm_struct *mm) +{ + /* + * Note: this is the one and only place we make sure + * that the pkey is valid as far as the hardware is + * concerned. The rest of the kernel trusts that + * only good, valid pkeys come out of here. + */ + u8 all_pkeys_mask = GENMASK(arch_max_pkey() - 1, 0); + int ret; + + if (!arch_pkeys_enabled()) + return -1; + + /* + * Are we out of pkeys? We must handle this specially + * because ffz() behavior is undefined if there are no + * zeros. + */ + if (mm_pkey_allocation_map(mm) == all_pkeys_mask) + return -1; + + ret = ffz(mm_pkey_allocation_map(mm)); + + mm_set_pkey_allocated(mm, ret); + + return ret; +} + +static inline int mm_pkey_free(struct mm_struct *mm, int pkey) +{ + if (!mm_pkey_is_allocated(mm, pkey)) + return -EINVAL; + + mm_set_pkey_free(mm, pkey); + + return 0; +} + +#endif /* _ASM_ARM64_PKEYS_H */ diff --git a/arch/arm64/include/asm/por.h b/arch/arm64/include/asm/por.h new file mode 100644 index 000000000000..e06e9f473675 --- /dev/null +++ b/arch/arm64/include/asm/por.h @@ -0,0 +1,33 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2023 Arm Ltd. + */ + +#ifndef _ASM_ARM64_POR_H +#define _ASM_ARM64_POR_H + +#define POR_BITS_PER_PKEY 4 +#define POR_ELx_IDX(por_elx, idx) (((por_elx) >> ((idx) * POR_BITS_PER_PKEY)) & 0xf) + +static inline bool por_elx_allows_read(u64 por, u8 pkey) +{ + u8 perm = POR_ELx_IDX(por, pkey); + + return perm & POE_R; +} + +static inline bool por_elx_allows_write(u64 por, u8 pkey) +{ + u8 perm = POR_ELx_IDX(por, pkey); + + return perm & POE_W; +} + +static inline bool por_elx_allows_exec(u64 por, u8 pkey) +{ + u8 perm = POR_ELx_IDX(por, pkey); + + return perm & POE_X; +} + +#endif /* _ASM_ARM64_POR_H */ diff --git a/arch/arm64/include/asm/processor.h b/arch/arm64/include/asm/processor.h index f77371232d8c..1438424f0064 100644 --- a/arch/arm64/include/asm/processor.h +++ b/arch/arm64/include/asm/processor.h @@ -184,6 +184,7 @@ struct thread_struct { u64 sctlr_user; u64 svcr; u64 tpidr2_el0; + u64 por_el0; }; static inline unsigned int thread_get_vl(struct thread_struct *thread, @@ -402,5 +403,10 @@ long get_tagged_addr_ctrl(struct task_struct *task); #define GET_TAGGED_ADDR_CTRL() get_tagged_addr_ctrl(current) #endif +int get_tsc_mode(unsigned long adr); +int set_tsc_mode(unsigned int val); +#define GET_TSC_CTL(adr) get_tsc_mode((adr)) +#define SET_TSC_CTL(val) set_tsc_mode((val)) + #endif /* __ASSEMBLY__ */ #endif /* __ASM_PROCESSOR_H */ diff --git a/arch/arm64/include/asm/ptdump.h b/arch/arm64/include/asm/ptdump.h index 5b1701c76d1c..6cf4aae05219 100644 --- a/arch/arm64/include/asm/ptdump.h +++ b/arch/arm64/include/asm/ptdump.h @@ -5,6 +5,8 @@ #ifndef __ASM_PTDUMP_H #define __ASM_PTDUMP_H +#include <linux/ptdump.h> + #ifdef CONFIG_PTDUMP_CORE #include <linux/mm_types.h> @@ -21,14 +23,53 @@ struct ptdump_info { unsigned long base_addr; }; +struct ptdump_prot_bits { + u64 mask; + u64 val; + const char *set; + const char *clear; +}; + +struct ptdump_pg_level { + const struct ptdump_prot_bits *bits; + char name[4]; + int num; + u64 mask; +}; + +/* + * The page dumper groups page table entries of the same type into a single + * description. It uses pg_state to track the range information while + * iterating over the pte entries. When the continuity is broken it then + * dumps out a description of the range. + */ +struct ptdump_pg_state { + struct ptdump_state ptdump; + struct ptdump_pg_level *pg_level; + struct seq_file *seq; + const struct addr_marker *marker; + const struct mm_struct *mm; + unsigned long start_address; + int level; + u64 current_prot; + bool check_wx; + unsigned long wx_pages; + unsigned long uxn_pages; +}; + void ptdump_walk(struct seq_file *s, struct ptdump_info *info); +void note_page(struct ptdump_state *pt_st, unsigned long addr, int level, + u64 val); #ifdef CONFIG_PTDUMP_DEBUGFS #define EFI_RUNTIME_MAP_END DEFAULT_MAP_WINDOW_64 void __init ptdump_debugfs_register(struct ptdump_info *info, const char *name); #else static inline void ptdump_debugfs_register(struct ptdump_info *info, const char *name) { } -#endif +#endif /* CONFIG_PTDUMP_DEBUGFS */ +#else +static inline void note_page(struct ptdump_state *pt_st, unsigned long addr, + int level, u64 val) { } #endif /* CONFIG_PTDUMP_CORE */ #endif /* __ASM_PTDUMP_H */ diff --git a/arch/arm64/include/asm/set_memory.h b/arch/arm64/include/asm/set_memory.h index 0f740b781187..917761feeffd 100644 --- a/arch/arm64/include/asm/set_memory.h +++ b/arch/arm64/include/asm/set_memory.h @@ -3,6 +3,7 @@ #ifndef _ASM_ARM64_SET_MEMORY_H #define _ASM_ARM64_SET_MEMORY_H +#include <asm/mem_encrypt.h> #include <asm-generic/set_memory.h> bool can_set_direct_map(void); diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h index 4a9ea103817e..9ea97dddefc4 100644 --- a/arch/arm64/include/asm/sysreg.h +++ b/arch/arm64/include/asm/sysreg.h @@ -109,6 +109,9 @@ #define set_pstate_ssbs(x) asm volatile(SET_PSTATE_SSBS(x)) #define set_pstate_dit(x) asm volatile(SET_PSTATE_DIT(x)) +/* Register-based PAN access, for save/restore purposes */ +#define SYS_PSTATE_PAN sys_reg(3, 0, 4, 2, 3) + #define __SYS_BARRIER_INSN(CRm, op2, Rt) \ __emit_inst(0xd5000000 | sys_insn(0, 3, 3, (CRm), (op2)) | ((Rt) & 0x1f)) @@ -325,7 +328,25 @@ #define SYS_PAR_EL1 sys_reg(3, 0, 7, 4, 0) #define SYS_PAR_EL1_F BIT(0) +/* When PAR_EL1.F == 1 */ #define SYS_PAR_EL1_FST GENMASK(6, 1) +#define SYS_PAR_EL1_PTW BIT(8) +#define SYS_PAR_EL1_S BIT(9) +#define SYS_PAR_EL1_AssuredOnly BIT(12) +#define SYS_PAR_EL1_TopLevel BIT(13) +#define SYS_PAR_EL1_Overlay BIT(14) +#define SYS_PAR_EL1_DirtyBit BIT(15) +#define SYS_PAR_EL1_F1_IMPDEF GENMASK_ULL(63, 48) +#define SYS_PAR_EL1_F1_RES0 (BIT(7) | BIT(10) | GENMASK_ULL(47, 16)) +#define SYS_PAR_EL1_RES1 BIT(11) +/* When PAR_EL1.F == 0 */ +#define SYS_PAR_EL1_SH GENMASK_ULL(8, 7) +#define SYS_PAR_EL1_NS BIT(9) +#define SYS_PAR_EL1_F0_IMPDEF BIT(10) +#define SYS_PAR_EL1_NSE BIT(11) +#define SYS_PAR_EL1_PA GENMASK_ULL(51, 12) +#define SYS_PAR_EL1_ATTR GENMASK_ULL(63, 56) +#define SYS_PAR_EL1_F0_RES0 (GENMASK_ULL(6, 1) | GENMASK_ULL(55, 52)) /*** Statistical Profiling Extension ***/ #define PMSEVFR_EL1_RES0_IMP \ @@ -403,7 +424,6 @@ #define SYS_PMCNTENCLR_EL0 sys_reg(3, 3, 9, 12, 2) #define SYS_PMOVSCLR_EL0 sys_reg(3, 3, 9, 12, 3) #define SYS_PMSWINC_EL0 sys_reg(3, 3, 9, 12, 4) -#define SYS_PMSELR_EL0 sys_reg(3, 3, 9, 12, 5) #define SYS_PMCEID0_EL0 sys_reg(3, 3, 9, 12, 6) #define SYS_PMCEID1_EL0 sys_reg(3, 3, 9, 12, 7) #define SYS_PMCCNTR_EL0 sys_reg(3, 3, 9, 13, 0) @@ -652,6 +672,7 @@ #define OP_AT_S12E1W sys_insn(AT_Op0, 4, AT_CRn, 8, 5) #define OP_AT_S12E0R sys_insn(AT_Op0, 4, AT_CRn, 8, 6) #define OP_AT_S12E0W sys_insn(AT_Op0, 4, AT_CRn, 8, 7) +#define OP_AT_S1E2A sys_insn(AT_Op0, 4, AT_CRn, 9, 2) /* TLBI instructions */ #define TLBI_Op0 1 @@ -1077,6 +1098,9 @@ #define POE_RXW UL(0x7) #define POE_MASK UL(0xf) +/* Initial value for Permission Overlay Extension for EL0 */ +#define POR_EL0_INIT POE_RXW + #define ARM64_FEATURE_FIELD_BITS 4 /* Defined for compatibility only, do not add new users. */ diff --git a/arch/arm64/include/asm/thread_info.h b/arch/arm64/include/asm/thread_info.h index e72a3bf9e563..1114c1c3300a 100644 --- a/arch/arm64/include/asm/thread_info.h +++ b/arch/arm64/include/asm/thread_info.h @@ -81,6 +81,7 @@ void arch_setup_new_exec(void); #define TIF_SME 27 /* SME in use */ #define TIF_SME_VL_INHERIT 28 /* Inherit SME vl_onexec across exec */ #define TIF_KERNEL_FPSTATE 29 /* Task is in a kernel mode FPSIMD section */ +#define TIF_TSC_SIGSEGV 30 /* SIGSEGV on counter-timer access */ #define _TIF_SIGPENDING (1 << TIF_SIGPENDING) #define _TIF_NEED_RESCHED (1 << TIF_NEED_RESCHED) @@ -97,6 +98,7 @@ void arch_setup_new_exec(void); #define _TIF_SVE (1 << TIF_SVE) #define _TIF_MTE_ASYNC_FAULT (1 << TIF_MTE_ASYNC_FAULT) #define _TIF_NOTIFY_SIGNAL (1 << TIF_NOTIFY_SIGNAL) +#define _TIF_TSC_SIGSEGV (1 << TIF_TSC_SIGSEGV) #define _TIF_WORK_MASK (_TIF_NEED_RESCHED | _TIF_SIGPENDING | \ _TIF_NOTIFY_RESUME | _TIF_FOREIGN_FPSTATE | \ diff --git a/arch/arm64/include/asm/traps.h b/arch/arm64/include/asm/traps.h index eefe766d6161..d780d1bd2eac 100644 --- a/arch/arm64/include/asm/traps.h +++ b/arch/arm64/include/asm/traps.h @@ -25,6 +25,7 @@ try_emulate_armv8_deprecated(struct pt_regs *regs, u32 insn) void force_signal_inject(int signal, int code, unsigned long address, unsigned long err); void arm64_notify_segfault(unsigned long addr); void arm64_force_sig_fault(int signo, int code, unsigned long far, const char *str); +void arm64_force_sig_fault_pkey(unsigned long far, const char *str, int pkey); void arm64_force_sig_mceerr(int code, unsigned long far, short lsb, const char *str); void arm64_force_sig_ptrace_errno_trap(int errno, unsigned long far, const char *str); diff --git a/arch/arm64/include/asm/vncr_mapping.h b/arch/arm64/include/asm/vncr_mapping.h index df2c47c55972..06f8ec0906a6 100644 --- a/arch/arm64/include/asm/vncr_mapping.h +++ b/arch/arm64/include/asm/vncr_mapping.h @@ -52,6 +52,7 @@ #define VNCR_PIRE0_EL1 0x290 #define VNCR_PIRE0_EL2 0x298 #define VNCR_PIR_EL1 0x2A0 +#define VNCR_POR_EL1 0x2A8 #define VNCR_ICH_LR0_EL2 0x400 #define VNCR_ICH_LR1_EL2 0x408 #define VNCR_ICH_LR2_EL2 0x410 diff --git a/arch/arm64/include/uapi/asm/hwcap.h b/arch/arm64/include/uapi/asm/hwcap.h index 285610e626f5..055381b2c615 100644 --- a/arch/arm64/include/uapi/asm/hwcap.h +++ b/arch/arm64/include/uapi/asm/hwcap.h @@ -122,5 +122,6 @@ #define HWCAP2_SME_SF8FMA (1UL << 60) #define HWCAP2_SME_SF8DP4 (1UL << 61) #define HWCAP2_SME_SF8DP2 (1UL << 62) +#define HWCAP2_POE (1UL << 63) #endif /* _UAPI__ASM_HWCAP_H */ diff --git a/arch/arm64/include/uapi/asm/mman.h b/arch/arm64/include/uapi/asm/mman.h index 1e6482a838e1..e7e0c8216243 100644 --- a/arch/arm64/include/uapi/asm/mman.h +++ b/arch/arm64/include/uapi/asm/mman.h @@ -7,4 +7,13 @@ #define PROT_BTI 0x10 /* BTI guarded page */ #define PROT_MTE 0x20 /* Normal Tagged mapping */ +/* Override any generic PKEY permission defines */ +#define PKEY_DISABLE_EXECUTE 0x4 +#define PKEY_DISABLE_READ 0x8 +#undef PKEY_ACCESS_MASK +#define PKEY_ACCESS_MASK (PKEY_DISABLE_ACCESS |\ + PKEY_DISABLE_WRITE |\ + PKEY_DISABLE_READ |\ + PKEY_DISABLE_EXECUTE) + #endif /* ! _UAPI__ASM_MMAN_H */ diff --git a/arch/arm64/include/uapi/asm/sigcontext.h b/arch/arm64/include/uapi/asm/sigcontext.h index 8a45b7a411e0..bb7af77a30a7 100644 --- a/arch/arm64/include/uapi/asm/sigcontext.h +++ b/arch/arm64/include/uapi/asm/sigcontext.h @@ -98,6 +98,13 @@ struct esr_context { __u64 esr; }; +#define POE_MAGIC 0x504f4530 + +struct poe_context { + struct _aarch64_ctx head; + __u64 por_el0; +}; + /* * extra_context: describes extra space in the signal frame for * additional structures that don't fit in sigcontext.__reserved[]. @@ -320,10 +327,10 @@ struct zt_context { ((sizeof(struct za_context) + (__SVE_VQ_BYTES - 1)) \ / __SVE_VQ_BYTES * __SVE_VQ_BYTES) -#define ZA_SIG_REGS_SIZE(vq) ((vq * __SVE_VQ_BYTES) * (vq * __SVE_VQ_BYTES)) +#define ZA_SIG_REGS_SIZE(vq) (((vq) * __SVE_VQ_BYTES) * ((vq) * __SVE_VQ_BYTES)) #define ZA_SIG_ZAV_OFFSET(vq, n) (ZA_SIG_REGS_OFFSET + \ - (SVE_SIG_ZREG_SIZE(vq) * n)) + (SVE_SIG_ZREG_SIZE(vq) * (n))) #define ZA_SIG_CONTEXT_SIZE(vq) \ (ZA_SIG_REGS_OFFSET + ZA_SIG_REGS_SIZE(vq)) @@ -334,7 +341,7 @@ struct zt_context { #define ZT_SIG_REGS_OFFSET sizeof(struct zt_context) -#define ZT_SIG_REGS_SIZE(n) (ZT_SIG_REG_BYTES * n) +#define ZT_SIG_REGS_SIZE(n) (ZT_SIG_REG_BYTES * (n)) #define ZT_SIG_CONTEXT_SIZE(n) \ (sizeof(struct zt_context) + ZT_SIG_REGS_SIZE(n)) diff --git a/arch/arm64/kernel/cpu_errata.c b/arch/arm64/kernel/cpu_errata.c index f6b6b4507357..dfefbdf4073a 100644 --- a/arch/arm64/kernel/cpu_errata.c +++ b/arch/arm64/kernel/cpu_errata.c @@ -456,6 +456,14 @@ static const struct midr_range erratum_spec_ssbs_list[] = { }; #endif +#ifdef CONFIG_AMPERE_ERRATUM_AC03_CPU_38 +static const struct midr_range erratum_ac03_cpu_38_list[] = { + MIDR_ALL_VERSIONS(MIDR_AMPERE1), + MIDR_ALL_VERSIONS(MIDR_AMPERE1A), + {}, +}; +#endif + const struct arm64_cpu_capabilities arm64_errata[] = { #ifdef CONFIG_ARM64_WORKAROUND_CLEAN_CACHE { @@ -772,7 +780,7 @@ const struct arm64_cpu_capabilities arm64_errata[] = { { .desc = "AmpereOne erratum AC03_CPU_38", .capability = ARM64_WORKAROUND_AMPERE_AC03_CPU_38, - ERRATA_MIDR_ALL_VERSIONS(MIDR_AMPERE1), + ERRATA_MIDR_RANGE_LIST(erratum_ac03_cpu_38_list), }, #endif { diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index 646ecd3069fd..718728a85430 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -466,6 +466,8 @@ static const struct arm64_ftr_bits ftr_id_aa64mmfr2[] = { }; static const struct arm64_ftr_bits ftr_id_aa64mmfr3[] = { + ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_POE), + FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64MMFR3_EL1_S1POE_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64MMFR3_EL1_S1PIE_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64MMFR3_EL1_TCRX_SHIFT, 4, 0), ARM64_FTR_END, @@ -2348,6 +2350,14 @@ static void cpu_enable_mops(const struct arm64_cpu_capabilities *__unused) sysreg_clear_set(sctlr_el1, 0, SCTLR_EL1_MSCEn); } +#ifdef CONFIG_ARM64_POE +static void cpu_enable_poe(const struct arm64_cpu_capabilities *__unused) +{ + sysreg_clear_set(REG_TCR2_EL1, 0, TCR2_EL1x_E0POE); + sysreg_clear_set(CPACR_EL1, 0, CPACR_ELx_E0POE); +} +#endif + /* Internal helper functions to match cpu capability type */ static bool cpucap_late_cpu_optional(const struct arm64_cpu_capabilities *cap) @@ -2870,6 +2880,16 @@ static const struct arm64_cpu_capabilities arm64_features[] = { .matches = has_nv1, ARM64_CPUID_FIELDS_NEG(ID_AA64MMFR4_EL1, E2H0, NI_NV1) }, +#ifdef CONFIG_ARM64_POE + { + .desc = "Stage-1 Permission Overlay Extension (S1POE)", + .capability = ARM64_HAS_S1POE, + .type = ARM64_CPUCAP_BOOT_CPU_FEATURE, + .matches = has_cpuid_feature, + .cpu_enable = cpu_enable_poe, + ARM64_CPUID_FIELDS(ID_AA64MMFR3_EL1, S1POE, IMP) + }, +#endif {}, }; @@ -3034,6 +3054,9 @@ static const struct arm64_cpu_capabilities arm64_elf_hwcaps[] = { HWCAP_CAP(ID_AA64FPFR0_EL1, F8DP2, IMP, CAP_HWCAP, KERNEL_HWCAP_F8DP2), HWCAP_CAP(ID_AA64FPFR0_EL1, F8E4M3, IMP, CAP_HWCAP, KERNEL_HWCAP_F8E4M3), HWCAP_CAP(ID_AA64FPFR0_EL1, F8E5M2, IMP, CAP_HWCAP, KERNEL_HWCAP_F8E5M2), +#ifdef CONFIG_ARM64_POE + HWCAP_CAP(ID_AA64MMFR3_EL1, S1POE, IMP, CAP_HWCAP, KERNEL_HWCAP_POE), +#endif {}, }; diff --git a/arch/arm64/kernel/cpuinfo.c b/arch/arm64/kernel/cpuinfo.c index 09eeaa24d456..44718d0482b3 100644 --- a/arch/arm64/kernel/cpuinfo.c +++ b/arch/arm64/kernel/cpuinfo.c @@ -143,6 +143,7 @@ static const char *const hwcap_str[] = { [KERNEL_HWCAP_SME_SF8FMA] = "smesf8fma", [KERNEL_HWCAP_SME_SF8DP4] = "smesf8dp4", [KERNEL_HWCAP_SME_SF8DP2] = "smesf8dp2", + [KERNEL_HWCAP_POE] = "poe", }; #ifdef CONFIG_COMPAT @@ -280,7 +281,7 @@ const struct seq_operations cpuinfo_op = { }; -static struct kobj_type cpuregs_kobj_type = { +static const struct kobj_type cpuregs_kobj_type = { .sysfs_ops = &kobj_sysfs_ops, }; diff --git a/arch/arm64/kernel/hibernate.c b/arch/arm64/kernel/hibernate.c index 02870beb271e..7b11d84f533c 100644 --- a/arch/arm64/kernel/hibernate.c +++ b/arch/arm64/kernel/hibernate.c @@ -407,7 +407,7 @@ int swsusp_arch_resume(void) void *, phys_addr_t, phys_addr_t); struct trans_pgd_info trans_info = { .trans_alloc_page = hibernate_page_alloc, - .trans_alloc_arg = (void *)GFP_ATOMIC, + .trans_alloc_arg = (__force void *)GFP_ATOMIC, }; /* diff --git a/arch/arm64/kernel/pci.c b/arch/arm64/kernel/pci.c index f872c57e9909..fd9a7bed83ce 100644 --- a/arch/arm64/kernel/pci.c +++ b/arch/arm64/kernel/pci.c @@ -6,28 +6,7 @@ * Copyright (C) 2014 ARM Ltd. */ -#include <linux/acpi.h> -#include <linux/init.h> -#include <linux/io.h> -#include <linux/kernel.h> -#include <linux/mm.h> #include <linux/pci.h> -#include <linux/pci-acpi.h> -#include <linux/pci-ecam.h> -#include <linux/slab.h> - -#ifdef CONFIG_ACPI -/* - * Try to assign the IRQ number when probing a new device - */ -int pcibios_alloc_irq(struct pci_dev *dev) -{ - if (!acpi_disabled) - acpi_pci_irq_enable(dev); - - return 0; -} -#endif /* * raw_pci_read/write - Platform-specific PCI config space access. @@ -61,173 +40,3 @@ int pcibus_to_node(struct pci_bus *bus) EXPORT_SYMBOL(pcibus_to_node); #endif - -#ifdef CONFIG_ACPI - -struct acpi_pci_generic_root_info { - struct acpi_pci_root_info common; - struct pci_config_window *cfg; /* config space mapping */ -}; - -int acpi_pci_bus_find_domain_nr(struct pci_bus *bus) -{ - struct pci_config_window *cfg = bus->sysdata; - struct acpi_device *adev = to_acpi_device(cfg->parent); - struct acpi_pci_root *root = acpi_driver_data(adev); - - return root->segment; -} - -int pcibios_root_bridge_prepare(struct pci_host_bridge *bridge) -{ - struct pci_config_window *cfg; - struct acpi_device *adev; - struct device *bus_dev; - - if (acpi_disabled) - return 0; - - cfg = bridge->bus->sysdata; - - /* - * On Hyper-V there is no corresponding ACPI device for a root bridge, - * therefore ->parent is set as NULL by the driver. And set 'adev' as - * NULL in this case because there is no proper ACPI device. - */ - if (!cfg->parent) - adev = NULL; - else - adev = to_acpi_device(cfg->parent); - - bus_dev = &bridge->bus->dev; - - ACPI_COMPANION_SET(&bridge->dev, adev); - set_dev_node(bus_dev, acpi_get_node(acpi_device_handle(adev))); - - return 0; -} - -static int pci_acpi_root_prepare_resources(struct acpi_pci_root_info *ci) -{ - struct resource_entry *entry, *tmp; - int status; - - status = acpi_pci_probe_root_resources(ci); - resource_list_for_each_entry_safe(entry, tmp, &ci->resources) { - if (!(entry->res->flags & IORESOURCE_WINDOW)) - resource_list_destroy_entry(entry); - } - return status; -} - -/* - * Lookup the bus range for the domain in MCFG, and set up config space - * mapping. - */ -static struct pci_config_window * -pci_acpi_setup_ecam_mapping(struct acpi_pci_root *root) -{ - struct device *dev = &root->device->dev; - struct resource *bus_res = &root->secondary; - u16 seg = root->segment; - const struct pci_ecam_ops *ecam_ops; - struct resource cfgres; - struct acpi_device *adev; - struct pci_config_window *cfg; - int ret; - - ret = pci_mcfg_lookup(root, &cfgres, &ecam_ops); - if (ret) { - dev_err(dev, "%04x:%pR ECAM region not found\n", seg, bus_res); - return NULL; - } - - adev = acpi_resource_consumer(&cfgres); - if (adev) - dev_info(dev, "ECAM area %pR reserved by %s\n", &cfgres, - dev_name(&adev->dev)); - else - dev_warn(dev, FW_BUG "ECAM area %pR not reserved in ACPI namespace\n", - &cfgres); - - cfg = pci_ecam_create(dev, &cfgres, bus_res, ecam_ops); - if (IS_ERR(cfg)) { - dev_err(dev, "%04x:%pR error %ld mapping ECAM\n", seg, bus_res, - PTR_ERR(cfg)); - return NULL; - } - - return cfg; -} - -/* release_info: free resources allocated by init_info */ -static void pci_acpi_generic_release_info(struct acpi_pci_root_info *ci) -{ - struct acpi_pci_generic_root_info *ri; - - ri = container_of(ci, struct acpi_pci_generic_root_info, common); - pci_ecam_free(ri->cfg); - kfree(ci->ops); - kfree(ri); -} - -/* Interface called from ACPI code to setup PCI host controller */ -struct pci_bus *pci_acpi_scan_root(struct acpi_pci_root *root) -{ - struct acpi_pci_generic_root_info *ri; - struct pci_bus *bus, *child; - struct acpi_pci_root_ops *root_ops; - struct pci_host_bridge *host; - - ri = kzalloc(sizeof(*ri), GFP_KERNEL); - if (!ri) - return NULL; - - root_ops = kzalloc(sizeof(*root_ops), GFP_KERNEL); - if (!root_ops) { - kfree(ri); - return NULL; - } - - ri->cfg = pci_acpi_setup_ecam_mapping(root); - if (!ri->cfg) { - kfree(ri); - kfree(root_ops); - return NULL; - } - - root_ops->release_info = pci_acpi_generic_release_info; - root_ops->prepare_resources = pci_acpi_root_prepare_resources; - root_ops->pci_ops = (struct pci_ops *)&ri->cfg->ops->pci_ops; - bus = acpi_pci_root_create(root, root_ops, &ri->common, ri->cfg); - if (!bus) - return NULL; - - /* If we must preserve the resource configuration, claim now */ - host = pci_find_host_bridge(bus); - if (host->preserve_config) - pci_bus_claim_resources(bus); - - /* - * Assign whatever was left unassigned. If we didn't claim above, - * this will reassign everything. - */ - pci_assign_unassigned_root_bus_resources(bus); - - list_for_each_entry(child, &bus->children, node) - pcie_bus_configure_settings(child); - - return bus; -} - -void pcibios_add_bus(struct pci_bus *bus) -{ - acpi_pci_add_bus(bus); -} - -void pcibios_remove_bus(struct pci_bus *bus) -{ - acpi_pci_remove_bus(bus); -} - -#endif diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c index 4ae31b7af6c3..0540653fbf38 100644 --- a/arch/arm64/kernel/process.c +++ b/arch/arm64/kernel/process.c @@ -43,6 +43,7 @@ #include <linux/stacktrace.h> #include <asm/alternative.h> +#include <asm/arch_timer.h> #include <asm/compat.h> #include <asm/cpufeature.h> #include <asm/cacheflush.h> @@ -271,12 +272,21 @@ static void flush_tagged_addr_state(void) clear_thread_flag(TIF_TAGGED_ADDR); } +static void flush_poe(void) +{ + if (!system_supports_poe()) + return; + + write_sysreg_s(POR_EL0_INIT, SYS_POR_EL0); +} + void flush_thread(void) { fpsimd_flush_thread(); tls_thread_flush(); flush_ptrace_hw_breakpoint(current); flush_tagged_addr_state(); + flush_poe(); } void arch_release_task_struct(struct task_struct *tsk) @@ -371,6 +381,9 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) if (system_supports_tpidr2()) p->thread.tpidr2_el0 = read_sysreg_s(SYS_TPIDR2_EL0); + if (system_supports_poe()) + p->thread.por_el0 = read_sysreg_s(SYS_POR_EL0); + if (stack_start) { if (is_compat_thread(task_thread_info(p))) childregs->compat_sp = stack_start; @@ -472,27 +485,63 @@ static void entry_task_switch(struct task_struct *next) } /* - * ARM erratum 1418040 handling, affecting the 32bit view of CNTVCT. - * Ensure access is disabled when switching to a 32bit task, ensure - * access is enabled when switching to a 64bit task. + * Handle sysreg updates for ARM erratum 1418040 which affects the 32bit view of + * CNTVCT, various other errata which require trapping all CNTVCT{,_EL0} + * accesses and prctl(PR_SET_TSC). Ensure access is disabled iff a workaround is + * required or PR_TSC_SIGSEGV is set. */ -static void erratum_1418040_thread_switch(struct task_struct *next) +static void update_cntkctl_el1(struct task_struct *next) { - if (!IS_ENABLED(CONFIG_ARM64_ERRATUM_1418040) || - !this_cpu_has_cap(ARM64_WORKAROUND_1418040)) - return; + struct thread_info *ti = task_thread_info(next); - if (is_compat_thread(task_thread_info(next))) + if (test_ti_thread_flag(ti, TIF_TSC_SIGSEGV) || + has_erratum_handler(read_cntvct_el0) || + (IS_ENABLED(CONFIG_ARM64_ERRATUM_1418040) && + this_cpu_has_cap(ARM64_WORKAROUND_1418040) && + is_compat_thread(ti))) sysreg_clear_set(cntkctl_el1, ARCH_TIMER_USR_VCT_ACCESS_EN, 0); else sysreg_clear_set(cntkctl_el1, 0, ARCH_TIMER_USR_VCT_ACCESS_EN); } -static void erratum_1418040_new_exec(void) +static void cntkctl_thread_switch(struct task_struct *prev, + struct task_struct *next) +{ + if ((read_ti_thread_flags(task_thread_info(prev)) & + (_TIF_32BIT | _TIF_TSC_SIGSEGV)) != + (read_ti_thread_flags(task_thread_info(next)) & + (_TIF_32BIT | _TIF_TSC_SIGSEGV))) + update_cntkctl_el1(next); +} + +static int do_set_tsc_mode(unsigned int val) { + bool tsc_sigsegv; + + if (val == PR_TSC_SIGSEGV) + tsc_sigsegv = true; + else if (val == PR_TSC_ENABLE) + tsc_sigsegv = false; + else + return -EINVAL; + preempt_disable(); - erratum_1418040_thread_switch(current); + update_thread_flag(TIF_TSC_SIGSEGV, tsc_sigsegv); + update_cntkctl_el1(current); preempt_enable(); + + return 0; +} + +static void permission_overlay_switch(struct task_struct *next) +{ + if (!system_supports_poe()) + return; + + current->thread.por_el0 = read_sysreg_s(SYS_POR_EL0); + if (current->thread.por_el0 != next->thread.por_el0) { + write_sysreg_s(next->thread.por_el0, SYS_POR_EL0); + } } /* @@ -528,8 +577,9 @@ struct task_struct *__switch_to(struct task_struct *prev, contextidr_thread_switch(next); entry_task_switch(next); ssbs_thread_switch(next); - erratum_1418040_thread_switch(next); + cntkctl_thread_switch(prev, next); ptrauth_thread_switch_user(next); + permission_overlay_switch(next); /* * Complete any pending TLB or cache maintenance on this CPU in case @@ -645,7 +695,7 @@ void arch_setup_new_exec(void) current->mm->context.flags = mmflags; ptrauth_thread_init_user(); mte_thread_init_user(); - erratum_1418040_new_exec(); + do_set_tsc_mode(PR_TSC_ENABLE); if (task_spec_ssb_noexec(current)) { arch_prctl_spec_ctrl_set(current, PR_SPEC_STORE_BYPASS, @@ -754,3 +804,26 @@ int arch_elf_adjust_prot(int prot, const struct arch_elf_state *state, return prot; } #endif + +int get_tsc_mode(unsigned long adr) +{ + unsigned int val; + + if (is_compat_task()) + return -EINVAL; + + if (test_thread_flag(TIF_TSC_SIGSEGV)) + val = PR_TSC_SIGSEGV; + else + val = PR_TSC_ENABLE; + + return put_user(val, (unsigned int __user *)adr); +} + +int set_tsc_mode(unsigned int val) +{ + if (is_compat_task()) + return -EINVAL; + + return do_set_tsc_mode(val); +} diff --git a/arch/arm64/kernel/ptrace.c b/arch/arm64/kernel/ptrace.c index 0d022599eb61..b756578aeaee 100644 --- a/arch/arm64/kernel/ptrace.c +++ b/arch/arm64/kernel/ptrace.c @@ -1440,6 +1440,39 @@ static int tagged_addr_ctrl_set(struct task_struct *target, const struct } #endif +#ifdef CONFIG_ARM64_POE +static int poe_get(struct task_struct *target, + const struct user_regset *regset, + struct membuf to) +{ + if (!system_supports_poe()) + return -EINVAL; + + return membuf_write(&to, &target->thread.por_el0, + sizeof(target->thread.por_el0)); +} + +static int poe_set(struct task_struct *target, const struct + user_regset *regset, unsigned int pos, + unsigned int count, const void *kbuf, const + void __user *ubuf) +{ + int ret; + long ctrl; + + if (!system_supports_poe()) + return -EINVAL; + + ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &ctrl, 0, -1); + if (ret) + return ret; + + target->thread.por_el0 = ctrl; + + return 0; +} +#endif + enum aarch64_regset { REGSET_GPR, REGSET_FPR, @@ -1469,6 +1502,9 @@ enum aarch64_regset { #ifdef CONFIG_ARM64_TAGGED_ADDR_ABI REGSET_TAGGED_ADDR_CTRL, #endif +#ifdef CONFIG_ARM64_POE + REGSET_POE +#endif }; static const struct user_regset aarch64_regsets[] = { @@ -1628,6 +1664,16 @@ static const struct user_regset aarch64_regsets[] = { .set = tagged_addr_ctrl_set, }, #endif +#ifdef CONFIG_ARM64_POE + [REGSET_POE] = { + .core_note_type = NT_ARM_POE, + .n = 1, + .size = sizeof(long), + .align = sizeof(long), + .regset_get = poe_get, + .set = poe_set, + }, +#endif }; static const struct user_regset_view user_aarch64_view = { diff --git a/arch/arm64/kernel/signal.c b/arch/arm64/kernel/signal.c index 4a77f4976e11..561986947530 100644 --- a/arch/arm64/kernel/signal.c +++ b/arch/arm64/kernel/signal.c @@ -61,6 +61,7 @@ struct rt_sigframe_user_layout { unsigned long za_offset; unsigned long zt_offset; unsigned long fpmr_offset; + unsigned long poe_offset; unsigned long extra_offset; unsigned long end_offset; }; @@ -185,6 +186,8 @@ struct user_ctxs { u32 zt_size; struct fpmr_context __user *fpmr; u32 fpmr_size; + struct poe_context __user *poe; + u32 poe_size; }; static int preserve_fpsimd_context(struct fpsimd_context __user *ctx) @@ -258,6 +261,32 @@ static int restore_fpmr_context(struct user_ctxs *user) return err; } +static int preserve_poe_context(struct poe_context __user *ctx) +{ + int err = 0; + + __put_user_error(POE_MAGIC, &ctx->head.magic, err); + __put_user_error(sizeof(*ctx), &ctx->head.size, err); + __put_user_error(read_sysreg_s(SYS_POR_EL0), &ctx->por_el0, err); + + return err; +} + +static int restore_poe_context(struct user_ctxs *user) +{ + u64 por_el0; + int err = 0; + + if (user->poe_size != sizeof(*user->poe)) + return -EINVAL; + + __get_user_error(por_el0, &(user->poe->por_el0), err); + if (!err) + write_sysreg_s(por_el0, SYS_POR_EL0); + + return err; +} + #ifdef CONFIG_ARM64_SVE static int preserve_sve_context(struct sve_context __user *ctx) @@ -621,6 +650,7 @@ static int parse_user_sigframe(struct user_ctxs *user, user->za = NULL; user->zt = NULL; user->fpmr = NULL; + user->poe = NULL; if (!IS_ALIGNED((unsigned long)base, 16)) goto invalid; @@ -671,6 +701,17 @@ static int parse_user_sigframe(struct user_ctxs *user, /* ignore */ break; + case POE_MAGIC: + if (!system_supports_poe()) + goto invalid; + + if (user->poe) + goto invalid; + + user->poe = (struct poe_context __user *)head; + user->poe_size = size; + break; + case SVE_MAGIC: if (!system_supports_sve() && !system_supports_sme()) goto invalid; @@ -857,6 +898,9 @@ static int restore_sigframe(struct pt_regs *regs, if (err == 0 && system_supports_sme2() && user.zt) err = restore_zt_context(&user); + if (err == 0 && system_supports_poe() && user.poe) + err = restore_poe_context(&user); + return err; } @@ -980,6 +1024,13 @@ static int setup_sigframe_layout(struct rt_sigframe_user_layout *user, return err; } + if (system_supports_poe()) { + err = sigframe_alloc(user, &user->poe_offset, + sizeof(struct poe_context)); + if (err) + return err; + } + return sigframe_alloc_end(user); } @@ -1042,6 +1093,14 @@ static int setup_sigframe(struct rt_sigframe_user_layout *user, err |= preserve_fpmr_context(fpmr_ctx); } + if (system_supports_poe() && err == 0 && user->poe_offset) { + struct poe_context __user *poe_ctx = + apply_user_offset(user, user->poe_offset); + + err |= preserve_poe_context(poe_ctx); + } + + /* ZA state if present */ if (system_supports_sme() && err == 0 && user->za_offset) { struct za_context __user *za_ctx = @@ -1178,6 +1237,9 @@ static void setup_return(struct pt_regs *regs, struct k_sigaction *ka, sme_smstop(); } + if (system_supports_poe()) + write_sysreg_s(POR_EL0_INIT, SYS_POR_EL0); + if (ka->sa.sa_flags & SA_RESTORER) sigtramp = ka->sa.sa_restorer; else diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c index f01f0fd7b7fe..3b3f6b56e733 100644 --- a/arch/arm64/kernel/smp.c +++ b/arch/arm64/kernel/smp.c @@ -68,7 +68,7 @@ enum ipi_msg_type { IPI_RESCHEDULE, IPI_CALL_FUNC, IPI_CPU_STOP, - IPI_CPU_CRASH_STOP, + IPI_CPU_STOP_NMI, IPI_TIMER, IPI_IRQ_WORK, NR_IPI, @@ -85,6 +85,8 @@ static int ipi_irq_base __ro_after_init; static int nr_ipi __ro_after_init = NR_IPI; static struct irq_desc *ipi_desc[MAX_IPI] __ro_after_init; +static bool crash_stop; + static void ipi_setup(int cpu); #ifdef CONFIG_HOTPLUG_CPU @@ -823,7 +825,7 @@ static const char *ipi_types[MAX_IPI] __tracepoint_string = { [IPI_RESCHEDULE] = "Rescheduling interrupts", [IPI_CALL_FUNC] = "Function call interrupts", [IPI_CPU_STOP] = "CPU stop interrupts", - [IPI_CPU_CRASH_STOP] = "CPU stop (for crash dump) interrupts", + [IPI_CPU_STOP_NMI] = "CPU stop NMIs", [IPI_TIMER] = "Timer broadcast interrupts", [IPI_IRQ_WORK] = "IRQ work interrupts", [IPI_CPU_BACKTRACE] = "CPU backtrace interrupts", @@ -867,9 +869,9 @@ void arch_irq_work_raise(void) } #endif -static void __noreturn local_cpu_stop(void) +static void __noreturn local_cpu_stop(unsigned int cpu) { - set_cpu_online(smp_processor_id(), false); + set_cpu_online(cpu, false); local_daif_mask(); sdei_mask_local_cpu(); @@ -883,21 +885,26 @@ static void __noreturn local_cpu_stop(void) */ void __noreturn panic_smp_self_stop(void) { - local_cpu_stop(); + local_cpu_stop(smp_processor_id()); } -#ifdef CONFIG_KEXEC_CORE -static atomic_t waiting_for_crash_ipi = ATOMIC_INIT(0); -#endif - static void __noreturn ipi_cpu_crash_stop(unsigned int cpu, struct pt_regs *regs) { #ifdef CONFIG_KEXEC_CORE + /* + * Use local_daif_mask() instead of local_irq_disable() to make sure + * that pseudo-NMIs are disabled. The "crash stop" code starts with + * an IRQ and falls back to NMI (which might be pseudo). If the IRQ + * finally goes through right as we're timing out then the NMI could + * interrupt us. It's better to prevent the NMI and let the IRQ + * finish since the pt_regs will be better. + */ + local_daif_mask(); + crash_save_cpu(regs, cpu); - atomic_dec(&waiting_for_crash_ipi); + set_cpu_online(cpu, false); - local_irq_disable(); sdei_mask_local_cpu(); if (IS_ENABLED(CONFIG_HOTPLUG_CPU)) @@ -962,14 +969,12 @@ static void do_handle_IPI(int ipinr) break; case IPI_CPU_STOP: - local_cpu_stop(); - break; - - case IPI_CPU_CRASH_STOP: - if (IS_ENABLED(CONFIG_KEXEC_CORE)) { + case IPI_CPU_STOP_NMI: + if (IS_ENABLED(CONFIG_KEXEC_CORE) && crash_stop) { ipi_cpu_crash_stop(cpu, get_irq_regs()); - unreachable(); + } else { + local_cpu_stop(cpu); } break; @@ -1024,8 +1029,7 @@ static bool ipi_should_be_nmi(enum ipi_msg_type ipi) return false; switch (ipi) { - case IPI_CPU_STOP: - case IPI_CPU_CRASH_STOP: + case IPI_CPU_STOP_NMI: case IPI_CPU_BACKTRACE: case IPI_KGDB_ROUNDUP: return true; @@ -1138,79 +1142,109 @@ static inline unsigned int num_other_online_cpus(void) void smp_send_stop(void) { + static unsigned long stop_in_progress; + cpumask_t mask; unsigned long timeout; - if (num_other_online_cpus()) { - cpumask_t mask; + /* + * If this cpu is the only one alive at this point in time, online or + * not, there are no stop messages to be sent around, so just back out. + */ + if (num_other_online_cpus() == 0) + goto skip_ipi; - cpumask_copy(&mask, cpu_online_mask); - cpumask_clear_cpu(smp_processor_id(), &mask); + /* Only proceed if this is the first CPU to reach this code */ + if (test_and_set_bit(0, &stop_in_progress)) + return; - if (system_state <= SYSTEM_RUNNING) - pr_crit("SMP: stopping secondary CPUs\n"); - smp_cross_call(&mask, IPI_CPU_STOP); - } + /* + * Send an IPI to all currently online CPUs except the CPU running + * this code. + * + * NOTE: we don't do anything here to prevent other CPUs from coming + * online after we snapshot `cpu_online_mask`. Ideally, the calling code + * should do something to prevent other CPUs from coming up. This code + * can be called in the panic path and thus it doesn't seem wise to + * grab the CPU hotplug mutex ourselves. Worst case: + * - If a CPU comes online as we're running, we'll likely notice it + * during the 1 second wait below and then we'll catch it when we try + * with an NMI (assuming NMIs are enabled) since we re-snapshot the + * mask before sending an NMI. + * - If we leave the function and see that CPUs are still online we'll + * at least print a warning. Especially without NMIs this function + * isn't foolproof anyway so calling code will just have to accept + * the fact that there could be cases where a CPU can't be stopped. + */ + cpumask_copy(&mask, cpu_online_mask); + cpumask_clear_cpu(smp_processor_id(), &mask); - /* Wait up to one second for other CPUs to stop */ + if (system_state <= SYSTEM_RUNNING) + pr_crit("SMP: stopping secondary CPUs\n"); + + /* + * Start with a normal IPI and wait up to one second for other CPUs to + * stop. We do this first because it gives other processors a chance + * to exit critical sections / drop locks and makes the rest of the + * stop process (especially console flush) more robust. + */ + smp_cross_call(&mask, IPI_CPU_STOP); timeout = USEC_PER_SEC; while (num_other_online_cpus() && timeout--) udelay(1); - if (num_other_online_cpus()) + /* + * If CPUs are still online, try an NMI. There's no excuse for this to + * be slow, so we only give them an extra 10 ms to respond. + */ + if (num_other_online_cpus() && ipi_should_be_nmi(IPI_CPU_STOP_NMI)) { + smp_rmb(); + cpumask_copy(&mask, cpu_online_mask); + cpumask_clear_cpu(smp_processor_id(), &mask); + + pr_info("SMP: retry stop with NMI for CPUs %*pbl\n", + cpumask_pr_args(&mask)); + + smp_cross_call(&mask, IPI_CPU_STOP_NMI); + timeout = USEC_PER_MSEC * 10; + while (num_other_online_cpus() && timeout--) + udelay(1); + } + + if (num_other_online_cpus()) { + smp_rmb(); + cpumask_copy(&mask, cpu_online_mask); + cpumask_clear_cpu(smp_processor_id(), &mask); + pr_warn("SMP: failed to stop secondary CPUs %*pbl\n", - cpumask_pr_args(cpu_online_mask)); + cpumask_pr_args(&mask)); + } +skip_ipi: sdei_mask_local_cpu(); } #ifdef CONFIG_KEXEC_CORE void crash_smp_send_stop(void) { - static int cpus_stopped; - cpumask_t mask; - unsigned long timeout; - /* * This function can be called twice in panic path, but obviously * we execute this only once. + * + * We use this same boolean to tell whether the IPI we send was a + * stop or a "crash stop". */ - if (cpus_stopped) + if (crash_stop) return; + crash_stop = 1; - cpus_stopped = 1; + smp_send_stop(); - /* - * If this cpu is the only one alive at this point in time, online or - * not, there are no stop messages to be sent around, so just back out. - */ - if (num_other_online_cpus() == 0) - goto skip_ipi; - - cpumask_copy(&mask, cpu_online_mask); - cpumask_clear_cpu(smp_processor_id(), &mask); - - atomic_set(&waiting_for_crash_ipi, num_other_online_cpus()); - - pr_crit("SMP: stopping secondary CPUs\n"); - smp_cross_call(&mask, IPI_CPU_CRASH_STOP); - - /* Wait up to one second for other CPUs to stop */ - timeout = USEC_PER_SEC; - while ((atomic_read(&waiting_for_crash_ipi) > 0) && timeout--) - udelay(1); - - if (atomic_read(&waiting_for_crash_ipi) > 0) - pr_warn("SMP: failed to stop secondary CPUs %*pbl\n", - cpumask_pr_args(&mask)); - -skip_ipi: - sdei_mask_local_cpu(); sdei_handler_abort(); } bool smp_crash_stop_failed(void) { - return (atomic_read(&waiting_for_crash_ipi) > 0); + return num_other_online_cpus() != 0; } #endif diff --git a/arch/arm64/kernel/stacktrace.c b/arch/arm64/kernel/stacktrace.c index 6b3258860377..2729faaee4b4 100644 --- a/arch/arm64/kernel/stacktrace.c +++ b/arch/arm64/kernel/stacktrace.c @@ -25,6 +25,7 @@ * * @common: Common unwind state. * @task: The task being unwound. + * @graph_idx: Used by ftrace_graph_ret_addr() for optimized stack unwinding. * @kr_cur: When KRETPROBES is selected, holds the kretprobe instance * associated with the most recently encountered replacement lr * value. @@ -32,6 +33,7 @@ struct kunwind_state { struct unwind_state common; struct task_struct *task; + int graph_idx; #ifdef CONFIG_KRETPROBES struct llist_node *kr_cur; #endif @@ -106,7 +108,7 @@ kunwind_recover_return_address(struct kunwind_state *state) if (state->task->ret_stack && (state->common.pc == (unsigned long)return_to_handler)) { unsigned long orig_pc; - orig_pc = ftrace_graph_ret_addr(state->task, NULL, + orig_pc = ftrace_graph_ret_addr(state->task, &state->graph_idx, state->common.pc, (void *)state->common.fp); if (WARN_ON_ONCE(state->common.pc == orig_pc)) diff --git a/arch/arm64/kernel/traps.c b/arch/arm64/kernel/traps.c index 9e22683aa921..563cbce11126 100644 --- a/arch/arm64/kernel/traps.c +++ b/arch/arm64/kernel/traps.c @@ -273,6 +273,12 @@ void arm64_force_sig_fault(int signo, int code, unsigned long far, force_sig_fault(signo, code, (void __user *)far); } +void arm64_force_sig_fault_pkey(unsigned long far, const char *str, int pkey) +{ + arm64_show_signal(SIGSEGV, str); + force_sig_pkuerr((void __user *)far, pkey); +} + void arm64_force_sig_mceerr(int code, unsigned long far, short lsb, const char *str) { @@ -601,18 +607,26 @@ static void ctr_read_handler(unsigned long esr, struct pt_regs *regs) static void cntvct_read_handler(unsigned long esr, struct pt_regs *regs) { - int rt = ESR_ELx_SYS64_ISS_RT(esr); + if (test_thread_flag(TIF_TSC_SIGSEGV)) { + force_sig(SIGSEGV); + } else { + int rt = ESR_ELx_SYS64_ISS_RT(esr); - pt_regs_write_reg(regs, rt, arch_timer_read_counter()); - arm64_skip_faulting_instruction(regs, AARCH64_INSN_SIZE); + pt_regs_write_reg(regs, rt, arch_timer_read_counter()); + arm64_skip_faulting_instruction(regs, AARCH64_INSN_SIZE); + } } static void cntfrq_read_handler(unsigned long esr, struct pt_regs *regs) { - int rt = ESR_ELx_SYS64_ISS_RT(esr); + if (test_thread_flag(TIF_TSC_SIGSEGV)) { + force_sig(SIGSEGV); + } else { + int rt = ESR_ELx_SYS64_ISS_RT(esr); - pt_regs_write_reg(regs, rt, arch_timer_get_rate()); - arm64_skip_faulting_instruction(regs, AARCH64_INSN_SIZE); + pt_regs_write_reg(regs, rt, arch_timer_get_rate()); + arm64_skip_faulting_instruction(regs, AARCH64_INSN_SIZE); + } } static void mrs_handler(unsigned long esr, struct pt_regs *regs) diff --git a/arch/arm64/kvm/Kconfig b/arch/arm64/kvm/Kconfig index 8304eb342be9..ead632ad01b4 100644 --- a/arch/arm64/kvm/Kconfig +++ b/arch/arm64/kvm/Kconfig @@ -66,4 +66,21 @@ config PROTECTED_NVHE_STACKTRACE If unsure, or not using protected nVHE (pKVM), say N. +config PTDUMP_STAGE2_DEBUGFS + bool "Present the stage-2 pagetables to debugfs" + depends on KVM + depends on DEBUG_KERNEL + depends on DEBUG_FS + depends on GENERIC_PTDUMP + select PTDUMP_CORE + default n + help + Say Y here if you want to show the stage-2 kernel pagetables + layout in a debugfs file. This information is only useful for kernel developers + who are working in architecture specific areas of the kernel. + It is probably not a good idea to enable this feature in a production + kernel. + + If in doubt, say N. + endif # VIRTUALIZATION diff --git a/arch/arm64/kvm/Makefile b/arch/arm64/kvm/Makefile index 86a629aaf0a1..3cf7adb2b503 100644 --- a/arch/arm64/kvm/Makefile +++ b/arch/arm64/kvm/Makefile @@ -17,7 +17,7 @@ kvm-y += arm.o mmu.o mmio.o psci.o hypercalls.o pvtime.o \ inject_fault.o va_layout.o handle_exit.o \ guest.o debug.o reset.o sys_regs.o stacktrace.o \ vgic-sys-reg-v3.o fpsimd.o pkvm.o \ - arch_timer.o trng.o vmid.o emulate-nested.o nested.o \ + arch_timer.o trng.o vmid.o emulate-nested.o nested.o at.o \ vgic/vgic.o vgic/vgic-init.o \ vgic/vgic-irqfd.o vgic/vgic-v2.o \ vgic/vgic-v3.o vgic/vgic-v4.o \ @@ -27,6 +27,7 @@ kvm-y += arm.o mmu.o mmio.o psci.o hypercalls.o pvtime.o \ kvm-$(CONFIG_HW_PERF_EVENTS) += pmu-emul.o pmu.o kvm-$(CONFIG_ARM64_PTR_AUTH) += pauth.o +kvm-$(CONFIG_PTDUMP_STAGE2_DEBUGFS) += ptdump.o always-y := hyp_constants.h hyp-constants.s diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index 9bef7638342e..fe0764173cd0 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -46,6 +46,8 @@ #include <kvm/arm_pmu.h> #include <kvm/arm_psci.h> +#include "sys_regs.h" + static enum kvm_mode kvm_mode = KVM_MODE_DEFAULT; enum kvm_wfx_trap_policy { @@ -228,6 +230,7 @@ vm_fault_t kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf) void kvm_arch_create_vm_debugfs(struct kvm *kvm) { kvm_sys_regs_create_debugfs(kvm); + kvm_s2_ptdump_create_debugfs(kvm); } static void kvm_destroy_mpidr_data(struct kvm *kvm) @@ -821,15 +824,13 @@ int kvm_arch_vcpu_run_pid_change(struct kvm_vcpu *vcpu) return ret; } - if (vcpu_has_nv(vcpu)) { - ret = kvm_init_nv_sysregs(vcpu->kvm); - if (ret) - return ret; - } + ret = kvm_finalize_sys_regs(vcpu); + if (ret) + return ret; /* - * This needs to happen after NV has imposed its own restrictions on - * the feature set + * This needs to happen after any restriction has been applied + * to the feature set. */ kvm_calculate_traps(vcpu); diff --git a/arch/arm64/kvm/at.c b/arch/arm64/kvm/at.c new file mode 100644 index 000000000000..39f0e87a340e --- /dev/null +++ b/arch/arm64/kvm/at.c @@ -0,0 +1,1101 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2017 - Linaro Ltd + * Author: Jintack Lim <jintack.lim@linaro.org> + */ + +#include <linux/kvm_host.h> + +#include <asm/esr.h> +#include <asm/kvm_hyp.h> +#include <asm/kvm_mmu.h> + +enum trans_regime { + TR_EL10, + TR_EL20, + TR_EL2, +}; + +struct s1_walk_info { + u64 baddr; + enum trans_regime regime; + unsigned int max_oa_bits; + unsigned int pgshift; + unsigned int txsz; + int sl; + bool hpd; + bool be; + bool s2; +}; + +struct s1_walk_result { + union { + struct { + u64 desc; + u64 pa; + s8 level; + u8 APTable; + bool UXNTable; + bool PXNTable; + }; + struct { + u8 fst; + bool ptw; + bool s2; + }; + }; + bool failed; +}; + +static void fail_s1_walk(struct s1_walk_result *wr, u8 fst, bool ptw, bool s2) +{ + wr->fst = fst; + wr->ptw = ptw; + wr->s2 = s2; + wr->failed = true; +} + +#define S1_MMU_DISABLED (-127) + +static int get_ia_size(struct s1_walk_info *wi) +{ + return 64 - wi->txsz; +} + +/* Return true if the IPA is out of the OA range */ +static bool check_output_size(u64 ipa, struct s1_walk_info *wi) +{ + return wi->max_oa_bits < 48 && (ipa & GENMASK_ULL(47, wi->max_oa_bits)); +} + +/* Return the translation regime that applies to an AT instruction */ +static enum trans_regime compute_translation_regime(struct kvm_vcpu *vcpu, u32 op) +{ + /* + * We only get here from guest EL2, so the translation + * regime AT applies to is solely defined by {E2H,TGE}. + */ + switch (op) { + case OP_AT_S1E2R: + case OP_AT_S1E2W: + case OP_AT_S1E2A: + return vcpu_el2_e2h_is_set(vcpu) ? TR_EL20 : TR_EL2; + break; + default: + return (vcpu_el2_e2h_is_set(vcpu) && + vcpu_el2_tge_is_set(vcpu)) ? TR_EL20 : TR_EL10; + } +} + +static int setup_s1_walk(struct kvm_vcpu *vcpu, u32 op, struct s1_walk_info *wi, + struct s1_walk_result *wr, u64 va) +{ + u64 hcr, sctlr, tcr, tg, ps, ia_bits, ttbr; + unsigned int stride, x; + bool va55, tbi, lva, as_el0; + + hcr = __vcpu_sys_reg(vcpu, HCR_EL2); + + wi->regime = compute_translation_regime(vcpu, op); + as_el0 = (op == OP_AT_S1E0R || op == OP_AT_S1E0W); + + va55 = va & BIT(55); + + if (wi->regime == TR_EL2 && va55) + goto addrsz; + + wi->s2 = wi->regime == TR_EL10 && (hcr & (HCR_VM | HCR_DC)); + + switch (wi->regime) { + case TR_EL10: + sctlr = vcpu_read_sys_reg(vcpu, SCTLR_EL1); + tcr = vcpu_read_sys_reg(vcpu, TCR_EL1); + ttbr = (va55 ? + vcpu_read_sys_reg(vcpu, TTBR1_EL1) : + vcpu_read_sys_reg(vcpu, TTBR0_EL1)); + break; + case TR_EL2: + case TR_EL20: + sctlr = vcpu_read_sys_reg(vcpu, SCTLR_EL2); + tcr = vcpu_read_sys_reg(vcpu, TCR_EL2); + ttbr = (va55 ? + vcpu_read_sys_reg(vcpu, TTBR1_EL2) : + vcpu_read_sys_reg(vcpu, TTBR0_EL2)); + break; + default: + BUG(); + } + + tbi = (wi->regime == TR_EL2 ? + FIELD_GET(TCR_EL2_TBI, tcr) : + (va55 ? + FIELD_GET(TCR_TBI1, tcr) : + FIELD_GET(TCR_TBI0, tcr))); + + if (!tbi && (u64)sign_extend64(va, 55) != va) + goto addrsz; + + va = (u64)sign_extend64(va, 55); + + /* Let's put the MMU disabled case aside immediately */ + switch (wi->regime) { + case TR_EL10: + /* + * If dealing with the EL1&0 translation regime, 3 things + * can disable the S1 translation: + * + * - HCR_EL2.DC = 1 + * - HCR_EL2.{E2H,TGE} = {0,1} + * - SCTLR_EL1.M = 0 + * + * The TGE part is interesting. If we have decided that this + * is EL1&0, then it means that either {E2H,TGE} == {1,0} or + * {0,x}, and we only need to test for TGE == 1. + */ + if (hcr & (HCR_DC | HCR_TGE)) { + wr->level = S1_MMU_DISABLED; + break; + } + fallthrough; + case TR_EL2: + case TR_EL20: + if (!(sctlr & SCTLR_ELx_M)) + wr->level = S1_MMU_DISABLED; + break; + } + + if (wr->level == S1_MMU_DISABLED) { + if (va >= BIT(kvm_get_pa_bits(vcpu->kvm))) + goto addrsz; + + wr->pa = va; + return 0; + } + + wi->be = sctlr & SCTLR_ELx_EE; + + wi->hpd = kvm_has_feat(vcpu->kvm, ID_AA64MMFR1_EL1, HPDS, IMP); + wi->hpd &= (wi->regime == TR_EL2 ? + FIELD_GET(TCR_EL2_HPD, tcr) : + (va55 ? + FIELD_GET(TCR_HPD1, tcr) : + FIELD_GET(TCR_HPD0, tcr))); + + /* Someone was silly enough to encode TG0/TG1 differently */ + if (va55) { + wi->txsz = FIELD_GET(TCR_T1SZ_MASK, tcr); + tg = FIELD_GET(TCR_TG1_MASK, tcr); + + switch (tg << TCR_TG1_SHIFT) { + case TCR_TG1_4K: + wi->pgshift = 12; break; + case TCR_TG1_16K: + wi->pgshift = 14; break; + case TCR_TG1_64K: + default: /* IMPDEF: treat any other value as 64k */ + wi->pgshift = 16; break; + } + } else { + wi->txsz = FIELD_GET(TCR_T0SZ_MASK, tcr); + tg = FIELD_GET(TCR_TG0_MASK, tcr); + + switch (tg << TCR_TG0_SHIFT) { + case TCR_TG0_4K: + wi->pgshift = 12; break; + case TCR_TG0_16K: + wi->pgshift = 14; break; + case TCR_TG0_64K: + default: /* IMPDEF: treat any other value as 64k */ + wi->pgshift = 16; break; + } + } + + /* R_PLCGL, R_YXNYW */ + if (!kvm_has_feat_enum(vcpu->kvm, ID_AA64MMFR2_EL1, ST, 48_47)) { + if (wi->txsz > 39) + goto transfault_l0; + } else { + if (wi->txsz > 48 || (BIT(wi->pgshift) == SZ_64K && wi->txsz > 47)) + goto transfault_l0; + } + + /* R_GTJBY, R_SXWGM */ + switch (BIT(wi->pgshift)) { + case SZ_4K: + lva = kvm_has_feat(vcpu->kvm, ID_AA64MMFR0_EL1, TGRAN4, 52_BIT); + lva &= tcr & (wi->regime == TR_EL2 ? TCR_EL2_DS : TCR_DS); + break; + case SZ_16K: + lva = kvm_has_feat(vcpu->kvm, ID_AA64MMFR0_EL1, TGRAN16, 52_BIT); + lva &= tcr & (wi->regime == TR_EL2 ? TCR_EL2_DS : TCR_DS); + break; + case SZ_64K: + lva = kvm_has_feat(vcpu->kvm, ID_AA64MMFR2_EL1, VARange, 52); + break; + } + + if ((lva && wi->txsz < 12) || (!lva && wi->txsz < 16)) + goto transfault_l0; + + ia_bits = get_ia_size(wi); + + /* R_YYVYV, I_THCZK */ + if ((!va55 && va > GENMASK(ia_bits - 1, 0)) || + (va55 && va < GENMASK(63, ia_bits))) + goto transfault_l0; + + /* I_ZFSYQ */ + if (wi->regime != TR_EL2 && + (tcr & (va55 ? TCR_EPD1_MASK : TCR_EPD0_MASK))) + goto transfault_l0; + + /* R_BNDVG and following statements */ + if (kvm_has_feat(vcpu->kvm, ID_AA64MMFR2_EL1, E0PD, IMP) && + as_el0 && (tcr & (va55 ? TCR_E0PD1 : TCR_E0PD0))) + goto transfault_l0; + + /* AArch64.S1StartLevel() */ + stride = wi->pgshift - 3; + wi->sl = 3 - (((ia_bits - 1) - wi->pgshift) / stride); + + ps = (wi->regime == TR_EL2 ? + FIELD_GET(TCR_EL2_PS_MASK, tcr) : FIELD_GET(TCR_IPS_MASK, tcr)); + + wi->max_oa_bits = min(get_kvm_ipa_limit(), ps_to_output_size(ps)); + + /* Compute minimal alignment */ + x = 3 + ia_bits - ((3 - wi->sl) * stride + wi->pgshift); + + wi->baddr = ttbr & TTBRx_EL1_BADDR; + + /* R_VPBBF */ + if (check_output_size(wi->baddr, wi)) + goto addrsz; + + wi->baddr &= GENMASK_ULL(wi->max_oa_bits - 1, x); + + return 0; + +addrsz: /* Address Size Fault level 0 */ + fail_s1_walk(wr, ESR_ELx_FSC_ADDRSZ_L(0), false, false); + return -EFAULT; + +transfault_l0: /* Translation Fault level 0 */ + fail_s1_walk(wr, ESR_ELx_FSC_FAULT_L(0), false, false); + return -EFAULT; +} + +static int walk_s1(struct kvm_vcpu *vcpu, struct s1_walk_info *wi, + struct s1_walk_result *wr, u64 va) +{ + u64 va_top, va_bottom, baddr, desc; + int level, stride, ret; + + level = wi->sl; + stride = wi->pgshift - 3; + baddr = wi->baddr; + + va_top = get_ia_size(wi) - 1; + + while (1) { + u64 index, ipa; + + va_bottom = (3 - level) * stride + wi->pgshift; + index = (va & GENMASK_ULL(va_top, va_bottom)) >> (va_bottom - 3); + + ipa = baddr | index; + + if (wi->s2) { + struct kvm_s2_trans s2_trans = {}; + + ret = kvm_walk_nested_s2(vcpu, ipa, &s2_trans); + if (ret) { + fail_s1_walk(wr, + (s2_trans.esr & ~ESR_ELx_FSC_LEVEL) | level, + true, true); + return ret; + } + + if (!kvm_s2_trans_readable(&s2_trans)) { + fail_s1_walk(wr, ESR_ELx_FSC_PERM_L(level), + true, true); + + return -EPERM; + } + + ipa = kvm_s2_trans_output(&s2_trans); + } + + ret = kvm_read_guest(vcpu->kvm, ipa, &desc, sizeof(desc)); + if (ret) { + fail_s1_walk(wr, ESR_ELx_FSC_SEA_TTW(level), + true, false); + return ret; + } + + if (wi->be) + desc = be64_to_cpu((__force __be64)desc); + else + desc = le64_to_cpu((__force __le64)desc); + + /* Invalid descriptor */ + if (!(desc & BIT(0))) + goto transfault; + + /* Block mapping, check validity down the line */ + if (!(desc & BIT(1))) + break; + + /* Page mapping */ + if (level == 3) + break; + + /* Table handling */ + if (!wi->hpd) { + wr->APTable |= FIELD_GET(S1_TABLE_AP, desc); + wr->UXNTable |= FIELD_GET(PMD_TABLE_UXN, desc); + wr->PXNTable |= FIELD_GET(PMD_TABLE_PXN, desc); + } + + baddr = desc & GENMASK_ULL(47, wi->pgshift); + + /* Check for out-of-range OA */ + if (check_output_size(baddr, wi)) + goto addrsz; + + /* Prepare for next round */ + va_top = va_bottom - 1; + level++; + } + + /* Block mapping, check the validity of the level */ + if (!(desc & BIT(1))) { + bool valid_block = false; + + switch (BIT(wi->pgshift)) { + case SZ_4K: + valid_block = level == 1 || level == 2; + break; + case SZ_16K: + case SZ_64K: + valid_block = level == 2; + break; + } + + if (!valid_block) + goto transfault; + } + + if (check_output_size(desc & GENMASK(47, va_bottom), wi)) + goto addrsz; + + va_bottom += contiguous_bit_shift(desc, wi, level); + + wr->failed = false; + wr->level = level; + wr->desc = desc; + wr->pa = desc & GENMASK(47, va_bottom); + wr->pa |= va & GENMASK_ULL(va_bottom - 1, 0); + + return 0; + +addrsz: + fail_s1_walk(wr, ESR_ELx_FSC_ADDRSZ_L(level), true, false); + return -EINVAL; +transfault: + fail_s1_walk(wr, ESR_ELx_FSC_FAULT_L(level), true, false); + return -ENOENT; +} + +struct mmu_config { + u64 ttbr0; + u64 ttbr1; + u64 tcr; + u64 mair; + u64 sctlr; + u64 vttbr; + u64 vtcr; + u64 hcr; +}; + +static void __mmu_config_save(struct mmu_config *config) +{ + config->ttbr0 = read_sysreg_el1(SYS_TTBR0); + config->ttbr1 = read_sysreg_el1(SYS_TTBR1); + config->tcr = read_sysreg_el1(SYS_TCR); + config->mair = read_sysreg_el1(SYS_MAIR); + config->sctlr = read_sysreg_el1(SYS_SCTLR); + config->vttbr = read_sysreg(vttbr_el2); + config->vtcr = read_sysreg(vtcr_el2); + config->hcr = read_sysreg(hcr_el2); +} + +static void __mmu_config_restore(struct mmu_config *config) +{ + write_sysreg(config->hcr, hcr_el2); + + /* + * ARM errata 1165522 and 1530923 require TGE to be 1 before + * we update the guest state. + */ + asm(ALTERNATIVE("nop", "isb", ARM64_WORKAROUND_SPECULATIVE_AT)); + + write_sysreg_el1(config->ttbr0, SYS_TTBR0); + write_sysreg_el1(config->ttbr1, SYS_TTBR1); + write_sysreg_el1(config->tcr, SYS_TCR); + write_sysreg_el1(config->mair, SYS_MAIR); + write_sysreg_el1(config->sctlr, SYS_SCTLR); + write_sysreg(config->vttbr, vttbr_el2); + write_sysreg(config->vtcr, vtcr_el2); +} + +static bool at_s1e1p_fast(struct kvm_vcpu *vcpu, u32 op, u64 vaddr) +{ + u64 host_pan; + bool fail; + + host_pan = read_sysreg_s(SYS_PSTATE_PAN); + write_sysreg_s(*vcpu_cpsr(vcpu) & PSTATE_PAN, SYS_PSTATE_PAN); + + switch (op) { + case OP_AT_S1E1RP: + fail = __kvm_at(OP_AT_S1E1RP, vaddr); + break; + case OP_AT_S1E1WP: + fail = __kvm_at(OP_AT_S1E1WP, vaddr); + break; + } + + write_sysreg_s(host_pan, SYS_PSTATE_PAN); + + return fail; +} + +#define MEMATTR(ic, oc) (MEMATTR_##oc << 4 | MEMATTR_##ic) +#define MEMATTR_NC 0b0100 +#define MEMATTR_Wt 0b1000 +#define MEMATTR_Wb 0b1100 +#define MEMATTR_WbRaWa 0b1111 + +#define MEMATTR_IS_DEVICE(m) (((m) & GENMASK(7, 4)) == 0) + +static u8 s2_memattr_to_attr(u8 memattr) +{ + memattr &= 0b1111; + + switch (memattr) { + case 0b0000: + case 0b0001: + case 0b0010: + case 0b0011: + return memattr << 2; + case 0b0100: + return MEMATTR(Wb, Wb); + case 0b0101: + return MEMATTR(NC, NC); + case 0b0110: + return MEMATTR(Wt, NC); + case 0b0111: + return MEMATTR(Wb, NC); + case 0b1000: + /* Reserved, assume NC */ + return MEMATTR(NC, NC); + case 0b1001: + return MEMATTR(NC, Wt); + case 0b1010: + return MEMATTR(Wt, Wt); + case 0b1011: + return MEMATTR(Wb, Wt); + case 0b1100: + /* Reserved, assume NC */ + return MEMATTR(NC, NC); + case 0b1101: + return MEMATTR(NC, Wb); + case 0b1110: + return MEMATTR(Wt, Wb); + case 0b1111: + return MEMATTR(Wb, Wb); + default: + unreachable(); + } +} + +static u8 combine_s1_s2_attr(u8 s1, u8 s2) +{ + bool transient; + u8 final = 0; + + /* Upgrade transient s1 to non-transient to simplify things */ + switch (s1) { + case 0b0001 ... 0b0011: /* Normal, Write-Through Transient */ + transient = true; + s1 = MEMATTR_Wt | (s1 & GENMASK(1,0)); + break; + case 0b0101 ... 0b0111: /* Normal, Write-Back Transient */ + transient = true; + s1 = MEMATTR_Wb | (s1 & GENMASK(1,0)); + break; + default: + transient = false; + } + + /* S2CombineS1AttrHints() */ + if ((s1 & GENMASK(3, 2)) == MEMATTR_NC || + (s2 & GENMASK(3, 2)) == MEMATTR_NC) + final = MEMATTR_NC; + else if ((s1 & GENMASK(3, 2)) == MEMATTR_Wt || + (s2 & GENMASK(3, 2)) == MEMATTR_Wt) + final = MEMATTR_Wt; + else + final = MEMATTR_Wb; + + if (final != MEMATTR_NC) { + /* Inherit RaWa hints form S1 */ + if (transient) { + switch (s1 & GENMASK(3, 2)) { + case MEMATTR_Wt: + final = 0; + break; + case MEMATTR_Wb: + final = MEMATTR_NC; + break; + } + } + + final |= s1 & GENMASK(1, 0); + } + + return final; +} + +#define ATTR_NSH 0b00 +#define ATTR_RSV 0b01 +#define ATTR_OSH 0b10 +#define ATTR_ISH 0b11 + +static u8 compute_sh(u8 attr, u64 desc) +{ + u8 sh; + + /* Any form of device, as well as NC has SH[1:0]=0b10 */ + if (MEMATTR_IS_DEVICE(attr) || attr == MEMATTR(NC, NC)) + return ATTR_OSH; + + sh = FIELD_GET(PTE_SHARED, desc); + if (sh == ATTR_RSV) /* Reserved, mapped to NSH */ + sh = ATTR_NSH; + + return sh; +} + +static u8 combine_sh(u8 s1_sh, u8 s2_sh) +{ + if (s1_sh == ATTR_OSH || s2_sh == ATTR_OSH) + return ATTR_OSH; + if (s1_sh == ATTR_ISH || s2_sh == ATTR_ISH) + return ATTR_ISH; + + return ATTR_NSH; +} + +static u64 compute_par_s12(struct kvm_vcpu *vcpu, u64 s1_par, + struct kvm_s2_trans *tr) +{ + u8 s1_parattr, s2_memattr, final_attr; + u64 par; + + /* If S2 has failed to translate, report the damage */ + if (tr->esr) { + par = SYS_PAR_EL1_RES1; + par |= SYS_PAR_EL1_F; + par |= SYS_PAR_EL1_S; + par |= FIELD_PREP(SYS_PAR_EL1_FST, tr->esr); + return par; + } + + s1_parattr = FIELD_GET(SYS_PAR_EL1_ATTR, s1_par); + s2_memattr = FIELD_GET(GENMASK(5, 2), tr->desc); + + if (__vcpu_sys_reg(vcpu, HCR_EL2) & HCR_FWB) { + if (!kvm_has_feat(vcpu->kvm, ID_AA64PFR2_EL1, MTEPERM, IMP)) + s2_memattr &= ~BIT(3); + + /* Combination of R_VRJSW and R_RHWZM */ + switch (s2_memattr) { + case 0b0101: + if (MEMATTR_IS_DEVICE(s1_parattr)) + final_attr = s1_parattr; + else + final_attr = MEMATTR(NC, NC); + break; + case 0b0110: + case 0b1110: + final_attr = MEMATTR(WbRaWa, WbRaWa); + break; + case 0b0111: + case 0b1111: + /* Preserve S1 attribute */ + final_attr = s1_parattr; + break; + case 0b0100: + case 0b1100: + case 0b1101: + /* Reserved, do something non-silly */ + final_attr = s1_parattr; + break; + default: + /* MemAttr[2]=0, Device from S2 */ + final_attr = s2_memattr & GENMASK(1,0) << 2; + } + } else { + /* Combination of R_HMNDG, R_TNHFM and R_GQFSF */ + u8 s2_parattr = s2_memattr_to_attr(s2_memattr); + + if (MEMATTR_IS_DEVICE(s1_parattr) || + MEMATTR_IS_DEVICE(s2_parattr)) { + final_attr = min(s1_parattr, s2_parattr); + } else { + /* At this stage, this is memory vs memory */ + final_attr = combine_s1_s2_attr(s1_parattr & 0xf, + s2_parattr & 0xf); + final_attr |= combine_s1_s2_attr(s1_parattr >> 4, + s2_parattr >> 4) << 4; + } + } + + if ((__vcpu_sys_reg(vcpu, HCR_EL2) & HCR_CD) && + !MEMATTR_IS_DEVICE(final_attr)) + final_attr = MEMATTR(NC, NC); + + par = FIELD_PREP(SYS_PAR_EL1_ATTR, final_attr); + par |= tr->output & GENMASK(47, 12); + par |= FIELD_PREP(SYS_PAR_EL1_SH, + combine_sh(FIELD_GET(SYS_PAR_EL1_SH, s1_par), + compute_sh(final_attr, tr->desc))); + + return par; +} + +static u64 compute_par_s1(struct kvm_vcpu *vcpu, struct s1_walk_result *wr, + enum trans_regime regime) +{ + u64 par; + + if (wr->failed) { + par = SYS_PAR_EL1_RES1; + par |= SYS_PAR_EL1_F; + par |= FIELD_PREP(SYS_PAR_EL1_FST, wr->fst); + par |= wr->ptw ? SYS_PAR_EL1_PTW : 0; + par |= wr->s2 ? SYS_PAR_EL1_S : 0; + } else if (wr->level == S1_MMU_DISABLED) { + /* MMU off or HCR_EL2.DC == 1 */ + par = SYS_PAR_EL1_NSE; + par |= wr->pa & GENMASK_ULL(47, 12); + + if (regime == TR_EL10 && + (__vcpu_sys_reg(vcpu, HCR_EL2) & HCR_DC)) { + par |= FIELD_PREP(SYS_PAR_EL1_ATTR, + MEMATTR(WbRaWa, WbRaWa)); + par |= FIELD_PREP(SYS_PAR_EL1_SH, ATTR_NSH); + } else { + par |= FIELD_PREP(SYS_PAR_EL1_ATTR, 0); /* nGnRnE */ + par |= FIELD_PREP(SYS_PAR_EL1_SH, ATTR_OSH); + } + } else { + u64 mair, sctlr; + u8 sh; + + par = SYS_PAR_EL1_NSE; + + mair = (regime == TR_EL10 ? + vcpu_read_sys_reg(vcpu, MAIR_EL1) : + vcpu_read_sys_reg(vcpu, MAIR_EL2)); + + mair >>= FIELD_GET(PTE_ATTRINDX_MASK, wr->desc) * 8; + mair &= 0xff; + + sctlr = (regime == TR_EL10 ? + vcpu_read_sys_reg(vcpu, SCTLR_EL1) : + vcpu_read_sys_reg(vcpu, SCTLR_EL2)); + + /* Force NC for memory if SCTLR_ELx.C is clear */ + if (!(sctlr & SCTLR_EL1_C) && !MEMATTR_IS_DEVICE(mair)) + mair = MEMATTR(NC, NC); + + par |= FIELD_PREP(SYS_PAR_EL1_ATTR, mair); + par |= wr->pa & GENMASK_ULL(47, 12); + + sh = compute_sh(mair, wr->desc); + par |= FIELD_PREP(SYS_PAR_EL1_SH, sh); + } + + return par; +} + +static bool pan3_enabled(struct kvm_vcpu *vcpu, enum trans_regime regime) +{ + u64 sctlr; + + if (!kvm_has_feat(vcpu->kvm, ID_AA64MMFR1_EL1, PAN, PAN3)) + return false; + + if (regime == TR_EL10) + sctlr = vcpu_read_sys_reg(vcpu, SCTLR_EL1); + else + sctlr = vcpu_read_sys_reg(vcpu, SCTLR_EL2); + + return sctlr & SCTLR_EL1_EPAN; +} + +static u64 handle_at_slow(struct kvm_vcpu *vcpu, u32 op, u64 vaddr) +{ + bool perm_fail, ur, uw, ux, pr, pw, px; + struct s1_walk_result wr = {}; + struct s1_walk_info wi = {}; + int ret, idx; + + ret = setup_s1_walk(vcpu, op, &wi, &wr, vaddr); + if (ret) + goto compute_par; + + if (wr.level == S1_MMU_DISABLED) + goto compute_par; + + idx = srcu_read_lock(&vcpu->kvm->srcu); + + ret = walk_s1(vcpu, &wi, &wr, vaddr); + + srcu_read_unlock(&vcpu->kvm->srcu, idx); + + if (ret) + goto compute_par; + + /* FIXME: revisit when adding indirect permission support */ + /* AArch64.S1DirectBasePermissions() */ + if (wi.regime != TR_EL2) { + switch (FIELD_GET(PTE_USER | PTE_RDONLY, wr.desc)) { + case 0b00: + pr = pw = true; + ur = uw = false; + break; + case 0b01: + pr = pw = ur = uw = true; + break; + case 0b10: + pr = true; + pw = ur = uw = false; + break; + case 0b11: + pr = ur = true; + pw = uw = false; + break; + } + + switch (wr.APTable) { + case 0b00: + break; + case 0b01: + ur = uw = false; + break; + case 0b10: + pw = uw = false; + break; + case 0b11: + pw = ur = uw = false; + break; + } + + /* We don't use px for anything yet, but hey... */ + px = !((wr.desc & PTE_PXN) || wr.PXNTable || uw); + ux = !((wr.desc & PTE_UXN) || wr.UXNTable); + + if (op == OP_AT_S1E1RP || op == OP_AT_S1E1WP) { + bool pan; + + pan = *vcpu_cpsr(vcpu) & PSR_PAN_BIT; + pan &= ur || uw || (pan3_enabled(vcpu, wi.regime) && ux); + pw &= !pan; + pr &= !pan; + } + } else { + ur = uw = ux = false; + + if (!(wr.desc & PTE_RDONLY)) { + pr = pw = true; + } else { + pr = true; + pw = false; + } + + if (wr.APTable & BIT(1)) + pw = false; + + /* XN maps to UXN */ + px = !((wr.desc & PTE_UXN) || wr.UXNTable); + } + + perm_fail = false; + + switch (op) { + case OP_AT_S1E1RP: + case OP_AT_S1E1R: + case OP_AT_S1E2R: + perm_fail = !pr; + break; + case OP_AT_S1E1WP: + case OP_AT_S1E1W: + case OP_AT_S1E2W: + perm_fail = !pw; + break; + case OP_AT_S1E0R: + perm_fail = !ur; + break; + case OP_AT_S1E0W: + perm_fail = !uw; + break; + case OP_AT_S1E1A: + case OP_AT_S1E2A: + break; + default: + BUG(); + } + + if (perm_fail) + fail_s1_walk(&wr, ESR_ELx_FSC_PERM_L(wr.level), false, false); + +compute_par: + return compute_par_s1(vcpu, &wr, wi.regime); +} + +/* + * Return the PAR_EL1 value as the result of a valid translation. + * + * If the translation is unsuccessful, the value may only contain + * PAR_EL1.F, and cannot be taken at face value. It isn't an + * indication of the translation having failed, only that the fast + * path did not succeed, *unless* it indicates a S1 permission fault. + */ +static u64 __kvm_at_s1e01_fast(struct kvm_vcpu *vcpu, u32 op, u64 vaddr) +{ + struct mmu_config config; + struct kvm_s2_mmu *mmu; + bool fail; + u64 par; + + par = SYS_PAR_EL1_F; + + /* + * We've trapped, so everything is live on the CPU. As we will + * be switching contexts behind everybody's back, disable + * interrupts while holding the mmu lock. + */ + guard(write_lock_irqsave)(&vcpu->kvm->mmu_lock); + + /* + * If HCR_EL2.{E2H,TGE} == {1,1}, the MMU context is already + * the right one (as we trapped from vEL2). If not, save the + * full MMU context. + */ + if (vcpu_el2_e2h_is_set(vcpu) && vcpu_el2_tge_is_set(vcpu)) + goto skip_mmu_switch; + + /* + * Obtaining the S2 MMU for a L2 is horribly racy, and we may not + * find it (recycled by another vcpu, for example). When this + * happens, admit defeat immediately and use the SW (slow) path. + */ + mmu = lookup_s2_mmu(vcpu); + if (!mmu) + return par; + + __mmu_config_save(&config); + + write_sysreg_el1(vcpu_read_sys_reg(vcpu, TTBR0_EL1), SYS_TTBR0); + write_sysreg_el1(vcpu_read_sys_reg(vcpu, TTBR1_EL1), SYS_TTBR1); + write_sysreg_el1(vcpu_read_sys_reg(vcpu, TCR_EL1), SYS_TCR); + write_sysreg_el1(vcpu_read_sys_reg(vcpu, MAIR_EL1), SYS_MAIR); + write_sysreg_el1(vcpu_read_sys_reg(vcpu, SCTLR_EL1), SYS_SCTLR); + __load_stage2(mmu, mmu->arch); + +skip_mmu_switch: + /* Clear TGE, enable S2 translation, we're rolling */ + write_sysreg((config.hcr & ~HCR_TGE) | HCR_VM, hcr_el2); + isb(); + + switch (op) { + case OP_AT_S1E1RP: + case OP_AT_S1E1WP: + fail = at_s1e1p_fast(vcpu, op, vaddr); + break; + case OP_AT_S1E1R: + fail = __kvm_at(OP_AT_S1E1R, vaddr); + break; + case OP_AT_S1E1W: + fail = __kvm_at(OP_AT_S1E1W, vaddr); + break; + case OP_AT_S1E0R: + fail = __kvm_at(OP_AT_S1E0R, vaddr); + break; + case OP_AT_S1E0W: + fail = __kvm_at(OP_AT_S1E0W, vaddr); + break; + case OP_AT_S1E1A: + fail = __kvm_at(OP_AT_S1E1A, vaddr); + break; + default: + WARN_ON_ONCE(1); + fail = true; + break; + } + + if (!fail) + par = read_sysreg_par(); + + if (!(vcpu_el2_e2h_is_set(vcpu) && vcpu_el2_tge_is_set(vcpu))) + __mmu_config_restore(&config); + + return par; +} + +static bool par_check_s1_perm_fault(u64 par) +{ + u8 fst = FIELD_GET(SYS_PAR_EL1_FST, par); + + return ((fst & ESR_ELx_FSC_TYPE) == ESR_ELx_FSC_PERM && + !(par & SYS_PAR_EL1_S)); +} + +void __kvm_at_s1e01(struct kvm_vcpu *vcpu, u32 op, u64 vaddr) +{ + u64 par = __kvm_at_s1e01_fast(vcpu, op, vaddr); + + /* + * If PAR_EL1 reports that AT failed on a S1 permission fault, we + * know for sure that the PTW was able to walk the S1 tables and + * there's nothing else to do. + * + * If AT failed for any other reason, then we must walk the guest S1 + * to emulate the instruction. + */ + if ((par & SYS_PAR_EL1_F) && !par_check_s1_perm_fault(par)) + par = handle_at_slow(vcpu, op, vaddr); + + vcpu_write_sys_reg(vcpu, par, PAR_EL1); +} + +void __kvm_at_s1e2(struct kvm_vcpu *vcpu, u32 op, u64 vaddr) +{ + u64 par; + + /* + * We've trapped, so everything is live on the CPU. As we will be + * switching context behind everybody's back, disable interrupts... + */ + scoped_guard(write_lock_irqsave, &vcpu->kvm->mmu_lock) { + struct kvm_s2_mmu *mmu; + u64 val, hcr; + bool fail; + + mmu = &vcpu->kvm->arch.mmu; + + val = hcr = read_sysreg(hcr_el2); + val &= ~HCR_TGE; + val |= HCR_VM; + + if (!vcpu_el2_e2h_is_set(vcpu)) + val |= HCR_NV | HCR_NV1; + + write_sysreg(val, hcr_el2); + isb(); + + par = SYS_PAR_EL1_F; + + switch (op) { + case OP_AT_S1E2R: + fail = __kvm_at(OP_AT_S1E1R, vaddr); + break; + case OP_AT_S1E2W: + fail = __kvm_at(OP_AT_S1E1W, vaddr); + break; + case OP_AT_S1E2A: + fail = __kvm_at(OP_AT_S1E1A, vaddr); + break; + default: + WARN_ON_ONCE(1); + fail = true; + } + + isb(); + + if (!fail) + par = read_sysreg_par(); + + write_sysreg(hcr, hcr_el2); + isb(); + } + + /* We failed the translation, let's replay it in slow motion */ + if ((par & SYS_PAR_EL1_F) && !par_check_s1_perm_fault(par)) + par = handle_at_slow(vcpu, op, vaddr); + + vcpu_write_sys_reg(vcpu, par, PAR_EL1); +} + +void __kvm_at_s12(struct kvm_vcpu *vcpu, u32 op, u64 vaddr) +{ + struct kvm_s2_trans out = {}; + u64 ipa, par; + bool write; + int ret; + + /* Do the stage-1 translation */ + switch (op) { + case OP_AT_S12E1R: + op = OP_AT_S1E1R; + write = false; + break; + case OP_AT_S12E1W: + op = OP_AT_S1E1W; + write = true; + break; + case OP_AT_S12E0R: + op = OP_AT_S1E0R; + write = false; + break; + case OP_AT_S12E0W: + op = OP_AT_S1E0W; + write = true; + break; + default: + WARN_ON_ONCE(1); + return; + } + + __kvm_at_s1e01(vcpu, op, vaddr); + par = vcpu_read_sys_reg(vcpu, PAR_EL1); + if (par & SYS_PAR_EL1_F) + return; + + /* + * If we only have a single stage of translation (E2H=0 or + * TGE=1), exit early. Same thing if {VM,DC}=={0,0}. + */ + if (!vcpu_el2_e2h_is_set(vcpu) || vcpu_el2_tge_is_set(vcpu) || + !(vcpu_read_sys_reg(vcpu, HCR_EL2) & (HCR_VM | HCR_DC))) + return; + + /* Do the stage-2 translation */ + ipa = (par & GENMASK_ULL(47, 12)) | (vaddr & GENMASK_ULL(11, 0)); + out.esr = 0; + ret = kvm_walk_nested_s2(vcpu, ipa, &out); + if (ret < 0) + return; + + /* Check the access permission */ + if (!out.esr && + ((!write && !out.readable) || (write && !out.writable))) + out.esr = ESR_ELx_FSC_PERM_L(out.level & 0x3); + + par = compute_par_s12(vcpu, par, &out); + vcpu_write_sys_reg(vcpu, par, PAR_EL1); +} diff --git a/arch/arm64/kvm/emulate-nested.c b/arch/arm64/kvm/emulate-nested.c index 05166eccea0a..05b6435d02a9 100644 --- a/arch/arm64/kvm/emulate-nested.c +++ b/arch/arm64/kvm/emulate-nested.c @@ -83,14 +83,20 @@ enum cgt_group_id { CGT_CPTR_TAM, CGT_CPTR_TCPAC, + CGT_HCRX_EnFPM, CGT_HCRX_TCR2En, + CGT_ICH_HCR_TC, + CGT_ICH_HCR_TALL0, + CGT_ICH_HCR_TALL1, + CGT_ICH_HCR_TDIR, + /* * Anything after this point is a combination of coarse trap * controls, which must all be evaluated to decide what to do. */ __MULTIPLE_CONTROL_BITS__, - CGT_HCR_IMO_FMO = __MULTIPLE_CONTROL_BITS__, + CGT_HCR_IMO_FMO_ICH_HCR_TC = __MULTIPLE_CONTROL_BITS__, CGT_HCR_TID2_TID4, CGT_HCR_TTLB_TTLBIS, CGT_HCR_TTLB_TTLBOS, @@ -105,6 +111,8 @@ enum cgt_group_id { CGT_MDCR_TDE_TDRA, CGT_MDCR_TDCC_TDE_TDA, + CGT_ICH_HCR_TC_TDIR, + /* * Anything after this point requires a callback evaluating a * complex trap condition. Ugly stuff. @@ -372,12 +380,42 @@ static const struct trap_bits coarse_trap_bits[] = { .mask = CPTR_EL2_TCPAC, .behaviour = BEHAVE_FORWARD_ANY, }, + [CGT_HCRX_EnFPM] = { + .index = HCRX_EL2, + .value = 0, + .mask = HCRX_EL2_EnFPM, + .behaviour = BEHAVE_FORWARD_ANY, + }, [CGT_HCRX_TCR2En] = { .index = HCRX_EL2, .value = 0, .mask = HCRX_EL2_TCR2En, .behaviour = BEHAVE_FORWARD_ANY, }, + [CGT_ICH_HCR_TC] = { + .index = ICH_HCR_EL2, + .value = ICH_HCR_TC, + .mask = ICH_HCR_TC, + .behaviour = BEHAVE_FORWARD_ANY, + }, + [CGT_ICH_HCR_TALL0] = { + .index = ICH_HCR_EL2, + .value = ICH_HCR_TALL0, + .mask = ICH_HCR_TALL0, + .behaviour = BEHAVE_FORWARD_ANY, + }, + [CGT_ICH_HCR_TALL1] = { + .index = ICH_HCR_EL2, + .value = ICH_HCR_TALL1, + .mask = ICH_HCR_TALL1, + .behaviour = BEHAVE_FORWARD_ANY, + }, + [CGT_ICH_HCR_TDIR] = { + .index = ICH_HCR_EL2, + .value = ICH_HCR_TDIR, + .mask = ICH_HCR_TDIR, + .behaviour = BEHAVE_FORWARD_ANY, + }, }; #define MCB(id, ...) \ @@ -387,7 +425,6 @@ static const struct trap_bits coarse_trap_bits[] = { } static const enum cgt_group_id *coarse_control_combo[] = { - MCB(CGT_HCR_IMO_FMO, CGT_HCR_IMO, CGT_HCR_FMO), MCB(CGT_HCR_TID2_TID4, CGT_HCR_TID2, CGT_HCR_TID4), MCB(CGT_HCR_TTLB_TTLBIS, CGT_HCR_TTLB, CGT_HCR_TTLBIS), MCB(CGT_HCR_TTLB_TTLBOS, CGT_HCR_TTLB, CGT_HCR_TTLBOS), @@ -402,6 +439,9 @@ static const enum cgt_group_id *coarse_control_combo[] = { MCB(CGT_MDCR_TDE_TDOSA, CGT_MDCR_TDE, CGT_MDCR_TDOSA), MCB(CGT_MDCR_TDE_TDRA, CGT_MDCR_TDE, CGT_MDCR_TDRA), MCB(CGT_MDCR_TDCC_TDE_TDA, CGT_MDCR_TDCC, CGT_MDCR_TDE, CGT_MDCR_TDA), + + MCB(CGT_HCR_IMO_FMO_ICH_HCR_TC, CGT_HCR_IMO, CGT_HCR_FMO, CGT_ICH_HCR_TC), + MCB(CGT_ICH_HCR_TC_TDIR, CGT_ICH_HCR_TC, CGT_ICH_HCR_TDIR), }; typedef enum trap_behaviour (*complex_condition_check)(struct kvm_vcpu *); @@ -536,9 +576,9 @@ static const struct encoding_to_trap_config encoding_to_cgt[] __initconst = { SR_TRAP(SYS_CSSELR_EL1, CGT_HCR_TID2_TID4), SR_RANGE_TRAP(SYS_ID_PFR0_EL1, sys_reg(3, 0, 0, 7, 7), CGT_HCR_TID3), - SR_TRAP(SYS_ICC_SGI0R_EL1, CGT_HCR_IMO_FMO), - SR_TRAP(SYS_ICC_ASGI1R_EL1, CGT_HCR_IMO_FMO), - SR_TRAP(SYS_ICC_SGI1R_EL1, CGT_HCR_IMO_FMO), + SR_TRAP(SYS_ICC_SGI0R_EL1, CGT_HCR_IMO_FMO_ICH_HCR_TC), + SR_TRAP(SYS_ICC_ASGI1R_EL1, CGT_HCR_IMO_FMO_ICH_HCR_TC), + SR_TRAP(SYS_ICC_SGI1R_EL1, CGT_HCR_IMO_FMO_ICH_HCR_TC), SR_RANGE_TRAP(sys_reg(3, 0, 11, 0, 0), sys_reg(3, 0, 11, 15, 7), CGT_HCR_TIDCP), SR_RANGE_TRAP(sys_reg(3, 1, 11, 0, 0), @@ -786,6 +826,7 @@ static const struct encoding_to_trap_config encoding_to_cgt[] __initconst = { SR_TRAP(OP_AT_S12E1W, CGT_HCR_NV), SR_TRAP(OP_AT_S12E0R, CGT_HCR_NV), SR_TRAP(OP_AT_S12E0W, CGT_HCR_NV), + SR_TRAP(OP_AT_S1E2A, CGT_HCR_NV), SR_TRAP(OP_TLBI_IPAS2E1, CGT_HCR_NV), SR_TRAP(OP_TLBI_RIPAS2E1, CGT_HCR_NV), SR_TRAP(OP_TLBI_IPAS2LE1, CGT_HCR_NV), @@ -867,6 +908,7 @@ static const struct encoding_to_trap_config encoding_to_cgt[] __initconst = { SR_TRAP(OP_AT_S1E0W, CGT_HCR_AT), SR_TRAP(OP_AT_S1E1RP, CGT_HCR_AT), SR_TRAP(OP_AT_S1E1WP, CGT_HCR_AT), + SR_TRAP(OP_AT_S1E1A, CGT_HCR_AT), SR_TRAP(SYS_ERXPFGF_EL1, CGT_HCR_nFIEN), SR_TRAP(SYS_ERXPFGCTL_EL1, CGT_HCR_nFIEN), SR_TRAP(SYS_ERXPFGCDN_EL1, CGT_HCR_nFIEN), @@ -1108,6 +1150,35 @@ static const struct encoding_to_trap_config encoding_to_cgt[] __initconst = { SR_TRAP(SYS_CNTP_CTL_EL0, CGT_CNTHCTL_EL1PTEN), SR_TRAP(SYS_CNTPCT_EL0, CGT_CNTHCTL_EL1PCTEN), SR_TRAP(SYS_CNTPCTSS_EL0, CGT_CNTHCTL_EL1PCTEN), + SR_TRAP(SYS_FPMR, CGT_HCRX_EnFPM), + /* + * IMPDEF choice: + * We treat ICC_SRE_EL2.{SRE,Enable) and ICV_SRE_EL1.SRE as + * RAO/WI. We therefore never consider ICC_SRE_EL2.Enable for + * ICC_SRE_EL1 access, and always handle it locally. + */ + SR_TRAP(SYS_ICC_AP0R0_EL1, CGT_ICH_HCR_TALL0), + SR_TRAP(SYS_ICC_AP0R1_EL1, CGT_ICH_HCR_TALL0), + SR_TRAP(SYS_ICC_AP0R2_EL1, CGT_ICH_HCR_TALL0), + SR_TRAP(SYS_ICC_AP0R3_EL1, CGT_ICH_HCR_TALL0), + SR_TRAP(SYS_ICC_AP1R0_EL1, CGT_ICH_HCR_TALL1), + SR_TRAP(SYS_ICC_AP1R1_EL1, CGT_ICH_HCR_TALL1), + SR_TRAP(SYS_ICC_AP1R2_EL1, CGT_ICH_HCR_TALL1), + SR_TRAP(SYS_ICC_AP1R3_EL1, CGT_ICH_HCR_TALL1), + SR_TRAP(SYS_ICC_BPR0_EL1, CGT_ICH_HCR_TALL0), + SR_TRAP(SYS_ICC_BPR1_EL1, CGT_ICH_HCR_TALL1), + SR_TRAP(SYS_ICC_CTLR_EL1, CGT_ICH_HCR_TC), + SR_TRAP(SYS_ICC_DIR_EL1, CGT_ICH_HCR_TC_TDIR), + SR_TRAP(SYS_ICC_EOIR0_EL1, CGT_ICH_HCR_TALL0), + SR_TRAP(SYS_ICC_EOIR1_EL1, CGT_ICH_HCR_TALL1), + SR_TRAP(SYS_ICC_HPPIR0_EL1, CGT_ICH_HCR_TALL0), + SR_TRAP(SYS_ICC_HPPIR1_EL1, CGT_ICH_HCR_TALL1), + SR_TRAP(SYS_ICC_IAR0_EL1, CGT_ICH_HCR_TALL0), + SR_TRAP(SYS_ICC_IAR1_EL1, CGT_ICH_HCR_TALL1), + SR_TRAP(SYS_ICC_IGRPEN0_EL1, CGT_ICH_HCR_TALL0), + SR_TRAP(SYS_ICC_IGRPEN1_EL1, CGT_ICH_HCR_TALL1), + SR_TRAP(SYS_ICC_PMR_EL1, CGT_ICH_HCR_TC), + SR_TRAP(SYS_ICC_RPR_EL1, CGT_ICH_HCR_TC), }; static DEFINE_XARRAY(sr_forward_xa); diff --git a/arch/arm64/kvm/fpsimd.c b/arch/arm64/kvm/fpsimd.c index c53e5b14038d..ea5484ce1f3b 100644 --- a/arch/arm64/kvm/fpsimd.c +++ b/arch/arm64/kvm/fpsimd.c @@ -63,6 +63,7 @@ void kvm_arch_vcpu_load_fp(struct kvm_vcpu *vcpu) */ *host_data_ptr(fp_owner) = FP_STATE_HOST_OWNED; *host_data_ptr(fpsimd_state) = kern_hyp_va(¤t->thread.uw.fpsimd_state); + *host_data_ptr(fpmr_ptr) = kern_hyp_va(¤t->thread.uw.fpmr); vcpu_clear_flag(vcpu, HOST_SVE_ENABLED); if (read_sysreg(cpacr_el1) & CPACR_EL1_ZEN_EL0EN) @@ -134,8 +135,8 @@ void kvm_arch_vcpu_ctxsync_fp(struct kvm_vcpu *vcpu) fp_state.sve_state = vcpu->arch.sve_state; fp_state.sve_vl = vcpu->arch.sve_max_vl; fp_state.sme_state = NULL; - fp_state.svcr = &vcpu->arch.svcr; - fp_state.fpmr = &vcpu->arch.fpmr; + fp_state.svcr = &__vcpu_sys_reg(vcpu, SVCR); + fp_state.fpmr = &__vcpu_sys_reg(vcpu, FPMR); fp_state.fp_type = &vcpu->arch.fp_type; if (vcpu_has_sve(vcpu)) diff --git a/arch/arm64/kvm/guest.c b/arch/arm64/kvm/guest.c index 11098eb7eb44..962f985977c2 100644 --- a/arch/arm64/kvm/guest.c +++ b/arch/arm64/kvm/guest.c @@ -1045,6 +1045,11 @@ int kvm_vm_ioctl_mte_copy_tags(struct kvm *kvm, mutex_lock(&kvm->slots_lock); + if (write && atomic_read(&kvm->nr_memslots_dirty_logging)) { + ret = -EBUSY; + goto out; + } + while (length > 0) { kvm_pfn_t pfn = gfn_to_pfn_prot(kvm, gfn, write, NULL); void *maddr; @@ -1059,6 +1064,7 @@ int kvm_vm_ioctl_mte_copy_tags(struct kvm *kvm, page = pfn_to_online_page(pfn); if (!page) { /* Reject ZONE_DEVICE memory */ + kvm_release_pfn_clean(pfn); ret = -EFAULT; goto out; } diff --git a/arch/arm64/kvm/hyp/include/hyp/fault.h b/arch/arm64/kvm/hyp/include/hyp/fault.h index 9e13c1bc2ad5..17df94570f03 100644 --- a/arch/arm64/kvm/hyp/include/hyp/fault.h +++ b/arch/arm64/kvm/hyp/include/hyp/fault.h @@ -14,6 +14,7 @@ static inline bool __translate_far_to_hpfar(u64 far, u64 *hpfar) { + int ret; u64 par, tmp; /* @@ -27,7 +28,9 @@ static inline bool __translate_far_to_hpfar(u64 far, u64 *hpfar) * saved the guest context yet, and we may return early... */ par = read_sysreg_par(); - if (!__kvm_at("s1e1r", far)) + ret = system_supports_poe() ? __kvm_at(OP_AT_S1E1A, far) : + __kvm_at(OP_AT_S1E1R, far); + if (!ret) tmp = read_sysreg_par(); else tmp = SYS_PAR_EL1_F; /* back to the guest */ diff --git a/arch/arm64/kvm/hyp/include/hyp/switch.h b/arch/arm64/kvm/hyp/include/hyp/switch.h index 37ff87d782b6..46d52e8a3df3 100644 --- a/arch/arm64/kvm/hyp/include/hyp/switch.h +++ b/arch/arm64/kvm/hyp/include/hyp/switch.h @@ -403,6 +403,9 @@ static bool kvm_hyp_handle_fpsimd(struct kvm_vcpu *vcpu, u64 *exit_code) else __fpsimd_restore_state(&vcpu->arch.ctxt.fp_regs); + if (kvm_has_fpmr(kern_hyp_va(vcpu->kvm))) + write_sysreg_s(__vcpu_sys_reg(vcpu, FPMR), SYS_FPMR); + /* Skip restoring fpexc32 for AArch64 guests */ if (!(read_sysreg(hcr_el2) & HCR_RW)) write_sysreg(__vcpu_sys_reg(vcpu, FPEXC32_EL2), fpexc32_el2); diff --git a/arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h b/arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h index 4c0fdabaf8ae..1579a3c08a36 100644 --- a/arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h +++ b/arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h @@ -16,9 +16,15 @@ #include <asm/kvm_hyp.h> #include <asm/kvm_mmu.h> +static inline bool ctxt_has_s1poe(struct kvm_cpu_context *ctxt); + static inline void __sysreg_save_common_state(struct kvm_cpu_context *ctxt) { ctxt_sys_reg(ctxt, MDSCR_EL1) = read_sysreg(mdscr_el1); + + // POR_EL0 can affect uaccess, so must be saved/restored early. + if (ctxt_has_s1poe(ctxt)) + ctxt_sys_reg(ctxt, POR_EL0) = read_sysreg_s(SYS_POR_EL0); } static inline void __sysreg_save_user_state(struct kvm_cpu_context *ctxt) @@ -66,6 +72,17 @@ static inline bool ctxt_has_tcrx(struct kvm_cpu_context *ctxt) return kvm_has_feat(kern_hyp_va(vcpu->kvm), ID_AA64MMFR3_EL1, TCRX, IMP); } +static inline bool ctxt_has_s1poe(struct kvm_cpu_context *ctxt) +{ + struct kvm_vcpu *vcpu; + + if (!system_supports_poe()) + return false; + + vcpu = ctxt_to_vcpu(ctxt); + return kvm_has_feat(kern_hyp_va(vcpu->kvm), ID_AA64MMFR3_EL1, S1POE, IMP); +} + static inline void __sysreg_save_el1_state(struct kvm_cpu_context *ctxt) { ctxt_sys_reg(ctxt, SCTLR_EL1) = read_sysreg_el1(SYS_SCTLR); @@ -80,6 +97,9 @@ static inline void __sysreg_save_el1_state(struct kvm_cpu_context *ctxt) ctxt_sys_reg(ctxt, PIR_EL1) = read_sysreg_el1(SYS_PIR); ctxt_sys_reg(ctxt, PIRE0_EL1) = read_sysreg_el1(SYS_PIRE0); } + + if (ctxt_has_s1poe(ctxt)) + ctxt_sys_reg(ctxt, POR_EL1) = read_sysreg_el1(SYS_POR); } ctxt_sys_reg(ctxt, ESR_EL1) = read_sysreg_el1(SYS_ESR); ctxt_sys_reg(ctxt, AFSR0_EL1) = read_sysreg_el1(SYS_AFSR0); @@ -120,6 +140,10 @@ static inline void __sysreg_save_el2_return_state(struct kvm_cpu_context *ctxt) static inline void __sysreg_restore_common_state(struct kvm_cpu_context *ctxt) { write_sysreg(ctxt_sys_reg(ctxt, MDSCR_EL1), mdscr_el1); + + // POR_EL0 can affect uaccess, so must be saved/restored early. + if (ctxt_has_s1poe(ctxt)) + write_sysreg_s(ctxt_sys_reg(ctxt, POR_EL0), SYS_POR_EL0); } static inline void __sysreg_restore_user_state(struct kvm_cpu_context *ctxt) @@ -158,6 +182,9 @@ static inline void __sysreg_restore_el1_state(struct kvm_cpu_context *ctxt) write_sysreg_el1(ctxt_sys_reg(ctxt, PIR_EL1), SYS_PIR); write_sysreg_el1(ctxt_sys_reg(ctxt, PIRE0_EL1), SYS_PIRE0); } + + if (ctxt_has_s1poe(ctxt)) + write_sysreg_el1(ctxt_sys_reg(ctxt, POR_EL1), SYS_POR); } write_sysreg_el1(ctxt_sys_reg(ctxt, ESR_EL1), SYS_ESR); write_sysreg_el1(ctxt_sys_reg(ctxt, AFSR0_EL1), SYS_AFSR0); diff --git a/arch/arm64/kvm/hyp/nvhe/ffa.c b/arch/arm64/kvm/hyp/nvhe/ffa.c index e715c157c2c4..e433dfab882a 100644 --- a/arch/arm64/kvm/hyp/nvhe/ffa.c +++ b/arch/arm64/kvm/hyp/nvhe/ffa.c @@ -426,9 +426,9 @@ out: return; } -static __always_inline void do_ffa_mem_xfer(const u64 func_id, - struct arm_smccc_res *res, - struct kvm_cpu_context *ctxt) +static void __do_ffa_mem_xfer(const u64 func_id, + struct arm_smccc_res *res, + struct kvm_cpu_context *ctxt) { DECLARE_REG(u32, len, ctxt, 1); DECLARE_REG(u32, fraglen, ctxt, 2); @@ -440,9 +440,6 @@ static __always_inline void do_ffa_mem_xfer(const u64 func_id, u32 offset, nr_ranges; int ret = 0; - BUILD_BUG_ON(func_id != FFA_FN64_MEM_SHARE && - func_id != FFA_FN64_MEM_LEND); - if (addr_mbz || npages_mbz || fraglen > len || fraglen > KVM_FFA_MBOX_NR_PAGES * PAGE_SIZE) { ret = FFA_RET_INVALID_PARAMETERS; @@ -461,6 +458,11 @@ static __always_inline void do_ffa_mem_xfer(const u64 func_id, goto out_unlock; } + if (len > ffa_desc_buf.len) { + ret = FFA_RET_NO_MEMORY; + goto out_unlock; + } + buf = hyp_buffers.tx; memcpy(buf, host_buffers.tx, fraglen); @@ -512,6 +514,13 @@ err_unshare: goto out_unlock; } +#define do_ffa_mem_xfer(fid, res, ctxt) \ + do { \ + BUILD_BUG_ON((fid) != FFA_FN64_MEM_SHARE && \ + (fid) != FFA_FN64_MEM_LEND); \ + __do_ffa_mem_xfer((fid), (res), (ctxt)); \ + } while (0); + static void do_ffa_mem_reclaim(struct arm_smccc_res *res, struct kvm_cpu_context *ctxt) { diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-init.S b/arch/arm64/kvm/hyp/nvhe/hyp-init.S index 07120b37da35..401af1835be6 100644 --- a/arch/arm64/kvm/hyp/nvhe/hyp-init.S +++ b/arch/arm64/kvm/hyp/nvhe/hyp-init.S @@ -130,7 +130,7 @@ alternative_else_nop_endif /* Invalidate the stale TLBs from Bootloader */ tlbi alle2 - tlbi vmalls12e1 + tlbi alle1 dsb sy mov_q x0, INIT_SCTLR_EL2_MMU_ON diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-main.c b/arch/arm64/kvm/hyp/nvhe/hyp-main.c index f43d845f3c4e..87692b566d90 100644 --- a/arch/arm64/kvm/hyp/nvhe/hyp-main.c +++ b/arch/arm64/kvm/hyp/nvhe/hyp-main.c @@ -62,6 +62,8 @@ static void fpsimd_sve_flush(void) static void fpsimd_sve_sync(struct kvm_vcpu *vcpu) { + bool has_fpmr; + if (!guest_owns_fp_regs()) return; @@ -73,11 +75,18 @@ static void fpsimd_sve_sync(struct kvm_vcpu *vcpu) else __fpsimd_save_state(&vcpu->arch.ctxt.fp_regs); + has_fpmr = kvm_has_fpmr(kern_hyp_va(vcpu->kvm)); + if (has_fpmr) + __vcpu_sys_reg(vcpu, FPMR) = read_sysreg_s(SYS_FPMR); + if (system_supports_sve()) __hyp_sve_restore_host(); else __fpsimd_restore_state(*host_data_ptr(fpsimd_state)); + if (has_fpmr) + write_sysreg_s(*host_data_ptr(fpmr), SYS_FPMR); + *host_data_ptr(fp_owner) = FP_STATE_HOST_OWNED; } diff --git a/arch/arm64/kvm/hyp/nvhe/switch.c b/arch/arm64/kvm/hyp/nvhe/switch.c index 8f5c56d5b1cd..cc69106734ca 100644 --- a/arch/arm64/kvm/hyp/nvhe/switch.c +++ b/arch/arm64/kvm/hyp/nvhe/switch.c @@ -197,6 +197,15 @@ static void kvm_hyp_save_fpsimd_host(struct kvm_vcpu *vcpu) } else { __fpsimd_save_state(*host_data_ptr(fpsimd_state)); } + + if (kvm_has_fpmr(kern_hyp_va(vcpu->kvm))) { + u64 val = read_sysreg_s(SYS_FPMR); + + if (unlikely(is_protected_kvm_enabled())) + *host_data_ptr(fpmr) = val; + else + **host_data_ptr(fpmr_ptr) = val; + } } static const exit_handler_fn hyp_exit_handlers[] = { diff --git a/arch/arm64/kvm/hyp/nvhe/tlb.c b/arch/arm64/kvm/hyp/nvhe/tlb.c index ca3c09df8d7c..48da9ca9763f 100644 --- a/arch/arm64/kvm/hyp/nvhe/tlb.c +++ b/arch/arm64/kvm/hyp/nvhe/tlb.c @@ -132,10 +132,10 @@ static void exit_vmid_context(struct tlb_inv_context *cxt) else __load_host_stage2(); - if (cpus_have_final_cap(ARM64_WORKAROUND_SPECULATIVE_AT)) { - /* Ensure write of the old VMID */ - isb(); + /* Ensure write of the old VMID */ + isb(); + if (cpus_have_final_cap(ARM64_WORKAROUND_SPECULATIVE_AT)) { if (!(cxt->sctlr & SCTLR_ELx_M)) { write_sysreg_el1(cxt->sctlr, SYS_SCTLR); isb(); diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c index 9e2bbee77491..b11bcebac908 100644 --- a/arch/arm64/kvm/hyp/pgtable.c +++ b/arch/arm64/kvm/hyp/pgtable.c @@ -17,48 +17,6 @@ #define KVM_PTE_TYPE_PAGE 1 #define KVM_PTE_TYPE_TABLE 1 -#define KVM_PTE_LEAF_ATTR_LO GENMASK(11, 2) - -#define KVM_PTE_LEAF_ATTR_LO_S1_ATTRIDX GENMASK(4, 2) -#define KVM_PTE_LEAF_ATTR_LO_S1_AP GENMASK(7, 6) -#define KVM_PTE_LEAF_ATTR_LO_S1_AP_RO \ - ({ cpus_have_final_cap(ARM64_KVM_HVHE) ? 2 : 3; }) -#define KVM_PTE_LEAF_ATTR_LO_S1_AP_RW \ - ({ cpus_have_final_cap(ARM64_KVM_HVHE) ? 0 : 1; }) -#define KVM_PTE_LEAF_ATTR_LO_S1_SH GENMASK(9, 8) -#define KVM_PTE_LEAF_ATTR_LO_S1_SH_IS 3 -#define KVM_PTE_LEAF_ATTR_LO_S1_AF BIT(10) - -#define KVM_PTE_LEAF_ATTR_LO_S2_MEMATTR GENMASK(5, 2) -#define KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R BIT(6) -#define KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W BIT(7) -#define KVM_PTE_LEAF_ATTR_LO_S2_SH GENMASK(9, 8) -#define KVM_PTE_LEAF_ATTR_LO_S2_SH_IS 3 -#define KVM_PTE_LEAF_ATTR_LO_S2_AF BIT(10) - -#define KVM_PTE_LEAF_ATTR_HI GENMASK(63, 50) - -#define KVM_PTE_LEAF_ATTR_HI_SW GENMASK(58, 55) - -#define KVM_PTE_LEAF_ATTR_HI_S1_XN BIT(54) - -#define KVM_PTE_LEAF_ATTR_HI_S2_XN BIT(54) - -#define KVM_PTE_LEAF_ATTR_HI_S1_GP BIT(50) - -#define KVM_PTE_LEAF_ATTR_S2_PERMS (KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R | \ - KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W | \ - KVM_PTE_LEAF_ATTR_HI_S2_XN) - -#define KVM_INVALID_PTE_OWNER_MASK GENMASK(9, 2) -#define KVM_MAX_OWNER_ID 1 - -/* - * Used to indicate a pte for which a 'break-before-make' sequence is in - * progress. - */ -#define KVM_INVALID_PTE_LOCKED BIT(10) - struct kvm_pgtable_walk_data { struct kvm_pgtable_walker *walker; @@ -1547,7 +1505,6 @@ static int stage2_split_walker(const struct kvm_pgtable_visit_ctx *ctx, */ new = kvm_init_table_pte(childp, mm_ops); stage2_make_pte(ctx, new); - dsb(ishst); return 0; } @@ -1559,8 +1516,11 @@ int kvm_pgtable_stage2_split(struct kvm_pgtable *pgt, u64 addr, u64 size, .flags = KVM_PGTABLE_WALK_LEAF, .arg = mc, }; + int ret; - return kvm_pgtable_walk(pgt, addr, size, &walker); + ret = kvm_pgtable_walk(pgt, addr, size, &walker); + dsb(ishst); + return ret; } int __kvm_pgtable_stage2_init(struct kvm_pgtable *pgt, struct kvm_s2_mmu *mmu, diff --git a/arch/arm64/kvm/hyp/vgic-v3-sr.c b/arch/arm64/kvm/hyp/vgic-v3-sr.c index 7b397fad26f2..18d4677002b1 100644 --- a/arch/arm64/kvm/hyp/vgic-v3-sr.c +++ b/arch/arm64/kvm/hyp/vgic-v3-sr.c @@ -268,8 +268,16 @@ void __vgic_v3_activate_traps(struct vgic_v3_cpu_if *cpu_if) * starting to mess with the rest of the GIC, and VMCR_EL2 in * particular. This logic must be called before * __vgic_v3_restore_state(). + * + * However, if the vgic is disabled (ICH_HCR_EL2.EN==0), no GIC is + * provisioned at all. In order to prevent illegal accesses to the + * system registers to trap to EL1 (duh), force ICC_SRE_EL1.SRE to 1 + * so that the trap bits can take effect. Yes, we *loves* the GIC. */ - if (!cpu_if->vgic_sre) { + if (!(cpu_if->vgic_hcr & ICH_HCR_EN)) { + write_gicreg(ICC_SRE_EL1_SRE, ICC_SRE_EL1); + isb(); + } else if (!cpu_if->vgic_sre) { write_gicreg(0, ICC_SRE_EL1); isb(); write_gicreg(cpu_if->vgic_vmcr, ICH_VMCR_EL2); @@ -288,8 +296,9 @@ void __vgic_v3_activate_traps(struct vgic_v3_cpu_if *cpu_if) } /* - * Prevent the guest from touching the GIC system registers if - * SRE isn't enabled for GICv3 emulation. + * Prevent the guest from touching the ICC_SRE_EL1 system + * register. Note that this may not have any effect, as + * ICC_SRE_EL2.Enable being RAO/WI is a valid implementation. */ write_gicreg(read_gicreg(ICC_SRE_EL2) & ~ICC_SRE_EL2_ENABLE, ICC_SRE_EL2); @@ -297,10 +306,11 @@ void __vgic_v3_activate_traps(struct vgic_v3_cpu_if *cpu_if) /* * If we need to trap system registers, we must write * ICH_HCR_EL2 anyway, even if no interrupts are being - * injected, + * injected. Note that this also applies if we don't expect + * any system register access (no vgic at all). */ if (static_branch_unlikely(&vgic_v3_cpuif_trap) || - cpu_if->its_vpe.its_vm) + cpu_if->its_vpe.its_vm || !cpu_if->vgic_sre) write_gicreg(cpu_if->vgic_hcr, ICH_HCR_EL2); } @@ -326,7 +336,7 @@ void __vgic_v3_deactivate_traps(struct vgic_v3_cpu_if *cpu_if) * no interrupts were being injected, and we disable it again here. */ if (static_branch_unlikely(&vgic_v3_cpuif_trap) || - cpu_if->its_vpe.its_vm) + cpu_if->its_vpe.its_vm || !cpu_if->vgic_sre) write_gicreg(0, ICH_HCR_EL2); } @@ -1032,6 +1042,75 @@ static void __vgic_v3_write_ctlr(struct kvm_vcpu *vcpu, u32 vmcr, int rt) write_gicreg(vmcr, ICH_VMCR_EL2); } +static bool __vgic_v3_check_trap_forwarding(struct kvm_vcpu *vcpu, + u32 sysreg, bool is_read) +{ + u64 ich_hcr; + + if (!vcpu_has_nv(vcpu) || is_hyp_ctxt(vcpu)) + return false; + + ich_hcr = __vcpu_sys_reg(vcpu, ICH_HCR_EL2); + + switch (sysreg) { + case SYS_ICC_IGRPEN0_EL1: + if (is_read && + (__vcpu_sys_reg(vcpu, HFGRTR_EL2) & HFGxTR_EL2_ICC_IGRPENn_EL1)) + return true; + + if (!is_read && + (__vcpu_sys_reg(vcpu, HFGWTR_EL2) & HFGxTR_EL2_ICC_IGRPENn_EL1)) + return true; + + fallthrough; + + case SYS_ICC_AP0Rn_EL1(0): + case SYS_ICC_AP0Rn_EL1(1): + case SYS_ICC_AP0Rn_EL1(2): + case SYS_ICC_AP0Rn_EL1(3): + case SYS_ICC_BPR0_EL1: + case SYS_ICC_EOIR0_EL1: + case SYS_ICC_HPPIR0_EL1: + case SYS_ICC_IAR0_EL1: + return ich_hcr & ICH_HCR_TALL0; + + case SYS_ICC_IGRPEN1_EL1: + if (is_read && + (__vcpu_sys_reg(vcpu, HFGRTR_EL2) & HFGxTR_EL2_ICC_IGRPENn_EL1)) + return true; + + if (!is_read && + (__vcpu_sys_reg(vcpu, HFGWTR_EL2) & HFGxTR_EL2_ICC_IGRPENn_EL1)) + return true; + + fallthrough; + + case SYS_ICC_AP1Rn_EL1(0): + case SYS_ICC_AP1Rn_EL1(1): + case SYS_ICC_AP1Rn_EL1(2): + case SYS_ICC_AP1Rn_EL1(3): + case SYS_ICC_BPR1_EL1: + case SYS_ICC_EOIR1_EL1: + case SYS_ICC_HPPIR1_EL1: + case SYS_ICC_IAR1_EL1: + return ich_hcr & ICH_HCR_TALL1; + + case SYS_ICC_DIR_EL1: + if (ich_hcr & ICH_HCR_TDIR) + return true; + + fallthrough; + + case SYS_ICC_RPR_EL1: + case SYS_ICC_CTLR_EL1: + case SYS_ICC_PMR_EL1: + return ich_hcr & ICH_HCR_TC; + + default: + return false; + } +} + int __vgic_v3_perform_cpuif_access(struct kvm_vcpu *vcpu) { int rt; @@ -1041,6 +1120,9 @@ int __vgic_v3_perform_cpuif_access(struct kvm_vcpu *vcpu) bool is_read; u32 sysreg; + if (kern_hyp_va(vcpu->kvm)->arch.vgic.vgic_model != KVM_DEV_TYPE_ARM_VGIC_V3) + return 0; + esr = kvm_vcpu_get_esr(vcpu); if (vcpu_mode_is_32bit(vcpu)) { if (!kvm_condition_valid(vcpu)) { @@ -1055,6 +1137,9 @@ int __vgic_v3_perform_cpuif_access(struct kvm_vcpu *vcpu) is_read = (esr & ESR_ELx_SYS64_ISS_DIR_MASK) == ESR_ELx_SYS64_ISS_DIR_READ; + if (__vgic_v3_check_trap_forwarding(vcpu, sysreg, is_read)) + return 0; + switch (sysreg) { case SYS_ICC_IAR0_EL1: case SYS_ICC_IAR1_EL1: diff --git a/arch/arm64/kvm/hyp/vhe/switch.c b/arch/arm64/kvm/hyp/vhe/switch.c index 77010b76c150..80581b1c3995 100644 --- a/arch/arm64/kvm/hyp/vhe/switch.c +++ b/arch/arm64/kvm/hyp/vhe/switch.c @@ -312,6 +312,9 @@ static bool kvm_hyp_handle_eret(struct kvm_vcpu *vcpu, u64 *exit_code) static void kvm_hyp_save_fpsimd_host(struct kvm_vcpu *vcpu) { __fpsimd_save_state(*host_data_ptr(fpsimd_state)); + + if (kvm_has_fpmr(vcpu->kvm)) + **host_data_ptr(fpmr_ptr) = read_sysreg_s(SYS_FPMR); } static bool kvm_hyp_handle_tlbi_el2(struct kvm_vcpu *vcpu, u64 *exit_code) diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index 6981b1bc0946..a509b63bd4dd 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -1540,8 +1540,15 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, vma_pagesize = min(vma_pagesize, (long)max_map_size); } - if (vma_pagesize == PMD_SIZE || vma_pagesize == PUD_SIZE) + /* + * Both the canonical IPA and fault IPA must be hugepage-aligned to + * ensure we find the right PFN and lay down the mapping in the right + * place. + */ + if (vma_pagesize == PMD_SIZE || vma_pagesize == PUD_SIZE) { fault_ipa &= ~(vma_pagesize - 1); + ipa &= ~(vma_pagesize - 1); + } gfn = ipa >> PAGE_SHIFT; mte_allowed = kvm_vma_mte_allowed(vma); diff --git a/arch/arm64/kvm/nested.c b/arch/arm64/kvm/nested.c index bab27f9d8cc6..638b6205526f 100644 --- a/arch/arm64/kvm/nested.c +++ b/arch/arm64/kvm/nested.c @@ -103,20 +103,6 @@ struct s2_walk_info { bool be; }; -static unsigned int ps_to_output_size(unsigned int ps) -{ - switch (ps) { - case 0: return 32; - case 1: return 36; - case 2: return 40; - case 3: return 42; - case 4: return 44; - case 5: - default: - return 48; - } -} - static u32 compute_fsc(int level, u32 fsc) { return fsc | (level & 0x3); @@ -256,7 +242,7 @@ static int walk_nested_s2_pgd(phys_addr_t ipa, /* Check for valid descriptor at this point */ if (!(desc & 1) || ((desc & 3) == 1 && level == 3)) { out->esr = compute_fsc(level, ESR_ELx_FSC_FAULT); - out->upper_attr = desc; + out->desc = desc; return 1; } @@ -266,7 +252,7 @@ static int walk_nested_s2_pgd(phys_addr_t ipa, if (check_output_size(wi, desc)) { out->esr = compute_fsc(level, ESR_ELx_FSC_ADDRSZ); - out->upper_attr = desc; + out->desc = desc; return 1; } @@ -278,27 +264,24 @@ static int walk_nested_s2_pgd(phys_addr_t ipa, if (level < first_block_level) { out->esr = compute_fsc(level, ESR_ELx_FSC_FAULT); - out->upper_attr = desc; + out->desc = desc; return 1; } - /* - * We don't use the contiguous bit in the stage-2 ptes, so skip check - * for misprogramming of the contiguous bit. - */ - if (check_output_size(wi, desc)) { out->esr = compute_fsc(level, ESR_ELx_FSC_ADDRSZ); - out->upper_attr = desc; + out->desc = desc; return 1; } if (!(desc & BIT(10))) { out->esr = compute_fsc(level, ESR_ELx_FSC_ACCESS); - out->upper_attr = desc; + out->desc = desc; return 1; } + addr_bottom += contiguous_bit_shift(desc, wi, level); + /* Calculate and return the result */ paddr = (desc & GENMASK_ULL(47, addr_bottom)) | (ipa & GENMASK_ULL(addr_bottom - 1, 0)); @@ -307,7 +290,7 @@ static int walk_nested_s2_pgd(phys_addr_t ipa, out->readable = desc & (0b01 << 6); out->writable = desc & (0b10 << 6); out->level = level; - out->upper_attr = desc & GENMASK_ULL(63, 52); + out->desc = desc; return 0; } @@ -954,19 +937,16 @@ static void set_sysreg_masks(struct kvm *kvm, int sr, u64 res0, u64 res1) int kvm_init_nv_sysregs(struct kvm *kvm) { u64 res0, res1; - int ret = 0; - mutex_lock(&kvm->arch.config_lock); + lockdep_assert_held(&kvm->arch.config_lock); if (kvm->arch.sysreg_masks) - goto out; + return 0; kvm->arch.sysreg_masks = kzalloc(sizeof(*(kvm->arch.sysreg_masks)), GFP_KERNEL_ACCOUNT); - if (!kvm->arch.sysreg_masks) { - ret = -ENOMEM; - goto out; - } + if (!kvm->arch.sysreg_masks) + return -ENOMEM; limit_nv_id_regs(kvm); @@ -1195,8 +1175,13 @@ int kvm_init_nv_sysregs(struct kvm *kvm) if (!kvm_has_feat(kvm, ID_AA64PFR0_EL1, AMU, V1P1)) res0 |= ~(res0 | res1); set_sysreg_masks(kvm, HAFGRTR_EL2, res0, res1); -out: - mutex_unlock(&kvm->arch.config_lock); - return ret; + /* SCTLR_EL1 */ + res0 = SCTLR_EL1_RES0; + res1 = SCTLR_EL1_RES1; + if (!kvm_has_feat(kvm, ID_AA64MMFR1_EL1, PAN, PAN3)) + res0 |= SCTLR_EL1_EPAN; + set_sysreg_masks(kvm, SCTLR_EL1, res0, res1); + + return 0; } diff --git a/arch/arm64/kvm/pmu-emul.c b/arch/arm64/kvm/pmu-emul.c index 82a2a003259c..ac36c438b8c1 100644 --- a/arch/arm64/kvm/pmu-emul.c +++ b/arch/arm64/kvm/pmu-emul.c @@ -233,7 +233,7 @@ void kvm_pmu_vcpu_init(struct kvm_vcpu *vcpu) int i; struct kvm_pmu *pmu = &vcpu->arch.pmu; - for (i = 0; i < ARMV8_PMU_MAX_COUNTERS; i++) + for (i = 0; i < KVM_ARMV8_PMU_MAX_COUNTERS; i++) pmu->pmc[i].idx = i; } @@ -260,7 +260,7 @@ void kvm_pmu_vcpu_destroy(struct kvm_vcpu *vcpu) { int i; - for (i = 0; i < ARMV8_PMU_MAX_COUNTERS; i++) + for (i = 0; i < KVM_ARMV8_PMU_MAX_COUNTERS; i++) kvm_pmu_release_perf_event(kvm_vcpu_idx_to_pmc(vcpu, i)); irq_work_sync(&vcpu->arch.pmu.overflow_work); } @@ -291,7 +291,7 @@ void kvm_pmu_enable_counter_mask(struct kvm_vcpu *vcpu, u64 val) if (!(kvm_vcpu_read_pmcr(vcpu) & ARMV8_PMU_PMCR_E) || !val) return; - for (i = 0; i < ARMV8_PMU_MAX_COUNTERS; i++) { + for (i = 0; i < KVM_ARMV8_PMU_MAX_COUNTERS; i++) { struct kvm_pmc *pmc; if (!(val & BIT(i))) @@ -323,7 +323,7 @@ void kvm_pmu_disable_counter_mask(struct kvm_vcpu *vcpu, u64 val) if (!kvm_vcpu_has_pmu(vcpu) || !val) return; - for (i = 0; i < ARMV8_PMU_MAX_COUNTERS; i++) { + for (i = 0; i < KVM_ARMV8_PMU_MAX_COUNTERS; i++) { struct kvm_pmc *pmc; if (!(val & BIT(i))) @@ -910,10 +910,10 @@ u8 kvm_arm_pmu_get_max_counters(struct kvm *kvm) struct arm_pmu *arm_pmu = kvm->arch.arm_pmu; /* - * The arm_pmu->num_events considers the cycle counter as well. - * Ignore that and return only the general-purpose counters. + * The arm_pmu->cntr_mask considers the fixed counter(s) as well. + * Ignore those and return only the general-purpose counters. */ - return arm_pmu->num_events - 1; + return bitmap_weight(arm_pmu->cntr_mask, ARMV8_PMU_MAX_GENERAL_COUNTERS); } static void kvm_arm_set_pmu(struct kvm *kvm, struct arm_pmu *arm_pmu) diff --git a/arch/arm64/kvm/pmu.c b/arch/arm64/kvm/pmu.c index 329819806096..0b3adf3e17b4 100644 --- a/arch/arm64/kvm/pmu.c +++ b/arch/arm64/kvm/pmu.c @@ -5,6 +5,8 @@ */ #include <linux/kvm_host.h> #include <linux/perf_event.h> +#include <linux/perf/arm_pmu.h> +#include <linux/perf/arm_pmuv3.h> static DEFINE_PER_CPU(struct kvm_pmu_events, kvm_pmu_events); @@ -35,7 +37,7 @@ struct kvm_pmu_events *kvm_get_pmu_events(void) * Add events to track that we may want to switch at guest entry/exit * time. */ -void kvm_set_pmu_events(u32 set, struct perf_event_attr *attr) +void kvm_set_pmu_events(u64 set, struct perf_event_attr *attr) { struct kvm_pmu_events *pmu = kvm_get_pmu_events(); @@ -51,7 +53,7 @@ void kvm_set_pmu_events(u32 set, struct perf_event_attr *attr) /* * Stop tracking events */ -void kvm_clr_pmu_events(u32 clr) +void kvm_clr_pmu_events(u64 clr) { struct kvm_pmu_events *pmu = kvm_get_pmu_events(); @@ -62,79 +64,32 @@ void kvm_clr_pmu_events(u32 clr) pmu->events_guest &= ~clr; } -#define PMEVTYPER_READ_CASE(idx) \ - case idx: \ - return read_sysreg(pmevtyper##idx##_el0) - -#define PMEVTYPER_WRITE_CASE(idx) \ - case idx: \ - write_sysreg(val, pmevtyper##idx##_el0); \ - break - -#define PMEVTYPER_CASES(readwrite) \ - PMEVTYPER_##readwrite##_CASE(0); \ - PMEVTYPER_##readwrite##_CASE(1); \ - PMEVTYPER_##readwrite##_CASE(2); \ - PMEVTYPER_##readwrite##_CASE(3); \ - PMEVTYPER_##readwrite##_CASE(4); \ - PMEVTYPER_##readwrite##_CASE(5); \ - PMEVTYPER_##readwrite##_CASE(6); \ - PMEVTYPER_##readwrite##_CASE(7); \ - PMEVTYPER_##readwrite##_CASE(8); \ - PMEVTYPER_##readwrite##_CASE(9); \ - PMEVTYPER_##readwrite##_CASE(10); \ - PMEVTYPER_##readwrite##_CASE(11); \ - PMEVTYPER_##readwrite##_CASE(12); \ - PMEVTYPER_##readwrite##_CASE(13); \ - PMEVTYPER_##readwrite##_CASE(14); \ - PMEVTYPER_##readwrite##_CASE(15); \ - PMEVTYPER_##readwrite##_CASE(16); \ - PMEVTYPER_##readwrite##_CASE(17); \ - PMEVTYPER_##readwrite##_CASE(18); \ - PMEVTYPER_##readwrite##_CASE(19); \ - PMEVTYPER_##readwrite##_CASE(20); \ - PMEVTYPER_##readwrite##_CASE(21); \ - PMEVTYPER_##readwrite##_CASE(22); \ - PMEVTYPER_##readwrite##_CASE(23); \ - PMEVTYPER_##readwrite##_CASE(24); \ - PMEVTYPER_##readwrite##_CASE(25); \ - PMEVTYPER_##readwrite##_CASE(26); \ - PMEVTYPER_##readwrite##_CASE(27); \ - PMEVTYPER_##readwrite##_CASE(28); \ - PMEVTYPER_##readwrite##_CASE(29); \ - PMEVTYPER_##readwrite##_CASE(30) - /* * Read a value direct from PMEVTYPER<idx> where idx is 0-30 - * or PMCCFILTR_EL0 where idx is ARMV8_PMU_CYCLE_IDX (31). + * or PMxCFILTR_EL0 where idx is 31-32. */ static u64 kvm_vcpu_pmu_read_evtype_direct(int idx) { - switch (idx) { - PMEVTYPER_CASES(READ); - case ARMV8_PMU_CYCLE_IDX: - return read_sysreg(pmccfiltr_el0); - default: - WARN_ON(1); - } + if (idx == ARMV8_PMU_CYCLE_IDX) + return read_pmccfiltr(); + else if (idx == ARMV8_PMU_INSTR_IDX) + return read_pmicfiltr(); - return 0; + return read_pmevtypern(idx); } /* * Write a value direct to PMEVTYPER<idx> where idx is 0-30 - * or PMCCFILTR_EL0 where idx is ARMV8_PMU_CYCLE_IDX (31). + * or PMxCFILTR_EL0 where idx is 31-32. */ static void kvm_vcpu_pmu_write_evtype_direct(int idx, u32 val) { - switch (idx) { - PMEVTYPER_CASES(WRITE); - case ARMV8_PMU_CYCLE_IDX: - write_sysreg(val, pmccfiltr_el0); - break; - default: - WARN_ON(1); - } + if (idx == ARMV8_PMU_CYCLE_IDX) + write_pmccfiltr(val); + else if (idx == ARMV8_PMU_INSTR_IDX) + write_pmicfiltr(val); + else + write_pmevtypern(idx, val); } /* @@ -145,7 +100,7 @@ static void kvm_vcpu_pmu_enable_el0(unsigned long events) u64 typer; u32 counter; - for_each_set_bit(counter, &events, 32) { + for_each_set_bit(counter, &events, ARMPMU_MAX_HWEVENTS) { typer = kvm_vcpu_pmu_read_evtype_direct(counter); typer &= ~ARMV8_PMU_EXCLUDE_EL0; kvm_vcpu_pmu_write_evtype_direct(counter, typer); @@ -160,7 +115,7 @@ static void kvm_vcpu_pmu_disable_el0(unsigned long events) u64 typer; u32 counter; - for_each_set_bit(counter, &events, 32) { + for_each_set_bit(counter, &events, ARMPMU_MAX_HWEVENTS) { typer = kvm_vcpu_pmu_read_evtype_direct(counter); typer |= ARMV8_PMU_EXCLUDE_EL0; kvm_vcpu_pmu_write_evtype_direct(counter, typer); @@ -176,7 +131,7 @@ static void kvm_vcpu_pmu_disable_el0(unsigned long events) void kvm_vcpu_pmu_restore_guest(struct kvm_vcpu *vcpu) { struct kvm_pmu_events *pmu; - u32 events_guest, events_host; + u64 events_guest, events_host; if (!kvm_arm_support_pmu_v3() || !has_vhe()) return; @@ -197,7 +152,7 @@ void kvm_vcpu_pmu_restore_guest(struct kvm_vcpu *vcpu) void kvm_vcpu_pmu_restore_host(struct kvm_vcpu *vcpu) { struct kvm_pmu_events *pmu; - u32 events_guest, events_host; + u64 events_guest, events_host; if (!kvm_arm_support_pmu_v3() || !has_vhe()) return; diff --git a/arch/arm64/kvm/ptdump.c b/arch/arm64/kvm/ptdump.c new file mode 100644 index 000000000000..e4a342e903e2 --- /dev/null +++ b/arch/arm64/kvm/ptdump.c @@ -0,0 +1,268 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Debug helper used to dump the stage-2 pagetables of the system and their + * associated permissions. + * + * Copyright (C) Google, 2024 + * Author: Sebastian Ene <sebastianene@google.com> + */ +#include <linux/debugfs.h> +#include <linux/kvm_host.h> +#include <linux/seq_file.h> + +#include <asm/kvm_mmu.h> +#include <asm/kvm_pgtable.h> +#include <asm/ptdump.h> + +#define MARKERS_LEN 2 +#define KVM_PGTABLE_MAX_LEVELS (KVM_PGTABLE_LAST_LEVEL + 1) + +struct kvm_ptdump_guest_state { + struct kvm *kvm; + struct ptdump_pg_state parser_state; + struct addr_marker ipa_marker[MARKERS_LEN]; + struct ptdump_pg_level level[KVM_PGTABLE_MAX_LEVELS]; + struct ptdump_range range[MARKERS_LEN]; +}; + +static const struct ptdump_prot_bits stage2_pte_bits[] = { + { + .mask = PTE_VALID, + .val = PTE_VALID, + .set = " ", + .clear = "F", + }, { + .mask = KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R | PTE_VALID, + .val = KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R | PTE_VALID, + .set = "R", + .clear = " ", + }, { + .mask = KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W | PTE_VALID, + .val = KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W | PTE_VALID, + .set = "W", + .clear = " ", + }, { + .mask = KVM_PTE_LEAF_ATTR_HI_S2_XN | PTE_VALID, + .val = PTE_VALID, + .set = " ", + .clear = "X", + }, { + .mask = KVM_PTE_LEAF_ATTR_LO_S2_AF | PTE_VALID, + .val = KVM_PTE_LEAF_ATTR_LO_S2_AF | PTE_VALID, + .set = "AF", + .clear = " ", + }, { + .mask = PTE_TABLE_BIT | PTE_VALID, + .val = PTE_VALID, + .set = "BLK", + .clear = " ", + }, +}; + +static int kvm_ptdump_visitor(const struct kvm_pgtable_visit_ctx *ctx, + enum kvm_pgtable_walk_flags visit) +{ + struct ptdump_pg_state *st = ctx->arg; + struct ptdump_state *pt_st = &st->ptdump; + + note_page(pt_st, ctx->addr, ctx->level, ctx->old); + + return 0; +} + +static int kvm_ptdump_build_levels(struct ptdump_pg_level *level, u32 start_lvl) +{ + u32 i; + u64 mask; + + if (WARN_ON_ONCE(start_lvl >= KVM_PGTABLE_LAST_LEVEL)) + return -EINVAL; + + mask = 0; + for (i = 0; i < ARRAY_SIZE(stage2_pte_bits); i++) + mask |= stage2_pte_bits[i].mask; + + for (i = start_lvl; i < KVM_PGTABLE_MAX_LEVELS; i++) { + snprintf(level[i].name, sizeof(level[i].name), "%u", i); + + level[i].num = ARRAY_SIZE(stage2_pte_bits); + level[i].bits = stage2_pte_bits; + level[i].mask = mask; + } + + return 0; +} + +static struct kvm_ptdump_guest_state *kvm_ptdump_parser_create(struct kvm *kvm) +{ + struct kvm_ptdump_guest_state *st; + struct kvm_s2_mmu *mmu = &kvm->arch.mmu; + struct kvm_pgtable *pgtable = mmu->pgt; + int ret; + + st = kzalloc(sizeof(struct kvm_ptdump_guest_state), GFP_KERNEL_ACCOUNT); + if (!st) + return ERR_PTR(-ENOMEM); + + ret = kvm_ptdump_build_levels(&st->level[0], pgtable->start_level); + if (ret) { + kfree(st); + return ERR_PTR(ret); + } + + st->ipa_marker[0].name = "Guest IPA"; + st->ipa_marker[1].start_address = BIT(pgtable->ia_bits); + st->range[0].end = BIT(pgtable->ia_bits); + + st->kvm = kvm; + st->parser_state = (struct ptdump_pg_state) { + .marker = &st->ipa_marker[0], + .level = -1, + .pg_level = &st->level[0], + .ptdump.range = &st->range[0], + .start_address = 0, + }; + + return st; +} + +static int kvm_ptdump_guest_show(struct seq_file *m, void *unused) +{ + int ret; + struct kvm_ptdump_guest_state *st = m->private; + struct kvm *kvm = st->kvm; + struct kvm_s2_mmu *mmu = &kvm->arch.mmu; + struct ptdump_pg_state *parser_state = &st->parser_state; + struct kvm_pgtable_walker walker = (struct kvm_pgtable_walker) { + .cb = kvm_ptdump_visitor, + .arg = parser_state, + .flags = KVM_PGTABLE_WALK_LEAF, + }; + + parser_state->seq = m; + + write_lock(&kvm->mmu_lock); + ret = kvm_pgtable_walk(mmu->pgt, 0, BIT(mmu->pgt->ia_bits), &walker); + write_unlock(&kvm->mmu_lock); + + return ret; +} + +static int kvm_ptdump_guest_open(struct inode *m, struct file *file) +{ + struct kvm *kvm = m->i_private; + struct kvm_ptdump_guest_state *st; + int ret; + + if (!kvm_get_kvm_safe(kvm)) + return -ENOENT; + + st = kvm_ptdump_parser_create(kvm); + if (IS_ERR(st)) { + ret = PTR_ERR(st); + goto err_with_kvm_ref; + } + + ret = single_open(file, kvm_ptdump_guest_show, st); + if (!ret) + return 0; + + kfree(st); +err_with_kvm_ref: + kvm_put_kvm(kvm); + return ret; +} + +static int kvm_ptdump_guest_close(struct inode *m, struct file *file) +{ + struct kvm *kvm = m->i_private; + void *st = ((struct seq_file *)file->private_data)->private; + + kfree(st); + kvm_put_kvm(kvm); + + return single_release(m, file); +} + +static const struct file_operations kvm_ptdump_guest_fops = { + .open = kvm_ptdump_guest_open, + .read = seq_read, + .llseek = seq_lseek, + .release = kvm_ptdump_guest_close, +}; + +static int kvm_pgtable_range_show(struct seq_file *m, void *unused) +{ + struct kvm_pgtable *pgtable = m->private; + + seq_printf(m, "%2u\n", pgtable->ia_bits); + return 0; +} + +static int kvm_pgtable_levels_show(struct seq_file *m, void *unused) +{ + struct kvm_pgtable *pgtable = m->private; + + seq_printf(m, "%1d\n", KVM_PGTABLE_MAX_LEVELS - pgtable->start_level); + return 0; +} + +static int kvm_pgtable_debugfs_open(struct inode *m, struct file *file, + int (*show)(struct seq_file *, void *)) +{ + struct kvm *kvm = m->i_private; + struct kvm_pgtable *pgtable; + int ret; + + if (!kvm_get_kvm_safe(kvm)) + return -ENOENT; + + pgtable = kvm->arch.mmu.pgt; + + ret = single_open(file, show, pgtable); + if (ret < 0) + kvm_put_kvm(kvm); + return ret; +} + +static int kvm_pgtable_range_open(struct inode *m, struct file *file) +{ + return kvm_pgtable_debugfs_open(m, file, kvm_pgtable_range_show); +} + +static int kvm_pgtable_levels_open(struct inode *m, struct file *file) +{ + return kvm_pgtable_debugfs_open(m, file, kvm_pgtable_levels_show); +} + +static int kvm_pgtable_debugfs_close(struct inode *m, struct file *file) +{ + struct kvm *kvm = m->i_private; + + kvm_put_kvm(kvm); + return single_release(m, file); +} + +static const struct file_operations kvm_pgtable_range_fops = { + .open = kvm_pgtable_range_open, + .read = seq_read, + .llseek = seq_lseek, + .release = kvm_pgtable_debugfs_close, +}; + +static const struct file_operations kvm_pgtable_levels_fops = { + .open = kvm_pgtable_levels_open, + .read = seq_read, + .llseek = seq_lseek, + .release = kvm_pgtable_debugfs_close, +}; + +void kvm_s2_ptdump_create_debugfs(struct kvm *kvm) +{ + debugfs_create_file("stage2_page_tables", 0400, kvm->debugfs_dentry, + kvm, &kvm_ptdump_guest_fops); + debugfs_create_file("ipa_range", 0400, kvm->debugfs_dentry, kvm, + &kvm_pgtable_range_fops); + debugfs_create_file("stage2_levels", 0400, kvm->debugfs_dentry, + kvm, &kvm_pgtable_levels_fops); +} diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index c90324060436..dad88e31f953 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -18,6 +18,7 @@ #include <linux/printk.h> #include <linux/uaccess.h> +#include <asm/arm_pmuv3.h> #include <asm/cacheflush.h> #include <asm/cputype.h> #include <asm/debug-monitors.h> @@ -33,6 +34,7 @@ #include <trace/events/kvm.h> #include "sys_regs.h" +#include "vgic/vgic.h" #include "trace.h" @@ -46,6 +48,13 @@ static u64 sys_reg_to_index(const struct sys_reg_desc *reg); static int set_id_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, u64 val); +static bool undef_access(struct kvm_vcpu *vcpu, struct sys_reg_params *p, + const struct sys_reg_desc *r) +{ + kvm_inject_undefined(vcpu); + return false; +} + static bool bad_trap(struct kvm_vcpu *vcpu, struct sys_reg_params *params, const struct sys_reg_desc *r, @@ -53,8 +62,7 @@ static bool bad_trap(struct kvm_vcpu *vcpu, { WARN_ONCE(1, "Unexpected %s\n", msg); print_sys_reg_instr(params); - kvm_inject_undefined(vcpu); - return false; + return undef_access(vcpu, params, r); } static bool read_from_write_only(struct kvm_vcpu *vcpu, @@ -345,10 +353,8 @@ static bool access_dcgsw(struct kvm_vcpu *vcpu, struct sys_reg_params *p, const struct sys_reg_desc *r) { - if (!kvm_has_mte(vcpu->kvm)) { - kvm_inject_undefined(vcpu); - return false; - } + if (!kvm_has_mte(vcpu->kvm)) + return undef_access(vcpu, p, r); /* Treat MTE S/W ops as we treat the classic ones: with contempt */ return access_dcsw(vcpu, p, r); @@ -385,10 +391,8 @@ static bool access_vm_reg(struct kvm_vcpu *vcpu, u64 val, mask, shift; if (reg_to_encoding(r) == SYS_TCR2_EL1 && - !kvm_has_feat(vcpu->kvm, ID_AA64MMFR3_EL1, TCRX, IMP)) { - kvm_inject_undefined(vcpu); - return false; - } + !kvm_has_feat(vcpu->kvm, ID_AA64MMFR3_EL1, TCRX, IMP)) + return undef_access(vcpu, p, r); BUG_ON(!p->is_write); @@ -435,6 +439,9 @@ static bool access_gic_sgi(struct kvm_vcpu *vcpu, { bool g1; + if (!kvm_has_gicv3(vcpu->kvm)) + return undef_access(vcpu, p, r); + if (!p->is_write) return read_from_write_only(vcpu, p, r); @@ -478,6 +485,9 @@ static bool access_gic_sre(struct kvm_vcpu *vcpu, struct sys_reg_params *p, const struct sys_reg_desc *r) { + if (!kvm_has_gicv3(vcpu->kvm)) + return undef_access(vcpu, p, r); + if (p->is_write) return ignore_write(vcpu, p); @@ -495,14 +505,6 @@ static bool trap_raz_wi(struct kvm_vcpu *vcpu, return read_zero(vcpu, p); } -static bool trap_undef(struct kvm_vcpu *vcpu, - struct sys_reg_params *p, - const struct sys_reg_desc *r) -{ - kvm_inject_undefined(vcpu); - return false; -} - /* * ARMv8.1 mandates at least a trivial LORegion implementation, where all the * RW registers are RES0 (which we can implement as RAZ/WI). On an ARMv8.0 @@ -515,10 +517,8 @@ static bool trap_loregion(struct kvm_vcpu *vcpu, { u32 sr = reg_to_encoding(r); - if (!kvm_has_feat(vcpu->kvm, ID_AA64MMFR1_EL1, LO, IMP)) { - kvm_inject_undefined(vcpu); - return false; - } + if (!kvm_has_feat(vcpu->kvm, ID_AA64MMFR1_EL1, LO, IMP)) + return undef_access(vcpu, p, r); if (p->is_write && sr == SYS_LORID_EL1) return write_to_read_only(vcpu, p, r); @@ -887,7 +887,7 @@ static u64 reset_pmevtyper(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) static u64 reset_pmselr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) { reset_unknown(vcpu, r); - __vcpu_sys_reg(vcpu, r->reg) &= ARMV8_PMU_COUNTER_MASK; + __vcpu_sys_reg(vcpu, r->reg) &= PMSELR_EL0_SEL_MASK; return __vcpu_sys_reg(vcpu, r->reg); } @@ -979,7 +979,7 @@ static bool access_pmselr(struct kvm_vcpu *vcpu, struct sys_reg_params *p, else /* return PMSELR.SEL field */ p->regval = __vcpu_sys_reg(vcpu, PMSELR_EL0) - & ARMV8_PMU_COUNTER_MASK; + & PMSELR_EL0_SEL_MASK; return true; } @@ -1047,8 +1047,8 @@ static bool access_pmu_evcntr(struct kvm_vcpu *vcpu, if (pmu_access_event_counter_el0_disabled(vcpu)) return false; - idx = __vcpu_sys_reg(vcpu, PMSELR_EL0) - & ARMV8_PMU_COUNTER_MASK; + idx = SYS_FIELD_GET(PMSELR_EL0, SEL, + __vcpu_sys_reg(vcpu, PMSELR_EL0)); } else if (r->Op2 == 0) { /* PMCCNTR_EL0 */ if (pmu_access_cycle_counter_el0_disabled(vcpu)) @@ -1098,7 +1098,7 @@ static bool access_pmu_evtyper(struct kvm_vcpu *vcpu, struct sys_reg_params *p, if (r->CRn == 9 && r->CRm == 13 && r->Op2 == 1) { /* PMXEVTYPER_EL0 */ - idx = __vcpu_sys_reg(vcpu, PMSELR_EL0) & ARMV8_PMU_COUNTER_MASK; + idx = SYS_FIELD_GET(PMSELR_EL0, SEL, __vcpu_sys_reg(vcpu, PMSELR_EL0)); reg = PMEVTYPER0_EL0 + idx; } else if (r->CRn == 14 && (r->CRm & 12) == 12) { idx = ((r->CRm & 3) << 3) | (r->Op2 & 7); @@ -1251,10 +1251,8 @@ static bool access_pmuserenr(struct kvm_vcpu *vcpu, struct sys_reg_params *p, const struct sys_reg_desc *r) { if (p->is_write) { - if (!vcpu_mode_priv(vcpu)) { - kvm_inject_undefined(vcpu); - return false; - } + if (!vcpu_mode_priv(vcpu)) + return undef_access(vcpu, p, r); __vcpu_sys_reg(vcpu, PMUSERENR_EL0) = p->regval & ARMV8_PMU_USERENR_MASK; @@ -1338,14 +1336,6 @@ static int set_pmcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r, .reset = reset_pmevtyper, \ .access = access_pmu_evtyper, .reg = (PMEVTYPER0_EL0 + n), } -static bool undef_access(struct kvm_vcpu *vcpu, struct sys_reg_params *p, - const struct sys_reg_desc *r) -{ - kvm_inject_undefined(vcpu); - - return false; -} - /* Macro to expand the AMU counter and type registers*/ #define AMU_AMEVCNTR0_EL0(n) { SYS_DESC(SYS_AMEVCNTR0_EL0(n)), undef_access } #define AMU_AMEVTYPER0_EL0(n) { SYS_DESC(SYS_AMEVTYPER0_EL0(n)), undef_access } @@ -1404,8 +1394,7 @@ static bool access_arch_timer(struct kvm_vcpu *vcpu, break; default: print_sys_reg_msg(p, "%s", "Unhandled trapped timer register"); - kvm_inject_undefined(vcpu); - return false; + return undef_access(vcpu, p, r); } if (p->is_write) @@ -1539,6 +1528,10 @@ static u64 __kvm_read_sanitised_id_reg(const struct kvm_vcpu *vcpu, val &= ~ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_SME); break; + case SYS_ID_AA64PFR2_EL1: + /* We only expose FPMR */ + val &= ID_AA64PFR2_EL1_FPMR; + break; case SYS_ID_AA64ISAR1_EL1: if (!vcpu_has_ptrauth(vcpu)) val &= ~(ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_APA) | @@ -1556,6 +1549,9 @@ static u64 __kvm_read_sanitised_id_reg(const struct kvm_vcpu *vcpu, case SYS_ID_AA64MMFR2_EL1: val &= ~ID_AA64MMFR2_EL1_CCIDX_MASK; break; + case SYS_ID_AA64MMFR3_EL1: + val &= ID_AA64MMFR3_EL1_TCRX | ID_AA64MMFR3_EL1_S1POE; + break; case SYS_ID_MMFR4_EL1: val &= ~ARM64_FEATURE_MASK(ID_MMFR4_EL1_CCIDX); break; @@ -1669,6 +1665,24 @@ static unsigned int sve_visibility(const struct kvm_vcpu *vcpu, return REG_HIDDEN; } +static unsigned int sme_visibility(const struct kvm_vcpu *vcpu, + const struct sys_reg_desc *rd) +{ + if (kvm_has_feat(vcpu->kvm, ID_AA64PFR1_EL1, SME, IMP)) + return 0; + + return REG_HIDDEN; +} + +static unsigned int fp8_visibility(const struct kvm_vcpu *vcpu, + const struct sys_reg_desc *rd) +{ + if (kvm_has_fpmr(vcpu->kvm)) + return 0; + + return REG_HIDDEN; +} + static u64 read_sanitised_id_aa64pfr0_el1(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd) { @@ -2085,26 +2099,6 @@ static bool bad_redir_trap(struct kvm_vcpu *vcpu, #define EL2_REG_REDIR(name, rst, v) EL2_REG(name, bad_redir_trap, rst, v) /* - * EL{0,1}2 registers are the EL2 view on an EL0 or EL1 register when - * HCR_EL2.E2H==1, and only in the sysreg table for convenience of - * handling traps. Given that, they are always hidden from userspace. - */ -static unsigned int hidden_user_visibility(const struct kvm_vcpu *vcpu, - const struct sys_reg_desc *rd) -{ - return REG_HIDDEN_USER; -} - -#define EL12_REG(name, acc, rst, v) { \ - SYS_DESC(SYS_##name##_EL12), \ - .access = acc, \ - .reset = rst, \ - .reg = name##_EL1, \ - .val = v, \ - .visibility = hidden_user_visibility, \ -} - -/* * Since reset() callback and field val are not used for idregs, they will be * used for specific purposes for idregs. * The reset() would return KVM sanitised register value. The value would be the @@ -2211,6 +2205,18 @@ static bool access_spsr(struct kvm_vcpu *vcpu, return true; } +static bool access_cntkctl_el12(struct kvm_vcpu *vcpu, + struct sys_reg_params *p, + const struct sys_reg_desc *r) +{ + if (p->is_write) + __vcpu_sys_reg(vcpu, CNTKCTL_EL1) = p->regval; + else + p->regval = __vcpu_sys_reg(vcpu, CNTKCTL_EL1); + + return true; +} + static u64 reset_hcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) { u64 val = r->val; @@ -2255,6 +2261,15 @@ static bool access_zcr_el2(struct kvm_vcpu *vcpu, return true; } +static unsigned int s1poe_visibility(const struct kvm_vcpu *vcpu, + const struct sys_reg_desc *rd) +{ + if (kvm_has_feat(vcpu->kvm, ID_AA64MMFR3_EL1, S1POE, IMP)) + return 0; + + return REG_HIDDEN; +} + /* * Architected system registers. * Important: Must be sorted ascending by Op0, Op1, CRn, CRm, Op2 @@ -2301,7 +2316,7 @@ static const struct sys_reg_desc sys_reg_descs[] = { // DBGDTR[TR]X_EL0 share the same encoding { SYS_DESC(SYS_DBGDTRTX_EL0), trap_raz_wi }, - { SYS_DESC(SYS_DBGVCR32_EL2), trap_undef, reset_val, DBGVCR32_EL2, 0 }, + { SYS_DESC(SYS_DBGVCR32_EL2), undef_access, reset_val, DBGVCR32_EL2, 0 }, { SYS_DESC(SYS_MPIDR_EL1), NULL, reset_mpidr, MPIDR_EL1 }, @@ -2359,16 +2374,15 @@ static const struct sys_reg_desc sys_reg_descs[] = { ID_AA64PFR0_EL1_MPAM | ID_AA64PFR0_EL1_SVE | ID_AA64PFR0_EL1_RAS | - ID_AA64PFR0_EL1_GIC | ID_AA64PFR0_EL1_AdvSIMD | ID_AA64PFR0_EL1_FP), }, ID_SANITISED(ID_AA64PFR1_EL1), - ID_UNALLOCATED(4,2), + ID_WRITABLE(ID_AA64PFR2_EL1, ID_AA64PFR2_EL1_FPMR), ID_UNALLOCATED(4,3), ID_WRITABLE(ID_AA64ZFR0_EL1, ~ID_AA64ZFR0_EL1_RES0), ID_HIDDEN(ID_AA64SMFR0_EL1), ID_UNALLOCATED(4,6), - ID_UNALLOCATED(4,7), + ID_WRITABLE(ID_AA64FPFR0_EL1, ~ID_AA64FPFR0_EL1_RES0), /* CRm=5 */ { SYS_DESC(SYS_ID_AA64DFR0_EL1), @@ -2418,7 +2432,8 @@ static const struct sys_reg_desc sys_reg_descs[] = { ID_AA64MMFR2_EL1_IDS | ID_AA64MMFR2_EL1_NV | ID_AA64MMFR2_EL1_CCIDX)), - ID_SANITISED(ID_AA64MMFR3_EL1), + ID_WRITABLE(ID_AA64MMFR3_EL1, (ID_AA64MMFR3_EL1_TCRX | + ID_AA64MMFR3_EL1_S1POE)), ID_SANITISED(ID_AA64MMFR4_EL1), ID_UNALLOCATED(7,5), ID_UNALLOCATED(7,6), @@ -2449,6 +2464,8 @@ static const struct sys_reg_desc sys_reg_descs[] = { { SYS_DESC(SYS_SPSR_EL1), access_spsr}, { SYS_DESC(SYS_ELR_EL1), access_elr}, + { SYS_DESC(SYS_ICC_PMR_EL1), undef_access }, + { SYS_DESC(SYS_AFSR0_EL1), access_vm_reg, reset_unknown, AFSR0_EL1 }, { SYS_DESC(SYS_AFSR1_EL1), access_vm_reg, reset_unknown, AFSR1_EL1 }, { SYS_DESC(SYS_ESR_EL1), access_vm_reg, reset_unknown, ESR_EL1 }, @@ -2492,6 +2509,8 @@ static const struct sys_reg_desc sys_reg_descs[] = { { SYS_DESC(SYS_MAIR_EL1), access_vm_reg, reset_unknown, MAIR_EL1 }, { SYS_DESC(SYS_PIRE0_EL1), NULL, reset_unknown, PIRE0_EL1 }, { SYS_DESC(SYS_PIR_EL1), NULL, reset_unknown, PIR_EL1 }, + { SYS_DESC(SYS_POR_EL1), NULL, reset_unknown, POR_EL1, + .visibility = s1poe_visibility }, { SYS_DESC(SYS_AMAIR_EL1), access_vm_reg, reset_amair_el1, AMAIR_EL1 }, { SYS_DESC(SYS_LORSA_EL1), trap_loregion }, @@ -2503,18 +2522,31 @@ static const struct sys_reg_desc sys_reg_descs[] = { { SYS_DESC(SYS_VBAR_EL1), access_rw, reset_val, VBAR_EL1, 0 }, { SYS_DESC(SYS_DISR_EL1), NULL, reset_val, DISR_EL1, 0 }, - { SYS_DESC(SYS_ICC_IAR0_EL1), write_to_read_only }, - { SYS_DESC(SYS_ICC_EOIR0_EL1), read_from_write_only }, - { SYS_DESC(SYS_ICC_HPPIR0_EL1), write_to_read_only }, - { SYS_DESC(SYS_ICC_DIR_EL1), read_from_write_only }, - { SYS_DESC(SYS_ICC_RPR_EL1), write_to_read_only }, + { SYS_DESC(SYS_ICC_IAR0_EL1), undef_access }, + { SYS_DESC(SYS_ICC_EOIR0_EL1), undef_access }, + { SYS_DESC(SYS_ICC_HPPIR0_EL1), undef_access }, + { SYS_DESC(SYS_ICC_BPR0_EL1), undef_access }, + { SYS_DESC(SYS_ICC_AP0R0_EL1), undef_access }, + { SYS_DESC(SYS_ICC_AP0R1_EL1), undef_access }, + { SYS_DESC(SYS_ICC_AP0R2_EL1), undef_access }, + { SYS_DESC(SYS_ICC_AP0R3_EL1), undef_access }, + { SYS_DESC(SYS_ICC_AP1R0_EL1), undef_access }, + { SYS_DESC(SYS_ICC_AP1R1_EL1), undef_access }, + { SYS_DESC(SYS_ICC_AP1R2_EL1), undef_access }, + { SYS_DESC(SYS_ICC_AP1R3_EL1), undef_access }, + { SYS_DESC(SYS_ICC_DIR_EL1), undef_access }, + { SYS_DESC(SYS_ICC_RPR_EL1), undef_access }, { SYS_DESC(SYS_ICC_SGI1R_EL1), access_gic_sgi }, { SYS_DESC(SYS_ICC_ASGI1R_EL1), access_gic_sgi }, { SYS_DESC(SYS_ICC_SGI0R_EL1), access_gic_sgi }, - { SYS_DESC(SYS_ICC_IAR1_EL1), write_to_read_only }, - { SYS_DESC(SYS_ICC_EOIR1_EL1), read_from_write_only }, - { SYS_DESC(SYS_ICC_HPPIR1_EL1), write_to_read_only }, + { SYS_DESC(SYS_ICC_IAR1_EL1), undef_access }, + { SYS_DESC(SYS_ICC_EOIR1_EL1), undef_access }, + { SYS_DESC(SYS_ICC_HPPIR1_EL1), undef_access }, + { SYS_DESC(SYS_ICC_BPR1_EL1), undef_access }, + { SYS_DESC(SYS_ICC_CTLR_EL1), undef_access }, { SYS_DESC(SYS_ICC_SRE_EL1), access_gic_sre }, + { SYS_DESC(SYS_ICC_IGRPEN0_EL1), undef_access }, + { SYS_DESC(SYS_ICC_IGRPEN1_EL1), undef_access }, { SYS_DESC(SYS_CONTEXTIDR_EL1), access_vm_reg, reset_val, CONTEXTIDR_EL1, 0 }, { SYS_DESC(SYS_TPIDR_EL1), NULL, reset_unknown, TPIDR_EL1 }, @@ -2535,7 +2567,8 @@ static const struct sys_reg_desc sys_reg_descs[] = { CTR_EL0_IDC_MASK | CTR_EL0_DminLine_MASK | CTR_EL0_IminLine_MASK), - { SYS_DESC(SYS_SVCR), undef_access }, + { SYS_DESC(SYS_SVCR), undef_access, reset_val, SVCR, 0, .visibility = sme_visibility }, + { SYS_DESC(SYS_FPMR), undef_access, reset_val, FPMR, 0, .visibility = fp8_visibility }, { PMU_SYS_REG(PMCR_EL0), .access = access_pmcr, .reset = reset_pmcr, .reg = PMCR_EL0, .get_user = get_pmcr, .set_user = set_pmcr }, @@ -2578,6 +2611,8 @@ static const struct sys_reg_desc sys_reg_descs[] = { .access = access_pmovs, .reg = PMOVSSET_EL0, .get_user = get_pmreg, .set_user = set_pmreg }, + { SYS_DESC(SYS_POR_EL0), NULL, reset_unknown, POR_EL0, + .visibility = s1poe_visibility }, { SYS_DESC(SYS_TPIDR_EL0), NULL, reset_unknown, TPIDR_EL0 }, { SYS_DESC(SYS_TPIDRRO_EL0), NULL, reset_unknown, TPIDRRO_EL0 }, { SYS_DESC(SYS_TPIDR2_EL0), undef_access }, @@ -2758,7 +2793,7 @@ static const struct sys_reg_desc sys_reg_descs[] = { EL2_REG_VNCR(VTTBR_EL2, reset_val, 0), EL2_REG_VNCR(VTCR_EL2, reset_val, 0), - { SYS_DESC(SYS_DACR32_EL2), trap_undef, reset_unknown, DACR32_EL2 }, + { SYS_DESC(SYS_DACR32_EL2), undef_access, reset_unknown, DACR32_EL2 }, EL2_REG_VNCR(HDFGRTR_EL2, reset_val, 0), EL2_REG_VNCR(HDFGWTR_EL2, reset_val, 0), EL2_REG_VNCR(HAFGRTR_EL2, reset_val, 0), @@ -2767,20 +2802,16 @@ static const struct sys_reg_desc sys_reg_descs[] = { { SYS_DESC(SYS_SP_EL1), access_sp_el1}, /* AArch32 SPSR_* are RES0 if trapped from a NV guest */ - { SYS_DESC(SYS_SPSR_irq), .access = trap_raz_wi, - .visibility = hidden_user_visibility }, - { SYS_DESC(SYS_SPSR_abt), .access = trap_raz_wi, - .visibility = hidden_user_visibility }, - { SYS_DESC(SYS_SPSR_und), .access = trap_raz_wi, - .visibility = hidden_user_visibility }, - { SYS_DESC(SYS_SPSR_fiq), .access = trap_raz_wi, - .visibility = hidden_user_visibility }, - - { SYS_DESC(SYS_IFSR32_EL2), trap_undef, reset_unknown, IFSR32_EL2 }, + { SYS_DESC(SYS_SPSR_irq), .access = trap_raz_wi }, + { SYS_DESC(SYS_SPSR_abt), .access = trap_raz_wi }, + { SYS_DESC(SYS_SPSR_und), .access = trap_raz_wi }, + { SYS_DESC(SYS_SPSR_fiq), .access = trap_raz_wi }, + + { SYS_DESC(SYS_IFSR32_EL2), undef_access, reset_unknown, IFSR32_EL2 }, EL2_REG(AFSR0_EL2, access_rw, reset_val, 0), EL2_REG(AFSR1_EL2, access_rw, reset_val, 0), EL2_REG_REDIR(ESR_EL2, reset_val, 0), - { SYS_DESC(SYS_FPEXC32_EL2), trap_undef, reset_val, FPEXC32_EL2, 0x700 }, + { SYS_DESC(SYS_FPEXC32_EL2), undef_access, reset_val, FPEXC32_EL2, 0x700 }, EL2_REG_REDIR(FAR_EL2, reset_val, 0), EL2_REG(HPFAR_EL2, access_rw, reset_val, 0), @@ -2790,7 +2821,9 @@ static const struct sys_reg_desc sys_reg_descs[] = { EL2_REG(VBAR_EL2, access_rw, reset_val, 0), EL2_REG(RVBAR_EL2, access_rw, reset_val, 0), - { SYS_DESC(SYS_RMR_EL2), trap_undef }, + { SYS_DESC(SYS_RMR_EL2), undef_access }, + + EL2_REG_VNCR(ICH_HCR_EL2, reset_val, 0), EL2_REG(CONTEXTIDR_EL2, access_rw, reset_val, 0), EL2_REG(TPIDR_EL2, access_rw, reset_val, 0), @@ -2798,11 +2831,48 @@ static const struct sys_reg_desc sys_reg_descs[] = { EL2_REG_VNCR(CNTVOFF_EL2, reset_val, 0), EL2_REG(CNTHCTL_EL2, access_rw, reset_val, 0), - EL12_REG(CNTKCTL, access_rw, reset_val, 0), + { SYS_DESC(SYS_CNTKCTL_EL12), access_cntkctl_el12 }, EL2_REG(SP_EL2, NULL, reset_unknown, 0), }; +static bool handle_at_s1e01(struct kvm_vcpu *vcpu, struct sys_reg_params *p, + const struct sys_reg_desc *r) +{ + u32 op = sys_insn(p->Op0, p->Op1, p->CRn, p->CRm, p->Op2); + + __kvm_at_s1e01(vcpu, op, p->regval); + + return true; +} + +static bool handle_at_s1e2(struct kvm_vcpu *vcpu, struct sys_reg_params *p, + const struct sys_reg_desc *r) +{ + u32 op = sys_insn(p->Op0, p->Op1, p->CRn, p->CRm, p->Op2); + + /* There is no FGT associated with AT S1E2A :-( */ + if (op == OP_AT_S1E2A && + !kvm_has_feat(vcpu->kvm, ID_AA64ISAR2_EL1, ATS1A, IMP)) { + kvm_inject_undefined(vcpu); + return false; + } + + __kvm_at_s1e2(vcpu, op, p->regval); + + return true; +} + +static bool handle_at_s12(struct kvm_vcpu *vcpu, struct sys_reg_params *p, + const struct sys_reg_desc *r) +{ + u32 op = sys_insn(p->Op0, p->Op1, p->CRn, p->CRm, p->Op2); + + __kvm_at_s12(vcpu, op, p->regval); + + return true; +} + static bool kvm_supported_tlbi_s12_op(struct kvm_vcpu *vpcu, u32 instr) { struct kvm *kvm = vpcu->kvm; @@ -2824,10 +2894,8 @@ static bool handle_alle1is(struct kvm_vcpu *vcpu, struct sys_reg_params *p, { u32 sys_encoding = sys_insn(p->Op0, p->Op1, p->CRn, p->CRm, p->Op2); - if (!kvm_supported_tlbi_s12_op(vcpu, sys_encoding)) { - kvm_inject_undefined(vcpu); - return false; - } + if (!kvm_supported_tlbi_s12_op(vcpu, sys_encoding)) + return undef_access(vcpu, p, r); write_lock(&vcpu->kvm->mmu_lock); @@ -2896,10 +2964,8 @@ static bool handle_vmalls12e1is(struct kvm_vcpu *vcpu, struct sys_reg_params *p, u32 sys_encoding = sys_insn(p->Op0, p->Op1, p->CRn, p->CRm, p->Op2); u64 limit, vttbr; - if (!kvm_supported_tlbi_s12_op(vcpu, sys_encoding)) { - kvm_inject_undefined(vcpu); - return false; - } + if (!kvm_supported_tlbi_s12_op(vcpu, sys_encoding)) + return undef_access(vcpu, p, r); vttbr = vcpu_read_sys_reg(vcpu, VTTBR_EL2); limit = BIT_ULL(kvm_get_pa_bits(vcpu->kvm)); @@ -2924,10 +2990,8 @@ static bool handle_ripas2e1is(struct kvm_vcpu *vcpu, struct sys_reg_params *p, u64 base, range, tg, num, scale; int shift; - if (!kvm_supported_tlbi_ipas2_op(vcpu, sys_encoding)) { - kvm_inject_undefined(vcpu); - return false; - } + if (!kvm_supported_tlbi_ipas2_op(vcpu, sys_encoding)) + return undef_access(vcpu, p, r); /* * Because the shadow S2 structure doesn't necessarily reflect that @@ -2995,10 +3059,8 @@ static bool handle_ipas2e1is(struct kvm_vcpu *vcpu, struct sys_reg_params *p, u32 sys_encoding = sys_insn(p->Op0, p->Op1, p->CRn, p->CRm, p->Op2); u64 vttbr = vcpu_read_sys_reg(vcpu, VTTBR_EL2); - if (!kvm_supported_tlbi_ipas2_op(vcpu, sys_encoding)) { - kvm_inject_undefined(vcpu); - return false; - } + if (!kvm_supported_tlbi_ipas2_op(vcpu, sys_encoding)) + return undef_access(vcpu, p, r); kvm_s2_mmu_iterate_by_vmid(vcpu->kvm, get_vmid(vttbr), &(union tlbi_info) { @@ -3038,10 +3100,8 @@ static bool handle_tlbi_el1(struct kvm_vcpu *vcpu, struct sys_reg_params *p, WARN_ON(!vcpu_is_el2(vcpu)); - if (!kvm_supported_tlbi_s1e1_op(vcpu, sys_encoding)) { - kvm_inject_undefined(vcpu); - return false; - } + if (!kvm_supported_tlbi_s1e1_op(vcpu, sys_encoding)) + return undef_access(vcpu, p, r); kvm_s2_mmu_iterate_by_vmid(vcpu->kvm, get_vmid(vttbr), &(union tlbi_info) { @@ -3065,6 +3125,14 @@ static struct sys_reg_desc sys_insn_descs[] = { { SYS_DESC(SYS_DC_ISW), access_dcsw }, { SYS_DESC(SYS_DC_IGSW), access_dcgsw }, { SYS_DESC(SYS_DC_IGDSW), access_dcgsw }, + + SYS_INSN(AT_S1E1R, handle_at_s1e01), + SYS_INSN(AT_S1E1W, handle_at_s1e01), + SYS_INSN(AT_S1E0R, handle_at_s1e01), + SYS_INSN(AT_S1E0W, handle_at_s1e01), + SYS_INSN(AT_S1E1RP, handle_at_s1e01), + SYS_INSN(AT_S1E1WP, handle_at_s1e01), + { SYS_DESC(SYS_DC_CSW), access_dcsw }, { SYS_DESC(SYS_DC_CGSW), access_dcgsw }, { SYS_DESC(SYS_DC_CGDSW), access_dcgsw }, @@ -3144,19 +3212,27 @@ static struct sys_reg_desc sys_insn_descs[] = { SYS_INSN(TLBI_VALE1NXS, handle_tlbi_el1), SYS_INSN(TLBI_VAALE1NXS, handle_tlbi_el1), + SYS_INSN(AT_S1E2R, handle_at_s1e2), + SYS_INSN(AT_S1E2W, handle_at_s1e2), + SYS_INSN(AT_S12E1R, handle_at_s12), + SYS_INSN(AT_S12E1W, handle_at_s12), + SYS_INSN(AT_S12E0R, handle_at_s12), + SYS_INSN(AT_S12E0W, handle_at_s12), + SYS_INSN(AT_S1E2A, handle_at_s1e2), + SYS_INSN(TLBI_IPAS2E1IS, handle_ipas2e1is), SYS_INSN(TLBI_RIPAS2E1IS, handle_ripas2e1is), SYS_INSN(TLBI_IPAS2LE1IS, handle_ipas2e1is), SYS_INSN(TLBI_RIPAS2LE1IS, handle_ripas2e1is), - SYS_INSN(TLBI_ALLE2OS, trap_undef), - SYS_INSN(TLBI_VAE2OS, trap_undef), + SYS_INSN(TLBI_ALLE2OS, undef_access), + SYS_INSN(TLBI_VAE2OS, undef_access), SYS_INSN(TLBI_ALLE1OS, handle_alle1is), - SYS_INSN(TLBI_VALE2OS, trap_undef), + SYS_INSN(TLBI_VALE2OS, undef_access), SYS_INSN(TLBI_VMALLS12E1OS, handle_vmalls12e1is), - SYS_INSN(TLBI_RVAE2IS, trap_undef), - SYS_INSN(TLBI_RVALE2IS, trap_undef), + SYS_INSN(TLBI_RVAE2IS, undef_access), + SYS_INSN(TLBI_RVALE2IS, undef_access), SYS_INSN(TLBI_ALLE1IS, handle_alle1is), SYS_INSN(TLBI_VMALLS12E1IS, handle_vmalls12e1is), @@ -3168,10 +3244,10 @@ static struct sys_reg_desc sys_insn_descs[] = { SYS_INSN(TLBI_IPAS2LE1, handle_ipas2e1is), SYS_INSN(TLBI_RIPAS2LE1, handle_ripas2e1is), SYS_INSN(TLBI_RIPAS2LE1OS, handle_ripas2e1is), - SYS_INSN(TLBI_RVAE2OS, trap_undef), - SYS_INSN(TLBI_RVALE2OS, trap_undef), - SYS_INSN(TLBI_RVAE2, trap_undef), - SYS_INSN(TLBI_RVALE2, trap_undef), + SYS_INSN(TLBI_RVAE2OS, undef_access), + SYS_INSN(TLBI_RVALE2OS, undef_access), + SYS_INSN(TLBI_RVAE2, undef_access), + SYS_INSN(TLBI_RVALE2, undef_access), SYS_INSN(TLBI_ALLE1, handle_alle1is), SYS_INSN(TLBI_VMALLS12E1, handle_vmalls12e1is), @@ -3180,19 +3256,19 @@ static struct sys_reg_desc sys_insn_descs[] = { SYS_INSN(TLBI_IPAS2LE1ISNXS, handle_ipas2e1is), SYS_INSN(TLBI_RIPAS2LE1ISNXS, handle_ripas2e1is), - SYS_INSN(TLBI_ALLE2OSNXS, trap_undef), - SYS_INSN(TLBI_VAE2OSNXS, trap_undef), + SYS_INSN(TLBI_ALLE2OSNXS, undef_access), + SYS_INSN(TLBI_VAE2OSNXS, undef_access), SYS_INSN(TLBI_ALLE1OSNXS, handle_alle1is), - SYS_INSN(TLBI_VALE2OSNXS, trap_undef), + SYS_INSN(TLBI_VALE2OSNXS, undef_access), SYS_INSN(TLBI_VMALLS12E1OSNXS, handle_vmalls12e1is), - SYS_INSN(TLBI_RVAE2ISNXS, trap_undef), - SYS_INSN(TLBI_RVALE2ISNXS, trap_undef), - SYS_INSN(TLBI_ALLE2ISNXS, trap_undef), - SYS_INSN(TLBI_VAE2ISNXS, trap_undef), + SYS_INSN(TLBI_RVAE2ISNXS, undef_access), + SYS_INSN(TLBI_RVALE2ISNXS, undef_access), + SYS_INSN(TLBI_ALLE2ISNXS, undef_access), + SYS_INSN(TLBI_VAE2ISNXS, undef_access), SYS_INSN(TLBI_ALLE1ISNXS, handle_alle1is), - SYS_INSN(TLBI_VALE2ISNXS, trap_undef), + SYS_INSN(TLBI_VALE2ISNXS, undef_access), SYS_INSN(TLBI_VMALLS12E1ISNXS, handle_vmalls12e1is), SYS_INSN(TLBI_IPAS2E1OSNXS, handle_ipas2e1is), SYS_INSN(TLBI_IPAS2E1NXS, handle_ipas2e1is), @@ -3202,14 +3278,14 @@ static struct sys_reg_desc sys_insn_descs[] = { SYS_INSN(TLBI_IPAS2LE1NXS, handle_ipas2e1is), SYS_INSN(TLBI_RIPAS2LE1NXS, handle_ripas2e1is), SYS_INSN(TLBI_RIPAS2LE1OSNXS, handle_ripas2e1is), - SYS_INSN(TLBI_RVAE2OSNXS, trap_undef), - SYS_INSN(TLBI_RVALE2OSNXS, trap_undef), - SYS_INSN(TLBI_RVAE2NXS, trap_undef), - SYS_INSN(TLBI_RVALE2NXS, trap_undef), - SYS_INSN(TLBI_ALLE2NXS, trap_undef), - SYS_INSN(TLBI_VAE2NXS, trap_undef), + SYS_INSN(TLBI_RVAE2OSNXS, undef_access), + SYS_INSN(TLBI_RVALE2OSNXS, undef_access), + SYS_INSN(TLBI_RVAE2NXS, undef_access), + SYS_INSN(TLBI_RVALE2NXS, undef_access), + SYS_INSN(TLBI_ALLE2NXS, undef_access), + SYS_INSN(TLBI_VAE2NXS, undef_access), SYS_INSN(TLBI_ALLE1NXS, handle_alle1is), - SYS_INSN(TLBI_VALE2NXS, trap_undef), + SYS_INSN(TLBI_VALE2NXS, undef_access), SYS_INSN(TLBI_VMALLS12E1NXS, handle_vmalls12e1is), }; @@ -3387,6 +3463,7 @@ static const struct sys_reg_desc cp15_regs[] = { /* TTBCR2 */ { AA32(HI), Op1( 0), CRn( 2), CRm( 0), Op2( 3), access_vm_reg, NULL, TCR_EL1 }, { Op1( 0), CRn( 3), CRm( 0), Op2( 0), access_vm_reg, NULL, DACR32_EL2 }, + { CP15_SYS_DESC(SYS_ICC_PMR_EL1), undef_access }, /* DFSR */ { Op1( 0), CRn( 5), CRm( 0), Op2( 0), access_vm_reg, NULL, ESR_EL1 }, { Op1( 0), CRn( 5), CRm( 0), Op2( 1), access_vm_reg, NULL, IFSR32_EL2 }, @@ -3436,8 +3513,28 @@ static const struct sys_reg_desc cp15_regs[] = { /* AMAIR1 */ { AA32(HI), Op1( 0), CRn(10), CRm( 3), Op2( 1), access_vm_reg, NULL, AMAIR_EL1 }, - /* ICC_SRE */ - { Op1( 0), CRn(12), CRm(12), Op2( 5), access_gic_sre }, + { CP15_SYS_DESC(SYS_ICC_IAR0_EL1), undef_access }, + { CP15_SYS_DESC(SYS_ICC_EOIR0_EL1), undef_access }, + { CP15_SYS_DESC(SYS_ICC_HPPIR0_EL1), undef_access }, + { CP15_SYS_DESC(SYS_ICC_BPR0_EL1), undef_access }, + { CP15_SYS_DESC(SYS_ICC_AP0R0_EL1), undef_access }, + { CP15_SYS_DESC(SYS_ICC_AP0R1_EL1), undef_access }, + { CP15_SYS_DESC(SYS_ICC_AP0R2_EL1), undef_access }, + { CP15_SYS_DESC(SYS_ICC_AP0R3_EL1), undef_access }, + { CP15_SYS_DESC(SYS_ICC_AP1R0_EL1), undef_access }, + { CP15_SYS_DESC(SYS_ICC_AP1R1_EL1), undef_access }, + { CP15_SYS_DESC(SYS_ICC_AP1R2_EL1), undef_access }, + { CP15_SYS_DESC(SYS_ICC_AP1R3_EL1), undef_access }, + { CP15_SYS_DESC(SYS_ICC_DIR_EL1), undef_access }, + { CP15_SYS_DESC(SYS_ICC_RPR_EL1), undef_access }, + { CP15_SYS_DESC(SYS_ICC_IAR1_EL1), undef_access }, + { CP15_SYS_DESC(SYS_ICC_EOIR1_EL1), undef_access }, + { CP15_SYS_DESC(SYS_ICC_HPPIR1_EL1), undef_access }, + { CP15_SYS_DESC(SYS_ICC_BPR1_EL1), undef_access }, + { CP15_SYS_DESC(SYS_ICC_CTLR_EL1), undef_access }, + { CP15_SYS_DESC(SYS_ICC_SRE_EL1), access_gic_sre }, + { CP15_SYS_DESC(SYS_ICC_IGRPEN0_EL1), undef_access }, + { CP15_SYS_DESC(SYS_ICC_IGRPEN1_EL1), undef_access }, { Op1( 0), CRn(13), CRm( 0), Op2( 1), access_vm_reg, NULL, CONTEXTIDR_EL1 }, @@ -4274,7 +4371,7 @@ int kvm_sys_reg_get_user(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg, int ret; r = id_to_sys_reg_desc(vcpu, reg->id, table, num); - if (!r || sysreg_hidden_user(vcpu, r)) + if (!r || sysreg_hidden(vcpu, r)) return -ENOENT; if (r->get_user) { @@ -4318,7 +4415,7 @@ int kvm_sys_reg_set_user(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg, return -EFAULT; r = id_to_sys_reg_desc(vcpu, reg->id, table, num); - if (!r || sysreg_hidden_user(vcpu, r)) + if (!r || sysreg_hidden(vcpu, r)) return -ENOENT; if (sysreg_user_write_ignore(vcpu, r)) @@ -4404,7 +4501,7 @@ static int walk_one_sys_reg(const struct kvm_vcpu *vcpu, if (!(rd->reg || rd->get_user)) return 0; - if (sysreg_hidden_user(vcpu, rd)) + if (sysreg_hidden(vcpu, rd)) return 0; if (!copy_reg_to_user(rd, uind)) @@ -4545,6 +4642,7 @@ void kvm_calculate_traps(struct kvm_vcpu *vcpu) mutex_lock(&kvm->arch.config_lock); vcpu_set_hcr(vcpu); + vcpu_set_ich_hcr(vcpu); if (cpus_have_final_cap(ARM64_HAS_HCX)) { /* @@ -4560,6 +4658,9 @@ void kvm_calculate_traps(struct kvm_vcpu *vcpu) if (kvm_has_feat(kvm, ID_AA64MMFR3_EL1, TCRX, IMP)) vcpu->arch.hcrx_el2 |= HCRX_EL2_TCR2En; + + if (kvm_has_fpmr(kvm)) + vcpu->arch.hcrx_el2 |= HCRX_EL2_EnFPM; } if (test_bit(KVM_ARCH_FLAG_FGU_INITIALIZED, &kvm->arch.flags)) @@ -4568,8 +4669,6 @@ void kvm_calculate_traps(struct kvm_vcpu *vcpu) kvm->arch.fgu[HFGxTR_GROUP] = (HFGxTR_EL2_nAMAIR2_EL1 | HFGxTR_EL2_nMAIR2_EL1 | HFGxTR_EL2_nS2POR_EL1 | - HFGxTR_EL2_nPOR_EL1 | - HFGxTR_EL2_nPOR_EL0 | HFGxTR_EL2_nACCDATA_EL1 | HFGxTR_EL2_nSMPRI_EL1_MASK | HFGxTR_EL2_nTPIDR2_EL0_MASK); @@ -4600,10 +4699,21 @@ void kvm_calculate_traps(struct kvm_vcpu *vcpu) HFGITR_EL2_TLBIRVAAE1OS | HFGITR_EL2_TLBIRVAE1OS); + if (!kvm_has_feat(kvm, ID_AA64ISAR2_EL1, ATS1A, IMP)) + kvm->arch.fgu[HFGITR_GROUP] |= HFGITR_EL2_ATS1E1A; + + if (!kvm_has_feat(kvm, ID_AA64MMFR1_EL1, PAN, PAN2)) + kvm->arch.fgu[HFGITR_GROUP] |= (HFGITR_EL2_ATS1E1RP | + HFGITR_EL2_ATS1E1WP); + if (!kvm_has_feat(kvm, ID_AA64MMFR3_EL1, S1PIE, IMP)) kvm->arch.fgu[HFGxTR_GROUP] |= (HFGxTR_EL2_nPIRE0_EL1 | HFGxTR_EL2_nPIR_EL1); + if (!kvm_has_feat(kvm, ID_AA64MMFR3_EL1, S1POE, IMP)) + kvm->arch.fgu[HFGxTR_GROUP] |= (HFGxTR_EL2_nPOR_EL1 | + HFGxTR_EL2_nPOR_EL0); + if (!kvm_has_feat(kvm, ID_AA64PFR0_EL1, AMU, IMP)) kvm->arch.fgu[HAFGRTR_GROUP] |= ~(HAFGRTR_EL2_RES0 | HAFGRTR_EL2_RES1); @@ -4613,6 +4723,36 @@ out: mutex_unlock(&kvm->arch.config_lock); } +/* + * Perform last adjustments to the ID registers that are implied by the + * configuration outside of the ID regs themselves, as well as any + * initialisation that directly depend on these ID registers (such as + * RES0/RES1 behaviours). This is not the place to configure traps though. + * + * Because this can be called once per CPU, changes must be idempotent. + */ +int kvm_finalize_sys_regs(struct kvm_vcpu *vcpu) +{ + struct kvm *kvm = vcpu->kvm; + + guard(mutex)(&kvm->arch.config_lock); + + if (!(static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif) && + irqchip_in_kernel(kvm) && + kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3)) { + kvm->arch.id_regs[IDREG_IDX(SYS_ID_AA64PFR0_EL1)] &= ~ID_AA64PFR0_EL1_GIC_MASK; + kvm->arch.id_regs[IDREG_IDX(SYS_ID_PFR1_EL1)] &= ~ID_PFR1_EL1_GIC_MASK; + } + + if (vcpu_has_nv(vcpu)) { + int ret = kvm_init_nv_sysregs(kvm); + if (ret) + return ret; + } + + return 0; +} + int __init kvm_sys_reg_table_init(void) { bool valid = true; diff --git a/arch/arm64/kvm/sys_regs.h b/arch/arm64/kvm/sys_regs.h index 997eea21ba2a..1d94ed6efad2 100644 --- a/arch/arm64/kvm/sys_regs.h +++ b/arch/arm64/kvm/sys_regs.h @@ -95,9 +95,8 @@ struct sys_reg_desc { }; #define REG_HIDDEN (1 << 0) /* hidden from userspace and guest */ -#define REG_HIDDEN_USER (1 << 1) /* hidden from userspace only */ -#define REG_RAZ (1 << 2) /* RAZ from userspace and guest */ -#define REG_USER_WI (1 << 3) /* WI from userspace only */ +#define REG_RAZ (1 << 1) /* RAZ from userspace and guest */ +#define REG_USER_WI (1 << 2) /* WI from userspace only */ static __printf(2, 3) inline void print_sys_reg_msg(const struct sys_reg_params *p, @@ -165,15 +164,6 @@ static inline bool sysreg_hidden(const struct kvm_vcpu *vcpu, return sysreg_visibility(vcpu, r) & REG_HIDDEN; } -static inline bool sysreg_hidden_user(const struct kvm_vcpu *vcpu, - const struct sys_reg_desc *r) -{ - if (likely(!r->visibility)) - return false; - - return r->visibility(vcpu, r) & (REG_HIDDEN | REG_HIDDEN_USER); -} - static inline bool sysreg_visible_as_raz(const struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) { @@ -235,6 +225,8 @@ int kvm_sys_reg_set_user(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg, bool triage_sysreg_trap(struct kvm_vcpu *vcpu, int *sr_index); +int kvm_finalize_sys_regs(struct kvm_vcpu *vcpu); + #define AA32(_x) .aarch32_map = AA32_##_x #define Op0(_x) .Op0 = _x #define Op1(_x) .Op1 = _x @@ -248,4 +240,11 @@ bool triage_sysreg_trap(struct kvm_vcpu *vcpu, int *sr_index); CRn(sys_reg_CRn(reg)), CRm(sys_reg_CRm(reg)), \ Op2(sys_reg_Op2(reg)) +#define CP15_SYS_DESC(reg) \ + .name = #reg, \ + .aarch32_map = AA32_DIRECT, \ + Op0(0), Op1(sys_reg_Op1(reg)), \ + CRn(sys_reg_CRn(reg)), CRm(sys_reg_CRm(reg)), \ + Op2(sys_reg_Op2(reg)) + #endif /* __ARM64_KVM_SYS_REGS_LOCAL_H__ */ diff --git a/arch/arm64/kvm/vgic/vgic-debug.c b/arch/arm64/kvm/vgic/vgic-debug.c index bc74d06398ef..e1397ab2072a 100644 --- a/arch/arm64/kvm/vgic/vgic-debug.c +++ b/arch/arm64/kvm/vgic/vgic-debug.c @@ -85,7 +85,7 @@ static void iter_unmark_lpis(struct kvm *kvm) struct vgic_irq *irq; unsigned long intid; - xa_for_each(&dist->lpi_xa, intid, irq) { + xa_for_each_marked(&dist->lpi_xa, intid, irq, LPI_XA_MARK_DEBUG_ITER) { xa_clear_mark(&dist->lpi_xa, intid, LPI_XA_MARK_DEBUG_ITER); vgic_put_irq(kvm, irq); } diff --git a/arch/arm64/kvm/vgic/vgic-init.c b/arch/arm64/kvm/vgic/vgic-init.c index 41feb858ff9a..e7c53e8af3d1 100644 --- a/arch/arm64/kvm/vgic/vgic-init.c +++ b/arch/arm64/kvm/vgic/vgic-init.c @@ -417,10 +417,8 @@ static void __kvm_vgic_vcpu_destroy(struct kvm_vcpu *vcpu) kfree(vgic_cpu->private_irqs); vgic_cpu->private_irqs = NULL; - if (vcpu->kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3) { - vgic_unregister_redist_iodev(vcpu); + if (vcpu->kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3) vgic_cpu->rd_iodev.base_addr = VGIC_ADDR_UNDEF; - } } void kvm_vgic_vcpu_destroy(struct kvm_vcpu *vcpu) @@ -448,6 +446,11 @@ void kvm_vgic_destroy(struct kvm *kvm) kvm_vgic_dist_destroy(kvm); mutex_unlock(&kvm->arch.config_lock); + + if (kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3) + kvm_for_each_vcpu(i, vcpu, kvm) + vgic_unregister_redist_iodev(vcpu); + mutex_unlock(&kvm->slots_lock); } diff --git a/arch/arm64/kvm/vgic/vgic-v3.c b/arch/arm64/kvm/vgic/vgic-v3.c index 3eecdd2f4b8f..b217b256853c 100644 --- a/arch/arm64/kvm/vgic/vgic-v3.c +++ b/arch/arm64/kvm/vgic/vgic-v3.c @@ -292,6 +292,18 @@ void vgic_v3_enable(struct kvm_vcpu *vcpu) /* Get the show on the road... */ vgic_v3->vgic_hcr = ICH_HCR_EN; +} + +void vcpu_set_ich_hcr(struct kvm_vcpu *vcpu) +{ + struct vgic_v3_cpu_if *vgic_v3 = &vcpu->arch.vgic_cpu.vgic_v3; + + /* Hide GICv3 sysreg if necessary */ + if (!kvm_has_gicv3(vcpu->kvm)) { + vgic_v3->vgic_hcr |= ICH_HCR_TALL0 | ICH_HCR_TALL1 | ICH_HCR_TC; + return; + } + if (group0_trap) vgic_v3->vgic_hcr |= ICH_HCR_TALL0; if (group1_trap) diff --git a/arch/arm64/kvm/vgic/vgic.c b/arch/arm64/kvm/vgic/vgic.c index 974849ea7101..f50274fd5581 100644 --- a/arch/arm64/kvm/vgic/vgic.c +++ b/arch/arm64/kvm/vgic/vgic.c @@ -36,6 +36,11 @@ struct vgic_global kvm_vgic_global_state __ro_after_init = { * we have to disable IRQs before taking this lock and everything lower * than it. * + * The config_lock has additional ordering requirements: + * kvm->slots_lock + * kvm->srcu + * kvm->arch.config_lock + * * If you need to take multiple locks, always take the upper lock first, * then the lower ones, e.g. first take the its_lock, then the irq_lock. * If you are already holding a lock and need to take a higher one, you @@ -917,10 +922,13 @@ void kvm_vgic_flush_hwstate(struct kvm_vcpu *vcpu) void kvm_vgic_load(struct kvm_vcpu *vcpu) { - if (unlikely(!vgic_initialized(vcpu->kvm))) + if (unlikely(!irqchip_in_kernel(vcpu->kvm) || !vgic_initialized(vcpu->kvm))) { + if (has_vhe() && static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif)) + __vgic_v3_activate_traps(&vcpu->arch.vgic_cpu.vgic_v3); return; + } - if (kvm_vgic_global_state.type == VGIC_V2) + if (!static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif)) vgic_v2_load(vcpu); else vgic_v3_load(vcpu); @@ -928,10 +936,13 @@ void kvm_vgic_load(struct kvm_vcpu *vcpu) void kvm_vgic_put(struct kvm_vcpu *vcpu) { - if (unlikely(!vgic_initialized(vcpu->kvm))) + if (unlikely(!irqchip_in_kernel(vcpu->kvm) || !vgic_initialized(vcpu->kvm))) { + if (has_vhe() && static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif)) + __vgic_v3_deactivate_traps(&vcpu->arch.vgic_cpu.vgic_v3); return; + } - if (kvm_vgic_global_state.type == VGIC_V2) + if (!static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif)) vgic_v2_put(vcpu); else vgic_v3_put(vcpu); diff --git a/arch/arm64/kvm/vgic/vgic.h b/arch/arm64/kvm/vgic/vgic.h index ba8f790431bd..f2486b4d9f95 100644 --- a/arch/arm64/kvm/vgic/vgic.h +++ b/arch/arm64/kvm/vgic/vgic.h @@ -346,4 +346,11 @@ void vgic_v4_configure_vsgis(struct kvm *kvm); void vgic_v4_get_vlpi_state(struct vgic_irq *irq, bool *val); int vgic_v4_request_vpe_irq(struct kvm_vcpu *vcpu, int irq); +void vcpu_set_ich_hcr(struct kvm_vcpu *vcpu); + +static inline bool kvm_has_gicv3(struct kvm *kvm) +{ + return kvm_has_feat(kvm, ID_AA64PFR0_EL1, GIC, IMP); +} + #endif diff --git a/arch/arm64/mm/Makefile b/arch/arm64/mm/Makefile index 60454256945b..2fc8c6dd0407 100644 --- a/arch/arm64/mm/Makefile +++ b/arch/arm64/mm/Makefile @@ -1,7 +1,7 @@ # SPDX-License-Identifier: GPL-2.0 obj-y := dma-mapping.o extable.o fault.o init.o \ cache.o copypage.o flush.o \ - ioremap.o mmap.o pgd.o mmu.o \ + ioremap.o mmap.o pgd.o mem_encrypt.o mmu.o \ context.o proc.o pageattr.o fixmap.o obj-$(CONFIG_ARM64_CONTPTE) += contpte.o obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o diff --git a/arch/arm64/mm/contpte.c b/arch/arm64/mm/contpte.c index a3edced29ac1..55107d27d3f8 100644 --- a/arch/arm64/mm/contpte.c +++ b/arch/arm64/mm/contpte.c @@ -421,6 +421,12 @@ int contpte_ptep_set_access_flags(struct vm_area_struct *vma, ptep = contpte_align_down(ptep); start_addr = addr = ALIGN_DOWN(addr, CONT_PTE_SIZE); + /* + * We are not advancing entry because __ptep_set_access_flags() + * only consumes access flags from entry. And since we have checked + * for the whole contpte block and returned early, pte_same() + * within __ptep_set_access_flags() is likely false. + */ for (i = 0; i < CONT_PTES; i++, ptep++, addr += PAGE_SIZE) __ptep_set_access_flags(vma, addr, ptep, entry, 0); diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index 451ba7cbd5ad..8b281cf308b3 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c @@ -23,6 +23,7 @@ #include <linux/sched/debug.h> #include <linux/highmem.h> #include <linux/perf_event.h> +#include <linux/pkeys.h> #include <linux/preempt.h> #include <linux/hugetlb.h> @@ -486,6 +487,23 @@ static void do_bad_area(unsigned long far, unsigned long esr, } } +static bool fault_from_pkey(unsigned long esr, struct vm_area_struct *vma, + unsigned int mm_flags) +{ + unsigned long iss2 = ESR_ELx_ISS2(esr); + + if (!system_supports_poe()) + return false; + + if (esr_fsc_is_permission_fault(esr) && (iss2 & ESR_ELx_Overlay)) + return true; + + return !arch_vma_access_permitted(vma, + mm_flags & FAULT_FLAG_WRITE, + mm_flags & FAULT_FLAG_INSTRUCTION, + false); +} + static bool is_el0_instruction_abort(unsigned long esr) { return ESR_ELx_EC(esr) == ESR_ELx_EC_IABT_LOW; @@ -511,6 +529,7 @@ static int __kprobes do_page_fault(unsigned long far, unsigned long esr, unsigned long addr = untagged_addr(far); struct vm_area_struct *vma; int si_code; + int pkey = -1; if (kprobe_page_fault(regs, esr)) return 0; @@ -575,6 +594,16 @@ static int __kprobes do_page_fault(unsigned long far, unsigned long esr, count_vm_vma_lock_event(VMA_LOCK_SUCCESS); goto bad_area; } + + if (fault_from_pkey(esr, vma, mm_flags)) { + pkey = vma_pkey(vma); + vma_end_read(vma); + fault = 0; + si_code = SEGV_PKUERR; + count_vm_vma_lock_event(VMA_LOCK_SUCCESS); + goto bad_area; + } + fault = handle_mm_fault(vma, addr, mm_flags | FAULT_FLAG_VMA_LOCK, regs); if (!(fault & (VM_FAULT_RETRY | VM_FAULT_COMPLETED))) vma_end_read(vma); @@ -610,7 +639,16 @@ retry: goto bad_area; } + if (fault_from_pkey(esr, vma, mm_flags)) { + pkey = vma_pkey(vma); + mmap_read_unlock(mm); + fault = 0; + si_code = SEGV_PKUERR; + goto bad_area; + } + fault = handle_mm_fault(vma, addr, mm_flags, regs); + /* Quick path to respond to signals */ if (fault_signal_pending(fault, regs)) { if (!user_mode(regs)) @@ -669,8 +707,23 @@ bad_area: arm64_force_sig_mceerr(BUS_MCEERR_AR, far, lsb, inf->name); } else { + /* + * The pkey value that we return to userspace can be different + * from the pkey that caused the fault. + * + * 1. T1 : mprotect_key(foo, PAGE_SIZE, pkey=4); + * 2. T1 : set POR_EL0 to deny access to pkey=4, touches, page + * 3. T1 : faults... + * 4. T2: mprotect_key(foo, PAGE_SIZE, pkey=5); + * 5. T1 : enters fault handler, takes mmap_lock, etc... + * 6. T1 : reaches here, sees vma_pkey(vma)=5, when we really + * faulted on a pte with its pkey=4. + */ /* Something tried to access memory that out of memory map */ - arm64_force_sig_fault(SIGSEGV, si_code, far, inf->name); + if (si_code == SEGV_PKUERR) + arm64_force_sig_fault_pkey(far, inf->name, pkey); + else + arm64_force_sig_fault(SIGSEGV, si_code, far, inf->name); } return 0; diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c index 9b5ab6818f7f..a0400b9aa814 100644 --- a/arch/arm64/mm/init.c +++ b/arch/arm64/mm/init.c @@ -414,8 +414,16 @@ void __init mem_init(void) void free_initmem(void) { - free_reserved_area(lm_alias(__init_begin), - lm_alias(__init_end), + void *lm_init_begin = lm_alias(__init_begin); + void *lm_init_end = lm_alias(__init_end); + + WARN_ON(!IS_ALIGNED((unsigned long)lm_init_begin, PAGE_SIZE)); + WARN_ON(!IS_ALIGNED((unsigned long)lm_init_end, PAGE_SIZE)); + + /* Delete __init region from memblock.reserved. */ + memblock_free(lm_init_begin, lm_init_end - lm_init_begin); + + free_reserved_area(lm_init_begin, lm_init_end, POISON_FREE_INITMEM, "unused kernel"); /* * Unmap the __init region but leave the VM area in place. This diff --git a/arch/arm64/mm/ioremap.c b/arch/arm64/mm/ioremap.c index 269f2f63ab7d..6cc0b7e7eb03 100644 --- a/arch/arm64/mm/ioremap.c +++ b/arch/arm64/mm/ioremap.c @@ -3,10 +3,22 @@ #include <linux/mm.h> #include <linux/io.h> +static ioremap_prot_hook_t ioremap_prot_hook; + +int arm64_ioremap_prot_hook_register(ioremap_prot_hook_t hook) +{ + if (WARN_ON(ioremap_prot_hook)) + return -EBUSY; + + ioremap_prot_hook = hook; + return 0; +} + void __iomem *ioremap_prot(phys_addr_t phys_addr, size_t size, unsigned long prot) { unsigned long last_addr = phys_addr + size - 1; + pgprot_t pgprot = __pgprot(prot); /* Don't allow outside PHYS_MASK */ if (last_addr & ~PHYS_MASK) @@ -16,7 +28,16 @@ void __iomem *ioremap_prot(phys_addr_t phys_addr, size_t size, if (WARN_ON(pfn_is_map_memory(__phys_to_pfn(phys_addr)))) return NULL; - return generic_ioremap_prot(phys_addr, size, __pgprot(prot)); + /* + * If a hook is registered (e.g. for confidential computing + * purposes), call that now and barf if it fails. + */ + if (unlikely(ioremap_prot_hook) && + WARN_ON(ioremap_prot_hook(phys_addr, size, &pgprot))) { + return NULL; + } + + return generic_ioremap_prot(phys_addr, size, pgprot); } EXPORT_SYMBOL(ioremap_prot); diff --git a/arch/arm64/mm/mem_encrypt.c b/arch/arm64/mm/mem_encrypt.c new file mode 100644 index 000000000000..ee3c0ab04384 --- /dev/null +++ b/arch/arm64/mm/mem_encrypt.c @@ -0,0 +1,50 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Implementation of the memory encryption/decryption API. + * + * Since the low-level details of the operation depend on the + * Confidential Computing environment (e.g. pKVM, CCA, ...), this just + * acts as a top-level dispatcher to whatever hooks may have been + * registered. + * + * Author: Will Deacon <will@kernel.org> + * Copyright (C) 2024 Google LLC + * + * "Hello, boils and ghouls!" + */ + +#include <linux/bug.h> +#include <linux/compiler.h> +#include <linux/err.h> +#include <linux/mm.h> + +#include <asm/mem_encrypt.h> + +static const struct arm64_mem_crypt_ops *crypt_ops; + +int arm64_mem_crypt_ops_register(const struct arm64_mem_crypt_ops *ops) +{ + if (WARN_ON(crypt_ops)) + return -EBUSY; + + crypt_ops = ops; + return 0; +} + +int set_memory_encrypted(unsigned long addr, int numpages) +{ + if (likely(!crypt_ops) || WARN_ON(!PAGE_ALIGNED(addr))) + return 0; + + return crypt_ops->encrypt(addr, numpages); +} +EXPORT_SYMBOL_GPL(set_memory_encrypted); + +int set_memory_decrypted(unsigned long addr, int numpages) +{ + if (likely(!crypt_ops) || WARN_ON(!PAGE_ALIGNED(addr))) + return 0; + + return crypt_ops->decrypt(addr, numpages); +} +EXPORT_SYMBOL_GPL(set_memory_decrypted); diff --git a/arch/arm64/mm/mmap.c b/arch/arm64/mm/mmap.c index 642bdf908b22..7e3ad97e27d8 100644 --- a/arch/arm64/mm/mmap.c +++ b/arch/arm64/mm/mmap.c @@ -102,6 +102,17 @@ pgprot_t vm_get_page_prot(unsigned long vm_flags) if (vm_flags & VM_MTE) prot |= PTE_ATTRINDX(MT_NORMAL_TAGGED); +#ifdef CONFIG_ARCH_HAS_PKEYS + if (system_supports_poe()) { + if (vm_flags & VM_PKEY_BIT0) + prot |= PTE_PO_IDX_0; + if (vm_flags & VM_PKEY_BIT1) + prot |= PTE_PO_IDX_1; + if (vm_flags & VM_PKEY_BIT2) + prot |= PTE_PO_IDX_2; + } +#endif + return __pgprot(prot); } EXPORT_SYMBOL(vm_get_page_prot); diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index 353ea5dc32b8..e55b02fbddc8 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -25,6 +25,7 @@ #include <linux/vmalloc.h> #include <linux/set_memory.h> #include <linux/kfence.h> +#include <linux/pkeys.h> #include <asm/barrier.h> #include <asm/cputype.h> @@ -1549,3 +1550,47 @@ void __cpu_replace_ttbr1(pgd_t *pgdp, bool cnp) cpu_uninstall_idmap(); } + +#ifdef CONFIG_ARCH_HAS_PKEYS +int arch_set_user_pkey_access(struct task_struct *tsk, int pkey, unsigned long init_val) +{ + u64 new_por = POE_RXW; + u64 old_por; + u64 pkey_shift; + + if (!system_supports_poe()) + return -ENOSPC; + + /* + * This code should only be called with valid 'pkey' + * values originating from in-kernel users. Complain + * if a bad value is observed. + */ + if (WARN_ON_ONCE(pkey >= arch_max_pkey())) + return -EINVAL; + + /* Set the bits we need in POR: */ + new_por = POE_RXW; + if (init_val & PKEY_DISABLE_WRITE) + new_por &= ~POE_W; + if (init_val & PKEY_DISABLE_ACCESS) + new_por &= ~POE_RW; + if (init_val & PKEY_DISABLE_READ) + new_por &= ~POE_R; + if (init_val & PKEY_DISABLE_EXECUTE) + new_por &= ~POE_X; + + /* Shift the bits in to the correct place in POR for pkey: */ + pkey_shift = pkey * POR_BITS_PER_PKEY; + new_por <<= pkey_shift; + + /* Get old POR and mask off any old bits in place: */ + old_por = read_sysreg_s(SYS_POR_EL0); + old_por &= ~(POE_MASK << pkey_shift); + + /* Write old part along with new part: */ + write_sysreg_s(old_por | new_por, SYS_POR_EL0); + + return 0; +} +#endif diff --git a/arch/arm64/mm/proc.S b/arch/arm64/mm/proc.S index f4bc6c5bac06..8abdc7fed321 100644 --- a/arch/arm64/mm/proc.S +++ b/arch/arm64/mm/proc.S @@ -36,8 +36,6 @@ #define TCR_KASLR_FLAGS 0 #endif -#define TCR_SMP_FLAGS TCR_SHARED - /* PTWs cacheable, inner/outer WBWA */ #define TCR_CACHE_FLAGS TCR_IRGN_WBWA | TCR_ORGN_WBWA @@ -469,7 +467,7 @@ SYM_FUNC_START(__cpu_setup) tcr .req x16 mov_q mair, MAIR_EL1_SET mov_q tcr, TCR_T0SZ(IDMAP_VA_BITS) | TCR_T1SZ(VA_BITS_MIN) | TCR_CACHE_FLAGS | \ - TCR_SMP_FLAGS | TCR_TG_FLAGS | TCR_KASLR_FLAGS | TCR_ASID16 | \ + TCR_SHARED | TCR_TG_FLAGS | TCR_KASLR_FLAGS | TCR_ASID16 | \ TCR_TBI0 | TCR_A1 | TCR_KASAN_SW_FLAGS | TCR_MTE_FLAGS tcr_clear_errata_bits tcr, x9, x5 diff --git a/arch/arm64/mm/ptdump.c b/arch/arm64/mm/ptdump.c index 6986827e0d64..264c5f9b97d8 100644 --- a/arch/arm64/mm/ptdump.c +++ b/arch/arm64/mm/ptdump.c @@ -38,33 +38,7 @@ seq_printf(m, fmt); \ }) -/* - * The page dumper groups page table entries of the same type into a single - * description. It uses pg_state to track the range information while - * iterating over the pte entries. When the continuity is broken it then - * dumps out a description of the range. - */ -struct pg_state { - struct ptdump_state ptdump; - struct seq_file *seq; - const struct addr_marker *marker; - const struct mm_struct *mm; - unsigned long start_address; - int level; - u64 current_prot; - bool check_wx; - unsigned long wx_pages; - unsigned long uxn_pages; -}; - -struct prot_bits { - u64 mask; - u64 val; - const char *set; - const char *clear; -}; - -static const struct prot_bits pte_bits[] = { +static const struct ptdump_prot_bits pte_bits[] = { { .mask = PTE_VALID, .val = PTE_VALID, @@ -143,14 +117,7 @@ static const struct prot_bits pte_bits[] = { } }; -struct pg_level { - const struct prot_bits *bits; - char name[4]; - int num; - u64 mask; -}; - -static struct pg_level pg_level[] __ro_after_init = { +static struct ptdump_pg_level kernel_pg_levels[] __ro_after_init = { { /* pgd */ .name = "PGD", .bits = pte_bits, @@ -174,7 +141,7 @@ static struct pg_level pg_level[] __ro_after_init = { }, }; -static void dump_prot(struct pg_state *st, const struct prot_bits *bits, +static void dump_prot(struct ptdump_pg_state *st, const struct ptdump_prot_bits *bits, size_t num) { unsigned i; @@ -192,7 +159,7 @@ static void dump_prot(struct pg_state *st, const struct prot_bits *bits, } } -static void note_prot_uxn(struct pg_state *st, unsigned long addr) +static void note_prot_uxn(struct ptdump_pg_state *st, unsigned long addr) { if (!st->check_wx) return; @@ -206,7 +173,7 @@ static void note_prot_uxn(struct pg_state *st, unsigned long addr) st->uxn_pages += (addr - st->start_address) / PAGE_SIZE; } -static void note_prot_wx(struct pg_state *st, unsigned long addr) +static void note_prot_wx(struct ptdump_pg_state *st, unsigned long addr) { if (!st->check_wx) return; @@ -221,16 +188,17 @@ static void note_prot_wx(struct pg_state *st, unsigned long addr) st->wx_pages += (addr - st->start_address) / PAGE_SIZE; } -static void note_page(struct ptdump_state *pt_st, unsigned long addr, int level, - u64 val) +void note_page(struct ptdump_state *pt_st, unsigned long addr, int level, + u64 val) { - struct pg_state *st = container_of(pt_st, struct pg_state, ptdump); + struct ptdump_pg_state *st = container_of(pt_st, struct ptdump_pg_state, ptdump); + struct ptdump_pg_level *pg_level = st->pg_level; static const char units[] = "KMGTPE"; u64 prot = 0; /* check if the current level has been folded dynamically */ - if ((level == 1 && mm_p4d_folded(st->mm)) || - (level == 2 && mm_pud_folded(st->mm))) + if (st->mm && ((level == 1 && mm_p4d_folded(st->mm)) || + (level == 2 && mm_pud_folded(st->mm)))) level = 0; if (level >= 0) @@ -286,15 +254,16 @@ static void note_page(struct ptdump_state *pt_st, unsigned long addr, int level, void ptdump_walk(struct seq_file *s, struct ptdump_info *info) { unsigned long end = ~0UL; - struct pg_state st; + struct ptdump_pg_state st; if (info->base_addr < TASK_SIZE_64) end = TASK_SIZE_64; - st = (struct pg_state){ + st = (struct ptdump_pg_state){ .seq = s, .marker = info->markers, .mm = info->mm, + .pg_level = &kernel_pg_levels[0], .level = -1, .ptdump = { .note_page = note_page, @@ -312,10 +281,10 @@ static void __init ptdump_initialize(void) { unsigned i, j; - for (i = 0; i < ARRAY_SIZE(pg_level); i++) - if (pg_level[i].bits) - for (j = 0; j < pg_level[i].num; j++) - pg_level[i].mask |= pg_level[i].bits[j].mask; + for (i = 0; i < ARRAY_SIZE(kernel_pg_levels); i++) + if (kernel_pg_levels[i].bits) + for (j = 0; j < kernel_pg_levels[i].num; j++) + kernel_pg_levels[i].mask |= kernel_pg_levels[i].bits[j].mask; } static struct ptdump_info kernel_ptdump_info __ro_after_init = { @@ -324,12 +293,13 @@ static struct ptdump_info kernel_ptdump_info __ro_after_init = { bool ptdump_check_wx(void) { - struct pg_state st = { + struct ptdump_pg_state st = { .seq = NULL, .marker = (struct addr_marker[]) { { 0, NULL}, { -1, NULL}, }, + .pg_level = &kernel_pg_levels[0], .level = -1, .check_wx = true, .ptdump = { diff --git a/arch/arm64/mm/trans_pgd.c b/arch/arm64/mm/trans_pgd.c index 5139a28130c0..0f7b484cb2ff 100644 --- a/arch/arm64/mm/trans_pgd.c +++ b/arch/arm64/mm/trans_pgd.c @@ -42,14 +42,16 @@ static void _copy_pte(pte_t *dst_ptep, pte_t *src_ptep, unsigned long addr) * the temporary mappings we use during restore. */ __set_pte(dst_ptep, pte_mkwrite_novma(pte)); - } else if ((debug_pagealloc_enabled() || - is_kfence_address((void *)addr)) && !pte_none(pte)) { + } else if (!pte_none(pte)) { /* * debug_pagealloc will removed the PTE_VALID bit if * the page isn't in use by the resume kernel. It may have * been in use by the original kernel, in which case we need * to put it back in our copy to do the restore. * + * Other cases include kfence / vmalloc / memfd_secret which + * may call `set_direct_map_invalid_noflush()`. + * * Before marking this entry valid, check the pfn should * be mapped. */ diff --git a/arch/arm64/tools/cpucaps b/arch/arm64/tools/cpucaps index ac3429d892b9..eedb5acc21ed 100644 --- a/arch/arm64/tools/cpucaps +++ b/arch/arm64/tools/cpucaps @@ -45,6 +45,7 @@ HAS_MOPS HAS_NESTED_VIRT HAS_PAN HAS_S1PIE +HAS_S1POE HAS_RAS_EXTN HAS_RNG HAS_SB diff --git a/arch/arm64/tools/sysreg b/arch/arm64/tools/sysreg index 7ceaa1e0b4bc..8d637ac4b7c6 100644 --- a/arch/arm64/tools/sysreg +++ b/arch/arm64/tools/sysreg @@ -2029,6 +2029,31 @@ Sysreg FAR_EL1 3 0 6 0 0 Field 63:0 ADDR EndSysreg +Sysreg PMICNTR_EL0 3 3 9 4 0 +Field 63:0 ICNT +EndSysreg + +Sysreg PMICFILTR_EL0 3 3 9 6 0 +Res0 63:59 +Field 58 SYNC +Field 57:56 VS +Res0 55:32 +Field 31 P +Field 30 U +Field 29 NSK +Field 28 NSU +Field 27 NSH +Field 26 M +Res0 25 +Field 24 SH +Field 23 T +Field 22 RLK +Field 21 RLU +Field 20 RLH +Res0 19:16 +Field 15:0 evtCount +EndSysreg + Sysreg PMSCR_EL1 3 0 9 9 0 Res0 63:8 Field 7:6 PCT @@ -2153,6 +2178,11 @@ Field 4 P Field 3:0 ALIGN EndSysreg +Sysreg PMSELR_EL0 3 3 9 12 5 +Res0 63:5 +Field 4:0 SEL +EndSysreg + SysregFields CONTEXTIDR_ELx Res0 63:32 Field 31:0 PROCID diff --git a/arch/loongarch/include/asm/Kbuild b/arch/loongarch/include/asm/Kbuild index 2bb3676429c0..4635b755b2b4 100644 --- a/arch/loongarch/include/asm/Kbuild +++ b/arch/loongarch/include/asm/Kbuild @@ -6,7 +6,6 @@ generic-y += mcs_spinlock.h generic-y += parport.h generic-y += early_ioremap.h generic-y += qrwlock.h -generic-y += qspinlock.h generic-y += user.h generic-y += ioctl.h generic-y += statfs.h diff --git a/arch/loongarch/include/asm/dma-direct.h b/arch/loongarch/include/asm/dma-direct.h deleted file mode 100644 index 75ccd808a2af..000000000000 --- a/arch/loongarch/include/asm/dma-direct.h +++ /dev/null @@ -1,11 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * Copyright (C) 2020-2022 Loongson Technology Corporation Limited - */ -#ifndef _LOONGARCH_DMA_DIRECT_H -#define _LOONGARCH_DMA_DIRECT_H - -dma_addr_t phys_to_dma(struct device *dev, phys_addr_t paddr); -phys_addr_t dma_to_phys(struct device *dev, dma_addr_t daddr); - -#endif /* _LOONGARCH_DMA_DIRECT_H */ diff --git a/arch/loongarch/include/asm/hw_irq.h b/arch/loongarch/include/asm/hw_irq.h index af4f4e8fbd85..8156ffb67415 100644 --- a/arch/loongarch/include/asm/hw_irq.h +++ b/arch/loongarch/include/asm/hw_irq.h @@ -9,6 +9,8 @@ extern atomic_t irq_err_count; +#define ARCH_IRQ_INIT_FLAGS IRQ_NOPROBE + /* * interrupt-retrigger: NOP for now. This may not be appropriate for all * machines, we'll see ... diff --git a/arch/loongarch/include/asm/kvm_csr.h b/arch/loongarch/include/asm/kvm_csr.h index 724ca8b7b401..4a76ce796f1f 100644 --- a/arch/loongarch/include/asm/kvm_csr.h +++ b/arch/loongarch/include/asm/kvm_csr.h @@ -30,6 +30,7 @@ : [val] "+r" (__v) \ : [reg] "i" (csr) \ : "memory"); \ + __v; \ }) #define gcsr_xchg(v, m, csr) \ @@ -181,6 +182,8 @@ __BUILD_GCSR_OP(tlbidx) #define kvm_save_hw_gcsr(csr, gid) (csr->csrs[gid] = gcsr_read(gid)) #define kvm_restore_hw_gcsr(csr, gid) (gcsr_write(csr->csrs[gid], gid)) +#define kvm_read_clear_hw_gcsr(csr, gid) (csr->csrs[gid] = gcsr_write(0, gid)) + int kvm_emu_iocsr(larch_inst inst, struct kvm_run *run, struct kvm_vcpu *vcpu); static __always_inline unsigned long kvm_read_sw_gcsr(struct loongarch_csrs *csr, int gid) @@ -208,4 +211,7 @@ static __always_inline void kvm_change_sw_gcsr(struct loongarch_csrs *csr, csr->csrs[gid] |= val & _mask; } +#define KVM_PMU_EVENT_ENABLED (CSR_PERFCTRL_PLV0 | CSR_PERFCTRL_PLV1 | \ + CSR_PERFCTRL_PLV2 | CSR_PERFCTRL_PLV3) + #endif /* __ASM_LOONGARCH_KVM_CSR_H__ */ diff --git a/arch/loongarch/include/asm/kvm_host.h b/arch/loongarch/include/asm/kvm_host.h index 5f0677e03817..d6bb72424027 100644 --- a/arch/loongarch/include/asm/kvm_host.h +++ b/arch/loongarch/include/asm/kvm_host.h @@ -30,6 +30,7 @@ #define KVM_HALT_POLL_NS_DEFAULT 500000 #define KVM_REQ_TLB_FLUSH_GPA KVM_ARCH_REQ(0) #define KVM_REQ_STEAL_UPDATE KVM_ARCH_REQ(1) +#define KVM_REQ_PMU KVM_ARCH_REQ(2) #define KVM_GUESTDBG_SW_BP_MASK \ (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP) @@ -60,9 +61,13 @@ struct kvm_arch_memory_slot { unsigned long flags; }; +#define HOST_MAX_PMNUM 16 struct kvm_context { unsigned long vpid_cache; struct kvm_vcpu *last_vcpu; + /* Host PMU CSR */ + u64 perf_ctrl[HOST_MAX_PMNUM]; + u64 perf_cntr[HOST_MAX_PMNUM]; }; struct kvm_world_switch { @@ -107,6 +112,8 @@ struct kvm_arch { unsigned int root_level; spinlock_t phyid_map_lock; struct kvm_phyid_map *phyid_map; + /* Enabled PV features */ + unsigned long pv_features; s64 time_offset; struct kvm_context __percpu *vmcs; @@ -133,8 +140,15 @@ enum emulation_result { #define KVM_LARCH_FPU (0x1 << 0) #define KVM_LARCH_LSX (0x1 << 1) #define KVM_LARCH_LASX (0x1 << 2) -#define KVM_LARCH_SWCSR_LATEST (0x1 << 3) -#define KVM_LARCH_HWCSR_USABLE (0x1 << 4) +#define KVM_LARCH_LBT (0x1 << 3) +#define KVM_LARCH_PMU (0x1 << 4) +#define KVM_LARCH_SWCSR_LATEST (0x1 << 5) +#define KVM_LARCH_HWCSR_USABLE (0x1 << 6) + +#define LOONGARCH_PV_FEAT_UPDATED BIT_ULL(63) +#define LOONGARCH_PV_FEAT_MASK (BIT(KVM_FEATURE_IPI) | \ + BIT(KVM_FEATURE_STEAL_TIME) | \ + BIT(KVM_FEATURE_VIRT_EXTIOI)) struct kvm_vcpu_arch { /* @@ -168,10 +182,14 @@ struct kvm_vcpu_arch { /* FPU state */ struct loongarch_fpu fpu FPU_ALIGN; + struct loongarch_lbt lbt; /* CSR state */ struct loongarch_csrs *csr; + /* Guest max PMU CSR id */ + int max_pmu_csrid; + /* GPR used as IO source/target */ u32 io_gpr; @@ -239,6 +257,21 @@ static inline bool kvm_guest_has_lasx(struct kvm_vcpu_arch *arch) return arch->cpucfg[2] & CPUCFG2_LASX; } +static inline bool kvm_guest_has_lbt(struct kvm_vcpu_arch *arch) +{ + return arch->cpucfg[2] & (CPUCFG2_X86BT | CPUCFG2_ARMBT | CPUCFG2_MIPSBT); +} + +static inline bool kvm_guest_has_pmu(struct kvm_vcpu_arch *arch) +{ + return arch->cpucfg[6] & CPUCFG6_PMP; +} + +static inline int kvm_get_pmu_num(struct kvm_vcpu_arch *arch) +{ + return (arch->cpucfg[6] & CPUCFG6_PMNUM) >> CPUCFG6_PMNUM_SHIFT; +} + /* Debug: dump vcpu state */ int kvm_arch_vcpu_dump_regs(struct kvm_vcpu *vcpu); diff --git a/arch/loongarch/include/asm/kvm_para.h b/arch/loongarch/include/asm/kvm_para.h index 43ec61589e6c..c4e84227280d 100644 --- a/arch/loongarch/include/asm/kvm_para.h +++ b/arch/loongarch/include/asm/kvm_para.h @@ -2,6 +2,8 @@ #ifndef _ASM_LOONGARCH_KVM_PARA_H #define _ASM_LOONGARCH_KVM_PARA_H +#include <uapi/asm/kvm_para.h> + /* * Hypercall code field */ @@ -154,10 +156,20 @@ static __always_inline long kvm_hypercall5(u64 fid, return ret; } +#ifdef CONFIG_PARAVIRT +bool kvm_para_available(void); +unsigned int kvm_arch_para_features(void); +#else +static inline bool kvm_para_available(void) +{ + return false; +} + static inline unsigned int kvm_arch_para_features(void) { return 0; } +#endif static inline unsigned int kvm_arch_para_hints(void) { diff --git a/arch/loongarch/include/asm/kvm_vcpu.h b/arch/loongarch/include/asm/kvm_vcpu.h index c416cb7125c0..d7e8f7d50ee0 100644 --- a/arch/loongarch/include/asm/kvm_vcpu.h +++ b/arch/loongarch/include/asm/kvm_vcpu.h @@ -75,8 +75,13 @@ static inline void kvm_save_lasx(struct loongarch_fpu *fpu) { } static inline void kvm_restore_lasx(struct loongarch_fpu *fpu) { } #endif +#ifdef CONFIG_CPU_HAS_LBT +int kvm_own_lbt(struct kvm_vcpu *vcpu); +#else +static inline int kvm_own_lbt(struct kvm_vcpu *vcpu) { return -EINVAL; } +#endif + void kvm_init_timer(struct kvm_vcpu *vcpu, unsigned long hz); -void kvm_reset_timer(struct kvm_vcpu *vcpu); void kvm_save_timer(struct kvm_vcpu *vcpu); void kvm_restore_timer(struct kvm_vcpu *vcpu); @@ -125,4 +130,9 @@ static inline bool kvm_pvtime_supported(void) return !!sched_info_on(); } +static inline bool kvm_guest_has_pv_feature(struct kvm_vcpu *vcpu, unsigned int feature) +{ + return vcpu->kvm->arch.pv_features & BIT(feature); +} + #endif /* __ASM_LOONGARCH_KVM_VCPU_H__ */ diff --git a/arch/loongarch/include/asm/loongarch.h b/arch/loongarch/include/asm/loongarch.h index 04a78010fc72..24a3f4925cfb 100644 --- a/arch/loongarch/include/asm/loongarch.h +++ b/arch/loongarch/include/asm/loongarch.h @@ -119,6 +119,7 @@ #define CPUCFG6_PMP BIT(0) #define CPUCFG6_PAMVER GENMASK(3, 1) #define CPUCFG6_PMNUM GENMASK(7, 4) +#define CPUCFG6_PMNUM_SHIFT 4 #define CPUCFG6_PMBITS GENMASK(13, 8) #define CPUCFG6_UPM BIT(14) @@ -160,16 +161,8 @@ /* * CPUCFG index area: 0x40000000 -- 0x400000ff - * SW emulation for KVM hypervirsor + * SW emulation for KVM hypervirsor, see arch/loongarch/include/uapi/asm/kvm_para.h */ -#define CPUCFG_KVM_BASE 0x40000000 -#define CPUCFG_KVM_SIZE 0x100 - -#define CPUCFG_KVM_SIG (CPUCFG_KVM_BASE + 0) -#define KVM_SIGNATURE "KVM\0" -#define CPUCFG_KVM_FEATURE (CPUCFG_KVM_BASE + 4) -#define KVM_FEATURE_IPI BIT(1) -#define KVM_FEATURE_STEAL_TIME BIT(2) #ifndef __ASSEMBLY__ diff --git a/arch/loongarch/include/asm/paravirt.h b/arch/loongarch/include/asm/paravirt.h index dddec49671ae..3f4323603e6a 100644 --- a/arch/loongarch/include/asm/paravirt.h +++ b/arch/loongarch/include/asm/paravirt.h @@ -19,6 +19,7 @@ static inline u64 paravirt_steal_clock(int cpu) int __init pv_ipi_init(void); int __init pv_time_init(void); +int __init pv_spinlock_init(void); #else @@ -31,5 +32,11 @@ static inline int pv_time_init(void) { return 0; } + +static inline int pv_spinlock_init(void) +{ + return 0; +} + #endif // CONFIG_PARAVIRT #endif diff --git a/arch/loongarch/include/asm/qspinlock.h b/arch/loongarch/include/asm/qspinlock.h new file mode 100644 index 000000000000..e76d3aa1e1eb --- /dev/null +++ b/arch/loongarch/include/asm/qspinlock.h @@ -0,0 +1,41 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_LOONGARCH_QSPINLOCK_H +#define _ASM_LOONGARCH_QSPINLOCK_H + +#include <linux/jump_label.h> + +#ifdef CONFIG_PARAVIRT + +DECLARE_STATIC_KEY_FALSE(virt_spin_lock_key); + +#define virt_spin_lock virt_spin_lock + +static inline bool virt_spin_lock(struct qspinlock *lock) +{ + int val; + + if (!static_branch_unlikely(&virt_spin_lock_key)) + return false; + + /* + * On hypervisors without PARAVIRT_SPINLOCKS support we fall + * back to a Test-and-Set spinlock, because fair locks have + * horrible lock 'holder' preemption issues. + */ + +__retry: + val = atomic_read(&lock->val); + + if (val || !atomic_try_cmpxchg(&lock->val, &val, _Q_LOCKED_VAL)) { + cpu_relax(); + goto __retry; + } + + return true; +} + +#endif /* CONFIG_PARAVIRT */ + +#include <asm-generic/qspinlock.h> + +#endif // _ASM_LOONGARCH_QSPINLOCK_H diff --git a/arch/loongarch/include/uapi/asm/Kbuild b/arch/loongarch/include/uapi/asm/Kbuild index c6d141d7b7d7..517761419999 100644 --- a/arch/loongarch/include/uapi/asm/Kbuild +++ b/arch/loongarch/include/uapi/asm/Kbuild @@ -1,4 +1,2 @@ # SPDX-License-Identifier: GPL-2.0 syscall-y += unistd_64.h - -generic-y += kvm_para.h diff --git a/arch/loongarch/include/uapi/asm/kvm.h b/arch/loongarch/include/uapi/asm/kvm.h index ddc5cab0ffd0..70d89070bfeb 100644 --- a/arch/loongarch/include/uapi/asm/kvm.h +++ b/arch/loongarch/include/uapi/asm/kvm.h @@ -64,6 +64,7 @@ struct kvm_fpu { #define KVM_REG_LOONGARCH_KVM (KVM_REG_LOONGARCH | 0x20000ULL) #define KVM_REG_LOONGARCH_FPSIMD (KVM_REG_LOONGARCH | 0x30000ULL) #define KVM_REG_LOONGARCH_CPUCFG (KVM_REG_LOONGARCH | 0x40000ULL) +#define KVM_REG_LOONGARCH_LBT (KVM_REG_LOONGARCH | 0x50000ULL) #define KVM_REG_LOONGARCH_MASK (KVM_REG_LOONGARCH | 0x70000ULL) #define KVM_CSR_IDX_MASK 0x7fff #define KVM_CPUCFG_IDX_MASK 0x7fff @@ -77,11 +78,30 @@ struct kvm_fpu { /* Debugging: Special instruction for software breakpoint */ #define KVM_REG_LOONGARCH_DEBUG_INST (KVM_REG_LOONGARCH_KVM | KVM_REG_SIZE_U64 | 3) +/* LBT registers */ +#define KVM_REG_LOONGARCH_LBT_SCR0 (KVM_REG_LOONGARCH_LBT | KVM_REG_SIZE_U64 | 1) +#define KVM_REG_LOONGARCH_LBT_SCR1 (KVM_REG_LOONGARCH_LBT | KVM_REG_SIZE_U64 | 2) +#define KVM_REG_LOONGARCH_LBT_SCR2 (KVM_REG_LOONGARCH_LBT | KVM_REG_SIZE_U64 | 3) +#define KVM_REG_LOONGARCH_LBT_SCR3 (KVM_REG_LOONGARCH_LBT | KVM_REG_SIZE_U64 | 4) +#define KVM_REG_LOONGARCH_LBT_EFLAGS (KVM_REG_LOONGARCH_LBT | KVM_REG_SIZE_U64 | 5) +#define KVM_REG_LOONGARCH_LBT_FTOP (KVM_REG_LOONGARCH_LBT | KVM_REG_SIZE_U64 | 6) + #define LOONGARCH_REG_SHIFT 3 #define LOONGARCH_REG_64(TYPE, REG) (TYPE | KVM_REG_SIZE_U64 | (REG << LOONGARCH_REG_SHIFT)) #define KVM_IOC_CSRID(REG) LOONGARCH_REG_64(KVM_REG_LOONGARCH_CSR, REG) #define KVM_IOC_CPUCFG(REG) LOONGARCH_REG_64(KVM_REG_LOONGARCH_CPUCFG, REG) +/* Device Control API on vm fd */ +#define KVM_LOONGARCH_VM_FEAT_CTRL 0 +#define KVM_LOONGARCH_VM_FEAT_LSX 0 +#define KVM_LOONGARCH_VM_FEAT_LASX 1 +#define KVM_LOONGARCH_VM_FEAT_X86BT 2 +#define KVM_LOONGARCH_VM_FEAT_ARMBT 3 +#define KVM_LOONGARCH_VM_FEAT_MIPSBT 4 +#define KVM_LOONGARCH_VM_FEAT_PMU 5 +#define KVM_LOONGARCH_VM_FEAT_PV_IPI 6 +#define KVM_LOONGARCH_VM_FEAT_PV_STEALTIME 7 + /* Device Control API on vcpu fd */ #define KVM_LOONGARCH_VCPU_CPUCFG 0 #define KVM_LOONGARCH_VCPU_PVTIME_CTRL 1 diff --git a/arch/loongarch/include/uapi/asm/kvm_para.h b/arch/loongarch/include/uapi/asm/kvm_para.h new file mode 100644 index 000000000000..b0604aa9b4bb --- /dev/null +++ b/arch/loongarch/include/uapi/asm/kvm_para.h @@ -0,0 +1,21 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +#ifndef _UAPI_ASM_KVM_PARA_H +#define _UAPI_ASM_KVM_PARA_H + +#include <linux/types.h> + +/* + * CPUCFG index area: 0x40000000 -- 0x400000ff + * SW emulation for KVM hypervirsor + */ +#define CPUCFG_KVM_BASE 0x40000000 +#define CPUCFG_KVM_SIZE 0x100 +#define CPUCFG_KVM_SIG (CPUCFG_KVM_BASE + 0) +#define KVM_SIGNATURE "KVM\0" +#define CPUCFG_KVM_FEATURE (CPUCFG_KVM_BASE + 4) +#define KVM_FEATURE_IPI 1 +#define KVM_FEATURE_STEAL_TIME 2 +/* BIT 24 - 31 are features configurable by user space vmm */ +#define KVM_FEATURE_VIRT_EXTIOI 24 + +#endif /* _UAPI_ASM_KVM_PARA_H */ diff --git a/arch/loongarch/kernel/fpu.S b/arch/loongarch/kernel/fpu.S index 69a85f2479fb..6ab640101457 100644 --- a/arch/loongarch/kernel/fpu.S +++ b/arch/loongarch/kernel/fpu.S @@ -530,6 +530,10 @@ SYM_FUNC_END(_restore_lasx_context) #ifdef CONFIG_CPU_HAS_LBT STACK_FRAME_NON_STANDARD _restore_fp +#ifdef CONFIG_CPU_HAS_LSX STACK_FRAME_NON_STANDARD _restore_lsx +#endif +#ifdef CONFIG_CPU_HAS_LASX STACK_FRAME_NON_STANDARD _restore_lasx #endif +#endif diff --git a/arch/loongarch/kernel/irq.c b/arch/loongarch/kernel/irq.c index f4991c03514f..adac8fcbb2ac 100644 --- a/arch/loongarch/kernel/irq.c +++ b/arch/loongarch/kernel/irq.c @@ -102,9 +102,6 @@ void __init init_IRQ(void) mp_ops.init_ipi(); #endif - for (i = 0; i < NR_IRQS; i++) - irq_set_noprobe(i); - for_each_possible_cpu(i) { page = alloc_pages_node(cpu_to_node(i), GFP_KERNEL, order); diff --git a/arch/loongarch/kernel/paravirt.c b/arch/loongarch/kernel/paravirt.c index 9c9b75b76f62..708eda025ed8 100644 --- a/arch/loongarch/kernel/paravirt.c +++ b/arch/loongarch/kernel/paravirt.c @@ -13,6 +13,7 @@ static int has_steal_clock; struct static_key paravirt_steal_enabled; struct static_key paravirt_steal_rq_enabled; static DEFINE_PER_CPU(struct kvm_steal_time, steal_time) __aligned(64); +DEFINE_STATIC_KEY_FALSE(virt_spin_lock_key); static u64 native_steal_clock(int cpu) { @@ -151,11 +152,14 @@ static void pv_init_ipi(void) } #endif -static bool kvm_para_available(void) +bool kvm_para_available(void) { int config; static int hypervisor_type; + if (!cpu_has_hypervisor) + return false; + if (!hypervisor_type) { config = read_cpucfg(CPUCFG_KVM_SIG); if (!memcmp(&config, KVM_SIGNATURE, 4)) @@ -165,17 +169,22 @@ static bool kvm_para_available(void) return hypervisor_type == HYPERVISOR_KVM; } -int __init pv_ipi_init(void) +unsigned int kvm_arch_para_features(void) { - int feature; + static unsigned int feature; - if (!cpu_has_hypervisor) - return 0; if (!kvm_para_available()) return 0; - feature = read_cpucfg(CPUCFG_KVM_FEATURE); - if (!(feature & KVM_FEATURE_IPI)) + if (!feature) + feature = read_cpucfg(CPUCFG_KVM_FEATURE); + + return feature; +} + +int __init pv_ipi_init(void) +{ + if (!kvm_para_has_feature(KVM_FEATURE_IPI)) return 0; #ifdef CONFIG_SMP @@ -206,7 +215,7 @@ static int pv_enable_steal_time(void) } addr |= KVM_STEAL_PHYS_VALID; - kvm_hypercall2(KVM_HCALL_FUNC_NOTIFY, KVM_FEATURE_STEAL_TIME, addr); + kvm_hypercall2(KVM_HCALL_FUNC_NOTIFY, BIT(KVM_FEATURE_STEAL_TIME), addr); return 0; } @@ -214,7 +223,7 @@ static int pv_enable_steal_time(void) static void pv_disable_steal_time(void) { if (has_steal_clock) - kvm_hypercall2(KVM_HCALL_FUNC_NOTIFY, KVM_FEATURE_STEAL_TIME, 0); + kvm_hypercall2(KVM_HCALL_FUNC_NOTIFY, BIT(KVM_FEATURE_STEAL_TIME), 0); } #ifdef CONFIG_SMP @@ -258,15 +267,9 @@ static struct notifier_block pv_reboot_nb = { int __init pv_time_init(void) { - int r, feature; + int r; - if (!cpu_has_hypervisor) - return 0; - if (!kvm_para_available()) - return 0; - - feature = read_cpucfg(CPUCFG_KVM_FEATURE); - if (!(feature & KVM_FEATURE_STEAL_TIME)) + if (!kvm_para_has_feature(KVM_FEATURE_STEAL_TIME)) return 0; has_steal_clock = 1; @@ -300,3 +303,13 @@ int __init pv_time_init(void) return 0; } + +int __init pv_spinlock_init(void) +{ + if (!cpu_has_hypervisor) + return 0; + + static_branch_enable(&virt_spin_lock_key); + + return 0; +} diff --git a/arch/loongarch/kernel/setup.c b/arch/loongarch/kernel/setup.c index 0f0740f0be27..00e307203ddb 100644 --- a/arch/loongarch/kernel/setup.c +++ b/arch/loongarch/kernel/setup.c @@ -603,6 +603,8 @@ void __init setup_arch(char **cmdline_p) arch_mem_init(cmdline_p); resource_init(); + jump_label_init(); /* Initialise the static keys for paravirtualization */ + #ifdef CONFIG_SMP plat_smp_setup(); prefill_possible_map(); diff --git a/arch/loongarch/kernel/smp.c b/arch/loongarch/kernel/smp.c index ca405ab86aae..482b3c7e3042 100644 --- a/arch/loongarch/kernel/smp.c +++ b/arch/loongarch/kernel/smp.c @@ -476,7 +476,7 @@ core_initcall(ipi_pm_init); #endif /* Preload SMP state for boot cpu */ -void smp_prepare_boot_cpu(void) +void __init smp_prepare_boot_cpu(void) { unsigned int cpu, node, rr_node; @@ -509,6 +509,8 @@ void smp_prepare_boot_cpu(void) rr_node = next_node_in(rr_node, node_online_map); } } + + pv_spinlock_init(); } /* called from main before smp_init() */ diff --git a/arch/loongarch/kvm/exit.c b/arch/loongarch/kvm/exit.c index ea73f9dc2cc6..90894f70ff4a 100644 --- a/arch/loongarch/kvm/exit.c +++ b/arch/loongarch/kvm/exit.c @@ -50,9 +50,7 @@ static int kvm_emu_cpucfg(struct kvm_vcpu *vcpu, larch_inst inst) vcpu->arch.gprs[rd] = *(unsigned int *)KVM_SIGNATURE; break; case CPUCFG_KVM_FEATURE: - ret = KVM_FEATURE_IPI; - if (kvm_pvtime_supported()) - ret |= KVM_FEATURE_STEAL_TIME; + ret = vcpu->kvm->arch.pv_features & LOONGARCH_PV_FEAT_MASK; vcpu->arch.gprs[rd] = ret; break; default: @@ -127,6 +125,14 @@ static int kvm_handle_csr(struct kvm_vcpu *vcpu, larch_inst inst) rj = inst.reg2csr_format.rj; csrid = inst.reg2csr_format.csr; + if (csrid >= LOONGARCH_CSR_PERFCTRL0 && csrid <= vcpu->arch.max_pmu_csrid) { + if (kvm_guest_has_pmu(&vcpu->arch)) { + vcpu->arch.pc -= 4; + kvm_make_request(KVM_REQ_PMU, vcpu); + return EMULATE_DONE; + } + } + /* Process CSR ops */ switch (rj) { case 0: /* process csrrd */ @@ -697,25 +703,22 @@ static long kvm_save_notify(struct kvm_vcpu *vcpu) id = kvm_read_reg(vcpu, LOONGARCH_GPR_A1); data = kvm_read_reg(vcpu, LOONGARCH_GPR_A2); switch (id) { - case KVM_FEATURE_STEAL_TIME: - if (!kvm_pvtime_supported()) - return KVM_HCALL_INVALID_CODE; - + case BIT(KVM_FEATURE_STEAL_TIME): if (data & ~(KVM_STEAL_PHYS_MASK | KVM_STEAL_PHYS_VALID)) return KVM_HCALL_INVALID_PARAMETER; vcpu->arch.st.guest_addr = data; if (!(data & KVM_STEAL_PHYS_VALID)) - break; + return 0; vcpu->arch.st.last_steal = current->sched_info.run_delay; kvm_make_request(KVM_REQ_STEAL_UPDATE, vcpu); - break; + return 0; default: - break; + return KVM_HCALL_INVALID_CODE; }; - return 0; + return KVM_HCALL_INVALID_CODE; }; /* @@ -748,6 +751,14 @@ static int kvm_handle_lasx_disabled(struct kvm_vcpu *vcpu) return RESUME_GUEST; } +static int kvm_handle_lbt_disabled(struct kvm_vcpu *vcpu) +{ + if (kvm_own_lbt(vcpu)) + kvm_queue_exception(vcpu, EXCCODE_INE, 0); + + return RESUME_GUEST; +} + static int kvm_send_pv_ipi(struct kvm_vcpu *vcpu) { unsigned int min, cpu, i; @@ -781,19 +792,21 @@ static int kvm_send_pv_ipi(struct kvm_vcpu *vcpu) */ static void kvm_handle_service(struct kvm_vcpu *vcpu) { + long ret = KVM_HCALL_INVALID_CODE; unsigned long func = kvm_read_reg(vcpu, LOONGARCH_GPR_A0); - long ret; switch (func) { case KVM_HCALL_FUNC_IPI: - kvm_send_pv_ipi(vcpu); - ret = KVM_HCALL_SUCCESS; + if (kvm_guest_has_pv_feature(vcpu, KVM_FEATURE_IPI)) { + kvm_send_pv_ipi(vcpu); + ret = KVM_HCALL_SUCCESS; + } break; case KVM_HCALL_FUNC_NOTIFY: - ret = kvm_save_notify(vcpu); + if (kvm_guest_has_pv_feature(vcpu, KVM_FEATURE_STEAL_TIME)) + ret = kvm_save_notify(vcpu); break; default: - ret = KVM_HCALL_INVALID_CODE; break; } @@ -865,6 +878,7 @@ static exit_handle_fn kvm_fault_tables[EXCCODE_INT_START] = { [EXCCODE_FPDIS] = kvm_handle_fpu_disabled, [EXCCODE_LSXDIS] = kvm_handle_lsx_disabled, [EXCCODE_LASXDIS] = kvm_handle_lasx_disabled, + [EXCCODE_BTDIS] = kvm_handle_lbt_disabled, [EXCCODE_GSPR] = kvm_handle_gspr, [EXCCODE_HVC] = kvm_handle_hypercall, }; diff --git a/arch/loongarch/kvm/switch.S b/arch/loongarch/kvm/switch.S index 80e988985a6a..0c292f818492 100644 --- a/arch/loongarch/kvm/switch.S +++ b/arch/loongarch/kvm/switch.S @@ -277,6 +277,10 @@ SYM_DATA(kvm_enter_guest_size, .quad kvm_enter_guest_end - kvm_enter_guest) #ifdef CONFIG_CPU_HAS_LBT STACK_FRAME_NON_STANDARD kvm_restore_fpu +#ifdef CONFIG_CPU_HAS_LSX STACK_FRAME_NON_STANDARD kvm_restore_lsx +#endif +#ifdef CONFIG_CPU_HAS_LASX STACK_FRAME_NON_STANDARD kvm_restore_lasx #endif +#endif diff --git a/arch/loongarch/kvm/timer.c b/arch/loongarch/kvm/timer.c index bcc6b6d063d9..74a4b5c272d6 100644 --- a/arch/loongarch/kvm/timer.c +++ b/arch/loongarch/kvm/timer.c @@ -188,10 +188,3 @@ void kvm_save_timer(struct kvm_vcpu *vcpu) kvm_save_hw_gcsr(csr, LOONGARCH_CSR_ESTAT); preempt_enable(); } - -void kvm_reset_timer(struct kvm_vcpu *vcpu) -{ - write_gcsr_timercfg(0); - kvm_write_sw_gcsr(vcpu->arch.csr, LOONGARCH_CSR_TCFG, 0); - hrtimer_cancel(&vcpu->arch.swtimer); -} diff --git a/arch/loongarch/kvm/vcpu.c b/arch/loongarch/kvm/vcpu.c index 16756ffb55e8..0697b1064251 100644 --- a/arch/loongarch/kvm/vcpu.c +++ b/arch/loongarch/kvm/vcpu.c @@ -6,6 +6,7 @@ #include <linux/kvm_host.h> #include <linux/entry-kvm.h> #include <asm/fpu.h> +#include <asm/lbt.h> #include <asm/loongarch.h> #include <asm/setup.h> #include <asm/time.h> @@ -31,6 +32,126 @@ const struct kvm_stats_header kvm_vcpu_stats_header = { sizeof(kvm_vcpu_stats_desc), }; +static inline void kvm_save_host_pmu(struct kvm_vcpu *vcpu) +{ + struct kvm_context *context; + + context = this_cpu_ptr(vcpu->kvm->arch.vmcs); + context->perf_cntr[0] = read_csr_perfcntr0(); + context->perf_cntr[1] = read_csr_perfcntr1(); + context->perf_cntr[2] = read_csr_perfcntr2(); + context->perf_cntr[3] = read_csr_perfcntr3(); + context->perf_ctrl[0] = write_csr_perfctrl0(0); + context->perf_ctrl[1] = write_csr_perfctrl1(0); + context->perf_ctrl[2] = write_csr_perfctrl2(0); + context->perf_ctrl[3] = write_csr_perfctrl3(0); +} + +static inline void kvm_restore_host_pmu(struct kvm_vcpu *vcpu) +{ + struct kvm_context *context; + + context = this_cpu_ptr(vcpu->kvm->arch.vmcs); + write_csr_perfcntr0(context->perf_cntr[0]); + write_csr_perfcntr1(context->perf_cntr[1]); + write_csr_perfcntr2(context->perf_cntr[2]); + write_csr_perfcntr3(context->perf_cntr[3]); + write_csr_perfctrl0(context->perf_ctrl[0]); + write_csr_perfctrl1(context->perf_ctrl[1]); + write_csr_perfctrl2(context->perf_ctrl[2]); + write_csr_perfctrl3(context->perf_ctrl[3]); +} + + +static inline void kvm_save_guest_pmu(struct kvm_vcpu *vcpu) +{ + struct loongarch_csrs *csr = vcpu->arch.csr; + + kvm_save_hw_gcsr(csr, LOONGARCH_CSR_PERFCNTR0); + kvm_save_hw_gcsr(csr, LOONGARCH_CSR_PERFCNTR1); + kvm_save_hw_gcsr(csr, LOONGARCH_CSR_PERFCNTR2); + kvm_save_hw_gcsr(csr, LOONGARCH_CSR_PERFCNTR3); + kvm_read_clear_hw_gcsr(csr, LOONGARCH_CSR_PERFCTRL0); + kvm_read_clear_hw_gcsr(csr, LOONGARCH_CSR_PERFCTRL1); + kvm_read_clear_hw_gcsr(csr, LOONGARCH_CSR_PERFCTRL2); + kvm_read_clear_hw_gcsr(csr, LOONGARCH_CSR_PERFCTRL3); +} + +static inline void kvm_restore_guest_pmu(struct kvm_vcpu *vcpu) +{ + struct loongarch_csrs *csr = vcpu->arch.csr; + + kvm_restore_hw_gcsr(csr, LOONGARCH_CSR_PERFCNTR0); + kvm_restore_hw_gcsr(csr, LOONGARCH_CSR_PERFCNTR1); + kvm_restore_hw_gcsr(csr, LOONGARCH_CSR_PERFCNTR2); + kvm_restore_hw_gcsr(csr, LOONGARCH_CSR_PERFCNTR3); + kvm_restore_hw_gcsr(csr, LOONGARCH_CSR_PERFCTRL0); + kvm_restore_hw_gcsr(csr, LOONGARCH_CSR_PERFCTRL1); + kvm_restore_hw_gcsr(csr, LOONGARCH_CSR_PERFCTRL2); + kvm_restore_hw_gcsr(csr, LOONGARCH_CSR_PERFCTRL3); +} + +static int kvm_own_pmu(struct kvm_vcpu *vcpu) +{ + unsigned long val; + + if (!kvm_guest_has_pmu(&vcpu->arch)) + return -EINVAL; + + kvm_save_host_pmu(vcpu); + + /* Set PM0-PM(num) to guest */ + val = read_csr_gcfg() & ~CSR_GCFG_GPERF; + val |= (kvm_get_pmu_num(&vcpu->arch) + 1) << CSR_GCFG_GPERF_SHIFT; + write_csr_gcfg(val); + + kvm_restore_guest_pmu(vcpu); + + return 0; +} + +static void kvm_lose_pmu(struct kvm_vcpu *vcpu) +{ + unsigned long val; + struct loongarch_csrs *csr = vcpu->arch.csr; + + if (!(vcpu->arch.aux_inuse & KVM_LARCH_PMU)) + return; + + kvm_save_guest_pmu(vcpu); + + /* Disable pmu access from guest */ + write_csr_gcfg(read_csr_gcfg() & ~CSR_GCFG_GPERF); + + /* + * Clear KVM_LARCH_PMU if the guest is not using PMU CSRs when + * exiting the guest, so that the next time trap into the guest. + * We don't need to deal with PMU CSRs contexts. + */ + val = kvm_read_sw_gcsr(csr, LOONGARCH_CSR_PERFCTRL0); + val |= kvm_read_sw_gcsr(csr, LOONGARCH_CSR_PERFCTRL1); + val |= kvm_read_sw_gcsr(csr, LOONGARCH_CSR_PERFCTRL2); + val |= kvm_read_sw_gcsr(csr, LOONGARCH_CSR_PERFCTRL3); + if (!(val & KVM_PMU_EVENT_ENABLED)) + vcpu->arch.aux_inuse &= ~KVM_LARCH_PMU; + + kvm_restore_host_pmu(vcpu); +} + +static void kvm_restore_pmu(struct kvm_vcpu *vcpu) +{ + if ((vcpu->arch.aux_inuse & KVM_LARCH_PMU)) + kvm_make_request(KVM_REQ_PMU, vcpu); +} + +static void kvm_check_pmu(struct kvm_vcpu *vcpu) +{ + if (kvm_check_request(KVM_REQ_PMU, vcpu)) { + kvm_own_pmu(vcpu); + vcpu->arch.aux_inuse |= KVM_LARCH_PMU; + } +} + static void kvm_update_stolen_time(struct kvm_vcpu *vcpu) { u32 version; @@ -158,6 +279,7 @@ static int kvm_pre_enter_guest(struct kvm_vcpu *vcpu) /* Make sure the vcpu mode has been written */ smp_store_mb(vcpu->mode, IN_GUEST_MODE); kvm_check_vpid(vcpu); + kvm_check_pmu(vcpu); /* * Called after function kvm_check_vpid() @@ -195,6 +317,8 @@ static int kvm_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu) /* Set a default exit reason */ run->exit_reason = KVM_EXIT_UNKNOWN; + kvm_lose_pmu(vcpu); + guest_timing_exit_irqoff(); guest_state_exit_irqoff(); local_irq_enable(); @@ -468,6 +592,22 @@ static int _kvm_setcsr(struct kvm_vcpu *vcpu, unsigned int id, u64 val) kvm_write_sw_gcsr(csr, id, val); + /* + * After modifying the PMU CSR register value of the vcpu. + * If the PMU CSRs are used, we need to set KVM_REQ_PMU. + */ + if (id >= LOONGARCH_CSR_PERFCTRL0 && id <= LOONGARCH_CSR_PERFCNTR3) { + unsigned long val; + + val = kvm_read_sw_gcsr(csr, LOONGARCH_CSR_PERFCTRL0) | + kvm_read_sw_gcsr(csr, LOONGARCH_CSR_PERFCTRL1) | + kvm_read_sw_gcsr(csr, LOONGARCH_CSR_PERFCTRL2) | + kvm_read_sw_gcsr(csr, LOONGARCH_CSR_PERFCTRL3); + + if (val & KVM_PMU_EVENT_ENABLED) + kvm_make_request(KVM_REQ_PMU, vcpu); + } + return ret; } @@ -497,6 +637,12 @@ static int _kvm_get_cpucfg_mask(int id, u64 *v) *v |= CPUCFG2_LSX; if (cpu_has_lasx) *v |= CPUCFG2_LASX; + if (cpu_has_lbt_x86) + *v |= CPUCFG2_X86BT; + if (cpu_has_lbt_arm) + *v |= CPUCFG2_ARMBT; + if (cpu_has_lbt_mips) + *v |= CPUCFG2_MIPSBT; return 0; case LOONGARCH_CPUCFG3: @@ -506,6 +652,12 @@ static int _kvm_get_cpucfg_mask(int id, u64 *v) case LOONGARCH_CPUCFG5: *v = GENMASK(31, 0); return 0; + case LOONGARCH_CPUCFG6: + if (cpu_has_pmp) + *v = GENMASK(14, 0); + else + *v = 0; + return 0; case LOONGARCH_CPUCFG16: *v = GENMASK(16, 0); return 0; @@ -550,6 +702,17 @@ static int kvm_check_cpucfg(int id, u64 val) /* LASX architecturally implies LSX and FP but val does not satisfy that */ return -EINVAL; return 0; + case LOONGARCH_CPUCFG6: + if (val & CPUCFG6_PMP) { + u32 host = read_cpucfg(LOONGARCH_CPUCFG6); + if ((val & CPUCFG6_PMBITS) != (host & CPUCFG6_PMBITS)) + return -EINVAL; + if ((val & CPUCFG6_PMNUM) > (host & CPUCFG6_PMNUM)) + return -EINVAL; + if ((val & CPUCFG6_UPM) && !(host & CPUCFG6_UPM)) + return -EINVAL; + } + return 0; default: /* * Values for the other CPUCFG IDs are not being further validated @@ -577,6 +740,34 @@ static int kvm_get_one_reg(struct kvm_vcpu *vcpu, else ret = -EINVAL; break; + case KVM_REG_LOONGARCH_LBT: + if (!kvm_guest_has_lbt(&vcpu->arch)) + return -ENXIO; + + switch (reg->id) { + case KVM_REG_LOONGARCH_LBT_SCR0: + *v = vcpu->arch.lbt.scr0; + break; + case KVM_REG_LOONGARCH_LBT_SCR1: + *v = vcpu->arch.lbt.scr1; + break; + case KVM_REG_LOONGARCH_LBT_SCR2: + *v = vcpu->arch.lbt.scr2; + break; + case KVM_REG_LOONGARCH_LBT_SCR3: + *v = vcpu->arch.lbt.scr3; + break; + case KVM_REG_LOONGARCH_LBT_EFLAGS: + *v = vcpu->arch.lbt.eflags; + break; + case KVM_REG_LOONGARCH_LBT_FTOP: + *v = vcpu->arch.fpu.ftop; + break; + default: + ret = -EINVAL; + break; + } + break; case KVM_REG_LOONGARCH_KVM: switch (reg->id) { case KVM_REG_LOONGARCH_COUNTER: @@ -635,6 +826,37 @@ static int kvm_set_one_reg(struct kvm_vcpu *vcpu, if (ret) break; vcpu->arch.cpucfg[id] = (u32)v; + if (id == LOONGARCH_CPUCFG6) + vcpu->arch.max_pmu_csrid = + LOONGARCH_CSR_PERFCTRL0 + 2 * kvm_get_pmu_num(&vcpu->arch) + 1; + break; + case KVM_REG_LOONGARCH_LBT: + if (!kvm_guest_has_lbt(&vcpu->arch)) + return -ENXIO; + + switch (reg->id) { + case KVM_REG_LOONGARCH_LBT_SCR0: + vcpu->arch.lbt.scr0 = v; + break; + case KVM_REG_LOONGARCH_LBT_SCR1: + vcpu->arch.lbt.scr1 = v; + break; + case KVM_REG_LOONGARCH_LBT_SCR2: + vcpu->arch.lbt.scr2 = v; + break; + case KVM_REG_LOONGARCH_LBT_SCR3: + vcpu->arch.lbt.scr3 = v; + break; + case KVM_REG_LOONGARCH_LBT_EFLAGS: + vcpu->arch.lbt.eflags = v; + break; + case KVM_REG_LOONGARCH_LBT_FTOP: + vcpu->arch.fpu.ftop = v; + break; + default: + ret = -EINVAL; + break; + } break; case KVM_REG_LOONGARCH_KVM: switch (reg->id) { @@ -647,7 +869,7 @@ static int kvm_set_one_reg(struct kvm_vcpu *vcpu, vcpu->kvm->arch.time_offset = (signed long)(v - drdtime()); break; case KVM_REG_LOONGARCH_VCPU_RESET: - kvm_reset_timer(vcpu); + vcpu->arch.st.guest_addr = 0; memset(&vcpu->arch.irq_pending, 0, sizeof(vcpu->arch.irq_pending)); memset(&vcpu->arch.irq_clear, 0, sizeof(vcpu->arch.irq_clear)); break; @@ -728,7 +950,10 @@ static int kvm_loongarch_cpucfg_has_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr) { switch (attr->attr) { - case 2: + case LOONGARCH_CPUCFG2: + case LOONGARCH_CPUCFG6: + return 0; + case CPUCFG_KVM_FEATURE: return 0; default: return -ENXIO; @@ -740,8 +965,8 @@ static int kvm_loongarch_cpucfg_has_attr(struct kvm_vcpu *vcpu, static int kvm_loongarch_pvtime_has_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr) { - if (!kvm_pvtime_supported() || - attr->attr != KVM_LOONGARCH_VCPU_PVTIME_GPA) + if (!kvm_guest_has_pv_feature(vcpu, KVM_FEATURE_STEAL_TIME) + || attr->attr != KVM_LOONGARCH_VCPU_PVTIME_GPA) return -ENXIO; return 0; @@ -773,9 +998,18 @@ static int kvm_loongarch_cpucfg_get_attr(struct kvm_vcpu *vcpu, uint64_t val; uint64_t __user *uaddr = (uint64_t __user *)attr->addr; - ret = _kvm_get_cpucfg_mask(attr->attr, &val); - if (ret) - return ret; + switch (attr->attr) { + case 0 ... (KVM_MAX_CPUCFG_REGS - 1): + ret = _kvm_get_cpucfg_mask(attr->attr, &val); + if (ret) + return ret; + break; + case CPUCFG_KVM_FEATURE: + val = vcpu->kvm->arch.pv_features & LOONGARCH_PV_FEAT_MASK; + break; + default: + return -ENXIO; + } put_user(val, uaddr); @@ -788,8 +1022,8 @@ static int kvm_loongarch_pvtime_get_attr(struct kvm_vcpu *vcpu, u64 gpa; u64 __user *user = (u64 __user *)attr->addr; - if (!kvm_pvtime_supported() || - attr->attr != KVM_LOONGARCH_VCPU_PVTIME_GPA) + if (!kvm_guest_has_pv_feature(vcpu, KVM_FEATURE_STEAL_TIME) + || attr->attr != KVM_LOONGARCH_VCPU_PVTIME_GPA) return -ENXIO; gpa = vcpu->arch.st.guest_addr; @@ -821,7 +1055,28 @@ static int kvm_loongarch_vcpu_get_attr(struct kvm_vcpu *vcpu, static int kvm_loongarch_cpucfg_set_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr) { - return -ENXIO; + u64 val, valid; + u64 __user *user = (u64 __user *)attr->addr; + struct kvm *kvm = vcpu->kvm; + + switch (attr->attr) { + case CPUCFG_KVM_FEATURE: + if (get_user(val, user)) + return -EFAULT; + + valid = LOONGARCH_PV_FEAT_MASK; + if (val & ~valid) + return -EINVAL; + + /* All vCPUs need set the same PV features */ + if ((kvm->arch.pv_features & LOONGARCH_PV_FEAT_UPDATED) + && ((kvm->arch.pv_features & valid) != val)) + return -EINVAL; + kvm->arch.pv_features = val | LOONGARCH_PV_FEAT_UPDATED; + return 0; + default: + return -ENXIO; + } } static int kvm_loongarch_pvtime_set_attr(struct kvm_vcpu *vcpu, @@ -831,8 +1086,8 @@ static int kvm_loongarch_pvtime_set_attr(struct kvm_vcpu *vcpu, u64 gpa, __user *user = (u64 __user *)attr->addr; struct kvm *kvm = vcpu->kvm; - if (!kvm_pvtime_supported() || - attr->attr != KVM_LOONGARCH_VCPU_PVTIME_GPA) + if (!kvm_guest_has_pv_feature(vcpu, KVM_FEATURE_STEAL_TIME) + || attr->attr != KVM_LOONGARCH_VCPU_PVTIME_GPA) return -ENXIO; if (get_user(gpa, user)) @@ -977,12 +1232,66 @@ int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) return 0; } +#ifdef CONFIG_CPU_HAS_LBT +int kvm_own_lbt(struct kvm_vcpu *vcpu) +{ + if (!kvm_guest_has_lbt(&vcpu->arch)) + return -EINVAL; + + preempt_disable(); + set_csr_euen(CSR_EUEN_LBTEN); + _restore_lbt(&vcpu->arch.lbt); + vcpu->arch.aux_inuse |= KVM_LARCH_LBT; + preempt_enable(); + + return 0; +} + +static void kvm_lose_lbt(struct kvm_vcpu *vcpu) +{ + preempt_disable(); + if (vcpu->arch.aux_inuse & KVM_LARCH_LBT) { + _save_lbt(&vcpu->arch.lbt); + clear_csr_euen(CSR_EUEN_LBTEN); + vcpu->arch.aux_inuse &= ~KVM_LARCH_LBT; + } + preempt_enable(); +} + +static void kvm_check_fcsr(struct kvm_vcpu *vcpu, unsigned long fcsr) +{ + /* + * If TM is enabled, top register save/restore will + * cause lbt exception, here enable lbt in advance + */ + if (fcsr & FPU_CSR_TM) + kvm_own_lbt(vcpu); +} + +static void kvm_check_fcsr_alive(struct kvm_vcpu *vcpu) +{ + if (vcpu->arch.aux_inuse & KVM_LARCH_FPU) { + if (vcpu->arch.aux_inuse & KVM_LARCH_LBT) + return; + kvm_check_fcsr(vcpu, read_fcsr(LOONGARCH_FCSR0)); + } +} +#else +static inline void kvm_lose_lbt(struct kvm_vcpu *vcpu) { } +static inline void kvm_check_fcsr(struct kvm_vcpu *vcpu, unsigned long fcsr) { } +static inline void kvm_check_fcsr_alive(struct kvm_vcpu *vcpu) { } +#endif + /* Enable FPU and restore context */ void kvm_own_fpu(struct kvm_vcpu *vcpu) { preempt_disable(); - /* Enable FPU */ + /* + * Enable FPU for guest + * Set FR and FRE according to guest context + */ + kvm_check_fcsr(vcpu, vcpu->arch.fpu.fcsr); set_csr_euen(CSR_EUEN_FPEN); kvm_restore_fpu(&vcpu->arch.fpu); @@ -1002,6 +1311,7 @@ int kvm_own_lsx(struct kvm_vcpu *vcpu) preempt_disable(); /* Enable LSX for guest */ + kvm_check_fcsr(vcpu, vcpu->arch.fpu.fcsr); set_csr_euen(CSR_EUEN_LSXEN | CSR_EUEN_FPEN); switch (vcpu->arch.aux_inuse & KVM_LARCH_FPU) { case KVM_LARCH_FPU: @@ -1036,6 +1346,7 @@ int kvm_own_lasx(struct kvm_vcpu *vcpu) preempt_disable(); + kvm_check_fcsr(vcpu, vcpu->arch.fpu.fcsr); set_csr_euen(CSR_EUEN_FPEN | CSR_EUEN_LSXEN | CSR_EUEN_LASXEN); switch (vcpu->arch.aux_inuse & (KVM_LARCH_FPU | KVM_LARCH_LSX)) { case KVM_LARCH_LSX: @@ -1067,6 +1378,7 @@ void kvm_lose_fpu(struct kvm_vcpu *vcpu) { preempt_disable(); + kvm_check_fcsr_alive(vcpu); if (vcpu->arch.aux_inuse & KVM_LARCH_LASX) { kvm_save_lasx(&vcpu->arch.fpu); vcpu->arch.aux_inuse &= ~(KVM_LARCH_LSX | KVM_LARCH_FPU | KVM_LARCH_LASX); @@ -1089,6 +1401,7 @@ void kvm_lose_fpu(struct kvm_vcpu *vcpu) /* Disable FPU */ clear_csr_euen(CSR_EUEN_FPEN); } + kvm_lose_lbt(vcpu); preempt_enable(); } @@ -1235,6 +1548,9 @@ static int _kvm_vcpu_load(struct kvm_vcpu *vcpu, int cpu) change_csr_gcfg(CSR_GCFG_MATC_MASK, CSR_GCFG_MATC_ROOT); kvm_make_request(KVM_REQ_STEAL_UPDATE, vcpu); + /* Restore hardware PMU CSRs */ + kvm_restore_pmu(vcpu); + /* Don't bother restoring registers multiple times unless necessary */ if (vcpu->arch.aux_inuse & KVM_LARCH_HWCSR_USABLE) return 0; diff --git a/arch/loongarch/kvm/vm.c b/arch/loongarch/kvm/vm.c index 6b2e4f66ad26..4ba734aaef87 100644 --- a/arch/loongarch/kvm/vm.c +++ b/arch/loongarch/kvm/vm.c @@ -5,6 +5,7 @@ #include <linux/kvm_host.h> #include <asm/kvm_mmu.h> +#include <asm/kvm_vcpu.h> const struct _kvm_stats_desc kvm_vm_stats_desc[] = { KVM_GENERIC_VM_STATS(), @@ -39,6 +40,12 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) spin_lock_init(&kvm->arch.phyid_map_lock); kvm_init_vmcs(kvm); + + /* Enable all PV features by default */ + kvm->arch.pv_features = BIT(KVM_FEATURE_IPI); + if (kvm_pvtime_supported()) + kvm->arch.pv_features |= BIT(KVM_FEATURE_STEAL_TIME); + kvm->arch.gpa_size = BIT(cpu_vabits - 1); kvm->arch.root_level = CONFIG_PGTABLE_LEVELS - 1; kvm->arch.invalid_ptes[0] = 0; @@ -99,7 +106,67 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) return r; } +static int kvm_vm_feature_has_attr(struct kvm *kvm, struct kvm_device_attr *attr) +{ + switch (attr->attr) { + case KVM_LOONGARCH_VM_FEAT_LSX: + if (cpu_has_lsx) + return 0; + return -ENXIO; + case KVM_LOONGARCH_VM_FEAT_LASX: + if (cpu_has_lasx) + return 0; + return -ENXIO; + case KVM_LOONGARCH_VM_FEAT_X86BT: + if (cpu_has_lbt_x86) + return 0; + return -ENXIO; + case KVM_LOONGARCH_VM_FEAT_ARMBT: + if (cpu_has_lbt_arm) + return 0; + return -ENXIO; + case KVM_LOONGARCH_VM_FEAT_MIPSBT: + if (cpu_has_lbt_mips) + return 0; + return -ENXIO; + case KVM_LOONGARCH_VM_FEAT_PMU: + if (cpu_has_pmp) + return 0; + return -ENXIO; + case KVM_LOONGARCH_VM_FEAT_PV_IPI: + return 0; + case KVM_LOONGARCH_VM_FEAT_PV_STEALTIME: + if (kvm_pvtime_supported()) + return 0; + return -ENXIO; + default: + return -ENXIO; + } +} + +static int kvm_vm_has_attr(struct kvm *kvm, struct kvm_device_attr *attr) +{ + switch (attr->group) { + case KVM_LOONGARCH_VM_FEAT_CTRL: + return kvm_vm_feature_has_attr(kvm, attr); + default: + return -ENXIO; + } +} + int kvm_arch_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) { - return -ENOIOCTLCMD; + void __user *argp = (void __user *)arg; + struct kvm *kvm = filp->private_data; + struct kvm_device_attr attr; + + switch (ioctl) { + case KVM_HAS_DEVICE_ATTR: + if (copy_from_user(&attr, argp, sizeof(attr))) + return -EFAULT; + + return kvm_vm_has_attr(kvm, &attr); + default: + return -ENOIOCTLCMD; + } } diff --git a/arch/microblaze/mm/init.c b/arch/microblaze/mm/init.c index 3827dc76edd8..4520c5741579 100644 --- a/arch/microblaze/mm/init.c +++ b/arch/microblaze/mm/init.c @@ -193,11 +193,6 @@ asmlinkage void __init mmu_init(void) { unsigned int kstart, ksize; - if (!memblock.reserved.cnt) { - pr_emerg("Error memory count\n"); - machine_restart(NULL); - } - if ((u32) memblock.memory.regions[0].size < 0x400000) { pr_emerg("Memory must be greater than 4MB\n"); machine_restart(NULL); diff --git a/arch/mips/alchemy/common/dma.c b/arch/mips/alchemy/common/dma.c index 973049b5bd61..44d8433b1f45 100644 --- a/arch/mips/alchemy/common/dma.c +++ b/arch/mips/alchemy/common/dma.c @@ -131,29 +131,6 @@ static const struct dma_dev dma_dev_table_bank2[DMA_NUM_DEV_BANK2] = { { AU1100_SD1_PHYS_ADDR + 0x04, DMA_DS | DMA_DW8 | DMA_DR } /* coherent */ }; -void dump_au1000_dma_channel(unsigned int dmanr) -{ - struct dma_chan *chan; - - if (dmanr >= NUM_AU1000_DMA_CHANNELS) - return; - chan = &au1000_dma_table[dmanr]; - - printk(KERN_INFO "Au1000 DMA%d Register Dump:\n", dmanr); - printk(KERN_INFO " mode = 0x%08x\n", - __raw_readl(chan->io + DMA_MODE_SET)); - printk(KERN_INFO " addr = 0x%08x\n", - __raw_readl(chan->io + DMA_PERIPHERAL_ADDR)); - printk(KERN_INFO " start0 = 0x%08x\n", - __raw_readl(chan->io + DMA_BUFFER0_START)); - printk(KERN_INFO " start1 = 0x%08x\n", - __raw_readl(chan->io + DMA_BUFFER1_START)); - printk(KERN_INFO " count0 = 0x%08x\n", - __raw_readl(chan->io + DMA_BUFFER0_COUNT)); - printk(KERN_INFO " count1 = 0x%08x\n", - __raw_readl(chan->io + DMA_BUFFER1_COUNT)); -} - /* * Finds a free channel, and binds the requested device to it. * Returns the allocated channel number, or negative on error. diff --git a/arch/mips/crypto/crc32-mips.c b/arch/mips/crypto/crc32-mips.c index ec6d58008f8e..2a59b85f88aa 100644 --- a/arch/mips/crypto/crc32-mips.c +++ b/arch/mips/crypto/crc32-mips.c @@ -77,24 +77,26 @@ static u32 crc32_mips_le_hw(u32 crc_, const u8 *p, unsigned int len) { u32 crc = crc_; -#ifdef CONFIG_64BIT - while (len >= sizeof(u64)) { - u64 value = get_unaligned_le64(p); - - CRC32(crc, value, d); - p += sizeof(u64); - len -= sizeof(u64); - } - - if (len & sizeof(u32)) { -#else /* !CONFIG_64BIT */ - while (len >= sizeof(u32)) { -#endif - u32 value = get_unaligned_le32(p); - - CRC32(crc, value, w); - p += sizeof(u32); - len -= sizeof(u32); + if (IS_ENABLED(CONFIG_64BIT)) { + for (; len >= sizeof(u64); p += sizeof(u64), len -= sizeof(u64)) { + u64 value = get_unaligned_le64(p); + + CRC32(crc, value, d); + } + + if (len & sizeof(u32)) { + u32 value = get_unaligned_le32(p); + + CRC32(crc, value, w); + p += sizeof(u32); + } + } else { + for (; len >= sizeof(u32); len -= sizeof(u32)) { + u32 value = get_unaligned_le32(p); + + CRC32(crc, value, w); + p += sizeof(u32); + } } if (len & sizeof(u16)) { @@ -117,24 +119,26 @@ static u32 crc32c_mips_le_hw(u32 crc_, const u8 *p, unsigned int len) { u32 crc = crc_; -#ifdef CONFIG_64BIT - while (len >= sizeof(u64)) { - u64 value = get_unaligned_le64(p); + if (IS_ENABLED(CONFIG_64BIT)) { + for (; len >= sizeof(u64); p += sizeof(u64), len -= sizeof(u64)) { + u64 value = get_unaligned_le64(p); - CRC32C(crc, value, d); - p += sizeof(u64); - len -= sizeof(u64); - } + CRC32(crc, value, d); + } - if (len & sizeof(u32)) { -#else /* !CONFIG_64BIT */ - while (len >= sizeof(u32)) { -#endif - u32 value = get_unaligned_le32(p); + if (len & sizeof(u32)) { + u32 value = get_unaligned_le32(p); + + CRC32(crc, value, w); + p += sizeof(u32); + } + } else { + for (; len >= sizeof(u32); len -= sizeof(u32)) { + u32 value = get_unaligned_le32(p); - CRC32C(crc, value, w); - p += sizeof(u32); - len -= sizeof(u32); + CRC32(crc, value, w); + p += sizeof(u32); + } } if (len & sizeof(u16)) { diff --git a/arch/mips/include/asm/cmp.h b/arch/mips/include/asm/cmp.h index e9e87504bb0c..71e20e6cd38d 100644 --- a/arch/mips/include/asm/cmp.h +++ b/arch/mips/include/asm/cmp.h @@ -7,12 +7,4 @@ */ struct task_struct; -extern void cmp_smp_setup(void); -extern void cmp_smp_finish(void); -extern void cmp_boot_secondary(int cpu, struct task_struct *t); -extern void cmp_init_secondary(void); -extern void cmp_prepare_cpus(unsigned int max_cpus); - -/* This is platform specific */ -extern void cmp_send_ipi(int cpu, unsigned int action); #endif /* _ASM_CMP_H */ diff --git a/arch/mips/include/asm/dec/prom.h b/arch/mips/include/asm/dec/prom.h index 908e96e3a311..8fcad6984389 100644 --- a/arch/mips/include/asm/dec/prom.h +++ b/arch/mips/include/asm/dec/prom.h @@ -160,6 +160,5 @@ extern void prom_identify_arch(u32); extern void prom_init_cmdline(s32, s32 *, u32); extern void register_prom_console(void); -extern void unregister_prom_console(void); #endif /* _ASM_DEC_PROM_H */ diff --git a/arch/mips/include/asm/mach-au1x00/au1000_dma.h b/arch/mips/include/asm/mach-au1x00/au1000_dma.h index b82e513c8523..18c24051a1f2 100644 --- a/arch/mips/include/asm/mach-au1x00/au1000_dma.h +++ b/arch/mips/include/asm/mach-au1x00/au1000_dma.h @@ -124,7 +124,6 @@ extern int request_au1000_dma(int dev_id, extern void free_au1000_dma(unsigned int dmanr); extern int au1000_dma_read_proc(char *buf, char **start, off_t fpos, int length, int *eof, void *data); -extern void dump_au1000_dma_channel(unsigned int dmanr); extern spinlock_t au1000_dma_spin_lock; static inline struct dma_chan *get_dma_chan(unsigned int dmanr) diff --git a/arch/mips/include/asm/mips-boards/generic.h b/arch/mips/include/asm/mips-boards/generic.h index c904c24550f6..5befba569c9f 100644 --- a/arch/mips/include/asm/mips-boards/generic.h +++ b/arch/mips/include/asm/mips-boards/generic.h @@ -73,7 +73,4 @@ extern void mips_pcibios_init(void); #define mips_pcibios_init() do { } while (0) #endif -extern void mips_scroll_message(void); -extern void mips_display_message(const char *str); - #endif /* __ASM_MIPS_BOARDS_GENERIC_H */ diff --git a/arch/mips/include/asm/mips_mt.h b/arch/mips/include/asm/mips_mt.h index 28917f1582b3..6ea02af29876 100644 --- a/arch/mips/include/asm/mips_mt.h +++ b/arch/mips/include/asm/mips_mt.h @@ -17,8 +17,6 @@ extern int vpelimit; extern cpumask_t mt_fpu_cpumask; extern unsigned long mt_fpemul_threshold; -extern void mips_mt_regdump(unsigned long previous_mvpcontrol_value); - #ifdef CONFIG_MIPS_MT extern void mips_mt_set_cpuoptions(void); #else diff --git a/arch/mips/include/uapi/asm/sigcontext.h b/arch/mips/include/uapi/asm/sigcontext.h index d0a540e88bb4..d10afd13ee5b 100644 --- a/arch/mips/include/uapi/asm/sigcontext.h +++ b/arch/mips/include/uapi/asm/sigcontext.h @@ -56,7 +56,6 @@ struct sigcontext { #if _MIPS_SIM == _MIPS_SIM_ABI64 || _MIPS_SIM == _MIPS_SIM_NABI32 -#include <linux/posix_types.h> /* * Keep this struct definition in sync with the sigcontext fragment * in arch/mips/kernel/asm-offsets.c diff --git a/arch/mips/include/uapi/asm/socket.h b/arch/mips/include/uapi/asm/socket.h index 60ebaed28a4c..8ab7582291ab 100644 --- a/arch/mips/include/uapi/asm/socket.h +++ b/arch/mips/include/uapi/asm/socket.h @@ -151,6 +151,12 @@ #define SO_PASSPIDFD 76 #define SO_PEERPIDFD 77 +#define SO_DEVMEM_LINEAR 78 +#define SCM_DEVMEM_LINEAR SO_DEVMEM_LINEAR +#define SO_DEVMEM_DMABUF 79 +#define SCM_DEVMEM_DMABUF SO_DEVMEM_DMABUF +#define SO_DEVMEM_DONTNEED 80 + #if !defined(__KERNEL__) #if __BITS_PER_LONG == 64 diff --git a/arch/mips/jazz/setup.c b/arch/mips/jazz/setup.c index e318ea11c858..d21e5d441f53 100644 --- a/arch/mips/jazz/setup.c +++ b/arch/mips/jazz/setup.c @@ -23,8 +23,6 @@ #include <asm/reboot.h> #include <asm/tlbmisc.h> -extern asmlinkage void jazz_handle_int(void); - extern void jazz_machine_restart(char *command); static struct resource jazz_io_resources[] = { diff --git a/arch/mips/kernel/cevt-r4k.c b/arch/mips/kernel/cevt-r4k.c index 368e8475870f..5f6e9e2ebbdb 100644 --- a/arch/mips/kernel/cevt-r4k.c +++ b/arch/mips/kernel/cevt-r4k.c @@ -303,13 +303,6 @@ int r4k_clockevent_init(void) if (!c0_compare_int_usable()) return -ENXIO; - /* - * With vectored interrupts things are getting platform specific. - * get_c0_compare_int is a hook to allow a platform to return the - * interrupt number of its liking. - */ - irq = get_c0_compare_int(); - cd = &per_cpu(mips_clockevent_device, cpu); cd->name = "MIPS"; @@ -320,7 +313,6 @@ int r4k_clockevent_init(void) min_delta = calculate_min_delta(); cd->rating = 300; - cd->irq = irq; cd->cpumask = cpumask_of(cpu); cd->set_next_event = mips_next_event; cd->event_handler = mips_event_handler; @@ -332,6 +324,13 @@ int r4k_clockevent_init(void) cp0_timer_irq_installed = 1; + /* + * With vectored interrupts things are getting platform specific. + * get_c0_compare_int is a hook to allow a platform to return the + * interrupt number of its liking. + */ + irq = get_c0_compare_int(); + if (request_irq(irq, c0_compare_interrupt, flags, "timer", c0_compare_interrupt)) pr_err("Failed to request irq %d (timer)\n", irq); diff --git a/arch/mips/kernel/cpu-probe.c b/arch/mips/kernel/cpu-probe.c index bda7f193baab..af7412549e6e 100644 --- a/arch/mips/kernel/cpu-probe.c +++ b/arch/mips/kernel/cpu-probe.c @@ -1724,12 +1724,16 @@ static inline void cpu_probe_loongson(struct cpuinfo_mips *c, unsigned int cpu) c->ases |= (MIPS_ASE_LOONGSON_MMI | MIPS_ASE_LOONGSON_CAM | MIPS_ASE_LOONGSON_EXT | MIPS_ASE_LOONGSON_EXT2); c->ases &= ~MIPS_ASE_VZ; /* VZ of Loongson-3A2000/3000 is incomplete */ + change_c0_config6(LOONGSON_CONF6_EXTIMER | LOONGSON_CONF6_INTIMER, + LOONGSON_CONF6_INTIMER); break; case PRID_IMP_LOONGSON_64G: __cpu_name[cpu] = "ICT Loongson-3"; set_elf_platform(cpu, "loongson3a"); set_isa(c, MIPS_CPU_ISA_M64R2); decode_cpucfg(c); + change_c0_config6(LOONGSON_CONF6_EXTIMER | LOONGSON_CONF6_INTIMER, + LOONGSON_CONF6_INTIMER); break; default: panic("Unknown Loongson Processor ID!"); diff --git a/arch/mips/kernel/csrc-r4k.c b/arch/mips/kernel/csrc-r4k.c index bdb1fa8931f4..59eca397f297 100644 --- a/arch/mips/kernel/csrc-r4k.c +++ b/arch/mips/kernel/csrc-r4k.c @@ -21,9 +21,7 @@ static struct clocksource clocksource_mips = { .name = "MIPS", .read = c0_hpt_read, .mask = CLOCKSOURCE_MASK(32), - .flags = CLOCK_SOURCE_IS_CONTINUOUS | - CLOCK_SOURCE_MUST_VERIFY | - CLOCK_SOURCE_VERIFY_PERCPU, + .flags = CLOCK_SOURCE_IS_CONTINUOUS, }; static u64 __maybe_unused notrace r4k_read_sched_clock(void) diff --git a/arch/mips/kernel/mips-mt.c b/arch/mips/kernel/mips-mt.c index c938ba208fc0..37676a44fefb 100644 --- a/arch/mips/kernel/mips-mt.c +++ b/arch/mips/kernel/mips-mt.c @@ -43,83 +43,6 @@ static int __init maxtcs(char *str) __setup("maxtcs=", maxtcs); -/* - * Dump new MIPS MT state for the core. Does not leave TCs halted. - * Takes an argument which taken to be a pre-call MVPControl value. - */ - -void mips_mt_regdump(unsigned long mvpctl) -{ - unsigned long flags; - unsigned long vpflags; - unsigned long mvpconf0; - int nvpe; - int ntc; - int i; - int tc; - unsigned long haltval; - unsigned long tcstatval; - - local_irq_save(flags); - vpflags = dvpe(); - printk("=== MIPS MT State Dump ===\n"); - printk("-- Global State --\n"); - printk(" MVPControl Passed: %08lx\n", mvpctl); - printk(" MVPControl Read: %08lx\n", vpflags); - printk(" MVPConf0 : %08lx\n", (mvpconf0 = read_c0_mvpconf0())); - nvpe = ((mvpconf0 & MVPCONF0_PVPE) >> MVPCONF0_PVPE_SHIFT) + 1; - ntc = ((mvpconf0 & MVPCONF0_PTC) >> MVPCONF0_PTC_SHIFT) + 1; - printk("-- per-VPE State --\n"); - for (i = 0; i < nvpe; i++) { - for (tc = 0; tc < ntc; tc++) { - settc(tc); - if ((read_tc_c0_tcbind() & TCBIND_CURVPE) == i) { - printk(" VPE %d\n", i); - printk(" VPEControl : %08lx\n", - read_vpe_c0_vpecontrol()); - printk(" VPEConf0 : %08lx\n", - read_vpe_c0_vpeconf0()); - printk(" VPE%d.Status : %08lx\n", - i, read_vpe_c0_status()); - printk(" VPE%d.EPC : %08lx %pS\n", - i, read_vpe_c0_epc(), - (void *) read_vpe_c0_epc()); - printk(" VPE%d.Cause : %08lx\n", - i, read_vpe_c0_cause()); - printk(" VPE%d.Config7 : %08lx\n", - i, read_vpe_c0_config7()); - break; /* Next VPE */ - } - } - } - printk("-- per-TC State --\n"); - for (tc = 0; tc < ntc; tc++) { - settc(tc); - if (read_tc_c0_tcbind() == read_c0_tcbind()) { - /* Are we dumping ourself? */ - haltval = 0; /* Then we're not halted, and mustn't be */ - tcstatval = flags; /* And pre-dump TCStatus is flags */ - printk(" TC %d (current TC with VPE EPC above)\n", tc); - } else { - haltval = read_tc_c0_tchalt(); - write_tc_c0_tchalt(1); - tcstatval = read_tc_c0_tcstatus(); - printk(" TC %d\n", tc); - } - printk(" TCStatus : %08lx\n", tcstatval); - printk(" TCBind : %08lx\n", read_tc_c0_tcbind()); - printk(" TCRestart : %08lx %pS\n", - read_tc_c0_tcrestart(), (void *) read_tc_c0_tcrestart()); - printk(" TCHalt : %08lx\n", haltval); - printk(" TCContext : %08lx\n", read_tc_c0_tccontext()); - if (!haltval) - write_tc_c0_tchalt(0); - } - printk("===========================\n"); - evpe(vpflags); - local_irq_restore(flags); -} - static int mt_opt_rpsctl = -1; static int mt_opt_nblsu = -1; static int mt_opt_forceconfig7; diff --git a/arch/mips/ralink/irq-gic.c b/arch/mips/ralink/irq-gic.c index 3bab51a5fb4c..8bc566ea00e5 100644 --- a/arch/mips/ralink/irq-gic.c +++ b/arch/mips/ralink/irq-gic.c @@ -10,6 +10,7 @@ #include <linux/of.h> #include <linux/irqchip.h> #include <asm/mips-cps.h> +#include <asm/time.h> int get_c0_perfcount_int(void) { diff --git a/arch/mips/ralink/timer-gic.c b/arch/mips/ralink/timer-gic.c index dcf2a44ac51e..926082655a78 100644 --- a/arch/mips/ralink/timer-gic.c +++ b/arch/mips/ralink/timer-gic.c @@ -11,6 +11,8 @@ #include <linux/of_clk.h> #include <linux/clocksource.h> +#include <asm/time.h> + #include "common.h" void __init plat_time_init(void) diff --git a/arch/parisc/include/uapi/asm/socket.h b/arch/parisc/include/uapi/asm/socket.h index be264c2b1a11..38fc0b188e08 100644 --- a/arch/parisc/include/uapi/asm/socket.h +++ b/arch/parisc/include/uapi/asm/socket.h @@ -132,6 +132,12 @@ #define SO_PASSPIDFD 0x404A #define SO_PEERPIDFD 0x404B +#define SO_DEVMEM_LINEAR 78 +#define SCM_DEVMEM_LINEAR SO_DEVMEM_LINEAR +#define SO_DEVMEM_DMABUF 79 +#define SCM_DEVMEM_DMABUF SO_DEVMEM_DMABUF +#define SO_DEVMEM_DONTNEED 80 + #if !defined(__KERNEL__) #if __BITS_PER_LONG == 64 diff --git a/arch/parisc/mm/init.c b/arch/parisc/mm/init.c index 34d91cb8b259..96970fa75e4a 100644 --- a/arch/parisc/mm/init.c +++ b/arch/parisc/mm/init.c @@ -459,7 +459,6 @@ void free_initmem(void) unsigned long kernel_end = (unsigned long)&_end; /* Remap kernel text and data, but do not touch init section yet. */ - kernel_set_to_readonly = true; map_pages(init_end, __pa(init_end), kernel_end - init_end, PAGE_KERNEL, 0); @@ -493,11 +492,18 @@ void free_initmem(void) #ifdef CONFIG_STRICT_KERNEL_RWX void mark_rodata_ro(void) { - /* rodata memory was already mapped with KERNEL_RO access rights by - pagetable_init() and map_pages(). No need to do additional stuff here */ - unsigned long roai_size = __end_ro_after_init - __start_ro_after_init; + unsigned long start = (unsigned long) &__start_rodata; + unsigned long end = (unsigned long) &__end_rodata; + + pr_info("Write protecting the kernel read-only data: %luk\n", + (end - start) >> 10); + + kernel_set_to_readonly = true; + map_pages(start, __pa(start), end - start, PAGE_KERNEL, 0); - pr_info("Write protected read-only-after-init data: %luk\n", roai_size >> 10); + /* force the kernel to see the new page table entries */ + flush_cache_all(); + flush_tlb_all(); } #endif diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index d7b09b064a8a..8a4ee57cd4ef 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -1026,6 +1026,10 @@ config PPC_MEM_KEYS If unsure, say y. +config ARCH_PKEY_BITS + int + default 5 + config PPC_SECURE_BOOT prompt "Enable secure boot support" bool diff --git a/arch/powerpc/crypto/curve25519-ppc64le-core.c b/arch/powerpc/crypto/curve25519-ppc64le-core.c index 4e3e44ea4484..f7810be0b292 100644 --- a/arch/powerpc/crypto/curve25519-ppc64le-core.c +++ b/arch/powerpc/crypto/curve25519-ppc64le-core.c @@ -295,5 +295,6 @@ module_exit(curve25519_mod_exit); MODULE_ALIAS_CRYPTO("curve25519"); MODULE_ALIAS_CRYPTO("curve25519-ppc64le"); +MODULE_DESCRIPTION("PPC64le Curve25519 scalar multiplication with 51 bits limbs"); MODULE_LICENSE("GPL v2"); MODULE_AUTHOR("Danny Tsen <dtsen@us.ibm.com>"); diff --git a/arch/powerpc/include/asm/nohash/32/pgtable.h b/arch/powerpc/include/asm/nohash/32/pgtable.h index 9508399dd036..b481738c4bb5 100644 --- a/arch/powerpc/include/asm/nohash/32/pgtable.h +++ b/arch/powerpc/include/asm/nohash/32/pgtable.h @@ -52,7 +52,7 @@ #define USER_PTRS_PER_PGD (TASK_SIZE / PGDIR_SIZE) #define pgd_ERROR(e) \ - pr_err("%s:%d: bad pgd %08lx.\n", __FILE__, __LINE__, pgd_val(e)) + pr_err("%s:%d: bad pgd %08llx.\n", __FILE__, __LINE__, (unsigned long long)pgd_val(e)) /* * This is the bottom of the PKMAP area with HIGHMEM or an arbitrary @@ -170,7 +170,7 @@ static inline void pmd_clear(pmd_t *pmdp) #define pmd_pfn(pmd) (pmd_val(pmd) >> PAGE_SHIFT) #else #define pmd_page_vaddr(pmd) \ - ((const void *)(pmd_val(pmd) & ~(PTE_TABLE_SIZE - 1))) + ((const void *)((unsigned long)pmd_val(pmd) & ~(PTE_TABLE_SIZE - 1))) #define pmd_pfn(pmd) (__pa(pmd_val(pmd)) >> PAGE_SHIFT) #endif diff --git a/arch/powerpc/include/asm/pgtable-types.h b/arch/powerpc/include/asm/pgtable-types.h index 7b3d4c592a10..f3086e39e7d2 100644 --- a/arch/powerpc/include/asm/pgtable-types.h +++ b/arch/powerpc/include/asm/pgtable-types.h @@ -49,16 +49,22 @@ static inline unsigned long pud_val(pud_t x) #endif /* CONFIG_PPC64 */ /* PGD level */ -#if defined(CONFIG_PPC_E500) && defined(CONFIG_PTE_64BIT) +#if defined(CONFIG_PPC_85xx) && defined(CONFIG_PTE_64BIT) typedef struct { unsigned long long pgd; } pgd_t; + +static inline unsigned long long pgd_val(pgd_t x) +{ + return x.pgd; +} #else typedef struct { unsigned long pgd; } pgd_t; -#endif -#define __pgd(x) ((pgd_t) { (x) }) + static inline unsigned long pgd_val(pgd_t x) { return x.pgd; } +#endif +#define __pgd(x) ((pgd_t) { (x) }) /* Page protection bits */ typedef struct { unsigned long pgprot; } pgprot_t; diff --git a/arch/powerpc/kernel/vdso/vdso32.lds.S b/arch/powerpc/kernel/vdso/vdso32.lds.S index 426e1ccc6971..8f57107000a2 100644 --- a/arch/powerpc/kernel/vdso/vdso32.lds.S +++ b/arch/powerpc/kernel/vdso/vdso32.lds.S @@ -74,6 +74,8 @@ SECTIONS .got : { *(.got) } :text .plt : { *(.plt) } + .rela.dyn : { *(.rela .rela*) } + _end = .; __end = .; PROVIDE(end = .); @@ -87,7 +89,7 @@ SECTIONS *(.branch_lt) *(.data .data.* .gnu.linkonce.d.* .sdata*) *(.bss .sbss .dynbss .dynsbss) - *(.got1 .glink .iplt .rela*) + *(.got1 .glink .iplt) } } diff --git a/arch/powerpc/kernel/vdso/vdso64.lds.S b/arch/powerpc/kernel/vdso/vdso64.lds.S index bda6c8cdd459..400819258c06 100644 --- a/arch/powerpc/kernel/vdso/vdso64.lds.S +++ b/arch/powerpc/kernel/vdso/vdso64.lds.S @@ -69,7 +69,7 @@ SECTIONS .eh_frame_hdr : { *(.eh_frame_hdr) } :text :eh_frame_hdr .eh_frame : { KEEP (*(.eh_frame)) } :text .gcc_except_table : { *(.gcc_except_table) } - .rela.dyn ALIGN(8) : { *(.rela.dyn) } + .rela.dyn ALIGN(8) : { *(.rela .rela*) } .got ALIGN(8) : { *(.got .toc) } @@ -86,7 +86,7 @@ SECTIONS *(.data .data.* .gnu.linkonce.d.* .sdata*) *(.bss .sbss .dynbss .dynsbss) *(.opd) - *(.glink .iplt .plt .rela*) + *(.glink .iplt .plt) } } diff --git a/arch/powerpc/lib/qspinlock.c b/arch/powerpc/lib/qspinlock.c index 5de4dd549f6e..bcc7e4dff8c3 100644 --- a/arch/powerpc/lib/qspinlock.c +++ b/arch/powerpc/lib/qspinlock.c @@ -697,7 +697,15 @@ again: } release: - qnodesp->count--; /* release the node */ + /* + * Clear the lock before releasing the node, as another CPU might see stale + * values if an interrupt occurs after we increment qnodesp->count + * but before node->lock is initialized. The barrier ensures that + * there are no further stores to the node after it has been released. + */ + node->lock = NULL; + barrier(); + qnodesp->count--; } void queued_spin_lock_slowpath(struct qspinlock *lock) diff --git a/arch/powerpc/mm/nohash/tlb_64e.c b/arch/powerpc/mm/nohash/tlb_64e.c index 113edf76d3ce..d26656b07b72 100644 --- a/arch/powerpc/mm/nohash/tlb_64e.c +++ b/arch/powerpc/mm/nohash/tlb_64e.c @@ -33,7 +33,7 @@ * though this will probably be made common with other nohash * implementations at some point */ -int mmu_pte_psize; /* Page size used for PTE pages */ +static int mmu_pte_psize; /* Page size used for PTE pages */ int mmu_vmemmap_psize; /* Page size used for the virtual mem map */ int book3e_htw_mode; /* HW tablewalk? Value is PPC_HTW_* */ unsigned long linear_map_top; /* Top of linear mapping */ diff --git a/arch/powerpc/platforms/chrp/pegasos_eth.c b/arch/powerpc/platforms/chrp/pegasos_eth.c index 5c4f1a9ca154..6f4a41a9352a 100644 --- a/arch/powerpc/platforms/chrp/pegasos_eth.c +++ b/arch/powerpc/platforms/chrp/pegasos_eth.c @@ -14,7 +14,7 @@ #include <linux/ioport.h> #include <linux/device.h> #include <linux/platform_device.h> -#include <linux/mv643xx.h> +#include <linux/mv643xx_eth.h> #include <linux/pci.h> #define PEGASOS2_MARVELL_REGBASE (0xf1000000) @@ -25,12 +25,15 @@ #define PEGASOS2_SRAM_BASE_ETH_PORT0 (PEGASOS2_SRAM_BASE) #define PEGASOS2_SRAM_BASE_ETH_PORT1 (PEGASOS2_SRAM_BASE_ETH_PORT0 + (PEGASOS2_SRAM_SIZE / 2) ) - #define PEGASOS2_SRAM_RXRING_SIZE (PEGASOS2_SRAM_SIZE/4) #define PEGASOS2_SRAM_TXRING_SIZE (PEGASOS2_SRAM_SIZE/4) #undef BE_VERBOSE +#define MV64340_BASE_ADDR_ENABLE 0x278 +#define MV64340_INTEGRATED_SRAM_BASE_ADDR 0x268 +#define MV64340_SRAM_CONFIG 0x380 + static struct resource mv643xx_eth_shared_resources[] = { [0] = { .name = "ethernet shared base", diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index 0f3cd7c3a436..86d1f1cea571 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -13,6 +13,7 @@ config 32BIT config RISCV def_bool y select ACPI_GENERIC_GSI if ACPI + select ACPI_MCFG if (ACPI && PCI) select ACPI_PPTT if ACPI select ACPI_REDUCED_HARDWARE_ONLY if ACPI select ACPI_SPCR_TABLE if ACPI @@ -188,6 +189,7 @@ config RISCV select OF_EARLY_FLATTREE select OF_IRQ select PCI_DOMAINS_GENERIC if PCI + select PCI_ECAM if (ACPI && PCI) select PCI_MSI if PCI select RISCV_ALTERNATIVE if !XIP_KERNEL select RISCV_APLIC @@ -552,8 +554,8 @@ config RISCV_ISA_SVPBMT config TOOLCHAIN_HAS_V bool default y - depends on !64BIT || $(cc-option,-mabi=lp64 -march=rv64iv) - depends on !32BIT || $(cc-option,-mabi=ilp32 -march=rv32iv) + depends on !64BIT || $(cc-option,-mabi=lp64 -march=rv64imv) + depends on !32BIT || $(cc-option,-mabi=ilp32 -march=rv32imv) depends on LLD_VERSION >= 140000 || LD_VERSION >= 23800 depends on AS_HAS_OPTION_ARCH diff --git a/arch/riscv/boot/dts/starfive/jh7110-common.dtsi b/arch/riscv/boot/dts/starfive/jh7110-common.dtsi index ca2d44d59d48..c7771b3b6475 100644 --- a/arch/riscv/boot/dts/starfive/jh7110-common.dtsi +++ b/arch/riscv/boot/dts/starfive/jh7110-common.dtsi @@ -365,6 +365,12 @@ }; }; +&syscrg { + assigned-clocks = <&syscrg JH7110_SYSCLK_CPU_CORE>, + <&pllclk JH7110_PLLCLK_PLL0_OUT>; + assigned-clock-rates = <500000000>, <1500000000>; +}; + &sysgpio { i2c0_pins: i2c0-0 { i2c-pins { diff --git a/arch/riscv/include/asm/irq.h b/arch/riscv/include/asm/irq.h index 8e10a94430a2..7e9a84a005ed 100644 --- a/arch/riscv/include/asm/irq.h +++ b/arch/riscv/include/asm/irq.h @@ -12,8 +12,63 @@ #include <asm-generic/irq.h> +#define INVALID_CONTEXT UINT_MAX + void riscv_set_intc_hwnode_fn(struct fwnode_handle *(*fn)(void)); struct fwnode_handle *riscv_get_intc_hwnode(void); +#ifdef CONFIG_ACPI + +enum riscv_irqchip_type { + ACPI_RISCV_IRQCHIP_INTC = 0x00, + ACPI_RISCV_IRQCHIP_IMSIC = 0x01, + ACPI_RISCV_IRQCHIP_PLIC = 0x02, + ACPI_RISCV_IRQCHIP_APLIC = 0x03, +}; + +int riscv_acpi_get_gsi_info(struct fwnode_handle *fwnode, u32 *gsi_base, + u32 *id, u32 *nr_irqs, u32 *nr_idcs); +struct fwnode_handle *riscv_acpi_get_gsi_domain_id(u32 gsi); +unsigned long acpi_rintc_index_to_hartid(u32 index); +unsigned long acpi_rintc_ext_parent_to_hartid(unsigned int plic_id, unsigned int ctxt_idx); +unsigned int acpi_rintc_get_plic_nr_contexts(unsigned int plic_id); +unsigned int acpi_rintc_get_plic_context(unsigned int plic_id, unsigned int ctxt_idx); +int __init acpi_rintc_get_imsic_mmio_info(u32 index, struct resource *res); + +#else +static inline int riscv_acpi_get_gsi_info(struct fwnode_handle *fwnode, u32 *gsi_base, + u32 *id, u32 *nr_irqs, u32 *nr_idcs) +{ + return 0; +} + +static inline unsigned long acpi_rintc_index_to_hartid(u32 index) +{ + return INVALID_HARTID; +} + +static inline unsigned long acpi_rintc_ext_parent_to_hartid(unsigned int plic_id, + unsigned int ctxt_idx) +{ + return INVALID_HARTID; +} + +static inline unsigned int acpi_rintc_get_plic_nr_contexts(unsigned int plic_id) +{ + return INVALID_CONTEXT; +} + +static inline unsigned int acpi_rintc_get_plic_context(unsigned int plic_id, unsigned int ctxt_idx) +{ + return INVALID_CONTEXT; +} + +static inline int __init acpi_rintc_get_imsic_mmio_info(u32 index, struct resource *res) +{ + return 0; +} + +#endif /* CONFIG_ACPI */ + #endif /* _ASM_RISCV_IRQ_H */ diff --git a/arch/riscv/include/asm/kvm_vcpu_pmu.h b/arch/riscv/include/asm/kvm_vcpu_pmu.h index fa0f535bbbf0..1d85b6617508 100644 --- a/arch/riscv/include/asm/kvm_vcpu_pmu.h +++ b/arch/riscv/include/asm/kvm_vcpu_pmu.h @@ -10,6 +10,7 @@ #define __KVM_VCPU_RISCV_PMU_H #include <linux/perf/riscv_pmu.h> +#include <asm/kvm_vcpu_insn.h> #include <asm/sbi.h> #ifdef CONFIG_RISCV_PMU_SBI @@ -64,11 +65,11 @@ struct kvm_pmu { #if defined(CONFIG_32BIT) #define KVM_RISCV_VCPU_HPMCOUNTER_CSR_FUNCS \ -{.base = CSR_CYCLEH, .count = 31, .func = kvm_riscv_vcpu_pmu_read_hpm }, \ -{.base = CSR_CYCLE, .count = 31, .func = kvm_riscv_vcpu_pmu_read_hpm }, +{.base = CSR_CYCLEH, .count = 32, .func = kvm_riscv_vcpu_pmu_read_hpm }, \ +{.base = CSR_CYCLE, .count = 32, .func = kvm_riscv_vcpu_pmu_read_hpm }, #else #define KVM_RISCV_VCPU_HPMCOUNTER_CSR_FUNCS \ -{.base = CSR_CYCLE, .count = 31, .func = kvm_riscv_vcpu_pmu_read_hpm }, +{.base = CSR_CYCLE, .count = 32, .func = kvm_riscv_vcpu_pmu_read_hpm }, #endif int kvm_riscv_vcpu_pmu_incr_fw(struct kvm_vcpu *vcpu, unsigned long fid); @@ -104,8 +105,20 @@ void kvm_riscv_vcpu_pmu_reset(struct kvm_vcpu *vcpu); struct kvm_pmu { }; +static inline int kvm_riscv_vcpu_pmu_read_legacy(struct kvm_vcpu *vcpu, unsigned int csr_num, + unsigned long *val, unsigned long new_val, + unsigned long wr_mask) +{ + if (csr_num == CSR_CYCLE || csr_num == CSR_INSTRET) { + *val = 0; + return KVM_INSN_CONTINUE_NEXT_SEPC; + } else { + return KVM_INSN_ILLEGAL_TRAP; + } +} + #define KVM_RISCV_VCPU_HPMCOUNTER_CSR_FUNCS \ -{.base = 0, .count = 0, .func = NULL }, +{.base = CSR_CYCLE, .count = 3, .func = kvm_riscv_vcpu_pmu_read_legacy }, static inline void kvm_riscv_vcpu_pmu_init(struct kvm_vcpu *vcpu) {} static inline int kvm_riscv_vcpu_pmu_incr_fw(struct kvm_vcpu *vcpu, unsigned long fid) diff --git a/arch/riscv/include/asm/processor.h b/arch/riscv/include/asm/processor.h index 8702b8721a27..efa1b3519b23 100644 --- a/arch/riscv/include/asm/processor.h +++ b/arch/riscv/include/asm/processor.h @@ -14,36 +14,14 @@ #include <asm/ptrace.h> -/* - * addr is a hint to the maximum userspace address that mmap should provide, so - * this macro needs to return the largest address space available so that - * mmap_end < addr, being mmap_end the top of that address space. - * See Documentation/arch/riscv/vm-layout.rst for more details. - */ #define arch_get_mmap_end(addr, len, flags) \ ({ \ - unsigned long mmap_end; \ - typeof(addr) _addr = (addr); \ - if ((_addr) == 0 || is_compat_task() || \ - ((_addr + len) > BIT(VA_BITS - 1))) \ - mmap_end = STACK_TOP_MAX; \ - else \ - mmap_end = (_addr + len); \ - mmap_end; \ + STACK_TOP_MAX; \ }) #define arch_get_mmap_base(addr, base) \ ({ \ - unsigned long mmap_base; \ - typeof(addr) _addr = (addr); \ - typeof(base) _base = (base); \ - unsigned long rnd_gap = DEFAULT_MAP_WINDOW - (_base); \ - if ((_addr) == 0 || is_compat_task() || \ - ((_addr + len) > BIT(VA_BITS - 1))) \ - mmap_base = (_base); \ - else \ - mmap_base = (_addr + len) - rnd_gap; \ - mmap_base; \ + base; \ }) #ifdef CONFIG_64BIT diff --git a/arch/riscv/include/asm/sbi.h b/arch/riscv/include/asm/sbi.h index 7cffd4ffecd0..7bd3746028c9 100644 --- a/arch/riscv/include/asm/sbi.h +++ b/arch/riscv/include/asm/sbi.h @@ -9,6 +9,7 @@ #include <linux/types.h> #include <linux/cpumask.h> +#include <linux/jump_label.h> #ifdef CONFIG_RISCV_SBI enum sbi_ext_id { @@ -304,6 +305,7 @@ struct sbiret { }; void sbi_init(void); +long __sbi_base_ecall(int fid); struct sbiret __sbi_ecall(unsigned long arg0, unsigned long arg1, unsigned long arg2, unsigned long arg3, unsigned long arg4, unsigned long arg5, @@ -373,7 +375,23 @@ static inline unsigned long sbi_mk_version(unsigned long major, | (minor & SBI_SPEC_VERSION_MINOR_MASK); } -int sbi_err_map_linux_errno(int err); +static inline int sbi_err_map_linux_errno(int err) +{ + switch (err) { + case SBI_SUCCESS: + return 0; + case SBI_ERR_DENIED: + return -EPERM; + case SBI_ERR_INVALID_PARAM: + return -EINVAL; + case SBI_ERR_INVALID_ADDRESS: + return -EFAULT; + case SBI_ERR_NOT_SUPPORTED: + case SBI_ERR_FAILURE: + default: + return -ENOTSUPP; + }; +} extern bool sbi_debug_console_available; int sbi_debug_console_write(const char *bytes, unsigned int num_bytes); diff --git a/arch/riscv/kernel/Makefile b/arch/riscv/kernel/Makefile index 06d407f1b30b..7f88cc4931f5 100644 --- a/arch/riscv/kernel/Makefile +++ b/arch/riscv/kernel/Makefile @@ -20,17 +20,21 @@ endif ifdef CONFIG_RISCV_ALTERNATIVE_EARLY CFLAGS_alternative.o := -mcmodel=medany CFLAGS_cpufeature.o := -mcmodel=medany +CFLAGS_sbi_ecall.o := -mcmodel=medany ifdef CONFIG_FTRACE CFLAGS_REMOVE_alternative.o = $(CC_FLAGS_FTRACE) CFLAGS_REMOVE_cpufeature.o = $(CC_FLAGS_FTRACE) +CFLAGS_REMOVE_sbi_ecall.o = $(CC_FLAGS_FTRACE) endif ifdef CONFIG_RELOCATABLE CFLAGS_alternative.o += -fno-pie CFLAGS_cpufeature.o += -fno-pie +CFLAGS_sbi_ecall.o += -fno-pie endif ifdef CONFIG_KASAN KASAN_SANITIZE_alternative.o := n KASAN_SANITIZE_cpufeature.o := n +KASAN_SANITIZE_sbi_ecall.o := n endif endif @@ -88,7 +92,7 @@ obj-$(CONFIG_DYNAMIC_FTRACE) += mcount-dyn.o obj-$(CONFIG_PERF_EVENTS) += perf_callchain.o obj-$(CONFIG_HAVE_PERF_REGS) += perf_regs.o -obj-$(CONFIG_RISCV_SBI) += sbi.o +obj-$(CONFIG_RISCV_SBI) += sbi.o sbi_ecall.o ifeq ($(CONFIG_RISCV_SBI), y) obj-$(CONFIG_SMP) += sbi-ipi.o obj-$(CONFIG_SMP) += cpu_ops_sbi.o diff --git a/arch/riscv/kernel/acpi.c b/arch/riscv/kernel/acpi.c index ba957aaca5cb..6e0d333f57e5 100644 --- a/arch/riscv/kernel/acpi.c +++ b/arch/riscv/kernel/acpi.c @@ -311,29 +311,26 @@ void __iomem *acpi_os_ioremap(acpi_physical_address phys, acpi_size size) #ifdef CONFIG_PCI /* - * These interfaces are defined just to enable building ACPI core. - * TODO: Update it with actual implementation when external interrupt - * controller support is added in RISC-V ACPI. + * raw_pci_read/write - Platform-specific PCI config space access. */ -int raw_pci_read(unsigned int domain, unsigned int bus, unsigned int devfn, - int reg, int len, u32 *val) +int raw_pci_read(unsigned int domain, unsigned int bus, + unsigned int devfn, int reg, int len, u32 *val) { - return PCIBIOS_DEVICE_NOT_FOUND; -} + struct pci_bus *b = pci_find_bus(domain, bus); -int raw_pci_write(unsigned int domain, unsigned int bus, unsigned int devfn, - int reg, int len, u32 val) -{ - return PCIBIOS_DEVICE_NOT_FOUND; + if (!b) + return PCIBIOS_DEVICE_NOT_FOUND; + return b->ops->read(b, devfn, reg, len, val); } -int acpi_pci_bus_find_domain_nr(struct pci_bus *bus) +int raw_pci_write(unsigned int domain, unsigned int bus, + unsigned int devfn, int reg, int len, u32 val) { - return -1; -} + struct pci_bus *b = pci_find_bus(domain, bus); -struct pci_bus *pci_acpi_scan_root(struct acpi_pci_root *root) -{ - return NULL; + if (!b) + return PCIBIOS_DEVICE_NOT_FOUND; + return b->ops->write(b, devfn, reg, len, val); } + #endif /* CONFIG_PCI */ diff --git a/arch/riscv/kernel/sbi.c b/arch/riscv/kernel/sbi.c index 837bdab2601b..1989b8cade1b 100644 --- a/arch/riscv/kernel/sbi.c +++ b/arch/riscv/kernel/sbi.c @@ -14,9 +14,6 @@ #include <asm/smp.h> #include <asm/tlbflush.h> -#define CREATE_TRACE_POINTS -#include <asm/trace.h> - /* default SBI version is 0.1 */ unsigned long sbi_spec_version __ro_after_init = SBI_SPEC_VERSION_DEFAULT; EXPORT_SYMBOL(sbi_spec_version); @@ -27,55 +24,6 @@ static int (*__sbi_rfence)(int fid, const struct cpumask *cpu_mask, unsigned long start, unsigned long size, unsigned long arg4, unsigned long arg5) __ro_after_init; -struct sbiret __sbi_ecall(unsigned long arg0, unsigned long arg1, - unsigned long arg2, unsigned long arg3, - unsigned long arg4, unsigned long arg5, - int fid, int ext) -{ - struct sbiret ret; - - trace_sbi_call(ext, fid); - - register uintptr_t a0 asm ("a0") = (uintptr_t)(arg0); - register uintptr_t a1 asm ("a1") = (uintptr_t)(arg1); - register uintptr_t a2 asm ("a2") = (uintptr_t)(arg2); - register uintptr_t a3 asm ("a3") = (uintptr_t)(arg3); - register uintptr_t a4 asm ("a4") = (uintptr_t)(arg4); - register uintptr_t a5 asm ("a5") = (uintptr_t)(arg5); - register uintptr_t a6 asm ("a6") = (uintptr_t)(fid); - register uintptr_t a7 asm ("a7") = (uintptr_t)(ext); - asm volatile ("ecall" - : "+r" (a0), "+r" (a1) - : "r" (a2), "r" (a3), "r" (a4), "r" (a5), "r" (a6), "r" (a7) - : "memory"); - ret.error = a0; - ret.value = a1; - - trace_sbi_return(ext, ret.error, ret.value); - - return ret; -} -EXPORT_SYMBOL(__sbi_ecall); - -int sbi_err_map_linux_errno(int err) -{ - switch (err) { - case SBI_SUCCESS: - return 0; - case SBI_ERR_DENIED: - return -EPERM; - case SBI_ERR_INVALID_PARAM: - return -EINVAL; - case SBI_ERR_INVALID_ADDRESS: - return -EFAULT; - case SBI_ERR_NOT_SUPPORTED: - case SBI_ERR_FAILURE: - default: - return -ENOTSUPP; - }; -} -EXPORT_SYMBOL(sbi_err_map_linux_errno); - #ifdef CONFIG_RISCV_SBI_V01 static unsigned long __sbi_v01_cpumask_to_hartmask(const struct cpumask *cpu_mask) { @@ -535,17 +483,6 @@ long sbi_probe_extension(int extid) } EXPORT_SYMBOL(sbi_probe_extension); -static long __sbi_base_ecall(int fid) -{ - struct sbiret ret; - - ret = sbi_ecall(SBI_EXT_BASE, fid, 0, 0, 0, 0, 0, 0); - if (!ret.error) - return ret.value; - else - return sbi_err_map_linux_errno(ret.error); -} - static inline long sbi_get_spec_version(void) { return __sbi_base_ecall(SBI_EXT_BASE_GET_SPEC_VERSION); diff --git a/arch/riscv/kernel/sbi_ecall.c b/arch/riscv/kernel/sbi_ecall.c new file mode 100644 index 000000000000..24aabb4fbde3 --- /dev/null +++ b/arch/riscv/kernel/sbi_ecall.c @@ -0,0 +1,48 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2024 Rivos Inc. */ + +#include <asm/sbi.h> +#define CREATE_TRACE_POINTS +#include <asm/trace.h> + +long __sbi_base_ecall(int fid) +{ + struct sbiret ret; + + ret = sbi_ecall(SBI_EXT_BASE, fid, 0, 0, 0, 0, 0, 0); + if (!ret.error) + return ret.value; + else + return sbi_err_map_linux_errno(ret.error); +} +EXPORT_SYMBOL(__sbi_base_ecall); + +struct sbiret __sbi_ecall(unsigned long arg0, unsigned long arg1, + unsigned long arg2, unsigned long arg3, + unsigned long arg4, unsigned long arg5, + int fid, int ext) +{ + struct sbiret ret; + + trace_sbi_call(ext, fid); + + register uintptr_t a0 asm ("a0") = (uintptr_t)(arg0); + register uintptr_t a1 asm ("a1") = (uintptr_t)(arg1); + register uintptr_t a2 asm ("a2") = (uintptr_t)(arg2); + register uintptr_t a3 asm ("a3") = (uintptr_t)(arg3); + register uintptr_t a4 asm ("a4") = (uintptr_t)(arg4); + register uintptr_t a5 asm ("a5") = (uintptr_t)(arg5); + register uintptr_t a6 asm ("a6") = (uintptr_t)(fid); + register uintptr_t a7 asm ("a7") = (uintptr_t)(ext); + asm volatile ("ecall" + : "+r" (a0), "+r" (a1) + : "r" (a2), "r" (a3), "r" (a4), "r" (a5), "r" (a6), "r" (a7) + : "memory"); + ret.error = a0; + ret.value = a1; + + trace_sbi_return(ext, ret.error, ret.value); + + return ret; +} +EXPORT_SYMBOL(__sbi_ecall); diff --git a/arch/riscv/kernel/traps_misaligned.c b/arch/riscv/kernel/traps_misaligned.c index 192cd5603e95..d4fd8af7aaf5 100644 --- a/arch/riscv/kernel/traps_misaligned.c +++ b/arch/riscv/kernel/traps_misaligned.c @@ -417,7 +417,7 @@ int handle_misaligned_load(struct pt_regs *regs) val.data_u64 = 0; if (user_mode(regs)) { - if (raw_copy_from_user(&val, (u8 __user *)addr, len)) + if (copy_from_user(&val, (u8 __user *)addr, len)) return -1; } else { memcpy(&val, (u8 *)addr, len); @@ -515,7 +515,7 @@ int handle_misaligned_store(struct pt_regs *regs) return -EOPNOTSUPP; if (user_mode(regs)) { - if (raw_copy_to_user((u8 __user *)addr, &val, len)) + if (copy_to_user((u8 __user *)addr, &val, len)) return -1; } else { memcpy((u8 *)addr, &val, len); diff --git a/arch/riscv/kvm/vcpu_pmu.c b/arch/riscv/kvm/vcpu_pmu.c index bcf41d6e0df0..2707a51b082c 100644 --- a/arch/riscv/kvm/vcpu_pmu.c +++ b/arch/riscv/kvm/vcpu_pmu.c @@ -391,19 +391,9 @@ int kvm_riscv_vcpu_pmu_read_hpm(struct kvm_vcpu *vcpu, unsigned int csr_num, static void kvm_pmu_clear_snapshot_area(struct kvm_vcpu *vcpu) { struct kvm_pmu *kvpmu = vcpu_to_pmu(vcpu); - int snapshot_area_size = sizeof(struct riscv_pmu_snapshot_data); - if (kvpmu->sdata) { - if (kvpmu->snapshot_addr != INVALID_GPA) { - memset(kvpmu->sdata, 0, snapshot_area_size); - kvm_vcpu_write_guest(vcpu, kvpmu->snapshot_addr, - kvpmu->sdata, snapshot_area_size); - } else { - pr_warn("snapshot address invalid\n"); - } - kfree(kvpmu->sdata); - kvpmu->sdata = NULL; - } + kfree(kvpmu->sdata); + kvpmu->sdata = NULL; kvpmu->snapshot_addr = INVALID_GPA; } diff --git a/arch/riscv/kvm/vcpu_sbi.c b/arch/riscv/kvm/vcpu_sbi.c index 62f409d4176e..7de128be8db9 100644 --- a/arch/riscv/kvm/vcpu_sbi.c +++ b/arch/riscv/kvm/vcpu_sbi.c @@ -127,8 +127,8 @@ void kvm_riscv_vcpu_sbi_forward(struct kvm_vcpu *vcpu, struct kvm_run *run) run->riscv_sbi.args[3] = cp->a3; run->riscv_sbi.args[4] = cp->a4; run->riscv_sbi.args[5] = cp->a5; - run->riscv_sbi.ret[0] = cp->a0; - run->riscv_sbi.ret[1] = cp->a1; + run->riscv_sbi.ret[0] = SBI_ERR_NOT_SUPPORTED; + run->riscv_sbi.ret[1] = 0; } void kvm_riscv_vcpu_sbi_system_reset(struct kvm_vcpu *vcpu, diff --git a/arch/riscv/mm/cacheflush.c b/arch/riscv/mm/cacheflush.c index a03c994eed3b..b81672729887 100644 --- a/arch/riscv/mm/cacheflush.c +++ b/arch/riscv/mm/cacheflush.c @@ -158,6 +158,7 @@ void __init riscv_init_cbo_blocksizes(void) #ifdef CONFIG_SMP static void set_icache_stale_mask(void) { + int cpu = get_cpu(); cpumask_t *mask; bool stale_cpu; @@ -168,10 +169,11 @@ static void set_icache_stale_mask(void) * concurrently on different harts. */ mask = ¤t->mm->context.icache_stale_mask; - stale_cpu = cpumask_test_cpu(smp_processor_id(), mask); + stale_cpu = cpumask_test_cpu(cpu, mask); cpumask_setall(mask); - cpumask_assign_cpu(smp_processor_id(), mask, stale_cpu); + cpumask_assign_cpu(cpu, mask, stale_cpu); + put_cpu(); } #endif @@ -239,14 +241,12 @@ int riscv_set_icache_flush_ctx(unsigned long ctx, unsigned long scope) case PR_RISCV_CTX_SW_FENCEI_OFF: switch (scope) { case PR_RISCV_SCOPE_PER_PROCESS: - current->mm->context.force_icache_flush = false; - set_icache_stale_mask(); + current->mm->context.force_icache_flush = false; break; case PR_RISCV_SCOPE_PER_THREAD: - current->thread.force_icache_flush = false; - set_icache_stale_mask(); + current->thread.force_icache_flush = false; break; default: return -EINVAL; diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c index eb0649a61b4c..1785782c2e55 100644 --- a/arch/riscv/mm/init.c +++ b/arch/riscv/mm/init.c @@ -252,7 +252,7 @@ static void __init setup_bootmem(void) * The size of the linear page mapping may restrict the amount of * usable RAM. */ - if (IS_ENABLED(CONFIG_64BIT)) { + if (IS_ENABLED(CONFIG_64BIT) && IS_ENABLED(CONFIG_MMU)) { max_mapped_addr = __pa(PAGE_OFFSET) + KERN_VIRT_SIZE; memblock_cap_memory_range(phys_ram_base, max_mapped_addr - phys_ram_base); diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index a822f952f64a..c60e699e99f5 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -604,6 +604,19 @@ config RANDOMIZE_BASE as a security feature that deters exploit attempts relying on knowledge of the location of kernel internals. +config RANDOMIZE_IDENTITY_BASE + bool "Randomize the address of the identity mapping base" + depends on RANDOMIZE_BASE + default DEBUG_VM + help + The identity mapping base address is pinned to zero by default. + Allow randomization of that base to expose otherwise missed + notion of physical and virtual addresses of data structures. + That does not have any impact on the base address at which the + kernel image is loaded. + + If unsure, say N + config KERNEL_IMAGE_BASE hex "Kernel image base address" range 0x100000 0x1FFFFFE0000000 if !KASAN diff --git a/arch/s390/boot/startup.c b/arch/s390/boot/startup.c index ce232552bc1c..c73b5118ad42 100644 --- a/arch/s390/boot/startup.c +++ b/arch/s390/boot/startup.c @@ -162,7 +162,7 @@ static void kaslr_adjust_relocs(unsigned long min_addr, unsigned long max_addr, loc = (long)*reloc + phys_offset; if (loc < min_addr || loc > max_addr) error("64-bit relocation outside of kernel!\n"); - *(u64 *)loc += offset - __START_KERNEL; + *(u64 *)loc += offset; } } @@ -177,7 +177,7 @@ static void kaslr_adjust_got(unsigned long offset) */ for (entry = (u64 *)vmlinux.got_start; entry < (u64 *)vmlinux.got_end; entry++) { if (*entry) - *entry += offset - __START_KERNEL; + *entry += offset; } } @@ -252,7 +252,7 @@ static unsigned long setup_kernel_memory_layout(unsigned long kernel_size) vmemmap_size = SECTION_ALIGN_UP(pages) * sizeof(struct page); /* choose kernel address space layout: 4 or 3 levels. */ - BUILD_BUG_ON(!IS_ALIGNED(__START_KERNEL, THREAD_SIZE)); + BUILD_BUG_ON(!IS_ALIGNED(TEXT_OFFSET, THREAD_SIZE)); BUILD_BUG_ON(!IS_ALIGNED(__NO_KASLR_START_KERNEL, THREAD_SIZE)); BUILD_BUG_ON(__NO_KASLR_END_KERNEL > _REGION1_SIZE); vsize = get_vmem_size(ident_map_size, vmemmap_size, vmalloc_size, _REGION3_SIZE); @@ -341,7 +341,8 @@ static unsigned long setup_kernel_memory_layout(unsigned long kernel_size) BUILD_BUG_ON(MAX_DCSS_ADDR > (1UL << MAX_PHYSMEM_BITS)); max_mappable = max(ident_map_size, MAX_DCSS_ADDR); max_mappable = min(max_mappable, vmemmap_start); - __identity_base = round_down(vmemmap_start - max_mappable, rte_size); + if (IS_ENABLED(CONFIG_RANDOMIZE_IDENTITY_BASE)) + __identity_base = round_down(vmemmap_start - max_mappable, rte_size); return asce_limit; } @@ -388,31 +389,25 @@ static void kaslr_adjust_vmlinux_info(long offset) #endif } -static void fixup_vmlinux_info(void) -{ - vmlinux.entry -= __START_KERNEL; - kaslr_adjust_vmlinux_info(-__START_KERNEL); -} - void startup_kernel(void) { - unsigned long kernel_size = vmlinux.image_size + vmlinux.bss_size; - unsigned long nokaslr_offset_phys, kaslr_large_page_offset; - unsigned long amode31_lma = 0; + unsigned long vmlinux_size = vmlinux.image_size + vmlinux.bss_size; + unsigned long nokaslr_text_lma, text_lma = 0, amode31_lma = 0; + unsigned long kernel_size = TEXT_OFFSET + vmlinux_size; + unsigned long kaslr_large_page_offset; unsigned long max_physmem_end; unsigned long asce_limit; unsigned long safe_addr; psw_t psw; - fixup_vmlinux_info(); setup_lpp(); /* * Non-randomized kernel physical start address must be _SEGMENT_SIZE * aligned (see blow). */ - nokaslr_offset_phys = ALIGN(mem_safe_offset(), _SEGMENT_SIZE); - safe_addr = PAGE_ALIGN(nokaslr_offset_phys + kernel_size); + nokaslr_text_lma = ALIGN(mem_safe_offset(), _SEGMENT_SIZE); + safe_addr = PAGE_ALIGN(nokaslr_text_lma + vmlinux_size); /* * Reserve decompressor memory together with decompression heap, @@ -456,16 +451,27 @@ void startup_kernel(void) */ kaslr_large_page_offset = __kaslr_offset & ~_SEGMENT_MASK; if (kaslr_enabled()) { - unsigned long end = ident_map_size - kaslr_large_page_offset; + unsigned long size = vmlinux_size + kaslr_large_page_offset; - __kaslr_offset_phys = randomize_within_range(kernel_size, _SEGMENT_SIZE, 0, end); + text_lma = randomize_within_range(size, _SEGMENT_SIZE, TEXT_OFFSET, ident_map_size); } - if (!__kaslr_offset_phys) - __kaslr_offset_phys = nokaslr_offset_phys; - __kaslr_offset_phys |= kaslr_large_page_offset; + if (!text_lma) + text_lma = nokaslr_text_lma; + text_lma |= kaslr_large_page_offset; + + /* + * [__kaslr_offset_phys..__kaslr_offset_phys + TEXT_OFFSET] region is + * never accessed via the kernel image mapping as per the linker script: + * + * . = TEXT_OFFSET; + * + * Therefore, this region could be used for something else and does + * not need to be reserved. See how it is skipped in setup_vmem(). + */ + __kaslr_offset_phys = text_lma - TEXT_OFFSET; kaslr_adjust_vmlinux_info(__kaslr_offset_phys); - physmem_reserve(RR_VMLINUX, __kaslr_offset_phys, kernel_size); - deploy_kernel((void *)__kaslr_offset_phys); + physmem_reserve(RR_VMLINUX, text_lma, vmlinux_size); + deploy_kernel((void *)text_lma); /* vmlinux decompression is done, shrink reserved low memory */ physmem_reserve(RR_DECOMPRESSOR, 0, (unsigned long)_decompressor_end); @@ -488,7 +494,7 @@ void startup_kernel(void) amode31_lma = randomize_within_range(vmlinux.amode31_size, PAGE_SIZE, amode31_min, SZ_2G); } if (!amode31_lma) - amode31_lma = __kaslr_offset_phys - vmlinux.amode31_size; + amode31_lma = text_lma - vmlinux.amode31_size; physmem_reserve(RR_AMODE31, amode31_lma, vmlinux.amode31_size); /* @@ -504,8 +510,8 @@ void startup_kernel(void) * - copy_bootdata() must follow setup_vmem() to propagate changes * to bootdata made by setup_vmem() */ - clear_bss_section(__kaslr_offset_phys); - kaslr_adjust_relocs(__kaslr_offset_phys, __kaslr_offset_phys + vmlinux.image_size, + clear_bss_section(text_lma); + kaslr_adjust_relocs(text_lma, text_lma + vmlinux.image_size, __kaslr_offset, __kaslr_offset_phys); kaslr_adjust_got(__kaslr_offset); setup_vmem(__kaslr_offset, __kaslr_offset + kernel_size, asce_limit); diff --git a/arch/s390/boot/vmem.c b/arch/s390/boot/vmem.c index 2847cc059ab7..145035f84a0e 100644 --- a/arch/s390/boot/vmem.c +++ b/arch/s390/boot/vmem.c @@ -90,7 +90,7 @@ static void kasan_populate_shadow(unsigned long kernel_start, unsigned long kern } memgap_start = end; } - kasan_populate(kernel_start, kernel_end, POPULATE_KASAN_MAP_SHADOW); + kasan_populate(kernel_start + TEXT_OFFSET, kernel_end, POPULATE_KASAN_MAP_SHADOW); kasan_populate(0, (unsigned long)__identity_va(0), POPULATE_KASAN_ZERO_SHADOW); kasan_populate(AMODE31_START, AMODE31_END, POPULATE_KASAN_ZERO_SHADOW); if (IS_ENABLED(CONFIG_KASAN_VMALLOC)) { @@ -475,7 +475,17 @@ void setup_vmem(unsigned long kernel_start, unsigned long kernel_end, unsigned l (unsigned long)__identity_va(end), POPULATE_IDENTITY); } - pgtable_populate(kernel_start, kernel_end, POPULATE_KERNEL); + + /* + * [kernel_start..kernel_start + TEXT_OFFSET] region is never + * accessed as per the linker script: + * + * . = TEXT_OFFSET; + * + * Therefore, skip mapping TEXT_OFFSET bytes to prevent access to + * [__kaslr_offset_phys..__kaslr_offset_phys + TEXT_OFFSET] region. + */ + pgtable_populate(kernel_start + TEXT_OFFSET, kernel_end, POPULATE_KERNEL); pgtable_populate(AMODE31_START, AMODE31_END, POPULATE_DIRECT); pgtable_populate(__abs_lowcore, __abs_lowcore + sizeof(struct lowcore), POPULATE_ABS_LOWCORE); diff --git a/arch/s390/boot/vmlinux.lds.S b/arch/s390/boot/vmlinux.lds.S index a750711d44c8..66670212a361 100644 --- a/arch/s390/boot/vmlinux.lds.S +++ b/arch/s390/boot/vmlinux.lds.S @@ -109,7 +109,12 @@ SECTIONS #ifdef CONFIG_KERNEL_UNCOMPRESSED . = ALIGN(PAGE_SIZE); . += AMODE31_SIZE; /* .amode31 section */ - . = ALIGN(1 << 20); /* _SEGMENT_SIZE */ + + /* + * Make sure the location counter is not less than TEXT_OFFSET. + * _SEGMENT_SIZE is not available, use ALIGN(1 << 20) instead. + */ + . = MAX(TEXT_OFFSET, ALIGN(1 << 20)); #else . = ALIGN(8); #endif diff --git a/arch/s390/include/asm/page.h b/arch/s390/include/asm/page.h index 06416b3f94f5..16e4caa931f1 100644 --- a/arch/s390/include/asm/page.h +++ b/arch/s390/include/asm/page.h @@ -279,8 +279,9 @@ static inline unsigned long virt_to_pfn(const void *kaddr) #define AMODE31_SIZE (3 * PAGE_SIZE) #define KERNEL_IMAGE_SIZE (512 * 1024 * 1024) -#define __START_KERNEL 0x100000 #define __NO_KASLR_START_KERNEL CONFIG_KERNEL_IMAGE_BASE #define __NO_KASLR_END_KERNEL (__NO_KASLR_START_KERNEL + KERNEL_IMAGE_SIZE) +#define TEXT_OFFSET 0x100000 + #endif /* _S390_PAGE_H */ diff --git a/arch/s390/kernel/setup.c b/arch/s390/kernel/setup.c index 4ec99f73fa27..a3fea683b227 100644 --- a/arch/s390/kernel/setup.c +++ b/arch/s390/kernel/setup.c @@ -734,7 +734,23 @@ static void __init memblock_add_physmem_info(void) } /* - * Reserve memory used for lowcore/command line/kernel image. + * Reserve memory used for lowcore. + */ +static void __init reserve_lowcore(void) +{ + void *lowcore_start = get_lowcore(); + void *lowcore_end = lowcore_start + sizeof(struct lowcore); + void *start, *end; + + if ((void *)__identity_base < lowcore_end) { + start = max(lowcore_start, (void *)__identity_base); + end = min(lowcore_end, (void *)(__identity_base + ident_map_size)); + memblock_reserve(__pa(start), __pa(end)); + } +} + +/* + * Reserve memory used for absolute lowcore/command line/kernel image. */ static void __init reserve_kernel(void) { @@ -918,6 +934,7 @@ void __init setup_arch(char **cmdline_p) /* Do some memory reservations *before* memory is added to memblock */ reserve_pgtables(); + reserve_lowcore(); reserve_kernel(); reserve_initrd(); reserve_certificate_list(); diff --git a/arch/s390/kernel/vmlinux.lds.S b/arch/s390/kernel/vmlinux.lds.S index e67cd409b858..ae5d0a9d6911 100644 --- a/arch/s390/kernel/vmlinux.lds.S +++ b/arch/s390/kernel/vmlinux.lds.S @@ -39,7 +39,7 @@ PHDRS { SECTIONS { - . = __START_KERNEL; + . = TEXT_OFFSET; .text : { _stext = .; /* Start of text section */ _text = .; /* Text and read-only data */ diff --git a/arch/s390/tools/relocs.c b/arch/s390/tools/relocs.c index a74dbd5c9896..30a732c808f3 100644 --- a/arch/s390/tools/relocs.c +++ b/arch/s390/tools/relocs.c @@ -280,7 +280,7 @@ static int do_reloc(struct section *sec, Elf_Rel *rel) case R_390_GOTOFF64: break; case R_390_64: - add_reloc(&relocs64, offset - ehdr.e_entry); + add_reloc(&relocs64, offset); break; default: die("Unsupported relocation type: %d\n", r_type); diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig index 1aa3c4a0c5b2..e9103998cca9 100644 --- a/arch/sh/Kconfig +++ b/arch/sh/Kconfig @@ -14,6 +14,7 @@ config SUPERH select ARCH_HIBERNATION_POSSIBLE if MMU select ARCH_MIGHT_HAVE_PC_PARPORT select ARCH_WANT_IPC_PARSE_VERSION + select ARCH_NEED_CMPXCHG_1_EMU select CPU_NO_EFFICIENT_FFS select DMA_DECLARE_COHERENT select GENERIC_ATOMIC64 diff --git a/arch/sh/include/asm/cmpxchg.h b/arch/sh/include/asm/cmpxchg.h index 5d617b3ef78f..1e5dc5ccf7bf 100644 --- a/arch/sh/include/asm/cmpxchg.h +++ b/arch/sh/include/asm/cmpxchg.h @@ -9,6 +9,7 @@ #include <linux/compiler.h> #include <linux/types.h> +#include <linux/cmpxchg-emu.h> #if defined(CONFIG_GUSA_RB) #include <asm/cmpxchg-grb.h> @@ -56,6 +57,8 @@ static inline unsigned long __cmpxchg(volatile void * ptr, unsigned long old, unsigned long new, int size) { switch (size) { + case 1: + return cmpxchg_emu_u8(ptr, old, new); case 4: return __cmpxchg_u32(ptr, old, new); } diff --git a/arch/sparc/include/uapi/asm/socket.h b/arch/sparc/include/uapi/asm/socket.h index 682da3714686..57084ed2f3c4 100644 --- a/arch/sparc/include/uapi/asm/socket.h +++ b/arch/sparc/include/uapi/asm/socket.h @@ -133,6 +133,12 @@ #define SO_PASSPIDFD 0x0055 #define SO_PEERPIDFD 0x0056 +#define SO_DEVMEM_LINEAR 0x0057 +#define SCM_DEVMEM_LINEAR SO_DEVMEM_LINEAR +#define SO_DEVMEM_DMABUF 0x0058 +#define SCM_DEVMEM_DMABUF SO_DEVMEM_DMABUF +#define SO_DEVMEM_DONTNEED 0x0059 + #if !defined(__KERNEL__) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 007bab9f2a0e..6f1c31f4b9ab 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1889,6 +1889,10 @@ config X86_INTEL_MEMORY_PROTECTION_KEYS If unsure, say y. +config ARCH_PKEY_BITS + int + default 4 + choice prompt "TSX enable mode" depends on CPU_SUP_INTEL @@ -2610,24 +2614,15 @@ config MITIGATION_SLS against straight line speculation. The kernel image might be slightly larger. -config MITIGATION_GDS_FORCE - bool "Force GDS Mitigation" +config MITIGATION_GDS + bool "Mitigate Gather Data Sampling" depends on CPU_SUP_INTEL - default n + default y help - Gather Data Sampling (GDS) is a hardware vulnerability which allows - unprivileged speculative access to data which was previously stored in - vector registers. - - This option is equivalent to setting gather_data_sampling=force on the - command line. The microcode mitigation is used if present, otherwise - AVX is disabled as a mitigation. On affected systems that are missing - the microcode any userspace code that unconditionally uses AVX will - break with this option set. - - Setting this option on systems not vulnerable to GDS has no effect. - - If in doubt, say N. + Enable mitigation for Gather Data Sampling (GDS). GDS is a hardware + vulnerability which allows unprivileged speculative access to data + which was previously stored in vector registers. The attacker uses gather + instructions to infer the stale vector register data. config MITIGATION_RFDS bool "RFDS Mitigation" @@ -2650,6 +2645,107 @@ config MITIGATION_SPECTRE_BHI indirect branches. See <file:Documentation/admin-guide/hw-vuln/spectre.rst> +config MITIGATION_MDS + bool "Mitigate Microarchitectural Data Sampling (MDS) hardware bug" + depends on CPU_SUP_INTEL + default y + help + Enable mitigation for Microarchitectural Data Sampling (MDS). MDS is + a hardware vulnerability which allows unprivileged speculative access + to data which is available in various CPU internal buffers. + See also <file:Documentation/admin-guide/hw-vuln/mds.rst> + +config MITIGATION_TAA + bool "Mitigate TSX Asynchronous Abort (TAA) hardware bug" + depends on CPU_SUP_INTEL + default y + help + Enable mitigation for TSX Asynchronous Abort (TAA). TAA is a hardware + vulnerability that allows unprivileged speculative access to data + which is available in various CPU internal buffers by using + asynchronous aborts within an Intel TSX transactional region. + See also <file:Documentation/admin-guide/hw-vuln/tsx_async_abort.rst> + +config MITIGATION_MMIO_STALE_DATA + bool "Mitigate MMIO Stale Data hardware bug" + depends on CPU_SUP_INTEL + default y + help + Enable mitigation for MMIO Stale Data hardware bugs. Processor MMIO + Stale Data Vulnerabilities are a class of memory-mapped I/O (MMIO) + vulnerabilities that can expose data. The vulnerabilities require the + attacker to have access to MMIO. + See also + <file:Documentation/admin-guide/hw-vuln/processor_mmio_stale_data.rst> + +config MITIGATION_L1TF + bool "Mitigate L1 Terminal Fault (L1TF) hardware bug" + depends on CPU_SUP_INTEL + default y + help + Mitigate L1 Terminal Fault (L1TF) hardware bug. L1 Terminal Fault is a + hardware vulnerability which allows unprivileged speculative access to data + available in the Level 1 Data Cache. + See <file:Documentation/admin-guide/hw-vuln/l1tf.rst + +config MITIGATION_RETBLEED + bool "Mitigate RETBleed hardware bug" + depends on (CPU_SUP_INTEL && MITIGATION_SPECTRE_V2) || MITIGATION_UNRET_ENTRY || MITIGATION_IBPB_ENTRY + default y + help + Enable mitigation for RETBleed (Arbitrary Speculative Code Execution + with Return Instructions) vulnerability. RETBleed is a speculative + execution attack which takes advantage of microarchitectural behavior + in many modern microprocessors, similar to Spectre v2. An + unprivileged attacker can use these flaws to bypass conventional + memory security restrictions to gain read access to privileged memory + that would otherwise be inaccessible. + +config MITIGATION_SPECTRE_V1 + bool "Mitigate SPECTRE V1 hardware bug" + default y + help + Enable mitigation for Spectre V1 (Bounds Check Bypass). Spectre V1 is a + class of side channel attacks that takes advantage of speculative + execution that bypasses conditional branch instructions used for + memory access bounds check. + See also <file:Documentation/admin-guide/hw-vuln/spectre.rst> + +config MITIGATION_SPECTRE_V2 + bool "Mitigate SPECTRE V2 hardware bug" + default y + help + Enable mitigation for Spectre V2 (Branch Target Injection). Spectre + V2 is a class of side channel attacks that takes advantage of + indirect branch predictors inside the processor. In Spectre variant 2 + attacks, the attacker can steer speculative indirect branches in the + victim to gadget code by poisoning the branch target buffer of a CPU + used for predicting indirect branch addresses. + See also <file:Documentation/admin-guide/hw-vuln/spectre.rst> + +config MITIGATION_SRBDS + bool "Mitigate Special Register Buffer Data Sampling (SRBDS) hardware bug" + depends on CPU_SUP_INTEL + default y + help + Enable mitigation for Special Register Buffer Data Sampling (SRBDS). + SRBDS is a hardware vulnerability that allows Microarchitectural Data + Sampling (MDS) techniques to infer values returned from special + register accesses. An unprivileged user can extract values returned + from RDRAND and RDSEED executed on another core or sibling thread + using MDS techniques. + See also + <file:Documentation/admin-guide/hw-vuln/special-register-buffer-data-sampling.rst> + +config MITIGATION_SSB + bool "Mitigate Speculative Store Bypass (SSB) hardware bug" + default y + help + Enable mitigation for Speculative Store Bypass (SSB). SSB is a + hardware security vulnerability and its exploitation takes advantage + of speculative execution in a similar way to the Meltdown and Spectre + security vulnerabilities. + endif config ARCH_HAS_ADD_PAGES diff --git a/arch/x86/coco/tdx/tdx.c b/arch/x86/coco/tdx/tdx.c index 078e2bac2553..da8b66dce0da 100644 --- a/arch/x86/coco/tdx/tdx.c +++ b/arch/x86/coco/tdx/tdx.c @@ -389,7 +389,6 @@ static bool mmio_read(int size, unsigned long addr, unsigned long *val) .r12 = size, .r13 = EPT_READ, .r14 = addr, - .r15 = *val, }; if (__tdx_hypercall(&args)) diff --git a/arch/x86/crypto/Kconfig b/arch/x86/crypto/Kconfig index 24875e6295f2..7b1bebed879d 100644 --- a/arch/x86/crypto/Kconfig +++ b/arch/x86/crypto/Kconfig @@ -14,7 +14,7 @@ config CRYPTO_CURVE25519_X86 - ADX (large integer arithmetic) config CRYPTO_AES_NI_INTEL - tristate "Ciphers: AES, modes: ECB, CBC, CTS, CTR, XTR, XTS, GCM (AES-NI)" + tristate "Ciphers: AES, modes: ECB, CBC, CTS, CTR, XCTR, XTS, GCM (AES-NI/VAES)" depends on X86 select CRYPTO_AEAD select CRYPTO_LIB_AES @@ -25,10 +25,14 @@ config CRYPTO_AES_NI_INTEL help Block cipher: AES cipher algorithms AEAD cipher: AES with GCM - Length-preserving ciphers: AES with ECB, CBC, CTS, CTR, XTR, XTS + Length-preserving ciphers: AES with ECB, CBC, CTS, CTR, XCTR, XTS Architecture: x86 (32-bit and 64-bit) using: - AES-NI (AES new instructions) + - VAES (Vector AES) + + Some algorithm implementations are supported only in 64-bit builds, + and some have additional prerequisites such as AVX2 or AVX512. config CRYPTO_BLOWFISH_X86_64 tristate "Ciphers: Blowfish, modes: ECB, CBC" diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c index cd37de5ec404..b0dd83555499 100644 --- a/arch/x86/crypto/aesni-intel_glue.c +++ b/arch/x86/crypto/aesni-intel_glue.c @@ -1366,6 +1366,8 @@ gcm_crypt(struct aead_request *req, int flags) err = skcipher_walk_aead_encrypt(&walk, req, false); else err = skcipher_walk_aead_decrypt(&walk, req, false); + if (err) + return err; /* * Since the AES-GCM assembly code requires that at least three assembly @@ -1381,37 +1383,31 @@ gcm_crypt(struct aead_request *req, int flags) gcm_process_assoc(key, ghash_acc, req->src, assoclen, flags); /* En/decrypt the data and pass the ciphertext through GHASH. */ - while ((nbytes = walk.nbytes) != 0) { - if (unlikely(nbytes < walk.total)) { - /* - * Non-last segment. In this case, the assembly - * function requires that the length be a multiple of 16 - * (AES_BLOCK_SIZE) bytes. The needed buffering of up - * to 16 bytes is handled by the skcipher_walk. Here we - * just need to round down to a multiple of 16. - */ - nbytes = round_down(nbytes, AES_BLOCK_SIZE); - aes_gcm_update(key, le_ctr, ghash_acc, - walk.src.virt.addr, walk.dst.virt.addr, - nbytes, flags); - le_ctr[0] += nbytes / AES_BLOCK_SIZE; - kernel_fpu_end(); - err = skcipher_walk_done(&walk, walk.nbytes - nbytes); - kernel_fpu_begin(); - } else { - /* Last segment: process all remaining data. */ - aes_gcm_update(key, le_ctr, ghash_acc, - walk.src.virt.addr, walk.dst.virt.addr, - nbytes, flags); - err = skcipher_walk_done(&walk, 0); - /* - * The low word of the counter isn't used by the - * finalize, so there's no need to increment it here. - */ - } + while (unlikely((nbytes = walk.nbytes) < walk.total)) { + /* + * Non-last segment. In this case, the assembly function + * requires that the length be a multiple of 16 (AES_BLOCK_SIZE) + * bytes. The needed buffering of up to 16 bytes is handled by + * the skcipher_walk. Here we just need to round down to a + * multiple of 16. + */ + nbytes = round_down(nbytes, AES_BLOCK_SIZE); + aes_gcm_update(key, le_ctr, ghash_acc, walk.src.virt.addr, + walk.dst.virt.addr, nbytes, flags); + le_ctr[0] += nbytes / AES_BLOCK_SIZE; + kernel_fpu_end(); + err = skcipher_walk_done(&walk, walk.nbytes - nbytes); + if (err) + return err; + kernel_fpu_begin(); } - if (err) - goto out; + /* Last segment: process all remaining data. */ + aes_gcm_update(key, le_ctr, ghash_acc, walk.src.virt.addr, + walk.dst.virt.addr, nbytes, flags); + /* + * The low word of the counter isn't used by the finalize, so there's no + * need to increment it here. + */ /* Finalize */ taglen = crypto_aead_authsize(tfm); @@ -1439,8 +1435,9 @@ gcm_crypt(struct aead_request *req, int flags) datalen, tag, taglen, flags)) err = -EBADMSG; } -out: kernel_fpu_end(); + if (nbytes) + skcipher_walk_done(&walk, 0); return err; } @@ -1753,6 +1750,6 @@ static void __exit aesni_exit(void) late_initcall(aesni_init); module_exit(aesni_exit); -MODULE_DESCRIPTION("Rijndael (AES) Cipher Algorithm, Intel AES-NI instructions optimized"); +MODULE_DESCRIPTION("AES cipher and modes, optimized with AES-NI or VAES instructions"); MODULE_LICENSE("GPL"); MODULE_ALIAS_CRYPTO("aes"); diff --git a/arch/x86/crypto/sha256-avx2-asm.S b/arch/x86/crypto/sha256-avx2-asm.S index 0ffb072be956..0bbec1c75cd0 100644 --- a/arch/x86/crypto/sha256-avx2-asm.S +++ b/arch/x86/crypto/sha256-avx2-asm.S @@ -592,22 +592,22 @@ SYM_TYPED_FUNC_START(sha256_transform_rorx) leaq K256+0*32(%rip), INP ## reuse INP as scratch reg vpaddd (INP, SRND), X0, XFER vmovdqa XFER, 0*32+_XFER(%rsp, SRND) - FOUR_ROUNDS_AND_SCHED _XFER + 0*32 + FOUR_ROUNDS_AND_SCHED (_XFER + 0*32) leaq K256+1*32(%rip), INP vpaddd (INP, SRND), X0, XFER vmovdqa XFER, 1*32+_XFER(%rsp, SRND) - FOUR_ROUNDS_AND_SCHED _XFER + 1*32 + FOUR_ROUNDS_AND_SCHED (_XFER + 1*32) leaq K256+2*32(%rip), INP vpaddd (INP, SRND), X0, XFER vmovdqa XFER, 2*32+_XFER(%rsp, SRND) - FOUR_ROUNDS_AND_SCHED _XFER + 2*32 + FOUR_ROUNDS_AND_SCHED (_XFER + 2*32) leaq K256+3*32(%rip), INP vpaddd (INP, SRND), X0, XFER vmovdqa XFER, 3*32+_XFER(%rsp, SRND) - FOUR_ROUNDS_AND_SCHED _XFER + 3*32 + FOUR_ROUNDS_AND_SCHED (_XFER + 3*32) add $4*32, SRND cmp $3*4*32, SRND @@ -618,12 +618,12 @@ SYM_TYPED_FUNC_START(sha256_transform_rorx) leaq K256+0*32(%rip), INP vpaddd (INP, SRND), X0, XFER vmovdqa XFER, 0*32+_XFER(%rsp, SRND) - DO_4ROUNDS _XFER + 0*32 + DO_4ROUNDS (_XFER + 0*32) leaq K256+1*32(%rip), INP vpaddd (INP, SRND), X1, XFER vmovdqa XFER, 1*32+_XFER(%rsp, SRND) - DO_4ROUNDS _XFER + 1*32 + DO_4ROUNDS (_XFER + 1*32) add $2*32, SRND vmovdqa X2, X0 @@ -651,8 +651,8 @@ SYM_TYPED_FUNC_START(sha256_transform_rorx) xor SRND, SRND .align 16 .Lloop3: - DO_4ROUNDS _XFER + 0*32 + 16 - DO_4ROUNDS _XFER + 1*32 + 16 + DO_4ROUNDS (_XFER + 0*32 + 16) + DO_4ROUNDS (_XFER + 1*32 + 16) add $2*32, SRND cmp $4*4*32, SRND jb .Lloop3 diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 0c9c2706d4ec..9e519d8a810a 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -4589,6 +4589,25 @@ static enum hybrid_cpu_type adl_get_hybrid_cpu_type(void) return HYBRID_INTEL_CORE; } +static inline bool erratum_hsw11(struct perf_event *event) +{ + return (event->hw.config & INTEL_ARCH_EVENT_MASK) == + X86_CONFIG(.event=0xc0, .umask=0x01); +} + +/* + * The HSW11 requires a period larger than 100 which is the same as the BDM11. + * A minimum period of 128 is enforced as well for the INST_RETIRED.ALL. + * + * The message 'interrupt took too long' can be observed on any counter which + * was armed with a period < 32 and two events expired in the same NMI. + * A minimum period of 32 is enforced for the rest of the events. + */ +static void hsw_limit_period(struct perf_event *event, s64 *left) +{ + *left = max(*left, erratum_hsw11(event) ? 128 : 32); +} + /* * Broadwell: * @@ -4606,8 +4625,7 @@ static enum hybrid_cpu_type adl_get_hybrid_cpu_type(void) */ static void bdw_limit_period(struct perf_event *event, s64 *left) { - if ((event->hw.config & INTEL_ARCH_EVENT_MASK) == - X86_CONFIG(.event=0xc0, .umask=0x01)) { + if (erratum_hsw11(event)) { if (*left < 128) *left = 128; *left &= ~0x3fULL; @@ -6766,6 +6784,7 @@ __init int intel_pmu_init(void) x86_pmu.hw_config = hsw_hw_config; x86_pmu.get_event_constraints = hsw_get_event_constraints; + x86_pmu.limit_period = hsw_limit_period; x86_pmu.lbr_double_abort = true; extra_attr = boot_cpu_has(X86_FEATURE_RTM) ? hsw_format_attr : nhm_format_attr; diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c index 17a71e92a343..95eada2994e1 100644 --- a/arch/x86/hyperv/hv_init.c +++ b/arch/x86/hyperv/hv_init.c @@ -35,7 +35,6 @@ #include <clocksource/hyperv_timer.h> #include <linux/highmem.h> -int hyperv_init_cpuhp; u64 hv_current_partition_id = ~0ull; EXPORT_SYMBOL_GPL(hv_current_partition_id); @@ -607,8 +606,6 @@ skip_hypercall_pg_init: register_syscore_ops(&hv_syscore_ops); - hyperv_init_cpuhp = cpuhp; - if (cpuid_ebx(HYPERV_CPUID_FEATURES) & HV_ACCESS_PARTITION_ID) hv_get_partition_id(); @@ -637,7 +634,7 @@ skip_hypercall_pg_init: clean_guest_os_id: wrmsrl(HV_X64_MSR_GUEST_OS_ID, 0); hv_ivm_msr_write(HV_X64_MSR_GUEST_OS_ID, 0); - cpuhp_remove_state(cpuhp); + cpuhp_remove_state(CPUHP_AP_HYPERV_ONLINE); free_ghcb_page: free_percpu(hv_ghcb_pg); free_vp_assist_page: diff --git a/arch/x86/include/asm/cpu_device_id.h b/arch/x86/include/asm/cpu_device_id.h index 3831f612e89c..e4121d9aa9e1 100644 --- a/arch/x86/include/asm/cpu_device_id.h +++ b/arch/x86/include/asm/cpu_device_id.h @@ -193,26 +193,6 @@ X86_MATCH_VENDOR_FAM_MODEL(vendor, family, X86_MODEL_ANY, data) /** - * X86_MATCH_INTEL_FAM6_MODEL - Match vendor INTEL, family 6 and model - * @model: The model name without the INTEL_FAM6_ prefix or ANY - * The model name is expanded to INTEL_FAM6_@model internally - * @data: Driver specific data or NULL. The internal storage - * format is unsigned long. The supplied value, pointer - * etc. is casted to unsigned long internally. - * - * The vendor is set to INTEL, the family to 6 and all other missing - * arguments of X86_MATCH_VENDOR_FAM_MODEL_FEATURE() are set to wildcards. - * - * See X86_MATCH_VENDOR_FAM_MODEL_FEATURE() for further information. - */ -#define X86_MATCH_INTEL_FAM6_MODEL(model, data) \ - X86_MATCH_VENDOR_FAM_MODEL(INTEL, 6, INTEL_FAM6_##model, data) - -#define X86_MATCH_INTEL_FAM6_MODEL_STEPPINGS(model, steppings, data) \ - X86_MATCH_VENDOR_FAM_MODEL_STEPPINGS_FEATURE(INTEL, 6, INTEL_FAM6_##model, \ - steppings, X86_FEATURE_ANY, data) - -/** * X86_MATCH_VFM - Match encoded vendor/family/model * @vfm: Encoded 8-bits each for vendor, family, model * @data: Driver specific data or NULL. The internal storage diff --git a/arch/x86/include/asm/fpu/types.h b/arch/x86/include/asm/fpu/types.h index eb17f31b06d2..de16862bf230 100644 --- a/arch/x86/include/asm/fpu/types.h +++ b/arch/x86/include/asm/fpu/types.h @@ -591,6 +591,13 @@ struct fpu_state_config { * even without XSAVE support, i.e. legacy features FP + SSE */ u64 legacy_features; + /* + * @independent_features: + * + * Features that are supported by XSAVES, but not managed as part of + * the FPU core, such as LBR + */ + u64 independent_features; }; /* FPU state configuration information */ diff --git a/arch/x86/include/asm/intel-family.h b/arch/x86/include/asm/intel-family.h index f81a851c46dc..44949f972826 100644 --- a/arch/x86/include/asm/intel-family.h +++ b/arch/x86/include/asm/intel-family.h @@ -10,7 +10,7 @@ * that group keep the CPUID for the variants sorted by model number. * * The defined symbol names have the following form: - * INTEL_FAM6{OPTFAMILY}_{MICROARCH}{OPTDIFF} + * INTEL_{OPTFAMILY}_{MICROARCH}{OPTDIFF} * where: * OPTFAMILY Describes the family of CPUs that this belongs to. Default * is assumed to be "_CORE" (and should be omitted). Other values @@ -42,215 +42,136 @@ #define IFM(_fam, _model) VFM_MAKE(X86_VENDOR_INTEL, _fam, _model) -/* Wildcard match for FAM6 so X86_MATCH_INTEL_FAM6_MODEL(ANY) works */ -#define INTEL_FAM6_ANY X86_MODEL_ANY -/* Wildcard match for FAM6 so X86_MATCH_VFM(ANY) works */ +/* Wildcard match so X86_MATCH_VFM(ANY) works */ #define INTEL_ANY IFM(X86_FAMILY_ANY, X86_MODEL_ANY) -#define INTEL_FAM6_CORE_YONAH 0x0E +#define INTEL_PENTIUM_PRO IFM(6, 0x01) + #define INTEL_CORE_YONAH IFM(6, 0x0E) -#define INTEL_FAM6_CORE2_MEROM 0x0F #define INTEL_CORE2_MEROM IFM(6, 0x0F) -#define INTEL_FAM6_CORE2_MEROM_L 0x16 #define INTEL_CORE2_MEROM_L IFM(6, 0x16) -#define INTEL_FAM6_CORE2_PENRYN 0x17 #define INTEL_CORE2_PENRYN IFM(6, 0x17) -#define INTEL_FAM6_CORE2_DUNNINGTON 0x1D #define INTEL_CORE2_DUNNINGTON IFM(6, 0x1D) -#define INTEL_FAM6_NEHALEM 0x1E #define INTEL_NEHALEM IFM(6, 0x1E) -#define INTEL_FAM6_NEHALEM_G 0x1F /* Auburndale / Havendale */ #define INTEL_NEHALEM_G IFM(6, 0x1F) /* Auburndale / Havendale */ -#define INTEL_FAM6_NEHALEM_EP 0x1A #define INTEL_NEHALEM_EP IFM(6, 0x1A) -#define INTEL_FAM6_NEHALEM_EX 0x2E #define INTEL_NEHALEM_EX IFM(6, 0x2E) -#define INTEL_FAM6_WESTMERE 0x25 #define INTEL_WESTMERE IFM(6, 0x25) -#define INTEL_FAM6_WESTMERE_EP 0x2C #define INTEL_WESTMERE_EP IFM(6, 0x2C) -#define INTEL_FAM6_WESTMERE_EX 0x2F #define INTEL_WESTMERE_EX IFM(6, 0x2F) -#define INTEL_FAM6_SANDYBRIDGE 0x2A #define INTEL_SANDYBRIDGE IFM(6, 0x2A) -#define INTEL_FAM6_SANDYBRIDGE_X 0x2D #define INTEL_SANDYBRIDGE_X IFM(6, 0x2D) -#define INTEL_FAM6_IVYBRIDGE 0x3A #define INTEL_IVYBRIDGE IFM(6, 0x3A) -#define INTEL_FAM6_IVYBRIDGE_X 0x3E #define INTEL_IVYBRIDGE_X IFM(6, 0x3E) -#define INTEL_FAM6_HASWELL 0x3C #define INTEL_HASWELL IFM(6, 0x3C) -#define INTEL_FAM6_HASWELL_X 0x3F #define INTEL_HASWELL_X IFM(6, 0x3F) -#define INTEL_FAM6_HASWELL_L 0x45 #define INTEL_HASWELL_L IFM(6, 0x45) -#define INTEL_FAM6_HASWELL_G 0x46 #define INTEL_HASWELL_G IFM(6, 0x46) -#define INTEL_FAM6_BROADWELL 0x3D #define INTEL_BROADWELL IFM(6, 0x3D) -#define INTEL_FAM6_BROADWELL_G 0x47 #define INTEL_BROADWELL_G IFM(6, 0x47) -#define INTEL_FAM6_BROADWELL_X 0x4F #define INTEL_BROADWELL_X IFM(6, 0x4F) -#define INTEL_FAM6_BROADWELL_D 0x56 #define INTEL_BROADWELL_D IFM(6, 0x56) -#define INTEL_FAM6_SKYLAKE_L 0x4E /* Sky Lake */ #define INTEL_SKYLAKE_L IFM(6, 0x4E) /* Sky Lake */ -#define INTEL_FAM6_SKYLAKE 0x5E /* Sky Lake */ #define INTEL_SKYLAKE IFM(6, 0x5E) /* Sky Lake */ -#define INTEL_FAM6_SKYLAKE_X 0x55 /* Sky Lake */ #define INTEL_SKYLAKE_X IFM(6, 0x55) /* Sky Lake */ /* CASCADELAKE_X 0x55 Sky Lake -- s: 7 */ /* COOPERLAKE_X 0x55 Sky Lake -- s: 11 */ -#define INTEL_FAM6_KABYLAKE_L 0x8E /* Sky Lake */ #define INTEL_KABYLAKE_L IFM(6, 0x8E) /* Sky Lake */ /* AMBERLAKE_L 0x8E Sky Lake -- s: 9 */ /* COFFEELAKE_L 0x8E Sky Lake -- s: 10 */ /* WHISKEYLAKE_L 0x8E Sky Lake -- s: 11,12 */ -#define INTEL_FAM6_KABYLAKE 0x9E /* Sky Lake */ #define INTEL_KABYLAKE IFM(6, 0x9E) /* Sky Lake */ /* COFFEELAKE 0x9E Sky Lake -- s: 10-13 */ -#define INTEL_FAM6_COMETLAKE 0xA5 /* Sky Lake */ #define INTEL_COMETLAKE IFM(6, 0xA5) /* Sky Lake */ -#define INTEL_FAM6_COMETLAKE_L 0xA6 /* Sky Lake */ #define INTEL_COMETLAKE_L IFM(6, 0xA6) /* Sky Lake */ -#define INTEL_FAM6_CANNONLAKE_L 0x66 /* Palm Cove */ #define INTEL_CANNONLAKE_L IFM(6, 0x66) /* Palm Cove */ -#define INTEL_FAM6_ICELAKE_X 0x6A /* Sunny Cove */ #define INTEL_ICELAKE_X IFM(6, 0x6A) /* Sunny Cove */ -#define INTEL_FAM6_ICELAKE_D 0x6C /* Sunny Cove */ #define INTEL_ICELAKE_D IFM(6, 0x6C) /* Sunny Cove */ -#define INTEL_FAM6_ICELAKE 0x7D /* Sunny Cove */ #define INTEL_ICELAKE IFM(6, 0x7D) /* Sunny Cove */ -#define INTEL_FAM6_ICELAKE_L 0x7E /* Sunny Cove */ #define INTEL_ICELAKE_L IFM(6, 0x7E) /* Sunny Cove */ -#define INTEL_FAM6_ICELAKE_NNPI 0x9D /* Sunny Cove */ #define INTEL_ICELAKE_NNPI IFM(6, 0x9D) /* Sunny Cove */ -#define INTEL_FAM6_ROCKETLAKE 0xA7 /* Cypress Cove */ #define INTEL_ROCKETLAKE IFM(6, 0xA7) /* Cypress Cove */ -#define INTEL_FAM6_TIGERLAKE_L 0x8C /* Willow Cove */ #define INTEL_TIGERLAKE_L IFM(6, 0x8C) /* Willow Cove */ -#define INTEL_FAM6_TIGERLAKE 0x8D /* Willow Cove */ #define INTEL_TIGERLAKE IFM(6, 0x8D) /* Willow Cove */ -#define INTEL_FAM6_SAPPHIRERAPIDS_X 0x8F /* Golden Cove */ #define INTEL_SAPPHIRERAPIDS_X IFM(6, 0x8F) /* Golden Cove */ -#define INTEL_FAM6_EMERALDRAPIDS_X 0xCF #define INTEL_EMERALDRAPIDS_X IFM(6, 0xCF) -#define INTEL_FAM6_GRANITERAPIDS_X 0xAD #define INTEL_GRANITERAPIDS_X IFM(6, 0xAD) -#define INTEL_FAM6_GRANITERAPIDS_D 0xAE #define INTEL_GRANITERAPIDS_D IFM(6, 0xAE) /* "Hybrid" Processors (P-Core/E-Core) */ -#define INTEL_FAM6_LAKEFIELD 0x8A /* Sunny Cove / Tremont */ #define INTEL_LAKEFIELD IFM(6, 0x8A) /* Sunny Cove / Tremont */ -#define INTEL_FAM6_ALDERLAKE 0x97 /* Golden Cove / Gracemont */ #define INTEL_ALDERLAKE IFM(6, 0x97) /* Golden Cove / Gracemont */ -#define INTEL_FAM6_ALDERLAKE_L 0x9A /* Golden Cove / Gracemont */ #define INTEL_ALDERLAKE_L IFM(6, 0x9A) /* Golden Cove / Gracemont */ -#define INTEL_FAM6_RAPTORLAKE 0xB7 /* Raptor Cove / Enhanced Gracemont */ #define INTEL_RAPTORLAKE IFM(6, 0xB7) /* Raptor Cove / Enhanced Gracemont */ -#define INTEL_FAM6_RAPTORLAKE_P 0xBA #define INTEL_RAPTORLAKE_P IFM(6, 0xBA) -#define INTEL_FAM6_RAPTORLAKE_S 0xBF #define INTEL_RAPTORLAKE_S IFM(6, 0xBF) -#define INTEL_FAM6_METEORLAKE 0xAC #define INTEL_METEORLAKE IFM(6, 0xAC) -#define INTEL_FAM6_METEORLAKE_L 0xAA #define INTEL_METEORLAKE_L IFM(6, 0xAA) -#define INTEL_FAM6_ARROWLAKE_H 0xC5 #define INTEL_ARROWLAKE_H IFM(6, 0xC5) -#define INTEL_FAM6_ARROWLAKE 0xC6 #define INTEL_ARROWLAKE IFM(6, 0xC6) -#define INTEL_FAM6_ARROWLAKE_U 0xB5 #define INTEL_ARROWLAKE_U IFM(6, 0xB5) -#define INTEL_FAM6_LUNARLAKE_M 0xBD #define INTEL_LUNARLAKE_M IFM(6, 0xBD) /* "Small Core" Processors (Atom/E-Core) */ -#define INTEL_FAM6_ATOM_BONNELL 0x1C /* Diamondville, Pineview */ #define INTEL_ATOM_BONNELL IFM(6, 0x1C) /* Diamondville, Pineview */ -#define INTEL_FAM6_ATOM_BONNELL_MID 0x26 /* Silverthorne, Lincroft */ #define INTEL_ATOM_BONNELL_MID IFM(6, 0x26) /* Silverthorne, Lincroft */ -#define INTEL_FAM6_ATOM_SALTWELL 0x36 /* Cedarview */ #define INTEL_ATOM_SALTWELL IFM(6, 0x36) /* Cedarview */ -#define INTEL_FAM6_ATOM_SALTWELL_MID 0x27 /* Penwell */ #define INTEL_ATOM_SALTWELL_MID IFM(6, 0x27) /* Penwell */ -#define INTEL_FAM6_ATOM_SALTWELL_TABLET 0x35 /* Cloverview */ #define INTEL_ATOM_SALTWELL_TABLET IFM(6, 0x35) /* Cloverview */ -#define INTEL_FAM6_ATOM_SILVERMONT 0x37 /* Bay Trail, Valleyview */ #define INTEL_ATOM_SILVERMONT IFM(6, 0x37) /* Bay Trail, Valleyview */ -#define INTEL_FAM6_ATOM_SILVERMONT_D 0x4D /* Avaton, Rangely */ #define INTEL_ATOM_SILVERMONT_D IFM(6, 0x4D) /* Avaton, Rangely */ -#define INTEL_FAM6_ATOM_SILVERMONT_MID 0x4A /* Merriefield */ #define INTEL_ATOM_SILVERMONT_MID IFM(6, 0x4A) /* Merriefield */ -#define INTEL_FAM6_ATOM_AIRMONT 0x4C /* Cherry Trail, Braswell */ #define INTEL_ATOM_AIRMONT IFM(6, 0x4C) /* Cherry Trail, Braswell */ -#define INTEL_FAM6_ATOM_AIRMONT_MID 0x5A /* Moorefield */ #define INTEL_ATOM_AIRMONT_MID IFM(6, 0x5A) /* Moorefield */ -#define INTEL_FAM6_ATOM_AIRMONT_NP 0x75 /* Lightning Mountain */ #define INTEL_ATOM_AIRMONT_NP IFM(6, 0x75) /* Lightning Mountain */ -#define INTEL_FAM6_ATOM_GOLDMONT 0x5C /* Apollo Lake */ #define INTEL_ATOM_GOLDMONT IFM(6, 0x5C) /* Apollo Lake */ -#define INTEL_FAM6_ATOM_GOLDMONT_D 0x5F /* Denverton */ #define INTEL_ATOM_GOLDMONT_D IFM(6, 0x5F) /* Denverton */ /* Note: the micro-architecture is "Goldmont Plus" */ -#define INTEL_FAM6_ATOM_GOLDMONT_PLUS 0x7A /* Gemini Lake */ #define INTEL_ATOM_GOLDMONT_PLUS IFM(6, 0x7A) /* Gemini Lake */ -#define INTEL_FAM6_ATOM_TREMONT_D 0x86 /* Jacobsville */ #define INTEL_ATOM_TREMONT_D IFM(6, 0x86) /* Jacobsville */ -#define INTEL_FAM6_ATOM_TREMONT 0x96 /* Elkhart Lake */ #define INTEL_ATOM_TREMONT IFM(6, 0x96) /* Elkhart Lake */ -#define INTEL_FAM6_ATOM_TREMONT_L 0x9C /* Jasper Lake */ #define INTEL_ATOM_TREMONT_L IFM(6, 0x9C) /* Jasper Lake */ -#define INTEL_FAM6_ATOM_GRACEMONT 0xBE /* Alderlake N */ #define INTEL_ATOM_GRACEMONT IFM(6, 0xBE) /* Alderlake N */ -#define INTEL_FAM6_ATOM_CRESTMONT_X 0xAF /* Sierra Forest */ #define INTEL_ATOM_CRESTMONT_X IFM(6, 0xAF) /* Sierra Forest */ -#define INTEL_FAM6_ATOM_CRESTMONT 0xB6 /* Grand Ridge */ #define INTEL_ATOM_CRESTMONT IFM(6, 0xB6) /* Grand Ridge */ -#define INTEL_FAM6_ATOM_DARKMONT_X 0xDD /* Clearwater Forest */ #define INTEL_ATOM_DARKMONT_X IFM(6, 0xDD) /* Clearwater Forest */ /* Xeon Phi */ -#define INTEL_FAM6_XEON_PHI_KNL 0x57 /* Knights Landing */ #define INTEL_XEON_PHI_KNL IFM(6, 0x57) /* Knights Landing */ -#define INTEL_FAM6_XEON_PHI_KNM 0x85 /* Knights Mill */ #define INTEL_XEON_PHI_KNM IFM(6, 0x85) /* Knights Mill */ /* Family 5 */ diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index 3ad29b128943..3b9970117a0f 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -221,7 +221,7 @@ static inline int apei_smca_report_x86_error(struct cper_ia_proc_ctx *ctx_info, u64 lapic_id) { return -EINVAL; } #endif -void mce_setup(struct mce *m); +void mce_prep_record(struct mce *m); void mce_log(struct mce *m); DECLARE_PER_CPU(struct device *, mce_device); diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h index 390c4d13956d..5f0bc6a6d025 100644 --- a/arch/x86/include/asm/mshyperv.h +++ b/arch/x86/include/asm/mshyperv.h @@ -40,7 +40,6 @@ static inline unsigned char hv_get_nmi_reason(void) } #if IS_ENABLED(CONFIG_HYPERV) -extern int hyperv_init_cpuhp; extern bool hyperv_paravisor_present; extern void *hv_hypercall_pg; diff --git a/arch/x86/include/asm/page_64.h b/arch/x86/include/asm/page_64.h index af4302d79b59..f3d257c45225 100644 --- a/arch/x86/include/asm/page_64.h +++ b/arch/x86/include/asm/page_64.h @@ -17,6 +17,7 @@ extern unsigned long phys_base; extern unsigned long page_offset_base; extern unsigned long vmalloc_base; extern unsigned long vmemmap_base; +extern unsigned long physmem_end; static __always_inline unsigned long __phys_addr_nodebug(unsigned long x) { diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h index 9053dfe9fa03..a98e53491a4e 100644 --- a/arch/x86/include/asm/pgtable_64_types.h +++ b/arch/x86/include/asm/pgtable_64_types.h @@ -140,6 +140,10 @@ extern unsigned int ptrs_per_p4d; # define VMEMMAP_START __VMEMMAP_BASE_L4 #endif /* CONFIG_DYNAMIC_MEMORY_LAYOUT */ +#ifdef CONFIG_RANDOMIZE_MEMORY +# define PHYSMEM_END physmem_end +#endif + /* * End of the region for which vmalloc page tables are pre-allocated. * For non-KMSAN builds, this is the same as VMALLOC_END. diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index a75a07f4931f..775acbdea1a9 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -691,8 +691,6 @@ static inline u32 per_cpu_l2c_id(unsigned int cpu) } #ifdef CONFIG_CPU_SUP_AMD -extern u32 amd_get_highest_perf(void); - /* * Issue a DIV 0/1 insn to clear any division data from previous DIV * operations. @@ -705,7 +703,6 @@ static __always_inline void amd_clear_divider(void) extern void amd_check_microcode(void); #else -static inline u32 amd_get_highest_perf(void) { return 0; } static inline void amd_clear_divider(void) { } static inline void amd_check_microcode(void) { } #endif diff --git a/arch/x86/include/asm/resctrl.h b/arch/x86/include/asm/resctrl.h index 12dbd2588ca7..8b1b6ce1e51b 100644 --- a/arch/x86/include/asm/resctrl.h +++ b/arch/x86/include/asm/resctrl.h @@ -156,12 +156,6 @@ static inline void resctrl_sched_in(struct task_struct *tsk) __resctrl_sched_in(tsk); } -static inline u32 resctrl_arch_system_num_rmid_idx(void) -{ - /* RMID are independent numbers for x86. num_rmid_idx == num_rmid */ - return boot_cpu_data.x86_cache_max_rmid + 1; -} - static inline void resctrl_arch_rmid_idx_decode(u32 idx, u32 *closid, u32 *rmid) { *rmid = idx; diff --git a/arch/x86/include/asm/sev.h b/arch/x86/include/asm/sev.h index 79bbe2be900e..ee34ab00a8d6 100644 --- a/arch/x86/include/asm/sev.h +++ b/arch/x86/include/asm/sev.h @@ -164,7 +164,7 @@ struct snp_guest_msg_hdr { struct snp_guest_msg { struct snp_guest_msg_hdr hdr; - u8 payload[4000]; + u8 payload[PAGE_SIZE - sizeof(struct snp_guest_msg_hdr)]; } __packed; struct sev_guest_platform_data { diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h index abe3a8f22cbd..aef70336d624 100644 --- a/arch/x86/include/asm/topology.h +++ b/arch/x86/include/asm/topology.h @@ -282,9 +282,22 @@ static inline long arch_scale_freq_capacity(int cpu) } #define arch_scale_freq_capacity arch_scale_freq_capacity +bool arch_enable_hybrid_capacity_scale(void); +void arch_set_cpu_capacity(int cpu, unsigned long cap, unsigned long max_cap, + unsigned long cap_freq, unsigned long base_freq); + +unsigned long arch_scale_cpu_capacity(int cpu); +#define arch_scale_cpu_capacity arch_scale_cpu_capacity + extern void arch_set_max_freq_ratio(bool turbo_disabled); extern void freq_invariance_set_perf_ratio(u64 ratio, bool turbo_disabled); #else +static inline bool arch_enable_hybrid_capacity_scale(void) { return false; } +static inline void arch_set_cpu_capacity(int cpu, unsigned long cap, + unsigned long max_cap, + unsigned long cap_freq, + unsigned long base_freq) { } + static inline void arch_set_max_freq_ratio(bool turbo_disabled) { } static inline void freq_invariance_set_perf_ratio(u64 ratio, bool turbo_disabled) { } #endif diff --git a/arch/x86/kernel/acpi/cppc.c b/arch/x86/kernel/acpi/cppc.c index ff8f25faca3d..956984054bf3 100644 --- a/arch/x86/kernel/acpi/cppc.c +++ b/arch/x86/kernel/acpi/cppc.c @@ -9,6 +9,17 @@ #include <asm/processor.h> #include <asm/topology.h> +#define CPPC_HIGHEST_PERF_PERFORMANCE 196 +#define CPPC_HIGHEST_PERF_PREFCORE 166 + +enum amd_pref_core { + AMD_PREF_CORE_UNKNOWN = 0, + AMD_PREF_CORE_SUPPORTED, + AMD_PREF_CORE_UNSUPPORTED, +}; +static enum amd_pref_core amd_pref_core_detected; +static u64 boost_numerator; + /* Refer to drivers/acpi/cppc_acpi.c for the description of functions */ bool cpc_supported_by_cpu(void) @@ -69,31 +80,30 @@ int cpc_write_ffh(int cpunum, struct cpc_reg *reg, u64 val) static void amd_set_max_freq_ratio(void) { struct cppc_perf_caps perf_caps; - u64 highest_perf, nominal_perf; + u64 numerator, nominal_perf; u64 perf_ratio; int rc; rc = cppc_get_perf_caps(0, &perf_caps); if (rc) { - pr_debug("Could not retrieve perf counters (%d)\n", rc); + pr_warn("Could not retrieve perf counters (%d)\n", rc); return; } - highest_perf = amd_get_highest_perf(); + rc = amd_get_boost_ratio_numerator(0, &numerator); + if (rc) { + pr_warn("Could not retrieve highest performance (%d)\n", rc); + return; + } nominal_perf = perf_caps.nominal_perf; - if (!highest_perf || !nominal_perf) { - pr_debug("Could not retrieve highest or nominal performance\n"); + if (!nominal_perf) { + pr_warn("Could not retrieve nominal performance\n"); return; } - perf_ratio = div_u64(highest_perf * SCHED_CAPACITY_SCALE, nominal_perf); /* midpoint between max_boost and max_P */ - perf_ratio = (perf_ratio + SCHED_CAPACITY_SCALE) >> 1; - if (!perf_ratio) { - pr_debug("Non-zero highest/nominal perf values led to a 0 ratio\n"); - return; - } + perf_ratio = (div_u64(numerator * SCHED_CAPACITY_SCALE, nominal_perf) + SCHED_CAPACITY_SCALE) >> 1; freq_invariance_set_perf_ratio(perf_ratio, false); } @@ -116,3 +126,143 @@ void init_freq_invariance_cppc(void) init_done = true; mutex_unlock(&freq_invariance_lock); } + +/* + * Get the highest performance register value. + * @cpu: CPU from which to get highest performance. + * @highest_perf: Return address for highest performance value. + * + * Return: 0 for success, negative error code otherwise. + */ +int amd_get_highest_perf(unsigned int cpu, u32 *highest_perf) +{ + u64 val; + int ret; + + if (cpu_feature_enabled(X86_FEATURE_CPPC)) { + ret = rdmsrl_safe_on_cpu(cpu, MSR_AMD_CPPC_CAP1, &val); + if (ret) + goto out; + + val = AMD_CPPC_HIGHEST_PERF(val); + } else { + ret = cppc_get_highest_perf(cpu, &val); + if (ret) + goto out; + } + + WRITE_ONCE(*highest_perf, (u32)val); +out: + return ret; +} +EXPORT_SYMBOL_GPL(amd_get_highest_perf); + +/** + * amd_detect_prefcore: Detect if CPUs in the system support preferred cores + * @detected: Output variable for the result of the detection. + * + * Determine whether CPUs in the system support preferred cores. On systems + * that support preferred cores, different highest perf values will be found + * on different cores. On other systems, the highest perf value will be the + * same on all cores. + * + * The result of the detection will be stored in the 'detected' parameter. + * + * Return: 0 for success, negative error code otherwise + */ +int amd_detect_prefcore(bool *detected) +{ + int cpu, count = 0; + u64 highest_perf[2] = {0}; + + if (WARN_ON(!detected)) + return -EINVAL; + + switch (amd_pref_core_detected) { + case AMD_PREF_CORE_SUPPORTED: + *detected = true; + return 0; + case AMD_PREF_CORE_UNSUPPORTED: + *detected = false; + return 0; + default: + break; + } + + for_each_present_cpu(cpu) { + u32 tmp; + int ret; + + ret = amd_get_highest_perf(cpu, &tmp); + if (ret) + return ret; + + if (!count || (count == 1 && tmp != highest_perf[0])) + highest_perf[count++] = tmp; + + if (count == 2) + break; + } + + *detected = (count == 2); + boost_numerator = highest_perf[0]; + + amd_pref_core_detected = *detected ? AMD_PREF_CORE_SUPPORTED : + AMD_PREF_CORE_UNSUPPORTED; + + pr_debug("AMD CPPC preferred core is %ssupported (highest perf: 0x%llx)\n", + *detected ? "" : "un", highest_perf[0]); + + return 0; +} +EXPORT_SYMBOL_GPL(amd_detect_prefcore); + +/** + * amd_get_boost_ratio_numerator: Get the numerator to use for boost ratio calculation + * @cpu: CPU to get numerator for. + * @numerator: Output variable for numerator. + * + * Determine the numerator to use for calculating the boost ratio on + * a CPU. On systems that support preferred cores, this will be a hardcoded + * value. On other systems this will the highest performance register value. + * + * If booting the system with amd-pstate enabled but preferred cores disabled then + * the correct boost numerator will be returned to match hardware capabilities + * even if the preferred cores scheduling hints are not enabled. + * + * Return: 0 for success, negative error code otherwise. + */ +int amd_get_boost_ratio_numerator(unsigned int cpu, u64 *numerator) +{ + bool prefcore; + int ret; + + ret = amd_detect_prefcore(&prefcore); + if (ret) + return ret; + + /* without preferred cores, return the highest perf register value */ + if (!prefcore) { + *numerator = boost_numerator; + return 0; + } + + /* + * For AMD CPUs with Family ID 19H and Model ID range 0x70 to 0x7f, + * the highest performance level is set to 196. + * https://bugzilla.kernel.org/show_bug.cgi?id=218759 + */ + if (cpu_feature_enabled(X86_FEATURE_ZEN4)) { + switch (boot_cpu_data.x86_model) { + case 0x70 ... 0x7f: + *numerator = CPPC_HIGHEST_PERF_PERFORMANCE; + return 0; + default: + break; + } + } + *numerator = CPPC_HIGHEST_PERF_PREFCORE; + + return 0; +} +EXPORT_SYMBOL_GPL(amd_get_boost_ratio_numerator); diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 66fd4b2a37a3..373638691cd4 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -1775,12 +1775,9 @@ static __init void apic_set_fixmap(bool read_apic); static __init void x2apic_disable(void) { - u32 x2apic_id, state = x2apic_state; + u32 x2apic_id; - x2apic_mode = 0; - x2apic_state = X2APIC_DISABLED; - - if (state != X2APIC_ON) + if (x2apic_state < X2APIC_ON) return; x2apic_id = read_apic_id(); @@ -1793,6 +1790,10 @@ static __init void x2apic_disable(void) } __x2apic_disable(); + + x2apic_mode = 0; + x2apic_state = X2APIC_DISABLED; + /* * Don't reread the APIC ID as it was already done from * check_x2apic() and the APIC driver still is a x2APIC variant, diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 1e0fe5f8ab84..015971adadfc 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -1190,22 +1190,6 @@ unsigned long amd_get_dr_addr_mask(unsigned int dr) } EXPORT_SYMBOL_GPL(amd_get_dr_addr_mask); -u32 amd_get_highest_perf(void) -{ - struct cpuinfo_x86 *c = &boot_cpu_data; - - if (c->x86 == 0x17 && ((c->x86_model >= 0x30 && c->x86_model < 0x40) || - (c->x86_model >= 0x70 && c->x86_model < 0x80))) - return 166; - - if (c->x86 == 0x19 && ((c->x86_model >= 0x20 && c->x86_model < 0x30) || - (c->x86_model >= 0x40 && c->x86_model < 0x70))) - return 166; - - return 255; -} -EXPORT_SYMBOL_GPL(amd_get_highest_perf); - static void zenbleed_check_cpu(void *unused) { struct cpuinfo_x86 *c = &cpu_data(smp_processor_id()); diff --git a/arch/x86/kernel/cpu/aperfmperf.c b/arch/x86/kernel/cpu/aperfmperf.c index 0b69bfbf345d..f642de2ebdac 100644 --- a/arch/x86/kernel/cpu/aperfmperf.c +++ b/arch/x86/kernel/cpu/aperfmperf.c @@ -349,9 +349,89 @@ static DECLARE_WORK(disable_freq_invariance_work, DEFINE_PER_CPU(unsigned long, arch_freq_scale) = SCHED_CAPACITY_SCALE; EXPORT_PER_CPU_SYMBOL_GPL(arch_freq_scale); +static DEFINE_STATIC_KEY_FALSE(arch_hybrid_cap_scale_key); + +struct arch_hybrid_cpu_scale { + unsigned long capacity; + unsigned long freq_ratio; +}; + +static struct arch_hybrid_cpu_scale __percpu *arch_cpu_scale; + +/** + * arch_enable_hybrid_capacity_scale() - Enable hybrid CPU capacity scaling + * + * Allocate memory for per-CPU data used by hybrid CPU capacity scaling, + * initialize it and set the static key controlling its code paths. + * + * Must be called before arch_set_cpu_capacity(). + */ +bool arch_enable_hybrid_capacity_scale(void) +{ + int cpu; + + if (static_branch_unlikely(&arch_hybrid_cap_scale_key)) { + WARN_ONCE(1, "Hybrid CPU capacity scaling already enabled"); + return true; + } + + arch_cpu_scale = alloc_percpu(struct arch_hybrid_cpu_scale); + if (!arch_cpu_scale) + return false; + + for_each_possible_cpu(cpu) { + per_cpu_ptr(arch_cpu_scale, cpu)->capacity = SCHED_CAPACITY_SCALE; + per_cpu_ptr(arch_cpu_scale, cpu)->freq_ratio = arch_max_freq_ratio; + } + + static_branch_enable(&arch_hybrid_cap_scale_key); + + pr_info("Hybrid CPU capacity scaling enabled\n"); + + return true; +} + +/** + * arch_set_cpu_capacity() - Set scale-invariance parameters for a CPU + * @cpu: Target CPU. + * @cap: Capacity of @cpu at its maximum frequency, relative to @max_cap. + * @max_cap: System-wide maximum CPU capacity. + * @cap_freq: Frequency of @cpu corresponding to @cap. + * @base_freq: Frequency of @cpu at which MPERF counts. + * + * The units in which @cap and @max_cap are expressed do not matter, so long + * as they are consistent, because the former is effectively divided by the + * latter. Analogously for @cap_freq and @base_freq. + * + * After calling this function for all CPUs, call arch_rebuild_sched_domains() + * to let the scheduler know that capacity-aware scheduling can be used going + * forward. + */ +void arch_set_cpu_capacity(int cpu, unsigned long cap, unsigned long max_cap, + unsigned long cap_freq, unsigned long base_freq) +{ + if (static_branch_likely(&arch_hybrid_cap_scale_key)) { + WRITE_ONCE(per_cpu_ptr(arch_cpu_scale, cpu)->capacity, + div_u64(cap << SCHED_CAPACITY_SHIFT, max_cap)); + WRITE_ONCE(per_cpu_ptr(arch_cpu_scale, cpu)->freq_ratio, + div_u64(cap_freq << SCHED_CAPACITY_SHIFT, base_freq)); + } else { + WARN_ONCE(1, "Hybrid CPU capacity scaling not enabled"); + } +} + +unsigned long arch_scale_cpu_capacity(int cpu) +{ + if (static_branch_unlikely(&arch_hybrid_cap_scale_key)) + return READ_ONCE(per_cpu_ptr(arch_cpu_scale, cpu)->capacity); + + return SCHED_CAPACITY_SCALE; +} +EXPORT_SYMBOL_GPL(arch_scale_cpu_capacity); + static void scale_freq_tick(u64 acnt, u64 mcnt) { - u64 freq_scale; + u64 freq_scale, freq_ratio; if (!arch_scale_freq_invariant()) return; @@ -359,7 +439,12 @@ static void scale_freq_tick(u64 acnt, u64 mcnt) if (check_shl_overflow(acnt, 2*SCHED_CAPACITY_SHIFT, &acnt)) goto error; - if (check_mul_overflow(mcnt, arch_max_freq_ratio, &mcnt) || !mcnt) + if (static_branch_unlikely(&arch_hybrid_cap_scale_key)) + freq_ratio = READ_ONCE(this_cpu_ptr(arch_cpu_scale)->freq_ratio); + else + freq_ratio = arch_max_freq_ratio; + + if (check_mul_overflow(mcnt, freq_ratio, &mcnt) || !mcnt) goto error; freq_scale = div64_u64(acnt, mcnt); diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 45675da354f3..d1915427b4ff 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -233,7 +233,8 @@ static void x86_amd_ssb_disable(void) #define pr_fmt(fmt) "MDS: " fmt /* Default mitigation for MDS-affected CPUs */ -static enum mds_mitigations mds_mitigation __ro_after_init = MDS_MITIGATION_FULL; +static enum mds_mitigations mds_mitigation __ro_after_init = + IS_ENABLED(CONFIG_MITIGATION_MDS) ? MDS_MITIGATION_FULL : MDS_MITIGATION_OFF; static bool mds_nosmt __ro_after_init = false; static const char * const mds_strings[] = { @@ -293,7 +294,8 @@ enum taa_mitigations { }; /* Default mitigation for TAA-affected CPUs */ -static enum taa_mitigations taa_mitigation __ro_after_init = TAA_MITIGATION_VERW; +static enum taa_mitigations taa_mitigation __ro_after_init = + IS_ENABLED(CONFIG_MITIGATION_TAA) ? TAA_MITIGATION_VERW : TAA_MITIGATION_OFF; static bool taa_nosmt __ro_after_init; static const char * const taa_strings[] = { @@ -391,7 +393,8 @@ enum mmio_mitigations { }; /* Default mitigation for Processor MMIO Stale Data vulnerabilities */ -static enum mmio_mitigations mmio_mitigation __ro_after_init = MMIO_MITIGATION_VERW; +static enum mmio_mitigations mmio_mitigation __ro_after_init = + IS_ENABLED(CONFIG_MITIGATION_MMIO_STALE_DATA) ? MMIO_MITIGATION_VERW : MMIO_MITIGATION_OFF; static bool mmio_nosmt __ro_after_init = false; static const char * const mmio_strings[] = { @@ -605,7 +608,8 @@ enum srbds_mitigations { SRBDS_MITIGATION_HYPERVISOR, }; -static enum srbds_mitigations srbds_mitigation __ro_after_init = SRBDS_MITIGATION_FULL; +static enum srbds_mitigations srbds_mitigation __ro_after_init = + IS_ENABLED(CONFIG_MITIGATION_SRBDS) ? SRBDS_MITIGATION_FULL : SRBDS_MITIGATION_OFF; static const char * const srbds_strings[] = { [SRBDS_MITIGATION_OFF] = "Vulnerable", @@ -731,11 +735,8 @@ enum gds_mitigations { GDS_MITIGATION_HYPERVISOR, }; -#if IS_ENABLED(CONFIG_MITIGATION_GDS_FORCE) -static enum gds_mitigations gds_mitigation __ro_after_init = GDS_MITIGATION_FORCE; -#else -static enum gds_mitigations gds_mitigation __ro_after_init = GDS_MITIGATION_FULL; -#endif +static enum gds_mitigations gds_mitigation __ro_after_init = + IS_ENABLED(CONFIG_MITIGATION_GDS) ? GDS_MITIGATION_FULL : GDS_MITIGATION_OFF; static const char * const gds_strings[] = { [GDS_MITIGATION_OFF] = "Vulnerable", @@ -871,7 +872,8 @@ enum spectre_v1_mitigation { }; static enum spectre_v1_mitigation spectre_v1_mitigation __ro_after_init = - SPECTRE_V1_MITIGATION_AUTO; + IS_ENABLED(CONFIG_MITIGATION_SPECTRE_V1) ? + SPECTRE_V1_MITIGATION_AUTO : SPECTRE_V1_MITIGATION_NONE; static const char * const spectre_v1_strings[] = { [SPECTRE_V1_MITIGATION_NONE] = "Vulnerable: __user pointer sanitization and usercopy barriers only; no swapgs barriers", @@ -986,7 +988,7 @@ static const char * const retbleed_strings[] = { static enum retbleed_mitigation retbleed_mitigation __ro_after_init = RETBLEED_MITIGATION_NONE; static enum retbleed_mitigation_cmd retbleed_cmd __ro_after_init = - RETBLEED_CMD_AUTO; + IS_ENABLED(CONFIG_MITIGATION_RETBLEED) ? RETBLEED_CMD_AUTO : RETBLEED_CMD_OFF; static int __ro_after_init retbleed_nosmt = false; @@ -1447,17 +1449,18 @@ static void __init spec_v2_print_cond(const char *reason, bool secure) static enum spectre_v2_mitigation_cmd __init spectre_v2_parse_cmdline(void) { - enum spectre_v2_mitigation_cmd cmd = SPECTRE_V2_CMD_AUTO; + enum spectre_v2_mitigation_cmd cmd; char arg[20]; int ret, i; + cmd = IS_ENABLED(CONFIG_MITIGATION_SPECTRE_V2) ? SPECTRE_V2_CMD_AUTO : SPECTRE_V2_CMD_NONE; if (cmdline_find_option_bool(boot_command_line, "nospectre_v2") || cpu_mitigations_off()) return SPECTRE_V2_CMD_NONE; ret = cmdline_find_option(boot_command_line, "spectre_v2", arg, sizeof(arg)); if (ret < 0) - return SPECTRE_V2_CMD_AUTO; + return cmd; for (i = 0; i < ARRAY_SIZE(mitigation_options); i++) { if (!match_option(arg, ret, mitigation_options[i].option)) @@ -1467,8 +1470,8 @@ static enum spectre_v2_mitigation_cmd __init spectre_v2_parse_cmdline(void) } if (i >= ARRAY_SIZE(mitigation_options)) { - pr_err("unknown option (%s). Switching to AUTO select\n", arg); - return SPECTRE_V2_CMD_AUTO; + pr_err("unknown option (%s). Switching to default mode\n", arg); + return cmd; } if ((cmd == SPECTRE_V2_CMD_RETPOLINE || @@ -2021,10 +2024,12 @@ static const struct { static enum ssb_mitigation_cmd __init ssb_parse_cmdline(void) { - enum ssb_mitigation_cmd cmd = SPEC_STORE_BYPASS_CMD_AUTO; + enum ssb_mitigation_cmd cmd; char arg[20]; int ret, i; + cmd = IS_ENABLED(CONFIG_MITIGATION_SSB) ? + SPEC_STORE_BYPASS_CMD_AUTO : SPEC_STORE_BYPASS_CMD_NONE; if (cmdline_find_option_bool(boot_command_line, "nospec_store_bypass_disable") || cpu_mitigations_off()) { return SPEC_STORE_BYPASS_CMD_NONE; @@ -2032,7 +2037,7 @@ static enum ssb_mitigation_cmd __init ssb_parse_cmdline(void) ret = cmdline_find_option(boot_command_line, "spec_store_bypass_disable", arg, sizeof(arg)); if (ret < 0) - return SPEC_STORE_BYPASS_CMD_AUTO; + return cmd; for (i = 0; i < ARRAY_SIZE(ssb_mitigation_options); i++) { if (!match_option(arg, ret, ssb_mitigation_options[i].option)) @@ -2043,8 +2048,8 @@ static enum ssb_mitigation_cmd __init ssb_parse_cmdline(void) } if (i >= ARRAY_SIZE(ssb_mitigation_options)) { - pr_err("unknown option (%s). Switching to AUTO select\n", arg); - return SPEC_STORE_BYPASS_CMD_AUTO; + pr_err("unknown option (%s). Switching to default mode\n", arg); + return cmd; } } @@ -2371,7 +2376,8 @@ EXPORT_SYMBOL_GPL(itlb_multihit_kvm_mitigation); #define pr_fmt(fmt) "L1TF: " fmt /* Default mitigation for L1TF-affected CPUs */ -enum l1tf_mitigations l1tf_mitigation __ro_after_init = L1TF_MITIGATION_FLUSH; +enum l1tf_mitigations l1tf_mitigation __ro_after_init = + IS_ENABLED(CONFIG_MITIGATION_L1TF) ? L1TF_MITIGATION_FLUSH : L1TF_MITIGATION_OFF; #if IS_ENABLED(CONFIG_KVM_INTEL) EXPORT_SYMBOL_GPL(l1tf_mitigation); #endif @@ -2551,10 +2557,9 @@ static void __init srso_select_mitigation(void) { bool has_microcode = boot_cpu_has(X86_FEATURE_IBPB_BRTYPE); - if (cpu_mitigations_off()) - return; - - if (!boot_cpu_has_bug(X86_BUG_SRSO)) { + if (!boot_cpu_has_bug(X86_BUG_SRSO) || + cpu_mitigations_off() || + srso_cmd == SRSO_CMD_OFF) { if (boot_cpu_has(X86_FEATURE_SBPB)) x86_pred_cmd = PRED_CMD_SBPB; return; @@ -2585,11 +2590,6 @@ static void __init srso_select_mitigation(void) } switch (srso_cmd) { - case SRSO_CMD_OFF: - if (boot_cpu_has(X86_FEATURE_SBPB)) - x86_pred_cmd = PRED_CMD_SBPB; - return; - case SRSO_CMD_MICROCODE: if (has_microcode) { srso_mitigation = SRSO_MITIGATION_MICROCODE; @@ -2643,6 +2643,8 @@ static void __init srso_select_mitigation(void) pr_err("WARNING: kernel not compiled with MITIGATION_SRSO.\n"); } break; + default: + break; } out: diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index d4e539d4e158..be307c9ef263 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1165,8 +1165,8 @@ static const __initconst struct x86_cpu_id cpu_vuln_whitelist[] = { VULNWL_INTEL(INTEL_CORE_YONAH, NO_SSB), - VULNWL_INTEL(INTEL_ATOM_AIRMONT_MID, NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT), - VULNWL_INTEL(INTEL_ATOM_AIRMONT_NP, NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT), + VULNWL_INTEL(INTEL_ATOM_AIRMONT_MID, NO_SSB | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT | MSBDS_ONLY), + VULNWL_INTEL(INTEL_ATOM_AIRMONT_NP, NO_SSB | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT), VULNWL_INTEL(INTEL_ATOM_GOLDMONT, NO_MDS | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO), VULNWL_INTEL(INTEL_ATOM_GOLDMONT_D, NO_MDS | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO), diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 08b95a35b5cb..e7656cbef68d 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -311,16 +311,18 @@ static void early_init_intel(struct cpuinfo_x86 *c) } /* - * There is a known erratum on Pentium III and Core Solo - * and Core Duo CPUs. - * " Page with PAT set to WC while associated MTRR is UC - * may consolidate to UC " - * Because of this erratum, it is better to stick with - * setting WC in MTRR rather than using PAT on these CPUs. + * PAT is broken on early family 6 CPUs, the last of which + * is "Yonah" where the erratum is named "AN7": * - * Enable PAT WC only on P4, Core 2 or later CPUs. + * Page with PAT (Page Attribute Table) Set to USWC + * (Uncacheable Speculative Write Combine) While + * Associated MTRR (Memory Type Range Register) Is UC + * (Uncacheable) May Consolidate to UC + * + * Disable PAT and fall back to MTRR on these CPUs. */ - if (c->x86 == 6 && c->x86_model < 15) + if (c->x86_vfm >= INTEL_PENTIUM_PRO && + c->x86_vfm <= INTEL_CORE_YONAH) clear_cpu_cap(c, X86_FEATURE_PAT); /* diff --git a/arch/x86/kernel/cpu/mce/amd.c b/arch/x86/kernel/cpu/mce/amd.c index 9a0133ef7e20..14bf8c232e45 100644 --- a/arch/x86/kernel/cpu/mce/amd.c +++ b/arch/x86/kernel/cpu/mce/amd.c @@ -780,7 +780,7 @@ static void __log_error(unsigned int bank, u64 status, u64 addr, u64 misc) { struct mce m; - mce_setup(&m); + mce_prep_record(&m); m.status = status; m.misc = misc; diff --git a/arch/x86/kernel/cpu/mce/apei.c b/arch/x86/kernel/cpu/mce/apei.c index 7f7309ff67d0..3885fe05f01e 100644 --- a/arch/x86/kernel/cpu/mce/apei.c +++ b/arch/x86/kernel/cpu/mce/apei.c @@ -44,7 +44,7 @@ void apei_mce_report_mem_error(int severity, struct cper_sec_mem_err *mem_err) else lsb = PAGE_SHIFT; - mce_setup(&m); + mce_prep_record(&m); m.bank = -1; /* Fake a memory read error with unknown channel */ m.status = MCI_STATUS_VAL | MCI_STATUS_EN | MCI_STATUS_ADDRV | MCI_STATUS_MISCV | 0x9f; @@ -66,6 +66,7 @@ EXPORT_SYMBOL_GPL(apei_mce_report_mem_error); int apei_smca_report_x86_error(struct cper_ia_proc_ctx *ctx_info, u64 lapic_id) { const u64 *i_mce = ((const u64 *) (ctx_info + 1)); + bool apicid_found = false; unsigned int cpu; struct mce m; @@ -97,20 +98,19 @@ int apei_smca_report_x86_error(struct cper_ia_proc_ctx *ctx_info, u64 lapic_id) if (ctx_info->reg_arr_size < 48) return -EINVAL; - mce_setup(&m); - - m.extcpu = -1; - m.socketid = -1; - for_each_possible_cpu(cpu) { if (cpu_data(cpu).topo.initial_apicid == lapic_id) { - m.extcpu = cpu; - m.socketid = cpu_data(m.extcpu).topo.pkg_id; + apicid_found = true; break; } } - m.apicid = lapic_id; + if (!apicid_found) + return -EINVAL; + + mce_prep_record_common(&m); + mce_prep_record_per_cpu(cpu, &m); + m.bank = (ctx_info->msr_addr >> 4) & 0xFF; m.status = *i_mce; m.addr = *(i_mce + 1); diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index b85ec7a4ec9e..2a938f429c4d 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -117,20 +117,32 @@ static struct irq_work mce_irq_work; */ BLOCKING_NOTIFIER_HEAD(x86_mce_decoder_chain); -/* Do initial initialization of a struct mce */ -void mce_setup(struct mce *m) +void mce_prep_record_common(struct mce *m) { memset(m, 0, sizeof(struct mce)); - m->cpu = m->extcpu = smp_processor_id(); + + m->cpuid = cpuid_eax(1); + m->cpuvendor = boot_cpu_data.x86_vendor; + m->mcgcap = __rdmsr(MSR_IA32_MCG_CAP); /* need the internal __ version to avoid deadlocks */ - m->time = __ktime_get_real_seconds(); - m->cpuvendor = boot_cpu_data.x86_vendor; - m->cpuid = cpuid_eax(1); - m->socketid = cpu_data(m->extcpu).topo.pkg_id; - m->apicid = cpu_data(m->extcpu).topo.initial_apicid; - m->mcgcap = __rdmsr(MSR_IA32_MCG_CAP); - m->ppin = cpu_data(m->extcpu).ppin; - m->microcode = boot_cpu_data.microcode; + m->time = __ktime_get_real_seconds(); +} + +void mce_prep_record_per_cpu(unsigned int cpu, struct mce *m) +{ + m->cpu = cpu; + m->extcpu = cpu; + m->apicid = cpu_data(cpu).topo.initial_apicid; + m->microcode = cpu_data(cpu).microcode; + m->ppin = topology_ppin(cpu); + m->socketid = topology_physical_package_id(cpu); +} + +/* Do initial initialization of a struct mce */ +void mce_prep_record(struct mce *m) +{ + mce_prep_record_common(m); + mce_prep_record_per_cpu(smp_processor_id(), m); } DEFINE_PER_CPU(struct mce, injectm); @@ -436,11 +448,11 @@ static noinstr void mce_wrmsrl(u32 msr, u64 v) static noinstr void mce_gather_info(struct mce *m, struct pt_regs *regs) { /* - * Enable instrumentation around mce_setup() which calls external + * Enable instrumentation around mce_prep_record() which calls external * facilities. */ instrumentation_begin(); - mce_setup(m); + mce_prep_record(m); instrumentation_end(); m->mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS); diff --git a/arch/x86/kernel/cpu/mce/internal.h b/arch/x86/kernel/cpu/mce/internal.h index 01f8f03969e6..43c7f3b71df5 100644 --- a/arch/x86/kernel/cpu/mce/internal.h +++ b/arch/x86/kernel/cpu/mce/internal.h @@ -261,6 +261,8 @@ enum mca_msr { /* Decide whether to add MCE record to MCE event pool or filter it out. */ extern bool filter_mce(struct mce *m); +void mce_prep_record_common(struct mce *m); +void mce_prep_record_per_cpu(unsigned int cpu, struct mce *m); #ifdef CONFIG_X86_MCE_AMD extern bool amd_filter_mce(struct mce *m); diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c index c0d56c02b8da..f63b051f25a0 100644 --- a/arch/x86/kernel/cpu/microcode/amd.c +++ b/arch/x86/kernel/cpu/microcode/amd.c @@ -89,6 +89,31 @@ static struct equiv_cpu_table { struct equiv_cpu_entry *entry; } equiv_table; +union zen_patch_rev { + struct { + __u32 rev : 8, + stepping : 4, + model : 4, + __reserved : 4, + ext_model : 4, + ext_fam : 8; + }; + __u32 ucode_rev; +}; + +union cpuid_1_eax { + struct { + __u32 stepping : 4, + model : 4, + family : 4, + __reserved0 : 4, + ext_model : 4, + ext_fam : 8, + __reserved1 : 4; + }; + __u32 full; +}; + /* * This points to the current valid container of microcode patches which we will * save from the initrd/builtin before jettisoning its contents. @mc is the @@ -96,7 +121,6 @@ static struct equiv_cpu_table { */ struct cont_desc { struct microcode_amd *mc; - u32 cpuid_1_eax; u32 psize; u8 *data; size_t size; @@ -109,10 +133,42 @@ struct cont_desc { static const char ucode_path[] __maybe_unused = "kernel/x86/microcode/AuthenticAMD.bin"; +/* + * This is CPUID(1).EAX on the BSP. It is used in two ways: + * + * 1. To ignore the equivalence table on Zen1 and newer. + * + * 2. To match which patches to load because the patch revision ID + * already contains the f/m/s for which the microcode is destined + * for. + */ +static u32 bsp_cpuid_1_eax __ro_after_init; + +static union cpuid_1_eax ucode_rev_to_cpuid(unsigned int val) +{ + union zen_patch_rev p; + union cpuid_1_eax c; + + p.ucode_rev = val; + c.full = 0; + + c.stepping = p.stepping; + c.model = p.model; + c.ext_model = p.ext_model; + c.family = 0xf; + c.ext_fam = p.ext_fam; + + return c; +} + static u16 find_equiv_id(struct equiv_cpu_table *et, u32 sig) { unsigned int i; + /* Zen and newer do not need an equivalence table. */ + if (x86_family(bsp_cpuid_1_eax) >= 0x17) + return 0; + if (!et || !et->num_entries) return 0; @@ -159,6 +215,10 @@ static bool verify_equivalence_table(const u8 *buf, size_t buf_size) if (!verify_container(buf, buf_size)) return false; + /* Zen and newer do not need an equivalence table. */ + if (x86_family(bsp_cpuid_1_eax) >= 0x17) + return true; + cont_type = hdr[1]; if (cont_type != UCODE_EQUIV_CPU_TABLE_TYPE) { pr_debug("Wrong microcode container equivalence table type: %u.\n", @@ -222,8 +282,9 @@ __verify_patch_section(const u8 *buf, size_t buf_size, u32 *sh_psize) * exceed the per-family maximum). @sh_psize is the size read from the section * header. */ -static unsigned int __verify_patch_size(u8 family, u32 sh_psize, size_t buf_size) +static unsigned int __verify_patch_size(u32 sh_psize, size_t buf_size) { + u8 family = x86_family(bsp_cpuid_1_eax); u32 max_size; if (family >= 0x15) @@ -258,9 +319,9 @@ static unsigned int __verify_patch_size(u8 family, u32 sh_psize, size_t buf_size * positive: patch is not for this family, skip it * 0: success */ -static int -verify_patch(u8 family, const u8 *buf, size_t buf_size, u32 *patch_size) +static int verify_patch(const u8 *buf, size_t buf_size, u32 *patch_size) { + u8 family = x86_family(bsp_cpuid_1_eax); struct microcode_header_amd *mc_hdr; unsigned int ret; u32 sh_psize; @@ -286,7 +347,7 @@ verify_patch(u8 family, const u8 *buf, size_t buf_size, u32 *patch_size) return -1; } - ret = __verify_patch_size(family, sh_psize, buf_size); + ret = __verify_patch_size(sh_psize, buf_size); if (!ret) { pr_debug("Per-family patch size mismatch.\n"); return -1; @@ -308,6 +369,15 @@ verify_patch(u8 family, const u8 *buf, size_t buf_size, u32 *patch_size) return 0; } +static bool mc_patch_matches(struct microcode_amd *mc, u16 eq_id) +{ + /* Zen and newer do not need an equivalence table. */ + if (x86_family(bsp_cpuid_1_eax) >= 0x17) + return ucode_rev_to_cpuid(mc->hdr.patch_id).full == bsp_cpuid_1_eax; + else + return eq_id == mc->hdr.processor_rev_id; +} + /* * This scans the ucode blob for the proper container as we can have multiple * containers glued together. Returns the equivalence ID from the equivalence @@ -336,7 +406,7 @@ static size_t parse_container(u8 *ucode, size_t size, struct cont_desc *desc) * doesn't contain a patch for the CPU, scan through the whole container * so that it can be skipped in case there are other containers appended. */ - eq_id = find_equiv_id(&table, desc->cpuid_1_eax); + eq_id = find_equiv_id(&table, bsp_cpuid_1_eax); buf += hdr[2] + CONTAINER_HDR_SZ; size -= hdr[2] + CONTAINER_HDR_SZ; @@ -350,7 +420,7 @@ static size_t parse_container(u8 *ucode, size_t size, struct cont_desc *desc) u32 patch_size; int ret; - ret = verify_patch(x86_family(desc->cpuid_1_eax), buf, size, &patch_size); + ret = verify_patch(buf, size, &patch_size); if (ret < 0) { /* * Patch verification failed, skip to the next container, if @@ -363,7 +433,7 @@ static size_t parse_container(u8 *ucode, size_t size, struct cont_desc *desc) } mc = (struct microcode_amd *)(buf + SECTION_HDR_SIZE); - if (eq_id == mc->hdr.processor_rev_id) { + if (mc_patch_matches(mc, eq_id)) { desc->psize = patch_size; desc->mc = mc; } @@ -421,6 +491,7 @@ static int __apply_microcode_amd(struct microcode_amd *mc) /* verify patch application was successful */ native_rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy); + if (rev != mc->hdr.patch_id) return -1; @@ -438,14 +509,12 @@ static int __apply_microcode_amd(struct microcode_amd *mc) * * Returns true if container found (sets @desc), false otherwise. */ -static bool early_apply_microcode(u32 cpuid_1_eax, u32 old_rev, void *ucode, size_t size) +static bool early_apply_microcode(u32 old_rev, void *ucode, size_t size) { struct cont_desc desc = { 0 }; struct microcode_amd *mc; bool ret = false; - desc.cpuid_1_eax = cpuid_1_eax; - scan_containers(ucode, size, &desc); mc = desc.mc; @@ -463,9 +532,10 @@ static bool early_apply_microcode(u32 cpuid_1_eax, u32 old_rev, void *ucode, siz return !__apply_microcode_amd(mc); } -static bool get_builtin_microcode(struct cpio_data *cp, u8 family) +static bool get_builtin_microcode(struct cpio_data *cp) { char fw_name[36] = "amd-ucode/microcode_amd.bin"; + u8 family = x86_family(bsp_cpuid_1_eax); struct firmware fw; if (IS_ENABLED(CONFIG_X86_32)) @@ -484,11 +554,11 @@ static bool get_builtin_microcode(struct cpio_data *cp, u8 family) return false; } -static void __init find_blobs_in_containers(unsigned int cpuid_1_eax, struct cpio_data *ret) +static void __init find_blobs_in_containers(struct cpio_data *ret) { struct cpio_data cp; - if (!get_builtin_microcode(&cp, x86_family(cpuid_1_eax))) + if (!get_builtin_microcode(&cp)) cp = find_microcode_in_initrd(ucode_path); *ret = cp; @@ -499,16 +569,18 @@ void __init load_ucode_amd_bsp(struct early_load_data *ed, unsigned int cpuid_1_ struct cpio_data cp = { }; u32 dummy; + bsp_cpuid_1_eax = cpuid_1_eax; + native_rdmsr(MSR_AMD64_PATCH_LEVEL, ed->old_rev, dummy); /* Needed in load_microcode_amd() */ ucode_cpu_info[0].cpu_sig.sig = cpuid_1_eax; - find_blobs_in_containers(cpuid_1_eax, &cp); + find_blobs_in_containers(&cp); if (!(cp.data && cp.size)) return; - if (early_apply_microcode(cpuid_1_eax, ed->old_rev, cp.data, cp.size)) + if (early_apply_microcode(ed->old_rev, cp.data, cp.size)) native_rdmsr(MSR_AMD64_PATCH_LEVEL, ed->new_rev, dummy); } @@ -525,12 +597,10 @@ static int __init save_microcode_in_initrd(void) if (dis_ucode_ldr || c->x86_vendor != X86_VENDOR_AMD || c->x86 < 0x10) return 0; - find_blobs_in_containers(cpuid_1_eax, &cp); + find_blobs_in_containers(&cp); if (!(cp.data && cp.size)) return -EINVAL; - desc.cpuid_1_eax = cpuid_1_eax; - scan_containers(cp.data, cp.size, &desc); if (!desc.mc) return -EINVAL; @@ -543,26 +613,65 @@ static int __init save_microcode_in_initrd(void) } early_initcall(save_microcode_in_initrd); +static inline bool patch_cpus_equivalent(struct ucode_patch *p, struct ucode_patch *n) +{ + /* Zen and newer hardcode the f/m/s in the patch ID */ + if (x86_family(bsp_cpuid_1_eax) >= 0x17) { + union cpuid_1_eax p_cid = ucode_rev_to_cpuid(p->patch_id); + union cpuid_1_eax n_cid = ucode_rev_to_cpuid(n->patch_id); + + /* Zap stepping */ + p_cid.stepping = 0; + n_cid.stepping = 0; + + return p_cid.full == n_cid.full; + } else { + return p->equiv_cpu == n->equiv_cpu; + } +} + /* * a small, trivial cache of per-family ucode patches */ -static struct ucode_patch *cache_find_patch(u16 equiv_cpu) +static struct ucode_patch *cache_find_patch(struct ucode_cpu_info *uci, u16 equiv_cpu) { struct ucode_patch *p; + struct ucode_patch n; + + n.equiv_cpu = equiv_cpu; + n.patch_id = uci->cpu_sig.rev; + + WARN_ON_ONCE(!n.patch_id); list_for_each_entry(p, µcode_cache, plist) - if (p->equiv_cpu == equiv_cpu) + if (patch_cpus_equivalent(p, &n)) return p; + return NULL; } +static inline bool patch_newer(struct ucode_patch *p, struct ucode_patch *n) +{ + /* Zen and newer hardcode the f/m/s in the patch ID */ + if (x86_family(bsp_cpuid_1_eax) >= 0x17) { + union zen_patch_rev zp, zn; + + zp.ucode_rev = p->patch_id; + zn.ucode_rev = n->patch_id; + + return zn.rev > zp.rev; + } else { + return n->patch_id > p->patch_id; + } +} + static void update_cache(struct ucode_patch *new_patch) { struct ucode_patch *p; list_for_each_entry(p, µcode_cache, plist) { - if (p->equiv_cpu == new_patch->equiv_cpu) { - if (p->patch_id >= new_patch->patch_id) { + if (patch_cpus_equivalent(p, new_patch)) { + if (!patch_newer(p, new_patch)) { /* we already have the latest patch */ kfree(new_patch->data); kfree(new_patch); @@ -593,13 +702,22 @@ static void free_cache(void) static struct ucode_patch *find_patch(unsigned int cpu) { struct ucode_cpu_info *uci = ucode_cpu_info + cpu; - u16 equiv_id; + u32 rev, dummy __always_unused; + u16 equiv_id = 0; - equiv_id = find_equiv_id(&equiv_table, uci->cpu_sig.sig); - if (!equiv_id) - return NULL; + /* fetch rev if not populated yet: */ + if (!uci->cpu_sig.rev) { + rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy); + uci->cpu_sig.rev = rev; + } + + if (x86_family(bsp_cpuid_1_eax) < 0x17) { + equiv_id = find_equiv_id(&equiv_table, uci->cpu_sig.sig); + if (!equiv_id) + return NULL; + } - return cache_find_patch(equiv_id); + return cache_find_patch(uci, equiv_id); } void reload_ucode_amd(unsigned int cpu) @@ -649,7 +767,7 @@ static enum ucode_state apply_microcode_amd(int cpu) struct ucode_cpu_info *uci; struct ucode_patch *p; enum ucode_state ret; - u32 rev, dummy __always_unused; + u32 rev; BUG_ON(raw_smp_processor_id() != cpu); @@ -659,11 +777,11 @@ static enum ucode_state apply_microcode_amd(int cpu) if (!p) return UCODE_NFOUND; + rev = uci->cpu_sig.rev; + mc_amd = p->data; uci->mc = p->data; - rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy); - /* need to apply patch? */ if (rev > mc_amd->hdr.patch_id) { ret = UCODE_OK; @@ -709,6 +827,10 @@ static size_t install_equiv_cpu_table(const u8 *buf, size_t buf_size) hdr = (const u32 *)buf; equiv_tbl_len = hdr[2]; + /* Zen and newer do not need an equivalence table. */ + if (x86_family(bsp_cpuid_1_eax) >= 0x17) + goto out; + equiv_table.entry = vmalloc(equiv_tbl_len); if (!equiv_table.entry) { pr_err("failed to allocate equivalent CPU table\n"); @@ -718,12 +840,16 @@ static size_t install_equiv_cpu_table(const u8 *buf, size_t buf_size) memcpy(equiv_table.entry, buf + CONTAINER_HDR_SZ, equiv_tbl_len); equiv_table.num_entries = equiv_tbl_len / sizeof(struct equiv_cpu_entry); +out: /* add header length */ return equiv_tbl_len + CONTAINER_HDR_SZ; } static void free_equiv_cpu_table(void) { + if (x86_family(bsp_cpuid_1_eax) >= 0x17) + return; + vfree(equiv_table.entry); memset(&equiv_table, 0, sizeof(equiv_table)); } @@ -749,7 +875,7 @@ static int verify_and_add_patch(u8 family, u8 *fw, unsigned int leftover, u16 proc_id; int ret; - ret = verify_patch(family, fw, leftover, patch_size); + ret = verify_patch(fw, leftover, patch_size); if (ret) return ret; @@ -774,7 +900,7 @@ static int verify_and_add_patch(u8 family, u8 *fw, unsigned int leftover, patch->patch_id = mc_hdr->patch_id; patch->equiv_cpu = proc_id; - pr_debug("%s: Added patch_id: 0x%08x, proc_id: 0x%04x\n", + pr_debug("%s: Adding patch_id: 0x%08x, proc_id: 0x%04x\n", __func__, patch->patch_id, proc_id); /* ... and add to cache. */ diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c index e0fd57a8ba84..ead967479fa6 100644 --- a/arch/x86/kernel/cpu/mshyperv.c +++ b/arch/x86/kernel/cpu/mshyperv.c @@ -199,8 +199,8 @@ static void hv_machine_shutdown(void) * Call hv_cpu_die() on all the CPUs, otherwise later the hypervisor * corrupts the old VP Assist Pages and can crash the kexec kernel. */ - if (kexec_in_progress && hyperv_init_cpuhp > 0) - cpuhp_remove_state(hyperv_init_cpuhp); + if (kexec_in_progress) + cpuhp_remove_state(CPUHP_AP_HYPERV_ONLINE); /* The function calls stop_other_cpus(). */ native_machine_shutdown(); @@ -424,6 +424,7 @@ static void __init ms_hyperv_init_platform(void) ms_hyperv.misc_features & HV_FEATURE_FREQUENCY_MSRS_AVAILABLE) { x86_platform.calibrate_tsc = hv_get_tsc_khz; x86_platform.calibrate_cpu = hv_get_tsc_khz; + setup_force_cpu_cap(X86_FEATURE_TSC_KNOWN_FREQ); } if (ms_hyperv.priv_high & HV_ISOLATION) { @@ -449,9 +450,23 @@ static void __init ms_hyperv_init_platform(void) ms_hyperv.hints &= ~HV_X64_APIC_ACCESS_RECOMMENDED; if (!ms_hyperv.paravisor_present) { - /* To be supported: more work is required. */ + /* + * Mark the Hyper-V TSC page feature as disabled + * in a TDX VM without paravisor so that the + * Invariant TSC, which is a better clocksource + * anyway, is used instead. + */ ms_hyperv.features &= ~HV_MSR_REFERENCE_TSC_AVAILABLE; + /* + * The Invariant TSC is expected to be available + * in a TDX VM without paravisor, but if not, + * print a warning message. The slower Hyper-V MSR-based + * Ref Counter should end up being the clocksource. + */ + if (!(ms_hyperv.features & HV_ACCESS_TSC_INVARIANT)) + pr_warn("Hyper-V: Invariant TSC is unavailable\n"); + /* HV_MSR_CRASH_CTL is unsupported. */ ms_hyperv.misc_features &= ~HV_FEATURE_GUEST_CRASH_MSR_AVAILABLE; diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c index 1930fce9dfe9..8591d53c144b 100644 --- a/arch/x86/kernel/cpu/resctrl/core.c +++ b/arch/x86/kernel/cpu/resctrl/core.c @@ -119,6 +119,14 @@ struct rdt_hw_resource rdt_resources_all[] = { }, }; +u32 resctrl_arch_system_num_rmid_idx(void) +{ + struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl; + + /* RMID are independent numbers for x86. num_rmid_idx == num_rmid */ + return r->num_rmid; +} + /* * cache_alloc_hsw_probe() - Have to probe for Intel haswell server CPUs * as they do not have CPUID enumeration support for Cache allocation. diff --git a/arch/x86/kernel/cpu/sgx/main.c b/arch/x86/kernel/cpu/sgx/main.c index 27892e57c4ef..f3f1461273ee 100644 --- a/arch/x86/kernel/cpu/sgx/main.c +++ b/arch/x86/kernel/cpu/sgx/main.c @@ -475,24 +475,25 @@ struct sgx_epc_page *__sgx_alloc_epc_page(void) { struct sgx_epc_page *page; int nid_of_current = numa_node_id(); - int nid = nid_of_current; + int nid_start, nid; - if (node_isset(nid_of_current, sgx_numa_mask)) { - page = __sgx_alloc_epc_page_from_node(nid_of_current); - if (page) - return page; - } - - /* Fall back to the non-local NUMA nodes: */ - while (true) { - nid = next_node_in(nid, sgx_numa_mask); - if (nid == nid_of_current) - break; + /* + * Try local node first. If it doesn't have an EPC section, + * fall back to the non-local NUMA nodes. + */ + if (node_isset(nid_of_current, sgx_numa_mask)) + nid_start = nid_of_current; + else + nid_start = next_node_in(nid_of_current, sgx_numa_mask); + nid = nid_start; + do { page = __sgx_alloc_epc_page_from_node(nid); if (page) return page; - } + + nid = next_node_in(nid, sgx_numa_mask); + } while (nid != nid_start); return ERR_PTR(-ENOMEM); } @@ -847,6 +848,13 @@ static bool __init sgx_page_cache_init(void) return false; } + for_each_online_node(nid) { + if (!node_isset(nid, sgx_numa_mask) && + node_state(nid, N_MEMORY) && node_state(nid, N_CPU)) + pr_info("node%d has both CPUs and memory but doesn't have an EPC section\n", + nid); + } + return true; } diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index c5a026fee5e0..1339f8328db5 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -788,6 +788,9 @@ void __init fpu__init_system_xstate(unsigned int legacy_size) goto out_disable; } + fpu_kernel_cfg.independent_features = fpu_kernel_cfg.max_features & + XFEATURE_MASK_INDEPENDENT; + /* * Clear XSAVE features that are disabled in the normal CPUID. */ diff --git a/arch/x86/kernel/fpu/xstate.h b/arch/x86/kernel/fpu/xstate.h index 2ee0b9c53dcc..afb404cd2059 100644 --- a/arch/x86/kernel/fpu/xstate.h +++ b/arch/x86/kernel/fpu/xstate.h @@ -62,9 +62,9 @@ static inline u64 xfeatures_mask_supervisor(void) static inline u64 xfeatures_mask_independent(void) { if (!cpu_feature_enabled(X86_FEATURE_ARCH_LBR)) - return XFEATURE_MASK_INDEPENDENT & ~XFEATURE_MASK_LBR; + return fpu_kernel_cfg.independent_features & ~XFEATURE_MASK_LBR; - return XFEATURE_MASK_INDEPENDENT; + return fpu_kernel_cfg.independent_features; } /* XSAVE/XRSTOR wrapper functions */ diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index 472a1537b7a9..730c2f34d347 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -19,7 +19,6 @@ if VIRTUALIZATION config KVM tristate "Kernel-based Virtual Machine (KVM) support" - depends on HIGH_RES_TIMERS depends on X86_LOCAL_APIC select KVM_COMMON select KVM_GENERIC_MMU_NOTIFIER @@ -144,8 +143,10 @@ config KVM_AMD_SEV select HAVE_KVM_ARCH_GMEM_PREPARE select HAVE_KVM_ARCH_GMEM_INVALIDATE help - Provides support for launching Encrypted VMs (SEV) and Encrypted VMs - with Encrypted State (SEV-ES) on AMD processors. + Provides support for launching encrypted VMs which use Secure + Encrypted Virtualization (SEV), Secure Encrypted Virtualization with + Encrypted State (SEV-ES), and Secure Encrypted Virtualization with + Secure Nested Paging (SEV-SNP) technologies on AMD processors. config KVM_SMM bool "System Management Mode emulation" diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 928cf84778b0..7813d28b082f 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -4674,16 +4674,14 @@ out_unlock: bool kvm_mmu_may_ignore_guest_pat(void) { /* - * When EPT is enabled (shadow_memtype_mask is non-zero), the CPU does - * not support self-snoop (or is affected by an erratum), and the VM + * When EPT is enabled (shadow_memtype_mask is non-zero), and the VM * has non-coherent DMA (DMA doesn't snoop CPU caches), KVM's ABI is to * honor the memtype from the guest's PAT so that guest accesses to * memory that is DMA'd aren't cached against the guest's wishes. As a * result, KVM _may_ ignore guest PAT, whereas without non-coherent DMA, - * KVM _always_ ignores or honors guest PAT, i.e. doesn't toggle SPTE - * bits in response to non-coherent device (un)registration. + * KVM _always_ ignores guest PAT (when EPT is enabled). */ - return !static_cpu_has(X86_FEATURE_SELFSNOOP) && shadow_memtype_mask; + return shadow_memtype_mask; } int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) @@ -4750,7 +4748,9 @@ long kvm_arch_vcpu_pre_fault_memory(struct kvm_vcpu *vcpu, * reload is efficient when called repeatedly, so we can do it on * every iteration. */ - kvm_mmu_reload(vcpu); + r = kvm_mmu_reload(vcpu); + if (r) + return r; if (kvm_arch_has_private_mem(vcpu->kvm) && kvm_mem_is_private(vcpu->kvm, gpa_to_gfn(range->gpa))) diff --git a/arch/x86/kvm/mmu/spte.c b/arch/x86/kvm/mmu/spte.c index d4527965e48c..8f7eb3ad88fc 100644 --- a/arch/x86/kvm/mmu/spte.c +++ b/arch/x86/kvm/mmu/spte.c @@ -391,9 +391,9 @@ void kvm_mmu_set_mmio_spte_mask(u64 mmio_value, u64 mmio_mask, u64 access_mask) mmio_value = 0; /* - * The masked MMIO value must obviously match itself and a removed SPTE - * must not get a false positive. Removed SPTEs and MMIO SPTEs should - * never collide as MMIO must set some RWX bits, and removed SPTEs must + * The masked MMIO value must obviously match itself and a frozen SPTE + * must not get a false positive. Frozen SPTEs and MMIO SPTEs should + * never collide as MMIO must set some RWX bits, and frozen SPTEs must * not set any RWX bits. */ if (WARN_ON((mmio_value & mmio_mask) != mmio_value) || diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h index ef793c459b05..2cb816ea2430 100644 --- a/arch/x86/kvm/mmu/spte.h +++ b/arch/x86/kvm/mmu/spte.h @@ -214,7 +214,7 @@ extern u64 __read_mostly shadow_nonpresent_or_rsvd_mask; */ #define FROZEN_SPTE (SHADOW_NONPRESENT_VALUE | 0x5a0ULL) -/* Removed SPTEs must not be misconstrued as shadow present PTEs. */ +/* Frozen SPTEs must not be misconstrued as shadow present PTEs. */ static_assert(!(FROZEN_SPTE & SPTE_MMU_PRESENT_MASK)); static inline bool is_frozen_spte(u64 spte) diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index c7dc49ee7388..3c55955bcaf8 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -359,10 +359,10 @@ static void handle_removed_pt(struct kvm *kvm, tdp_ptep_t pt, bool shared) /* * Set the SPTE to a nonpresent value that other * threads will not overwrite. If the SPTE was - * already marked as removed then another thread + * already marked as frozen then another thread * handling a page fault could overwrite it, so * set the SPTE until it is set from some other - * value to the removed SPTE value. + * value to the frozen SPTE value. */ for (;;) { old_spte = kvm_tdp_mmu_write_spte_atomic(sptep, FROZEN_SPTE); @@ -536,8 +536,8 @@ static inline int __must_check __tdp_mmu_set_spte_atomic(struct tdp_iter *iter, u64 *sptep = rcu_dereference(iter->sptep); /* - * The caller is responsible for ensuring the old SPTE is not a REMOVED - * SPTE. KVM should never attempt to zap or manipulate a REMOVED SPTE, + * The caller is responsible for ensuring the old SPTE is not a FROZEN + * SPTE. KVM should never attempt to zap or manipulate a FROZEN SPTE, * and pre-checking before inserting a new SPTE is advantageous as it * avoids unnecessary work. */ diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index d6f252555ab3..5ab2c92c7331 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -2876,6 +2876,12 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_CSTAR: msr_info->data = svm->vmcb01.ptr->save.cstar; break; + case MSR_GS_BASE: + msr_info->data = svm->vmcb01.ptr->save.gs.base; + break; + case MSR_FS_BASE: + msr_info->data = svm->vmcb01.ptr->save.fs.base; + break; case MSR_KERNEL_GS_BASE: msr_info->data = svm->vmcb01.ptr->save.kernel_gs_base; break; @@ -3101,6 +3107,12 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) case MSR_CSTAR: svm->vmcb01.ptr->save.cstar = data; break; + case MSR_GS_BASE: + svm->vmcb01.ptr->save.gs.base = data; + break; + case MSR_FS_BASE: + svm->vmcb01.ptr->save.fs.base = data; + break; case MSR_KERNEL_GS_BASE: svm->vmcb01.ptr->save.kernel_gs_base = data; break; @@ -5224,6 +5236,9 @@ static __init void svm_set_cpu_caps(void) /* CPUID 0x8000001F (SME/SEV features) */ sev_set_cpu_caps(); + + /* Don't advertise Bus Lock Detect to guest if SVM support is absent */ + kvm_cpu_cap_clear(X86_FEATURE_BUS_LOCK_DETECT); } static __init int svm_hardware_setup(void) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index f18c2d8c7476..733a0c45d1a6 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -7659,13 +7659,11 @@ u8 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio) /* * Force WB and ignore guest PAT if the VM does NOT have a non-coherent - * device attached and the CPU doesn't support self-snoop. Letting the - * guest control memory types on Intel CPUs without self-snoop may - * result in unexpected behavior, and so KVM's (historical) ABI is to - * trust the guest to behave only as a last resort. + * device attached. Letting the guest control memory types on Intel + * CPUs may result in unexpected behavior, and so KVM's ABI is to trust + * the guest to behave only as a last resort. */ - if (!static_cpu_has(X86_FEATURE_SELFSNOOP) && - !kvm_arch_has_noncoherent_dma(vcpu->kvm)) + if (!kvm_arch_has_noncoherent_dma(vcpu->kvm)) return (MTRR_TYPE_WRBACK << VMX_EPT_MT_EPTE_SHIFT) | VMX_EPT_IPAT_BIT; return (MTRR_TYPE_WRBACK << VMX_EPT_MT_EPTE_SHIFT); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 70219e406987..c983c8e434b8 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4656,7 +4656,6 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_ASYNC_PF_INT: case KVM_CAP_GET_TSC_KHZ: case KVM_CAP_KVMCLOCK_CTRL: - case KVM_CAP_READONLY_MEM: case KVM_CAP_IOAPIC_POLARITY_IGNORED: case KVM_CAP_TSC_DEADLINE_TIMER: case KVM_CAP_DISABLE_QUIRKS: @@ -4815,6 +4814,9 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_VM_TYPES: r = kvm_caps.supported_vm_types; break; + case KVM_CAP_READONLY_MEM: + r = kvm ? kvm_arch_has_readonly_mem(kvm) : 1; + break; default: break; } @@ -6040,7 +6042,9 @@ long kvm_arch_vcpu_ioctl(struct file *filp, if (copy_from_user(&events, argp, sizeof(struct kvm_vcpu_events))) break; + kvm_vcpu_srcu_read_lock(vcpu); r = kvm_vcpu_ioctl_x86_set_vcpu_events(vcpu, &events); + kvm_vcpu_srcu_read_unlock(vcpu); break; } case KVM_GET_DEBUGREGS: { diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index d8dbeac8b206..ff253648706f 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -958,8 +958,12 @@ static void update_end_of_memory_vars(u64 start, u64 size) int add_pages(int nid, unsigned long start_pfn, unsigned long nr_pages, struct mhp_params *params) { + unsigned long end = ((start_pfn + nr_pages) << PAGE_SHIFT) - 1; int ret; + if (WARN_ON_ONCE(end > PHYSMEM_END)) + return -ERANGE; + ret = __add_pages(nid, start_pfn, nr_pages, params); WARN_ON_ONCE(ret); diff --git a/arch/x86/mm/kaslr.c b/arch/x86/mm/kaslr.c index 37db264866b6..230f1dee4f09 100644 --- a/arch/x86/mm/kaslr.c +++ b/arch/x86/mm/kaslr.c @@ -47,13 +47,24 @@ static const unsigned long vaddr_end = CPU_ENTRY_AREA_BASE; */ static __initdata struct kaslr_memory_region { unsigned long *base; + unsigned long *end; unsigned long size_tb; } kaslr_regions[] = { - { &page_offset_base, 0 }, - { &vmalloc_base, 0 }, - { &vmemmap_base, 0 }, + { + .base = &page_offset_base, + .end = &physmem_end, + }, + { + .base = &vmalloc_base, + }, + { + .base = &vmemmap_base, + }, }; +/* The end of the possible address space for physical memory */ +unsigned long physmem_end __ro_after_init; + /* Get size in bytes used by the memory region */ static inline unsigned long get_padding(struct kaslr_memory_region *region) { @@ -82,6 +93,8 @@ void __init kernel_randomize_memory(void) BUILD_BUG_ON(vaddr_end != CPU_ENTRY_AREA_BASE); BUILD_BUG_ON(vaddr_end > __START_KERNEL_map); + /* Preset the end of the possible address space for physical memory */ + physmem_end = ((1ULL << MAX_PHYSMEM_BITS) - 1); if (!kaslr_memory_enabled()) return; @@ -128,11 +141,18 @@ void __init kernel_randomize_memory(void) vaddr += entropy; *kaslr_regions[i].base = vaddr; + /* Calculate the end of the region */ + vaddr += get_padding(&kaslr_regions[i]); /* - * Jump the region and add a minimum padding based on - * randomization alignment. + * KASLR trims the maximum possible size of the + * direct-map. Update the physmem_end boundary. + * No rounding required as the region starts + * PUD aligned and size is in units of TB. */ - vaddr += get_padding(&kaslr_regions[i]); + if (kaslr_regions[i].end) + *kaslr_regions[i].end = __pa_nodebug(vaddr - 1); + + /* Add a minimum padding based on randomization alignment. */ vaddr = round_up(vaddr + 1, PUD_SIZE); remain_entropy -= entropy; } diff --git a/arch/xtensa/Kconfig b/arch/xtensa/Kconfig index f200a4ec044e..d3db28f2f811 100644 --- a/arch/xtensa/Kconfig +++ b/arch/xtensa/Kconfig @@ -14,6 +14,7 @@ config XTENSA select ARCH_HAS_DMA_SET_UNCACHED if MMU select ARCH_HAS_STRNCPY_FROM_USER if !KASAN select ARCH_HAS_STRNLEN_USER + select ARCH_NEED_CMPXCHG_1_EMU select ARCH_USE_MEMTEST select ARCH_USE_QUEUED_RWLOCKS select ARCH_USE_QUEUED_SPINLOCKS diff --git a/arch/xtensa/include/asm/cmpxchg.h b/arch/xtensa/include/asm/cmpxchg.h index 675a11ea8de7..95e33a913962 100644 --- a/arch/xtensa/include/asm/cmpxchg.h +++ b/arch/xtensa/include/asm/cmpxchg.h @@ -15,6 +15,7 @@ #include <linux/bits.h> #include <linux/stringify.h> +#include <linux/cmpxchg-emu.h> /* * cmpxchg @@ -74,6 +75,7 @@ static __inline__ unsigned long __cmpxchg(volatile void *ptr, unsigned long old, unsigned long new, int size) { switch (size) { + case 1: return cmpxchg_emu_u8(ptr, old, new); case 4: return __cmpxchg_u32(ptr, old, new); default: __cmpxchg_called_with_bad_pointer(); return old; |
