diff options
Diffstat (limited to 'arch/x86')
62 files changed, 964 insertions, 448 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 007bab9f2a0e..6f1c31f4b9ab 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1889,6 +1889,10 @@ config X86_INTEL_MEMORY_PROTECTION_KEYS If unsure, say y. +config ARCH_PKEY_BITS + int + default 4 + choice prompt "TSX enable mode" depends on CPU_SUP_INTEL @@ -2610,24 +2614,15 @@ config MITIGATION_SLS against straight line speculation. The kernel image might be slightly larger. -config MITIGATION_GDS_FORCE - bool "Force GDS Mitigation" +config MITIGATION_GDS + bool "Mitigate Gather Data Sampling" depends on CPU_SUP_INTEL - default n + default y help - Gather Data Sampling (GDS) is a hardware vulnerability which allows - unprivileged speculative access to data which was previously stored in - vector registers. - - This option is equivalent to setting gather_data_sampling=force on the - command line. The microcode mitigation is used if present, otherwise - AVX is disabled as a mitigation. On affected systems that are missing - the microcode any userspace code that unconditionally uses AVX will - break with this option set. - - Setting this option on systems not vulnerable to GDS has no effect. - - If in doubt, say N. + Enable mitigation for Gather Data Sampling (GDS). GDS is a hardware + vulnerability which allows unprivileged speculative access to data + which was previously stored in vector registers. The attacker uses gather + instructions to infer the stale vector register data. config MITIGATION_RFDS bool "RFDS Mitigation" @@ -2650,6 +2645,107 @@ config MITIGATION_SPECTRE_BHI indirect branches. See <file:Documentation/admin-guide/hw-vuln/spectre.rst> +config MITIGATION_MDS + bool "Mitigate Microarchitectural Data Sampling (MDS) hardware bug" + depends on CPU_SUP_INTEL + default y + help + Enable mitigation for Microarchitectural Data Sampling (MDS). MDS is + a hardware vulnerability which allows unprivileged speculative access + to data which is available in various CPU internal buffers. + See also <file:Documentation/admin-guide/hw-vuln/mds.rst> + +config MITIGATION_TAA + bool "Mitigate TSX Asynchronous Abort (TAA) hardware bug" + depends on CPU_SUP_INTEL + default y + help + Enable mitigation for TSX Asynchronous Abort (TAA). TAA is a hardware + vulnerability that allows unprivileged speculative access to data + which is available in various CPU internal buffers by using + asynchronous aborts within an Intel TSX transactional region. + See also <file:Documentation/admin-guide/hw-vuln/tsx_async_abort.rst> + +config MITIGATION_MMIO_STALE_DATA + bool "Mitigate MMIO Stale Data hardware bug" + depends on CPU_SUP_INTEL + default y + help + Enable mitigation for MMIO Stale Data hardware bugs. Processor MMIO + Stale Data Vulnerabilities are a class of memory-mapped I/O (MMIO) + vulnerabilities that can expose data. The vulnerabilities require the + attacker to have access to MMIO. + See also + <file:Documentation/admin-guide/hw-vuln/processor_mmio_stale_data.rst> + +config MITIGATION_L1TF + bool "Mitigate L1 Terminal Fault (L1TF) hardware bug" + depends on CPU_SUP_INTEL + default y + help + Mitigate L1 Terminal Fault (L1TF) hardware bug. L1 Terminal Fault is a + hardware vulnerability which allows unprivileged speculative access to data + available in the Level 1 Data Cache. + See <file:Documentation/admin-guide/hw-vuln/l1tf.rst + +config MITIGATION_RETBLEED + bool "Mitigate RETBleed hardware bug" + depends on (CPU_SUP_INTEL && MITIGATION_SPECTRE_V2) || MITIGATION_UNRET_ENTRY || MITIGATION_IBPB_ENTRY + default y + help + Enable mitigation for RETBleed (Arbitrary Speculative Code Execution + with Return Instructions) vulnerability. RETBleed is a speculative + execution attack which takes advantage of microarchitectural behavior + in many modern microprocessors, similar to Spectre v2. An + unprivileged attacker can use these flaws to bypass conventional + memory security restrictions to gain read access to privileged memory + that would otherwise be inaccessible. + +config MITIGATION_SPECTRE_V1 + bool "Mitigate SPECTRE V1 hardware bug" + default y + help + Enable mitigation for Spectre V1 (Bounds Check Bypass). Spectre V1 is a + class of side channel attacks that takes advantage of speculative + execution that bypasses conditional branch instructions used for + memory access bounds check. + See also <file:Documentation/admin-guide/hw-vuln/spectre.rst> + +config MITIGATION_SPECTRE_V2 + bool "Mitigate SPECTRE V2 hardware bug" + default y + help + Enable mitigation for Spectre V2 (Branch Target Injection). Spectre + V2 is a class of side channel attacks that takes advantage of + indirect branch predictors inside the processor. In Spectre variant 2 + attacks, the attacker can steer speculative indirect branches in the + victim to gadget code by poisoning the branch target buffer of a CPU + used for predicting indirect branch addresses. + See also <file:Documentation/admin-guide/hw-vuln/spectre.rst> + +config MITIGATION_SRBDS + bool "Mitigate Special Register Buffer Data Sampling (SRBDS) hardware bug" + depends on CPU_SUP_INTEL + default y + help + Enable mitigation for Special Register Buffer Data Sampling (SRBDS). + SRBDS is a hardware vulnerability that allows Microarchitectural Data + Sampling (MDS) techniques to infer values returned from special + register accesses. An unprivileged user can extract values returned + from RDRAND and RDSEED executed on another core or sibling thread + using MDS techniques. + See also + <file:Documentation/admin-guide/hw-vuln/special-register-buffer-data-sampling.rst> + +config MITIGATION_SSB + bool "Mitigate Speculative Store Bypass (SSB) hardware bug" + default y + help + Enable mitigation for Speculative Store Bypass (SSB). SSB is a + hardware security vulnerability and its exploitation takes advantage + of speculative execution in a similar way to the Meltdown and Spectre + security vulnerabilities. + endif config ARCH_HAS_ADD_PAGES diff --git a/arch/x86/coco/sev/core.c b/arch/x86/coco/sev/core.c index 082d61d85dfc..de1df0cb45da 100644 --- a/arch/x86/coco/sev/core.c +++ b/arch/x86/coco/sev/core.c @@ -163,7 +163,7 @@ struct sev_config { */ use_cas : 1, - __reserved : 62; + __reserved : 61; }; static struct sev_config sev_cfg __read_mostly; diff --git a/arch/x86/coco/tdx/tdx.c b/arch/x86/coco/tdx/tdx.c index 078e2bac2553..da8b66dce0da 100644 --- a/arch/x86/coco/tdx/tdx.c +++ b/arch/x86/coco/tdx/tdx.c @@ -389,7 +389,6 @@ static bool mmio_read(int size, unsigned long addr, unsigned long *val) .r12 = size, .r13 = EPT_READ, .r14 = addr, - .r15 = *val, }; if (__tdx_hypercall(&args)) diff --git a/arch/x86/crypto/Kconfig b/arch/x86/crypto/Kconfig index 24875e6295f2..7b1bebed879d 100644 --- a/arch/x86/crypto/Kconfig +++ b/arch/x86/crypto/Kconfig @@ -14,7 +14,7 @@ config CRYPTO_CURVE25519_X86 - ADX (large integer arithmetic) config CRYPTO_AES_NI_INTEL - tristate "Ciphers: AES, modes: ECB, CBC, CTS, CTR, XTR, XTS, GCM (AES-NI)" + tristate "Ciphers: AES, modes: ECB, CBC, CTS, CTR, XCTR, XTS, GCM (AES-NI/VAES)" depends on X86 select CRYPTO_AEAD select CRYPTO_LIB_AES @@ -25,10 +25,14 @@ config CRYPTO_AES_NI_INTEL help Block cipher: AES cipher algorithms AEAD cipher: AES with GCM - Length-preserving ciphers: AES with ECB, CBC, CTS, CTR, XTR, XTS + Length-preserving ciphers: AES with ECB, CBC, CTS, CTR, XCTR, XTS Architecture: x86 (32-bit and 64-bit) using: - AES-NI (AES new instructions) + - VAES (Vector AES) + + Some algorithm implementations are supported only in 64-bit builds, + and some have additional prerequisites such as AVX2 or AVX512. config CRYPTO_BLOWFISH_X86_64 tristate "Ciphers: Blowfish, modes: ECB, CBC" diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c index cd37de5ec404..b0dd83555499 100644 --- a/arch/x86/crypto/aesni-intel_glue.c +++ b/arch/x86/crypto/aesni-intel_glue.c @@ -1366,6 +1366,8 @@ gcm_crypt(struct aead_request *req, int flags) err = skcipher_walk_aead_encrypt(&walk, req, false); else err = skcipher_walk_aead_decrypt(&walk, req, false); + if (err) + return err; /* * Since the AES-GCM assembly code requires that at least three assembly @@ -1381,37 +1383,31 @@ gcm_crypt(struct aead_request *req, int flags) gcm_process_assoc(key, ghash_acc, req->src, assoclen, flags); /* En/decrypt the data and pass the ciphertext through GHASH. */ - while ((nbytes = walk.nbytes) != 0) { - if (unlikely(nbytes < walk.total)) { - /* - * Non-last segment. In this case, the assembly - * function requires that the length be a multiple of 16 - * (AES_BLOCK_SIZE) bytes. The needed buffering of up - * to 16 bytes is handled by the skcipher_walk. Here we - * just need to round down to a multiple of 16. - */ - nbytes = round_down(nbytes, AES_BLOCK_SIZE); - aes_gcm_update(key, le_ctr, ghash_acc, - walk.src.virt.addr, walk.dst.virt.addr, - nbytes, flags); - le_ctr[0] += nbytes / AES_BLOCK_SIZE; - kernel_fpu_end(); - err = skcipher_walk_done(&walk, walk.nbytes - nbytes); - kernel_fpu_begin(); - } else { - /* Last segment: process all remaining data. */ - aes_gcm_update(key, le_ctr, ghash_acc, - walk.src.virt.addr, walk.dst.virt.addr, - nbytes, flags); - err = skcipher_walk_done(&walk, 0); - /* - * The low word of the counter isn't used by the - * finalize, so there's no need to increment it here. - */ - } + while (unlikely((nbytes = walk.nbytes) < walk.total)) { + /* + * Non-last segment. In this case, the assembly function + * requires that the length be a multiple of 16 (AES_BLOCK_SIZE) + * bytes. The needed buffering of up to 16 bytes is handled by + * the skcipher_walk. Here we just need to round down to a + * multiple of 16. + */ + nbytes = round_down(nbytes, AES_BLOCK_SIZE); + aes_gcm_update(key, le_ctr, ghash_acc, walk.src.virt.addr, + walk.dst.virt.addr, nbytes, flags); + le_ctr[0] += nbytes / AES_BLOCK_SIZE; + kernel_fpu_end(); + err = skcipher_walk_done(&walk, walk.nbytes - nbytes); + if (err) + return err; + kernel_fpu_begin(); } - if (err) - goto out; + /* Last segment: process all remaining data. */ + aes_gcm_update(key, le_ctr, ghash_acc, walk.src.virt.addr, + walk.dst.virt.addr, nbytes, flags); + /* + * The low word of the counter isn't used by the finalize, so there's no + * need to increment it here. + */ /* Finalize */ taglen = crypto_aead_authsize(tfm); @@ -1439,8 +1435,9 @@ gcm_crypt(struct aead_request *req, int flags) datalen, tag, taglen, flags)) err = -EBADMSG; } -out: kernel_fpu_end(); + if (nbytes) + skcipher_walk_done(&walk, 0); return err; } @@ -1753,6 +1750,6 @@ static void __exit aesni_exit(void) late_initcall(aesni_init); module_exit(aesni_exit); -MODULE_DESCRIPTION("Rijndael (AES) Cipher Algorithm, Intel AES-NI instructions optimized"); +MODULE_DESCRIPTION("AES cipher and modes, optimized with AES-NI or VAES instructions"); MODULE_LICENSE("GPL"); MODULE_ALIAS_CRYPTO("aes"); diff --git a/arch/x86/crypto/sha256-avx2-asm.S b/arch/x86/crypto/sha256-avx2-asm.S index 0ffb072be956..0bbec1c75cd0 100644 --- a/arch/x86/crypto/sha256-avx2-asm.S +++ b/arch/x86/crypto/sha256-avx2-asm.S @@ -592,22 +592,22 @@ SYM_TYPED_FUNC_START(sha256_transform_rorx) leaq K256+0*32(%rip), INP ## reuse INP as scratch reg vpaddd (INP, SRND), X0, XFER vmovdqa XFER, 0*32+_XFER(%rsp, SRND) - FOUR_ROUNDS_AND_SCHED _XFER + 0*32 + FOUR_ROUNDS_AND_SCHED (_XFER + 0*32) leaq K256+1*32(%rip), INP vpaddd (INP, SRND), X0, XFER vmovdqa XFER, 1*32+_XFER(%rsp, SRND) - FOUR_ROUNDS_AND_SCHED _XFER + 1*32 + FOUR_ROUNDS_AND_SCHED (_XFER + 1*32) leaq K256+2*32(%rip), INP vpaddd (INP, SRND), X0, XFER vmovdqa XFER, 2*32+_XFER(%rsp, SRND) - FOUR_ROUNDS_AND_SCHED _XFER + 2*32 + FOUR_ROUNDS_AND_SCHED (_XFER + 2*32) leaq K256+3*32(%rip), INP vpaddd (INP, SRND), X0, XFER vmovdqa XFER, 3*32+_XFER(%rsp, SRND) - FOUR_ROUNDS_AND_SCHED _XFER + 3*32 + FOUR_ROUNDS_AND_SCHED (_XFER + 3*32) add $4*32, SRND cmp $3*4*32, SRND @@ -618,12 +618,12 @@ SYM_TYPED_FUNC_START(sha256_transform_rorx) leaq K256+0*32(%rip), INP vpaddd (INP, SRND), X0, XFER vmovdqa XFER, 0*32+_XFER(%rsp, SRND) - DO_4ROUNDS _XFER + 0*32 + DO_4ROUNDS (_XFER + 0*32) leaq K256+1*32(%rip), INP vpaddd (INP, SRND), X1, XFER vmovdqa XFER, 1*32+_XFER(%rsp, SRND) - DO_4ROUNDS _XFER + 1*32 + DO_4ROUNDS (_XFER + 1*32) add $2*32, SRND vmovdqa X2, X0 @@ -651,8 +651,8 @@ SYM_TYPED_FUNC_START(sha256_transform_rorx) xor SRND, SRND .align 16 .Lloop3: - DO_4ROUNDS _XFER + 0*32 + 16 - DO_4ROUNDS _XFER + 1*32 + 16 + DO_4ROUNDS (_XFER + 0*32 + 16) + DO_4ROUNDS (_XFER + 1*32 + 16) add $2*32, SRND cmp $4*4*32, SRND jb .Lloop3 diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl index 83073fa3c989..7093ee21c0d1 100644 --- a/arch/x86/entry/syscalls/syscall_64.tbl +++ b/arch/x86/entry/syscalls/syscall_64.tbl @@ -344,6 +344,7 @@ 332 common statx sys_statx 333 common io_pgetevents sys_io_pgetevents 334 common rseq sys_rseq +335 common uretprobe sys_uretprobe # don't use numbers 387 through 423, add new calls after the last # 'common' entry 424 common pidfd_send_signal sys_pidfd_send_signal @@ -385,7 +386,6 @@ 460 common lsm_set_self_attr sys_lsm_set_self_attr 461 common lsm_list_modules sys_lsm_list_modules 462 common mseal sys_mseal -467 common uretprobe sys_uretprobe # # Due to a historical design error, certain syscalls are numbered differently diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index 12f2a0c14d33..be01823b1bb4 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -1520,20 +1520,23 @@ static void x86_pmu_start(struct perf_event *event, int flags) void perf_event_print_debug(void) { u64 ctrl, status, overflow, pmc_ctrl, pmc_count, prev_left, fixed; + unsigned long *cntr_mask, *fixed_cntr_mask; + struct event_constraint *pebs_constraints; + struct cpu_hw_events *cpuc; u64 pebs, debugctl; - int cpu = smp_processor_id(); - struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu); - unsigned long *cntr_mask = hybrid(cpuc->pmu, cntr_mask); - unsigned long *fixed_cntr_mask = hybrid(cpuc->pmu, fixed_cntr_mask); - struct event_constraint *pebs_constraints = hybrid(cpuc->pmu, pebs_constraints); - unsigned long flags; - int idx; + int cpu, idx; + + guard(irqsave)(); + + cpu = smp_processor_id(); + cpuc = &per_cpu(cpu_hw_events, cpu); + cntr_mask = hybrid(cpuc->pmu, cntr_mask); + fixed_cntr_mask = hybrid(cpuc->pmu, fixed_cntr_mask); + pebs_constraints = hybrid(cpuc->pmu, pebs_constraints); if (!*(u64 *)cntr_mask) return; - local_irq_save(flags); - if (x86_pmu.version >= 2) { rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl); rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status); @@ -1577,7 +1580,6 @@ void perf_event_print_debug(void) pr_info("CPU#%d: fixed-PMC%d count: %016llx\n", cpu, idx, pmc_count); } - local_irq_restore(flags); } void x86_pmu_stop(struct perf_event *event, int flags) diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 0c9c2706d4ec..9e519d8a810a 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -4589,6 +4589,25 @@ static enum hybrid_cpu_type adl_get_hybrid_cpu_type(void) return HYBRID_INTEL_CORE; } +static inline bool erratum_hsw11(struct perf_event *event) +{ + return (event->hw.config & INTEL_ARCH_EVENT_MASK) == + X86_CONFIG(.event=0xc0, .umask=0x01); +} + +/* + * The HSW11 requires a period larger than 100 which is the same as the BDM11. + * A minimum period of 128 is enforced as well for the INST_RETIRED.ALL. + * + * The message 'interrupt took too long' can be observed on any counter which + * was armed with a period < 32 and two events expired in the same NMI. + * A minimum period of 32 is enforced for the rest of the events. + */ +static void hsw_limit_period(struct perf_event *event, s64 *left) +{ + *left = max(*left, erratum_hsw11(event) ? 128 : 32); +} + /* * Broadwell: * @@ -4606,8 +4625,7 @@ static enum hybrid_cpu_type adl_get_hybrid_cpu_type(void) */ static void bdw_limit_period(struct perf_event *event, s64 *left) { - if ((event->hw.config & INTEL_ARCH_EVENT_MASK) == - X86_CONFIG(.event=0xc0, .umask=0x01)) { + if (erratum_hsw11(event)) { if (*left < 128) *left = 128; *left &= ~0x3fULL; @@ -6766,6 +6784,7 @@ __init int intel_pmu_init(void) x86_pmu.hw_config = hsw_hw_config; x86_pmu.get_event_constraints = hsw_get_event_constraints; + x86_pmu.limit_period = hsw_limit_period; x86_pmu.lbr_double_abort = true; extra_attr = boot_cpu_has(X86_FEATURE_RTM) ? hsw_format_attr : nhm_format_attr; diff --git a/arch/x86/events/intel/cstate.c b/arch/x86/events/intel/cstate.c index be58cfb012dd..9f116dfc4728 100644 --- a/arch/x86/events/intel/cstate.c +++ b/arch/x86/events/intel/cstate.c @@ -64,7 +64,7 @@ * perf code: 0x00 * Available model: SNB,IVB,HSW,BDW,SKL,KNL,GLM,CNL, * KBL,CML,ICL,ICX,TGL,TNT,RKL,ADL, - * RPL,SPR,MTL,ARL,LNL + * RPL,SPR,MTL,ARL,LNL,SRF * Scope: Package (physical package) * MSR_PKG_C3_RESIDENCY: Package C3 Residency Counter. * perf code: 0x01 @@ -693,7 +693,8 @@ static const struct cstate_model srf_cstates __initconst = { .core_events = BIT(PERF_CSTATE_CORE_C1_RES) | BIT(PERF_CSTATE_CORE_C6_RES), - .pkg_events = BIT(PERF_CSTATE_PKG_C6_RES), + .pkg_events = BIT(PERF_CSTATE_PKG_C2_RES) | + BIT(PERF_CSTATE_PKG_C6_RES), .module_events = BIT(PERF_CSTATE_MODULE_C6_RES), }; diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c index 17a71e92a343..95eada2994e1 100644 --- a/arch/x86/hyperv/hv_init.c +++ b/arch/x86/hyperv/hv_init.c @@ -35,7 +35,6 @@ #include <clocksource/hyperv_timer.h> #include <linux/highmem.h> -int hyperv_init_cpuhp; u64 hv_current_partition_id = ~0ull; EXPORT_SYMBOL_GPL(hv_current_partition_id); @@ -607,8 +606,6 @@ skip_hypercall_pg_init: register_syscore_ops(&hv_syscore_ops); - hyperv_init_cpuhp = cpuhp; - if (cpuid_ebx(HYPERV_CPUID_FEATURES) & HV_ACCESS_PARTITION_ID) hv_get_partition_id(); @@ -637,7 +634,7 @@ skip_hypercall_pg_init: clean_guest_os_id: wrmsrl(HV_X64_MSR_GUEST_OS_ID, 0); hv_ivm_msr_write(HV_X64_MSR_GUEST_OS_ID, 0); - cpuhp_remove_state(cpuhp); + cpuhp_remove_state(CPUHP_AP_HYPERV_ONLINE); free_ghcb_page: free_percpu(hv_ghcb_pg); free_vp_assist_page: diff --git a/arch/x86/include/asm/cmdline.h b/arch/x86/include/asm/cmdline.h index 6faaf27e8899..6cbd9ae58b21 100644 --- a/arch/x86/include/asm/cmdline.h +++ b/arch/x86/include/asm/cmdline.h @@ -2,6 +2,10 @@ #ifndef _ASM_X86_CMDLINE_H #define _ASM_X86_CMDLINE_H +#include <asm/setup.h> + +extern char builtin_cmdline[COMMAND_LINE_SIZE]; + int cmdline_find_option_bool(const char *cmdline_ptr, const char *option); int cmdline_find_option(const char *cmdline_ptr, const char *option, char *buffer, int bufsize); diff --git a/arch/x86/include/asm/cpu_device_id.h b/arch/x86/include/asm/cpu_device_id.h index 3831f612e89c..e4121d9aa9e1 100644 --- a/arch/x86/include/asm/cpu_device_id.h +++ b/arch/x86/include/asm/cpu_device_id.h @@ -193,26 +193,6 @@ X86_MATCH_VENDOR_FAM_MODEL(vendor, family, X86_MODEL_ANY, data) /** - * X86_MATCH_INTEL_FAM6_MODEL - Match vendor INTEL, family 6 and model - * @model: The model name without the INTEL_FAM6_ prefix or ANY - * The model name is expanded to INTEL_FAM6_@model internally - * @data: Driver specific data or NULL. The internal storage - * format is unsigned long. The supplied value, pointer - * etc. is casted to unsigned long internally. - * - * The vendor is set to INTEL, the family to 6 and all other missing - * arguments of X86_MATCH_VENDOR_FAM_MODEL_FEATURE() are set to wildcards. - * - * See X86_MATCH_VENDOR_FAM_MODEL_FEATURE() for further information. - */ -#define X86_MATCH_INTEL_FAM6_MODEL(model, data) \ - X86_MATCH_VENDOR_FAM_MODEL(INTEL, 6, INTEL_FAM6_##model, data) - -#define X86_MATCH_INTEL_FAM6_MODEL_STEPPINGS(model, steppings, data) \ - X86_MATCH_VENDOR_FAM_MODEL_STEPPINGS_FEATURE(INTEL, 6, INTEL_FAM6_##model, \ - steppings, X86_FEATURE_ANY, data) - -/** * X86_MATCH_VFM - Match encoded vendor/family/model * @vfm: Encoded 8-bits each for vendor, family, model * @data: Driver specific data or NULL. The internal storage diff --git a/arch/x86/include/asm/fpu/types.h b/arch/x86/include/asm/fpu/types.h index eb17f31b06d2..de16862bf230 100644 --- a/arch/x86/include/asm/fpu/types.h +++ b/arch/x86/include/asm/fpu/types.h @@ -591,6 +591,13 @@ struct fpu_state_config { * even without XSAVE support, i.e. legacy features FP + SSE */ u64 legacy_features; + /* + * @independent_features: + * + * Features that are supported by XSAVES, but not managed as part of + * the FPU core, such as LBR + */ + u64 independent_features; }; /* FPU state configuration information */ diff --git a/arch/x86/include/asm/intel-family.h b/arch/x86/include/asm/intel-family.h index f81a851c46dc..44949f972826 100644 --- a/arch/x86/include/asm/intel-family.h +++ b/arch/x86/include/asm/intel-family.h @@ -10,7 +10,7 @@ * that group keep the CPUID for the variants sorted by model number. * * The defined symbol names have the following form: - * INTEL_FAM6{OPTFAMILY}_{MICROARCH}{OPTDIFF} + * INTEL_{OPTFAMILY}_{MICROARCH}{OPTDIFF} * where: * OPTFAMILY Describes the family of CPUs that this belongs to. Default * is assumed to be "_CORE" (and should be omitted). Other values @@ -42,215 +42,136 @@ #define IFM(_fam, _model) VFM_MAKE(X86_VENDOR_INTEL, _fam, _model) -/* Wildcard match for FAM6 so X86_MATCH_INTEL_FAM6_MODEL(ANY) works */ -#define INTEL_FAM6_ANY X86_MODEL_ANY -/* Wildcard match for FAM6 so X86_MATCH_VFM(ANY) works */ +/* Wildcard match so X86_MATCH_VFM(ANY) works */ #define INTEL_ANY IFM(X86_FAMILY_ANY, X86_MODEL_ANY) -#define INTEL_FAM6_CORE_YONAH 0x0E +#define INTEL_PENTIUM_PRO IFM(6, 0x01) + #define INTEL_CORE_YONAH IFM(6, 0x0E) -#define INTEL_FAM6_CORE2_MEROM 0x0F #define INTEL_CORE2_MEROM IFM(6, 0x0F) -#define INTEL_FAM6_CORE2_MEROM_L 0x16 #define INTEL_CORE2_MEROM_L IFM(6, 0x16) -#define INTEL_FAM6_CORE2_PENRYN 0x17 #define INTEL_CORE2_PENRYN IFM(6, 0x17) -#define INTEL_FAM6_CORE2_DUNNINGTON 0x1D #define INTEL_CORE2_DUNNINGTON IFM(6, 0x1D) -#define INTEL_FAM6_NEHALEM 0x1E #define INTEL_NEHALEM IFM(6, 0x1E) -#define INTEL_FAM6_NEHALEM_G 0x1F /* Auburndale / Havendale */ #define INTEL_NEHALEM_G IFM(6, 0x1F) /* Auburndale / Havendale */ -#define INTEL_FAM6_NEHALEM_EP 0x1A #define INTEL_NEHALEM_EP IFM(6, 0x1A) -#define INTEL_FAM6_NEHALEM_EX 0x2E #define INTEL_NEHALEM_EX IFM(6, 0x2E) -#define INTEL_FAM6_WESTMERE 0x25 #define INTEL_WESTMERE IFM(6, 0x25) -#define INTEL_FAM6_WESTMERE_EP 0x2C #define INTEL_WESTMERE_EP IFM(6, 0x2C) -#define INTEL_FAM6_WESTMERE_EX 0x2F #define INTEL_WESTMERE_EX IFM(6, 0x2F) -#define INTEL_FAM6_SANDYBRIDGE 0x2A #define INTEL_SANDYBRIDGE IFM(6, 0x2A) -#define INTEL_FAM6_SANDYBRIDGE_X 0x2D #define INTEL_SANDYBRIDGE_X IFM(6, 0x2D) -#define INTEL_FAM6_IVYBRIDGE 0x3A #define INTEL_IVYBRIDGE IFM(6, 0x3A) -#define INTEL_FAM6_IVYBRIDGE_X 0x3E #define INTEL_IVYBRIDGE_X IFM(6, 0x3E) -#define INTEL_FAM6_HASWELL 0x3C #define INTEL_HASWELL IFM(6, 0x3C) -#define INTEL_FAM6_HASWELL_X 0x3F #define INTEL_HASWELL_X IFM(6, 0x3F) -#define INTEL_FAM6_HASWELL_L 0x45 #define INTEL_HASWELL_L IFM(6, 0x45) -#define INTEL_FAM6_HASWELL_G 0x46 #define INTEL_HASWELL_G IFM(6, 0x46) -#define INTEL_FAM6_BROADWELL 0x3D #define INTEL_BROADWELL IFM(6, 0x3D) -#define INTEL_FAM6_BROADWELL_G 0x47 #define INTEL_BROADWELL_G IFM(6, 0x47) -#define INTEL_FAM6_BROADWELL_X 0x4F #define INTEL_BROADWELL_X IFM(6, 0x4F) -#define INTEL_FAM6_BROADWELL_D 0x56 #define INTEL_BROADWELL_D IFM(6, 0x56) -#define INTEL_FAM6_SKYLAKE_L 0x4E /* Sky Lake */ #define INTEL_SKYLAKE_L IFM(6, 0x4E) /* Sky Lake */ -#define INTEL_FAM6_SKYLAKE 0x5E /* Sky Lake */ #define INTEL_SKYLAKE IFM(6, 0x5E) /* Sky Lake */ -#define INTEL_FAM6_SKYLAKE_X 0x55 /* Sky Lake */ #define INTEL_SKYLAKE_X IFM(6, 0x55) /* Sky Lake */ /* CASCADELAKE_X 0x55 Sky Lake -- s: 7 */ /* COOPERLAKE_X 0x55 Sky Lake -- s: 11 */ -#define INTEL_FAM6_KABYLAKE_L 0x8E /* Sky Lake */ #define INTEL_KABYLAKE_L IFM(6, 0x8E) /* Sky Lake */ /* AMBERLAKE_L 0x8E Sky Lake -- s: 9 */ /* COFFEELAKE_L 0x8E Sky Lake -- s: 10 */ /* WHISKEYLAKE_L 0x8E Sky Lake -- s: 11,12 */ -#define INTEL_FAM6_KABYLAKE 0x9E /* Sky Lake */ #define INTEL_KABYLAKE IFM(6, 0x9E) /* Sky Lake */ /* COFFEELAKE 0x9E Sky Lake -- s: 10-13 */ -#define INTEL_FAM6_COMETLAKE 0xA5 /* Sky Lake */ #define INTEL_COMETLAKE IFM(6, 0xA5) /* Sky Lake */ -#define INTEL_FAM6_COMETLAKE_L 0xA6 /* Sky Lake */ #define INTEL_COMETLAKE_L IFM(6, 0xA6) /* Sky Lake */ -#define INTEL_FAM6_CANNONLAKE_L 0x66 /* Palm Cove */ #define INTEL_CANNONLAKE_L IFM(6, 0x66) /* Palm Cove */ -#define INTEL_FAM6_ICELAKE_X 0x6A /* Sunny Cove */ #define INTEL_ICELAKE_X IFM(6, 0x6A) /* Sunny Cove */ -#define INTEL_FAM6_ICELAKE_D 0x6C /* Sunny Cove */ #define INTEL_ICELAKE_D IFM(6, 0x6C) /* Sunny Cove */ -#define INTEL_FAM6_ICELAKE 0x7D /* Sunny Cove */ #define INTEL_ICELAKE IFM(6, 0x7D) /* Sunny Cove */ -#define INTEL_FAM6_ICELAKE_L 0x7E /* Sunny Cove */ #define INTEL_ICELAKE_L IFM(6, 0x7E) /* Sunny Cove */ -#define INTEL_FAM6_ICELAKE_NNPI 0x9D /* Sunny Cove */ #define INTEL_ICELAKE_NNPI IFM(6, 0x9D) /* Sunny Cove */ -#define INTEL_FAM6_ROCKETLAKE 0xA7 /* Cypress Cove */ #define INTEL_ROCKETLAKE IFM(6, 0xA7) /* Cypress Cove */ -#define INTEL_FAM6_TIGERLAKE_L 0x8C /* Willow Cove */ #define INTEL_TIGERLAKE_L IFM(6, 0x8C) /* Willow Cove */ -#define INTEL_FAM6_TIGERLAKE 0x8D /* Willow Cove */ #define INTEL_TIGERLAKE IFM(6, 0x8D) /* Willow Cove */ -#define INTEL_FAM6_SAPPHIRERAPIDS_X 0x8F /* Golden Cove */ #define INTEL_SAPPHIRERAPIDS_X IFM(6, 0x8F) /* Golden Cove */ -#define INTEL_FAM6_EMERALDRAPIDS_X 0xCF #define INTEL_EMERALDRAPIDS_X IFM(6, 0xCF) -#define INTEL_FAM6_GRANITERAPIDS_X 0xAD #define INTEL_GRANITERAPIDS_X IFM(6, 0xAD) -#define INTEL_FAM6_GRANITERAPIDS_D 0xAE #define INTEL_GRANITERAPIDS_D IFM(6, 0xAE) /* "Hybrid" Processors (P-Core/E-Core) */ -#define INTEL_FAM6_LAKEFIELD 0x8A /* Sunny Cove / Tremont */ #define INTEL_LAKEFIELD IFM(6, 0x8A) /* Sunny Cove / Tremont */ -#define INTEL_FAM6_ALDERLAKE 0x97 /* Golden Cove / Gracemont */ #define INTEL_ALDERLAKE IFM(6, 0x97) /* Golden Cove / Gracemont */ -#define INTEL_FAM6_ALDERLAKE_L 0x9A /* Golden Cove / Gracemont */ #define INTEL_ALDERLAKE_L IFM(6, 0x9A) /* Golden Cove / Gracemont */ -#define INTEL_FAM6_RAPTORLAKE 0xB7 /* Raptor Cove / Enhanced Gracemont */ #define INTEL_RAPTORLAKE IFM(6, 0xB7) /* Raptor Cove / Enhanced Gracemont */ -#define INTEL_FAM6_RAPTORLAKE_P 0xBA #define INTEL_RAPTORLAKE_P IFM(6, 0xBA) -#define INTEL_FAM6_RAPTORLAKE_S 0xBF #define INTEL_RAPTORLAKE_S IFM(6, 0xBF) -#define INTEL_FAM6_METEORLAKE 0xAC #define INTEL_METEORLAKE IFM(6, 0xAC) -#define INTEL_FAM6_METEORLAKE_L 0xAA #define INTEL_METEORLAKE_L IFM(6, 0xAA) -#define INTEL_FAM6_ARROWLAKE_H 0xC5 #define INTEL_ARROWLAKE_H IFM(6, 0xC5) -#define INTEL_FAM6_ARROWLAKE 0xC6 #define INTEL_ARROWLAKE IFM(6, 0xC6) -#define INTEL_FAM6_ARROWLAKE_U 0xB5 #define INTEL_ARROWLAKE_U IFM(6, 0xB5) -#define INTEL_FAM6_LUNARLAKE_M 0xBD #define INTEL_LUNARLAKE_M IFM(6, 0xBD) /* "Small Core" Processors (Atom/E-Core) */ -#define INTEL_FAM6_ATOM_BONNELL 0x1C /* Diamondville, Pineview */ #define INTEL_ATOM_BONNELL IFM(6, 0x1C) /* Diamondville, Pineview */ -#define INTEL_FAM6_ATOM_BONNELL_MID 0x26 /* Silverthorne, Lincroft */ #define INTEL_ATOM_BONNELL_MID IFM(6, 0x26) /* Silverthorne, Lincroft */ -#define INTEL_FAM6_ATOM_SALTWELL 0x36 /* Cedarview */ #define INTEL_ATOM_SALTWELL IFM(6, 0x36) /* Cedarview */ -#define INTEL_FAM6_ATOM_SALTWELL_MID 0x27 /* Penwell */ #define INTEL_ATOM_SALTWELL_MID IFM(6, 0x27) /* Penwell */ -#define INTEL_FAM6_ATOM_SALTWELL_TABLET 0x35 /* Cloverview */ #define INTEL_ATOM_SALTWELL_TABLET IFM(6, 0x35) /* Cloverview */ -#define INTEL_FAM6_ATOM_SILVERMONT 0x37 /* Bay Trail, Valleyview */ #define INTEL_ATOM_SILVERMONT IFM(6, 0x37) /* Bay Trail, Valleyview */ -#define INTEL_FAM6_ATOM_SILVERMONT_D 0x4D /* Avaton, Rangely */ #define INTEL_ATOM_SILVERMONT_D IFM(6, 0x4D) /* Avaton, Rangely */ -#define INTEL_FAM6_ATOM_SILVERMONT_MID 0x4A /* Merriefield */ #define INTEL_ATOM_SILVERMONT_MID IFM(6, 0x4A) /* Merriefield */ -#define INTEL_FAM6_ATOM_AIRMONT 0x4C /* Cherry Trail, Braswell */ #define INTEL_ATOM_AIRMONT IFM(6, 0x4C) /* Cherry Trail, Braswell */ -#define INTEL_FAM6_ATOM_AIRMONT_MID 0x5A /* Moorefield */ #define INTEL_ATOM_AIRMONT_MID IFM(6, 0x5A) /* Moorefield */ -#define INTEL_FAM6_ATOM_AIRMONT_NP 0x75 /* Lightning Mountain */ #define INTEL_ATOM_AIRMONT_NP IFM(6, 0x75) /* Lightning Mountain */ -#define INTEL_FAM6_ATOM_GOLDMONT 0x5C /* Apollo Lake */ #define INTEL_ATOM_GOLDMONT IFM(6, 0x5C) /* Apollo Lake */ -#define INTEL_FAM6_ATOM_GOLDMONT_D 0x5F /* Denverton */ #define INTEL_ATOM_GOLDMONT_D IFM(6, 0x5F) /* Denverton */ /* Note: the micro-architecture is "Goldmont Plus" */ -#define INTEL_FAM6_ATOM_GOLDMONT_PLUS 0x7A /* Gemini Lake */ #define INTEL_ATOM_GOLDMONT_PLUS IFM(6, 0x7A) /* Gemini Lake */ -#define INTEL_FAM6_ATOM_TREMONT_D 0x86 /* Jacobsville */ #define INTEL_ATOM_TREMONT_D IFM(6, 0x86) /* Jacobsville */ -#define INTEL_FAM6_ATOM_TREMONT 0x96 /* Elkhart Lake */ #define INTEL_ATOM_TREMONT IFM(6, 0x96) /* Elkhart Lake */ -#define INTEL_FAM6_ATOM_TREMONT_L 0x9C /* Jasper Lake */ #define INTEL_ATOM_TREMONT_L IFM(6, 0x9C) /* Jasper Lake */ -#define INTEL_FAM6_ATOM_GRACEMONT 0xBE /* Alderlake N */ #define INTEL_ATOM_GRACEMONT IFM(6, 0xBE) /* Alderlake N */ -#define INTEL_FAM6_ATOM_CRESTMONT_X 0xAF /* Sierra Forest */ #define INTEL_ATOM_CRESTMONT_X IFM(6, 0xAF) /* Sierra Forest */ -#define INTEL_FAM6_ATOM_CRESTMONT 0xB6 /* Grand Ridge */ #define INTEL_ATOM_CRESTMONT IFM(6, 0xB6) /* Grand Ridge */ -#define INTEL_FAM6_ATOM_DARKMONT_X 0xDD /* Clearwater Forest */ #define INTEL_ATOM_DARKMONT_X IFM(6, 0xDD) /* Clearwater Forest */ /* Xeon Phi */ -#define INTEL_FAM6_XEON_PHI_KNL 0x57 /* Knights Landing */ #define INTEL_XEON_PHI_KNL IFM(6, 0x57) /* Knights Landing */ -#define INTEL_FAM6_XEON_PHI_KNM 0x85 /* Knights Mill */ #define INTEL_XEON_PHI_KNM IFM(6, 0x85) /* Knights Mill */ /* Family 5 */ diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 950a03e0181e..4a68cb3eba78 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1305,6 +1305,7 @@ struct kvm_arch { u8 vm_type; bool has_private_mem; bool has_protected_state; + bool pre_fault_allowed; struct hlist_head mmu_page_hash[KVM_NUM_MMU_PAGES]; struct list_head active_mmu_pages; struct list_head zapped_obsolete_pages; @@ -2191,6 +2192,8 @@ void kvm_configure_mmu(bool enable_tdp, int tdp_forced_root_level, #define kvm_arch_has_private_mem(kvm) false #endif +#define kvm_arch_has_readonly_mem(kvm) (!(kvm)->arch.has_protected_state) + static inline u16 kvm_read_ldt(void) { u16 ldt; diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index 3ad29b128943..3b9970117a0f 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -221,7 +221,7 @@ static inline int apei_smca_report_x86_error(struct cper_ia_proc_ctx *ctx_info, u64 lapic_id) { return -EINVAL; } #endif -void mce_setup(struct mce *m); +void mce_prep_record(struct mce *m); void mce_log(struct mce *m); DECLARE_PER_CPU(struct device *, mce_device); diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h index 390c4d13956d..5f0bc6a6d025 100644 --- a/arch/x86/include/asm/mshyperv.h +++ b/arch/x86/include/asm/mshyperv.h @@ -40,7 +40,6 @@ static inline unsigned char hv_get_nmi_reason(void) } #if IS_ENABLED(CONFIG_HYPERV) -extern int hyperv_init_cpuhp; extern bool hyperv_paravisor_present; extern void *hv_hypercall_pg; diff --git a/arch/x86/include/asm/page_64.h b/arch/x86/include/asm/page_64.h index af4302d79b59..f3d257c45225 100644 --- a/arch/x86/include/asm/page_64.h +++ b/arch/x86/include/asm/page_64.h @@ -17,6 +17,7 @@ extern unsigned long phys_base; extern unsigned long page_offset_base; extern unsigned long vmalloc_base; extern unsigned long vmemmap_base; +extern unsigned long physmem_end; static __always_inline unsigned long __phys_addr_nodebug(unsigned long x) { diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h index 9053dfe9fa03..a98e53491a4e 100644 --- a/arch/x86/include/asm/pgtable_64_types.h +++ b/arch/x86/include/asm/pgtable_64_types.h @@ -140,6 +140,10 @@ extern unsigned int ptrs_per_p4d; # define VMEMMAP_START __VMEMMAP_BASE_L4 #endif /* CONFIG_DYNAMIC_MEMORY_LAYOUT */ +#ifdef CONFIG_RANDOMIZE_MEMORY +# define PHYSMEM_END physmem_end +#endif + /* * End of the region for which vmalloc page tables are pre-allocated. * For non-KMSAN builds, this is the same as VMALLOC_END. diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index a75a07f4931f..775acbdea1a9 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -691,8 +691,6 @@ static inline u32 per_cpu_l2c_id(unsigned int cpu) } #ifdef CONFIG_CPU_SUP_AMD -extern u32 amd_get_highest_perf(void); - /* * Issue a DIV 0/1 insn to clear any division data from previous DIV * operations. @@ -705,7 +703,6 @@ static __always_inline void amd_clear_divider(void) extern void amd_check_microcode(void); #else -static inline u32 amd_get_highest_perf(void) { return 0; } static inline void amd_clear_divider(void) { } static inline void amd_check_microcode(void) { } #endif diff --git a/arch/x86/include/asm/qspinlock.h b/arch/x86/include/asm/qspinlock.h index a053c1293975..68da67df304d 100644 --- a/arch/x86/include/asm/qspinlock.h +++ b/arch/x86/include/asm/qspinlock.h @@ -66,13 +66,15 @@ static inline bool vcpu_is_preempted(long cpu) #ifdef CONFIG_PARAVIRT /* - * virt_spin_lock_key - enables (by default) the virt_spin_lock() hijack. + * virt_spin_lock_key - disables by default the virt_spin_lock() hijack. * - * Native (and PV wanting native due to vCPU pinning) should disable this key. - * It is done in this backwards fashion to only have a single direction change, - * which removes ordering between native_pv_spin_init() and HV setup. + * Native (and PV wanting native due to vCPU pinning) should keep this key + * disabled. Native does not touch the key. + * + * When in a guest then native_pv_lock_init() enables the key first and + * KVM/XEN might conditionally disable it later in the boot process again. */ -DECLARE_STATIC_KEY_TRUE(virt_spin_lock_key); +DECLARE_STATIC_KEY_FALSE(virt_spin_lock_key); /* * Shortcut for the queued_spin_lock_slowpath() function that allows diff --git a/arch/x86/include/asm/resctrl.h b/arch/x86/include/asm/resctrl.h index 12dbd2588ca7..8b1b6ce1e51b 100644 --- a/arch/x86/include/asm/resctrl.h +++ b/arch/x86/include/asm/resctrl.h @@ -156,12 +156,6 @@ static inline void resctrl_sched_in(struct task_struct *tsk) __resctrl_sched_in(tsk); } -static inline u32 resctrl_arch_system_num_rmid_idx(void) -{ - /* RMID are independent numbers for x86. num_rmid_idx == num_rmid */ - return boot_cpu_data.x86_cache_max_rmid + 1; -} - static inline void resctrl_arch_rmid_idx_decode(u32 idx, u32 *closid, u32 *rmid) { *rmid = idx; diff --git a/arch/x86/include/asm/sev.h b/arch/x86/include/asm/sev.h index 79bbe2be900e..ee34ab00a8d6 100644 --- a/arch/x86/include/asm/sev.h +++ b/arch/x86/include/asm/sev.h @@ -164,7 +164,7 @@ struct snp_guest_msg_hdr { struct snp_guest_msg { struct snp_guest_msg_hdr hdr; - u8 payload[4000]; + u8 payload[PAGE_SIZE - sizeof(struct snp_guest_msg_hdr)]; } __packed; struct sev_guest_platform_data { diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h index abe3a8f22cbd..aef70336d624 100644 --- a/arch/x86/include/asm/topology.h +++ b/arch/x86/include/asm/topology.h @@ -282,9 +282,22 @@ static inline long arch_scale_freq_capacity(int cpu) } #define arch_scale_freq_capacity arch_scale_freq_capacity +bool arch_enable_hybrid_capacity_scale(void); +void arch_set_cpu_capacity(int cpu, unsigned long cap, unsigned long max_cap, + unsigned long cap_freq, unsigned long base_freq); + +unsigned long arch_scale_cpu_capacity(int cpu); +#define arch_scale_cpu_capacity arch_scale_cpu_capacity + extern void arch_set_max_freq_ratio(bool turbo_disabled); extern void freq_invariance_set_perf_ratio(u64 ratio, bool turbo_disabled); #else +static inline bool arch_enable_hybrid_capacity_scale(void) { return false; } +static inline void arch_set_cpu_capacity(int cpu, unsigned long cap, + unsigned long max_cap, + unsigned long cap_freq, + unsigned long base_freq) { } + static inline void arch_set_max_freq_ratio(bool turbo_disabled) { } static inline void freq_invariance_set_perf_ratio(u64 ratio, bool turbo_disabled) { } #endif diff --git a/arch/x86/kernel/acpi/cppc.c b/arch/x86/kernel/acpi/cppc.c index ff8f25faca3d..956984054bf3 100644 --- a/arch/x86/kernel/acpi/cppc.c +++ b/arch/x86/kernel/acpi/cppc.c @@ -9,6 +9,17 @@ #include <asm/processor.h> #include <asm/topology.h> +#define CPPC_HIGHEST_PERF_PERFORMANCE 196 +#define CPPC_HIGHEST_PERF_PREFCORE 166 + +enum amd_pref_core { + AMD_PREF_CORE_UNKNOWN = 0, + AMD_PREF_CORE_SUPPORTED, + AMD_PREF_CORE_UNSUPPORTED, +}; +static enum amd_pref_core amd_pref_core_detected; +static u64 boost_numerator; + /* Refer to drivers/acpi/cppc_acpi.c for the description of functions */ bool cpc_supported_by_cpu(void) @@ -69,31 +80,30 @@ int cpc_write_ffh(int cpunum, struct cpc_reg *reg, u64 val) static void amd_set_max_freq_ratio(void) { struct cppc_perf_caps perf_caps; - u64 highest_perf, nominal_perf; + u64 numerator, nominal_perf; u64 perf_ratio; int rc; rc = cppc_get_perf_caps(0, &perf_caps); if (rc) { - pr_debug("Could not retrieve perf counters (%d)\n", rc); + pr_warn("Could not retrieve perf counters (%d)\n", rc); return; } - highest_perf = amd_get_highest_perf(); + rc = amd_get_boost_ratio_numerator(0, &numerator); + if (rc) { + pr_warn("Could not retrieve highest performance (%d)\n", rc); + return; + } nominal_perf = perf_caps.nominal_perf; - if (!highest_perf || !nominal_perf) { - pr_debug("Could not retrieve highest or nominal performance\n"); + if (!nominal_perf) { + pr_warn("Could not retrieve nominal performance\n"); return; } - perf_ratio = div_u64(highest_perf * SCHED_CAPACITY_SCALE, nominal_perf); /* midpoint between max_boost and max_P */ - perf_ratio = (perf_ratio + SCHED_CAPACITY_SCALE) >> 1; - if (!perf_ratio) { - pr_debug("Non-zero highest/nominal perf values led to a 0 ratio\n"); - return; - } + perf_ratio = (div_u64(numerator * SCHED_CAPACITY_SCALE, nominal_perf) + SCHED_CAPACITY_SCALE) >> 1; freq_invariance_set_perf_ratio(perf_ratio, false); } @@ -116,3 +126,143 @@ void init_freq_invariance_cppc(void) init_done = true; mutex_unlock(&freq_invariance_lock); } + +/* + * Get the highest performance register value. + * @cpu: CPU from which to get highest performance. + * @highest_perf: Return address for highest performance value. + * + * Return: 0 for success, negative error code otherwise. + */ +int amd_get_highest_perf(unsigned int cpu, u32 *highest_perf) +{ + u64 val; + int ret; + + if (cpu_feature_enabled(X86_FEATURE_CPPC)) { + ret = rdmsrl_safe_on_cpu(cpu, MSR_AMD_CPPC_CAP1, &val); + if (ret) + goto out; + + val = AMD_CPPC_HIGHEST_PERF(val); + } else { + ret = cppc_get_highest_perf(cpu, &val); + if (ret) + goto out; + } + + WRITE_ONCE(*highest_perf, (u32)val); +out: + return ret; +} +EXPORT_SYMBOL_GPL(amd_get_highest_perf); + +/** + * amd_detect_prefcore: Detect if CPUs in the system support preferred cores + * @detected: Output variable for the result of the detection. + * + * Determine whether CPUs in the system support preferred cores. On systems + * that support preferred cores, different highest perf values will be found + * on different cores. On other systems, the highest perf value will be the + * same on all cores. + * + * The result of the detection will be stored in the 'detected' parameter. + * + * Return: 0 for success, negative error code otherwise + */ +int amd_detect_prefcore(bool *detected) +{ + int cpu, count = 0; + u64 highest_perf[2] = {0}; + + if (WARN_ON(!detected)) + return -EINVAL; + + switch (amd_pref_core_detected) { + case AMD_PREF_CORE_SUPPORTED: + *detected = true; + return 0; + case AMD_PREF_CORE_UNSUPPORTED: + *detected = false; + return 0; + default: + break; + } + + for_each_present_cpu(cpu) { + u32 tmp; + int ret; + + ret = amd_get_highest_perf(cpu, &tmp); + if (ret) + return ret; + + if (!count || (count == 1 && tmp != highest_perf[0])) + highest_perf[count++] = tmp; + + if (count == 2) + break; + } + + *detected = (count == 2); + boost_numerator = highest_perf[0]; + + amd_pref_core_detected = *detected ? AMD_PREF_CORE_SUPPORTED : + AMD_PREF_CORE_UNSUPPORTED; + + pr_debug("AMD CPPC preferred core is %ssupported (highest perf: 0x%llx)\n", + *detected ? "" : "un", highest_perf[0]); + + return 0; +} +EXPORT_SYMBOL_GPL(amd_detect_prefcore); + +/** + * amd_get_boost_ratio_numerator: Get the numerator to use for boost ratio calculation + * @cpu: CPU to get numerator for. + * @numerator: Output variable for numerator. + * + * Determine the numerator to use for calculating the boost ratio on + * a CPU. On systems that support preferred cores, this will be a hardcoded + * value. On other systems this will the highest performance register value. + * + * If booting the system with amd-pstate enabled but preferred cores disabled then + * the correct boost numerator will be returned to match hardware capabilities + * even if the preferred cores scheduling hints are not enabled. + * + * Return: 0 for success, negative error code otherwise. + */ +int amd_get_boost_ratio_numerator(unsigned int cpu, u64 *numerator) +{ + bool prefcore; + int ret; + + ret = amd_detect_prefcore(&prefcore); + if (ret) + return ret; + + /* without preferred cores, return the highest perf register value */ + if (!prefcore) { + *numerator = boost_numerator; + return 0; + } + + /* + * For AMD CPUs with Family ID 19H and Model ID range 0x70 to 0x7f, + * the highest performance level is set to 196. + * https://bugzilla.kernel.org/show_bug.cgi?id=218759 + */ + if (cpu_feature_enabled(X86_FEATURE_ZEN4)) { + switch (boot_cpu_data.x86_model) { + case 0x70 ... 0x7f: + *numerator = CPPC_HIGHEST_PERF_PERFORMANCE; + return 0; + default: + break; + } + } + *numerator = CPPC_HIGHEST_PERF_PREFCORE; + + return 0; +} +EXPORT_SYMBOL_GPL(amd_get_boost_ratio_numerator); diff --git a/arch/x86/kernel/acpi/madt_wakeup.c b/arch/x86/kernel/acpi/madt_wakeup.c index 6cfe762be28b..d5ef6215583b 100644 --- a/arch/x86/kernel/acpi/madt_wakeup.c +++ b/arch/x86/kernel/acpi/madt_wakeup.c @@ -19,7 +19,7 @@ static u64 acpi_mp_wake_mailbox_paddr __ro_after_init; /* Virtual address of the Multiprocessor Wakeup Structure mailbox */ -static struct acpi_madt_multiproc_wakeup_mailbox *acpi_mp_wake_mailbox __ro_after_init; +static struct acpi_madt_multiproc_wakeup_mailbox *acpi_mp_wake_mailbox; static u64 acpi_mp_pgd __ro_after_init; static u64 acpi_mp_reset_vector_paddr __ro_after_init; diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 66fd4b2a37a3..373638691cd4 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -1775,12 +1775,9 @@ static __init void apic_set_fixmap(bool read_apic); static __init void x2apic_disable(void) { - u32 x2apic_id, state = x2apic_state; + u32 x2apic_id; - x2apic_mode = 0; - x2apic_state = X2APIC_DISABLED; - - if (state != X2APIC_ON) + if (x2apic_state < X2APIC_ON) return; x2apic_id = read_apic_id(); @@ -1793,6 +1790,10 @@ static __init void x2apic_disable(void) } __x2apic_disable(); + + x2apic_mode = 0; + x2apic_state = X2APIC_DISABLED; + /* * Don't reread the APIC ID as it was already done from * check_x2apic() and the APIC driver still is a x2APIC variant, diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index be5889bded49..015971adadfc 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -462,7 +462,7 @@ static void bsp_init_amd(struct cpuinfo_x86 *c) switch (c->x86_model) { case 0x00 ... 0x2f: case 0x40 ... 0x4f: - case 0x70 ... 0x7f: + case 0x60 ... 0x7f: setup_force_cpu_cap(X86_FEATURE_ZEN5); break; default: @@ -1190,22 +1190,6 @@ unsigned long amd_get_dr_addr_mask(unsigned int dr) } EXPORT_SYMBOL_GPL(amd_get_dr_addr_mask); -u32 amd_get_highest_perf(void) -{ - struct cpuinfo_x86 *c = &boot_cpu_data; - - if (c->x86 == 0x17 && ((c->x86_model >= 0x30 && c->x86_model < 0x40) || - (c->x86_model >= 0x70 && c->x86_model < 0x80))) - return 166; - - if (c->x86 == 0x19 && ((c->x86_model >= 0x20 && c->x86_model < 0x30) || - (c->x86_model >= 0x40 && c->x86_model < 0x70))) - return 166; - - return 255; -} -EXPORT_SYMBOL_GPL(amd_get_highest_perf); - static void zenbleed_check_cpu(void *unused) { struct cpuinfo_x86 *c = &cpu_data(smp_processor_id()); diff --git a/arch/x86/kernel/cpu/aperfmperf.c b/arch/x86/kernel/cpu/aperfmperf.c index b3fa61d45352..f642de2ebdac 100644 --- a/arch/x86/kernel/cpu/aperfmperf.c +++ b/arch/x86/kernel/cpu/aperfmperf.c @@ -306,7 +306,7 @@ static void freq_invariance_enable(void) WARN_ON_ONCE(1); return; } - static_branch_enable(&arch_scale_freq_key); + static_branch_enable_cpuslocked(&arch_scale_freq_key); register_freq_invariance_syscore_ops(); pr_info("Estimated ratio of average max frequency by base frequency (times 1024): %llu\n", arch_max_freq_ratio); } @@ -323,8 +323,10 @@ static void __init bp_init_freq_invariance(void) if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) return; - if (intel_set_max_freq_ratio()) + if (intel_set_max_freq_ratio()) { + guard(cpus_read_lock)(); freq_invariance_enable(); + } } static void disable_freq_invariance_workfn(struct work_struct *work) @@ -347,9 +349,89 @@ static DECLARE_WORK(disable_freq_invariance_work, DEFINE_PER_CPU(unsigned long, arch_freq_scale) = SCHED_CAPACITY_SCALE; EXPORT_PER_CPU_SYMBOL_GPL(arch_freq_scale); +static DEFINE_STATIC_KEY_FALSE(arch_hybrid_cap_scale_key); + +struct arch_hybrid_cpu_scale { + unsigned long capacity; + unsigned long freq_ratio; +}; + +static struct arch_hybrid_cpu_scale __percpu *arch_cpu_scale; + +/** + * arch_enable_hybrid_capacity_scale() - Enable hybrid CPU capacity scaling + * + * Allocate memory for per-CPU data used by hybrid CPU capacity scaling, + * initialize it and set the static key controlling its code paths. + * + * Must be called before arch_set_cpu_capacity(). + */ +bool arch_enable_hybrid_capacity_scale(void) +{ + int cpu; + + if (static_branch_unlikely(&arch_hybrid_cap_scale_key)) { + WARN_ONCE(1, "Hybrid CPU capacity scaling already enabled"); + return true; + } + + arch_cpu_scale = alloc_percpu(struct arch_hybrid_cpu_scale); + if (!arch_cpu_scale) + return false; + + for_each_possible_cpu(cpu) { + per_cpu_ptr(arch_cpu_scale, cpu)->capacity = SCHED_CAPACITY_SCALE; + per_cpu_ptr(arch_cpu_scale, cpu)->freq_ratio = arch_max_freq_ratio; + } + + static_branch_enable(&arch_hybrid_cap_scale_key); + + pr_info("Hybrid CPU capacity scaling enabled\n"); + + return true; +} + +/** + * arch_set_cpu_capacity() - Set scale-invariance parameters for a CPU + * @cpu: Target CPU. + * @cap: Capacity of @cpu at its maximum frequency, relative to @max_cap. + * @max_cap: System-wide maximum CPU capacity. + * @cap_freq: Frequency of @cpu corresponding to @cap. + * @base_freq: Frequency of @cpu at which MPERF counts. + * + * The units in which @cap and @max_cap are expressed do not matter, so long + * as they are consistent, because the former is effectively divided by the + * latter. Analogously for @cap_freq and @base_freq. + * + * After calling this function for all CPUs, call arch_rebuild_sched_domains() + * to let the scheduler know that capacity-aware scheduling can be used going + * forward. + */ +void arch_set_cpu_capacity(int cpu, unsigned long cap, unsigned long max_cap, + unsigned long cap_freq, unsigned long base_freq) +{ + if (static_branch_likely(&arch_hybrid_cap_scale_key)) { + WRITE_ONCE(per_cpu_ptr(arch_cpu_scale, cpu)->capacity, + div_u64(cap << SCHED_CAPACITY_SHIFT, max_cap)); + WRITE_ONCE(per_cpu_ptr(arch_cpu_scale, cpu)->freq_ratio, + div_u64(cap_freq << SCHED_CAPACITY_SHIFT, base_freq)); + } else { + WARN_ONCE(1, "Hybrid CPU capacity scaling not enabled"); + } +} + +unsigned long arch_scale_cpu_capacity(int cpu) +{ + if (static_branch_unlikely(&arch_hybrid_cap_scale_key)) + return READ_ONCE(per_cpu_ptr(arch_cpu_scale, cpu)->capacity); + + return SCHED_CAPACITY_SCALE; +} +EXPORT_SYMBOL_GPL(arch_scale_cpu_capacity); + static void scale_freq_tick(u64 acnt, u64 mcnt) { - u64 freq_scale; + u64 freq_scale, freq_ratio; if (!arch_scale_freq_invariant()) return; @@ -357,7 +439,12 @@ static void scale_freq_tick(u64 acnt, u64 mcnt) if (check_shl_overflow(acnt, 2*SCHED_CAPACITY_SHIFT, &acnt)) goto error; - if (check_mul_overflow(mcnt, arch_max_freq_ratio, &mcnt) || !mcnt) + if (static_branch_unlikely(&arch_hybrid_cap_scale_key)) + freq_ratio = READ_ONCE(this_cpu_ptr(arch_cpu_scale)->freq_ratio); + else + freq_ratio = arch_max_freq_ratio; + + if (check_mul_overflow(mcnt, freq_ratio, &mcnt) || !mcnt) goto error; freq_scale = div64_u64(acnt, mcnt); diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 45675da354f3..d1915427b4ff 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -233,7 +233,8 @@ static void x86_amd_ssb_disable(void) #define pr_fmt(fmt) "MDS: " fmt /* Default mitigation for MDS-affected CPUs */ -static enum mds_mitigations mds_mitigation __ro_after_init = MDS_MITIGATION_FULL; +static enum mds_mitigations mds_mitigation __ro_after_init = + IS_ENABLED(CONFIG_MITIGATION_MDS) ? MDS_MITIGATION_FULL : MDS_MITIGATION_OFF; static bool mds_nosmt __ro_after_init = false; static const char * const mds_strings[] = { @@ -293,7 +294,8 @@ enum taa_mitigations { }; /* Default mitigation for TAA-affected CPUs */ -static enum taa_mitigations taa_mitigation __ro_after_init = TAA_MITIGATION_VERW; +static enum taa_mitigations taa_mitigation __ro_after_init = + IS_ENABLED(CONFIG_MITIGATION_TAA) ? TAA_MITIGATION_VERW : TAA_MITIGATION_OFF; static bool taa_nosmt __ro_after_init; static const char * const taa_strings[] = { @@ -391,7 +393,8 @@ enum mmio_mitigations { }; /* Default mitigation for Processor MMIO Stale Data vulnerabilities */ -static enum mmio_mitigations mmio_mitigation __ro_after_init = MMIO_MITIGATION_VERW; +static enum mmio_mitigations mmio_mitigation __ro_after_init = + IS_ENABLED(CONFIG_MITIGATION_MMIO_STALE_DATA) ? MMIO_MITIGATION_VERW : MMIO_MITIGATION_OFF; static bool mmio_nosmt __ro_after_init = false; static const char * const mmio_strings[] = { @@ -605,7 +608,8 @@ enum srbds_mitigations { SRBDS_MITIGATION_HYPERVISOR, }; -static enum srbds_mitigations srbds_mitigation __ro_after_init = SRBDS_MITIGATION_FULL; +static enum srbds_mitigations srbds_mitigation __ro_after_init = + IS_ENABLED(CONFIG_MITIGATION_SRBDS) ? SRBDS_MITIGATION_FULL : SRBDS_MITIGATION_OFF; static const char * const srbds_strings[] = { [SRBDS_MITIGATION_OFF] = "Vulnerable", @@ -731,11 +735,8 @@ enum gds_mitigations { GDS_MITIGATION_HYPERVISOR, }; -#if IS_ENABLED(CONFIG_MITIGATION_GDS_FORCE) -static enum gds_mitigations gds_mitigation __ro_after_init = GDS_MITIGATION_FORCE; -#else -static enum gds_mitigations gds_mitigation __ro_after_init = GDS_MITIGATION_FULL; -#endif +static enum gds_mitigations gds_mitigation __ro_after_init = + IS_ENABLED(CONFIG_MITIGATION_GDS) ? GDS_MITIGATION_FULL : GDS_MITIGATION_OFF; static const char * const gds_strings[] = { [GDS_MITIGATION_OFF] = "Vulnerable", @@ -871,7 +872,8 @@ enum spectre_v1_mitigation { }; static enum spectre_v1_mitigation spectre_v1_mitigation __ro_after_init = - SPECTRE_V1_MITIGATION_AUTO; + IS_ENABLED(CONFIG_MITIGATION_SPECTRE_V1) ? + SPECTRE_V1_MITIGATION_AUTO : SPECTRE_V1_MITIGATION_NONE; static const char * const spectre_v1_strings[] = { [SPECTRE_V1_MITIGATION_NONE] = "Vulnerable: __user pointer sanitization and usercopy barriers only; no swapgs barriers", @@ -986,7 +988,7 @@ static const char * const retbleed_strings[] = { static enum retbleed_mitigation retbleed_mitigation __ro_after_init = RETBLEED_MITIGATION_NONE; static enum retbleed_mitigation_cmd retbleed_cmd __ro_after_init = - RETBLEED_CMD_AUTO; + IS_ENABLED(CONFIG_MITIGATION_RETBLEED) ? RETBLEED_CMD_AUTO : RETBLEED_CMD_OFF; static int __ro_after_init retbleed_nosmt = false; @@ -1447,17 +1449,18 @@ static void __init spec_v2_print_cond(const char *reason, bool secure) static enum spectre_v2_mitigation_cmd __init spectre_v2_parse_cmdline(void) { - enum spectre_v2_mitigation_cmd cmd = SPECTRE_V2_CMD_AUTO; + enum spectre_v2_mitigation_cmd cmd; char arg[20]; int ret, i; + cmd = IS_ENABLED(CONFIG_MITIGATION_SPECTRE_V2) ? SPECTRE_V2_CMD_AUTO : SPECTRE_V2_CMD_NONE; if (cmdline_find_option_bool(boot_command_line, "nospectre_v2") || cpu_mitigations_off()) return SPECTRE_V2_CMD_NONE; ret = cmdline_find_option(boot_command_line, "spectre_v2", arg, sizeof(arg)); if (ret < 0) - return SPECTRE_V2_CMD_AUTO; + return cmd; for (i = 0; i < ARRAY_SIZE(mitigation_options); i++) { if (!match_option(arg, ret, mitigation_options[i].option)) @@ -1467,8 +1470,8 @@ static enum spectre_v2_mitigation_cmd __init spectre_v2_parse_cmdline(void) } if (i >= ARRAY_SIZE(mitigation_options)) { - pr_err("unknown option (%s). Switching to AUTO select\n", arg); - return SPECTRE_V2_CMD_AUTO; + pr_err("unknown option (%s). Switching to default mode\n", arg); + return cmd; } if ((cmd == SPECTRE_V2_CMD_RETPOLINE || @@ -2021,10 +2024,12 @@ static const struct { static enum ssb_mitigation_cmd __init ssb_parse_cmdline(void) { - enum ssb_mitigation_cmd cmd = SPEC_STORE_BYPASS_CMD_AUTO; + enum ssb_mitigation_cmd cmd; char arg[20]; int ret, i; + cmd = IS_ENABLED(CONFIG_MITIGATION_SSB) ? + SPEC_STORE_BYPASS_CMD_AUTO : SPEC_STORE_BYPASS_CMD_NONE; if (cmdline_find_option_bool(boot_command_line, "nospec_store_bypass_disable") || cpu_mitigations_off()) { return SPEC_STORE_BYPASS_CMD_NONE; @@ -2032,7 +2037,7 @@ static enum ssb_mitigation_cmd __init ssb_parse_cmdline(void) ret = cmdline_find_option(boot_command_line, "spec_store_bypass_disable", arg, sizeof(arg)); if (ret < 0) - return SPEC_STORE_BYPASS_CMD_AUTO; + return cmd; for (i = 0; i < ARRAY_SIZE(ssb_mitigation_options); i++) { if (!match_option(arg, ret, ssb_mitigation_options[i].option)) @@ -2043,8 +2048,8 @@ static enum ssb_mitigation_cmd __init ssb_parse_cmdline(void) } if (i >= ARRAY_SIZE(ssb_mitigation_options)) { - pr_err("unknown option (%s). Switching to AUTO select\n", arg); - return SPEC_STORE_BYPASS_CMD_AUTO; + pr_err("unknown option (%s). Switching to default mode\n", arg); + return cmd; } } @@ -2371,7 +2376,8 @@ EXPORT_SYMBOL_GPL(itlb_multihit_kvm_mitigation); #define pr_fmt(fmt) "L1TF: " fmt /* Default mitigation for L1TF-affected CPUs */ -enum l1tf_mitigations l1tf_mitigation __ro_after_init = L1TF_MITIGATION_FLUSH; +enum l1tf_mitigations l1tf_mitigation __ro_after_init = + IS_ENABLED(CONFIG_MITIGATION_L1TF) ? L1TF_MITIGATION_FLUSH : L1TF_MITIGATION_OFF; #if IS_ENABLED(CONFIG_KVM_INTEL) EXPORT_SYMBOL_GPL(l1tf_mitigation); #endif @@ -2551,10 +2557,9 @@ static void __init srso_select_mitigation(void) { bool has_microcode = boot_cpu_has(X86_FEATURE_IBPB_BRTYPE); - if (cpu_mitigations_off()) - return; - - if (!boot_cpu_has_bug(X86_BUG_SRSO)) { + if (!boot_cpu_has_bug(X86_BUG_SRSO) || + cpu_mitigations_off() || + srso_cmd == SRSO_CMD_OFF) { if (boot_cpu_has(X86_FEATURE_SBPB)) x86_pred_cmd = PRED_CMD_SBPB; return; @@ -2585,11 +2590,6 @@ static void __init srso_select_mitigation(void) } switch (srso_cmd) { - case SRSO_CMD_OFF: - if (boot_cpu_has(X86_FEATURE_SBPB)) - x86_pred_cmd = PRED_CMD_SBPB; - return; - case SRSO_CMD_MICROCODE: if (has_microcode) { srso_mitigation = SRSO_MITIGATION_MICROCODE; @@ -2643,6 +2643,8 @@ static void __init srso_select_mitigation(void) pr_err("WARNING: kernel not compiled with MITIGATION_SRSO.\n"); } break; + default: + break; } out: diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index d4e539d4e158..be307c9ef263 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1165,8 +1165,8 @@ static const __initconst struct x86_cpu_id cpu_vuln_whitelist[] = { VULNWL_INTEL(INTEL_CORE_YONAH, NO_SSB), - VULNWL_INTEL(INTEL_ATOM_AIRMONT_MID, NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT), - VULNWL_INTEL(INTEL_ATOM_AIRMONT_NP, NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT), + VULNWL_INTEL(INTEL_ATOM_AIRMONT_MID, NO_SSB | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT | MSBDS_ONLY), + VULNWL_INTEL(INTEL_ATOM_AIRMONT_NP, NO_SSB | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT), VULNWL_INTEL(INTEL_ATOM_GOLDMONT, NO_MDS | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO), VULNWL_INTEL(INTEL_ATOM_GOLDMONT_D, NO_MDS | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO), diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 08b95a35b5cb..e7656cbef68d 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -311,16 +311,18 @@ static void early_init_intel(struct cpuinfo_x86 *c) } /* - * There is a known erratum on Pentium III and Core Solo - * and Core Duo CPUs. - * " Page with PAT set to WC while associated MTRR is UC - * may consolidate to UC " - * Because of this erratum, it is better to stick with - * setting WC in MTRR rather than using PAT on these CPUs. + * PAT is broken on early family 6 CPUs, the last of which + * is "Yonah" where the erratum is named "AN7": * - * Enable PAT WC only on P4, Core 2 or later CPUs. + * Page with PAT (Page Attribute Table) Set to USWC + * (Uncacheable Speculative Write Combine) While + * Associated MTRR (Memory Type Range Register) Is UC + * (Uncacheable) May Consolidate to UC + * + * Disable PAT and fall back to MTRR on these CPUs. */ - if (c->x86 == 6 && c->x86_model < 15) + if (c->x86_vfm >= INTEL_PENTIUM_PRO && + c->x86_vfm <= INTEL_CORE_YONAH) clear_cpu_cap(c, X86_FEATURE_PAT); /* diff --git a/arch/x86/kernel/cpu/mce/amd.c b/arch/x86/kernel/cpu/mce/amd.c index 9a0133ef7e20..14bf8c232e45 100644 --- a/arch/x86/kernel/cpu/mce/amd.c +++ b/arch/x86/kernel/cpu/mce/amd.c @@ -780,7 +780,7 @@ static void __log_error(unsigned int bank, u64 status, u64 addr, u64 misc) { struct mce m; - mce_setup(&m); + mce_prep_record(&m); m.status = status; m.misc = misc; diff --git a/arch/x86/kernel/cpu/mce/apei.c b/arch/x86/kernel/cpu/mce/apei.c index 7f7309ff67d0..3885fe05f01e 100644 --- a/arch/x86/kernel/cpu/mce/apei.c +++ b/arch/x86/kernel/cpu/mce/apei.c @@ -44,7 +44,7 @@ void apei_mce_report_mem_error(int severity, struct cper_sec_mem_err *mem_err) else lsb = PAGE_SHIFT; - mce_setup(&m); + mce_prep_record(&m); m.bank = -1; /* Fake a memory read error with unknown channel */ m.status = MCI_STATUS_VAL | MCI_STATUS_EN | MCI_STATUS_ADDRV | MCI_STATUS_MISCV | 0x9f; @@ -66,6 +66,7 @@ EXPORT_SYMBOL_GPL(apei_mce_report_mem_error); int apei_smca_report_x86_error(struct cper_ia_proc_ctx *ctx_info, u64 lapic_id) { const u64 *i_mce = ((const u64 *) (ctx_info + 1)); + bool apicid_found = false; unsigned int cpu; struct mce m; @@ -97,20 +98,19 @@ int apei_smca_report_x86_error(struct cper_ia_proc_ctx *ctx_info, u64 lapic_id) if (ctx_info->reg_arr_size < 48) return -EINVAL; - mce_setup(&m); - - m.extcpu = -1; - m.socketid = -1; - for_each_possible_cpu(cpu) { if (cpu_data(cpu).topo.initial_apicid == lapic_id) { - m.extcpu = cpu; - m.socketid = cpu_data(m.extcpu).topo.pkg_id; + apicid_found = true; break; } } - m.apicid = lapic_id; + if (!apicid_found) + return -EINVAL; + + mce_prep_record_common(&m); + mce_prep_record_per_cpu(cpu, &m); + m.bank = (ctx_info->msr_addr >> 4) & 0xFF; m.status = *i_mce; m.addr = *(i_mce + 1); diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index b85ec7a4ec9e..2a938f429c4d 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -117,20 +117,32 @@ static struct irq_work mce_irq_work; */ BLOCKING_NOTIFIER_HEAD(x86_mce_decoder_chain); -/* Do initial initialization of a struct mce */ -void mce_setup(struct mce *m) +void mce_prep_record_common(struct mce *m) { memset(m, 0, sizeof(struct mce)); - m->cpu = m->extcpu = smp_processor_id(); + + m->cpuid = cpuid_eax(1); + m->cpuvendor = boot_cpu_data.x86_vendor; + m->mcgcap = __rdmsr(MSR_IA32_MCG_CAP); /* need the internal __ version to avoid deadlocks */ - m->time = __ktime_get_real_seconds(); - m->cpuvendor = boot_cpu_data.x86_vendor; - m->cpuid = cpuid_eax(1); - m->socketid = cpu_data(m->extcpu).topo.pkg_id; - m->apicid = cpu_data(m->extcpu).topo.initial_apicid; - m->mcgcap = __rdmsr(MSR_IA32_MCG_CAP); - m->ppin = cpu_data(m->extcpu).ppin; - m->microcode = boot_cpu_data.microcode; + m->time = __ktime_get_real_seconds(); +} + +void mce_prep_record_per_cpu(unsigned int cpu, struct mce *m) +{ + m->cpu = cpu; + m->extcpu = cpu; + m->apicid = cpu_data(cpu).topo.initial_apicid; + m->microcode = cpu_data(cpu).microcode; + m->ppin = topology_ppin(cpu); + m->socketid = topology_physical_package_id(cpu); +} + +/* Do initial initialization of a struct mce */ +void mce_prep_record(struct mce *m) +{ + mce_prep_record_common(m); + mce_prep_record_per_cpu(smp_processor_id(), m); } DEFINE_PER_CPU(struct mce, injectm); @@ -436,11 +448,11 @@ static noinstr void mce_wrmsrl(u32 msr, u64 v) static noinstr void mce_gather_info(struct mce *m, struct pt_regs *regs) { /* - * Enable instrumentation around mce_setup() which calls external + * Enable instrumentation around mce_prep_record() which calls external * facilities. */ instrumentation_begin(); - mce_setup(m); + mce_prep_record(m); instrumentation_end(); m->mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS); diff --git a/arch/x86/kernel/cpu/mce/internal.h b/arch/x86/kernel/cpu/mce/internal.h index 01f8f03969e6..43c7f3b71df5 100644 --- a/arch/x86/kernel/cpu/mce/internal.h +++ b/arch/x86/kernel/cpu/mce/internal.h @@ -261,6 +261,8 @@ enum mca_msr { /* Decide whether to add MCE record to MCE event pool or filter it out. */ extern bool filter_mce(struct mce *m); +void mce_prep_record_common(struct mce *m); +void mce_prep_record_per_cpu(unsigned int cpu, struct mce *m); #ifdef CONFIG_X86_MCE_AMD extern bool amd_filter_mce(struct mce *m); diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c index c0d56c02b8da..f63b051f25a0 100644 --- a/arch/x86/kernel/cpu/microcode/amd.c +++ b/arch/x86/kernel/cpu/microcode/amd.c @@ -89,6 +89,31 @@ static struct equiv_cpu_table { struct equiv_cpu_entry *entry; } equiv_table; +union zen_patch_rev { + struct { + __u32 rev : 8, + stepping : 4, + model : 4, + __reserved : 4, + ext_model : 4, + ext_fam : 8; + }; + __u32 ucode_rev; +}; + +union cpuid_1_eax { + struct { + __u32 stepping : 4, + model : 4, + family : 4, + __reserved0 : 4, + ext_model : 4, + ext_fam : 8, + __reserved1 : 4; + }; + __u32 full; +}; + /* * This points to the current valid container of microcode patches which we will * save from the initrd/builtin before jettisoning its contents. @mc is the @@ -96,7 +121,6 @@ static struct equiv_cpu_table { */ struct cont_desc { struct microcode_amd *mc; - u32 cpuid_1_eax; u32 psize; u8 *data; size_t size; @@ -109,10 +133,42 @@ struct cont_desc { static const char ucode_path[] __maybe_unused = "kernel/x86/microcode/AuthenticAMD.bin"; +/* + * This is CPUID(1).EAX on the BSP. It is used in two ways: + * + * 1. To ignore the equivalence table on Zen1 and newer. + * + * 2. To match which patches to load because the patch revision ID + * already contains the f/m/s for which the microcode is destined + * for. + */ +static u32 bsp_cpuid_1_eax __ro_after_init; + +static union cpuid_1_eax ucode_rev_to_cpuid(unsigned int val) +{ + union zen_patch_rev p; + union cpuid_1_eax c; + + p.ucode_rev = val; + c.full = 0; + + c.stepping = p.stepping; + c.model = p.model; + c.ext_model = p.ext_model; + c.family = 0xf; + c.ext_fam = p.ext_fam; + + return c; +} + static u16 find_equiv_id(struct equiv_cpu_table *et, u32 sig) { unsigned int i; + /* Zen and newer do not need an equivalence table. */ + if (x86_family(bsp_cpuid_1_eax) >= 0x17) + return 0; + if (!et || !et->num_entries) return 0; @@ -159,6 +215,10 @@ static bool verify_equivalence_table(const u8 *buf, size_t buf_size) if (!verify_container(buf, buf_size)) return false; + /* Zen and newer do not need an equivalence table. */ + if (x86_family(bsp_cpuid_1_eax) >= 0x17) + return true; + cont_type = hdr[1]; if (cont_type != UCODE_EQUIV_CPU_TABLE_TYPE) { pr_debug("Wrong microcode container equivalence table type: %u.\n", @@ -222,8 +282,9 @@ __verify_patch_section(const u8 *buf, size_t buf_size, u32 *sh_psize) * exceed the per-family maximum). @sh_psize is the size read from the section * header. */ -static unsigned int __verify_patch_size(u8 family, u32 sh_psize, size_t buf_size) +static unsigned int __verify_patch_size(u32 sh_psize, size_t buf_size) { + u8 family = x86_family(bsp_cpuid_1_eax); u32 max_size; if (family >= 0x15) @@ -258,9 +319,9 @@ static unsigned int __verify_patch_size(u8 family, u32 sh_psize, size_t buf_size * positive: patch is not for this family, skip it * 0: success */ -static int -verify_patch(u8 family, const u8 *buf, size_t buf_size, u32 *patch_size) +static int verify_patch(const u8 *buf, size_t buf_size, u32 *patch_size) { + u8 family = x86_family(bsp_cpuid_1_eax); struct microcode_header_amd *mc_hdr; unsigned int ret; u32 sh_psize; @@ -286,7 +347,7 @@ verify_patch(u8 family, const u8 *buf, size_t buf_size, u32 *patch_size) return -1; } - ret = __verify_patch_size(family, sh_psize, buf_size); + ret = __verify_patch_size(sh_psize, buf_size); if (!ret) { pr_debug("Per-family patch size mismatch.\n"); return -1; @@ -308,6 +369,15 @@ verify_patch(u8 family, const u8 *buf, size_t buf_size, u32 *patch_size) return 0; } +static bool mc_patch_matches(struct microcode_amd *mc, u16 eq_id) +{ + /* Zen and newer do not need an equivalence table. */ + if (x86_family(bsp_cpuid_1_eax) >= 0x17) + return ucode_rev_to_cpuid(mc->hdr.patch_id).full == bsp_cpuid_1_eax; + else + return eq_id == mc->hdr.processor_rev_id; +} + /* * This scans the ucode blob for the proper container as we can have multiple * containers glued together. Returns the equivalence ID from the equivalence @@ -336,7 +406,7 @@ static size_t parse_container(u8 *ucode, size_t size, struct cont_desc *desc) * doesn't contain a patch for the CPU, scan through the whole container * so that it can be skipped in case there are other containers appended. */ - eq_id = find_equiv_id(&table, desc->cpuid_1_eax); + eq_id = find_equiv_id(&table, bsp_cpuid_1_eax); buf += hdr[2] + CONTAINER_HDR_SZ; size -= hdr[2] + CONTAINER_HDR_SZ; @@ -350,7 +420,7 @@ static size_t parse_container(u8 *ucode, size_t size, struct cont_desc *desc) u32 patch_size; int ret; - ret = verify_patch(x86_family(desc->cpuid_1_eax), buf, size, &patch_size); + ret = verify_patch(buf, size, &patch_size); if (ret < 0) { /* * Patch verification failed, skip to the next container, if @@ -363,7 +433,7 @@ static size_t parse_container(u8 *ucode, size_t size, struct cont_desc *desc) } mc = (struct microcode_amd *)(buf + SECTION_HDR_SIZE); - if (eq_id == mc->hdr.processor_rev_id) { + if (mc_patch_matches(mc, eq_id)) { desc->psize = patch_size; desc->mc = mc; } @@ -421,6 +491,7 @@ static int __apply_microcode_amd(struct microcode_amd *mc) /* verify patch application was successful */ native_rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy); + if (rev != mc->hdr.patch_id) return -1; @@ -438,14 +509,12 @@ static int __apply_microcode_amd(struct microcode_amd *mc) * * Returns true if container found (sets @desc), false otherwise. */ -static bool early_apply_microcode(u32 cpuid_1_eax, u32 old_rev, void *ucode, size_t size) +static bool early_apply_microcode(u32 old_rev, void *ucode, size_t size) { struct cont_desc desc = { 0 }; struct microcode_amd *mc; bool ret = false; - desc.cpuid_1_eax = cpuid_1_eax; - scan_containers(ucode, size, &desc); mc = desc.mc; @@ -463,9 +532,10 @@ static bool early_apply_microcode(u32 cpuid_1_eax, u32 old_rev, void *ucode, siz return !__apply_microcode_amd(mc); } -static bool get_builtin_microcode(struct cpio_data *cp, u8 family) +static bool get_builtin_microcode(struct cpio_data *cp) { char fw_name[36] = "amd-ucode/microcode_amd.bin"; + u8 family = x86_family(bsp_cpuid_1_eax); struct firmware fw; if (IS_ENABLED(CONFIG_X86_32)) @@ -484,11 +554,11 @@ static bool get_builtin_microcode(struct cpio_data *cp, u8 family) return false; } -static void __init find_blobs_in_containers(unsigned int cpuid_1_eax, struct cpio_data *ret) +static void __init find_blobs_in_containers(struct cpio_data *ret) { struct cpio_data cp; - if (!get_builtin_microcode(&cp, x86_family(cpuid_1_eax))) + if (!get_builtin_microcode(&cp)) cp = find_microcode_in_initrd(ucode_path); *ret = cp; @@ -499,16 +569,18 @@ void __init load_ucode_amd_bsp(struct early_load_data *ed, unsigned int cpuid_1_ struct cpio_data cp = { }; u32 dummy; + bsp_cpuid_1_eax = cpuid_1_eax; + native_rdmsr(MSR_AMD64_PATCH_LEVEL, ed->old_rev, dummy); /* Needed in load_microcode_amd() */ ucode_cpu_info[0].cpu_sig.sig = cpuid_1_eax; - find_blobs_in_containers(cpuid_1_eax, &cp); + find_blobs_in_containers(&cp); if (!(cp.data && cp.size)) return; - if (early_apply_microcode(cpuid_1_eax, ed->old_rev, cp.data, cp.size)) + if (early_apply_microcode(ed->old_rev, cp.data, cp.size)) native_rdmsr(MSR_AMD64_PATCH_LEVEL, ed->new_rev, dummy); } @@ -525,12 +597,10 @@ static int __init save_microcode_in_initrd(void) if (dis_ucode_ldr || c->x86_vendor != X86_VENDOR_AMD || c->x86 < 0x10) return 0; - find_blobs_in_containers(cpuid_1_eax, &cp); + find_blobs_in_containers(&cp); if (!(cp.data && cp.size)) return -EINVAL; - desc.cpuid_1_eax = cpuid_1_eax; - scan_containers(cp.data, cp.size, &desc); if (!desc.mc) return -EINVAL; @@ -543,26 +613,65 @@ static int __init save_microcode_in_initrd(void) } early_initcall(save_microcode_in_initrd); +static inline bool patch_cpus_equivalent(struct ucode_patch *p, struct ucode_patch *n) +{ + /* Zen and newer hardcode the f/m/s in the patch ID */ + if (x86_family(bsp_cpuid_1_eax) >= 0x17) { + union cpuid_1_eax p_cid = ucode_rev_to_cpuid(p->patch_id); + union cpuid_1_eax n_cid = ucode_rev_to_cpuid(n->patch_id); + + /* Zap stepping */ + p_cid.stepping = 0; + n_cid.stepping = 0; + + return p_cid.full == n_cid.full; + } else { + return p->equiv_cpu == n->equiv_cpu; + } +} + /* * a small, trivial cache of per-family ucode patches */ -static struct ucode_patch *cache_find_patch(u16 equiv_cpu) +static struct ucode_patch *cache_find_patch(struct ucode_cpu_info *uci, u16 equiv_cpu) { struct ucode_patch *p; + struct ucode_patch n; + + n.equiv_cpu = equiv_cpu; + n.patch_id = uci->cpu_sig.rev; + + WARN_ON_ONCE(!n.patch_id); list_for_each_entry(p, µcode_cache, plist) - if (p->equiv_cpu == equiv_cpu) + if (patch_cpus_equivalent(p, &n)) return p; + return NULL; } +static inline bool patch_newer(struct ucode_patch *p, struct ucode_patch *n) +{ + /* Zen and newer hardcode the f/m/s in the patch ID */ + if (x86_family(bsp_cpuid_1_eax) >= 0x17) { + union zen_patch_rev zp, zn; + + zp.ucode_rev = p->patch_id; + zn.ucode_rev = n->patch_id; + + return zn.rev > zp.rev; + } else { + return n->patch_id > p->patch_id; + } +} + static void update_cache(struct ucode_patch *new_patch) { struct ucode_patch *p; list_for_each_entry(p, µcode_cache, plist) { - if (p->equiv_cpu == new_patch->equiv_cpu) { - if (p->patch_id >= new_patch->patch_id) { + if (patch_cpus_equivalent(p, new_patch)) { + if (!patch_newer(p, new_patch)) { /* we already have the latest patch */ kfree(new_patch->data); kfree(new_patch); @@ -593,13 +702,22 @@ static void free_cache(void) static struct ucode_patch *find_patch(unsigned int cpu) { struct ucode_cpu_info *uci = ucode_cpu_info + cpu; - u16 equiv_id; + u32 rev, dummy __always_unused; + u16 equiv_id = 0; - equiv_id = find_equiv_id(&equiv_table, uci->cpu_sig.sig); - if (!equiv_id) - return NULL; + /* fetch rev if not populated yet: */ + if (!uci->cpu_sig.rev) { + rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy); + uci->cpu_sig.rev = rev; + } + + if (x86_family(bsp_cpuid_1_eax) < 0x17) { + equiv_id = find_equiv_id(&equiv_table, uci->cpu_sig.sig); + if (!equiv_id) + return NULL; + } - return cache_find_patch(equiv_id); + return cache_find_patch(uci, equiv_id); } void reload_ucode_amd(unsigned int cpu) @@ -649,7 +767,7 @@ static enum ucode_state apply_microcode_amd(int cpu) struct ucode_cpu_info *uci; struct ucode_patch *p; enum ucode_state ret; - u32 rev, dummy __always_unused; + u32 rev; BUG_ON(raw_smp_processor_id() != cpu); @@ -659,11 +777,11 @@ static enum ucode_state apply_microcode_amd(int cpu) if (!p) return UCODE_NFOUND; + rev = uci->cpu_sig.rev; + mc_amd = p->data; uci->mc = p->data; - rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy); - /* need to apply patch? */ if (rev > mc_amd->hdr.patch_id) { ret = UCODE_OK; @@ -709,6 +827,10 @@ static size_t install_equiv_cpu_table(const u8 *buf, size_t buf_size) hdr = (const u32 *)buf; equiv_tbl_len = hdr[2]; + /* Zen and newer do not need an equivalence table. */ + if (x86_family(bsp_cpuid_1_eax) >= 0x17) + goto out; + equiv_table.entry = vmalloc(equiv_tbl_len); if (!equiv_table.entry) { pr_err("failed to allocate equivalent CPU table\n"); @@ -718,12 +840,16 @@ static size_t install_equiv_cpu_table(const u8 *buf, size_t buf_size) memcpy(equiv_table.entry, buf + CONTAINER_HDR_SZ, equiv_tbl_len); equiv_table.num_entries = equiv_tbl_len / sizeof(struct equiv_cpu_entry); +out: /* add header length */ return equiv_tbl_len + CONTAINER_HDR_SZ; } static void free_equiv_cpu_table(void) { + if (x86_family(bsp_cpuid_1_eax) >= 0x17) + return; + vfree(equiv_table.entry); memset(&equiv_table, 0, sizeof(equiv_table)); } @@ -749,7 +875,7 @@ static int verify_and_add_patch(u8 family, u8 *fw, unsigned int leftover, u16 proc_id; int ret; - ret = verify_patch(family, fw, leftover, patch_size); + ret = verify_patch(fw, leftover, patch_size); if (ret) return ret; @@ -774,7 +900,7 @@ static int verify_and_add_patch(u8 family, u8 *fw, unsigned int leftover, patch->patch_id = mc_hdr->patch_id; patch->equiv_cpu = proc_id; - pr_debug("%s: Added patch_id: 0x%08x, proc_id: 0x%04x\n", + pr_debug("%s: Adding patch_id: 0x%08x, proc_id: 0x%04x\n", __func__, patch->patch_id, proc_id); /* ... and add to cache. */ diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c index e0fd57a8ba84..ead967479fa6 100644 --- a/arch/x86/kernel/cpu/mshyperv.c +++ b/arch/x86/kernel/cpu/mshyperv.c @@ -199,8 +199,8 @@ static void hv_machine_shutdown(void) * Call hv_cpu_die() on all the CPUs, otherwise later the hypervisor * corrupts the old VP Assist Pages and can crash the kexec kernel. */ - if (kexec_in_progress && hyperv_init_cpuhp > 0) - cpuhp_remove_state(hyperv_init_cpuhp); + if (kexec_in_progress) + cpuhp_remove_state(CPUHP_AP_HYPERV_ONLINE); /* The function calls stop_other_cpus(). */ native_machine_shutdown(); @@ -424,6 +424,7 @@ static void __init ms_hyperv_init_platform(void) ms_hyperv.misc_features & HV_FEATURE_FREQUENCY_MSRS_AVAILABLE) { x86_platform.calibrate_tsc = hv_get_tsc_khz; x86_platform.calibrate_cpu = hv_get_tsc_khz; + setup_force_cpu_cap(X86_FEATURE_TSC_KNOWN_FREQ); } if (ms_hyperv.priv_high & HV_ISOLATION) { @@ -449,9 +450,23 @@ static void __init ms_hyperv_init_platform(void) ms_hyperv.hints &= ~HV_X64_APIC_ACCESS_RECOMMENDED; if (!ms_hyperv.paravisor_present) { - /* To be supported: more work is required. */ + /* + * Mark the Hyper-V TSC page feature as disabled + * in a TDX VM without paravisor so that the + * Invariant TSC, which is a better clocksource + * anyway, is used instead. + */ ms_hyperv.features &= ~HV_MSR_REFERENCE_TSC_AVAILABLE; + /* + * The Invariant TSC is expected to be available + * in a TDX VM without paravisor, but if not, + * print a warning message. The slower Hyper-V MSR-based + * Ref Counter should end up being the clocksource. + */ + if (!(ms_hyperv.features & HV_ACCESS_TSC_INVARIANT)) + pr_warn("Hyper-V: Invariant TSC is unavailable\n"); + /* HV_MSR_CRASH_CTL is unsupported. */ ms_hyperv.misc_features &= ~HV_FEATURE_GUEST_CRASH_MSR_AVAILABLE; diff --git a/arch/x86/kernel/cpu/mtrr/mtrr.c b/arch/x86/kernel/cpu/mtrr/mtrr.c index 767bf1c71aad..2a2fc14955cd 100644 --- a/arch/x86/kernel/cpu/mtrr/mtrr.c +++ b/arch/x86/kernel/cpu/mtrr/mtrr.c @@ -609,7 +609,7 @@ void mtrr_save_state(void) { int first_cpu; - if (!mtrr_enabled()) + if (!mtrr_enabled() || !mtrr_state.have_fixed) return; first_cpu = cpumask_first(cpu_online_mask); diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c index 1930fce9dfe9..8591d53c144b 100644 --- a/arch/x86/kernel/cpu/resctrl/core.c +++ b/arch/x86/kernel/cpu/resctrl/core.c @@ -119,6 +119,14 @@ struct rdt_hw_resource rdt_resources_all[] = { }, }; +u32 resctrl_arch_system_num_rmid_idx(void) +{ + struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl; + + /* RMID are independent numbers for x86. num_rmid_idx == num_rmid */ + return r->num_rmid; +} + /* * cache_alloc_hsw_probe() - Have to probe for Intel haswell server CPUs * as they do not have CPUID enumeration support for Cache allocation. diff --git a/arch/x86/kernel/cpu/sgx/main.c b/arch/x86/kernel/cpu/sgx/main.c index 27892e57c4ef..f3f1461273ee 100644 --- a/arch/x86/kernel/cpu/sgx/main.c +++ b/arch/x86/kernel/cpu/sgx/main.c @@ -475,24 +475,25 @@ struct sgx_epc_page *__sgx_alloc_epc_page(void) { struct sgx_epc_page *page; int nid_of_current = numa_node_id(); - int nid = nid_of_current; + int nid_start, nid; - if (node_isset(nid_of_current, sgx_numa_mask)) { - page = __sgx_alloc_epc_page_from_node(nid_of_current); - if (page) - return page; - } - - /* Fall back to the non-local NUMA nodes: */ - while (true) { - nid = next_node_in(nid, sgx_numa_mask); - if (nid == nid_of_current) - break; + /* + * Try local node first. If it doesn't have an EPC section, + * fall back to the non-local NUMA nodes. + */ + if (node_isset(nid_of_current, sgx_numa_mask)) + nid_start = nid_of_current; + else + nid_start = next_node_in(nid_of_current, sgx_numa_mask); + nid = nid_start; + do { page = __sgx_alloc_epc_page_from_node(nid); if (page) return page; - } + + nid = next_node_in(nid, sgx_numa_mask); + } while (nid != nid_start); return ERR_PTR(-ENOMEM); } @@ -847,6 +848,13 @@ static bool __init sgx_page_cache_init(void) return false; } + for_each_online_node(nid) { + if (!node_isset(nid, sgx_numa_mask) && + node_state(nid, N_MEMORY) && node_state(nid, N_CPU)) + pr_info("node%d has both CPUs and memory but doesn't have an EPC section\n", + nid); + } + return true; } diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index c5a026fee5e0..1339f8328db5 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -788,6 +788,9 @@ void __init fpu__init_system_xstate(unsigned int legacy_size) goto out_disable; } + fpu_kernel_cfg.independent_features = fpu_kernel_cfg.max_features & + XFEATURE_MASK_INDEPENDENT; + /* * Clear XSAVE features that are disabled in the normal CPUID. */ diff --git a/arch/x86/kernel/fpu/xstate.h b/arch/x86/kernel/fpu/xstate.h index 2ee0b9c53dcc..afb404cd2059 100644 --- a/arch/x86/kernel/fpu/xstate.h +++ b/arch/x86/kernel/fpu/xstate.h @@ -62,9 +62,9 @@ static inline u64 xfeatures_mask_supervisor(void) static inline u64 xfeatures_mask_independent(void) { if (!cpu_feature_enabled(X86_FEATURE_ARCH_LBR)) - return XFEATURE_MASK_INDEPENDENT & ~XFEATURE_MASK_LBR; + return fpu_kernel_cfg.independent_features & ~XFEATURE_MASK_LBR; - return XFEATURE_MASK_INDEPENDENT; + return fpu_kernel_cfg.independent_features; } /* XSAVE/XRSTOR wrapper functions */ diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index 5358d43886ad..fec381533555 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -51,13 +51,12 @@ DEFINE_ASM_FUNC(pv_native_irq_enable, "sti", .noinstr.text); DEFINE_ASM_FUNC(pv_native_read_cr2, "mov %cr2, %rax", .noinstr.text); #endif -DEFINE_STATIC_KEY_TRUE(virt_spin_lock_key); +DEFINE_STATIC_KEY_FALSE(virt_spin_lock_key); void __init native_pv_lock_init(void) { - if (IS_ENABLED(CONFIG_PARAVIRT_SPINLOCKS) && - !boot_cpu_has(X86_FEATURE_HYPERVISOR)) - static_branch_disable(&virt_spin_lock_key); + if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) + static_branch_enable(&virt_spin_lock_key); } static void native_tlb_remove_table(struct mmu_gather *tlb, void *table) diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 5d34cad9b7b1..6129dc2ba784 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -164,7 +164,7 @@ unsigned long saved_video_mode; static char __initdata command_line[COMMAND_LINE_SIZE]; #ifdef CONFIG_CMDLINE_BOOL -static char __initdata builtin_cmdline[COMMAND_LINE_SIZE] = CONFIG_CMDLINE; +char builtin_cmdline[COMMAND_LINE_SIZE] = CONFIG_CMDLINE; bool builtin_cmdline_added __ro_after_init; #endif diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index 4287a8071a3a..730c2f34d347 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -19,7 +19,6 @@ if VIRTUALIZATION config KVM tristate "Kernel-based Virtual Machine (KVM) support" - depends on HIGH_RES_TIMERS depends on X86_LOCAL_APIC select KVM_COMMON select KVM_GENERIC_MMU_NOTIFIER @@ -141,11 +140,13 @@ config KVM_AMD_SEV depends on CRYPTO_DEV_SP_PSP && !(KVM_AMD=y && CRYPTO_DEV_CCP_DD=m) select ARCH_HAS_CC_PLATFORM select KVM_GENERIC_PRIVATE_MEM - select HAVE_KVM_GMEM_PREPARE - select HAVE_KVM_GMEM_INVALIDATE + select HAVE_KVM_ARCH_GMEM_PREPARE + select HAVE_KVM_ARCH_GMEM_INVALIDATE help - Provides support for launching Encrypted VMs (SEV) and Encrypted VMs - with Encrypted State (SEV-ES) on AMD processors. + Provides support for launching encrypted VMs which use Secure + Encrypted Virtualization (SEV), Secure Encrypted Virtualization with + Encrypted State (SEV-ES), and Secure Encrypted Virtualization with + Secure Nested Paging (SEV-SNP) technologies on AMD processors. config KVM_SMM bool "System Management Mode emulation" diff --git a/arch/x86/kvm/hyperv.h b/arch/x86/kvm/hyperv.h index 923e64903da9..913bfc96959c 100644 --- a/arch/x86/kvm/hyperv.h +++ b/arch/x86/kvm/hyperv.h @@ -286,7 +286,6 @@ static inline int kvm_hv_hypercall(struct kvm_vcpu *vcpu) return HV_STATUS_ACCESS_DENIED; } static inline void kvm_hv_vcpu_purge_flush_tlb(struct kvm_vcpu *vcpu) {} -static inline void kvm_hv_free_pa_page(struct kvm *kvm) {} static inline bool kvm_hv_synic_has_vector(struct kvm_vcpu *vcpu, int vector) { return false; diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index a7172ba59ad2..5bb481aefcbc 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -351,10 +351,8 @@ static void kvm_recalculate_logical_map(struct kvm_apic_map *new, * reversing the LDR calculation to get cluster of APICs, i.e. no * additional work is required. */ - if (apic_x2apic_mode(apic)) { - WARN_ON_ONCE(ldr != kvm_apic_calc_x2apic_ldr(kvm_x2apic_id(apic))); + if (apic_x2apic_mode(apic)) return; - } if (WARN_ON_ONCE(!kvm_apic_map_get_logical_dest(new, ldr, &cluster, &mask))) { @@ -1743,7 +1741,7 @@ static void limit_periodic_timer_frequency(struct kvm_lapic *apic) s64 min_period = min_timer_period_us * 1000LL; if (apic->lapic_timer.period < min_period) { - pr_info_ratelimited( + pr_info_once( "vcpu %i: requested %lld ns " "lapic timer period limited to %lld ns\n", apic->vcpu->vcpu_id, @@ -2966,18 +2964,28 @@ static int kvm_apic_state_fixup(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s, bool set) { if (apic_x2apic_mode(vcpu->arch.apic)) { + u32 x2apic_id = kvm_x2apic_id(vcpu->arch.apic); u32 *id = (u32 *)(s->regs + APIC_ID); u32 *ldr = (u32 *)(s->regs + APIC_LDR); u64 icr; if (vcpu->kvm->arch.x2apic_format) { - if (*id != vcpu->vcpu_id) + if (*id != x2apic_id) return -EINVAL; } else { + /* + * Ignore the userspace value when setting APIC state. + * KVM's model is that the x2APIC ID is readonly, e.g. + * KVM only supports delivering interrupts to KVM's + * version of the x2APIC ID. However, for backwards + * compatibility, don't reject attempts to set a + * mismatched ID for userspace that hasn't opted into + * x2apic_format. + */ if (set) - *id >>= 24; + *id = x2apic_id; else - *id <<= 24; + *id = x2apic_id << 24; } /* @@ -2986,7 +2994,7 @@ static int kvm_apic_state_fixup(struct kvm_vcpu *vcpu, * split to ICR+ICR2 in userspace for backwards compatibility. */ if (set) { - *ldr = kvm_apic_calc_x2apic_ldr(*id); + *ldr = kvm_apic_calc_x2apic_ldr(x2apic_id); icr = __kvm_lapic_get_reg(s->regs, APIC_ICR) | (u64)__kvm_lapic_get_reg(s->regs, APIC_ICR2) << 32; diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 901be9e420a4..7813d28b082f 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -4335,7 +4335,7 @@ static u8 kvm_max_private_mapping_level(struct kvm *kvm, kvm_pfn_t pfn, if (req_max_level) max_level = min(max_level, req_max_level); - return req_max_level; + return max_level; } static int kvm_faultin_pfn_private(struct kvm_vcpu *vcpu, @@ -4674,16 +4674,14 @@ out_unlock: bool kvm_mmu_may_ignore_guest_pat(void) { /* - * When EPT is enabled (shadow_memtype_mask is non-zero), the CPU does - * not support self-snoop (or is affected by an erratum), and the VM + * When EPT is enabled (shadow_memtype_mask is non-zero), and the VM * has non-coherent DMA (DMA doesn't snoop CPU caches), KVM's ABI is to * honor the memtype from the guest's PAT so that guest accesses to * memory that is DMA'd aren't cached against the guest's wishes. As a * result, KVM _may_ ignore guest PAT, whereas without non-coherent DMA, - * KVM _always_ ignores or honors guest PAT, i.e. doesn't toggle SPTE - * bits in response to non-coherent device (un)registration. + * KVM _always_ ignores guest PAT (when EPT is enabled). */ - return !static_cpu_has(X86_FEATURE_SELFSNOOP) && shadow_memtype_mask; + return shadow_memtype_mask; } int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) @@ -4743,11 +4741,16 @@ long kvm_arch_vcpu_pre_fault_memory(struct kvm_vcpu *vcpu, u64 end; int r; + if (!vcpu->kvm->arch.pre_fault_allowed) + return -EOPNOTSUPP; + /* * reload is efficient when called repeatedly, so we can do it on * every iteration. */ - kvm_mmu_reload(vcpu); + r = kvm_mmu_reload(vcpu); + if (r) + return r; if (kvm_arch_has_private_mem(vcpu->kvm) && kvm_mem_is_private(vcpu->kvm, gpa_to_gfn(range->gpa))) @@ -7510,7 +7513,7 @@ static bool hugepage_has_attrs(struct kvm *kvm, struct kvm_memory_slot *slot, const unsigned long end = start + KVM_PAGES_PER_HPAGE(level); if (level == PG_LEVEL_2M) - return kvm_range_has_memory_attributes(kvm, start, end, attrs); + return kvm_range_has_memory_attributes(kvm, start, end, ~0, attrs); for (gfn = start; gfn < end; gfn += KVM_PAGES_PER_HPAGE(level - 1)) { if (hugepage_test_mixed(slot, gfn, level - 1) || diff --git a/arch/x86/kvm/mmu/spte.c b/arch/x86/kvm/mmu/spte.c index d4527965e48c..8f7eb3ad88fc 100644 --- a/arch/x86/kvm/mmu/spte.c +++ b/arch/x86/kvm/mmu/spte.c @@ -391,9 +391,9 @@ void kvm_mmu_set_mmio_spte_mask(u64 mmio_value, u64 mmio_mask, u64 access_mask) mmio_value = 0; /* - * The masked MMIO value must obviously match itself and a removed SPTE - * must not get a false positive. Removed SPTEs and MMIO SPTEs should - * never collide as MMIO must set some RWX bits, and removed SPTEs must + * The masked MMIO value must obviously match itself and a frozen SPTE + * must not get a false positive. Frozen SPTEs and MMIO SPTEs should + * never collide as MMIO must set some RWX bits, and frozen SPTEs must * not set any RWX bits. */ if (WARN_ON((mmio_value & mmio_mask) != mmio_value) || diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h index ef793c459b05..2cb816ea2430 100644 --- a/arch/x86/kvm/mmu/spte.h +++ b/arch/x86/kvm/mmu/spte.h @@ -214,7 +214,7 @@ extern u64 __read_mostly shadow_nonpresent_or_rsvd_mask; */ #define FROZEN_SPTE (SHADOW_NONPRESENT_VALUE | 0x5a0ULL) -/* Removed SPTEs must not be misconstrued as shadow present PTEs. */ +/* Frozen SPTEs must not be misconstrued as shadow present PTEs. */ static_assert(!(FROZEN_SPTE & SPTE_MMU_PRESENT_MASK)); static inline bool is_frozen_spte(u64 spte) diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index c7dc49ee7388..3c55955bcaf8 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -359,10 +359,10 @@ static void handle_removed_pt(struct kvm *kvm, tdp_ptep_t pt, bool shared) /* * Set the SPTE to a nonpresent value that other * threads will not overwrite. If the SPTE was - * already marked as removed then another thread + * already marked as frozen then another thread * handling a page fault could overwrite it, so * set the SPTE until it is set from some other - * value to the removed SPTE value. + * value to the frozen SPTE value. */ for (;;) { old_spte = kvm_tdp_mmu_write_spte_atomic(sptep, FROZEN_SPTE); @@ -536,8 +536,8 @@ static inline int __must_check __tdp_mmu_set_spte_atomic(struct tdp_iter *iter, u64 *sptep = rcu_dereference(iter->sptep); /* - * The caller is responsible for ensuring the old SPTE is not a REMOVED - * SPTE. KVM should never attempt to zap or manipulate a REMOVED SPTE, + * The caller is responsible for ensuring the old SPTE is not a FROZEN + * SPTE. KVM should never attempt to zap or manipulate a FROZEN SPTE, * and pre-checking before inserting a new SPTE is advantageous as it * avoids unnecessary work. */ diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index a16c873b3232..714c517dd4b7 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -2276,30 +2276,24 @@ static int sev_gmem_post_populate(struct kvm *kvm, gfn_t gfn_start, kvm_pfn_t pf for (gfn = gfn_start, i = 0; gfn < gfn_start + npages; gfn++, i++) { struct sev_data_snp_launch_update fw_args = {0}; - bool assigned; + bool assigned = false; int level; - if (!kvm_mem_is_private(kvm, gfn)) { - pr_debug("%s: Failed to ensure GFN 0x%llx has private memory attribute set\n", - __func__, gfn); - ret = -EINVAL; - goto err; - } - ret = snp_lookup_rmpentry((u64)pfn + i, &assigned, &level); if (ret || assigned) { pr_debug("%s: Failed to ensure GFN 0x%llx RMP entry is initial shared state, ret: %d assigned: %d\n", __func__, gfn, ret, assigned); - ret = -EINVAL; + ret = ret ? -EINVAL : -EEXIST; goto err; } if (src) { void *vaddr = kmap_local_pfn(pfn + i); - ret = copy_from_user(vaddr, src + i * PAGE_SIZE, PAGE_SIZE); - if (ret) + if (copy_from_user(vaddr, src + i * PAGE_SIZE, PAGE_SIZE)) { + ret = -EFAULT; goto err; + } kunmap_local(vaddr); } @@ -2549,6 +2543,14 @@ static int snp_launch_finish(struct kvm *kvm, struct kvm_sev_cmd *argp) data->gctx_paddr = __psp_pa(sev->snp_context); ret = sev_issue_cmd(kvm, SEV_CMD_SNP_LAUNCH_FINISH, data, &argp->error); + /* + * Now that there will be no more SNP_LAUNCH_UPDATE ioctls, private pages + * can be given to the guest simply by marking the RMP entry as private. + * This can happen on first access and also with KVM_PRE_FAULT_MEMORY. + */ + if (!ret) + kvm->arch.pre_fault_allowed = true; + kfree(id_auth); e_free_id_block: diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index c115d26844f7..5ab2c92c7331 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -2876,6 +2876,12 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_CSTAR: msr_info->data = svm->vmcb01.ptr->save.cstar; break; + case MSR_GS_BASE: + msr_info->data = svm->vmcb01.ptr->save.gs.base; + break; + case MSR_FS_BASE: + msr_info->data = svm->vmcb01.ptr->save.fs.base; + break; case MSR_KERNEL_GS_BASE: msr_info->data = svm->vmcb01.ptr->save.kernel_gs_base; break; @@ -3101,6 +3107,12 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) case MSR_CSTAR: svm->vmcb01.ptr->save.cstar = data; break; + case MSR_GS_BASE: + svm->vmcb01.ptr->save.gs.base = data; + break; + case MSR_FS_BASE: + svm->vmcb01.ptr->save.fs.base = data; + break; case MSR_KERNEL_GS_BASE: svm->vmcb01.ptr->save.kernel_gs_base = data; break; @@ -4949,6 +4961,7 @@ static int svm_vm_init(struct kvm *kvm) to_kvm_sev_info(kvm)->need_init = true; kvm->arch.has_private_mem = (type == KVM_X86_SNP_VM); + kvm->arch.pre_fault_allowed = !kvm->arch.has_private_mem; } if (!pause_filter_count || !pause_filter_thresh) @@ -5223,6 +5236,9 @@ static __init void svm_set_cpu_caps(void) /* CPUID 0x8000001F (SME/SEV features) */ sev_set_cpu_caps(); + + /* Don't advertise Bus Lock Detect to guest if SVM support is absent */ + kvm_cpu_cap_clear(X86_FEATURE_BUS_LOCK_DETECT); } static __init int svm_hardware_setup(void) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index f18c2d8c7476..733a0c45d1a6 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -7659,13 +7659,11 @@ u8 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio) /* * Force WB and ignore guest PAT if the VM does NOT have a non-coherent - * device attached and the CPU doesn't support self-snoop. Letting the - * guest control memory types on Intel CPUs without self-snoop may - * result in unexpected behavior, and so KVM's (historical) ABI is to - * trust the guest to behave only as a last resort. + * device attached. Letting the guest control memory types on Intel + * CPUs may result in unexpected behavior, and so KVM's ABI is to trust + * the guest to behave only as a last resort. */ - if (!static_cpu_has(X86_FEATURE_SELFSNOOP) && - !kvm_arch_has_noncoherent_dma(vcpu->kvm)) + if (!kvm_arch_has_noncoherent_dma(vcpu->kvm)) return (MTRR_TYPE_WRBACK << VMX_EPT_MT_EPTE_SHIFT) | VMX_EPT_IPAT_BIT; return (MTRR_TYPE_WRBACK << VMX_EPT_MT_EPTE_SHIFT); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index af6c8cf6a37a..c983c8e434b8 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -427,8 +427,7 @@ static void kvm_user_return_msr_cpu_online(void) int kvm_set_user_return_msr(unsigned slot, u64 value, u64 mask) { - unsigned int cpu = smp_processor_id(); - struct kvm_user_return_msrs *msrs = per_cpu_ptr(user_return_msrs, cpu); + struct kvm_user_return_msrs *msrs = this_cpu_ptr(user_return_msrs); int err; value = (value & mask) | (msrs->values[slot].host & ~mask); @@ -450,8 +449,7 @@ EXPORT_SYMBOL_GPL(kvm_set_user_return_msr); static void drop_user_return_notifiers(void) { - unsigned int cpu = smp_processor_id(); - struct kvm_user_return_msrs *msrs = per_cpu_ptr(user_return_msrs, cpu); + struct kvm_user_return_msrs *msrs = this_cpu_ptr(user_return_msrs); if (msrs->registered) kvm_on_user_return(&msrs->urn); @@ -4658,7 +4656,6 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_ASYNC_PF_INT: case KVM_CAP_GET_TSC_KHZ: case KVM_CAP_KVMCLOCK_CTRL: - case KVM_CAP_READONLY_MEM: case KVM_CAP_IOAPIC_POLARITY_IGNORED: case KVM_CAP_TSC_DEADLINE_TIMER: case KVM_CAP_DISABLE_QUIRKS: @@ -4817,6 +4814,9 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_VM_TYPES: r = kvm_caps.supported_vm_types; break; + case KVM_CAP_READONLY_MEM: + r = kvm ? kvm_arch_has_readonly_mem(kvm) : 1; + break; default: break; } @@ -6042,7 +6042,9 @@ long kvm_arch_vcpu_ioctl(struct file *filp, if (copy_from_user(&events, argp, sizeof(struct kvm_vcpu_events))) break; + kvm_vcpu_srcu_read_lock(vcpu); r = kvm_vcpu_ioctl_x86_set_vcpu_events(vcpu, &events); + kvm_vcpu_srcu_read_unlock(vcpu); break; } case KVM_GET_DEBUGREGS: { @@ -12646,6 +12648,9 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) kvm->arch.vm_type = type; kvm->arch.has_private_mem = (type == KVM_X86_SW_PROTECTED_VM); + /* Decided by the vendor code for other VM types. */ + kvm->arch.pre_fault_allowed = + type == KVM_X86_DEFAULT_VM || type == KVM_X86_SW_PROTECTED_VM; ret = kvm_page_track_init(kvm); if (ret) @@ -13641,19 +13646,14 @@ bool kvm_arch_no_poll(struct kvm_vcpu *vcpu) } EXPORT_SYMBOL_GPL(kvm_arch_no_poll); -#ifdef CONFIG_HAVE_KVM_GMEM_PREPARE -bool kvm_arch_gmem_prepare_needed(struct kvm *kvm) -{ - return kvm->arch.vm_type == KVM_X86_SNP_VM; -} - +#ifdef CONFIG_HAVE_KVM_ARCH_GMEM_PREPARE int kvm_arch_gmem_prepare(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn, int max_order) { return kvm_x86_call(gmem_prepare)(kvm, pfn, gfn, max_order); } #endif -#ifdef CONFIG_HAVE_KVM_GMEM_INVALIDATE +#ifdef CONFIG_HAVE_KVM_ARCH_GMEM_INVALIDATE void kvm_arch_gmem_invalidate(kvm_pfn_t start, kvm_pfn_t end) { kvm_x86_call(gmem_invalidate)(start, end); diff --git a/arch/x86/lib/cmdline.c b/arch/x86/lib/cmdline.c index 384da1fdd5c6..c65cd5550454 100644 --- a/arch/x86/lib/cmdline.c +++ b/arch/x86/lib/cmdline.c @@ -207,18 +207,29 @@ __cmdline_find_option(const char *cmdline, int max_cmdline_size, int cmdline_find_option_bool(const char *cmdline, const char *option) { - if (IS_ENABLED(CONFIG_CMDLINE_BOOL)) - WARN_ON_ONCE(!builtin_cmdline_added); + int ret; - return __cmdline_find_option_bool(cmdline, COMMAND_LINE_SIZE, option); + ret = __cmdline_find_option_bool(cmdline, COMMAND_LINE_SIZE, option); + if (ret > 0) + return ret; + + if (IS_ENABLED(CONFIG_CMDLINE_BOOL) && !builtin_cmdline_added) + return __cmdline_find_option_bool(builtin_cmdline, COMMAND_LINE_SIZE, option); + + return ret; } int cmdline_find_option(const char *cmdline, const char *option, char *buffer, int bufsize) { - if (IS_ENABLED(CONFIG_CMDLINE_BOOL)) - WARN_ON_ONCE(!builtin_cmdline_added); + int ret; + + ret = __cmdline_find_option(cmdline, COMMAND_LINE_SIZE, option, buffer, bufsize); + if (ret > 0) + return ret; + + if (IS_ENABLED(CONFIG_CMDLINE_BOOL) && !builtin_cmdline_added) + return __cmdline_find_option(builtin_cmdline, COMMAND_LINE_SIZE, option, buffer, bufsize); - return __cmdline_find_option(cmdline, COMMAND_LINE_SIZE, option, - buffer, bufsize); + return ret; } diff --git a/arch/x86/lib/getuser.S b/arch/x86/lib/getuser.S index a314622aa093..d066aecf8aeb 100644 --- a/arch/x86/lib/getuser.S +++ b/arch/x86/lib/getuser.S @@ -88,12 +88,14 @@ SYM_FUNC_END(__get_user_4) EXPORT_SYMBOL(__get_user_4) SYM_FUNC_START(__get_user_8) +#ifndef CONFIG_X86_64 + xor %ecx,%ecx +#endif check_range size=8 ASM_STAC #ifdef CONFIG_X86_64 UACCESS movq (%_ASM_AX),%rdx #else - xor %ecx,%ecx UACCESS movl (%_ASM_AX),%edx UACCESS movl 4(%_ASM_AX),%ecx #endif diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index d8dbeac8b206..ff253648706f 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -958,8 +958,12 @@ static void update_end_of_memory_vars(u64 start, u64 size) int add_pages(int nid, unsigned long start_pfn, unsigned long nr_pages, struct mhp_params *params) { + unsigned long end = ((start_pfn + nr_pages) << PAGE_SHIFT) - 1; int ret; + if (WARN_ON_ONCE(end > PHYSMEM_END)) + return -ERANGE; + ret = __add_pages(nid, start_pfn, nr_pages, params); WARN_ON_ONCE(ret); diff --git a/arch/x86/mm/kaslr.c b/arch/x86/mm/kaslr.c index 37db264866b6..230f1dee4f09 100644 --- a/arch/x86/mm/kaslr.c +++ b/arch/x86/mm/kaslr.c @@ -47,13 +47,24 @@ static const unsigned long vaddr_end = CPU_ENTRY_AREA_BASE; */ static __initdata struct kaslr_memory_region { unsigned long *base; + unsigned long *end; unsigned long size_tb; } kaslr_regions[] = { - { &page_offset_base, 0 }, - { &vmalloc_base, 0 }, - { &vmemmap_base, 0 }, + { + .base = &page_offset_base, + .end = &physmem_end, + }, + { + .base = &vmalloc_base, + }, + { + .base = &vmemmap_base, + }, }; +/* The end of the possible address space for physical memory */ +unsigned long physmem_end __ro_after_init; + /* Get size in bytes used by the memory region */ static inline unsigned long get_padding(struct kaslr_memory_region *region) { @@ -82,6 +93,8 @@ void __init kernel_randomize_memory(void) BUILD_BUG_ON(vaddr_end != CPU_ENTRY_AREA_BASE); BUILD_BUG_ON(vaddr_end > __START_KERNEL_map); + /* Preset the end of the possible address space for physical memory */ + physmem_end = ((1ULL << MAX_PHYSMEM_BITS) - 1); if (!kaslr_memory_enabled()) return; @@ -128,11 +141,18 @@ void __init kernel_randomize_memory(void) vaddr += entropy; *kaslr_regions[i].base = vaddr; + /* Calculate the end of the region */ + vaddr += get_padding(&kaslr_regions[i]); /* - * Jump the region and add a minimum padding based on - * randomization alignment. + * KASLR trims the maximum possible size of the + * direct-map. Update the physmem_end boundary. + * No rounding required as the region starts + * PUD aligned and size is in units of TB. */ - vaddr += get_padding(&kaslr_regions[i]); + if (kaslr_regions[i].end) + *kaslr_regions[i].end = __pa_nodebug(vaddr - 1); + + /* Add a minimum padding based on randomization alignment. */ vaddr = round_up(vaddr + 1, PUD_SIZE); remain_entropy -= entropy; } diff --git a/arch/x86/mm/pti.c b/arch/x86/mm/pti.c index 2e69abf4f852..851ec8f1363a 100644 --- a/arch/x86/mm/pti.c +++ b/arch/x86/mm/pti.c @@ -241,7 +241,7 @@ static pmd_t *pti_user_pagetable_walk_pmd(unsigned long address) * * Returns a pointer to a PTE on success, or NULL on failure. */ -static pte_t *pti_user_pagetable_walk_pte(unsigned long address) +static pte_t *pti_user_pagetable_walk_pte(unsigned long address, bool late_text) { gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO); pmd_t *pmd; @@ -251,10 +251,15 @@ static pte_t *pti_user_pagetable_walk_pte(unsigned long address) if (!pmd) return NULL; - /* We can't do anything sensible if we hit a large mapping. */ + /* Large PMD mapping found */ if (pmd_leaf(*pmd)) { - WARN_ON(1); - return NULL; + /* Clear the PMD if we hit a large mapping from the first round */ + if (late_text) { + set_pmd(pmd, __pmd(0)); + } else { + WARN_ON_ONCE(1); + return NULL; + } } if (pmd_none(*pmd)) { @@ -283,7 +288,7 @@ static void __init pti_setup_vsyscall(void) if (!pte || WARN_ON(level != PG_LEVEL_4K) || pte_none(*pte)) return; - target_pte = pti_user_pagetable_walk_pte(VSYSCALL_ADDR); + target_pte = pti_user_pagetable_walk_pte(VSYSCALL_ADDR, false); if (WARN_ON(!target_pte)) return; @@ -301,7 +306,7 @@ enum pti_clone_level { static void pti_clone_pgtable(unsigned long start, unsigned long end, - enum pti_clone_level level) + enum pti_clone_level level, bool late_text) { unsigned long addr; @@ -374,14 +379,14 @@ pti_clone_pgtable(unsigned long start, unsigned long end, */ *target_pmd = *pmd; - addr += PMD_SIZE; + addr = round_up(addr + 1, PMD_SIZE); } else if (level == PTI_CLONE_PTE) { /* Walk the page-table down to the pte level */ pte = pte_offset_kernel(pmd, addr); if (pte_none(*pte)) { - addr += PAGE_SIZE; + addr = round_up(addr + 1, PAGE_SIZE); continue; } @@ -390,7 +395,7 @@ pti_clone_pgtable(unsigned long start, unsigned long end, return; /* Allocate PTE in the user page-table */ - target_pte = pti_user_pagetable_walk_pte(addr); + target_pte = pti_user_pagetable_walk_pte(addr, late_text); if (WARN_ON(!target_pte)) return; @@ -401,7 +406,7 @@ pti_clone_pgtable(unsigned long start, unsigned long end, /* Clone the PTE */ *target_pte = *pte; - addr += PAGE_SIZE; + addr = round_up(addr + 1, PAGE_SIZE); } else { BUG(); @@ -452,7 +457,7 @@ static void __init pti_clone_user_shared(void) phys_addr_t pa = per_cpu_ptr_to_phys((void *)va); pte_t *target_pte; - target_pte = pti_user_pagetable_walk_pte(va); + target_pte = pti_user_pagetable_walk_pte(va, false); if (WARN_ON(!target_pte)) return; @@ -475,7 +480,7 @@ static void __init pti_clone_user_shared(void) start = CPU_ENTRY_AREA_BASE; end = start + (PAGE_SIZE * CPU_ENTRY_AREA_PAGES); - pti_clone_pgtable(start, end, PTI_CLONE_PMD); + pti_clone_pgtable(start, end, PTI_CLONE_PMD, false); } #endif /* CONFIG_X86_64 */ @@ -492,11 +497,11 @@ static void __init pti_setup_espfix64(void) /* * Clone the populated PMDs of the entry text and force it RO. */ -static void pti_clone_entry_text(void) +static void pti_clone_entry_text(bool late) { pti_clone_pgtable((unsigned long) __entry_text_start, (unsigned long) __entry_text_end, - PTI_CLONE_PMD); + PTI_LEVEL_KERNEL_IMAGE, late); } /* @@ -571,7 +576,7 @@ static void pti_clone_kernel_text(void) * pti_set_kernel_image_nonglobal() did to clear the * global bit. */ - pti_clone_pgtable(start, end_clone, PTI_LEVEL_KERNEL_IMAGE); + pti_clone_pgtable(start, end_clone, PTI_LEVEL_KERNEL_IMAGE, false); /* * pti_clone_pgtable() will set the global bit in any PMDs @@ -638,8 +643,15 @@ void __init pti_init(void) /* Undo all global bits from the init pagetables in head_64.S: */ pti_set_kernel_image_nonglobal(); + /* Replace some of the global bits just for shared entry text: */ - pti_clone_entry_text(); + /* + * This is very early in boot. Device and Late initcalls can do + * modprobe before free_initmem() and mark_readonly(). This + * pti_clone_entry_text() allows those user-mode-helpers to function, + * but notably the text is still RW. + */ + pti_clone_entry_text(false); pti_setup_espfix64(); pti_setup_vsyscall(); } @@ -656,10 +668,11 @@ void pti_finalize(void) if (!boot_cpu_has(X86_FEATURE_PTI)) return; /* - * We need to clone everything (again) that maps parts of the - * kernel image. + * This is after free_initmem() (all initcalls are done) and we've done + * mark_readonly(). Text is now NX which might've split some PMDs + * relative to the early clone. */ - pti_clone_entry_text(); + pti_clone_entry_text(true); pti_clone_kernel_text(); debug_checkwx_user(); |
