Diffstat (limited to 'arch/x86')
-rw-r--r--  arch/x86/Kconfig | 128
-rw-r--r--  arch/x86/coco/sev/core.c | 2
-rw-r--r--  arch/x86/coco/tdx/tdx.c | 1
-rw-r--r--  arch/x86/crypto/Kconfig | 8
-rw-r--r--  arch/x86/crypto/aesni-intel_glue.c | 61
-rw-r--r--  arch/x86/crypto/sha256-avx2-asm.S | 16
-rw-r--r--  arch/x86/entry/syscalls/syscall_64.tbl | 2
-rw-r--r--  arch/x86/events/core.c | 22
-rw-r--r--  arch/x86/events/intel/core.c | 23
-rw-r--r--  arch/x86/events/intel/cstate.c | 5
-rw-r--r--  arch/x86/hyperv/hv_init.c | 5
-rw-r--r--  arch/x86/include/asm/cmdline.h | 4
-rw-r--r--  arch/x86/include/asm/cpu_device_id.h | 20
-rw-r--r--  arch/x86/include/asm/fpu/types.h | 7
-rw-r--r--  arch/x86/include/asm/intel-family.h | 87
-rw-r--r--  arch/x86/include/asm/kvm_host.h | 3
-rw-r--r--  arch/x86/include/asm/mce.h | 2
-rw-r--r--  arch/x86/include/asm/mshyperv.h | 1
-rw-r--r--  arch/x86/include/asm/page_64.h | 1
-rw-r--r--  arch/x86/include/asm/pgtable_64_types.h | 4
-rw-r--r--  arch/x86/include/asm/processor.h | 3
-rw-r--r--  arch/x86/include/asm/qspinlock.h | 12
-rw-r--r--  arch/x86/include/asm/resctrl.h | 6
-rw-r--r--  arch/x86/include/asm/sev.h | 2
-rw-r--r--  arch/x86/include/asm/topology.h | 13
-rw-r--r--  arch/x86/kernel/acpi/cppc.c | 172
-rw-r--r--  arch/x86/kernel/acpi/madt_wakeup.c | 2
-rw-r--r--  arch/x86/kernel/apic/apic.c | 11
-rw-r--r--  arch/x86/kernel/cpu/amd.c | 18
-rw-r--r--  arch/x86/kernel/cpu/aperfmperf.c | 95
-rw-r--r--  arch/x86/kernel/cpu/bugs.c | 60
-rw-r--r--  arch/x86/kernel/cpu/common.c | 4
-rw-r--r--  arch/x86/kernel/cpu/intel.c | 18
-rw-r--r--  arch/x86/kernel/cpu/mce/amd.c | 2
-rw-r--r--  arch/x86/kernel/cpu/mce/apei.c | 18
-rw-r--r--  arch/x86/kernel/cpu/mce/core.c | 38
-rw-r--r--  arch/x86/kernel/cpu/mce/internal.h | 2
-rw-r--r--  arch/x86/kernel/cpu/microcode/amd.c | 192
-rw-r--r--  arch/x86/kernel/cpu/mshyperv.c | 21
-rw-r--r--  arch/x86/kernel/cpu/mtrr/mtrr.c | 2
-rw-r--r--  arch/x86/kernel/cpu/resctrl/core.c | 8
-rw-r--r--  arch/x86/kernel/cpu/sgx/main.c | 34
-rw-r--r--  arch/x86/kernel/fpu/xstate.c | 3
-rw-r--r--  arch/x86/kernel/fpu/xstate.h | 4
-rw-r--r--  arch/x86/kernel/paravirt.c | 7
-rw-r--r--  arch/x86/kernel/setup.c | 2
-rw-r--r--  arch/x86/kvm/Kconfig | 11
-rw-r--r--  arch/x86/kvm/hyperv.h | 1
-rw-r--r--  arch/x86/kvm/lapic.c | 24
-rw-r--r--  arch/x86/kvm/mmu/mmu.c | 19
-rw-r--r--  arch/x86/kvm/mmu/spte.c | 6
-rw-r--r--  arch/x86/kvm/mmu/spte.h | 2
-rw-r--r--  arch/x86/kvm/mmu/tdp_mmu.c | 8
-rw-r--r--  arch/x86/kvm/svm/sev.c | 24
-rw-r--r--  arch/x86/kvm/svm/svm.c | 16
-rw-r--r--  arch/x86/kvm/vmx/vmx.c | 10
-rw-r--r--  arch/x86/kvm/x86.c | 24
-rw-r--r--  arch/x86/lib/cmdline.c | 25
-rw-r--r--  arch/x86/lib/getuser.S | 4
-rw-r--r--  arch/x86/mm/init_64.c | 4
-rw-r--r--  arch/x86/mm/kaslr.c | 32
-rw-r--r--  arch/x86/mm/pti.c | 51
62 files changed, 964 insertions, 448 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 007bab9f2a0e..6f1c31f4b9ab 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -1889,6 +1889,10 @@ config X86_INTEL_MEMORY_PROTECTION_KEYS
If unsure, say y.
+config ARCH_PKEY_BITS
+ int
+ default 4
+
choice
prompt "TSX enable mode"
depends on CPU_SUP_INTEL
@@ -2610,24 +2614,15 @@ config MITIGATION_SLS
against straight line speculation. The kernel image might be slightly
larger.
-config MITIGATION_GDS_FORCE
- bool "Force GDS Mitigation"
+config MITIGATION_GDS
+ bool "Mitigate Gather Data Sampling"
depends on CPU_SUP_INTEL
- default n
+ default y
help
- Gather Data Sampling (GDS) is a hardware vulnerability which allows
- unprivileged speculative access to data which was previously stored in
- vector registers.
-
- This option is equivalent to setting gather_data_sampling=force on the
- command line. The microcode mitigation is used if present, otherwise
- AVX is disabled as a mitigation. On affected systems that are missing
- the microcode any userspace code that unconditionally uses AVX will
- break with this option set.
-
- Setting this option on systems not vulnerable to GDS has no effect.
-
- If in doubt, say N.
+ Enable mitigation for Gather Data Sampling (GDS). GDS is a hardware
+ vulnerability which allows unprivileged speculative access to data
+ which was previously stored in vector registers. The attacker uses gather
+ instructions to infer the stale vector register data.
config MITIGATION_RFDS
bool "RFDS Mitigation"
@@ -2650,6 +2645,107 @@ config MITIGATION_SPECTRE_BHI
indirect branches.
See <file:Documentation/admin-guide/hw-vuln/spectre.rst>
+config MITIGATION_MDS
+ bool "Mitigate Microarchitectural Data Sampling (MDS) hardware bug"
+ depends on CPU_SUP_INTEL
+ default y
+ help
+ Enable mitigation for Microarchitectural Data Sampling (MDS). MDS is
+ a hardware vulnerability which allows unprivileged speculative access
+ to data which is available in various CPU internal buffers.
+ See also <file:Documentation/admin-guide/hw-vuln/mds.rst>
+
+config MITIGATION_TAA
+ bool "Mitigate TSX Asynchronous Abort (TAA) hardware bug"
+ depends on CPU_SUP_INTEL
+ default y
+ help
+ Enable mitigation for TSX Asynchronous Abort (TAA). TAA is a hardware
+ vulnerability that allows unprivileged speculative access to data
+ which is available in various CPU internal buffers by using
+ asynchronous aborts within an Intel TSX transactional region.
+ See also <file:Documentation/admin-guide/hw-vuln/tsx_async_abort.rst>
+
+config MITIGATION_MMIO_STALE_DATA
+ bool "Mitigate MMIO Stale Data hardware bug"
+ depends on CPU_SUP_INTEL
+ default y
+ help
+ Enable mitigation for MMIO Stale Data hardware bugs. Processor MMIO
+ Stale Data Vulnerabilities are a class of memory-mapped I/O (MMIO)
+ vulnerabilities that can expose data. The vulnerabilities require the
+ attacker to have access to MMIO.
+ See also
+ <file:Documentation/admin-guide/hw-vuln/processor_mmio_stale_data.rst>
+
+config MITIGATION_L1TF
+ bool "Mitigate L1 Terminal Fault (L1TF) hardware bug"
+ depends on CPU_SUP_INTEL
+ default y
+ help
+ Mitigate L1 Terminal Fault (L1TF) hardware bug. L1 Terminal Fault is a
+ hardware vulnerability which allows unprivileged speculative access to data
+ available in the Level 1 Data Cache.
+	  See <file:Documentation/admin-guide/hw-vuln/l1tf.rst>
+
+config MITIGATION_RETBLEED
+ bool "Mitigate RETBleed hardware bug"
+ depends on (CPU_SUP_INTEL && MITIGATION_SPECTRE_V2) || MITIGATION_UNRET_ENTRY || MITIGATION_IBPB_ENTRY
+ default y
+ help
+ Enable mitigation for RETBleed (Arbitrary Speculative Code Execution
+ with Return Instructions) vulnerability. RETBleed is a speculative
+ execution attack which takes advantage of microarchitectural behavior
+ in many modern microprocessors, similar to Spectre v2. An
+ unprivileged attacker can use these flaws to bypass conventional
+ memory security restrictions to gain read access to privileged memory
+ that would otherwise be inaccessible.
+
+config MITIGATION_SPECTRE_V1
+ bool "Mitigate SPECTRE V1 hardware bug"
+ default y
+ help
+ Enable mitigation for Spectre V1 (Bounds Check Bypass). Spectre V1 is a
+ class of side channel attacks that takes advantage of speculative
+ execution that bypasses conditional branch instructions used for
+ memory access bounds check.
+ See also <file:Documentation/admin-guide/hw-vuln/spectre.rst>
+
+config MITIGATION_SPECTRE_V2
+ bool "Mitigate SPECTRE V2 hardware bug"
+ default y
+ help
+ Enable mitigation for Spectre V2 (Branch Target Injection). Spectre
+ V2 is a class of side channel attacks that takes advantage of
+ indirect branch predictors inside the processor. In Spectre variant 2
+ attacks, the attacker can steer speculative indirect branches in the
+ victim to gadget code by poisoning the branch target buffer of a CPU
+ used for predicting indirect branch addresses.
+ See also <file:Documentation/admin-guide/hw-vuln/spectre.rst>
+
+config MITIGATION_SRBDS
+ bool "Mitigate Special Register Buffer Data Sampling (SRBDS) hardware bug"
+ depends on CPU_SUP_INTEL
+ default y
+ help
+ Enable mitigation for Special Register Buffer Data Sampling (SRBDS).
+ SRBDS is a hardware vulnerability that allows Microarchitectural Data
+ Sampling (MDS) techniques to infer values returned from special
+ register accesses. An unprivileged user can extract values returned
+ from RDRAND and RDSEED executed on another core or sibling thread
+ using MDS techniques.
+ See also
+ <file:Documentation/admin-guide/hw-vuln/special-register-buffer-data-sampling.rst>
+
+config MITIGATION_SSB
+ bool "Mitigate Speculative Store Bypass (SSB) hardware bug"
+ default y
+ help
+ Enable mitigation for Speculative Store Bypass (SSB). SSB is a
+ hardware security vulnerability and its exploitation takes advantage
+ of speculative execution in a similar way to the Meltdown and Spectre
+ security vulnerabilities.
+
endif
config ARCH_HAS_ADD_PAGES
diff --git a/arch/x86/coco/sev/core.c b/arch/x86/coco/sev/core.c
index 082d61d85dfc..de1df0cb45da 100644
--- a/arch/x86/coco/sev/core.c
+++ b/arch/x86/coco/sev/core.c
@@ -163,7 +163,7 @@ struct sev_config {
*/
use_cas : 1,
- __reserved : 62;
+ __reserved : 61;
};
static struct sev_config sev_cfg __read_mostly;
diff --git a/arch/x86/coco/tdx/tdx.c b/arch/x86/coco/tdx/tdx.c
index 078e2bac2553..da8b66dce0da 100644
--- a/arch/x86/coco/tdx/tdx.c
+++ b/arch/x86/coco/tdx/tdx.c
@@ -389,7 +389,6 @@ static bool mmio_read(int size, unsigned long addr, unsigned long *val)
.r12 = size,
.r13 = EPT_READ,
.r14 = addr,
- .r15 = *val,
};
if (__tdx_hypercall(&args))
diff --git a/arch/x86/crypto/Kconfig b/arch/x86/crypto/Kconfig
index 24875e6295f2..7b1bebed879d 100644
--- a/arch/x86/crypto/Kconfig
+++ b/arch/x86/crypto/Kconfig
@@ -14,7 +14,7 @@ config CRYPTO_CURVE25519_X86
- ADX (large integer arithmetic)
config CRYPTO_AES_NI_INTEL
- tristate "Ciphers: AES, modes: ECB, CBC, CTS, CTR, XTR, XTS, GCM (AES-NI)"
+ tristate "Ciphers: AES, modes: ECB, CBC, CTS, CTR, XCTR, XTS, GCM (AES-NI/VAES)"
depends on X86
select CRYPTO_AEAD
select CRYPTO_LIB_AES
@@ -25,10 +25,14 @@ config CRYPTO_AES_NI_INTEL
help
Block cipher: AES cipher algorithms
AEAD cipher: AES with GCM
- Length-preserving ciphers: AES with ECB, CBC, CTS, CTR, XTR, XTS
+ Length-preserving ciphers: AES with ECB, CBC, CTS, CTR, XCTR, XTS
Architecture: x86 (32-bit and 64-bit) using:
- AES-NI (AES new instructions)
+ - VAES (Vector AES)
+
+ Some algorithm implementations are supported only in 64-bit builds,
+ and some have additional prerequisites such as AVX2 or AVX512.
config CRYPTO_BLOWFISH_X86_64
tristate "Ciphers: Blowfish, modes: ECB, CBC"
diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c
index cd37de5ec404..b0dd83555499 100644
--- a/arch/x86/crypto/aesni-intel_glue.c
+++ b/arch/x86/crypto/aesni-intel_glue.c
@@ -1366,6 +1366,8 @@ gcm_crypt(struct aead_request *req, int flags)
err = skcipher_walk_aead_encrypt(&walk, req, false);
else
err = skcipher_walk_aead_decrypt(&walk, req, false);
+ if (err)
+ return err;
/*
* Since the AES-GCM assembly code requires that at least three assembly
@@ -1381,37 +1383,31 @@ gcm_crypt(struct aead_request *req, int flags)
gcm_process_assoc(key, ghash_acc, req->src, assoclen, flags);
/* En/decrypt the data and pass the ciphertext through GHASH. */
- while ((nbytes = walk.nbytes) != 0) {
- if (unlikely(nbytes < walk.total)) {
- /*
- * Non-last segment. In this case, the assembly
- * function requires that the length be a multiple of 16
- * (AES_BLOCK_SIZE) bytes. The needed buffering of up
- * to 16 bytes is handled by the skcipher_walk. Here we
- * just need to round down to a multiple of 16.
- */
- nbytes = round_down(nbytes, AES_BLOCK_SIZE);
- aes_gcm_update(key, le_ctr, ghash_acc,
- walk.src.virt.addr, walk.dst.virt.addr,
- nbytes, flags);
- le_ctr[0] += nbytes / AES_BLOCK_SIZE;
- kernel_fpu_end();
- err = skcipher_walk_done(&walk, walk.nbytes - nbytes);
- kernel_fpu_begin();
- } else {
- /* Last segment: process all remaining data. */
- aes_gcm_update(key, le_ctr, ghash_acc,
- walk.src.virt.addr, walk.dst.virt.addr,
- nbytes, flags);
- err = skcipher_walk_done(&walk, 0);
- /*
- * The low word of the counter isn't used by the
- * finalize, so there's no need to increment it here.
- */
- }
+ while (unlikely((nbytes = walk.nbytes) < walk.total)) {
+ /*
+ * Non-last segment. In this case, the assembly function
+ * requires that the length be a multiple of 16 (AES_BLOCK_SIZE)
+ * bytes. The needed buffering of up to 16 bytes is handled by
+ * the skcipher_walk. Here we just need to round down to a
+ * multiple of 16.
+ */
+ nbytes = round_down(nbytes, AES_BLOCK_SIZE);
+ aes_gcm_update(key, le_ctr, ghash_acc, walk.src.virt.addr,
+ walk.dst.virt.addr, nbytes, flags);
+ le_ctr[0] += nbytes / AES_BLOCK_SIZE;
+ kernel_fpu_end();
+ err = skcipher_walk_done(&walk, walk.nbytes - nbytes);
+ if (err)
+ return err;
+ kernel_fpu_begin();
}
- if (err)
- goto out;
+ /* Last segment: process all remaining data. */
+ aes_gcm_update(key, le_ctr, ghash_acc, walk.src.virt.addr,
+ walk.dst.virt.addr, nbytes, flags);
+ /*
+ * The low word of the counter isn't used by the finalize, so there's no
+ * need to increment it here.
+ */
/* Finalize */
taglen = crypto_aead_authsize(tfm);
@@ -1439,8 +1435,9 @@ gcm_crypt(struct aead_request *req, int flags)
datalen, tag, taglen, flags))
err = -EBADMSG;
}
-out:
kernel_fpu_end();
+ if (nbytes)
+ skcipher_walk_done(&walk, 0);
return err;
}
@@ -1753,6 +1750,6 @@ static void __exit aesni_exit(void)
late_initcall(aesni_init);
module_exit(aesni_exit);
-MODULE_DESCRIPTION("Rijndael (AES) Cipher Algorithm, Intel AES-NI instructions optimized");
+MODULE_DESCRIPTION("AES cipher and modes, optimized with AES-NI or VAES instructions");
MODULE_LICENSE("GPL");
MODULE_ALIAS_CRYPTO("aes");
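The restructured walk loop above relies on the rule spelled out in its comment: non-last segments are trimmed to whole AES blocks and the remainder is buffered by the skcipher walk. A hedged sketch of just that arithmetic (standalone, with plain integers standing in for the walk state; gcm_segment_len() is a hypothetical helper, not a function from the patch):

#include <stddef.h>

#define AES_BLOCK_SIZE 16

/* Power-of-two round down, equivalent to the kernel's round_down(). */
#define round_down_pow2(x, y) ((x) & ~((size_t)(y) - 1))

static size_t gcm_segment_len(size_t nbytes, size_t total)
{
	if (nbytes < total)	/* non-last segment: whole blocks only */
		return round_down_pow2(nbytes, AES_BLOCK_SIZE);
	return nbytes;		/* last segment: process everything */
}

/* e.g. gcm_segment_len(70, 4096) == 64, and the LE counter then
 * advances by 64 / AES_BLOCK_SIZE == 4 blocks. */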
diff --git a/arch/x86/crypto/sha256-avx2-asm.S b/arch/x86/crypto/sha256-avx2-asm.S
index 0ffb072be956..0bbec1c75cd0 100644
--- a/arch/x86/crypto/sha256-avx2-asm.S
+++ b/arch/x86/crypto/sha256-avx2-asm.S
@@ -592,22 +592,22 @@ SYM_TYPED_FUNC_START(sha256_transform_rorx)
leaq K256+0*32(%rip), INP ## reuse INP as scratch reg
vpaddd (INP, SRND), X0, XFER
vmovdqa XFER, 0*32+_XFER(%rsp, SRND)
- FOUR_ROUNDS_AND_SCHED _XFER + 0*32
+ FOUR_ROUNDS_AND_SCHED (_XFER + 0*32)
leaq K256+1*32(%rip), INP
vpaddd (INP, SRND), X0, XFER
vmovdqa XFER, 1*32+_XFER(%rsp, SRND)
- FOUR_ROUNDS_AND_SCHED _XFER + 1*32
+ FOUR_ROUNDS_AND_SCHED (_XFER + 1*32)
leaq K256+2*32(%rip), INP
vpaddd (INP, SRND), X0, XFER
vmovdqa XFER, 2*32+_XFER(%rsp, SRND)
- FOUR_ROUNDS_AND_SCHED _XFER + 2*32
+ FOUR_ROUNDS_AND_SCHED (_XFER + 2*32)
leaq K256+3*32(%rip), INP
vpaddd (INP, SRND), X0, XFER
vmovdqa XFER, 3*32+_XFER(%rsp, SRND)
- FOUR_ROUNDS_AND_SCHED _XFER + 3*32
+ FOUR_ROUNDS_AND_SCHED (_XFER + 3*32)
add $4*32, SRND
cmp $3*4*32, SRND
@@ -618,12 +618,12 @@ SYM_TYPED_FUNC_START(sha256_transform_rorx)
leaq K256+0*32(%rip), INP
vpaddd (INP, SRND), X0, XFER
vmovdqa XFER, 0*32+_XFER(%rsp, SRND)
- DO_4ROUNDS _XFER + 0*32
+ DO_4ROUNDS (_XFER + 0*32)
leaq K256+1*32(%rip), INP
vpaddd (INP, SRND), X1, XFER
vmovdqa XFER, 1*32+_XFER(%rsp, SRND)
- DO_4ROUNDS _XFER + 1*32
+ DO_4ROUNDS (_XFER + 1*32)
add $2*32, SRND
vmovdqa X2, X0
@@ -651,8 +651,8 @@ SYM_TYPED_FUNC_START(sha256_transform_rorx)
xor SRND, SRND
.align 16
.Lloop3:
- DO_4ROUNDS _XFER + 0*32 + 16
- DO_4ROUNDS _XFER + 1*32 + 16
+ DO_4ROUNDS (_XFER + 0*32 + 16)
+ DO_4ROUNDS (_XFER + 1*32 + 16)
add $2*32, SRND
cmp $4*4*32, SRND
jb .Lloop3
diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
index 83073fa3c989..7093ee21c0d1 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -344,6 +344,7 @@
332 common statx sys_statx
333 common io_pgetevents sys_io_pgetevents
334 common rseq sys_rseq
+335 common uretprobe sys_uretprobe
# don't use numbers 387 through 423, add new calls after the last
# 'common' entry
424 common pidfd_send_signal sys_pidfd_send_signal
@@ -385,7 +386,6 @@
460 common lsm_set_self_attr sys_lsm_set_self_attr
461 common lsm_list_modules sys_lsm_list_modules
462 common mseal sys_mseal
-467 common uretprobe sys_uretprobe
#
# Due to a historical design error, certain syscalls are numbered differently
diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index 12f2a0c14d33..be01823b1bb4 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -1520,20 +1520,23 @@ static void x86_pmu_start(struct perf_event *event, int flags)
void perf_event_print_debug(void)
{
u64 ctrl, status, overflow, pmc_ctrl, pmc_count, prev_left, fixed;
+ unsigned long *cntr_mask, *fixed_cntr_mask;
+ struct event_constraint *pebs_constraints;
+ struct cpu_hw_events *cpuc;
u64 pebs, debugctl;
- int cpu = smp_processor_id();
- struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
- unsigned long *cntr_mask = hybrid(cpuc->pmu, cntr_mask);
- unsigned long *fixed_cntr_mask = hybrid(cpuc->pmu, fixed_cntr_mask);
- struct event_constraint *pebs_constraints = hybrid(cpuc->pmu, pebs_constraints);
- unsigned long flags;
- int idx;
+ int cpu, idx;
+
+ guard(irqsave)();
+
+ cpu = smp_processor_id();
+ cpuc = &per_cpu(cpu_hw_events, cpu);
+ cntr_mask = hybrid(cpuc->pmu, cntr_mask);
+ fixed_cntr_mask = hybrid(cpuc->pmu, fixed_cntr_mask);
+ pebs_constraints = hybrid(cpuc->pmu, pebs_constraints);
if (!*(u64 *)cntr_mask)
return;
- local_irq_save(flags);
-
if (x86_pmu.version >= 2) {
rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl);
rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
@@ -1577,7 +1580,6 @@ void perf_event_print_debug(void)
pr_info("CPU#%d: fixed-PMC%d count: %016llx\n",
cpu, idx, pmc_count);
}
- local_irq_restore(flags);
}
void x86_pmu_stop(struct perf_event *event, int flags)
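The conversion above replaces an explicit local_irq_save()/local_irq_restore() pair with a scope-based guard, which is what makes the new early return safe. A hedged, self-contained sketch of the same pattern (the function bodies and names are stand-ins; only guard(irqsave)() is the cleanup.h-style guard the patch itself uses):

#include <linux/irqflags.h>
#include <linux/cleanup.h>

/* Before: every exit path must restore the saved interrupt state. */
static void debug_dump_manual(void)
{
	unsigned long flags;

	local_irq_save(flags);
	/* ... read PMU state ... */
	local_irq_restore(flags);
}

/* After: the guard restores interrupt state automatically when the
 * enclosing scope ends, so an early return needs no cleanup code. */
static void debug_dump_guarded(bool nothing_to_do)
{
	guard(irqsave)();

	if (nothing_to_do)
		return;		/* interrupts restored here as well */
	/* ... read PMU state ... */
}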
diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index 0c9c2706d4ec..9e519d8a810a 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -4589,6 +4589,25 @@ static enum hybrid_cpu_type adl_get_hybrid_cpu_type(void)
return HYBRID_INTEL_CORE;
}
+static inline bool erratum_hsw11(struct perf_event *event)
+{
+ return (event->hw.config & INTEL_ARCH_EVENT_MASK) ==
+ X86_CONFIG(.event=0xc0, .umask=0x01);
+}
+
+/*
+ * Erratum HSW11 requires a period larger than 100, the same as erratum BDM11.
+ * A minimum period of 128 is likewise enforced for INST_RETIRED.ALL.
+ *
+ * The message 'interrupt took too long' can be observed on any counter which
+ * was armed with a period < 32 and two events expired in the same NMI.
+ * A minimum period of 32 is enforced for the rest of the events.
+ */
+static void hsw_limit_period(struct perf_event *event, s64 *left)
+{
+ *left = max(*left, erratum_hsw11(event) ? 128 : 32);
+}
+
/*
* Broadwell:
*
@@ -4606,8 +4625,7 @@ static enum hybrid_cpu_type adl_get_hybrid_cpu_type(void)
*/
static void bdw_limit_period(struct perf_event *event, s64 *left)
{
- if ((event->hw.config & INTEL_ARCH_EVENT_MASK) ==
- X86_CONFIG(.event=0xc0, .umask=0x01)) {
+ if (erratum_hsw11(event)) {
if (*left < 128)
*left = 128;
*left &= ~0x3fULL;
@@ -6766,6 +6784,7 @@ __init int intel_pmu_init(void)
x86_pmu.hw_config = hsw_hw_config;
x86_pmu.get_event_constraints = hsw_get_event_constraints;
+ x86_pmu.limit_period = hsw_limit_period;
x86_pmu.lbr_double_abort = true;
extra_attr = boot_cpu_has(X86_FEATURE_RTM) ?
hsw_format_attr : nhm_format_attr;
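The new hsw_limit_period() hook is a simple clamp keyed on one event encoding. A hedged sketch of the same rule with the perf internals stripped away (erratum_min_period() and the raw config value are illustrative, not the patch's struct perf_event plumbing):

#include <stdint.h>

#define INST_RETIRED_ALL	0x01c0	/* umask 0x01, event 0xc0 */

static int64_t erratum_min_period(uint16_t config, int64_t left)
{
	/* HSW11/BDM11: INST_RETIRED.ALL needs a period of at least 128;
	 * all other events get at least 32 so two counters cannot expire
	 * back to back in the same NMI. */
	int64_t min = (config == INST_RETIRED_ALL) ? 128 : 32;

	return left > min ? left : min;
}

/* erratum_min_period(INST_RETIRED_ALL, 100) == 128
 * erratum_min_period(0x00c4, 10)           == 32 */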
diff --git a/arch/x86/events/intel/cstate.c b/arch/x86/events/intel/cstate.c
index be58cfb012dd..9f116dfc4728 100644
--- a/arch/x86/events/intel/cstate.c
+++ b/arch/x86/events/intel/cstate.c
@@ -64,7 +64,7 @@
* perf code: 0x00
* Available model: SNB,IVB,HSW,BDW,SKL,KNL,GLM,CNL,
* KBL,CML,ICL,ICX,TGL,TNT,RKL,ADL,
- * RPL,SPR,MTL,ARL,LNL
+ * RPL,SPR,MTL,ARL,LNL,SRF
* Scope: Package (physical package)
* MSR_PKG_C3_RESIDENCY: Package C3 Residency Counter.
* perf code: 0x01
@@ -693,7 +693,8 @@ static const struct cstate_model srf_cstates __initconst = {
.core_events = BIT(PERF_CSTATE_CORE_C1_RES) |
BIT(PERF_CSTATE_CORE_C6_RES),
- .pkg_events = BIT(PERF_CSTATE_PKG_C6_RES),
+ .pkg_events = BIT(PERF_CSTATE_PKG_C2_RES) |
+ BIT(PERF_CSTATE_PKG_C6_RES),
.module_events = BIT(PERF_CSTATE_MODULE_C6_RES),
};
diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c
index 17a71e92a343..95eada2994e1 100644
--- a/arch/x86/hyperv/hv_init.c
+++ b/arch/x86/hyperv/hv_init.c
@@ -35,7 +35,6 @@
#include <clocksource/hyperv_timer.h>
#include <linux/highmem.h>
-int hyperv_init_cpuhp;
u64 hv_current_partition_id = ~0ull;
EXPORT_SYMBOL_GPL(hv_current_partition_id);
@@ -607,8 +606,6 @@ skip_hypercall_pg_init:
register_syscore_ops(&hv_syscore_ops);
- hyperv_init_cpuhp = cpuhp;
-
if (cpuid_ebx(HYPERV_CPUID_FEATURES) & HV_ACCESS_PARTITION_ID)
hv_get_partition_id();
@@ -637,7 +634,7 @@ skip_hypercall_pg_init:
clean_guest_os_id:
wrmsrl(HV_X64_MSR_GUEST_OS_ID, 0);
hv_ivm_msr_write(HV_X64_MSR_GUEST_OS_ID, 0);
- cpuhp_remove_state(cpuhp);
+ cpuhp_remove_state(CPUHP_AP_HYPERV_ONLINE);
free_ghcb_page:
free_percpu(hv_ghcb_pg);
free_vp_assist_page:
diff --git a/arch/x86/include/asm/cmdline.h b/arch/x86/include/asm/cmdline.h
index 6faaf27e8899..6cbd9ae58b21 100644
--- a/arch/x86/include/asm/cmdline.h
+++ b/arch/x86/include/asm/cmdline.h
@@ -2,6 +2,10 @@
#ifndef _ASM_X86_CMDLINE_H
#define _ASM_X86_CMDLINE_H
+#include <asm/setup.h>
+
+extern char builtin_cmdline[COMMAND_LINE_SIZE];
+
int cmdline_find_option_bool(const char *cmdline_ptr, const char *option);
int cmdline_find_option(const char *cmdline_ptr, const char *option,
char *buffer, int bufsize);
diff --git a/arch/x86/include/asm/cpu_device_id.h b/arch/x86/include/asm/cpu_device_id.h
index 3831f612e89c..e4121d9aa9e1 100644
--- a/arch/x86/include/asm/cpu_device_id.h
+++ b/arch/x86/include/asm/cpu_device_id.h
@@ -193,26 +193,6 @@
X86_MATCH_VENDOR_FAM_MODEL(vendor, family, X86_MODEL_ANY, data)
/**
- * X86_MATCH_INTEL_FAM6_MODEL - Match vendor INTEL, family 6 and model
- * @model: The model name without the INTEL_FAM6_ prefix or ANY
- * The model name is expanded to INTEL_FAM6_@model internally
- * @data: Driver specific data or NULL. The internal storage
- * format is unsigned long. The supplied value, pointer
- * etc. is casted to unsigned long internally.
- *
- * The vendor is set to INTEL, the family to 6 and all other missing
- * arguments of X86_MATCH_VENDOR_FAM_MODEL_FEATURE() are set to wildcards.
- *
- * See X86_MATCH_VENDOR_FAM_MODEL_FEATURE() for further information.
- */
-#define X86_MATCH_INTEL_FAM6_MODEL(model, data) \
- X86_MATCH_VENDOR_FAM_MODEL(INTEL, 6, INTEL_FAM6_##model, data)
-
-#define X86_MATCH_INTEL_FAM6_MODEL_STEPPINGS(model, steppings, data) \
- X86_MATCH_VENDOR_FAM_MODEL_STEPPINGS_FEATURE(INTEL, 6, INTEL_FAM6_##model, \
- steppings, X86_FEATURE_ANY, data)
-
-/**
* X86_MATCH_VFM - Match encoded vendor/family/model
* @vfm: Encoded 8-bits each for vendor, family, model
* @data: Driver specific data or NULL. The internal storage
diff --git a/arch/x86/include/asm/fpu/types.h b/arch/x86/include/asm/fpu/types.h
index eb17f31b06d2..de16862bf230 100644
--- a/arch/x86/include/asm/fpu/types.h
+++ b/arch/x86/include/asm/fpu/types.h
@@ -591,6 +591,13 @@ struct fpu_state_config {
* even without XSAVE support, i.e. legacy features FP + SSE
*/
u64 legacy_features;
+ /*
+ * @independent_features:
+ *
+ * Features that are supported by XSAVES, but not managed as part of
+ * the FPU core, such as LBR
+ */
+ u64 independent_features;
};
/* FPU state configuration information */
diff --git a/arch/x86/include/asm/intel-family.h b/arch/x86/include/asm/intel-family.h
index f81a851c46dc..44949f972826 100644
--- a/arch/x86/include/asm/intel-family.h
+++ b/arch/x86/include/asm/intel-family.h
@@ -10,7 +10,7 @@
* that group keep the CPUID for the variants sorted by model number.
*
* The defined symbol names have the following form:
- * INTEL_FAM6{OPTFAMILY}_{MICROARCH}{OPTDIFF}
+ * INTEL_{OPTFAMILY}_{MICROARCH}{OPTDIFF}
* where:
* OPTFAMILY Describes the family of CPUs that this belongs to. Default
* is assumed to be "_CORE" (and should be omitted). Other values
@@ -42,215 +42,136 @@
#define IFM(_fam, _model) VFM_MAKE(X86_VENDOR_INTEL, _fam, _model)
-/* Wildcard match for FAM6 so X86_MATCH_INTEL_FAM6_MODEL(ANY) works */
-#define INTEL_FAM6_ANY X86_MODEL_ANY
-/* Wildcard match for FAM6 so X86_MATCH_VFM(ANY) works */
+/* Wildcard match so X86_MATCH_VFM(ANY) works */
#define INTEL_ANY IFM(X86_FAMILY_ANY, X86_MODEL_ANY)
-#define INTEL_FAM6_CORE_YONAH 0x0E
+#define INTEL_PENTIUM_PRO IFM(6, 0x01)
+
#define INTEL_CORE_YONAH IFM(6, 0x0E)
-#define INTEL_FAM6_CORE2_MEROM 0x0F
#define INTEL_CORE2_MEROM IFM(6, 0x0F)
-#define INTEL_FAM6_CORE2_MEROM_L 0x16
#define INTEL_CORE2_MEROM_L IFM(6, 0x16)
-#define INTEL_FAM6_CORE2_PENRYN 0x17
#define INTEL_CORE2_PENRYN IFM(6, 0x17)
-#define INTEL_FAM6_CORE2_DUNNINGTON 0x1D
#define INTEL_CORE2_DUNNINGTON IFM(6, 0x1D)
-#define INTEL_FAM6_NEHALEM 0x1E
#define INTEL_NEHALEM IFM(6, 0x1E)
-#define INTEL_FAM6_NEHALEM_G 0x1F /* Auburndale / Havendale */
#define INTEL_NEHALEM_G IFM(6, 0x1F) /* Auburndale / Havendale */
-#define INTEL_FAM6_NEHALEM_EP 0x1A
#define INTEL_NEHALEM_EP IFM(6, 0x1A)
-#define INTEL_FAM6_NEHALEM_EX 0x2E
#define INTEL_NEHALEM_EX IFM(6, 0x2E)
-#define INTEL_FAM6_WESTMERE 0x25
#define INTEL_WESTMERE IFM(6, 0x25)
-#define INTEL_FAM6_WESTMERE_EP 0x2C
#define INTEL_WESTMERE_EP IFM(6, 0x2C)
-#define INTEL_FAM6_WESTMERE_EX 0x2F
#define INTEL_WESTMERE_EX IFM(6, 0x2F)
-#define INTEL_FAM6_SANDYBRIDGE 0x2A
#define INTEL_SANDYBRIDGE IFM(6, 0x2A)
-#define INTEL_FAM6_SANDYBRIDGE_X 0x2D
#define INTEL_SANDYBRIDGE_X IFM(6, 0x2D)
-#define INTEL_FAM6_IVYBRIDGE 0x3A
#define INTEL_IVYBRIDGE IFM(6, 0x3A)
-#define INTEL_FAM6_IVYBRIDGE_X 0x3E
#define INTEL_IVYBRIDGE_X IFM(6, 0x3E)
-#define INTEL_FAM6_HASWELL 0x3C
#define INTEL_HASWELL IFM(6, 0x3C)
-#define INTEL_FAM6_HASWELL_X 0x3F
#define INTEL_HASWELL_X IFM(6, 0x3F)
-#define INTEL_FAM6_HASWELL_L 0x45
#define INTEL_HASWELL_L IFM(6, 0x45)
-#define INTEL_FAM6_HASWELL_G 0x46
#define INTEL_HASWELL_G IFM(6, 0x46)
-#define INTEL_FAM6_BROADWELL 0x3D
#define INTEL_BROADWELL IFM(6, 0x3D)
-#define INTEL_FAM6_BROADWELL_G 0x47
#define INTEL_BROADWELL_G IFM(6, 0x47)
-#define INTEL_FAM6_BROADWELL_X 0x4F
#define INTEL_BROADWELL_X IFM(6, 0x4F)
-#define INTEL_FAM6_BROADWELL_D 0x56
#define INTEL_BROADWELL_D IFM(6, 0x56)
-#define INTEL_FAM6_SKYLAKE_L 0x4E /* Sky Lake */
#define INTEL_SKYLAKE_L IFM(6, 0x4E) /* Sky Lake */
-#define INTEL_FAM6_SKYLAKE 0x5E /* Sky Lake */
#define INTEL_SKYLAKE IFM(6, 0x5E) /* Sky Lake */
-#define INTEL_FAM6_SKYLAKE_X 0x55 /* Sky Lake */
#define INTEL_SKYLAKE_X IFM(6, 0x55) /* Sky Lake */
/* CASCADELAKE_X 0x55 Sky Lake -- s: 7 */
/* COOPERLAKE_X 0x55 Sky Lake -- s: 11 */
-#define INTEL_FAM6_KABYLAKE_L 0x8E /* Sky Lake */
#define INTEL_KABYLAKE_L IFM(6, 0x8E) /* Sky Lake */
/* AMBERLAKE_L 0x8E Sky Lake -- s: 9 */
/* COFFEELAKE_L 0x8E Sky Lake -- s: 10 */
/* WHISKEYLAKE_L 0x8E Sky Lake -- s: 11,12 */
-#define INTEL_FAM6_KABYLAKE 0x9E /* Sky Lake */
#define INTEL_KABYLAKE IFM(6, 0x9E) /* Sky Lake */
/* COFFEELAKE 0x9E Sky Lake -- s: 10-13 */
-#define INTEL_FAM6_COMETLAKE 0xA5 /* Sky Lake */
#define INTEL_COMETLAKE IFM(6, 0xA5) /* Sky Lake */
-#define INTEL_FAM6_COMETLAKE_L 0xA6 /* Sky Lake */
#define INTEL_COMETLAKE_L IFM(6, 0xA6) /* Sky Lake */
-#define INTEL_FAM6_CANNONLAKE_L 0x66 /* Palm Cove */
#define INTEL_CANNONLAKE_L IFM(6, 0x66) /* Palm Cove */
-#define INTEL_FAM6_ICELAKE_X 0x6A /* Sunny Cove */
#define INTEL_ICELAKE_X IFM(6, 0x6A) /* Sunny Cove */
-#define INTEL_FAM6_ICELAKE_D 0x6C /* Sunny Cove */
#define INTEL_ICELAKE_D IFM(6, 0x6C) /* Sunny Cove */
-#define INTEL_FAM6_ICELAKE 0x7D /* Sunny Cove */
#define INTEL_ICELAKE IFM(6, 0x7D) /* Sunny Cove */
-#define INTEL_FAM6_ICELAKE_L 0x7E /* Sunny Cove */
#define INTEL_ICELAKE_L IFM(6, 0x7E) /* Sunny Cove */
-#define INTEL_FAM6_ICELAKE_NNPI 0x9D /* Sunny Cove */
#define INTEL_ICELAKE_NNPI IFM(6, 0x9D) /* Sunny Cove */
-#define INTEL_FAM6_ROCKETLAKE 0xA7 /* Cypress Cove */
#define INTEL_ROCKETLAKE IFM(6, 0xA7) /* Cypress Cove */
-#define INTEL_FAM6_TIGERLAKE_L 0x8C /* Willow Cove */
#define INTEL_TIGERLAKE_L IFM(6, 0x8C) /* Willow Cove */
-#define INTEL_FAM6_TIGERLAKE 0x8D /* Willow Cove */
#define INTEL_TIGERLAKE IFM(6, 0x8D) /* Willow Cove */
-#define INTEL_FAM6_SAPPHIRERAPIDS_X 0x8F /* Golden Cove */
#define INTEL_SAPPHIRERAPIDS_X IFM(6, 0x8F) /* Golden Cove */
-#define INTEL_FAM6_EMERALDRAPIDS_X 0xCF
#define INTEL_EMERALDRAPIDS_X IFM(6, 0xCF)
-#define INTEL_FAM6_GRANITERAPIDS_X 0xAD
#define INTEL_GRANITERAPIDS_X IFM(6, 0xAD)
-#define INTEL_FAM6_GRANITERAPIDS_D 0xAE
#define INTEL_GRANITERAPIDS_D IFM(6, 0xAE)
/* "Hybrid" Processors (P-Core/E-Core) */
-#define INTEL_FAM6_LAKEFIELD 0x8A /* Sunny Cove / Tremont */
#define INTEL_LAKEFIELD IFM(6, 0x8A) /* Sunny Cove / Tremont */
-#define INTEL_FAM6_ALDERLAKE 0x97 /* Golden Cove / Gracemont */
#define INTEL_ALDERLAKE IFM(6, 0x97) /* Golden Cove / Gracemont */
-#define INTEL_FAM6_ALDERLAKE_L 0x9A /* Golden Cove / Gracemont */
#define INTEL_ALDERLAKE_L IFM(6, 0x9A) /* Golden Cove / Gracemont */
-#define INTEL_FAM6_RAPTORLAKE 0xB7 /* Raptor Cove / Enhanced Gracemont */
#define INTEL_RAPTORLAKE IFM(6, 0xB7) /* Raptor Cove / Enhanced Gracemont */
-#define INTEL_FAM6_RAPTORLAKE_P 0xBA
#define INTEL_RAPTORLAKE_P IFM(6, 0xBA)
-#define INTEL_FAM6_RAPTORLAKE_S 0xBF
#define INTEL_RAPTORLAKE_S IFM(6, 0xBF)
-#define INTEL_FAM6_METEORLAKE 0xAC
#define INTEL_METEORLAKE IFM(6, 0xAC)
-#define INTEL_FAM6_METEORLAKE_L 0xAA
#define INTEL_METEORLAKE_L IFM(6, 0xAA)
-#define INTEL_FAM6_ARROWLAKE_H 0xC5
#define INTEL_ARROWLAKE_H IFM(6, 0xC5)
-#define INTEL_FAM6_ARROWLAKE 0xC6
#define INTEL_ARROWLAKE IFM(6, 0xC6)
-#define INTEL_FAM6_ARROWLAKE_U 0xB5
#define INTEL_ARROWLAKE_U IFM(6, 0xB5)
-#define INTEL_FAM6_LUNARLAKE_M 0xBD
#define INTEL_LUNARLAKE_M IFM(6, 0xBD)
/* "Small Core" Processors (Atom/E-Core) */
-#define INTEL_FAM6_ATOM_BONNELL 0x1C /* Diamondville, Pineview */
#define INTEL_ATOM_BONNELL IFM(6, 0x1C) /* Diamondville, Pineview */
-#define INTEL_FAM6_ATOM_BONNELL_MID 0x26 /* Silverthorne, Lincroft */
#define INTEL_ATOM_BONNELL_MID IFM(6, 0x26) /* Silverthorne, Lincroft */
-#define INTEL_FAM6_ATOM_SALTWELL 0x36 /* Cedarview */
#define INTEL_ATOM_SALTWELL IFM(6, 0x36) /* Cedarview */
-#define INTEL_FAM6_ATOM_SALTWELL_MID 0x27 /* Penwell */
#define INTEL_ATOM_SALTWELL_MID IFM(6, 0x27) /* Penwell */
-#define INTEL_FAM6_ATOM_SALTWELL_TABLET 0x35 /* Cloverview */
#define INTEL_ATOM_SALTWELL_TABLET IFM(6, 0x35) /* Cloverview */
-#define INTEL_FAM6_ATOM_SILVERMONT 0x37 /* Bay Trail, Valleyview */
#define INTEL_ATOM_SILVERMONT IFM(6, 0x37) /* Bay Trail, Valleyview */
-#define INTEL_FAM6_ATOM_SILVERMONT_D 0x4D /* Avaton, Rangely */
#define INTEL_ATOM_SILVERMONT_D IFM(6, 0x4D) /* Avaton, Rangely */
-#define INTEL_FAM6_ATOM_SILVERMONT_MID 0x4A /* Merriefield */
#define INTEL_ATOM_SILVERMONT_MID IFM(6, 0x4A) /* Merriefield */
-#define INTEL_FAM6_ATOM_AIRMONT 0x4C /* Cherry Trail, Braswell */
#define INTEL_ATOM_AIRMONT IFM(6, 0x4C) /* Cherry Trail, Braswell */
-#define INTEL_FAM6_ATOM_AIRMONT_MID 0x5A /* Moorefield */
#define INTEL_ATOM_AIRMONT_MID IFM(6, 0x5A) /* Moorefield */
-#define INTEL_FAM6_ATOM_AIRMONT_NP 0x75 /* Lightning Mountain */
#define INTEL_ATOM_AIRMONT_NP IFM(6, 0x75) /* Lightning Mountain */
-#define INTEL_FAM6_ATOM_GOLDMONT 0x5C /* Apollo Lake */
#define INTEL_ATOM_GOLDMONT IFM(6, 0x5C) /* Apollo Lake */
-#define INTEL_FAM6_ATOM_GOLDMONT_D 0x5F /* Denverton */
#define INTEL_ATOM_GOLDMONT_D IFM(6, 0x5F) /* Denverton */
/* Note: the micro-architecture is "Goldmont Plus" */
-#define INTEL_FAM6_ATOM_GOLDMONT_PLUS 0x7A /* Gemini Lake */
#define INTEL_ATOM_GOLDMONT_PLUS IFM(6, 0x7A) /* Gemini Lake */
-#define INTEL_FAM6_ATOM_TREMONT_D 0x86 /* Jacobsville */
#define INTEL_ATOM_TREMONT_D IFM(6, 0x86) /* Jacobsville */
-#define INTEL_FAM6_ATOM_TREMONT 0x96 /* Elkhart Lake */
#define INTEL_ATOM_TREMONT IFM(6, 0x96) /* Elkhart Lake */
-#define INTEL_FAM6_ATOM_TREMONT_L 0x9C /* Jasper Lake */
#define INTEL_ATOM_TREMONT_L IFM(6, 0x9C) /* Jasper Lake */
-#define INTEL_FAM6_ATOM_GRACEMONT 0xBE /* Alderlake N */
#define INTEL_ATOM_GRACEMONT IFM(6, 0xBE) /* Alderlake N */
-#define INTEL_FAM6_ATOM_CRESTMONT_X 0xAF /* Sierra Forest */
#define INTEL_ATOM_CRESTMONT_X IFM(6, 0xAF) /* Sierra Forest */
-#define INTEL_FAM6_ATOM_CRESTMONT 0xB6 /* Grand Ridge */
#define INTEL_ATOM_CRESTMONT IFM(6, 0xB6) /* Grand Ridge */
-#define INTEL_FAM6_ATOM_DARKMONT_X 0xDD /* Clearwater Forest */
#define INTEL_ATOM_DARKMONT_X IFM(6, 0xDD) /* Clearwater Forest */
/* Xeon Phi */
-#define INTEL_FAM6_XEON_PHI_KNL 0x57 /* Knights Landing */
#define INTEL_XEON_PHI_KNL IFM(6, 0x57) /* Knights Landing */
-#define INTEL_FAM6_XEON_PHI_KNM 0x85 /* Knights Mill */
#define INTEL_XEON_PHI_KNM IFM(6, 0x85) /* Knights Mill */
/* Family 5 */
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 950a03e0181e..4a68cb3eba78 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1305,6 +1305,7 @@ struct kvm_arch {
u8 vm_type;
bool has_private_mem;
bool has_protected_state;
+ bool pre_fault_allowed;
struct hlist_head mmu_page_hash[KVM_NUM_MMU_PAGES];
struct list_head active_mmu_pages;
struct list_head zapped_obsolete_pages;
@@ -2191,6 +2192,8 @@ void kvm_configure_mmu(bool enable_tdp, int tdp_forced_root_level,
#define kvm_arch_has_private_mem(kvm) false
#endif
+#define kvm_arch_has_readonly_mem(kvm) (!(kvm)->arch.has_protected_state)
+
static inline u16 kvm_read_ldt(void)
{
u16 ldt;
diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index 3ad29b128943..3b9970117a0f 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -221,7 +221,7 @@ static inline int apei_smca_report_x86_error(struct cper_ia_proc_ctx *ctx_info,
u64 lapic_id) { return -EINVAL; }
#endif
-void mce_setup(struct mce *m);
+void mce_prep_record(struct mce *m);
void mce_log(struct mce *m);
DECLARE_PER_CPU(struct device *, mce_device);
diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h
index 390c4d13956d..5f0bc6a6d025 100644
--- a/arch/x86/include/asm/mshyperv.h
+++ b/arch/x86/include/asm/mshyperv.h
@@ -40,7 +40,6 @@ static inline unsigned char hv_get_nmi_reason(void)
}
#if IS_ENABLED(CONFIG_HYPERV)
-extern int hyperv_init_cpuhp;
extern bool hyperv_paravisor_present;
extern void *hv_hypercall_pg;
diff --git a/arch/x86/include/asm/page_64.h b/arch/x86/include/asm/page_64.h
index af4302d79b59..f3d257c45225 100644
--- a/arch/x86/include/asm/page_64.h
+++ b/arch/x86/include/asm/page_64.h
@@ -17,6 +17,7 @@ extern unsigned long phys_base;
extern unsigned long page_offset_base;
extern unsigned long vmalloc_base;
extern unsigned long vmemmap_base;
+extern unsigned long physmem_end;
static __always_inline unsigned long __phys_addr_nodebug(unsigned long x)
{
diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h
index 9053dfe9fa03..a98e53491a4e 100644
--- a/arch/x86/include/asm/pgtable_64_types.h
+++ b/arch/x86/include/asm/pgtable_64_types.h
@@ -140,6 +140,10 @@ extern unsigned int ptrs_per_p4d;
# define VMEMMAP_START __VMEMMAP_BASE_L4
#endif /* CONFIG_DYNAMIC_MEMORY_LAYOUT */
+#ifdef CONFIG_RANDOMIZE_MEMORY
+# define PHYSMEM_END physmem_end
+#endif
+
/*
* End of the region for which vmalloc page tables are pre-allocated.
* For non-KMSAN builds, this is the same as VMALLOC_END.
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index a75a07f4931f..775acbdea1a9 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -691,8 +691,6 @@ static inline u32 per_cpu_l2c_id(unsigned int cpu)
}
#ifdef CONFIG_CPU_SUP_AMD
-extern u32 amd_get_highest_perf(void);
-
/*
* Issue a DIV 0/1 insn to clear any division data from previous DIV
* operations.
@@ -705,7 +703,6 @@ static __always_inline void amd_clear_divider(void)
extern void amd_check_microcode(void);
#else
-static inline u32 amd_get_highest_perf(void) { return 0; }
static inline void amd_clear_divider(void) { }
static inline void amd_check_microcode(void) { }
#endif
diff --git a/arch/x86/include/asm/qspinlock.h b/arch/x86/include/asm/qspinlock.h
index a053c1293975..68da67df304d 100644
--- a/arch/x86/include/asm/qspinlock.h
+++ b/arch/x86/include/asm/qspinlock.h
@@ -66,13 +66,15 @@ static inline bool vcpu_is_preempted(long cpu)
#ifdef CONFIG_PARAVIRT
/*
- * virt_spin_lock_key - enables (by default) the virt_spin_lock() hijack.
+ * virt_spin_lock_key - disables the virt_spin_lock() hijack by default.
*
- * Native (and PV wanting native due to vCPU pinning) should disable this key.
- * It is done in this backwards fashion to only have a single direction change,
- * which removes ordering between native_pv_spin_init() and HV setup.
+ * Native (and PV wanting native due to vCPU pinning) should keep this key
+ * disabled. Native does not touch the key.
+ *
+ * When running in a guest, native_pv_lock_init() enables the key first and
+ * KVM/Xen might conditionally disable it again later in the boot process.
*/
-DECLARE_STATIC_KEY_TRUE(virt_spin_lock_key);
+DECLARE_STATIC_KEY_FALSE(virt_spin_lock_key);
/*
* Shortcut for the queued_spin_lock_slowpath() function that allows
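The rewritten comment describes an enable-first, maybe-disable-later lifecycle for the static key. A hedged sketch of that flow (the function and its boolean arguments are stand-ins, not the real native_pv_lock_init() signature; only the static-key calls mirror the described protocol):

#include <linux/jump_label.h>

DEFINE_STATIC_KEY_FALSE(example_virt_spin_lock_key);

static void example_pv_lock_init(bool in_guest, bool vcpus_pinned)
{
	if (!in_guest)
		return;		/* native: the key is never touched */

	/* Guest: enable the hijack early ... */
	static_branch_enable(&example_virt_spin_lock_key);

	/* ... and let KVM/Xen turn it back off later if native queued
	 * spinlocks are preferable (e.g. pinned vCPUs). */
	if (vcpus_pinned)
		static_branch_disable(&example_virt_spin_lock_key);
}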
diff --git a/arch/x86/include/asm/resctrl.h b/arch/x86/include/asm/resctrl.h
index 12dbd2588ca7..8b1b6ce1e51b 100644
--- a/arch/x86/include/asm/resctrl.h
+++ b/arch/x86/include/asm/resctrl.h
@@ -156,12 +156,6 @@ static inline void resctrl_sched_in(struct task_struct *tsk)
__resctrl_sched_in(tsk);
}
-static inline u32 resctrl_arch_system_num_rmid_idx(void)
-{
- /* RMID are independent numbers for x86. num_rmid_idx == num_rmid */
- return boot_cpu_data.x86_cache_max_rmid + 1;
-}
-
static inline void resctrl_arch_rmid_idx_decode(u32 idx, u32 *closid, u32 *rmid)
{
*rmid = idx;
diff --git a/arch/x86/include/asm/sev.h b/arch/x86/include/asm/sev.h
index 79bbe2be900e..ee34ab00a8d6 100644
--- a/arch/x86/include/asm/sev.h
+++ b/arch/x86/include/asm/sev.h
@@ -164,7 +164,7 @@ struct snp_guest_msg_hdr {
struct snp_guest_msg {
struct snp_guest_msg_hdr hdr;
- u8 payload[4000];
+ u8 payload[PAGE_SIZE - sizeof(struct snp_guest_msg_hdr)];
} __packed;
struct sev_guest_platform_data {
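With the payload sized against the header, the whole packed message is exactly one page. A hedged compile-time check of that property (not part of the patch; assumes 4 KiB pages and visibility of the struct above):

#include <linux/build_bug.h>

static_assert(sizeof(struct snp_guest_msg) == PAGE_SIZE,
	      "snp_guest_msg must occupy exactly one page");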
diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h
index abe3a8f22cbd..aef70336d624 100644
--- a/arch/x86/include/asm/topology.h
+++ b/arch/x86/include/asm/topology.h
@@ -282,9 +282,22 @@ static inline long arch_scale_freq_capacity(int cpu)
}
#define arch_scale_freq_capacity arch_scale_freq_capacity
+bool arch_enable_hybrid_capacity_scale(void);
+void arch_set_cpu_capacity(int cpu, unsigned long cap, unsigned long max_cap,
+ unsigned long cap_freq, unsigned long base_freq);
+
+unsigned long arch_scale_cpu_capacity(int cpu);
+#define arch_scale_cpu_capacity arch_scale_cpu_capacity
+
extern void arch_set_max_freq_ratio(bool turbo_disabled);
extern void freq_invariance_set_perf_ratio(u64 ratio, bool turbo_disabled);
#else
+static inline bool arch_enable_hybrid_capacity_scale(void) { return false; }
+static inline void arch_set_cpu_capacity(int cpu, unsigned long cap,
+ unsigned long max_cap,
+ unsigned long cap_freq,
+ unsigned long base_freq) { }
+
static inline void arch_set_max_freq_ratio(bool turbo_disabled) { }
static inline void freq_invariance_set_perf_ratio(u64 ratio, bool turbo_disabled) { }
#endif
diff --git a/arch/x86/kernel/acpi/cppc.c b/arch/x86/kernel/acpi/cppc.c
index ff8f25faca3d..956984054bf3 100644
--- a/arch/x86/kernel/acpi/cppc.c
+++ b/arch/x86/kernel/acpi/cppc.c
@@ -9,6 +9,17 @@
#include <asm/processor.h>
#include <asm/topology.h>
+#define CPPC_HIGHEST_PERF_PERFORMANCE 196
+#define CPPC_HIGHEST_PERF_PREFCORE 166
+
+enum amd_pref_core {
+ AMD_PREF_CORE_UNKNOWN = 0,
+ AMD_PREF_CORE_SUPPORTED,
+ AMD_PREF_CORE_UNSUPPORTED,
+};
+static enum amd_pref_core amd_pref_core_detected;
+static u64 boost_numerator;
+
/* Refer to drivers/acpi/cppc_acpi.c for the description of functions */
bool cpc_supported_by_cpu(void)
@@ -69,31 +80,30 @@ int cpc_write_ffh(int cpunum, struct cpc_reg *reg, u64 val)
static void amd_set_max_freq_ratio(void)
{
struct cppc_perf_caps perf_caps;
- u64 highest_perf, nominal_perf;
+ u64 numerator, nominal_perf;
u64 perf_ratio;
int rc;
rc = cppc_get_perf_caps(0, &perf_caps);
if (rc) {
- pr_debug("Could not retrieve perf counters (%d)\n", rc);
+ pr_warn("Could not retrieve perf counters (%d)\n", rc);
return;
}
- highest_perf = amd_get_highest_perf();
+ rc = amd_get_boost_ratio_numerator(0, &numerator);
+ if (rc) {
+ pr_warn("Could not retrieve highest performance (%d)\n", rc);
+ return;
+ }
nominal_perf = perf_caps.nominal_perf;
- if (!highest_perf || !nominal_perf) {
- pr_debug("Could not retrieve highest or nominal performance\n");
+ if (!nominal_perf) {
+ pr_warn("Could not retrieve nominal performance\n");
return;
}
- perf_ratio = div_u64(highest_perf * SCHED_CAPACITY_SCALE, nominal_perf);
/* midpoint between max_boost and max_P */
- perf_ratio = (perf_ratio + SCHED_CAPACITY_SCALE) >> 1;
- if (!perf_ratio) {
- pr_debug("Non-zero highest/nominal perf values led to a 0 ratio\n");
- return;
- }
+ perf_ratio = (div_u64(numerator * SCHED_CAPACITY_SCALE, nominal_perf) + SCHED_CAPACITY_SCALE) >> 1;
freq_invariance_set_perf_ratio(perf_ratio, false);
}
@@ -116,3 +126,143 @@ void init_freq_invariance_cppc(void)
init_done = true;
mutex_unlock(&freq_invariance_lock);
}
+
+/*
+ * Get the highest performance register value.
+ * @cpu: CPU from which to get highest performance.
+ * @highest_perf: Return address for highest performance value.
+ *
+ * Return: 0 for success, negative error code otherwise.
+ */
+int amd_get_highest_perf(unsigned int cpu, u32 *highest_perf)
+{
+ u64 val;
+ int ret;
+
+ if (cpu_feature_enabled(X86_FEATURE_CPPC)) {
+ ret = rdmsrl_safe_on_cpu(cpu, MSR_AMD_CPPC_CAP1, &val);
+ if (ret)
+ goto out;
+
+ val = AMD_CPPC_HIGHEST_PERF(val);
+ } else {
+ ret = cppc_get_highest_perf(cpu, &val);
+ if (ret)
+ goto out;
+ }
+
+ WRITE_ONCE(*highest_perf, (u32)val);
+out:
+ return ret;
+}
+EXPORT_SYMBOL_GPL(amd_get_highest_perf);
+
+/**
+ * amd_detect_prefcore: Detect if CPUs in the system support preferred cores
+ * @detected: Output variable for the result of the detection.
+ *
+ * Determine whether CPUs in the system support preferred cores. On systems
+ * that support preferred cores, different highest perf values will be found
+ * on different cores. On other systems, the highest perf value will be the
+ * same on all cores.
+ *
+ * The result of the detection will be stored in the 'detected' parameter.
+ *
+ * Return: 0 for success, negative error code otherwise
+ */
+int amd_detect_prefcore(bool *detected)
+{
+ int cpu, count = 0;
+ u64 highest_perf[2] = {0};
+
+ if (WARN_ON(!detected))
+ return -EINVAL;
+
+ switch (amd_pref_core_detected) {
+ case AMD_PREF_CORE_SUPPORTED:
+ *detected = true;
+ return 0;
+ case AMD_PREF_CORE_UNSUPPORTED:
+ *detected = false;
+ return 0;
+ default:
+ break;
+ }
+
+ for_each_present_cpu(cpu) {
+ u32 tmp;
+ int ret;
+
+ ret = amd_get_highest_perf(cpu, &tmp);
+ if (ret)
+ return ret;
+
+ if (!count || (count == 1 && tmp != highest_perf[0]))
+ highest_perf[count++] = tmp;
+
+ if (count == 2)
+ break;
+ }
+
+ *detected = (count == 2);
+ boost_numerator = highest_perf[0];
+
+ amd_pref_core_detected = *detected ? AMD_PREF_CORE_SUPPORTED :
+ AMD_PREF_CORE_UNSUPPORTED;
+
+ pr_debug("AMD CPPC preferred core is %ssupported (highest perf: 0x%llx)\n",
+ *detected ? "" : "un", highest_perf[0]);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(amd_detect_prefcore);
+
+/**
+ * amd_get_boost_ratio_numerator: Get the numerator to use for boost ratio calculation
+ * @cpu: CPU to get numerator for.
+ * @numerator: Output variable for numerator.
+ *
+ * Determine the numerator to use for calculating the boost ratio on
+ * a CPU. On systems that support preferred cores, this will be a hardcoded
+ * value. On other systems this will be the highest performance register value.
+ *
+ * If the system boots with amd-pstate enabled but preferred cores disabled, the
+ * correct boost numerator is still returned to match hardware capabilities, even
+ * though the preferred-core scheduling hints are not enabled.
+ *
+ * Return: 0 for success, negative error code otherwise.
+ */
+int amd_get_boost_ratio_numerator(unsigned int cpu, u64 *numerator)
+{
+ bool prefcore;
+ int ret;
+
+ ret = amd_detect_prefcore(&prefcore);
+ if (ret)
+ return ret;
+
+ /* without preferred cores, return the highest perf register value */
+ if (!prefcore) {
+ *numerator = boost_numerator;
+ return 0;
+ }
+
+ /*
+ * For AMD CPUs with Family ID 19H and Model ID range 0x70 to 0x7f,
+ * the highest performance level is set to 196.
+ * https://bugzilla.kernel.org/show_bug.cgi?id=218759
+ */
+ if (cpu_feature_enabled(X86_FEATURE_ZEN4)) {
+ switch (boot_cpu_data.x86_model) {
+ case 0x70 ... 0x7f:
+ *numerator = CPPC_HIGHEST_PERF_PERFORMANCE;
+ return 0;
+ default:
+ break;
+ }
+ }
+ *numerator = CPPC_HIGHEST_PERF_PREFCORE;
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(amd_get_boost_ratio_numerator);
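amd_set_max_freq_ratio() above folds the ratio math into a single expression: scale the boost numerator by nominal performance, then take the midpoint with 1.0 (both in SCHED_CAPACITY_SCALE units). A hedged user-space sketch with made-up numbers (the values are illustrative, not taken from the patch):

#include <stdint.h>

#define SCHED_CAPACITY_SCALE 1024ULL

static uint64_t example_perf_ratio(uint64_t numerator, uint64_t nominal_perf)
{
	/* boost ratio in 1024ths, e.g. 166/120 -> ~1416 (~1.38x) */
	uint64_t boost = numerator * SCHED_CAPACITY_SCALE / nominal_perf;

	/* midpoint between max_boost and max_P (1.0 == 1024) */
	return (boost + SCHED_CAPACITY_SCALE) >> 1;
}

/* example_perf_ratio(166, 120) == (1416 + 1024) / 2 == 1220 (~1.19x) */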
diff --git a/arch/x86/kernel/acpi/madt_wakeup.c b/arch/x86/kernel/acpi/madt_wakeup.c
index 6cfe762be28b..d5ef6215583b 100644
--- a/arch/x86/kernel/acpi/madt_wakeup.c
+++ b/arch/x86/kernel/acpi/madt_wakeup.c
@@ -19,7 +19,7 @@
static u64 acpi_mp_wake_mailbox_paddr __ro_after_init;
/* Virtual address of the Multiprocessor Wakeup Structure mailbox */
-static struct acpi_madt_multiproc_wakeup_mailbox *acpi_mp_wake_mailbox __ro_after_init;
+static struct acpi_madt_multiproc_wakeup_mailbox *acpi_mp_wake_mailbox;
static u64 acpi_mp_pgd __ro_after_init;
static u64 acpi_mp_reset_vector_paddr __ro_after_init;
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index 66fd4b2a37a3..373638691cd4 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -1775,12 +1775,9 @@ static __init void apic_set_fixmap(bool read_apic);
static __init void x2apic_disable(void)
{
- u32 x2apic_id, state = x2apic_state;
+ u32 x2apic_id;
- x2apic_mode = 0;
- x2apic_state = X2APIC_DISABLED;
-
- if (state != X2APIC_ON)
+ if (x2apic_state < X2APIC_ON)
return;
x2apic_id = read_apic_id();
@@ -1793,6 +1790,10 @@ static __init void x2apic_disable(void)
}
__x2apic_disable();
+
+ x2apic_mode = 0;
+ x2apic_state = X2APIC_DISABLED;
+
/*
* Don't reread the APIC ID as it was already done from
 * check_x2apic() and the APIC driver still is an x2APIC variant,
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index be5889bded49..015971adadfc 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -462,7 +462,7 @@ static void bsp_init_amd(struct cpuinfo_x86 *c)
switch (c->x86_model) {
case 0x00 ... 0x2f:
case 0x40 ... 0x4f:
- case 0x70 ... 0x7f:
+ case 0x60 ... 0x7f:
setup_force_cpu_cap(X86_FEATURE_ZEN5);
break;
default:
@@ -1190,22 +1190,6 @@ unsigned long amd_get_dr_addr_mask(unsigned int dr)
}
EXPORT_SYMBOL_GPL(amd_get_dr_addr_mask);
-u32 amd_get_highest_perf(void)
-{
- struct cpuinfo_x86 *c = &boot_cpu_data;
-
- if (c->x86 == 0x17 && ((c->x86_model >= 0x30 && c->x86_model < 0x40) ||
- (c->x86_model >= 0x70 && c->x86_model < 0x80)))
- return 166;
-
- if (c->x86 == 0x19 && ((c->x86_model >= 0x20 && c->x86_model < 0x30) ||
- (c->x86_model >= 0x40 && c->x86_model < 0x70)))
- return 166;
-
- return 255;
-}
-EXPORT_SYMBOL_GPL(amd_get_highest_perf);
-
static void zenbleed_check_cpu(void *unused)
{
struct cpuinfo_x86 *c = &cpu_data(smp_processor_id());
diff --git a/arch/x86/kernel/cpu/aperfmperf.c b/arch/x86/kernel/cpu/aperfmperf.c
index b3fa61d45352..f642de2ebdac 100644
--- a/arch/x86/kernel/cpu/aperfmperf.c
+++ b/arch/x86/kernel/cpu/aperfmperf.c
@@ -306,7 +306,7 @@ static void freq_invariance_enable(void)
WARN_ON_ONCE(1);
return;
}
- static_branch_enable(&arch_scale_freq_key);
+ static_branch_enable_cpuslocked(&arch_scale_freq_key);
register_freq_invariance_syscore_ops();
pr_info("Estimated ratio of average max frequency by base frequency (times 1024): %llu\n", arch_max_freq_ratio);
}
@@ -323,8 +323,10 @@ static void __init bp_init_freq_invariance(void)
if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
return;
- if (intel_set_max_freq_ratio())
+ if (intel_set_max_freq_ratio()) {
+ guard(cpus_read_lock)();
freq_invariance_enable();
+ }
}
static void disable_freq_invariance_workfn(struct work_struct *work)
@@ -347,9 +349,89 @@ static DECLARE_WORK(disable_freq_invariance_work,
DEFINE_PER_CPU(unsigned long, arch_freq_scale) = SCHED_CAPACITY_SCALE;
EXPORT_PER_CPU_SYMBOL_GPL(arch_freq_scale);
+static DEFINE_STATIC_KEY_FALSE(arch_hybrid_cap_scale_key);
+
+struct arch_hybrid_cpu_scale {
+ unsigned long capacity;
+ unsigned long freq_ratio;
+};
+
+static struct arch_hybrid_cpu_scale __percpu *arch_cpu_scale;
+
+/**
+ * arch_enable_hybrid_capacity_scale() - Enable hybrid CPU capacity scaling
+ *
+ * Allocate memory for per-CPU data used by hybrid CPU capacity scaling,
+ * initialize it and set the static key controlling its code paths.
+ *
+ * Must be called before arch_set_cpu_capacity().
+ */
+bool arch_enable_hybrid_capacity_scale(void)
+{
+ int cpu;
+
+ if (static_branch_unlikely(&arch_hybrid_cap_scale_key)) {
+ WARN_ONCE(1, "Hybrid CPU capacity scaling already enabled");
+ return true;
+ }
+
+ arch_cpu_scale = alloc_percpu(struct arch_hybrid_cpu_scale);
+ if (!arch_cpu_scale)
+ return false;
+
+ for_each_possible_cpu(cpu) {
+ per_cpu_ptr(arch_cpu_scale, cpu)->capacity = SCHED_CAPACITY_SCALE;
+ per_cpu_ptr(arch_cpu_scale, cpu)->freq_ratio = arch_max_freq_ratio;
+ }
+
+ static_branch_enable(&arch_hybrid_cap_scale_key);
+
+ pr_info("Hybrid CPU capacity scaling enabled\n");
+
+ return true;
+}
+
+/**
+ * arch_set_cpu_capacity() - Set scale-invariance parameters for a CPU
+ * @cpu: Target CPU.
+ * @cap: Capacity of @cpu at its maximum frequency, relative to @max_cap.
+ * @max_cap: System-wide maximum CPU capacity.
+ * @cap_freq: Frequency of @cpu corresponding to @cap.
+ * @base_freq: Frequency of @cpu at which MPERF counts.
+ *
+ * The units in which @cap and @max_cap are expressed do not matter, so long
+ * as they are consistent, because the former is effectively divided by the
+ * latter. Analogously for @cap_freq and @base_freq.
+ *
+ * After calling this function for all CPUs, call arch_rebuild_sched_domains()
+ * to let the scheduler know that capacity-aware scheduling can be used going
+ * forward.
+ */
+void arch_set_cpu_capacity(int cpu, unsigned long cap, unsigned long max_cap,
+ unsigned long cap_freq, unsigned long base_freq)
+{
+ if (static_branch_likely(&arch_hybrid_cap_scale_key)) {
+ WRITE_ONCE(per_cpu_ptr(arch_cpu_scale, cpu)->capacity,
+ div_u64(cap << SCHED_CAPACITY_SHIFT, max_cap));
+ WRITE_ONCE(per_cpu_ptr(arch_cpu_scale, cpu)->freq_ratio,
+ div_u64(cap_freq << SCHED_CAPACITY_SHIFT, base_freq));
+ } else {
+ WARN_ONCE(1, "Hybrid CPU capacity scaling not enabled");
+ }
+}
+
+unsigned long arch_scale_cpu_capacity(int cpu)
+{
+ if (static_branch_unlikely(&arch_hybrid_cap_scale_key))
+ return READ_ONCE(per_cpu_ptr(arch_cpu_scale, cpu)->capacity);
+
+ return SCHED_CAPACITY_SCALE;
+}
+EXPORT_SYMBOL_GPL(arch_scale_cpu_capacity);
+
static void scale_freq_tick(u64 acnt, u64 mcnt)
{
- u64 freq_scale;
+ u64 freq_scale, freq_ratio;
if (!arch_scale_freq_invariant())
return;
@@ -357,7 +439,12 @@ static void scale_freq_tick(u64 acnt, u64 mcnt)
if (check_shl_overflow(acnt, 2*SCHED_CAPACITY_SHIFT, &acnt))
goto error;
- if (check_mul_overflow(mcnt, arch_max_freq_ratio, &mcnt) || !mcnt)
+ if (static_branch_unlikely(&arch_hybrid_cap_scale_key))
+ freq_ratio = READ_ONCE(this_cpu_ptr(arch_cpu_scale)->freq_ratio);
+ else
+ freq_ratio = arch_max_freq_ratio;
+
+ if (check_mul_overflow(mcnt, freq_ratio, &mcnt) || !mcnt)
goto error;
freq_scale = div64_u64(acnt, mcnt);
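The kernel-doc for arch_set_cpu_capacity() stresses that only the two ratios matter, not the units. A hedged standalone sketch of the stored values (SCHED_CAPACITY_SHIFT hard-coded to its usual value of 10; the struct and function names are illustrative):

#include <stdint.h>

#define EX_CAPACITY_SHIFT 10

struct ex_hybrid_scale {
	uint64_t capacity;	/* cap / max_cap, as <<10 fixed point */
	uint64_t freq_ratio;	/* cap_freq / base_freq, as <<10 fixed point */
};

static struct ex_hybrid_scale ex_set_cpu_capacity(uint64_t cap, uint64_t max_cap,
						  uint64_t cap_freq, uint64_t base_freq)
{
	struct ex_hybrid_scale s = {
		.capacity   = (cap << EX_CAPACITY_SHIFT) / max_cap,
		.freq_ratio = (cap_freq << EX_CAPACITY_SHIFT) / base_freq,
	};

	return s;
}

/* An E-core rated at 60% of the largest P-core: cap=60, max_cap=100
 * yields capacity == 614 (~0.6 * 1024), whatever units were used. */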
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index 45675da354f3..d1915427b4ff 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -233,7 +233,8 @@ static void x86_amd_ssb_disable(void)
#define pr_fmt(fmt) "MDS: " fmt
/* Default mitigation for MDS-affected CPUs */
-static enum mds_mitigations mds_mitigation __ro_after_init = MDS_MITIGATION_FULL;
+static enum mds_mitigations mds_mitigation __ro_after_init =
+ IS_ENABLED(CONFIG_MITIGATION_MDS) ? MDS_MITIGATION_FULL : MDS_MITIGATION_OFF;
static bool mds_nosmt __ro_after_init = false;
static const char * const mds_strings[] = {
@@ -293,7 +294,8 @@ enum taa_mitigations {
};
/* Default mitigation for TAA-affected CPUs */
-static enum taa_mitigations taa_mitigation __ro_after_init = TAA_MITIGATION_VERW;
+static enum taa_mitigations taa_mitigation __ro_after_init =
+ IS_ENABLED(CONFIG_MITIGATION_TAA) ? TAA_MITIGATION_VERW : TAA_MITIGATION_OFF;
static bool taa_nosmt __ro_after_init;
static const char * const taa_strings[] = {
@@ -391,7 +393,8 @@ enum mmio_mitigations {
};
/* Default mitigation for Processor MMIO Stale Data vulnerabilities */
-static enum mmio_mitigations mmio_mitigation __ro_after_init = MMIO_MITIGATION_VERW;
+static enum mmio_mitigations mmio_mitigation __ro_after_init =
+ IS_ENABLED(CONFIG_MITIGATION_MMIO_STALE_DATA) ? MMIO_MITIGATION_VERW : MMIO_MITIGATION_OFF;
static bool mmio_nosmt __ro_after_init = false;
static const char * const mmio_strings[] = {
@@ -605,7 +608,8 @@ enum srbds_mitigations {
SRBDS_MITIGATION_HYPERVISOR,
};
-static enum srbds_mitigations srbds_mitigation __ro_after_init = SRBDS_MITIGATION_FULL;
+static enum srbds_mitigations srbds_mitigation __ro_after_init =
+ IS_ENABLED(CONFIG_MITIGATION_SRBDS) ? SRBDS_MITIGATION_FULL : SRBDS_MITIGATION_OFF;
static const char * const srbds_strings[] = {
[SRBDS_MITIGATION_OFF] = "Vulnerable",
@@ -731,11 +735,8 @@ enum gds_mitigations {
GDS_MITIGATION_HYPERVISOR,
};
-#if IS_ENABLED(CONFIG_MITIGATION_GDS_FORCE)
-static enum gds_mitigations gds_mitigation __ro_after_init = GDS_MITIGATION_FORCE;
-#else
-static enum gds_mitigations gds_mitigation __ro_after_init = GDS_MITIGATION_FULL;
-#endif
+static enum gds_mitigations gds_mitigation __ro_after_init =
+ IS_ENABLED(CONFIG_MITIGATION_GDS) ? GDS_MITIGATION_FULL : GDS_MITIGATION_OFF;
static const char * const gds_strings[] = {
[GDS_MITIGATION_OFF] = "Vulnerable",
@@ -871,7 +872,8 @@ enum spectre_v1_mitigation {
};
static enum spectre_v1_mitigation spectre_v1_mitigation __ro_after_init =
- SPECTRE_V1_MITIGATION_AUTO;
+ IS_ENABLED(CONFIG_MITIGATION_SPECTRE_V1) ?
+ SPECTRE_V1_MITIGATION_AUTO : SPECTRE_V1_MITIGATION_NONE;
static const char * const spectre_v1_strings[] = {
[SPECTRE_V1_MITIGATION_NONE] = "Vulnerable: __user pointer sanitization and usercopy barriers only; no swapgs barriers",
@@ -986,7 +988,7 @@ static const char * const retbleed_strings[] = {
static enum retbleed_mitigation retbleed_mitigation __ro_after_init =
RETBLEED_MITIGATION_NONE;
static enum retbleed_mitigation_cmd retbleed_cmd __ro_after_init =
- RETBLEED_CMD_AUTO;
+ IS_ENABLED(CONFIG_MITIGATION_RETBLEED) ? RETBLEED_CMD_AUTO : RETBLEED_CMD_OFF;
static int __ro_after_init retbleed_nosmt = false;
@@ -1447,17 +1449,18 @@ static void __init spec_v2_print_cond(const char *reason, bool secure)
static enum spectre_v2_mitigation_cmd __init spectre_v2_parse_cmdline(void)
{
- enum spectre_v2_mitigation_cmd cmd = SPECTRE_V2_CMD_AUTO;
+ enum spectre_v2_mitigation_cmd cmd;
char arg[20];
int ret, i;
+ cmd = IS_ENABLED(CONFIG_MITIGATION_SPECTRE_V2) ? SPECTRE_V2_CMD_AUTO : SPECTRE_V2_CMD_NONE;
if (cmdline_find_option_bool(boot_command_line, "nospectre_v2") ||
cpu_mitigations_off())
return SPECTRE_V2_CMD_NONE;
ret = cmdline_find_option(boot_command_line, "spectre_v2", arg, sizeof(arg));
if (ret < 0)
- return SPECTRE_V2_CMD_AUTO;
+ return cmd;
for (i = 0; i < ARRAY_SIZE(mitigation_options); i++) {
if (!match_option(arg, ret, mitigation_options[i].option))
@@ -1467,8 +1470,8 @@ static enum spectre_v2_mitigation_cmd __init spectre_v2_parse_cmdline(void)
}
if (i >= ARRAY_SIZE(mitigation_options)) {
- pr_err("unknown option (%s). Switching to AUTO select\n", arg);
- return SPECTRE_V2_CMD_AUTO;
+ pr_err("unknown option (%s). Switching to default mode\n", arg);
+ return cmd;
}
if ((cmd == SPECTRE_V2_CMD_RETPOLINE ||
@@ -2021,10 +2024,12 @@ static const struct {
static enum ssb_mitigation_cmd __init ssb_parse_cmdline(void)
{
- enum ssb_mitigation_cmd cmd = SPEC_STORE_BYPASS_CMD_AUTO;
+ enum ssb_mitigation_cmd cmd;
char arg[20];
int ret, i;
+ cmd = IS_ENABLED(CONFIG_MITIGATION_SSB) ?
+ SPEC_STORE_BYPASS_CMD_AUTO : SPEC_STORE_BYPASS_CMD_NONE;
if (cmdline_find_option_bool(boot_command_line, "nospec_store_bypass_disable") ||
cpu_mitigations_off()) {
return SPEC_STORE_BYPASS_CMD_NONE;
@@ -2032,7 +2037,7 @@ static enum ssb_mitigation_cmd __init ssb_parse_cmdline(void)
ret = cmdline_find_option(boot_command_line, "spec_store_bypass_disable",
arg, sizeof(arg));
if (ret < 0)
- return SPEC_STORE_BYPASS_CMD_AUTO;
+ return cmd;
for (i = 0; i < ARRAY_SIZE(ssb_mitigation_options); i++) {
if (!match_option(arg, ret, ssb_mitigation_options[i].option))
@@ -2043,8 +2048,8 @@ static enum ssb_mitigation_cmd __init ssb_parse_cmdline(void)
}
if (i >= ARRAY_SIZE(ssb_mitigation_options)) {
- pr_err("unknown option (%s). Switching to AUTO select\n", arg);
- return SPEC_STORE_BYPASS_CMD_AUTO;
+ pr_err("unknown option (%s). Switching to default mode\n", arg);
+ return cmd;
}
}
@@ -2371,7 +2376,8 @@ EXPORT_SYMBOL_GPL(itlb_multihit_kvm_mitigation);
#define pr_fmt(fmt) "L1TF: " fmt
/* Default mitigation for L1TF-affected CPUs */
-enum l1tf_mitigations l1tf_mitigation __ro_after_init = L1TF_MITIGATION_FLUSH;
+enum l1tf_mitigations l1tf_mitigation __ro_after_init =
+ IS_ENABLED(CONFIG_MITIGATION_L1TF) ? L1TF_MITIGATION_FLUSH : L1TF_MITIGATION_OFF;
#if IS_ENABLED(CONFIG_KVM_INTEL)
EXPORT_SYMBOL_GPL(l1tf_mitigation);
#endif
@@ -2551,10 +2557,9 @@ static void __init srso_select_mitigation(void)
{
bool has_microcode = boot_cpu_has(X86_FEATURE_IBPB_BRTYPE);
- if (cpu_mitigations_off())
- return;
-
- if (!boot_cpu_has_bug(X86_BUG_SRSO)) {
+ if (!boot_cpu_has_bug(X86_BUG_SRSO) ||
+ cpu_mitigations_off() ||
+ srso_cmd == SRSO_CMD_OFF) {
if (boot_cpu_has(X86_FEATURE_SBPB))
x86_pred_cmd = PRED_CMD_SBPB;
return;
@@ -2585,11 +2590,6 @@ static void __init srso_select_mitigation(void)
}
switch (srso_cmd) {
- case SRSO_CMD_OFF:
- if (boot_cpu_has(X86_FEATURE_SBPB))
- x86_pred_cmd = PRED_CMD_SBPB;
- return;
-
case SRSO_CMD_MICROCODE:
if (has_microcode) {
srso_mitigation = SRSO_MITIGATION_MICROCODE;
@@ -2643,6 +2643,8 @@ static void __init srso_select_mitigation(void)
pr_err("WARNING: kernel not compiled with MITIGATION_SRSO.\n");
}
break;
+ default:
+ break;
}
out:
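
The bugs.c hunks above replace #ifdef-selected defaults with IS_ENABLED()-based initializers, so each mitigation's boot default follows its Kconfig switch without preprocessor branches. Below is a minimal standalone sketch of that pattern; the CONFIG_EXAMPLE_MITIGATION macro, the enum, and the simplified IS_ENABLED() stand-in are all hypothetical, shown only to illustrate how the ternary folds to a compile-time constant (the real IS_ENABLED() in <linux/kconfig.h> also copes with undefined options).

/* Toy standalone example; in the kernel IS_ENABLED() comes from <linux/kconfig.h>. */
#include <stdio.h>

#define CONFIG_EXAMPLE_MITIGATION 1		/* pretend Kconfig turned it on */
#define IS_ENABLED(option) (option)		/* simplified stand-in */

enum example_mitigations {
	EXAMPLE_MITIGATION_OFF,
	EXAMPLE_MITIGATION_FULL,
};

/* Default chosen at compile time, mirroring the gds/l1tf initializers above. */
static enum example_mitigations example_mitigation =
	IS_ENABLED(CONFIG_EXAMPLE_MITIGATION) ? EXAMPLE_MITIGATION_FULL
					      : EXAMPLE_MITIGATION_OFF;

int main(void)
{
	printf("default: %s\n",
	       example_mitigation == EXAMPLE_MITIGATION_FULL ? "full" : "off");
	return 0;
}
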
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index d4e539d4e158..be307c9ef263 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -1165,8 +1165,8 @@ static const __initconst struct x86_cpu_id cpu_vuln_whitelist[] = {
VULNWL_INTEL(INTEL_CORE_YONAH, NO_SSB),
- VULNWL_INTEL(INTEL_ATOM_AIRMONT_MID, NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT),
- VULNWL_INTEL(INTEL_ATOM_AIRMONT_NP, NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT),
+ VULNWL_INTEL(INTEL_ATOM_AIRMONT_MID, NO_SSB | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT | MSBDS_ONLY),
+ VULNWL_INTEL(INTEL_ATOM_AIRMONT_NP, NO_SSB | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT),
VULNWL_INTEL(INTEL_ATOM_GOLDMONT, NO_MDS | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO),
VULNWL_INTEL(INTEL_ATOM_GOLDMONT_D, NO_MDS | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO),
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index 08b95a35b5cb..e7656cbef68d 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -311,16 +311,18 @@ static void early_init_intel(struct cpuinfo_x86 *c)
}
/*
- * There is a known erratum on Pentium III and Core Solo
- * and Core Duo CPUs.
- * " Page with PAT set to WC while associated MTRR is UC
- * may consolidate to UC "
- * Because of this erratum, it is better to stick with
- * setting WC in MTRR rather than using PAT on these CPUs.
+ * PAT is broken on early family 6 CPUs, the last of which
+ * is "Yonah" where the erratum is named "AN7":
*
- * Enable PAT WC only on P4, Core 2 or later CPUs.
+ * Page with PAT (Page Attribute Table) Set to USWC
+ * (Uncacheable Speculative Write Combine) While
+ * Associated MTRR (Memory Type Range Register) Is UC
+ * (Uncacheable) May Consolidate to UC
+ *
+ * Disable PAT and fall back to MTRR on these CPUs.
*/
- if (c->x86 == 6 && c->x86_model < 15)
+ if (c->x86_vfm >= INTEL_PENTIUM_PRO &&
+ c->x86_vfm <= INTEL_CORE_YONAH)
clear_cpu_cap(c, X86_FEATURE_PAT);
/*
diff --git a/arch/x86/kernel/cpu/mce/amd.c b/arch/x86/kernel/cpu/mce/amd.c
index 9a0133ef7e20..14bf8c232e45 100644
--- a/arch/x86/kernel/cpu/mce/amd.c
+++ b/arch/x86/kernel/cpu/mce/amd.c
@@ -780,7 +780,7 @@ static void __log_error(unsigned int bank, u64 status, u64 addr, u64 misc)
{
struct mce m;
- mce_setup(&m);
+ mce_prep_record(&m);
m.status = status;
m.misc = misc;
diff --git a/arch/x86/kernel/cpu/mce/apei.c b/arch/x86/kernel/cpu/mce/apei.c
index 7f7309ff67d0..3885fe05f01e 100644
--- a/arch/x86/kernel/cpu/mce/apei.c
+++ b/arch/x86/kernel/cpu/mce/apei.c
@@ -44,7 +44,7 @@ void apei_mce_report_mem_error(int severity, struct cper_sec_mem_err *mem_err)
else
lsb = PAGE_SHIFT;
- mce_setup(&m);
+ mce_prep_record(&m);
m.bank = -1;
/* Fake a memory read error with unknown channel */
m.status = MCI_STATUS_VAL | MCI_STATUS_EN | MCI_STATUS_ADDRV | MCI_STATUS_MISCV | 0x9f;
@@ -66,6 +66,7 @@ EXPORT_SYMBOL_GPL(apei_mce_report_mem_error);
int apei_smca_report_x86_error(struct cper_ia_proc_ctx *ctx_info, u64 lapic_id)
{
const u64 *i_mce = ((const u64 *) (ctx_info + 1));
+ bool apicid_found = false;
unsigned int cpu;
struct mce m;
@@ -97,20 +98,19 @@ int apei_smca_report_x86_error(struct cper_ia_proc_ctx *ctx_info, u64 lapic_id)
if (ctx_info->reg_arr_size < 48)
return -EINVAL;
- mce_setup(&m);
-
- m.extcpu = -1;
- m.socketid = -1;
-
for_each_possible_cpu(cpu) {
if (cpu_data(cpu).topo.initial_apicid == lapic_id) {
- m.extcpu = cpu;
- m.socketid = cpu_data(m.extcpu).topo.pkg_id;
+ apicid_found = true;
break;
}
}
- m.apicid = lapic_id;
+ if (!apicid_found)
+ return -EINVAL;
+
+ mce_prep_record_common(&m);
+ mce_prep_record_per_cpu(cpu, &m);
+
m.bank = (ctx_info->msr_addr >> 4) & 0xFF;
m.status = *i_mce;
m.addr = *(i_mce + 1);
diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c
index b85ec7a4ec9e..2a938f429c4d 100644
--- a/arch/x86/kernel/cpu/mce/core.c
+++ b/arch/x86/kernel/cpu/mce/core.c
@@ -117,20 +117,32 @@ static struct irq_work mce_irq_work;
*/
BLOCKING_NOTIFIER_HEAD(x86_mce_decoder_chain);
-/* Do initial initialization of a struct mce */
-void mce_setup(struct mce *m)
+void mce_prep_record_common(struct mce *m)
{
memset(m, 0, sizeof(struct mce));
- m->cpu = m->extcpu = smp_processor_id();
+
+ m->cpuid = cpuid_eax(1);
+ m->cpuvendor = boot_cpu_data.x86_vendor;
+ m->mcgcap = __rdmsr(MSR_IA32_MCG_CAP);
/* need the internal __ version to avoid deadlocks */
- m->time = __ktime_get_real_seconds();
- m->cpuvendor = boot_cpu_data.x86_vendor;
- m->cpuid = cpuid_eax(1);
- m->socketid = cpu_data(m->extcpu).topo.pkg_id;
- m->apicid = cpu_data(m->extcpu).topo.initial_apicid;
- m->mcgcap = __rdmsr(MSR_IA32_MCG_CAP);
- m->ppin = cpu_data(m->extcpu).ppin;
- m->microcode = boot_cpu_data.microcode;
+ m->time = __ktime_get_real_seconds();
+}
+
+void mce_prep_record_per_cpu(unsigned int cpu, struct mce *m)
+{
+ m->cpu = cpu;
+ m->extcpu = cpu;
+ m->apicid = cpu_data(cpu).topo.initial_apicid;
+ m->microcode = cpu_data(cpu).microcode;
+ m->ppin = topology_ppin(cpu);
+ m->socketid = topology_physical_package_id(cpu);
+}
+
+/* Do initial initialization of a struct mce */
+void mce_prep_record(struct mce *m)
+{
+ mce_prep_record_common(m);
+ mce_prep_record_per_cpu(smp_processor_id(), m);
}
DEFINE_PER_CPU(struct mce, injectm);
@@ -436,11 +448,11 @@ static noinstr void mce_wrmsrl(u32 msr, u64 v)
static noinstr void mce_gather_info(struct mce *m, struct pt_regs *regs)
{
/*
- * Enable instrumentation around mce_setup() which calls external
+ * Enable instrumentation around mce_prep_record() which calls external
* facilities.
*/
instrumentation_begin();
- mce_setup(m);
+ mce_prep_record(m);
instrumentation_end();
m->mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
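
The split above lets a caller fill the common fields and then the per-CPU fields for an arbitrary CPU, which is what the apei.c hunk earlier relies on when the error record targets a CPU other than the one building it. A standalone sketch of that calling shape follows; the struct, its fields, and the toy topology are illustrative, not the kernel's struct mce.

/* Standalone sketch of the mce_prep_record() split: common vs. per-CPU fields. */
#include <stdio.h>
#include <string.h>
#include <time.h>

struct record {
	unsigned int cpu;	/* per-CPU */
	unsigned int socketid;	/* per-CPU */
	long long time;		/* common */
};

static void prep_record_common(struct record *r)
{
	memset(r, 0, sizeof(*r));
	r->time = (long long)time(NULL);
}

static void prep_record_per_cpu(unsigned int cpu, struct record *r)
{
	r->cpu = cpu;
	r->socketid = cpu / 8;		/* toy topology: 8 CPUs per socket */
}

static void prep_record(struct record *r, unsigned int self)
{
	prep_record_common(r);
	prep_record_per_cpu(self, r);
}

int main(void)
{
	struct record local, remote;

	prep_record(&local, 0);		/* "current CPU" path, like mce_prep_record() */
	prep_record_common(&remote);	/* remote-CPU path, like apei_smca_report_x86_error() */
	prep_record_per_cpu(9, &remote);

	printf("local  cpu=%u socket=%u\n", local.cpu, local.socketid);
	printf("remote cpu=%u socket=%u\n", remote.cpu, remote.socketid);
	return 0;
}
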
diff --git a/arch/x86/kernel/cpu/mce/internal.h b/arch/x86/kernel/cpu/mce/internal.h
index 01f8f03969e6..43c7f3b71df5 100644
--- a/arch/x86/kernel/cpu/mce/internal.h
+++ b/arch/x86/kernel/cpu/mce/internal.h
@@ -261,6 +261,8 @@ enum mca_msr {
/* Decide whether to add MCE record to MCE event pool or filter it out. */
extern bool filter_mce(struct mce *m);
+void mce_prep_record_common(struct mce *m);
+void mce_prep_record_per_cpu(unsigned int cpu, struct mce *m);
#ifdef CONFIG_X86_MCE_AMD
extern bool amd_filter_mce(struct mce *m);
diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c
index c0d56c02b8da..f63b051f25a0 100644
--- a/arch/x86/kernel/cpu/microcode/amd.c
+++ b/arch/x86/kernel/cpu/microcode/amd.c
@@ -89,6 +89,31 @@ static struct equiv_cpu_table {
struct equiv_cpu_entry *entry;
} equiv_table;
+union zen_patch_rev {
+ struct {
+ __u32 rev : 8,
+ stepping : 4,
+ model : 4,
+ __reserved : 4,
+ ext_model : 4,
+ ext_fam : 8;
+ };
+ __u32 ucode_rev;
+};
+
+union cpuid_1_eax {
+ struct {
+ __u32 stepping : 4,
+ model : 4,
+ family : 4,
+ __reserved0 : 4,
+ ext_model : 4,
+ ext_fam : 8,
+ __reserved1 : 4;
+ };
+ __u32 full;
+};
+
/*
* This points to the current valid container of microcode patches which we will
* save from the initrd/builtin before jettisoning its contents. @mc is the
@@ -96,7 +121,6 @@ static struct equiv_cpu_table {
*/
struct cont_desc {
struct microcode_amd *mc;
- u32 cpuid_1_eax;
u32 psize;
u8 *data;
size_t size;
@@ -109,10 +133,42 @@ struct cont_desc {
static const char
ucode_path[] __maybe_unused = "kernel/x86/microcode/AuthenticAMD.bin";
+/*
+ * This is CPUID(1).EAX on the BSP. It is used in two ways:
+ *
+ * 1. To ignore the equivalence table on Zen1 and newer.
+ *
+ * 2. To match which patches to load because the patch revision ID
+ *    already contains the f/m/s for which the microcode is
+ *    destined.
+ */
+static u32 bsp_cpuid_1_eax __ro_after_init;
+
+static union cpuid_1_eax ucode_rev_to_cpuid(unsigned int val)
+{
+ union zen_patch_rev p;
+ union cpuid_1_eax c;
+
+ p.ucode_rev = val;
+ c.full = 0;
+
+ c.stepping = p.stepping;
+ c.model = p.model;
+ c.ext_model = p.ext_model;
+ c.family = 0xf;
+ c.ext_fam = p.ext_fam;
+
+ return c;
+}
+
static u16 find_equiv_id(struct equiv_cpu_table *et, u32 sig)
{
unsigned int i;
+ /* Zen and newer do not need an equivalence table. */
+ if (x86_family(bsp_cpuid_1_eax) >= 0x17)
+ return 0;
+
if (!et || !et->num_entries)
return 0;
@@ -159,6 +215,10 @@ static bool verify_equivalence_table(const u8 *buf, size_t buf_size)
if (!verify_container(buf, buf_size))
return false;
+ /* Zen and newer do not need an equivalence table. */
+ if (x86_family(bsp_cpuid_1_eax) >= 0x17)
+ return true;
+
cont_type = hdr[1];
if (cont_type != UCODE_EQUIV_CPU_TABLE_TYPE) {
pr_debug("Wrong microcode container equivalence table type: %u.\n",
@@ -222,8 +282,9 @@ __verify_patch_section(const u8 *buf, size_t buf_size, u32 *sh_psize)
* exceed the per-family maximum). @sh_psize is the size read from the section
* header.
*/
-static unsigned int __verify_patch_size(u8 family, u32 sh_psize, size_t buf_size)
+static unsigned int __verify_patch_size(u32 sh_psize, size_t buf_size)
{
+ u8 family = x86_family(bsp_cpuid_1_eax);
u32 max_size;
if (family >= 0x15)
@@ -258,9 +319,9 @@ static unsigned int __verify_patch_size(u8 family, u32 sh_psize, size_t buf_size
* positive: patch is not for this family, skip it
* 0: success
*/
-static int
-verify_patch(u8 family, const u8 *buf, size_t buf_size, u32 *patch_size)
+static int verify_patch(const u8 *buf, size_t buf_size, u32 *patch_size)
{
+ u8 family = x86_family(bsp_cpuid_1_eax);
struct microcode_header_amd *mc_hdr;
unsigned int ret;
u32 sh_psize;
@@ -286,7 +347,7 @@ verify_patch(u8 family, const u8 *buf, size_t buf_size, u32 *patch_size)
return -1;
}
- ret = __verify_patch_size(family, sh_psize, buf_size);
+ ret = __verify_patch_size(sh_psize, buf_size);
if (!ret) {
pr_debug("Per-family patch size mismatch.\n");
return -1;
@@ -308,6 +369,15 @@ verify_patch(u8 family, const u8 *buf, size_t buf_size, u32 *patch_size)
return 0;
}
+static bool mc_patch_matches(struct microcode_amd *mc, u16 eq_id)
+{
+ /* Zen and newer do not need an equivalence table. */
+ if (x86_family(bsp_cpuid_1_eax) >= 0x17)
+ return ucode_rev_to_cpuid(mc->hdr.patch_id).full == bsp_cpuid_1_eax;
+ else
+ return eq_id == mc->hdr.processor_rev_id;
+}
+
/*
* This scans the ucode blob for the proper container as we can have multiple
* containers glued together. Returns the equivalence ID from the equivalence
@@ -336,7 +406,7 @@ static size_t parse_container(u8 *ucode, size_t size, struct cont_desc *desc)
* doesn't contain a patch for the CPU, scan through the whole container
* so that it can be skipped in case there are other containers appended.
*/
- eq_id = find_equiv_id(&table, desc->cpuid_1_eax);
+ eq_id = find_equiv_id(&table, bsp_cpuid_1_eax);
buf += hdr[2] + CONTAINER_HDR_SZ;
size -= hdr[2] + CONTAINER_HDR_SZ;
@@ -350,7 +420,7 @@ static size_t parse_container(u8 *ucode, size_t size, struct cont_desc *desc)
u32 patch_size;
int ret;
- ret = verify_patch(x86_family(desc->cpuid_1_eax), buf, size, &patch_size);
+ ret = verify_patch(buf, size, &patch_size);
if (ret < 0) {
/*
* Patch verification failed, skip to the next container, if
@@ -363,7 +433,7 @@ static size_t parse_container(u8 *ucode, size_t size, struct cont_desc *desc)
}
mc = (struct microcode_amd *)(buf + SECTION_HDR_SIZE);
- if (eq_id == mc->hdr.processor_rev_id) {
+ if (mc_patch_matches(mc, eq_id)) {
desc->psize = patch_size;
desc->mc = mc;
}
@@ -421,6 +491,7 @@ static int __apply_microcode_amd(struct microcode_amd *mc)
/* verify patch application was successful */
native_rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy);
+
if (rev != mc->hdr.patch_id)
return -1;
@@ -438,14 +509,12 @@ static int __apply_microcode_amd(struct microcode_amd *mc)
*
* Returns true if container found (sets @desc), false otherwise.
*/
-static bool early_apply_microcode(u32 cpuid_1_eax, u32 old_rev, void *ucode, size_t size)
+static bool early_apply_microcode(u32 old_rev, void *ucode, size_t size)
{
struct cont_desc desc = { 0 };
struct microcode_amd *mc;
bool ret = false;
- desc.cpuid_1_eax = cpuid_1_eax;
-
scan_containers(ucode, size, &desc);
mc = desc.mc;
@@ -463,9 +532,10 @@ static bool early_apply_microcode(u32 cpuid_1_eax, u32 old_rev, void *ucode, siz
return !__apply_microcode_amd(mc);
}
-static bool get_builtin_microcode(struct cpio_data *cp, u8 family)
+static bool get_builtin_microcode(struct cpio_data *cp)
{
char fw_name[36] = "amd-ucode/microcode_amd.bin";
+ u8 family = x86_family(bsp_cpuid_1_eax);
struct firmware fw;
if (IS_ENABLED(CONFIG_X86_32))
@@ -484,11 +554,11 @@ static bool get_builtin_microcode(struct cpio_data *cp, u8 family)
return false;
}
-static void __init find_blobs_in_containers(unsigned int cpuid_1_eax, struct cpio_data *ret)
+static void __init find_blobs_in_containers(struct cpio_data *ret)
{
struct cpio_data cp;
- if (!get_builtin_microcode(&cp, x86_family(cpuid_1_eax)))
+ if (!get_builtin_microcode(&cp))
cp = find_microcode_in_initrd(ucode_path);
*ret = cp;
@@ -499,16 +569,18 @@ void __init load_ucode_amd_bsp(struct early_load_data *ed, unsigned int cpuid_1_
struct cpio_data cp = { };
u32 dummy;
+ bsp_cpuid_1_eax = cpuid_1_eax;
+
native_rdmsr(MSR_AMD64_PATCH_LEVEL, ed->old_rev, dummy);
/* Needed in load_microcode_amd() */
ucode_cpu_info[0].cpu_sig.sig = cpuid_1_eax;
- find_blobs_in_containers(cpuid_1_eax, &cp);
+ find_blobs_in_containers(&cp);
if (!(cp.data && cp.size))
return;
- if (early_apply_microcode(cpuid_1_eax, ed->old_rev, cp.data, cp.size))
+ if (early_apply_microcode(ed->old_rev, cp.data, cp.size))
native_rdmsr(MSR_AMD64_PATCH_LEVEL, ed->new_rev, dummy);
}
@@ -525,12 +597,10 @@ static int __init save_microcode_in_initrd(void)
if (dis_ucode_ldr || c->x86_vendor != X86_VENDOR_AMD || c->x86 < 0x10)
return 0;
- find_blobs_in_containers(cpuid_1_eax, &cp);
+ find_blobs_in_containers(&cp);
if (!(cp.data && cp.size))
return -EINVAL;
- desc.cpuid_1_eax = cpuid_1_eax;
-
scan_containers(cp.data, cp.size, &desc);
if (!desc.mc)
return -EINVAL;
@@ -543,26 +613,65 @@ static int __init save_microcode_in_initrd(void)
}
early_initcall(save_microcode_in_initrd);
+static inline bool patch_cpus_equivalent(struct ucode_patch *p, struct ucode_patch *n)
+{
+ /* Zen and newer hardcode the f/m/s in the patch ID */
+ if (x86_family(bsp_cpuid_1_eax) >= 0x17) {
+ union cpuid_1_eax p_cid = ucode_rev_to_cpuid(p->patch_id);
+ union cpuid_1_eax n_cid = ucode_rev_to_cpuid(n->patch_id);
+
+ /* Zap stepping */
+ p_cid.stepping = 0;
+ n_cid.stepping = 0;
+
+ return p_cid.full == n_cid.full;
+ } else {
+ return p->equiv_cpu == n->equiv_cpu;
+ }
+}
+
/*
* a small, trivial cache of per-family ucode patches
*/
-static struct ucode_patch *cache_find_patch(u16 equiv_cpu)
+static struct ucode_patch *cache_find_patch(struct ucode_cpu_info *uci, u16 equiv_cpu)
{
struct ucode_patch *p;
+ struct ucode_patch n;
+
+ n.equiv_cpu = equiv_cpu;
+ n.patch_id = uci->cpu_sig.rev;
+
+ WARN_ON_ONCE(!n.patch_id);
list_for_each_entry(p, &microcode_cache, plist)
- if (p->equiv_cpu == equiv_cpu)
+ if (patch_cpus_equivalent(p, &n))
return p;
+
return NULL;
}
+static inline bool patch_newer(struct ucode_patch *p, struct ucode_patch *n)
+{
+ /* Zen and newer hardcode the f/m/s in the patch ID */
+ if (x86_family(bsp_cpuid_1_eax) >= 0x17) {
+ union zen_patch_rev zp, zn;
+
+ zp.ucode_rev = p->patch_id;
+ zn.ucode_rev = n->patch_id;
+
+ return zn.rev > zp.rev;
+ } else {
+ return n->patch_id > p->patch_id;
+ }
+}
+
static void update_cache(struct ucode_patch *new_patch)
{
struct ucode_patch *p;
list_for_each_entry(p, &microcode_cache, plist) {
- if (p->equiv_cpu == new_patch->equiv_cpu) {
- if (p->patch_id >= new_patch->patch_id) {
+ if (patch_cpus_equivalent(p, new_patch)) {
+ if (!patch_newer(p, new_patch)) {
/* we already have the latest patch */
kfree(new_patch->data);
kfree(new_patch);
@@ -593,13 +702,22 @@ static void free_cache(void)
static struct ucode_patch *find_patch(unsigned int cpu)
{
struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
- u16 equiv_id;
+ u32 rev, dummy __always_unused;
+ u16 equiv_id = 0;
- equiv_id = find_equiv_id(&equiv_table, uci->cpu_sig.sig);
- if (!equiv_id)
- return NULL;
+ /* fetch rev if not populated yet: */
+ if (!uci->cpu_sig.rev) {
+ rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy);
+ uci->cpu_sig.rev = rev;
+ }
+
+ if (x86_family(bsp_cpuid_1_eax) < 0x17) {
+ equiv_id = find_equiv_id(&equiv_table, uci->cpu_sig.sig);
+ if (!equiv_id)
+ return NULL;
+ }
- return cache_find_patch(equiv_id);
+ return cache_find_patch(uci, equiv_id);
}
void reload_ucode_amd(unsigned int cpu)
@@ -649,7 +767,7 @@ static enum ucode_state apply_microcode_amd(int cpu)
struct ucode_cpu_info *uci;
struct ucode_patch *p;
enum ucode_state ret;
- u32 rev, dummy __always_unused;
+ u32 rev;
BUG_ON(raw_smp_processor_id() != cpu);
@@ -659,11 +777,11 @@ static enum ucode_state apply_microcode_amd(int cpu)
if (!p)
return UCODE_NFOUND;
+ rev = uci->cpu_sig.rev;
+
mc_amd = p->data;
uci->mc = p->data;
- rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy);
-
/* need to apply patch? */
if (rev > mc_amd->hdr.patch_id) {
ret = UCODE_OK;
@@ -709,6 +827,10 @@ static size_t install_equiv_cpu_table(const u8 *buf, size_t buf_size)
hdr = (const u32 *)buf;
equiv_tbl_len = hdr[2];
+ /* Zen and newer do not need an equivalence table. */
+ if (x86_family(bsp_cpuid_1_eax) >= 0x17)
+ goto out;
+
equiv_table.entry = vmalloc(equiv_tbl_len);
if (!equiv_table.entry) {
pr_err("failed to allocate equivalent CPU table\n");
@@ -718,12 +840,16 @@ static size_t install_equiv_cpu_table(const u8 *buf, size_t buf_size)
memcpy(equiv_table.entry, buf + CONTAINER_HDR_SZ, equiv_tbl_len);
equiv_table.num_entries = equiv_tbl_len / sizeof(struct equiv_cpu_entry);
+out:
/* add header length */
return equiv_tbl_len + CONTAINER_HDR_SZ;
}
static void free_equiv_cpu_table(void)
{
+ if (x86_family(bsp_cpuid_1_eax) >= 0x17)
+ return;
+
vfree(equiv_table.entry);
memset(&equiv_table, 0, sizeof(equiv_table));
}
@@ -749,7 +875,7 @@ static int verify_and_add_patch(u8 family, u8 *fw, unsigned int leftover,
u16 proc_id;
int ret;
- ret = verify_patch(family, fw, leftover, patch_size);
+ ret = verify_patch(fw, leftover, patch_size);
if (ret)
return ret;
@@ -774,7 +900,7 @@ static int verify_and_add_patch(u8 family, u8 *fw, unsigned int leftover,
patch->patch_id = mc_hdr->patch_id;
patch->equiv_cpu = proc_id;
- pr_debug("%s: Added patch_id: 0x%08x, proc_id: 0x%04x\n",
+ pr_debug("%s: Adding patch_id: 0x%08x, proc_id: 0x%04x\n",
__func__, patch->patch_id, proc_id);
/* ... and add to cache. */
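
For Zen and newer parts, the patch revision ID itself encodes family/model/stepping, which is what ucode_rev_to_cpuid() above reconstructs. The sketch below copies the two bitfield unions from the hunk into a userspace program and decodes a made-up patch ID; __u32 is replaced by unsigned int, the example value is hypothetical, and the bitfield order assumes the LSB-first layout GCC uses on x86 (compile with a C11 compiler for the anonymous struct members).

/* Decode a Zen-style patch revision into CPUID(1).EAX fields. Userspace toy. */
#include <stdio.h>

union zen_patch_rev {
	struct {
		unsigned int rev	: 8,
			     stepping	: 4,
			     model	: 4,
			     __reserved	: 4,
			     ext_model	: 4,
			     ext_fam	: 8;
	};
	unsigned int ucode_rev;
};

union cpuid_1_eax {
	struct {
		unsigned int stepping	: 4,
			     model	: 4,
			     family	: 4,
			     __reserved0: 4,
			     ext_model	: 4,
			     ext_fam	: 8,
			     __reserved1: 4;
	};
	unsigned int full;
};

static union cpuid_1_eax ucode_rev_to_cpuid(unsigned int val)
{
	union zen_patch_rev p = { .ucode_rev = val };
	union cpuid_1_eax c = { .full = 0 };

	c.stepping  = p.stepping;
	c.model     = p.model;
	c.ext_model = p.ext_model;
	c.family    = 0xf;		/* base family, extended by ext_fam */
	c.ext_fam   = p.ext_fam;
	return c;
}

int main(void)
{
	union cpuid_1_eax c = ucode_rev_to_cpuid(0x0a201210);	/* hypothetical patch ID */

	printf("CPUID(1).EAX = 0x%08x (fam 0x%x, ext_fam 0x%x, ext_model 0x%x, stepping %u)\n",
	       c.full, c.family, c.ext_fam, c.ext_model, c.stepping);
	return 0;
}
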
diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c
index e0fd57a8ba84..ead967479fa6 100644
--- a/arch/x86/kernel/cpu/mshyperv.c
+++ b/arch/x86/kernel/cpu/mshyperv.c
@@ -199,8 +199,8 @@ static void hv_machine_shutdown(void)
* Call hv_cpu_die() on all the CPUs, otherwise later the hypervisor
* corrupts the old VP Assist Pages and can crash the kexec kernel.
*/
- if (kexec_in_progress && hyperv_init_cpuhp > 0)
- cpuhp_remove_state(hyperv_init_cpuhp);
+ if (kexec_in_progress)
+ cpuhp_remove_state(CPUHP_AP_HYPERV_ONLINE);
/* The function calls stop_other_cpus(). */
native_machine_shutdown();
@@ -424,6 +424,7 @@ static void __init ms_hyperv_init_platform(void)
ms_hyperv.misc_features & HV_FEATURE_FREQUENCY_MSRS_AVAILABLE) {
x86_platform.calibrate_tsc = hv_get_tsc_khz;
x86_platform.calibrate_cpu = hv_get_tsc_khz;
+ setup_force_cpu_cap(X86_FEATURE_TSC_KNOWN_FREQ);
}
if (ms_hyperv.priv_high & HV_ISOLATION) {
@@ -449,9 +450,23 @@ static void __init ms_hyperv_init_platform(void)
ms_hyperv.hints &= ~HV_X64_APIC_ACCESS_RECOMMENDED;
if (!ms_hyperv.paravisor_present) {
- /* To be supported: more work is required. */
+ /*
+ * Mark the Hyper-V TSC page feature as disabled
+ * in a TDX VM without paravisor so that the
+ * Invariant TSC, which is a better clocksource
+ * anyway, is used instead.
+ */
ms_hyperv.features &= ~HV_MSR_REFERENCE_TSC_AVAILABLE;
+ /*
+ * The Invariant TSC is expected to be available
+ * in a TDX VM without paravisor, but if not,
+ * print a warning message. The slower Hyper-V MSR-based
+ * Ref Counter should end up being the clocksource.
+ */
+ if (!(ms_hyperv.features & HV_ACCESS_TSC_INVARIANT))
+ pr_warn("Hyper-V: Invariant TSC is unavailable\n");
+
/* HV_MSR_CRASH_CTL is unsupported. */
ms_hyperv.misc_features &= ~HV_FEATURE_GUEST_CRASH_MSR_AVAILABLE;
diff --git a/arch/x86/kernel/cpu/mtrr/mtrr.c b/arch/x86/kernel/cpu/mtrr/mtrr.c
index 767bf1c71aad..2a2fc14955cd 100644
--- a/arch/x86/kernel/cpu/mtrr/mtrr.c
+++ b/arch/x86/kernel/cpu/mtrr/mtrr.c
@@ -609,7 +609,7 @@ void mtrr_save_state(void)
{
int first_cpu;
- if (!mtrr_enabled())
+ if (!mtrr_enabled() || !mtrr_state.have_fixed)
return;
first_cpu = cpumask_first(cpu_online_mask);
diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c
index 1930fce9dfe9..8591d53c144b 100644
--- a/arch/x86/kernel/cpu/resctrl/core.c
+++ b/arch/x86/kernel/cpu/resctrl/core.c
@@ -119,6 +119,14 @@ struct rdt_hw_resource rdt_resources_all[] = {
},
};
+u32 resctrl_arch_system_num_rmid_idx(void)
+{
+ struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl;
+
+	/* RMIDs are independent numbers on x86, so num_rmid_idx == num_rmid. */
+ return r->num_rmid;
+}
+
/*
* cache_alloc_hsw_probe() - Have to probe for Intel haswell server CPUs
* as they do not have CPUID enumeration support for Cache allocation.
diff --git a/arch/x86/kernel/cpu/sgx/main.c b/arch/x86/kernel/cpu/sgx/main.c
index 27892e57c4ef..f3f1461273ee 100644
--- a/arch/x86/kernel/cpu/sgx/main.c
+++ b/arch/x86/kernel/cpu/sgx/main.c
@@ -475,24 +475,25 @@ struct sgx_epc_page *__sgx_alloc_epc_page(void)
{
struct sgx_epc_page *page;
int nid_of_current = numa_node_id();
- int nid = nid_of_current;
+ int nid_start, nid;
- if (node_isset(nid_of_current, sgx_numa_mask)) {
- page = __sgx_alloc_epc_page_from_node(nid_of_current);
- if (page)
- return page;
- }
-
- /* Fall back to the non-local NUMA nodes: */
- while (true) {
- nid = next_node_in(nid, sgx_numa_mask);
- if (nid == nid_of_current)
- break;
+ /*
+ * Try local node first. If it doesn't have an EPC section,
+ * fall back to the non-local NUMA nodes.
+ */
+ if (node_isset(nid_of_current, sgx_numa_mask))
+ nid_start = nid_of_current;
+ else
+ nid_start = next_node_in(nid_of_current, sgx_numa_mask);
+ nid = nid_start;
+ do {
page = __sgx_alloc_epc_page_from_node(nid);
if (page)
return page;
- }
+
+ nid = next_node_in(nid, sgx_numa_mask);
+ } while (nid != nid_start);
return ERR_PTR(-ENOMEM);
}
@@ -847,6 +848,13 @@ static bool __init sgx_page_cache_init(void)
return false;
}
+ for_each_online_node(nid) {
+ if (!node_isset(nid, sgx_numa_mask) &&
+ node_state(nid, N_MEMORY) && node_state(nid, N_CPU))
+ pr_info("node%d has both CPUs and memory but doesn't have an EPC section\n",
+ nid);
+ }
+
return true;
}
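
The rewritten EPC allocator above starts from the local node (or the first node with an EPC section) and walks the node mask exactly once in a do/while loop. The userspace analogue below mirrors that circular scan, with next_node_in() replaced by a modular increment over a fixed node array; the node set and "allocation" results are fake and exist only to show the control flow.

/* Round-robin fallback over a circular node set, as in __sgx_alloc_epc_page(). */
#include <stdio.h>
#include <stdbool.h>

#define NR_NODES 4

static bool node_has_epc[NR_NODES]  = { false, true, false, true };
static int  free_pages_on[NR_NODES] = { 0, 0, 0, 3 };

static int next_node_in(int nid)	/* next node with EPC, wrapping around */
{
	do {
		nid = (nid + 1) % NR_NODES;
	} while (!node_has_epc[nid]);
	return nid;
}

static int alloc_from(int nid)
{
	return free_pages_on[nid] > 0 ? nid : -1;
}

int main(void)
{
	int current_nid = 0;		/* pretend we run on node 0, which has no EPC */
	int start, nid, got = -1;

	start = node_has_epc[current_nid] ? current_nid
					  : next_node_in(current_nid);
	nid = start;
	do {
		got = alloc_from(nid);
		if (got >= 0)
			break;
		nid = next_node_in(nid);
	} while (nid != start);

	if (got >= 0)
		printf("allocated from node %d\n", got);
	else
		printf("no EPC page available\n");
	return 0;
}
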
diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c
index c5a026fee5e0..1339f8328db5 100644
--- a/arch/x86/kernel/fpu/xstate.c
+++ b/arch/x86/kernel/fpu/xstate.c
@@ -788,6 +788,9 @@ void __init fpu__init_system_xstate(unsigned int legacy_size)
goto out_disable;
}
+ fpu_kernel_cfg.independent_features = fpu_kernel_cfg.max_features &
+ XFEATURE_MASK_INDEPENDENT;
+
/*
* Clear XSAVE features that are disabled in the normal CPUID.
*/
diff --git a/arch/x86/kernel/fpu/xstate.h b/arch/x86/kernel/fpu/xstate.h
index 2ee0b9c53dcc..afb404cd2059 100644
--- a/arch/x86/kernel/fpu/xstate.h
+++ b/arch/x86/kernel/fpu/xstate.h
@@ -62,9 +62,9 @@ static inline u64 xfeatures_mask_supervisor(void)
static inline u64 xfeatures_mask_independent(void)
{
if (!cpu_feature_enabled(X86_FEATURE_ARCH_LBR))
- return XFEATURE_MASK_INDEPENDENT & ~XFEATURE_MASK_LBR;
+ return fpu_kernel_cfg.independent_features & ~XFEATURE_MASK_LBR;
- return XFEATURE_MASK_INDEPENDENT;
+ return fpu_kernel_cfg.independent_features;
}
/* XSAVE/XRSTOR wrapper functions */
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index 5358d43886ad..fec381533555 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -51,13 +51,12 @@ DEFINE_ASM_FUNC(pv_native_irq_enable, "sti", .noinstr.text);
DEFINE_ASM_FUNC(pv_native_read_cr2, "mov %cr2, %rax", .noinstr.text);
#endif
-DEFINE_STATIC_KEY_TRUE(virt_spin_lock_key);
+DEFINE_STATIC_KEY_FALSE(virt_spin_lock_key);
void __init native_pv_lock_init(void)
{
- if (IS_ENABLED(CONFIG_PARAVIRT_SPINLOCKS) &&
- !boot_cpu_has(X86_FEATURE_HYPERVISOR))
- static_branch_disable(&virt_spin_lock_key);
+ if (boot_cpu_has(X86_FEATURE_HYPERVISOR))
+ static_branch_enable(&virt_spin_lock_key);
}
static void native_tlb_remove_table(struct mmu_gather *tlb, void *table)
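
The paravirt hunk inverts the static key: virt_spin_lock_key now defaults to false and is flipped on only when X86_FEATURE_HYPERVISOR is set, so bare metal never patches in the virt-spinlock path. The fragment below is a hedged, kernel-style illustration of that default-off static-key idiom; it only builds inside a kernel tree, and demo_virt_key, demo_lock() and demo_slowpath() are made-up names, not kernel symbols.

/* Kernel-style fragment (not standalone): default-off static key pattern. */
#include <linux/init.h>
#include <linux/jump_label.h>
#include <asm/cpufeature.h>

DEFINE_STATIC_KEY_FALSE(demo_virt_key);

static void demo_slowpath(void)
{
	/* virtualization-friendly path would go here */
}

void __init demo_lock_init(void)
{
	/* Default is off; enable the branch only when running under a hypervisor. */
	if (boot_cpu_has(X86_FEATURE_HYPERVISOR))
		static_branch_enable(&demo_virt_key);
}

void demo_lock(void)
{
	/* Compiles to a NOP on bare metal; patched to a jump once enabled. */
	if (static_branch_unlikely(&demo_virt_key))
		demo_slowpath();
}
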
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 5d34cad9b7b1..6129dc2ba784 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -164,7 +164,7 @@ unsigned long saved_video_mode;
static char __initdata command_line[COMMAND_LINE_SIZE];
#ifdef CONFIG_CMDLINE_BOOL
-static char __initdata builtin_cmdline[COMMAND_LINE_SIZE] = CONFIG_CMDLINE;
+char builtin_cmdline[COMMAND_LINE_SIZE] = CONFIG_CMDLINE;
bool builtin_cmdline_added __ro_after_init;
#endif
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index 4287a8071a3a..730c2f34d347 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -19,7 +19,6 @@ if VIRTUALIZATION
config KVM
tristate "Kernel-based Virtual Machine (KVM) support"
- depends on HIGH_RES_TIMERS
depends on X86_LOCAL_APIC
select KVM_COMMON
select KVM_GENERIC_MMU_NOTIFIER
@@ -141,11 +140,13 @@ config KVM_AMD_SEV
depends on CRYPTO_DEV_SP_PSP && !(KVM_AMD=y && CRYPTO_DEV_CCP_DD=m)
select ARCH_HAS_CC_PLATFORM
select KVM_GENERIC_PRIVATE_MEM
- select HAVE_KVM_GMEM_PREPARE
- select HAVE_KVM_GMEM_INVALIDATE
+ select HAVE_KVM_ARCH_GMEM_PREPARE
+ select HAVE_KVM_ARCH_GMEM_INVALIDATE
help
- Provides support for launching Encrypted VMs (SEV) and Encrypted VMs
- with Encrypted State (SEV-ES) on AMD processors.
+ Provides support for launching encrypted VMs which use Secure
+ Encrypted Virtualization (SEV), Secure Encrypted Virtualization with
+ Encrypted State (SEV-ES), and Secure Encrypted Virtualization with
+ Secure Nested Paging (SEV-SNP) technologies on AMD processors.
config KVM_SMM
bool "System Management Mode emulation"
diff --git a/arch/x86/kvm/hyperv.h b/arch/x86/kvm/hyperv.h
index 923e64903da9..913bfc96959c 100644
--- a/arch/x86/kvm/hyperv.h
+++ b/arch/x86/kvm/hyperv.h
@@ -286,7 +286,6 @@ static inline int kvm_hv_hypercall(struct kvm_vcpu *vcpu)
return HV_STATUS_ACCESS_DENIED;
}
static inline void kvm_hv_vcpu_purge_flush_tlb(struct kvm_vcpu *vcpu) {}
-static inline void kvm_hv_free_pa_page(struct kvm *kvm) {}
static inline bool kvm_hv_synic_has_vector(struct kvm_vcpu *vcpu, int vector)
{
return false;
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index a7172ba59ad2..5bb481aefcbc 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -351,10 +351,8 @@ static void kvm_recalculate_logical_map(struct kvm_apic_map *new,
* reversing the LDR calculation to get cluster of APICs, i.e. no
* additional work is required.
*/
- if (apic_x2apic_mode(apic)) {
- WARN_ON_ONCE(ldr != kvm_apic_calc_x2apic_ldr(kvm_x2apic_id(apic)));
+ if (apic_x2apic_mode(apic))
return;
- }
if (WARN_ON_ONCE(!kvm_apic_map_get_logical_dest(new, ldr,
&cluster, &mask))) {
@@ -1743,7 +1741,7 @@ static void limit_periodic_timer_frequency(struct kvm_lapic *apic)
s64 min_period = min_timer_period_us * 1000LL;
if (apic->lapic_timer.period < min_period) {
- pr_info_ratelimited(
+ pr_info_once(
"vcpu %i: requested %lld ns "
"lapic timer period limited to %lld ns\n",
apic->vcpu->vcpu_id,
@@ -2966,18 +2964,28 @@ static int kvm_apic_state_fixup(struct kvm_vcpu *vcpu,
struct kvm_lapic_state *s, bool set)
{
if (apic_x2apic_mode(vcpu->arch.apic)) {
+ u32 x2apic_id = kvm_x2apic_id(vcpu->arch.apic);
u32 *id = (u32 *)(s->regs + APIC_ID);
u32 *ldr = (u32 *)(s->regs + APIC_LDR);
u64 icr;
if (vcpu->kvm->arch.x2apic_format) {
- if (*id != vcpu->vcpu_id)
+ if (*id != x2apic_id)
return -EINVAL;
} else {
+ /*
+ * Ignore the userspace value when setting APIC state.
+ * KVM's model is that the x2APIC ID is readonly, e.g.
+ * KVM only supports delivering interrupts to KVM's
+ * version of the x2APIC ID. However, for backwards
+ * compatibility, don't reject attempts to set a
+ * mismatched ID for userspace that hasn't opted into
+ * x2apic_format.
+ */
if (set)
- *id >>= 24;
+ *id = x2apic_id;
else
- *id <<= 24;
+ *id = x2apic_id << 24;
}
/*
@@ -2986,7 +2994,7 @@ static int kvm_apic_state_fixup(struct kvm_vcpu *vcpu,
* split to ICR+ICR2 in userspace for backwards compatibility.
*/
if (set) {
- *ldr = kvm_apic_calc_x2apic_ldr(*id);
+ *ldr = kvm_apic_calc_x2apic_ldr(x2apic_id);
icr = __kvm_lapic_get_reg(s->regs, APIC_ICR) |
(u64)__kvm_lapic_get_reg(s->regs, APIC_ICR2) << 32;
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 901be9e420a4..7813d28b082f 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -4335,7 +4335,7 @@ static u8 kvm_max_private_mapping_level(struct kvm *kvm, kvm_pfn_t pfn,
if (req_max_level)
max_level = min(max_level, req_max_level);
- return req_max_level;
+ return max_level;
}
static int kvm_faultin_pfn_private(struct kvm_vcpu *vcpu,
@@ -4674,16 +4674,14 @@ out_unlock:
bool kvm_mmu_may_ignore_guest_pat(void)
{
/*
- * When EPT is enabled (shadow_memtype_mask is non-zero), the CPU does
- * not support self-snoop (or is affected by an erratum), and the VM
+ * When EPT is enabled (shadow_memtype_mask is non-zero), and the VM
* has non-coherent DMA (DMA doesn't snoop CPU caches), KVM's ABI is to
* honor the memtype from the guest's PAT so that guest accesses to
* memory that is DMA'd aren't cached against the guest's wishes. As a
* result, KVM _may_ ignore guest PAT, whereas without non-coherent DMA,
- * KVM _always_ ignores or honors guest PAT, i.e. doesn't toggle SPTE
- * bits in response to non-coherent device (un)registration.
+ * KVM _always_ ignores guest PAT (when EPT is enabled).
*/
- return !static_cpu_has(X86_FEATURE_SELFSNOOP) && shadow_memtype_mask;
+ return shadow_memtype_mask;
}
int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
@@ -4743,11 +4741,16 @@ long kvm_arch_vcpu_pre_fault_memory(struct kvm_vcpu *vcpu,
u64 end;
int r;
+ if (!vcpu->kvm->arch.pre_fault_allowed)
+ return -EOPNOTSUPP;
+
/*
* reload is efficient when called repeatedly, so we can do it on
* every iteration.
*/
- kvm_mmu_reload(vcpu);
+ r = kvm_mmu_reload(vcpu);
+ if (r)
+ return r;
if (kvm_arch_has_private_mem(vcpu->kvm) &&
kvm_mem_is_private(vcpu->kvm, gpa_to_gfn(range->gpa)))
@@ -7510,7 +7513,7 @@ static bool hugepage_has_attrs(struct kvm *kvm, struct kvm_memory_slot *slot,
const unsigned long end = start + KVM_PAGES_PER_HPAGE(level);
if (level == PG_LEVEL_2M)
- return kvm_range_has_memory_attributes(kvm, start, end, attrs);
+ return kvm_range_has_memory_attributes(kvm, start, end, ~0, attrs);
for (gfn = start; gfn < end; gfn += KVM_PAGES_PER_HPAGE(level - 1)) {
if (hugepage_test_mixed(slot, gfn, level - 1) ||
diff --git a/arch/x86/kvm/mmu/spte.c b/arch/x86/kvm/mmu/spte.c
index d4527965e48c..8f7eb3ad88fc 100644
--- a/arch/x86/kvm/mmu/spte.c
+++ b/arch/x86/kvm/mmu/spte.c
@@ -391,9 +391,9 @@ void kvm_mmu_set_mmio_spte_mask(u64 mmio_value, u64 mmio_mask, u64 access_mask)
mmio_value = 0;
/*
- * The masked MMIO value must obviously match itself and a removed SPTE
- * must not get a false positive. Removed SPTEs and MMIO SPTEs should
- * never collide as MMIO must set some RWX bits, and removed SPTEs must
+ * The masked MMIO value must obviously match itself and a frozen SPTE
+ * must not get a false positive. Frozen SPTEs and MMIO SPTEs should
+ * never collide as MMIO must set some RWX bits, and frozen SPTEs must
* not set any RWX bits.
*/
if (WARN_ON((mmio_value & mmio_mask) != mmio_value) ||
diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h
index ef793c459b05..2cb816ea2430 100644
--- a/arch/x86/kvm/mmu/spte.h
+++ b/arch/x86/kvm/mmu/spte.h
@@ -214,7 +214,7 @@ extern u64 __read_mostly shadow_nonpresent_or_rsvd_mask;
*/
#define FROZEN_SPTE (SHADOW_NONPRESENT_VALUE | 0x5a0ULL)
-/* Removed SPTEs must not be misconstrued as shadow present PTEs. */
+/* Frozen SPTEs must not be misconstrued as shadow present PTEs. */
static_assert(!(FROZEN_SPTE & SPTE_MMU_PRESENT_MASK));
static inline bool is_frozen_spte(u64 spte)
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index c7dc49ee7388..3c55955bcaf8 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -359,10 +359,10 @@ static void handle_removed_pt(struct kvm *kvm, tdp_ptep_t pt, bool shared)
/*
* Set the SPTE to a nonpresent value that other
* threads will not overwrite. If the SPTE was
- * already marked as removed then another thread
+ * already marked as frozen then another thread
* handling a page fault could overwrite it, so
* set the SPTE until it is set from some other
- * value to the removed SPTE value.
+ * value to the frozen SPTE value.
*/
for (;;) {
old_spte = kvm_tdp_mmu_write_spte_atomic(sptep, FROZEN_SPTE);
@@ -536,8 +536,8 @@ static inline int __must_check __tdp_mmu_set_spte_atomic(struct tdp_iter *iter,
u64 *sptep = rcu_dereference(iter->sptep);
/*
- * The caller is responsible for ensuring the old SPTE is not a REMOVED
- * SPTE. KVM should never attempt to zap or manipulate a REMOVED SPTE,
+ * The caller is responsible for ensuring the old SPTE is not a FROZEN
+ * SPTE. KVM should never attempt to zap or manipulate a FROZEN SPTE,
* and pre-checking before inserting a new SPTE is advantageous as it
* avoids unnecessary work.
*/
diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c
index a16c873b3232..714c517dd4b7 100644
--- a/arch/x86/kvm/svm/sev.c
+++ b/arch/x86/kvm/svm/sev.c
@@ -2276,30 +2276,24 @@ static int sev_gmem_post_populate(struct kvm *kvm, gfn_t gfn_start, kvm_pfn_t pf
for (gfn = gfn_start, i = 0; gfn < gfn_start + npages; gfn++, i++) {
struct sev_data_snp_launch_update fw_args = {0};
- bool assigned;
+ bool assigned = false;
int level;
- if (!kvm_mem_is_private(kvm, gfn)) {
- pr_debug("%s: Failed to ensure GFN 0x%llx has private memory attribute set\n",
- __func__, gfn);
- ret = -EINVAL;
- goto err;
- }
-
ret = snp_lookup_rmpentry((u64)pfn + i, &assigned, &level);
if (ret || assigned) {
pr_debug("%s: Failed to ensure GFN 0x%llx RMP entry is initial shared state, ret: %d assigned: %d\n",
__func__, gfn, ret, assigned);
- ret = -EINVAL;
+ ret = ret ? -EINVAL : -EEXIST;
goto err;
}
if (src) {
void *vaddr = kmap_local_pfn(pfn + i);
- ret = copy_from_user(vaddr, src + i * PAGE_SIZE, PAGE_SIZE);
- if (ret)
+ if (copy_from_user(vaddr, src + i * PAGE_SIZE, PAGE_SIZE)) {
+ ret = -EFAULT;
goto err;
+ }
kunmap_local(vaddr);
}
@@ -2549,6 +2543,14 @@ static int snp_launch_finish(struct kvm *kvm, struct kvm_sev_cmd *argp)
data->gctx_paddr = __psp_pa(sev->snp_context);
ret = sev_issue_cmd(kvm, SEV_CMD_SNP_LAUNCH_FINISH, data, &argp->error);
+ /*
+ * Now that there will be no more SNP_LAUNCH_UPDATE ioctls, private pages
+ * can be given to the guest simply by marking the RMP entry as private.
+ * This can happen on first access and also with KVM_PRE_FAULT_MEMORY.
+ */
+ if (!ret)
+ kvm->arch.pre_fault_allowed = true;
+
kfree(id_auth);
e_free_id_block:
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index c115d26844f7..5ab2c92c7331 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -2876,6 +2876,12 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
case MSR_CSTAR:
msr_info->data = svm->vmcb01.ptr->save.cstar;
break;
+ case MSR_GS_BASE:
+ msr_info->data = svm->vmcb01.ptr->save.gs.base;
+ break;
+ case MSR_FS_BASE:
+ msr_info->data = svm->vmcb01.ptr->save.fs.base;
+ break;
case MSR_KERNEL_GS_BASE:
msr_info->data = svm->vmcb01.ptr->save.kernel_gs_base;
break;
@@ -3101,6 +3107,12 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
case MSR_CSTAR:
svm->vmcb01.ptr->save.cstar = data;
break;
+ case MSR_GS_BASE:
+ svm->vmcb01.ptr->save.gs.base = data;
+ break;
+ case MSR_FS_BASE:
+ svm->vmcb01.ptr->save.fs.base = data;
+ break;
case MSR_KERNEL_GS_BASE:
svm->vmcb01.ptr->save.kernel_gs_base = data;
break;
@@ -4949,6 +4961,7 @@ static int svm_vm_init(struct kvm *kvm)
to_kvm_sev_info(kvm)->need_init = true;
kvm->arch.has_private_mem = (type == KVM_X86_SNP_VM);
+ kvm->arch.pre_fault_allowed = !kvm->arch.has_private_mem;
}
if (!pause_filter_count || !pause_filter_thresh)
@@ -5223,6 +5236,9 @@ static __init void svm_set_cpu_caps(void)
/* CPUID 0x8000001F (SME/SEV features) */
sev_set_cpu_caps();
+
+ /* Don't advertise Bus Lock Detect to guest if SVM support is absent */
+ kvm_cpu_cap_clear(X86_FEATURE_BUS_LOCK_DETECT);
}
static __init int svm_hardware_setup(void)
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index f18c2d8c7476..733a0c45d1a6 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -7659,13 +7659,11 @@ u8 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
/*
* Force WB and ignore guest PAT if the VM does NOT have a non-coherent
- * device attached and the CPU doesn't support self-snoop. Letting the
- * guest control memory types on Intel CPUs without self-snoop may
- * result in unexpected behavior, and so KVM's (historical) ABI is to
- * trust the guest to behave only as a last resort.
+ * device attached. Letting the guest control memory types on Intel
+ * CPUs may result in unexpected behavior, and so KVM's ABI is to trust
+ * the guest to behave only as a last resort.
*/
- if (!static_cpu_has(X86_FEATURE_SELFSNOOP) &&
- !kvm_arch_has_noncoherent_dma(vcpu->kvm))
+ if (!kvm_arch_has_noncoherent_dma(vcpu->kvm))
return (MTRR_TYPE_WRBACK << VMX_EPT_MT_EPTE_SHIFT) | VMX_EPT_IPAT_BIT;
return (MTRR_TYPE_WRBACK << VMX_EPT_MT_EPTE_SHIFT);
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index af6c8cf6a37a..c983c8e434b8 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -427,8 +427,7 @@ static void kvm_user_return_msr_cpu_online(void)
int kvm_set_user_return_msr(unsigned slot, u64 value, u64 mask)
{
- unsigned int cpu = smp_processor_id();
- struct kvm_user_return_msrs *msrs = per_cpu_ptr(user_return_msrs, cpu);
+ struct kvm_user_return_msrs *msrs = this_cpu_ptr(user_return_msrs);
int err;
value = (value & mask) | (msrs->values[slot].host & ~mask);
@@ -450,8 +449,7 @@ EXPORT_SYMBOL_GPL(kvm_set_user_return_msr);
static void drop_user_return_notifiers(void)
{
- unsigned int cpu = smp_processor_id();
- struct kvm_user_return_msrs *msrs = per_cpu_ptr(user_return_msrs, cpu);
+ struct kvm_user_return_msrs *msrs = this_cpu_ptr(user_return_msrs);
if (msrs->registered)
kvm_on_user_return(&msrs->urn);
@@ -4658,7 +4656,6 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
case KVM_CAP_ASYNC_PF_INT:
case KVM_CAP_GET_TSC_KHZ:
case KVM_CAP_KVMCLOCK_CTRL:
- case KVM_CAP_READONLY_MEM:
case KVM_CAP_IOAPIC_POLARITY_IGNORED:
case KVM_CAP_TSC_DEADLINE_TIMER:
case KVM_CAP_DISABLE_QUIRKS:
@@ -4817,6 +4814,9 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
case KVM_CAP_VM_TYPES:
r = kvm_caps.supported_vm_types;
break;
+ case KVM_CAP_READONLY_MEM:
+ r = kvm ? kvm_arch_has_readonly_mem(kvm) : 1;
+ break;
default:
break;
}
@@ -6042,7 +6042,9 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
if (copy_from_user(&events, argp, sizeof(struct kvm_vcpu_events)))
break;
+ kvm_vcpu_srcu_read_lock(vcpu);
r = kvm_vcpu_ioctl_x86_set_vcpu_events(vcpu, &events);
+ kvm_vcpu_srcu_read_unlock(vcpu);
break;
}
case KVM_GET_DEBUGREGS: {
@@ -12646,6 +12648,9 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
kvm->arch.vm_type = type;
kvm->arch.has_private_mem =
(type == KVM_X86_SW_PROTECTED_VM);
+ /* Decided by the vendor code for other VM types. */
+ kvm->arch.pre_fault_allowed =
+ type == KVM_X86_DEFAULT_VM || type == KVM_X86_SW_PROTECTED_VM;
ret = kvm_page_track_init(kvm);
if (ret)
@@ -13641,19 +13646,14 @@ bool kvm_arch_no_poll(struct kvm_vcpu *vcpu)
}
EXPORT_SYMBOL_GPL(kvm_arch_no_poll);
-#ifdef CONFIG_HAVE_KVM_GMEM_PREPARE
-bool kvm_arch_gmem_prepare_needed(struct kvm *kvm)
-{
- return kvm->arch.vm_type == KVM_X86_SNP_VM;
-}
-
+#ifdef CONFIG_HAVE_KVM_ARCH_GMEM_PREPARE
int kvm_arch_gmem_prepare(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn, int max_order)
{
return kvm_x86_call(gmem_prepare)(kvm, pfn, gfn, max_order);
}
#endif
-#ifdef CONFIG_HAVE_KVM_GMEM_INVALIDATE
+#ifdef CONFIG_HAVE_KVM_ARCH_GMEM_INVALIDATE
void kvm_arch_gmem_invalidate(kvm_pfn_t start, kvm_pfn_t end)
{
kvm_x86_call(gmem_invalidate)(start, end);
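
Taken together, kvm->arch.pre_fault_allowed gates KVM_PRE_FAULT_MEMORY: it is set immediately for default and SW-protected VMs, and only after SNP_LAUNCH_FINISH for SNP guests (see the sev.c hunk above), with the MMU returning -EOPNOTSUPP until then. The sketch below shows how a VMM might issue that vCPU ioctl once pre-faulting is allowed; struct kvm_pre_fault_memory and the ioctl come from the 6.11+ <linux/kvm.h> UAPI, while vcpu_fd, gpa and size are placeholders supplied by the caller.

/* Pre-fault a GPA range on a vCPU; requires a 6.11+ <linux/kvm.h>. */
#include <string.h>
#include <errno.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

static int pre_fault_range(int vcpu_fd, __u64 gpa, __u64 size)
{
	struct kvm_pre_fault_memory range;

	memset(&range, 0, sizeof(range));
	range.gpa = gpa;
	range.size = size;

	for (;;) {
		if (ioctl(vcpu_fd, KVM_PRE_FAULT_MEMORY, &range) == 0)
			return 0;
		if (errno == EINTR)
			continue;	/* gpa/size reflect partial progress; retry the rest */
		return -errno;		/* e.g. EOPNOTSUPP while pre_fault_allowed is false */
	}
}
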
diff --git a/arch/x86/lib/cmdline.c b/arch/x86/lib/cmdline.c
index 384da1fdd5c6..c65cd5550454 100644
--- a/arch/x86/lib/cmdline.c
+++ b/arch/x86/lib/cmdline.c
@@ -207,18 +207,29 @@ __cmdline_find_option(const char *cmdline, int max_cmdline_size,
int cmdline_find_option_bool(const char *cmdline, const char *option)
{
- if (IS_ENABLED(CONFIG_CMDLINE_BOOL))
- WARN_ON_ONCE(!builtin_cmdline_added);
+ int ret;
- return __cmdline_find_option_bool(cmdline, COMMAND_LINE_SIZE, option);
+ ret = __cmdline_find_option_bool(cmdline, COMMAND_LINE_SIZE, option);
+ if (ret > 0)
+ return ret;
+
+ if (IS_ENABLED(CONFIG_CMDLINE_BOOL) && !builtin_cmdline_added)
+ return __cmdline_find_option_bool(builtin_cmdline, COMMAND_LINE_SIZE, option);
+
+ return ret;
}
int cmdline_find_option(const char *cmdline, const char *option, char *buffer,
int bufsize)
{
- if (IS_ENABLED(CONFIG_CMDLINE_BOOL))
- WARN_ON_ONCE(!builtin_cmdline_added);
+ int ret;
+
+ ret = __cmdline_find_option(cmdline, COMMAND_LINE_SIZE, option, buffer, bufsize);
+ if (ret > 0)
+ return ret;
+
+ if (IS_ENABLED(CONFIG_CMDLINE_BOOL) && !builtin_cmdline_added)
+ return __cmdline_find_option(builtin_cmdline, COMMAND_LINE_SIZE, option, buffer, bufsize);
- return __cmdline_find_option(cmdline, COMMAND_LINE_SIZE, option,
- buffer, bufsize);
+ return ret;
}
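
The new lookup order is: search the command line that was passed in first, and only if the option is absent there (and the builtin command line has not yet been appended) fall back to CONFIG_CMDLINE. A standalone sketch of that "primary first, builtin as fallback" shape is below, using strstr() as a stand-in for __cmdline_find_option_bool(); the strings and names are illustrative, and the real parser additionally honors word boundaries and quoting.

/* Fallback lookup order from cmdline_find_option_bool() above. Illustrative only. */
#include <stdio.h>
#include <string.h>
#include <stdbool.h>

static const char boot_cmdline[]    = "console=ttyS0 quiet";
static const char builtin_cmdline[] = "mitigations=off";	/* CONFIG_CMDLINE stand-in */
static bool builtin_cmdline_added;				/* not yet merged */

static bool find_option_bool(const char *cmdline, const char *option)
{
	return strstr(cmdline, option) != NULL;
}

static bool cmdline_find_option_bool(const char *cmdline, const char *option)
{
	if (find_option_bool(cmdline, option))
		return true;

	/* Early boot: builtin command line not appended yet, so search it too. */
	if (!builtin_cmdline_added)
		return find_option_bool(builtin_cmdline, option);

	return false;
}

int main(void)
{
	printf("quiet: %d\n", cmdline_find_option_bool(boot_cmdline, "quiet"));
	printf("mitigations=off: %d\n",
	       cmdline_find_option_bool(boot_cmdline, "mitigations=off"));
	return 0;
}
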
diff --git a/arch/x86/lib/getuser.S b/arch/x86/lib/getuser.S
index a314622aa093..d066aecf8aeb 100644
--- a/arch/x86/lib/getuser.S
+++ b/arch/x86/lib/getuser.S
@@ -88,12 +88,14 @@ SYM_FUNC_END(__get_user_4)
EXPORT_SYMBOL(__get_user_4)
SYM_FUNC_START(__get_user_8)
+#ifndef CONFIG_X86_64
+ xor %ecx,%ecx
+#endif
check_range size=8
ASM_STAC
#ifdef CONFIG_X86_64
UACCESS movq (%_ASM_AX),%rdx
#else
- xor %ecx,%ecx
UACCESS movl (%_ASM_AX),%edx
UACCESS movl 4(%_ASM_AX),%ecx
#endif
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index d8dbeac8b206..ff253648706f 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -958,8 +958,12 @@ static void update_end_of_memory_vars(u64 start, u64 size)
int add_pages(int nid, unsigned long start_pfn, unsigned long nr_pages,
struct mhp_params *params)
{
+ unsigned long end = ((start_pfn + nr_pages) << PAGE_SHIFT) - 1;
int ret;
+ if (WARN_ON_ONCE(end > PHYSMEM_END))
+ return -ERANGE;
+
ret = __add_pages(nid, start_pfn, nr_pages, params);
WARN_ON_ONCE(ret);
diff --git a/arch/x86/mm/kaslr.c b/arch/x86/mm/kaslr.c
index 37db264866b6..230f1dee4f09 100644
--- a/arch/x86/mm/kaslr.c
+++ b/arch/x86/mm/kaslr.c
@@ -47,13 +47,24 @@ static const unsigned long vaddr_end = CPU_ENTRY_AREA_BASE;
*/
static __initdata struct kaslr_memory_region {
unsigned long *base;
+ unsigned long *end;
unsigned long size_tb;
} kaslr_regions[] = {
- { &page_offset_base, 0 },
- { &vmalloc_base, 0 },
- { &vmemmap_base, 0 },
+ {
+ .base = &page_offset_base,
+ .end = &physmem_end,
+ },
+ {
+ .base = &vmalloc_base,
+ },
+ {
+ .base = &vmemmap_base,
+ },
};
+/* The end of the possible address space for physical memory */
+unsigned long physmem_end __ro_after_init;
+
/* Get size in bytes used by the memory region */
static inline unsigned long get_padding(struct kaslr_memory_region *region)
{
@@ -82,6 +93,8 @@ void __init kernel_randomize_memory(void)
BUILD_BUG_ON(vaddr_end != CPU_ENTRY_AREA_BASE);
BUILD_BUG_ON(vaddr_end > __START_KERNEL_map);
+ /* Preset the end of the possible address space for physical memory */
+ physmem_end = ((1ULL << MAX_PHYSMEM_BITS) - 1);
if (!kaslr_memory_enabled())
return;
@@ -128,11 +141,18 @@ void __init kernel_randomize_memory(void)
vaddr += entropy;
*kaslr_regions[i].base = vaddr;
+ /* Calculate the end of the region */
+ vaddr += get_padding(&kaslr_regions[i]);
/*
- * Jump the region and add a minimum padding based on
- * randomization alignment.
+ * KASLR trims the maximum possible size of the
+ * direct-map. Update the physmem_end boundary.
+ * No rounding required as the region starts
+ * PUD aligned and size is in units of TB.
*/
- vaddr += get_padding(&kaslr_regions[i]);
+ if (kaslr_regions[i].end)
+ *kaslr_regions[i].end = __pa_nodebug(vaddr - 1);
+
+ /* Add a minimum padding based on randomization alignment. */
vaddr = round_up(vaddr + 1, PUD_SIZE);
remain_entropy -= entropy;
}
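
With the hunk above, physmem_end starts out as the architectural maximum, (1ULL << MAX_PHYSMEM_BITS) - 1, and is then trimmed to the physical address corresponding to the end of the randomized direct-map region. The arithmetic sketch below walks through that trimming; the 46-bit width, the 10 TB region size, and the use of the default 4-level direct-map base are example inputs, and __pa_nodebug() is modeled as a plain base subtraction.

/* Toy version of the physmem_end preset-then-trim arithmetic above. */
#include <stdio.h>

int main(void)
{
	/* Example inputs, not real machine values. */
	unsigned long long max_physmem_bits = 46;
	unsigned long long physmem_end = (1ULL << max_physmem_bits) - 1;

	unsigned long long page_offset_base = 0xffff888000000000ULL; /* direct-map start */
	unsigned long long direct_map_bytes = 10ULL << 40;	      /* padded region size */
	unsigned long long vaddr_end = page_offset_base + direct_map_bytes;

	printf("preset  physmem_end = 0x%llx\n", physmem_end);

	/* Kernel: *kaslr_regions[i].end = __pa_nodebug(vaddr - 1);
	 * for the direct map, __pa() amounts to subtracting the base. */
	physmem_end = (vaddr_end - 1) - page_offset_base;

	printf("trimmed physmem_end = 0x%llx\n", physmem_end);
	return 0;
}
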
diff --git a/arch/x86/mm/pti.c b/arch/x86/mm/pti.c
index 2e69abf4f852..851ec8f1363a 100644
--- a/arch/x86/mm/pti.c
+++ b/arch/x86/mm/pti.c
@@ -241,7 +241,7 @@ static pmd_t *pti_user_pagetable_walk_pmd(unsigned long address)
*
* Returns a pointer to a PTE on success, or NULL on failure.
*/
-static pte_t *pti_user_pagetable_walk_pte(unsigned long address)
+static pte_t *pti_user_pagetable_walk_pte(unsigned long address, bool late_text)
{
gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO);
pmd_t *pmd;
@@ -251,10 +251,15 @@ static pte_t *pti_user_pagetable_walk_pte(unsigned long address)
if (!pmd)
return NULL;
- /* We can't do anything sensible if we hit a large mapping. */
+ /* Large PMD mapping found */
if (pmd_leaf(*pmd)) {
- WARN_ON(1);
- return NULL;
+ /* Clear the PMD if we hit a large mapping from the first round */
+ if (late_text) {
+ set_pmd(pmd, __pmd(0));
+ } else {
+ WARN_ON_ONCE(1);
+ return NULL;
+ }
}
if (pmd_none(*pmd)) {
@@ -283,7 +288,7 @@ static void __init pti_setup_vsyscall(void)
if (!pte || WARN_ON(level != PG_LEVEL_4K) || pte_none(*pte))
return;
- target_pte = pti_user_pagetable_walk_pte(VSYSCALL_ADDR);
+ target_pte = pti_user_pagetable_walk_pte(VSYSCALL_ADDR, false);
if (WARN_ON(!target_pte))
return;
@@ -301,7 +306,7 @@ enum pti_clone_level {
static void
pti_clone_pgtable(unsigned long start, unsigned long end,
- enum pti_clone_level level)
+ enum pti_clone_level level, bool late_text)
{
unsigned long addr;
@@ -374,14 +379,14 @@ pti_clone_pgtable(unsigned long start, unsigned long end,
*/
*target_pmd = *pmd;
- addr += PMD_SIZE;
+ addr = round_up(addr + 1, PMD_SIZE);
} else if (level == PTI_CLONE_PTE) {
/* Walk the page-table down to the pte level */
pte = pte_offset_kernel(pmd, addr);
if (pte_none(*pte)) {
- addr += PAGE_SIZE;
+ addr = round_up(addr + 1, PAGE_SIZE);
continue;
}
@@ -390,7 +395,7 @@ pti_clone_pgtable(unsigned long start, unsigned long end,
return;
/* Allocate PTE in the user page-table */
- target_pte = pti_user_pagetable_walk_pte(addr);
+ target_pte = pti_user_pagetable_walk_pte(addr, late_text);
if (WARN_ON(!target_pte))
return;
@@ -401,7 +406,7 @@ pti_clone_pgtable(unsigned long start, unsigned long end,
/* Clone the PTE */
*target_pte = *pte;
- addr += PAGE_SIZE;
+ addr = round_up(addr + 1, PAGE_SIZE);
} else {
BUG();
@@ -452,7 +457,7 @@ static void __init pti_clone_user_shared(void)
phys_addr_t pa = per_cpu_ptr_to_phys((void *)va);
pte_t *target_pte;
- target_pte = pti_user_pagetable_walk_pte(va);
+ target_pte = pti_user_pagetable_walk_pte(va, false);
if (WARN_ON(!target_pte))
return;
@@ -475,7 +480,7 @@ static void __init pti_clone_user_shared(void)
start = CPU_ENTRY_AREA_BASE;
end = start + (PAGE_SIZE * CPU_ENTRY_AREA_PAGES);
- pti_clone_pgtable(start, end, PTI_CLONE_PMD);
+ pti_clone_pgtable(start, end, PTI_CLONE_PMD, false);
}
#endif /* CONFIG_X86_64 */
@@ -492,11 +497,11 @@ static void __init pti_setup_espfix64(void)
/*
* Clone the populated PMDs of the entry text and force it RO.
*/
-static void pti_clone_entry_text(void)
+static void pti_clone_entry_text(bool late)
{
pti_clone_pgtable((unsigned long) __entry_text_start,
(unsigned long) __entry_text_end,
- PTI_CLONE_PMD);
+ PTI_LEVEL_KERNEL_IMAGE, late);
}
/*
@@ -571,7 +576,7 @@ static void pti_clone_kernel_text(void)
* pti_set_kernel_image_nonglobal() did to clear the
* global bit.
*/
- pti_clone_pgtable(start, end_clone, PTI_LEVEL_KERNEL_IMAGE);
+ pti_clone_pgtable(start, end_clone, PTI_LEVEL_KERNEL_IMAGE, false);
/*
* pti_clone_pgtable() will set the global bit in any PMDs
@@ -638,8 +643,15 @@ void __init pti_init(void)
/* Undo all global bits from the init pagetables in head_64.S: */
pti_set_kernel_image_nonglobal();
+
/* Replace some of the global bits just for shared entry text: */
- pti_clone_entry_text();
+ /*
+	 * This is very early in boot. Device and late initcalls can do
+	 * modprobe before free_initmem() and mark_readonly(). This
+	 * pti_clone_entry_text() allows those user-mode helpers to function,
+	 * but notably the text is still RW.
+ */
+ pti_clone_entry_text(false);
pti_setup_espfix64();
pti_setup_vsyscall();
}
@@ -656,10 +668,11 @@ void pti_finalize(void)
if (!boot_cpu_has(X86_FEATURE_PTI))
return;
/*
- * We need to clone everything (again) that maps parts of the
- * kernel image.
+	 * This is after free_initmem() (all initcalls are done) and we've done
+	 * mark_readonly(). Text is now NX, which might have split some PMDs
+	 * relative to the early clone.
*/
- pti_clone_entry_text();
+ pti_clone_entry_text(true);
pti_clone_kernel_text();
debug_checkwx_user();