aboutsummaryrefslogtreecommitdiffstats
path: root/arch/x86/kernel
diff options
context:
space:
mode:
authorIngo Molnar <mingo@kernel.org>2025-04-11 11:13:27 +0200
committerIngo Molnar <mingo@kernel.org>2025-04-11 11:13:27 +0200
commit9f13acb2406a3aed90c6738b3a2f1c0e43118cbd (patch)
treecf37d241e85bf8fea4d13cb8d64ad6602c745b72 /arch/x86/kernel
parentx86/cacheinfo: Properly parse CPUID(0x80000006) L2/L3 associativity (diff)
parentLinux 6.15-rc1 (diff)
downloadlinux-9f13acb2406a3aed90c6738b3a2f1c0e43118cbd.tar.gz
linux-9f13acb2406a3aed90c6738b3a2f1c0e43118cbd.zip
Merge tag 'v6.15-rc1' into x86/cpu, to refresh the branch with upstream changes
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Diffstat (limited to 'arch/x86/kernel')
-rw-r--r--arch/x86/kernel/acpi/cppc.c4
-rw-r--r--arch/x86/kernel/apic/vector.c233
-rw-r--r--arch/x86/kernel/callthunks.c3
-rw-r--r--arch/x86/kernel/cpu/aperfmperf.c2
-rw-r--r--arch/x86/kernel/cpu/bugs.c121
-rw-r--r--arch/x86/kernel/cpu/bus_lock.c35
-rw-r--r--arch/x86/kernel/cpu/common.c4
-rw-r--r--arch/x86/kernel/cpu/intel.c22
-rw-r--r--arch/x86/kernel/cpu/mce/core.c50
-rw-r--r--arch/x86/kernel/cpu/mce/inject.c1
-rw-r--r--arch/x86/kernel/cpu/mce/severity.c11
-rw-r--r--arch/x86/kernel/cpu/microcode/amd.c2
-rw-r--r--arch/x86/kernel/cpu/mshyperv.c40
-rw-r--r--arch/x86/kernel/cpu/proc.c7
-rw-r--r--arch/x86/kernel/cpu/resctrl/Makefile5
-rw-r--r--arch/x86/kernel/cpu/resctrl/core.c181
-rw-r--r--arch/x86/kernel/cpu/resctrl/ctrlmondata.c93
-rw-r--r--arch/x86/kernel/cpu/resctrl/internal.h206
-rw-r--r--arch/x86/kernel/cpu/resctrl/monitor.c119
-rw-r--r--arch/x86/kernel/cpu/resctrl/pseudo_lock.c69
-rw-r--r--arch/x86/kernel/cpu/resctrl/rdtgroup.c357
-rw-r--r--arch/x86/kernel/dumpstack.c5
-rw-r--r--arch/x86/kernel/early_printk.c45
-rw-r--r--arch/x86/kernel/fpu/core.c2
-rw-r--r--arch/x86/kernel/kgdb.c2
-rw-r--r--arch/x86/kernel/paravirt.c14
-rw-r--r--arch/x86/kernel/process.c2
-rw-r--r--arch/x86/kernel/setup.c12
-rw-r--r--arch/x86/kernel/static_call.c2
-rw-r--r--arch/x86/kernel/unwind_orc.c4
30 files changed, 900 insertions, 753 deletions
diff --git a/arch/x86/kernel/acpi/cppc.c b/arch/x86/kernel/acpi/cppc.c
index d745dd586303..77bfb846490c 100644
--- a/arch/x86/kernel/acpi/cppc.c
+++ b/arch/x86/kernel/acpi/cppc.c
@@ -4,6 +4,8 @@
* Copyright (c) 2016, Intel Corporation.
*/
+#include <linux/bitfield.h>
+
#include <acpi/cppc_acpi.h>
#include <asm/msr.h>
#include <asm/processor.h>
@@ -149,7 +151,7 @@ int amd_get_highest_perf(unsigned int cpu, u32 *highest_perf)
if (ret)
goto out;
- val = AMD_CPPC_HIGHEST_PERF(val);
+ val = FIELD_GET(AMD_CPPC_HIGHEST_PERF_MASK, val);
} else {
ret = cppc_get_highest_perf(cpu, &val);
if (ret)
diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c
index 736f62812f5c..fee42a73d64a 100644
--- a/arch/x86/kernel/apic/vector.c
+++ b/arch/x86/kernel/apic/vector.c
@@ -799,7 +799,7 @@ int __init arch_early_irq_init(void)
x86_vector_domain = irq_domain_create_tree(fn, &x86_vector_domain_ops,
NULL);
BUG_ON(x86_vector_domain == NULL);
- irq_set_default_host(x86_vector_domain);
+ irq_set_default_domain(x86_vector_domain);
BUG_ON(!alloc_cpumask_var(&vector_searchmask, GFP_KERNEL));
@@ -888,8 +888,109 @@ static int apic_set_affinity(struct irq_data *irqd,
return err ? err : IRQ_SET_MASK_OK;
}
+static void free_moved_vector(struct apic_chip_data *apicd)
+{
+ unsigned int vector = apicd->prev_vector;
+ unsigned int cpu = apicd->prev_cpu;
+ bool managed = apicd->is_managed;
+
+ /*
+ * Managed interrupts are usually not migrated away
+ * from an online CPU, but CPU isolation 'managed_irq'
+ * can make that happen.
+ * 1) Activation does not take the isolation into account
+ * to keep the code simple
+ * 2) Migration away from an isolated CPU can happen when
+ * a non-isolated CPU which is in the calculated
+ * affinity mask comes online.
+ */
+ trace_vector_free_moved(apicd->irq, cpu, vector, managed);
+ irq_matrix_free(vector_matrix, cpu, vector, managed);
+ per_cpu(vector_irq, cpu)[vector] = VECTOR_UNUSED;
+ hlist_del_init(&apicd->clist);
+ apicd->prev_vector = 0;
+ apicd->move_in_progress = 0;
+}
+
+/*
+ * Called from fixup_irqs() with @desc->lock held and interrupts disabled.
+ */
+static void apic_force_complete_move(struct irq_data *irqd)
+{
+ unsigned int cpu = smp_processor_id();
+ struct apic_chip_data *apicd;
+ unsigned int vector;
+
+ guard(raw_spinlock)(&vector_lock);
+ apicd = apic_chip_data(irqd);
+ if (!apicd)
+ return;
+
+ /*
+ * If prev_vector is empty or the descriptor is neither currently
+ * nor previously on the outgoing CPU no action required.
+ */
+ vector = apicd->prev_vector;
+ if (!vector || (apicd->cpu != cpu && apicd->prev_cpu != cpu))
+ return;
+
+ /*
+ * This is tricky. If the cleanup of the old vector has not been
+ * done yet, then the following setaffinity call will fail with
+ * -EBUSY. This can leave the interrupt in a stale state.
+ *
+ * All CPUs are stuck in stop machine with interrupts disabled so
+ * calling __irq_complete_move() would be completely pointless.
+ *
+ * 1) The interrupt is in move_in_progress state. That means that we
+ * have not seen an interrupt since the io_apic was reprogrammed to
+ * the new vector.
+ *
+ * 2) The interrupt has fired on the new vector, but the cleanup IPIs
+ * have not been processed yet.
+ */
+ if (apicd->move_in_progress) {
+ /*
+ * In theory there is a race:
+ *
+ * set_ioapic(new_vector) <-- Interrupt is raised before update
+ * is effective, i.e. it's raised on
+ * the old vector.
+ *
+ * So if the target cpu cannot handle that interrupt before
+ * the old vector is cleaned up, we get a spurious interrupt
+ * and in the worst case the ioapic irq line becomes stale.
+ *
+ * But in case of cpu hotplug this should be a non issue
+ * because if the affinity update happens right before all
+ * cpus rendezvous in stop machine, there is no way that the
+ * interrupt can be blocked on the target cpu because all cpus
+ * loops first with interrupts enabled in stop machine, so the
+ * old vector is not yet cleaned up when the interrupt fires.
+ *
+ * So the only way to run into this issue is if the delivery
+ * of the interrupt on the apic/system bus would be delayed
+ * beyond the point where the target cpu disables interrupts
+ * in stop machine. I doubt that it can happen, but at least
+ * there is a theoretical chance. Virtualization might be
+ * able to expose this, but AFAICT the IOAPIC emulation is not
+ * as stupid as the real hardware.
+ *
+ * Anyway, there is nothing we can do about that at this point
+ * w/o refactoring the whole fixup_irq() business completely.
+ * We print at least the irq number and the old vector number,
+ * so we have the necessary information when a problem in that
+ * area arises.
+ */
+ pr_warn("IRQ fixup: irq %d move in progress, old vector %d\n",
+ irqd->irq, vector);
+ }
+ free_moved_vector(apicd);
+}
+
#else
-# define apic_set_affinity NULL
+# define apic_set_affinity NULL
+# define apic_force_complete_move NULL
#endif
static int apic_retrigger_irq(struct irq_data *irqd)
@@ -923,39 +1024,16 @@ static void x86_vector_msi_compose_msg(struct irq_data *data,
}
static struct irq_chip lapic_controller = {
- .name = "APIC",
- .irq_ack = apic_ack_edge,
- .irq_set_affinity = apic_set_affinity,
- .irq_compose_msi_msg = x86_vector_msi_compose_msg,
- .irq_retrigger = apic_retrigger_irq,
+ .name = "APIC",
+ .irq_ack = apic_ack_edge,
+ .irq_set_affinity = apic_set_affinity,
+ .irq_compose_msi_msg = x86_vector_msi_compose_msg,
+ .irq_force_complete_move = apic_force_complete_move,
+ .irq_retrigger = apic_retrigger_irq,
};
#ifdef CONFIG_SMP
-static void free_moved_vector(struct apic_chip_data *apicd)
-{
- unsigned int vector = apicd->prev_vector;
- unsigned int cpu = apicd->prev_cpu;
- bool managed = apicd->is_managed;
-
- /*
- * Managed interrupts are usually not migrated away
- * from an online CPU, but CPU isolation 'managed_irq'
- * can make that happen.
- * 1) Activation does not take the isolation into account
- * to keep the code simple
- * 2) Migration away from an isolated CPU can happen when
- * a non-isolated CPU which is in the calculated
- * affinity mask comes online.
- */
- trace_vector_free_moved(apicd->irq, cpu, vector, managed);
- irq_matrix_free(vector_matrix, cpu, vector, managed);
- per_cpu(vector_irq, cpu)[vector] = VECTOR_UNUSED;
- hlist_del_init(&apicd->clist);
- apicd->prev_vector = 0;
- apicd->move_in_progress = 0;
-}
-
static void __vector_cleanup(struct vector_cleanup *cl, bool check_irr)
{
struct apic_chip_data *apicd;
@@ -1068,99 +1146,6 @@ void irq_complete_move(struct irq_cfg *cfg)
__vector_schedule_cleanup(apicd);
}
-/*
- * Called from fixup_irqs() with @desc->lock held and interrupts disabled.
- */
-void irq_force_complete_move(struct irq_desc *desc)
-{
- unsigned int cpu = smp_processor_id();
- struct apic_chip_data *apicd;
- struct irq_data *irqd;
- unsigned int vector;
-
- /*
- * The function is called for all descriptors regardless of which
- * irqdomain they belong to. For example if an IRQ is provided by
- * an irq_chip as part of a GPIO driver, the chip data for that
- * descriptor is specific to the irq_chip in question.
- *
- * Check first that the chip_data is what we expect
- * (apic_chip_data) before touching it any further.
- */
- irqd = irq_domain_get_irq_data(x86_vector_domain,
- irq_desc_get_irq(desc));
- if (!irqd)
- return;
-
- raw_spin_lock(&vector_lock);
- apicd = apic_chip_data(irqd);
- if (!apicd)
- goto unlock;
-
- /*
- * If prev_vector is empty or the descriptor is neither currently
- * nor previously on the outgoing CPU no action required.
- */
- vector = apicd->prev_vector;
- if (!vector || (apicd->cpu != cpu && apicd->prev_cpu != cpu))
- goto unlock;
-
- /*
- * This is tricky. If the cleanup of the old vector has not been
- * done yet, then the following setaffinity call will fail with
- * -EBUSY. This can leave the interrupt in a stale state.
- *
- * All CPUs are stuck in stop machine with interrupts disabled so
- * calling __irq_complete_move() would be completely pointless.
- *
- * 1) The interrupt is in move_in_progress state. That means that we
- * have not seen an interrupt since the io_apic was reprogrammed to
- * the new vector.
- *
- * 2) The interrupt has fired on the new vector, but the cleanup IPIs
- * have not been processed yet.
- */
- if (apicd->move_in_progress) {
- /*
- * In theory there is a race:
- *
- * set_ioapic(new_vector) <-- Interrupt is raised before update
- * is effective, i.e. it's raised on
- * the old vector.
- *
- * So if the target cpu cannot handle that interrupt before
- * the old vector is cleaned up, we get a spurious interrupt
- * and in the worst case the ioapic irq line becomes stale.
- *
- * But in case of cpu hotplug this should be a non issue
- * because if the affinity update happens right before all
- * cpus rendezvous in stop machine, there is no way that the
- * interrupt can be blocked on the target cpu because all cpus
- * loops first with interrupts enabled in stop machine, so the
- * old vector is not yet cleaned up when the interrupt fires.
- *
- * So the only way to run into this issue is if the delivery
- * of the interrupt on the apic/system bus would be delayed
- * beyond the point where the target cpu disables interrupts
- * in stop machine. I doubt that it can happen, but at least
- * there is a theoretical chance. Virtualization might be
- * able to expose this, but AFAICT the IOAPIC emulation is not
- * as stupid as the real hardware.
- *
- * Anyway, there is nothing we can do about that at this point
- * w/o refactoring the whole fixup_irq() business completely.
- * We print at least the irq number and the old vector number,
- * so we have the necessary information when a problem in that
- * area arises.
- */
- pr_warn("IRQ fixup: irq %d move in progress, old vector %d\n",
- irqd->irq, vector);
- }
- free_moved_vector(apicd);
-unlock:
- raw_spin_unlock(&vector_lock);
-}
-
#ifdef CONFIG_HOTPLUG_CPU
/*
* Note, this is not accurate accounting, but at least good enough to
diff --git a/arch/x86/kernel/callthunks.c b/arch/x86/kernel/callthunks.c
index 25ae54250112..d86d7d6e750c 100644
--- a/arch/x86/kernel/callthunks.c
+++ b/arch/x86/kernel/callthunks.c
@@ -98,11 +98,10 @@ static inline bool within_module_coretext(void *addr)
#ifdef CONFIG_MODULES
struct module *mod;
- preempt_disable();
+ guard(rcu)();
mod = __module_address((unsigned long)addr);
if (mod && within_module_core((unsigned long)addr, mod))
ret = true;
- preempt_enable();
#endif
return ret;
}
diff --git a/arch/x86/kernel/cpu/aperfmperf.c b/arch/x86/kernel/cpu/aperfmperf.c
index f642de2ebdac..6cf31a1649c4 100644
--- a/arch/x86/kernel/cpu/aperfmperf.c
+++ b/arch/x86/kernel/cpu/aperfmperf.c
@@ -498,7 +498,7 @@ void arch_scale_freq_tick(void)
*/
#define MAX_SAMPLE_AGE ((unsigned long)HZ / 50)
-unsigned int arch_freq_get_on_cpu(int cpu)
+int arch_freq_get_on_cpu(int cpu)
{
struct aperfmperf *s = per_cpu_ptr(&cpu_samples, cpu);
unsigned int seq, freq;
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index a5d0998d7604..4386aa6c69e1 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -113,6 +113,10 @@ DEFINE_STATIC_KEY_FALSE(switch_mm_cond_ibpb);
/* Control unconditional IBPB in switch_mm() */
DEFINE_STATIC_KEY_FALSE(switch_mm_always_ibpb);
+/* Control IBPB on vCPU load */
+DEFINE_STATIC_KEY_FALSE(switch_vcpu_ibpb);
+EXPORT_SYMBOL_GPL(switch_vcpu_ibpb);
+
/* Control MDS CPU buffer clear before idling (halt, mwait) */
DEFINE_STATIC_KEY_FALSE(mds_idle_clear);
EXPORT_SYMBOL_GPL(mds_idle_clear);
@@ -234,7 +238,7 @@ static void x86_amd_ssb_disable(void)
/* Default mitigation for MDS-affected CPUs */
static enum mds_mitigations mds_mitigation __ro_after_init =
- IS_ENABLED(CONFIG_MITIGATION_MDS) ? MDS_MITIGATION_FULL : MDS_MITIGATION_OFF;
+ IS_ENABLED(CONFIG_MITIGATION_MDS) ? MDS_MITIGATION_AUTO : MDS_MITIGATION_OFF;
static bool mds_nosmt __ro_after_init = false;
static const char * const mds_strings[] = {
@@ -243,6 +247,40 @@ static const char * const mds_strings[] = {
[MDS_MITIGATION_VMWERV] = "Vulnerable: Clear CPU buffers attempted, no microcode",
};
+enum taa_mitigations {
+ TAA_MITIGATION_OFF,
+ TAA_MITIGATION_AUTO,
+ TAA_MITIGATION_UCODE_NEEDED,
+ TAA_MITIGATION_VERW,
+ TAA_MITIGATION_TSX_DISABLED,
+};
+
+/* Default mitigation for TAA-affected CPUs */
+static enum taa_mitigations taa_mitigation __ro_after_init =
+ IS_ENABLED(CONFIG_MITIGATION_TAA) ? TAA_MITIGATION_AUTO : TAA_MITIGATION_OFF;
+
+enum mmio_mitigations {
+ MMIO_MITIGATION_OFF,
+ MMIO_MITIGATION_AUTO,
+ MMIO_MITIGATION_UCODE_NEEDED,
+ MMIO_MITIGATION_VERW,
+};
+
+/* Default mitigation for Processor MMIO Stale Data vulnerabilities */
+static enum mmio_mitigations mmio_mitigation __ro_after_init =
+ IS_ENABLED(CONFIG_MITIGATION_MMIO_STALE_DATA) ? MMIO_MITIGATION_AUTO : MMIO_MITIGATION_OFF;
+
+enum rfds_mitigations {
+ RFDS_MITIGATION_OFF,
+ RFDS_MITIGATION_AUTO,
+ RFDS_MITIGATION_VERW,
+ RFDS_MITIGATION_UCODE_NEEDED,
+};
+
+/* Default mitigation for Register File Data Sampling */
+static enum rfds_mitigations rfds_mitigation __ro_after_init =
+ IS_ENABLED(CONFIG_MITIGATION_RFDS) ? RFDS_MITIGATION_AUTO : RFDS_MITIGATION_OFF;
+
static void __init mds_select_mitigation(void)
{
if (!boot_cpu_has_bug(X86_BUG_MDS) || cpu_mitigations_off()) {
@@ -250,6 +288,9 @@ static void __init mds_select_mitigation(void)
return;
}
+ if (mds_mitigation == MDS_MITIGATION_AUTO)
+ mds_mitigation = MDS_MITIGATION_FULL;
+
if (mds_mitigation == MDS_MITIGATION_FULL) {
if (!boot_cpu_has(X86_FEATURE_MD_CLEAR))
mds_mitigation = MDS_MITIGATION_VMWERV;
@@ -286,16 +327,6 @@ early_param("mds", mds_cmdline);
#undef pr_fmt
#define pr_fmt(fmt) "TAA: " fmt
-enum taa_mitigations {
- TAA_MITIGATION_OFF,
- TAA_MITIGATION_UCODE_NEEDED,
- TAA_MITIGATION_VERW,
- TAA_MITIGATION_TSX_DISABLED,
-};
-
-/* Default mitigation for TAA-affected CPUs */
-static enum taa_mitigations taa_mitigation __ro_after_init =
- IS_ENABLED(CONFIG_MITIGATION_TAA) ? TAA_MITIGATION_VERW : TAA_MITIGATION_OFF;
static bool taa_nosmt __ro_after_init;
static const char * const taa_strings[] = {
@@ -386,15 +417,6 @@ early_param("tsx_async_abort", tsx_async_abort_parse_cmdline);
#undef pr_fmt
#define pr_fmt(fmt) "MMIO Stale Data: " fmt
-enum mmio_mitigations {
- MMIO_MITIGATION_OFF,
- MMIO_MITIGATION_UCODE_NEEDED,
- MMIO_MITIGATION_VERW,
-};
-
-/* Default mitigation for Processor MMIO Stale Data vulnerabilities */
-static enum mmio_mitigations mmio_mitigation __ro_after_init =
- IS_ENABLED(CONFIG_MITIGATION_MMIO_STALE_DATA) ? MMIO_MITIGATION_VERW : MMIO_MITIGATION_OFF;
static bool mmio_nosmt __ro_after_init = false;
static const char * const mmio_strings[] = {
@@ -483,16 +505,6 @@ early_param("mmio_stale_data", mmio_stale_data_parse_cmdline);
#undef pr_fmt
#define pr_fmt(fmt) "Register File Data Sampling: " fmt
-enum rfds_mitigations {
- RFDS_MITIGATION_OFF,
- RFDS_MITIGATION_VERW,
- RFDS_MITIGATION_UCODE_NEEDED,
-};
-
-/* Default mitigation for Register File Data Sampling */
-static enum rfds_mitigations rfds_mitigation __ro_after_init =
- IS_ENABLED(CONFIG_MITIGATION_RFDS) ? RFDS_MITIGATION_VERW : RFDS_MITIGATION_OFF;
-
static const char * const rfds_strings[] = {
[RFDS_MITIGATION_OFF] = "Vulnerable",
[RFDS_MITIGATION_VERW] = "Mitigation: Clear Register File",
@@ -508,6 +520,9 @@ static void __init rfds_select_mitigation(void)
if (rfds_mitigation == RFDS_MITIGATION_OFF)
return;
+ if (rfds_mitigation == RFDS_MITIGATION_AUTO)
+ rfds_mitigation = RFDS_MITIGATION_VERW;
+
if (x86_arch_cap_msr & ARCH_CAP_RFDS_CLEAR)
setup_force_cpu_cap(X86_FEATURE_CLEAR_CPU_BUF);
else
@@ -1293,9 +1308,13 @@ static __ro_after_init enum spectre_v2_mitigation_cmd spectre_v2_cmd;
static enum spectre_v2_user_cmd __init
spectre_v2_parse_user_cmdline(void)
{
+ enum spectre_v2_user_cmd mode;
char arg[20];
int ret, i;
+ mode = IS_ENABLED(CONFIG_MITIGATION_SPECTRE_V2) ?
+ SPECTRE_V2_USER_CMD_AUTO : SPECTRE_V2_USER_CMD_NONE;
+
switch (spectre_v2_cmd) {
case SPECTRE_V2_CMD_NONE:
return SPECTRE_V2_USER_CMD_NONE;
@@ -1308,7 +1327,7 @@ spectre_v2_parse_user_cmdline(void)
ret = cmdline_find_option(boot_command_line, "spectre_v2_user",
arg, sizeof(arg));
if (ret < 0)
- return SPECTRE_V2_USER_CMD_AUTO;
+ return mode;
for (i = 0; i < ARRAY_SIZE(v2_user_options); i++) {
if (match_option(arg, ret, v2_user_options[i].option)) {
@@ -1318,8 +1337,8 @@ spectre_v2_parse_user_cmdline(void)
}
}
- pr_err("Unknown user space protection option (%s). Switching to AUTO select\n", arg);
- return SPECTRE_V2_USER_CMD_AUTO;
+ pr_err("Unknown user space protection option (%s). Switching to default\n", arg);
+ return mode;
}
static inline bool spectre_v2_in_ibrs_mode(enum spectre_v2_mitigation mode)
@@ -1331,16 +1350,11 @@ static void __init
spectre_v2_user_select_mitigation(void)
{
enum spectre_v2_user_mitigation mode = SPECTRE_V2_USER_NONE;
- bool smt_possible = IS_ENABLED(CONFIG_SMP);
enum spectre_v2_user_cmd cmd;
if (!boot_cpu_has(X86_FEATURE_IBPB) && !boot_cpu_has(X86_FEATURE_STIBP))
return;
- if (cpu_smt_control == CPU_SMT_FORCE_DISABLED ||
- cpu_smt_control == CPU_SMT_NOT_SUPPORTED)
- smt_possible = false;
-
cmd = spectre_v2_parse_user_cmdline();
switch (cmd) {
case SPECTRE_V2_USER_CMD_NONE:
@@ -1364,7 +1378,7 @@ spectre_v2_user_select_mitigation(void)
/* Initialize Indirect Branch Prediction Barrier */
if (boot_cpu_has(X86_FEATURE_IBPB)) {
- setup_force_cpu_cap(X86_FEATURE_USE_IBPB);
+ static_branch_enable(&switch_vcpu_ibpb);
spectre_v2_user_ibpb = mode;
switch (cmd) {
@@ -1401,7 +1415,7 @@ spectre_v2_user_select_mitigation(void)
* so allow for STIBP to be selected in those cases.
*/
if (!boot_cpu_has(X86_FEATURE_STIBP) ||
- !smt_possible ||
+ !cpu_smt_possible() ||
(spectre_v2_in_eibrs_mode(spectre_v2_enabled) &&
!boot_cpu_has(X86_FEATURE_AUTOIBRS)))
return;
@@ -1973,6 +1987,7 @@ void cpu_bugs_smt_update(void)
switch (mds_mitigation) {
case MDS_MITIGATION_FULL:
+ case MDS_MITIGATION_AUTO:
case MDS_MITIGATION_VMWERV:
if (sched_smt_active() && !boot_cpu_has(X86_BUG_MSBDS_ONLY))
pr_warn_once(MDS_MSG_SMT);
@@ -1984,6 +1999,7 @@ void cpu_bugs_smt_update(void)
switch (taa_mitigation) {
case TAA_MITIGATION_VERW:
+ case TAA_MITIGATION_AUTO:
case TAA_MITIGATION_UCODE_NEEDED:
if (sched_smt_active())
pr_warn_once(TAA_MSG_SMT);
@@ -1995,6 +2011,7 @@ void cpu_bugs_smt_update(void)
switch (mmio_mitigation) {
case MMIO_MITIGATION_VERW:
+ case MMIO_MITIGATION_AUTO:
case MMIO_MITIGATION_UCODE_NEEDED:
if (sched_smt_active())
pr_warn_once(MMIO_MSG_SMT);
@@ -2522,6 +2539,7 @@ enum srso_mitigation {
SRSO_MITIGATION_SAFE_RET,
SRSO_MITIGATION_IBPB,
SRSO_MITIGATION_IBPB_ON_VMEXIT,
+ SRSO_MITIGATION_BP_SPEC_REDUCE,
};
enum srso_mitigation_cmd {
@@ -2539,7 +2557,8 @@ static const char * const srso_strings[] = {
[SRSO_MITIGATION_MICROCODE] = "Vulnerable: Microcode, no safe RET",
[SRSO_MITIGATION_SAFE_RET] = "Mitigation: Safe RET",
[SRSO_MITIGATION_IBPB] = "Mitigation: IBPB",
- [SRSO_MITIGATION_IBPB_ON_VMEXIT] = "Mitigation: IBPB on VMEXIT only"
+ [SRSO_MITIGATION_IBPB_ON_VMEXIT] = "Mitigation: IBPB on VMEXIT only",
+ [SRSO_MITIGATION_BP_SPEC_REDUCE] = "Mitigation: Reduced Speculation"
};
static enum srso_mitigation srso_mitigation __ro_after_init = SRSO_MITIGATION_NONE;
@@ -2578,7 +2597,7 @@ static void __init srso_select_mitigation(void)
srso_cmd == SRSO_CMD_OFF) {
if (boot_cpu_has(X86_FEATURE_SBPB))
x86_pred_cmd = PRED_CMD_SBPB;
- return;
+ goto out;
}
if (has_microcode) {
@@ -2590,7 +2609,7 @@ static void __init srso_select_mitigation(void)
*/
if (boot_cpu_data.x86 < 0x19 && !cpu_smt_possible()) {
setup_force_cpu_cap(X86_FEATURE_SRSO_NO);
- return;
+ goto out;
}
if (retbleed_mitigation == RETBLEED_MITIGATION_IBPB) {
@@ -2670,6 +2689,12 @@ static void __init srso_select_mitigation(void)
ibpb_on_vmexit:
case SRSO_CMD_IBPB_ON_VMEXIT:
+ if (boot_cpu_has(X86_FEATURE_SRSO_BP_SPEC_REDUCE)) {
+ pr_notice("Reducing speculation to address VM/HV SRSO attack vector.\n");
+ srso_mitigation = SRSO_MITIGATION_BP_SPEC_REDUCE;
+ break;
+ }
+
if (IS_ENABLED(CONFIG_MITIGATION_IBPB_ENTRY)) {
if (has_microcode) {
setup_force_cpu_cap(X86_FEATURE_IBPB_ON_VMEXIT);
@@ -2691,7 +2716,15 @@ ibpb_on_vmexit:
}
out:
- pr_info("%s\n", srso_strings[srso_mitigation]);
+ /*
+ * Clear the feature flag if this mitigation is not selected as that
+ * feature flag controls the BpSpecReduce MSR bit toggling in KVM.
+ */
+ if (srso_mitigation != SRSO_MITIGATION_BP_SPEC_REDUCE)
+ setup_clear_cpu_cap(X86_FEATURE_SRSO_BP_SPEC_REDUCE);
+
+ if (srso_mitigation != SRSO_MITIGATION_NONE)
+ pr_info("%s\n", srso_strings[srso_mitigation]);
}
#undef pr_fmt
diff --git a/arch/x86/kernel/cpu/bus_lock.c b/arch/x86/kernel/cpu/bus_lock.c
index 97222efb4d2a..237faf7e700c 100644
--- a/arch/x86/kernel/cpu/bus_lock.c
+++ b/arch/x86/kernel/cpu/bus_lock.c
@@ -201,6 +201,26 @@ static void __split_lock_reenable(struct work_struct *work)
static DEFINE_PER_CPU(struct delayed_work, sl_reenable);
/*
+ * Per-CPU delayed_work can't be statically initialized properly because
+ * the struct address is unknown. Thus per-CPU delayed_work structures
+ * have to be initialized during kernel initialization and after calling
+ * setup_per_cpu_areas().
+ */
+static int __init setup_split_lock_delayed_work(void)
+{
+ unsigned int cpu;
+
+ for_each_possible_cpu(cpu) {
+ struct delayed_work *work = per_cpu_ptr(&sl_reenable, cpu);
+
+ INIT_DELAYED_WORK(work, __split_lock_reenable);
+ }
+
+ return 0;
+}
+pure_initcall(setup_split_lock_delayed_work);
+
+/*
* If a CPU goes offline with pending delayed work to re-enable split lock
* detection then the delayed work will be executed on some other CPU. That
* handles releasing the buslock_sem, but because it executes on a
@@ -219,15 +239,16 @@ static int splitlock_cpu_offline(unsigned int cpu)
static void split_lock_warn(unsigned long ip)
{
- struct delayed_work *work = NULL;
+ struct delayed_work *work;
int cpu;
+ unsigned int saved_sld_mitigate = READ_ONCE(sysctl_sld_mitigate);
if (!current->reported_split_lock)
pr_warn_ratelimited("#AC: %s/%d took a split_lock trap at address: 0x%lx\n",
current->comm, current->pid, ip);
current->reported_split_lock = 1;
- if (sysctl_sld_mitigate) {
+ if (saved_sld_mitigate) {
/*
* misery factor #1:
* sleep 10ms before trying to execute split lock.
@@ -240,18 +261,10 @@ static void split_lock_warn(unsigned long ip)
*/
if (down_interruptible(&buslock_sem) == -EINTR)
return;
- work = &sl_reenable_unlock;
}
cpu = get_cpu();
-
- if (!work) {
- work = this_cpu_ptr(&sl_reenable);
- /* Deferred initialization of per-CPU struct */
- if (!work->work.func)
- INIT_DELAYED_WORK(work, __split_lock_reenable);
- }
-
+ work = saved_sld_mitigate ? &sl_reenable_unlock : per_cpu_ptr(&sl_reenable, cpu);
schedule_delayed_work_on(cpu, work, 2);
/* Disable split lock detection on this CPU to make progress */
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 73565168fc19..12126adbc3a9 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -1332,8 +1332,10 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c)
setup_force_cpu_bug(X86_BUG_SPECTRE_V1);
- if (!cpu_matches(cpu_vuln_whitelist, NO_SPECTRE_V2))
+ if (!cpu_matches(cpu_vuln_whitelist, NO_SPECTRE_V2)) {
setup_force_cpu_bug(X86_BUG_SPECTRE_V2);
+ setup_force_cpu_bug(X86_BUG_SPECTRE_V2_USER);
+ }
if (!cpu_matches(cpu_vuln_whitelist, NO_SSB) &&
!(x86_arch_cap_msr & ARCH_CAP_SSB_NO) &&
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index e5d814703406..a6493f60b3f2 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -514,6 +514,25 @@ static void init_intel_misc_features(struct cpuinfo_x86 *c)
wrmsrl(MSR_MISC_FEATURES_ENABLES, msr);
}
+/*
+ * This is a list of Intel CPUs that are known to suffer from downclocking when
+ * ZMM registers (512-bit vectors) are used. On these CPUs, when the kernel
+ * executes SIMD-optimized code such as cryptography functions or CRCs, it
+ * should prefer 256-bit (YMM) code to 512-bit (ZMM) code.
+ */
+static const struct x86_cpu_id zmm_exclusion_list[] = {
+ X86_MATCH_VFM(INTEL_SKYLAKE_X, 0),
+ X86_MATCH_VFM(INTEL_ICELAKE_X, 0),
+ X86_MATCH_VFM(INTEL_ICELAKE_D, 0),
+ X86_MATCH_VFM(INTEL_ICELAKE, 0),
+ X86_MATCH_VFM(INTEL_ICELAKE_L, 0),
+ X86_MATCH_VFM(INTEL_ICELAKE_NNPI, 0),
+ X86_MATCH_VFM(INTEL_TIGERLAKE_L, 0),
+ X86_MATCH_VFM(INTEL_TIGERLAKE, 0),
+ /* Allow Rocket Lake and later, and Sapphire Rapids and later. */
+ {},
+};
+
static void init_intel(struct cpuinfo_x86 *c)
{
early_init_intel(c);
@@ -592,6 +611,9 @@ static void init_intel(struct cpuinfo_x86 *c)
}
#endif
+ if (x86_match_cpu(zmm_exclusion_list))
+ set_cpu_cap(c, X86_FEATURE_PREFER_YMM);
+
/* Work around errata */
srat_detect_node(c);
diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c
index 0dc00c9894c7..f6fd71b64b66 100644
--- a/arch/x86/kernel/cpu/mce/core.c
+++ b/arch/x86/kernel/cpu/mce/core.c
@@ -584,6 +584,28 @@ bool mce_is_correctable(struct mce *m)
}
EXPORT_SYMBOL_GPL(mce_is_correctable);
+/*
+ * Notify the user(s) about new machine check events.
+ * Can be called from interrupt context, but not from machine check/NMI
+ * context.
+ */
+static bool mce_notify_irq(void)
+{
+ /* Not more than two messages every minute */
+ static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2);
+
+ if (test_and_clear_bit(0, &mce_need_notify)) {
+ mce_work_trigger();
+
+ if (__ratelimit(&ratelimit))
+ pr_info(HW_ERR "Machine check events logged\n");
+
+ return true;
+ }
+
+ return false;
+}
+
static int mce_early_notifier(struct notifier_block *nb, unsigned long val,
void *data)
{
@@ -1764,36 +1786,14 @@ void mce_timer_kick(bool storm)
__this_cpu_write(mce_next_interval, check_interval * HZ);
}
-/* Must not be called in IRQ context where del_timer_sync() can deadlock */
+/* Must not be called in IRQ context where timer_delete_sync() can deadlock */
static void mce_timer_delete_all(void)
{
int cpu;
for_each_online_cpu(cpu)
- del_timer_sync(&per_cpu(mce_timer, cpu));
-}
-
-/*
- * Notify the user(s) about new machine check events.
- * Can be called from interrupt context, but not from machine check/NMI
- * context.
- */
-bool mce_notify_irq(void)
-{
- /* Not more than two messages every minute */
- static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2);
-
- if (test_and_clear_bit(0, &mce_need_notify)) {
- mce_work_trigger();
-
- if (__ratelimit(&ratelimit))
- pr_info(HW_ERR "Machine check events logged\n");
-
- return true;
- }
- return false;
+ timer_delete_sync(&per_cpu(mce_timer, cpu));
}
-EXPORT_SYMBOL_GPL(mce_notify_irq);
static void __mcheck_cpu_mce_banks_init(void)
{
@@ -2820,7 +2820,7 @@ static int mce_cpu_pre_down(unsigned int cpu)
struct timer_list *t = this_cpu_ptr(&mce_timer);
mce_disable_cpu();
- del_timer_sync(t);
+ timer_delete_sync(t);
mce_threshold_remove_device(cpu);
mce_device_remove(cpu);
return 0;
diff --git a/arch/x86/kernel/cpu/mce/inject.c b/arch/x86/kernel/cpu/mce/inject.c
index 313fe682db33..06e3cf7229ce 100644
--- a/arch/x86/kernel/cpu/mce/inject.c
+++ b/arch/x86/kernel/cpu/mce/inject.c
@@ -229,7 +229,6 @@ static int raise_local(void)
} else if (m->status) {
pr_info("Starting machine check poll CPU %d\n", cpu);
raise_poll(m);
- mce_notify_irq();
pr_info("Machine check poll done on CPU %d\n", cpu);
} else
m->finished = 0;
diff --git a/arch/x86/kernel/cpu/mce/severity.c b/arch/x86/kernel/cpu/mce/severity.c
index dac4d64dfb2a..2235a7477436 100644
--- a/arch/x86/kernel/cpu/mce/severity.c
+++ b/arch/x86/kernel/cpu/mce/severity.c
@@ -300,13 +300,12 @@ static noinstr int error_context(struct mce *m, struct pt_regs *regs)
copy_user = is_copy_from_user(regs);
instrumentation_end();
- switch (fixup_type) {
- case EX_TYPE_UACCESS:
- if (!copy_user)
- return IN_KERNEL;
- m->kflags |= MCE_IN_KERNEL_COPYIN;
- fallthrough;
+ if (copy_user) {
+ m->kflags |= MCE_IN_KERNEL_COPYIN | MCE_IN_KERNEL_RECOV;
+ return IN_KERNEL_RECOV;
+ }
+ switch (fixup_type) {
case EX_TYPE_FAULT_MCE_SAFE:
case EX_TYPE_DEFAULT_MCE_SAFE:
m->kflags |= MCE_IN_KERNEL_RECOV;
diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c
index 138689b8e1d8..b61028cf5c8a 100644
--- a/arch/x86/kernel/cpu/microcode/amd.c
+++ b/arch/x86/kernel/cpu/microcode/amd.c
@@ -600,7 +600,7 @@ static bool __apply_microcode_amd(struct microcode_amd *mc, u32 *cur_rev,
unsigned long p_addr = (unsigned long)&mc->hdr.data_code;
if (!verify_sha256_digest(mc->hdr.patch_id, *cur_rev, (const u8 *)p_addr, psize))
- return -1;
+ return false;
native_wrmsrl(MSR_AMD64_PATCH_LOADER, p_addr);
diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c
index f285757618fc..3e2533954675 100644
--- a/arch/x86/kernel/cpu/mshyperv.c
+++ b/arch/x86/kernel/cpu/mshyperv.c
@@ -33,8 +33,6 @@
#include <asm/numa.h>
#include <asm/svm.h>
-/* Is Linux running as the root partition? */
-bool hv_root_partition;
/* Is Linux running on nested Microsoft Hypervisor */
bool hv_nested;
struct ms_hyperv_info ms_hyperv;
@@ -109,6 +107,7 @@ void hv_set_msr(unsigned int reg, u64 value)
}
EXPORT_SYMBOL_GPL(hv_set_msr);
+static void (*mshv_handler)(void);
static void (*vmbus_handler)(void);
static void (*hv_stimer0_handler)(void);
static void (*hv_kexec_handler)(void);
@@ -119,6 +118,9 @@ DEFINE_IDTENTRY_SYSVEC(sysvec_hyperv_callback)
struct pt_regs *old_regs = set_irq_regs(regs);
inc_irq_stat(irq_hv_callback_count);
+ if (mshv_handler)
+ mshv_handler();
+
if (vmbus_handler)
vmbus_handler();
@@ -128,6 +130,11 @@ DEFINE_IDTENTRY_SYSVEC(sysvec_hyperv_callback)
set_irq_regs(old_regs);
}
+void hv_setup_mshv_handler(void (*handler)(void))
+{
+ mshv_handler = handler;
+}
+
void hv_setup_vmbus_handler(void (*handler)(void))
{
vmbus_handler = handler;
@@ -422,6 +429,7 @@ int hv_get_hypervisor_version(union hv_hypervisor_version_info *info)
return 0;
}
+EXPORT_SYMBOL_GPL(hv_get_hypervisor_version);
static void __init ms_hyperv_init_platform(void)
{
@@ -436,13 +444,15 @@ static void __init ms_hyperv_init_platform(void)
*/
ms_hyperv.features = cpuid_eax(HYPERV_CPUID_FEATURES);
ms_hyperv.priv_high = cpuid_ebx(HYPERV_CPUID_FEATURES);
+ ms_hyperv.ext_features = cpuid_ecx(HYPERV_CPUID_FEATURES);
ms_hyperv.misc_features = cpuid_edx(HYPERV_CPUID_FEATURES);
ms_hyperv.hints = cpuid_eax(HYPERV_CPUID_ENLIGHTMENT_INFO);
hv_max_functions_eax = cpuid_eax(HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS);
- pr_info("Hyper-V: privilege flags low 0x%x, high 0x%x, hints 0x%x, misc 0x%x\n",
- ms_hyperv.features, ms_hyperv.priv_high, ms_hyperv.hints,
+ pr_info("Hyper-V: privilege flags low %#x, high %#x, ext %#x, hints %#x, misc %#x\n",
+ ms_hyperv.features, ms_hyperv.priv_high,
+ ms_hyperv.ext_features, ms_hyperv.hints,
ms_hyperv.misc_features);
ms_hyperv.max_vp_index = cpuid_eax(HYPERV_CPUID_IMPLEMENT_LIMITS);
@@ -451,25 +461,7 @@ static void __init ms_hyperv_init_platform(void)
pr_debug("Hyper-V: max %u virtual processors, %u logical processors\n",
ms_hyperv.max_vp_index, ms_hyperv.max_lp_index);
- /*
- * Check CPU management privilege.
- *
- * To mirror what Windows does we should extract CPU management
- * features and use the ReservedIdentityBit to detect if Linux is the
- * root partition. But that requires negotiating CPU management
- * interface (a process to be finalized). For now, use the privilege
- * flag as the indicator for running as root.
- *
- * Hyper-V should never specify running as root and as a Confidential
- * VM. But to protect against a compromised/malicious Hyper-V trying
- * to exploit root behavior to expose Confidential VM memory, ignore
- * the root partition setting if also a Confidential VM.
- */
- if ((ms_hyperv.priv_high & HV_CPU_MANAGEMENT) &&
- !(ms_hyperv.priv_high & HV_ISOLATION)) {
- hv_root_partition = true;
- pr_info("Hyper-V: running as root partition\n");
- }
+ hv_identify_partition_type();
if (ms_hyperv.hints & HV_X64_HYPERV_NESTED) {
hv_nested = true;
@@ -618,7 +610,7 @@ static void __init ms_hyperv_init_platform(void)
# ifdef CONFIG_SMP
smp_ops.smp_prepare_boot_cpu = hv_smp_prepare_boot_cpu;
- if (hv_root_partition ||
+ if (hv_root_partition() ||
(!ms_hyperv.paravisor_present && hv_isolation_type_snp()))
smp_ops.smp_prepare_cpus = hv_smp_prepare_cpus;
# endif
diff --git a/arch/x86/kernel/cpu/proc.c b/arch/x86/kernel/cpu/proc.c
index 41ed01f46bd9..6571d432cbe3 100644
--- a/arch/x86/kernel/cpu/proc.c
+++ b/arch/x86/kernel/cpu/proc.c
@@ -86,9 +86,12 @@ static int show_cpuinfo(struct seq_file *m, void *v)
seq_printf(m, "microcode\t: 0x%x\n", c->microcode);
if (cpu_has(c, X86_FEATURE_TSC)) {
- unsigned int freq = arch_freq_get_on_cpu(cpu);
+ int freq = arch_freq_get_on_cpu(cpu);
- seq_printf(m, "cpu MHz\t\t: %u.%03u\n", freq / 1000, (freq % 1000));
+ if (freq < 0)
+ seq_puts(m, "cpu MHz\t\t: Unknown\n");
+ else
+ seq_printf(m, "cpu MHz\t\t: %u.%03u\n", freq / 1000, (freq % 1000));
}
/* Cache size */
diff --git a/arch/x86/kernel/cpu/resctrl/Makefile b/arch/x86/kernel/cpu/resctrl/Makefile
index 4a06c37b9cf1..0c13b0befd8a 100644
--- a/arch/x86/kernel/cpu/resctrl/Makefile
+++ b/arch/x86/kernel/cpu/resctrl/Makefile
@@ -1,4 +1,5 @@
# SPDX-License-Identifier: GPL-2.0
-obj-$(CONFIG_X86_CPU_RESCTRL) += core.o rdtgroup.o monitor.o
-obj-$(CONFIG_X86_CPU_RESCTRL) += ctrlmondata.o pseudo_lock.o
+obj-$(CONFIG_X86_CPU_RESCTRL) += core.o rdtgroup.o monitor.o
+obj-$(CONFIG_X86_CPU_RESCTRL) += ctrlmondata.o
+obj-$(CONFIG_RESCTRL_FS_PSEUDO_LOCK) += pseudo_lock.o
CFLAGS_pseudo_lock.o = -I$(src)
diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c
index 3d1735ed8d1f..cf29681d01e0 100644
--- a/arch/x86/kernel/cpu/resctrl/core.c
+++ b/arch/x86/kernel/cpu/resctrl/core.c
@@ -44,12 +44,6 @@ static DEFINE_MUTEX(domain_list_lock);
DEFINE_PER_CPU(struct resctrl_pqr_state, pqr_state);
/*
- * Used to store the max resource name width and max resource data width
- * to display the schemata in a tabular format
- */
-int max_name_width, max_data_width;
-
-/*
* Global boolean for rdt_alloc which is true if any
* resource allocation is enabled.
*/
@@ -62,7 +56,7 @@ static void mba_wrmsr_amd(struct msr_param *m);
#define ctrl_domain_init(id) LIST_HEAD_INIT(rdt_resources_all[id].r_resctrl.ctrl_domains)
#define mon_domain_init(id) LIST_HEAD_INIT(rdt_resources_all[id].r_resctrl.mon_domains)
-struct rdt_hw_resource rdt_resources_all[] = {
+struct rdt_hw_resource rdt_resources_all[RDT_NUM_RESOURCES] = {
[RDT_RESOURCE_L3] =
{
.r_resctrl = {
@@ -72,9 +66,7 @@ struct rdt_hw_resource rdt_resources_all[] = {
.mon_scope = RESCTRL_L3_CACHE,
.ctrl_domains = ctrl_domain_init(RDT_RESOURCE_L3),
.mon_domains = mon_domain_init(RDT_RESOURCE_L3),
- .parse_ctrlval = parse_cbm,
- .format_str = "%d=%0*x",
- .fflags = RFTYPE_RES_CACHE,
+ .schema_fmt = RESCTRL_SCHEMA_BITMAP,
},
.msr_base = MSR_IA32_L3_CBM_BASE,
.msr_update = cat_wrmsr,
@@ -86,9 +78,7 @@ struct rdt_hw_resource rdt_resources_all[] = {
.name = "L2",
.ctrl_scope = RESCTRL_L2_CACHE,
.ctrl_domains = ctrl_domain_init(RDT_RESOURCE_L2),
- .parse_ctrlval = parse_cbm,
- .format_str = "%d=%0*x",
- .fflags = RFTYPE_RES_CACHE,
+ .schema_fmt = RESCTRL_SCHEMA_BITMAP,
},
.msr_base = MSR_IA32_L2_CBM_BASE,
.msr_update = cat_wrmsr,
@@ -100,9 +90,7 @@ struct rdt_hw_resource rdt_resources_all[] = {
.name = "MB",
.ctrl_scope = RESCTRL_L3_CACHE,
.ctrl_domains = ctrl_domain_init(RDT_RESOURCE_MBA),
- .parse_ctrlval = parse_bw,
- .format_str = "%d=%*u",
- .fflags = RFTYPE_RES_MB,
+ .schema_fmt = RESCTRL_SCHEMA_RANGE,
},
},
[RDT_RESOURCE_SMBA] =
@@ -112,9 +100,7 @@ struct rdt_hw_resource rdt_resources_all[] = {
.name = "SMBA",
.ctrl_scope = RESCTRL_L3_CACHE,
.ctrl_domains = ctrl_domain_init(RDT_RESOURCE_SMBA),
- .parse_ctrlval = parse_bw,
- .format_str = "%d=%*u",
- .fflags = RFTYPE_RES_MB,
+ .schema_fmt = RESCTRL_SCHEMA_RANGE,
},
},
};
@@ -127,6 +113,14 @@ u32 resctrl_arch_system_num_rmid_idx(void)
return r->num_rmid;
}
+struct rdt_resource *resctrl_arch_get_resource(enum resctrl_res_level l)
+{
+ if (l >= RDT_NUM_RESOURCES)
+ return NULL;
+
+ return &rdt_resources_all[l].r_resctrl;
+}
+
/*
* cache_alloc_hsw_probe() - Have to probe for Intel haswell server CPUs
* as they do not have CPUID enumeration support for Cache allocation.
@@ -161,7 +155,6 @@ static inline void cache_alloc_hsw_probe(void)
return;
hw_res->num_closid = 4;
- r->default_ctrl = max_cbm;
r->cache.cbm_len = 20;
r->cache.shareable_bits = 0xc0000;
r->cache.min_cbm_bits = 2;
@@ -174,7 +167,7 @@ static inline void cache_alloc_hsw_probe(void)
bool is_mba_sc(struct rdt_resource *r)
{
if (!r)
- return rdt_resources_all[RDT_RESOURCE_MBA].r_resctrl.membw.mba_sc;
+ r = resctrl_arch_get_resource(RDT_RESOURCE_MBA);
/*
* The software controller support is only applicable to MBA resource.
@@ -217,7 +210,7 @@ static __init bool __get_mem_config_intel(struct rdt_resource *r)
cpuid_count(0x00000010, 3, &eax.full, &ebx, &ecx, &edx.full);
hw_res->num_closid = edx.split.cos_max + 1;
max_delay = eax.split.max_delay + 1;
- r->default_ctrl = MAX_MBA_BW;
+ r->membw.max_bw = MAX_MBA_BW;
r->membw.arch_needs_linear = true;
if (ecx & MBA_IS_LINEAR) {
r->membw.delay_linear = true;
@@ -228,16 +221,12 @@ static __init bool __get_mem_config_intel(struct rdt_resource *r)
return false;
r->membw.arch_needs_linear = false;
}
- r->data_width = 3;
if (boot_cpu_has(X86_FEATURE_PER_THREAD_MBA))
r->membw.throttle_mode = THREAD_THROTTLE_PER_THREAD;
else
r->membw.throttle_mode = THREAD_THROTTLE_MAX;
- resctrl_file_fflags_init("thread_throttle_mode",
- RFTYPE_CTRL_INFO | RFTYPE_RES_MB);
-
r->alloc_capable = true;
return true;
@@ -256,7 +245,7 @@ static __init bool __rdt_get_mem_config_amd(struct rdt_resource *r)
cpuid_count(0x80000020, subleaf, &eax, &ebx, &ecx, &edx);
hw_res->num_closid = edx + 1;
- r->default_ctrl = 1 << eax;
+ r->membw.max_bw = 1 << eax;
/* AMD does not use delay */
r->membw.delay_linear = false;
@@ -269,8 +258,6 @@ static __init bool __rdt_get_mem_config_amd(struct rdt_resource *r)
r->membw.throttle_mode = THREAD_THROTTLE_UNDEFINED;
r->membw.min_bw = 0;
r->membw.bw_gran = 1;
- /* Max value is 2048, Data width should be 4 in decimal */
- r->data_width = 4;
r->alloc_capable = true;
@@ -283,14 +270,13 @@ static void rdt_get_cache_alloc_cfg(int idx, struct rdt_resource *r)
union cpuid_0x10_1_eax eax;
union cpuid_0x10_x_ecx ecx;
union cpuid_0x10_x_edx edx;
- u32 ebx;
+ u32 ebx, default_ctrl;
cpuid_count(0x00000010, idx, &eax.full, &ebx, &ecx.full, &edx.full);
hw_res->num_closid = edx.split.cos_max + 1;
r->cache.cbm_len = eax.split.cbm_len + 1;
- r->default_ctrl = BIT_MASK(eax.split.cbm_len + 1) - 1;
- r->cache.shareable_bits = ebx & r->default_ctrl;
- r->data_width = (r->cache.cbm_len + 3) / 4;
+ default_ctrl = BIT_MASK(eax.split.cbm_len + 1) - 1;
+ r->cache.shareable_bits = ebx & default_ctrl;
if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
r->cache.arch_has_sparse_bitmasks = ecx.split.noncont;
r->alloc_capable = true;
@@ -337,7 +323,7 @@ static u32 delay_bw_map(unsigned long bw, struct rdt_resource *r)
return MAX_MBA_BW - bw;
pr_warn_once("Non Linear delay-bw map not supported but queried\n");
- return r->default_ctrl;
+ return MAX_MBA_BW;
}
static void mba_wrmsr_intel(struct msr_param *m)
@@ -361,36 +347,6 @@ static void cat_wrmsr(struct msr_param *m)
wrmsrl(hw_res->msr_base + i, hw_dom->ctrl_val[i]);
}
-struct rdt_ctrl_domain *get_ctrl_domain_from_cpu(int cpu, struct rdt_resource *r)
-{
- struct rdt_ctrl_domain *d;
-
- lockdep_assert_cpus_held();
-
- list_for_each_entry(d, &r->ctrl_domains, hdr.list) {
- /* Find the domain that contains this CPU */
- if (cpumask_test_cpu(cpu, &d->hdr.cpu_mask))
- return d;
- }
-
- return NULL;
-}
-
-struct rdt_mon_domain *get_mon_domain_from_cpu(int cpu, struct rdt_resource *r)
-{
- struct rdt_mon_domain *d;
-
- lockdep_assert_cpus_held();
-
- list_for_each_entry(d, &r->mon_domains, hdr.list) {
- /* Find the domain that contains this CPU */
- if (cpumask_test_cpu(cpu, &d->hdr.cpu_mask))
- return d;
- }
-
- return NULL;
-}
-
u32 resctrl_arch_get_num_closid(struct rdt_resource *r)
{
return resctrl_to_arch_res(r)->num_closid;
@@ -405,36 +361,6 @@ void rdt_ctrl_update(void *arg)
hw_res->msr_update(m);
}
-/*
- * rdt_find_domain - Search for a domain id in a resource domain list.
- *
- * Search the domain list to find the domain id. If the domain id is
- * found, return the domain. NULL otherwise. If the domain id is not
- * found (and NULL returned) then the first domain with id bigger than
- * the input id can be returned to the caller via @pos.
- */
-struct rdt_domain_hdr *rdt_find_domain(struct list_head *h, int id,
- struct list_head **pos)
-{
- struct rdt_domain_hdr *d;
- struct list_head *l;
-
- list_for_each(l, h) {
- d = list_entry(l, struct rdt_domain_hdr, list);
- /* When id is found, return its domain. */
- if (id == d->id)
- return d;
- /* Stop searching when finding id's position in sorted list. */
- if (id < d->id)
- break;
- }
-
- if (pos)
- *pos = l;
-
- return NULL;
-}
-
static void setup_default_ctrlval(struct rdt_resource *r, u32 *dc)
{
struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
@@ -446,7 +372,7 @@ static void setup_default_ctrlval(struct rdt_resource *r, u32 *dc)
* For Memory Allocation: Set b/w requested to 100%
*/
for (i = 0; i < hw_res->num_closid; i++, dc++)
- *dc = r->default_ctrl;
+ *dc = resctrl_get_default_ctrl(r);
}
static void ctrl_domain_free(struct rdt_hw_ctrl_domain *hw_dom)
@@ -494,13 +420,13 @@ static int arch_domain_mbm_alloc(u32 num_rmid, struct rdt_hw_mon_domain *hw_dom)
{
size_t tsize;
- if (is_mbm_total_enabled()) {
+ if (resctrl_arch_is_mbm_total_enabled()) {
tsize = sizeof(*hw_dom->arch_mbm_total);
hw_dom->arch_mbm_total = kcalloc(num_rmid, tsize, GFP_KERNEL);
if (!hw_dom->arch_mbm_total)
return -ENOMEM;
}
- if (is_mbm_local_enabled()) {
+ if (resctrl_arch_is_mbm_local_enabled()) {
tsize = sizeof(*hw_dom->arch_mbm_local);
hw_dom->arch_mbm_local = kcalloc(num_rmid, tsize, GFP_KERNEL);
if (!hw_dom->arch_mbm_local) {
@@ -545,7 +471,7 @@ static void domain_add_cpu_ctrl(int cpu, struct rdt_resource *r)
return;
}
- hdr = rdt_find_domain(&r->ctrl_domains, id, &add_pos);
+ hdr = resctrl_find_domain(&r->ctrl_domains, id, &add_pos);
if (hdr) {
if (WARN_ON_ONCE(hdr->type != RESCTRL_CTRL_DOMAIN))
return;
@@ -600,7 +526,7 @@ static void domain_add_cpu_mon(int cpu, struct rdt_resource *r)
return;
}
- hdr = rdt_find_domain(&r->mon_domains, id, &add_pos);
+ hdr = resctrl_find_domain(&r->mon_domains, id, &add_pos);
if (hdr) {
if (WARN_ON_ONCE(hdr->type != RESCTRL_MON_DOMAIN))
return;
@@ -665,7 +591,7 @@ static void domain_remove_cpu_ctrl(int cpu, struct rdt_resource *r)
return;
}
- hdr = rdt_find_domain(&r->ctrl_domains, id, NULL);
+ hdr = resctrl_find_domain(&r->ctrl_domains, id, NULL);
if (!hdr) {
pr_warn("Can't find control domain for id=%d for CPU %d for resource %s\n",
id, cpu, r->name);
@@ -711,7 +637,7 @@ static void domain_remove_cpu_mon(int cpu, struct rdt_resource *r)
return;
}
- hdr = rdt_find_domain(&r->mon_domains, id, NULL);
+ hdr = resctrl_find_domain(&r->mon_domains, id, NULL);
if (!hdr) {
pr_warn("Can't find monitor domain for id=%d for CPU %d for resource %s\n",
id, cpu, r->name);
@@ -786,20 +712,6 @@ static int resctrl_arch_offline_cpu(unsigned int cpu)
return 0;
}
-/*
- * Choose a width for the resource name and resource data based on the
- * resource that has widest name and cbm.
- */
-static __init void rdt_init_padding(void)
-{
- struct rdt_resource *r;
-
- for_each_alloc_capable_rdt_resource(r) {
- if (r->data_width > max_data_width)
- max_data_width = r->data_width;
- }
-}
-
enum {
RDT_FLAG_CMT,
RDT_FLAG_MBM_TOTAL,
@@ -885,6 +797,21 @@ bool __init rdt_cpu_has(int flag)
return ret;
}
+__init bool resctrl_arch_is_evt_configurable(enum resctrl_event_id evt)
+{
+ if (!rdt_cpu_has(X86_FEATURE_BMEC))
+ return false;
+
+ switch (evt) {
+ case QOS_L3_MBM_TOTAL_EVENT_ID:
+ return rdt_cpu_has(X86_FEATURE_CQM_MBM_TOTAL);
+ case QOS_L3_MBM_LOCAL_EVENT_ID:
+ return rdt_cpu_has(X86_FEATURE_CQM_MBM_LOCAL);
+ default:
+ return false;
+ }
+}
+
static __init bool get_mem_config(void)
{
struct rdt_hw_resource *hw_res = &rdt_resources_all[RDT_RESOURCE_MBA];
@@ -963,11 +890,6 @@ static __init bool get_rdt_mon_resources(void)
if (!rdt_mon_features)
return false;
- if (is_mbm_local_enabled())
- mba_mbps_default_event = QOS_L3_MBM_LOCAL_EVENT_ID;
- else if (is_mbm_total_enabled())
- mba_mbps_default_event = QOS_L3_MBM_TOTAL_EVENT_ID;
-
return !rdt_get_mon_l3_config(r);
}
@@ -1086,7 +1008,7 @@ void resctrl_cpu_detect(struct cpuinfo_x86 *c)
}
}
-static int __init resctrl_late_init(void)
+static int __init resctrl_arch_late_init(void)
{
struct rdt_resource *r;
int state, ret;
@@ -1102,8 +1024,6 @@ static int __init resctrl_late_init(void)
if (!get_rdt_resources())
return -ENODEV;
- rdt_init_padding();
-
state = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN,
"x86/resctrl/cat:online:",
resctrl_arch_online_cpu,
@@ -1111,7 +1031,7 @@ static int __init resctrl_late_init(void)
if (state < 0)
return state;
- ret = rdtgroup_init();
+ ret = resctrl_init();
if (ret) {
cpuhp_remove_state(state);
return ret;
@@ -1127,18 +1047,13 @@ static int __init resctrl_late_init(void)
return 0;
}
-late_initcall(resctrl_late_init);
+late_initcall(resctrl_arch_late_init);
-static void __exit resctrl_exit(void)
+static void __exit resctrl_arch_exit(void)
{
- struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl;
-
cpuhp_remove_state(rdt_online);
- rdtgroup_exit();
-
- if (r->mon_capable)
- rdt_put_mon_l3_config();
+ resctrl_exit();
}
-__exitcall(resctrl_exit);
+__exitcall(resctrl_arch_exit);
diff --git a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c
index 536351159cc2..0a0ac5f6112e 100644
--- a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c
+++ b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c
@@ -23,6 +23,15 @@
#include "internal.h"
+struct rdt_parse_data {
+ struct rdtgroup *rdtgrp;
+ char *buf;
+};
+
+typedef int (ctrlval_parser_t)(struct rdt_parse_data *data,
+ struct resctrl_schema *s,
+ struct rdt_ctrl_domain *d);
+
/*
* Check whether MBA bandwidth percentage value is correct. The value is
* checked against the minimum and max bandwidth values specified by the
@@ -54,9 +63,9 @@ static bool bw_validate(char *buf, u32 *data, struct rdt_resource *r)
return true;
}
- if (bw < r->membw.min_bw || bw > r->default_ctrl) {
+ if (bw < r->membw.min_bw || bw > r->membw.max_bw) {
rdt_last_cmd_printf("MB value %u out of range [%d,%d]\n",
- bw, r->membw.min_bw, r->default_ctrl);
+ bw, r->membw.min_bw, r->membw.max_bw);
return false;
}
@@ -64,8 +73,8 @@ static bool bw_validate(char *buf, u32 *data, struct rdt_resource *r)
return true;
}
-int parse_bw(struct rdt_parse_data *data, struct resctrl_schema *s,
- struct rdt_ctrl_domain *d)
+static int parse_bw(struct rdt_parse_data *data, struct resctrl_schema *s,
+ struct rdt_ctrl_domain *d)
{
struct resctrl_staged_config *cfg;
u32 closid = data->rdtgrp->closid;
@@ -104,8 +113,9 @@ int parse_bw(struct rdt_parse_data *data, struct resctrl_schema *s,
*/
static bool cbm_validate(char *buf, u32 *data, struct rdt_resource *r)
{
- unsigned long first_bit, zero_bit, val;
+ u32 supported_bits = BIT_MASK(r->cache.cbm_len) - 1;
unsigned int cbm_len = r->cache.cbm_len;
+ unsigned long first_bit, zero_bit, val;
int ret;
ret = kstrtoul(buf, 16, &val);
@@ -114,7 +124,7 @@ static bool cbm_validate(char *buf, u32 *data, struct rdt_resource *r)
return false;
}
- if ((r->cache.min_cbm_bits > 0 && val == 0) || val > r->default_ctrl) {
+ if ((r->cache.min_cbm_bits > 0 && val == 0) || val > supported_bits) {
rdt_last_cmd_puts("Mask out of range\n");
return false;
}
@@ -143,8 +153,8 @@ static bool cbm_validate(char *buf, u32 *data, struct rdt_resource *r)
* Read one cache bit mask (hex). Check that it is valid for the current
* resource type.
*/
-int parse_cbm(struct rdt_parse_data *data, struct resctrl_schema *s,
- struct rdt_ctrl_domain *d)
+static int parse_cbm(struct rdt_parse_data *data, struct resctrl_schema *s,
+ struct rdt_ctrl_domain *d)
{
struct rdtgroup *rdtgrp = data->rdtgrp;
struct resctrl_staged_config *cfg;
@@ -210,6 +220,7 @@ static int parse_line(char *line, struct resctrl_schema *s,
struct rdtgroup *rdtgrp)
{
enum resctrl_conf_type t = s->conf_type;
+ ctrlval_parser_t *parse_ctrlval = NULL;
struct resctrl_staged_config *cfg;
struct rdt_resource *r = s->res;
struct rdt_parse_data data;
@@ -220,6 +231,18 @@ static int parse_line(char *line, struct resctrl_schema *s,
/* Walking r->domains, ensure it can't race with cpuhp */
lockdep_assert_cpus_held();
+ switch (r->schema_fmt) {
+ case RESCTRL_SCHEMA_BITMAP:
+ parse_ctrlval = &parse_cbm;
+ break;
+ case RESCTRL_SCHEMA_RANGE:
+ parse_ctrlval = &parse_bw;
+ break;
+ }
+
+ if (WARN_ON_ONCE(!parse_ctrlval))
+ return -EINVAL;
+
if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP &&
(r->rid == RDT_RESOURCE_MBA || r->rid == RDT_RESOURCE_SMBA)) {
rdt_last_cmd_puts("Cannot pseudo-lock MBA resource\n");
@@ -240,7 +263,7 @@ next:
if (d->hdr.id == dom_id) {
data.buf = dom;
data.rdtgrp = rdtgrp;
- if (r->parse_ctrlval(&data, s, d))
+ if (parse_ctrlval(&data, s, d))
return -EINVAL;
if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) {
cfg = &d->staged_config[t];
@@ -264,25 +287,12 @@ next:
return -EINVAL;
}
-static u32 get_config_index(u32 closid, enum resctrl_conf_type type)
-{
- switch (type) {
- default:
- case CDP_NONE:
- return closid;
- case CDP_CODE:
- return closid * 2 + 1;
- case CDP_DATA:
- return closid * 2;
- }
-}
-
int resctrl_arch_update_one(struct rdt_resource *r, struct rdt_ctrl_domain *d,
u32 closid, enum resctrl_conf_type t, u32 cfg_val)
{
struct rdt_hw_ctrl_domain *hw_dom = resctrl_to_arch_ctrl_dom(d);
struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
- u32 idx = get_config_index(closid, t);
+ u32 idx = resctrl_get_config_index(closid, t);
struct msr_param msr_param;
if (!cpumask_test_cpu(smp_processor_id(), &d->hdr.cpu_mask))
@@ -319,7 +329,7 @@ int resctrl_arch_update_domains(struct rdt_resource *r, u32 closid)
if (!cfg->have_new_ctrl)
continue;
- idx = get_config_index(closid, t);
+ idx = resctrl_get_config_index(closid, t);
if (cfg->new_ctrl == hw_dom->ctrl_val[idx])
continue;
hw_dom->ctrl_val[idx] = cfg->new_ctrl;
@@ -439,7 +449,7 @@ u32 resctrl_arch_get_config(struct rdt_resource *r, struct rdt_ctrl_domain *d,
u32 closid, enum resctrl_conf_type type)
{
struct rdt_hw_ctrl_domain *hw_dom = resctrl_to_arch_ctrl_dom(d);
- u32 idx = get_config_index(closid, type);
+ u32 idx = resctrl_get_config_index(closid, type);
return hw_dom->ctrl_val[idx];
}
@@ -465,8 +475,7 @@ static void show_doms(struct seq_file *s, struct resctrl_schema *schema, int clo
ctrl_val = resctrl_arch_get_config(r, dom, closid,
schema->conf_type);
- seq_printf(s, r->format_str, dom->hdr.id, max_data_width,
- ctrl_val);
+ seq_printf(s, schema->fmt_str, dom->hdr.id, ctrl_val);
sep = true;
}
seq_puts(s, "\n");
@@ -537,12 +546,12 @@ ssize_t rdtgroup_mba_mbps_event_write(struct kernfs_open_file *of,
rdt_last_cmd_clear();
if (!strcmp(buf, "mbm_local_bytes")) {
- if (is_mbm_local_enabled())
+ if (resctrl_arch_is_mbm_local_enabled())
rdtgrp->mba_mbps_event = QOS_L3_MBM_LOCAL_EVENT_ID;
else
ret = -EINVAL;
} else if (!strcmp(buf, "mbm_total_bytes")) {
- if (is_mbm_total_enabled())
+ if (resctrl_arch_is_mbm_total_enabled())
rdtgrp->mba_mbps_event = QOS_L3_MBM_TOTAL_EVENT_ID;
else
ret = -EINVAL;
@@ -588,6 +597,28 @@ int rdtgroup_mba_mbps_event_show(struct kernfs_open_file *of,
return ret;
}
+struct rdt_domain_hdr *resctrl_find_domain(struct list_head *h, int id,
+ struct list_head **pos)
+{
+ struct rdt_domain_hdr *d;
+ struct list_head *l;
+
+ list_for_each(l, h) {
+ d = list_entry(l, struct rdt_domain_hdr, list);
+ /* When id is found, return its domain. */
+ if (id == d->id)
+ return d;
+ /* Stop searching when finding id's position in sorted list. */
+ if (id < d->id)
+ break;
+ }
+
+ if (pos)
+ *pos = l;
+
+ return NULL;
+}
+
void mon_event_read(struct rmid_read *rr, struct rdt_resource *r,
struct rdt_mon_domain *d, struct rdtgroup *rdtgrp,
cpumask_t *cpumask, int evtid, int first)
@@ -649,7 +680,7 @@ int rdtgroup_mondata_show(struct seq_file *m, void *arg)
resid = md.u.rid;
domid = md.u.domid;
evtid = md.u.evtid;
- r = &rdt_resources_all[resid].r_resctrl;
+ r = resctrl_arch_get_resource(resid);
if (md.u.sum) {
/*
@@ -673,7 +704,7 @@ int rdtgroup_mondata_show(struct seq_file *m, void *arg)
* This file provides data from a single domain. Search
* the resource to find the domain with "domid".
*/
- hdr = rdt_find_domain(&r->mon_domains, domid, NULL);
+ hdr = resctrl_find_domain(&r->mon_domains, domid, NULL);
if (!hdr || WARN_ON_ONCE(hdr->type != RESCTRL_MON_DOMAIN)) {
ret = -ENOENT;
goto out;
diff --git a/arch/x86/kernel/cpu/resctrl/internal.h b/arch/x86/kernel/cpu/resctrl/internal.h
index 20c898f09b7e..eaae99602b61 100644
--- a/arch/x86/kernel/cpu/resctrl/internal.h
+++ b/arch/x86/kernel/cpu/resctrl/internal.h
@@ -32,30 +32,6 @@
*/
#define MBM_CNTR_WIDTH_OFFSET_MAX (62 - MBM_CNTR_WIDTH_BASE)
-/* Reads to Local DRAM Memory */
-#define READS_TO_LOCAL_MEM BIT(0)
-
-/* Reads to Remote DRAM Memory */
-#define READS_TO_REMOTE_MEM BIT(1)
-
-/* Non-Temporal Writes to Local Memory */
-#define NON_TEMP_WRITE_TO_LOCAL_MEM BIT(2)
-
-/* Non-Temporal Writes to Remote Memory */
-#define NON_TEMP_WRITE_TO_REMOTE_MEM BIT(3)
-
-/* Reads to Local Memory the system identifies as "Slow Memory" */
-#define READS_TO_LOCAL_S_MEM BIT(4)
-
-/* Reads to Remote Memory the system identifies as "Slow Memory" */
-#define READS_TO_REMOTE_S_MEM BIT(5)
-
-/* Dirty Victims to All Types of Memory */
-#define DIRTY_VICTIMS_TO_ALL_MEM BIT(6)
-
-/* Max event bits supported */
-#define MAX_EVT_CONFIG_BITS GENMASK(6, 0)
-
/**
* cpumask_any_housekeeping() - Choose any CPU in @mask, preferring those that
* aren't marked nohz_full
@@ -180,7 +156,6 @@ struct rmid_read {
void *arch_mon_ctx;
};
-extern unsigned int rdt_mon_features;
extern struct list_head resctrl_schema_all;
extern bool resctrl_mounted;
@@ -234,43 +209,6 @@ struct mongroup {
};
/**
- * struct pseudo_lock_region - pseudo-lock region information
- * @s: Resctrl schema for the resource to which this
- * pseudo-locked region belongs
- * @d: RDT domain to which this pseudo-locked region
- * belongs
- * @cbm: bitmask of the pseudo-locked region
- * @lock_thread_wq: waitqueue used to wait on the pseudo-locking thread
- * completion
- * @thread_done: variable used by waitqueue to test if pseudo-locking
- * thread completed
- * @cpu: core associated with the cache on which the setup code
- * will be run
- * @line_size: size of the cache lines
- * @size: size of pseudo-locked region in bytes
- * @kmem: the kernel memory associated with pseudo-locked region
- * @minor: minor number of character device associated with this
- * region
- * @debugfs_dir: pointer to this region's directory in the debugfs
- * filesystem
- * @pm_reqs: Power management QoS requests related to this region
- */
-struct pseudo_lock_region {
- struct resctrl_schema *s;
- struct rdt_ctrl_domain *d;
- u32 cbm;
- wait_queue_head_t lock_thread_wq;
- int thread_done;
- int cpu;
- unsigned int line_size;
- unsigned int size;
- void *kmem;
- unsigned int minor;
- struct dentry *debugfs_dir;
- struct list_head pm_reqs;
-};
-
-/**
* struct rdtgroup - store rdtgroup's data in resctrl file system.
* @kn: kernfs node
* @rdtgroup_list: linked list for all rdtgroups
@@ -326,10 +264,7 @@ struct rdtgroup {
/* List of all resource groups */
extern struct list_head rdt_all_groups;
-extern int max_name_width, max_data_width;
-
-int __init rdtgroup_init(void);
-void __exit rdtgroup_exit(void);
+extern int max_name_width;
/**
* struct rftype - describe each file in the resctrl file system
@@ -433,37 +368,6 @@ struct msr_param {
u32 high;
};
-static inline bool is_llc_occupancy_enabled(void)
-{
- return (rdt_mon_features & (1 << QOS_L3_OCCUP_EVENT_ID));
-}
-
-static inline bool is_mbm_total_enabled(void)
-{
- return (rdt_mon_features & (1 << QOS_L3_MBM_TOTAL_EVENT_ID));
-}
-
-static inline bool is_mbm_local_enabled(void)
-{
- return (rdt_mon_features & (1 << QOS_L3_MBM_LOCAL_EVENT_ID));
-}
-
-static inline bool is_mbm_enabled(void)
-{
- return (is_mbm_total_enabled() || is_mbm_local_enabled());
-}
-
-static inline bool is_mbm_event(int e)
-{
- return (e >= QOS_L3_MBM_TOTAL_EVENT_ID &&
- e <= QOS_L3_MBM_LOCAL_EVENT_ID);
-}
-
-struct rdt_parse_data {
- struct rdtgroup *rdtgrp;
- char *buf;
-};
-
/**
* struct rdt_hw_resource - arch private attributes of a resctrl resource
* @r_resctrl: Attributes of the resource used directly by resctrl.
@@ -476,8 +380,6 @@ struct rdt_parse_data {
* @msr_update: Function pointer to update QOS MSRs
* @mon_scale: cqm counter * mon_scale = occupancy in bytes
* @mbm_width: Monitor width, to detect and correct for overflow.
- * @mbm_cfg_mask: Bandwidth sources that can be tracked when Bandwidth
- * Monitoring Event Configuration (BMEC) is supported.
* @cdp_enabled: CDP state of this resource
*
* Members of this structure are either private to the architecture
@@ -491,7 +393,6 @@ struct rdt_hw_resource {
void (*msr_update)(struct msr_param *m);
unsigned int mon_scale;
unsigned int mbm_width;
- unsigned int mbm_cfg_mask;
bool cdp_enabled;
};
@@ -500,36 +401,18 @@ static inline struct rdt_hw_resource *resctrl_to_arch_res(struct rdt_resource *r
return container_of(r, struct rdt_hw_resource, r_resctrl);
}
-int parse_cbm(struct rdt_parse_data *data, struct resctrl_schema *s,
- struct rdt_ctrl_domain *d);
-int parse_bw(struct rdt_parse_data *data, struct resctrl_schema *s,
- struct rdt_ctrl_domain *d);
-
extern struct mutex rdtgroup_mutex;
+static inline const char *rdt_kn_name(const struct kernfs_node *kn)
+{
+ return rcu_dereference_check(kn->name, lockdep_is_held(&rdtgroup_mutex));
+}
+
extern struct rdt_hw_resource rdt_resources_all[];
extern struct rdtgroup rdtgroup_default;
extern struct dentry *debugfs_resctrl;
extern enum resctrl_event_id mba_mbps_default_event;
-enum resctrl_res_level {
- RDT_RESOURCE_L3,
- RDT_RESOURCE_L2,
- RDT_RESOURCE_MBA,
- RDT_RESOURCE_SMBA,
-
- /* Must be the last */
- RDT_NUM_RESOURCES,
-};
-
-static inline struct rdt_resource *resctrl_inc(struct rdt_resource *res)
-{
- struct rdt_hw_resource *hw_res = resctrl_to_arch_res(res);
-
- hw_res++;
- return &hw_res->r_resctrl;
-}
-
static inline bool resctrl_arch_get_cdp_enabled(enum resctrl_res_level l)
{
return rdt_resources_all[l].cdp_enabled;
@@ -539,27 +422,6 @@ int resctrl_arch_set_cdp_enabled(enum resctrl_res_level l, bool enable);
void arch_mon_domain_online(struct rdt_resource *r, struct rdt_mon_domain *d);
-/*
- * To return the common struct rdt_resource, which is contained in struct
- * rdt_hw_resource, walk the resctrl member of struct rdt_hw_resource.
- */
-#define for_each_rdt_resource(r) \
- for (r = &rdt_resources_all[0].r_resctrl; \
- r <= &rdt_resources_all[RDT_NUM_RESOURCES - 1].r_resctrl; \
- r = resctrl_inc(r))
-
-#define for_each_capable_rdt_resource(r) \
- for_each_rdt_resource(r) \
- if (r->alloc_capable || r->mon_capable)
-
-#define for_each_alloc_capable_rdt_resource(r) \
- for_each_rdt_resource(r) \
- if (r->alloc_capable)
-
-#define for_each_mon_capable_rdt_resource(r) \
- for_each_rdt_resource(r) \
- if (r->mon_capable)
-
/* CPUID.(EAX=10H, ECX=ResID=1).EAX */
union cpuid_0x10_1_eax {
struct {
@@ -604,8 +466,6 @@ void rdtgroup_kn_unlock(struct kernfs_node *kn);
int rdtgroup_kn_mode_restrict(struct rdtgroup *r, const char *name);
int rdtgroup_kn_mode_restore(struct rdtgroup *r, const char *name,
umode_t mask);
-struct rdt_domain_hdr *rdt_find_domain(struct list_head *h, int id,
- struct list_head **pos);
ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of,
char *buf, size_t nbytes, loff_t off);
int rdtgroup_schemata_show(struct kernfs_open_file *of,
@@ -620,28 +480,19 @@ unsigned int rdtgroup_cbm_to_size(struct rdt_resource *r, struct rdt_ctrl_domain
unsigned long cbm);
enum rdtgrp_mode rdtgroup_mode_by_closid(int closid);
int rdtgroup_tasks_assigned(struct rdtgroup *r);
-int rdtgroup_locksetup_enter(struct rdtgroup *rdtgrp);
-int rdtgroup_locksetup_exit(struct rdtgroup *rdtgrp);
-bool rdtgroup_cbm_overlaps_pseudo_locked(struct rdt_ctrl_domain *d, unsigned long cbm);
-bool rdtgroup_pseudo_locked_in_hierarchy(struct rdt_ctrl_domain *d);
-int rdt_pseudo_lock_init(void);
-void rdt_pseudo_lock_release(void);
-int rdtgroup_pseudo_lock_create(struct rdtgroup *rdtgrp);
-void rdtgroup_pseudo_lock_remove(struct rdtgroup *rdtgrp);
-struct rdt_ctrl_domain *get_ctrl_domain_from_cpu(int cpu, struct rdt_resource *r);
-struct rdt_mon_domain *get_mon_domain_from_cpu(int cpu, struct rdt_resource *r);
int closids_supported(void);
void closid_free(int closid);
int alloc_rmid(u32 closid);
void free_rmid(u32 closid, u32 rmid);
int rdt_get_mon_l3_config(struct rdt_resource *r);
-void __exit rdt_put_mon_l3_config(void);
+void resctrl_mon_resource_exit(void);
bool __init rdt_cpu_has(int flag);
void mon_event_count(void *info);
int rdtgroup_mondata_show(struct seq_file *m, void *arg);
void mon_event_read(struct rmid_read *rr, struct rdt_resource *r,
struct rdt_mon_domain *d, struct rdtgroup *rdtgrp,
cpumask_t *cpumask, int evtid, int first);
+int __init resctrl_mon_resource_init(void);
void mbm_setup_overflow_handler(struct rdt_mon_domain *dom,
unsigned long delay_ms,
int exclude_cpu);
@@ -658,4 +509,45 @@ void resctrl_file_fflags_init(const char *config, unsigned long fflags);
void rdt_staged_configs_clear(void);
bool closid_allocated(unsigned int closid);
int resctrl_find_cleanest_closid(void);
+
+#ifdef CONFIG_RESCTRL_FS_PSEUDO_LOCK
+int rdtgroup_locksetup_enter(struct rdtgroup *rdtgrp);
+int rdtgroup_locksetup_exit(struct rdtgroup *rdtgrp);
+bool rdtgroup_cbm_overlaps_pseudo_locked(struct rdt_ctrl_domain *d, unsigned long cbm);
+bool rdtgroup_pseudo_locked_in_hierarchy(struct rdt_ctrl_domain *d);
+int rdt_pseudo_lock_init(void);
+void rdt_pseudo_lock_release(void);
+int rdtgroup_pseudo_lock_create(struct rdtgroup *rdtgrp);
+void rdtgroup_pseudo_lock_remove(struct rdtgroup *rdtgrp);
+#else
+static inline int rdtgroup_locksetup_enter(struct rdtgroup *rdtgrp)
+{
+ return -EOPNOTSUPP;
+}
+
+static inline int rdtgroup_locksetup_exit(struct rdtgroup *rdtgrp)
+{
+ return -EOPNOTSUPP;
+}
+
+static inline bool rdtgroup_cbm_overlaps_pseudo_locked(struct rdt_ctrl_domain *d, unsigned long cbm)
+{
+ return false;
+}
+
+static inline bool rdtgroup_pseudo_locked_in_hierarchy(struct rdt_ctrl_domain *d)
+{
+ return false;
+}
+
+static inline int rdt_pseudo_lock_init(void) { return 0; }
+static inline void rdt_pseudo_lock_release(void) { }
+static inline int rdtgroup_pseudo_lock_create(struct rdtgroup *rdtgrp)
+{
+ return -EOPNOTSUPP;
+}
+
+static inline void rdtgroup_pseudo_lock_remove(struct rdtgroup *rdtgrp) { }
+#endif /* CONFIG_RESCTRL_FS_PSEUDO_LOCK */
+
#endif /* _ASM_X86_RESCTRL_INTERNAL_H */
diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c
index 94a1d9780461..a93ed7d2a160 100644
--- a/arch/x86/kernel/cpu/resctrl/monitor.c
+++ b/arch/x86/kernel/cpu/resctrl/monitor.c
@@ -295,11 +295,11 @@ void resctrl_arch_reset_rmid_all(struct rdt_resource *r, struct rdt_mon_domain *
{
struct rdt_hw_mon_domain *hw_dom = resctrl_to_arch_mon_dom(d);
- if (is_mbm_total_enabled())
+ if (resctrl_arch_is_mbm_total_enabled())
memset(hw_dom->arch_mbm_total, 0,
sizeof(*hw_dom->arch_mbm_total) * r->num_rmid);
- if (is_mbm_local_enabled())
+ if (resctrl_arch_is_mbm_local_enabled())
memset(hw_dom->arch_mbm_local, 0,
sizeof(*hw_dom->arch_mbm_local) * r->num_rmid);
}
@@ -365,7 +365,7 @@ static void limbo_release_entry(struct rmid_entry *entry)
*/
void __check_limbo(struct rdt_mon_domain *d, bool force_free)
{
- struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl;
+ struct rdt_resource *r = resctrl_arch_get_resource(RDT_RESOURCE_L3);
u32 idx_limit = resctrl_arch_system_num_rmid_idx();
struct rmid_entry *entry;
u32 idx, cur_idx = 1;
@@ -521,7 +521,7 @@ int alloc_rmid(u32 closid)
static void add_rmid_to_limbo(struct rmid_entry *entry)
{
- struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl;
+ struct rdt_resource *r = resctrl_arch_get_resource(RDT_RESOURCE_L3);
struct rdt_mon_domain *d;
u32 idx;
@@ -569,7 +569,7 @@ void free_rmid(u32 closid, u32 rmid)
entry = __rmid_entry(idx);
- if (is_llc_occupancy_enabled())
+ if (resctrl_arch_is_llc_occupancy_enabled())
add_rmid_to_limbo(entry);
else
list_add_tail(&entry->list, &rmid_free_lru);
@@ -718,6 +718,22 @@ void mon_event_count(void *info)
rr->err = 0;
}
+static struct rdt_ctrl_domain *get_ctrl_domain_from_cpu(int cpu,
+ struct rdt_resource *r)
+{
+ struct rdt_ctrl_domain *d;
+
+ lockdep_assert_cpus_held();
+
+ list_for_each_entry(d, &r->ctrl_domains, hdr.list) {
+ /* Find the domain that contains this CPU */
+ if (cpumask_test_cpu(cpu, &d->hdr.cpu_mask))
+ return d;
+ }
+
+ return NULL;
+}
+
/*
* Feedback loop for MBA software controller (mba_sc)
*
@@ -761,7 +777,7 @@ static void update_mba_bw(struct rdtgroup *rgrp, struct rdt_mon_domain *dom_mbm)
struct rdtgroup *entry;
u32 cur_bw, user_bw;
- r_mba = &rdt_resources_all[RDT_RESOURCE_MBA].r_resctrl;
+ r_mba = resctrl_arch_get_resource(RDT_RESOURCE_MBA);
evt_id = rgrp->mba_mbps_event;
closid = rgrp->closid;
@@ -852,10 +868,10 @@ static void mbm_update(struct rdt_resource *r, struct rdt_mon_domain *d,
* This is protected from concurrent reads from user as both
* the user and overflow handler hold the global mutex.
*/
- if (is_mbm_total_enabled())
+ if (resctrl_arch_is_mbm_total_enabled())
mbm_update_one_event(r, d, closid, rmid, QOS_L3_MBM_TOTAL_EVENT_ID);
- if (is_mbm_local_enabled())
+ if (resctrl_arch_is_mbm_local_enabled())
mbm_update_one_event(r, d, closid, rmid, QOS_L3_MBM_LOCAL_EVENT_ID);
}
@@ -925,7 +941,7 @@ void mbm_handle_overflow(struct work_struct *work)
if (!resctrl_mounted || !resctrl_arch_mon_capable())
goto out_unlock;
- r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl;
+ r = resctrl_arch_get_resource(RDT_RESOURCE_L3);
d = container_of(work, struct rdt_mon_domain, mbm_over.work);
list_for_each_entry(prgrp, &rdt_all_groups, rdtgroup_list) {
@@ -1027,7 +1043,7 @@ static int dom_data_init(struct rdt_resource *r)
/*
* RESCTRL_RESERVED_CLOSID and RESCTRL_RESERVED_RMID are special and
* are always allocated. These are used for the rdtgroup_default
- * control group, which will be setup later in rdtgroup_init().
+ * control group, which will be setup later in resctrl_init().
*/
idx = resctrl_arch_rmid_idx_encode(RESCTRL_RESERVED_CLOSID,
RESCTRL_RESERVED_RMID);
@@ -1040,10 +1056,13 @@ out_unlock:
return err;
}
-static void __exit dom_data_exit(void)
+static void dom_data_exit(struct rdt_resource *r)
{
mutex_lock(&rdtgroup_mutex);
+ if (!r->mon_capable)
+ goto out_unlock;
+
if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID)) {
kfree(closid_num_dirty_rmid);
closid_num_dirty_rmid = NULL;
@@ -1052,6 +1071,7 @@ static void __exit dom_data_exit(void)
kfree(rmid_ptrs);
rmid_ptrs = NULL;
+out_unlock:
mutex_unlock(&rdtgroup_mutex);
}
@@ -1081,11 +1101,11 @@ static void l3_mon_evt_init(struct rdt_resource *r)
{
INIT_LIST_HEAD(&r->evt_list);
- if (is_llc_occupancy_enabled())
+ if (resctrl_arch_is_llc_occupancy_enabled())
list_add_tail(&llc_occupancy_event.list, &r->evt_list);
- if (is_mbm_total_enabled())
+ if (resctrl_arch_is_mbm_total_enabled())
list_add_tail(&mbm_total_event.list, &r->evt_list);
- if (is_mbm_local_enabled())
+ if (resctrl_arch_is_mbm_local_enabled())
list_add_tail(&mbm_local_event.list, &r->evt_list);
}
@@ -1172,12 +1192,56 @@ static __init int snc_get_config(void)
return ret;
}
+/**
+ * resctrl_mon_resource_init() - Initialise global monitoring structures.
+ *
+ * Allocate and initialise global monitor resources that do not belong to a
+ * specific domain. i.e. the rmid_ptrs[] used for the limbo and free lists.
+ * Called once during boot after the struct rdt_resource's have been configured
+ * but before the filesystem is mounted.
+ * Resctrl's cpuhp callbacks may be called before this point to bring a domain
+ * online.
+ *
+ * Returns 0 for success, or -ENOMEM.
+ */
+int __init resctrl_mon_resource_init(void)
+{
+ struct rdt_resource *r = resctrl_arch_get_resource(RDT_RESOURCE_L3);
+ int ret;
+
+ if (!r->mon_capable)
+ return 0;
+
+ ret = dom_data_init(r);
+ if (ret)
+ return ret;
+
+ l3_mon_evt_init(r);
+
+ if (resctrl_arch_is_evt_configurable(QOS_L3_MBM_TOTAL_EVENT_ID)) {
+ mbm_total_event.configurable = true;
+ resctrl_file_fflags_init("mbm_total_bytes_config",
+ RFTYPE_MON_INFO | RFTYPE_RES_CACHE);
+ }
+ if (resctrl_arch_is_evt_configurable(QOS_L3_MBM_LOCAL_EVENT_ID)) {
+ mbm_local_event.configurable = true;
+ resctrl_file_fflags_init("mbm_local_bytes_config",
+ RFTYPE_MON_INFO | RFTYPE_RES_CACHE);
+ }
+
+ if (resctrl_arch_is_mbm_local_enabled())
+ mba_mbps_default_event = QOS_L3_MBM_LOCAL_EVENT_ID;
+ else if (resctrl_arch_is_mbm_total_enabled())
+ mba_mbps_default_event = QOS_L3_MBM_TOTAL_EVENT_ID;
+
+ return 0;
+}
+
int __init rdt_get_mon_l3_config(struct rdt_resource *r)
{
unsigned int mbm_offset = boot_cpu_data.x86_cache_mbm_width_offset;
struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
unsigned int threshold;
- int ret;
snc_nodes_per_l3_cache = snc_get_config();
@@ -1207,39 +1271,24 @@ int __init rdt_get_mon_l3_config(struct rdt_resource *r)
*/
resctrl_rmid_realloc_threshold = resctrl_arch_round_mon_val(threshold);
- ret = dom_data_init(r);
- if (ret)
- return ret;
-
if (rdt_cpu_has(X86_FEATURE_BMEC)) {
u32 eax, ebx, ecx, edx;
/* Detect list of bandwidth sources that can be tracked */
cpuid_count(0x80000020, 3, &eax, &ebx, &ecx, &edx);
- hw_res->mbm_cfg_mask = ecx & MAX_EVT_CONFIG_BITS;
-
- if (rdt_cpu_has(X86_FEATURE_CQM_MBM_TOTAL)) {
- mbm_total_event.configurable = true;
- resctrl_file_fflags_init("mbm_total_bytes_config",
- RFTYPE_MON_INFO | RFTYPE_RES_CACHE);
- }
- if (rdt_cpu_has(X86_FEATURE_CQM_MBM_LOCAL)) {
- mbm_local_event.configurable = true;
- resctrl_file_fflags_init("mbm_local_bytes_config",
- RFTYPE_MON_INFO | RFTYPE_RES_CACHE);
- }
+ r->mbm_cfg_mask = ecx & MAX_EVT_CONFIG_BITS;
}
- l3_mon_evt_init(r);
-
r->mon_capable = true;
return 0;
}
-void __exit rdt_put_mon_l3_config(void)
+void resctrl_mon_resource_exit(void)
{
- dom_data_exit();
+ struct rdt_resource *r = resctrl_arch_get_resource(RDT_RESOURCE_L3);
+
+ dom_data_exit(r);
}
void __init intel_rdt_mbm_apply_quirk(void)
diff --git a/arch/x86/kernel/cpu/resctrl/pseudo_lock.c b/arch/x86/kernel/cpu/resctrl/pseudo_lock.c
index 42cc162f7fc9..92ea1472bde9 100644
--- a/arch/x86/kernel/cpu/resctrl/pseudo_lock.c
+++ b/arch/x86/kernel/cpu/resctrl/pseudo_lock.c
@@ -52,7 +52,8 @@ static char *pseudo_lock_devnode(const struct device *dev, umode_t *mode)
rdtgrp = dev_get_drvdata(dev);
if (mode)
*mode = 0600;
- return kasprintf(GFP_KERNEL, "pseudo_lock/%s", rdtgrp->kn->name);
+ guard(mutex)(&rdtgroup_mutex);
+ return kasprintf(GFP_KERNEL, "pseudo_lock/%s", rdt_kn_name(rdtgrp->kn));
}
static const struct class pseudo_lock_class = {
@@ -61,7 +62,8 @@ static const struct class pseudo_lock_class = {
};
/**
- * get_prefetch_disable_bits - prefetch disable bits of supported platforms
+ * resctrl_arch_get_prefetch_disable_bits - prefetch disable bits of supported
+ * platforms
* @void: It takes no parameters.
*
* Capture the list of platforms that have been validated to support
@@ -75,14 +77,16 @@ static const struct class pseudo_lock_class = {
* in the SDM.
*
* When adding a platform here also add support for its cache events to
- * measure_cycles_perf_fn()
+ * resctrl_arch_measure_l*_residency()
*
* Return:
* If platform is supported, the bits to disable hardware prefetchers, 0
* if platform is not supported.
*/
-static u64 get_prefetch_disable_bits(void)
+u64 resctrl_arch_get_prefetch_disable_bits(void)
{
+ prefetch_disable_bits = 0;
+
if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL ||
boot_cpu_data.x86 != 6)
return 0;
@@ -98,7 +102,8 @@ static u64 get_prefetch_disable_bits(void)
* 3 DCU IP Prefetcher Disable (R/W)
* 63:4 Reserved
*/
- return 0xF;
+ prefetch_disable_bits = 0xF;
+ break;
case INTEL_ATOM_GOLDMONT:
case INTEL_ATOM_GOLDMONT_PLUS:
/*
@@ -109,10 +114,11 @@ static u64 get_prefetch_disable_bits(void)
* 2 DCU Hardware Prefetcher Disable (R/W)
* 63:3 Reserved
*/
- return 0x5;
+ prefetch_disable_bits = 0x5;
+ break;
}
- return 0;
+ return prefetch_disable_bits;
}
/**
@@ -408,8 +414,8 @@ static void pseudo_lock_free(struct rdtgroup *rdtgrp)
}
/**
- * pseudo_lock_fn - Load kernel memory into cache
- * @_rdtgrp: resource group to which pseudo-lock region belongs
+ * resctrl_arch_pseudo_lock_fn - Load kernel memory into cache
+ * @_plr: the pseudo-lock region descriptor
*
* This is the core pseudo-locking flow.
*
@@ -426,10 +432,9 @@ static void pseudo_lock_free(struct rdtgroup *rdtgrp)
*
* Return: 0. Waiter on waitqueue will be woken on completion.
*/
-static int pseudo_lock_fn(void *_rdtgrp)
+int resctrl_arch_pseudo_lock_fn(void *_plr)
{
- struct rdtgroup *rdtgrp = _rdtgrp;
- struct pseudo_lock_region *plr = rdtgrp->plr;
+ struct pseudo_lock_region *plr = _plr;
u32 rmid_p, closid_p;
unsigned long i;
u64 saved_msr;
@@ -489,7 +494,8 @@ static int pseudo_lock_fn(void *_rdtgrp)
* pseudo-locked followed by reading of kernel memory to load it
* into the cache.
*/
- __wrmsr(MSR_IA32_PQR_ASSOC, rmid_p, rdtgrp->closid);
+ __wrmsr(MSR_IA32_PQR_ASSOC, rmid_p, plr->closid);
+
/*
* Cache was flushed earlier. Now access kernel memory to read it
* into cache region associated with just activated plr->closid.
@@ -712,8 +718,7 @@ int rdtgroup_locksetup_enter(struct rdtgroup *rdtgrp)
* Not knowing the bits to disable prefetching implies that this
* platform does not support Cache Pseudo-Locking.
*/
- prefetch_disable_bits = get_prefetch_disable_bits();
- if (prefetch_disable_bits == 0) {
+ if (resctrl_arch_get_prefetch_disable_bits() == 0) {
rdt_last_cmd_puts("Pseudo-locking not supported\n");
return -EINVAL;
}
@@ -872,7 +877,8 @@ bool rdtgroup_pseudo_locked_in_hierarchy(struct rdt_ctrl_domain *d)
}
/**
- * measure_cycles_lat_fn - Measure cycle latency to read pseudo-locked memory
+ * resctrl_arch_measure_cycles_lat_fn - Measure cycle latency to read
+ * pseudo-locked memory
* @_plr: pseudo-lock region to measure
*
* There is no deterministic way to test if a memory region is cached. One
@@ -885,7 +891,7 @@ bool rdtgroup_pseudo_locked_in_hierarchy(struct rdt_ctrl_domain *d)
*
* Return: 0. Waiter on waitqueue will be woken on completion.
*/
-static int measure_cycles_lat_fn(void *_plr)
+int resctrl_arch_measure_cycles_lat_fn(void *_plr)
{
struct pseudo_lock_region *plr = _plr;
u32 saved_low, saved_high;
@@ -1069,7 +1075,7 @@ out:
return 0;
}
-static int measure_l2_residency(void *_plr)
+int resctrl_arch_measure_l2_residency(void *_plr)
{
struct pseudo_lock_region *plr = _plr;
struct residency_counts counts = {0};
@@ -1107,7 +1113,7 @@ out:
return 0;
}
-static int measure_l3_residency(void *_plr)
+int resctrl_arch_measure_l3_residency(void *_plr)
{
struct pseudo_lock_region *plr = _plr;
struct residency_counts counts = {0};
@@ -1205,14 +1211,14 @@ static int pseudo_lock_measure_cycles(struct rdtgroup *rdtgrp, int sel)
plr->cpu = cpu;
if (sel == 1)
- thread = kthread_run_on_cpu(measure_cycles_lat_fn, plr,
- cpu, "pseudo_lock_measure/%u");
+ thread = kthread_run_on_cpu(resctrl_arch_measure_cycles_lat_fn,
+ plr, cpu, "pseudo_lock_measure/%u");
else if (sel == 2)
- thread = kthread_run_on_cpu(measure_l2_residency, plr,
- cpu, "pseudo_lock_measure/%u");
+ thread = kthread_run_on_cpu(resctrl_arch_measure_l2_residency,
+ plr, cpu, "pseudo_lock_measure/%u");
else if (sel == 3)
- thread = kthread_run_on_cpu(measure_l3_residency, plr,
- cpu, "pseudo_lock_measure/%u");
+ thread = kthread_run_on_cpu(resctrl_arch_measure_l3_residency,
+ plr, cpu, "pseudo_lock_measure/%u");
else
goto out;
@@ -1293,6 +1299,7 @@ int rdtgroup_pseudo_lock_create(struct rdtgroup *rdtgrp)
struct task_struct *thread;
unsigned int new_minor;
struct device *dev;
+ char *kn_name __free(kfree) = NULL;
int ret;
ret = pseudo_lock_region_alloc(plr);
@@ -1304,10 +1311,15 @@ int rdtgroup_pseudo_lock_create(struct rdtgroup *rdtgrp)
ret = -EINVAL;
goto out_region;
}
+ kn_name = kstrdup(rdt_kn_name(rdtgrp->kn), GFP_KERNEL);
+ if (!kn_name) {
+ ret = -ENOMEM;
+ goto out_cstates;
+ }
plr->thread_done = 0;
- thread = kthread_run_on_cpu(pseudo_lock_fn, rdtgrp,
+ thread = kthread_run_on_cpu(resctrl_arch_pseudo_lock_fn, plr,
plr->cpu, "pseudo_lock/%u");
if (IS_ERR(thread)) {
ret = PTR_ERR(thread);
@@ -1348,8 +1360,7 @@ int rdtgroup_pseudo_lock_create(struct rdtgroup *rdtgrp)
mutex_unlock(&rdtgroup_mutex);
if (!IS_ERR_OR_NULL(debugfs_resctrl)) {
- plr->debugfs_dir = debugfs_create_dir(rdtgrp->kn->name,
- debugfs_resctrl);
+ plr->debugfs_dir = debugfs_create_dir(kn_name, debugfs_resctrl);
if (!IS_ERR_OR_NULL(plr->debugfs_dir))
debugfs_create_file("pseudo_lock_measure", 0200,
plr->debugfs_dir, rdtgrp,
@@ -1358,7 +1369,7 @@ int rdtgroup_pseudo_lock_create(struct rdtgroup *rdtgrp)
dev = device_create(&pseudo_lock_class, NULL,
MKDEV(pseudo_lock_major, new_minor),
- rdtgrp, "%s", rdtgrp->kn->name);
+ rdtgrp, "%s", kn_name);
mutex_lock(&rdtgroup_mutex);
diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c
index 6419e04d8a7b..93ec829015f1 100644
--- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c
+++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c
@@ -57,6 +57,12 @@ static struct kernfs_node *kn_mongrp;
/* Kernel fs node for "mon_data" directory under root */
static struct kernfs_node *kn_mondata;
+/*
+ * Used to store the max resource name width to display the schemata names in
+ * a tabular format.
+ */
+int max_name_width;
+
static struct seq_buf last_cmd_status;
static char last_cmd_status_buf[512];
@@ -111,6 +117,18 @@ void rdt_staged_configs_clear(void)
}
}
+static bool resctrl_is_mbm_enabled(void)
+{
+ return (resctrl_arch_is_mbm_total_enabled() ||
+ resctrl_arch_is_mbm_local_enabled());
+}
+
+static bool resctrl_is_mbm_event(int e)
+{
+ return (e >= QOS_L3_MBM_TOTAL_EVENT_ID &&
+ e <= QOS_L3_MBM_LOCAL_EVENT_ID);
+}
+
/*
* Trivial allocator for CLOSIDs. Since h/w only supports a small number,
* we can keep a bitmap of free CLOSIDs in a single integer.
@@ -157,7 +175,8 @@ static int closid_alloc(void)
lockdep_assert_held(&rdtgroup_mutex);
- if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID)) {
+ if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID) &&
+ resctrl_arch_is_llc_occupancy_enabled()) {
cleanest_closid = resctrl_find_cleanest_closid();
if (cleanest_closid < 0)
return cleanest_closid;
@@ -348,13 +367,13 @@ static int rdtgroup_cpus_show(struct kernfs_open_file *of,
* from update_closid_rmid() is protected against __switch_to() because
* preemption is disabled.
*/
-static void update_cpu_closid_rmid(void *info)
+void resctrl_arch_sync_cpu_closid_rmid(void *info)
{
- struct rdtgroup *r = info;
+ struct resctrl_cpu_defaults *r = info;
if (r) {
this_cpu_write(pqr_state.default_closid, r->closid);
- this_cpu_write(pqr_state.default_rmid, r->mon.rmid);
+ this_cpu_write(pqr_state.default_rmid, r->rmid);
}
/*
@@ -369,11 +388,20 @@ static void update_cpu_closid_rmid(void *info)
* Update the PGR_ASSOC MSR on all cpus in @cpu_mask,
*
* Per task closids/rmids must have been set up before calling this function.
+ * @r may be NULL.
*/
static void
update_closid_rmid(const struct cpumask *cpu_mask, struct rdtgroup *r)
{
- on_each_cpu_mask(cpu_mask, update_cpu_closid_rmid, r, 1);
+ struct resctrl_cpu_defaults defaults, *p = NULL;
+
+ if (r) {
+ defaults.closid = r->closid;
+ defaults.rmid = r->mon.rmid;
+ p = &defaults;
+ }
+
+ on_each_cpu_mask(cpu_mask, resctrl_arch_sync_cpu_closid_rmid, p, 1);
}
static int cpus_mon_write(struct rdtgroup *rdtgrp, cpumask_var_t newmask,
@@ -916,14 +944,14 @@ int proc_resctrl_show(struct seq_file *s, struct pid_namespace *ns,
continue;
seq_printf(s, "res:%s%s\n", (rdtg == &rdtgroup_default) ? "/" : "",
- rdtg->kn->name);
+ rdt_kn_name(rdtg->kn));
seq_puts(s, "mon:");
list_for_each_entry(crg, &rdtg->mon.crdtgrp_list,
mon.crdtgrp_list) {
if (!resctrl_arch_match_rmid(tsk, crg->mon.parent->closid,
crg->mon.rmid))
continue;
- seq_printf(s, "%s", crg->kn->name);
+ seq_printf(s, "%s", rdt_kn_name(crg->kn));
break;
}
seq_putc(s, '\n');
@@ -956,10 +984,20 @@ static int rdt_last_cmd_status_show(struct kernfs_open_file *of,
return 0;
}
+static void *rdt_kn_parent_priv(struct kernfs_node *kn)
+{
+ /*
+ * The parent pointer is only valid within RCU section since it can be
+ * replaced.
+ */
+ guard(rcu)();
+ return rcu_dereference(kn->__parent)->priv;
+}
+
static int rdt_num_closids_show(struct kernfs_open_file *of,
struct seq_file *seq, void *v)
{
- struct resctrl_schema *s = of->kn->parent->priv;
+ struct resctrl_schema *s = rdt_kn_parent_priv(of->kn);
seq_printf(seq, "%u\n", s->num_closid);
return 0;
@@ -968,17 +1006,17 @@ static int rdt_num_closids_show(struct kernfs_open_file *of,
static int rdt_default_ctrl_show(struct kernfs_open_file *of,
struct seq_file *seq, void *v)
{
- struct resctrl_schema *s = of->kn->parent->priv;
+ struct resctrl_schema *s = rdt_kn_parent_priv(of->kn);
struct rdt_resource *r = s->res;
- seq_printf(seq, "%x\n", r->default_ctrl);
+ seq_printf(seq, "%x\n", resctrl_get_default_ctrl(r));
return 0;
}
static int rdt_min_cbm_bits_show(struct kernfs_open_file *of,
struct seq_file *seq, void *v)
{
- struct resctrl_schema *s = of->kn->parent->priv;
+ struct resctrl_schema *s = rdt_kn_parent_priv(of->kn);
struct rdt_resource *r = s->res;
seq_printf(seq, "%u\n", r->cache.min_cbm_bits);
@@ -988,7 +1026,7 @@ static int rdt_min_cbm_bits_show(struct kernfs_open_file *of,
static int rdt_shareable_bits_show(struct kernfs_open_file *of,
struct seq_file *seq, void *v)
{
- struct resctrl_schema *s = of->kn->parent->priv;
+ struct resctrl_schema *s = rdt_kn_parent_priv(of->kn);
struct rdt_resource *r = s->res;
seq_printf(seq, "%x\n", r->cache.shareable_bits);
@@ -1012,7 +1050,7 @@ static int rdt_shareable_bits_show(struct kernfs_open_file *of,
static int rdt_bit_usage_show(struct kernfs_open_file *of,
struct seq_file *seq, void *v)
{
- struct resctrl_schema *s = of->kn->parent->priv;
+ struct resctrl_schema *s = rdt_kn_parent_priv(of->kn);
/*
* Use unsigned long even though only 32 bits are used to ensure
* test_bit() is used safely.
@@ -1094,7 +1132,7 @@ static int rdt_bit_usage_show(struct kernfs_open_file *of,
static int rdt_min_bw_show(struct kernfs_open_file *of,
struct seq_file *seq, void *v)
{
- struct resctrl_schema *s = of->kn->parent->priv;
+ struct resctrl_schema *s = rdt_kn_parent_priv(of->kn);
struct rdt_resource *r = s->res;
seq_printf(seq, "%u\n", r->membw.min_bw);
@@ -1104,7 +1142,7 @@ static int rdt_min_bw_show(struct kernfs_open_file *of,
static int rdt_num_rmids_show(struct kernfs_open_file *of,
struct seq_file *seq, void *v)
{
- struct rdt_resource *r = of->kn->parent->priv;
+ struct rdt_resource *r = rdt_kn_parent_priv(of->kn);
seq_printf(seq, "%d\n", r->num_rmid);
@@ -1114,7 +1152,7 @@ static int rdt_num_rmids_show(struct kernfs_open_file *of,
static int rdt_mon_features_show(struct kernfs_open_file *of,
struct seq_file *seq, void *v)
{
- struct rdt_resource *r = of->kn->parent->priv;
+ struct rdt_resource *r = rdt_kn_parent_priv(of->kn);
struct mon_evt *mevt;
list_for_each_entry(mevt, &r->evt_list, list) {
@@ -1129,7 +1167,7 @@ static int rdt_mon_features_show(struct kernfs_open_file *of,
static int rdt_bw_gran_show(struct kernfs_open_file *of,
struct seq_file *seq, void *v)
{
- struct resctrl_schema *s = of->kn->parent->priv;
+ struct resctrl_schema *s = rdt_kn_parent_priv(of->kn);
struct rdt_resource *r = s->res;
seq_printf(seq, "%u\n", r->membw.bw_gran);
@@ -1139,7 +1177,7 @@ static int rdt_bw_gran_show(struct kernfs_open_file *of,
static int rdt_delay_linear_show(struct kernfs_open_file *of,
struct seq_file *seq, void *v)
{
- struct resctrl_schema *s = of->kn->parent->priv;
+ struct resctrl_schema *s = rdt_kn_parent_priv(of->kn);
struct rdt_resource *r = s->res;
seq_printf(seq, "%u\n", r->membw.delay_linear);
@@ -1157,13 +1195,22 @@ static int max_threshold_occ_show(struct kernfs_open_file *of,
static int rdt_thread_throttle_mode_show(struct kernfs_open_file *of,
struct seq_file *seq, void *v)
{
- struct resctrl_schema *s = of->kn->parent->priv;
+ struct resctrl_schema *s = rdt_kn_parent_priv(of->kn);
struct rdt_resource *r = s->res;
- if (r->membw.throttle_mode == THREAD_THROTTLE_PER_THREAD)
+ switch (r->membw.throttle_mode) {
+ case THREAD_THROTTLE_PER_THREAD:
seq_puts(seq, "per-thread\n");
- else
+ return 0;
+ case THREAD_THROTTLE_MAX:
seq_puts(seq, "max\n");
+ return 0;
+ case THREAD_THROTTLE_UNDEFINED:
+ seq_puts(seq, "undefined\n");
+ return 0;
+ }
+
+ WARN_ON_ONCE(1);
return 0;
}
@@ -1222,7 +1269,7 @@ static enum resctrl_conf_type resctrl_peer_type(enum resctrl_conf_type my_type)
static int rdt_has_sparse_bitmasks_show(struct kernfs_open_file *of,
struct seq_file *seq, void *v)
{
- struct resctrl_schema *s = of->kn->parent->priv;
+ struct resctrl_schema *s = rdt_kn_parent_priv(of->kn);
struct rdt_resource *r = s->res;
seq_printf(seq, "%u\n", r->cache.arch_has_sparse_bitmasks);
@@ -1425,7 +1472,8 @@ static ssize_t rdtgroup_mode_write(struct kernfs_open_file *of,
goto out;
}
rdtgrp->mode = RDT_MODE_EXCLUSIVE;
- } else if (!strcmp(buf, "pseudo-locksetup")) {
+ } else if (IS_ENABLED(CONFIG_RESCTRL_FS_PSEUDO_LOCK) &&
+ !strcmp(buf, "pseudo-locksetup")) {
ret = rdtgroup_locksetup_enter(rdtgrp);
if (ret)
goto out;
@@ -1552,11 +1600,6 @@ out:
return ret;
}
-struct mon_config_info {
- u32 evtid;
- u32 mon_config;
-};
-
#define INVALID_CONFIG_INDEX UINT_MAX
/**
@@ -1581,31 +1624,32 @@ static inline unsigned int mon_event_config_index_get(u32 evtid)
}
}
-static void mon_event_config_read(void *info)
+void resctrl_arch_mon_event_config_read(void *_config_info)
{
- struct mon_config_info *mon_info = info;
+ struct resctrl_mon_config_info *config_info = _config_info;
unsigned int index;
u64 msrval;
- index = mon_event_config_index_get(mon_info->evtid);
+ index = mon_event_config_index_get(config_info->evtid);
if (index == INVALID_CONFIG_INDEX) {
- pr_warn_once("Invalid event id %d\n", mon_info->evtid);
+ pr_warn_once("Invalid event id %d\n", config_info->evtid);
return;
}
rdmsrl(MSR_IA32_EVT_CFG_BASE + index, msrval);
/* Report only the valid event configuration bits */
- mon_info->mon_config = msrval & MAX_EVT_CONFIG_BITS;
+ config_info->mon_config = msrval & MAX_EVT_CONFIG_BITS;
}
-static void mondata_config_read(struct rdt_mon_domain *d, struct mon_config_info *mon_info)
+static void mondata_config_read(struct resctrl_mon_config_info *mon_info)
{
- smp_call_function_any(&d->hdr.cpu_mask, mon_event_config_read, mon_info, 1);
+ smp_call_function_any(&mon_info->d->hdr.cpu_mask,
+ resctrl_arch_mon_event_config_read, mon_info, 1);
}
static int mbm_config_show(struct seq_file *s, struct rdt_resource *r, u32 evtid)
{
- struct mon_config_info mon_info;
+ struct resctrl_mon_config_info mon_info;
struct rdt_mon_domain *dom;
bool sep = false;
@@ -1616,9 +1660,11 @@ static int mbm_config_show(struct seq_file *s, struct rdt_resource *r, u32 evtid
if (sep)
seq_puts(s, ";");
- memset(&mon_info, 0, sizeof(struct mon_config_info));
+ memset(&mon_info, 0, sizeof(struct resctrl_mon_config_info));
+ mon_info.r = r;
+ mon_info.d = dom;
mon_info.evtid = evtid;
- mondata_config_read(dom, &mon_info);
+ mondata_config_read(&mon_info);
seq_printf(s, "%d=0x%02x", dom->hdr.id, mon_info.mon_config);
sep = true;
@@ -1634,7 +1680,7 @@ static int mbm_config_show(struct seq_file *s, struct rdt_resource *r, u32 evtid
static int mbm_total_bytes_config_show(struct kernfs_open_file *of,
struct seq_file *seq, void *v)
{
- struct rdt_resource *r = of->kn->parent->priv;
+ struct rdt_resource *r = rdt_kn_parent_priv(of->kn);
mbm_config_show(seq, r, QOS_L3_MBM_TOTAL_EVENT_ID);
@@ -1644,37 +1690,39 @@ static int mbm_total_bytes_config_show(struct kernfs_open_file *of,
static int mbm_local_bytes_config_show(struct kernfs_open_file *of,
struct seq_file *seq, void *v)
{
- struct rdt_resource *r = of->kn->parent->priv;
+ struct rdt_resource *r = rdt_kn_parent_priv(of->kn);
mbm_config_show(seq, r, QOS_L3_MBM_LOCAL_EVENT_ID);
return 0;
}
-static void mon_event_config_write(void *info)
+void resctrl_arch_mon_event_config_write(void *_config_info)
{
- struct mon_config_info *mon_info = info;
+ struct resctrl_mon_config_info *config_info = _config_info;
unsigned int index;
- index = mon_event_config_index_get(mon_info->evtid);
+ index = mon_event_config_index_get(config_info->evtid);
if (index == INVALID_CONFIG_INDEX) {
- pr_warn_once("Invalid event id %d\n", mon_info->evtid);
+ pr_warn_once("Invalid event id %d\n", config_info->evtid);
return;
}
- wrmsr(MSR_IA32_EVT_CFG_BASE + index, mon_info->mon_config, 0);
+ wrmsr(MSR_IA32_EVT_CFG_BASE + index, config_info->mon_config, 0);
}
static void mbm_config_write_domain(struct rdt_resource *r,
struct rdt_mon_domain *d, u32 evtid, u32 val)
{
- struct mon_config_info mon_info = {0};
+ struct resctrl_mon_config_info mon_info = {0};
/*
* Read the current config value first. If both are the same then
* no need to write it again.
*/
+ mon_info.r = r;
+ mon_info.d = d;
mon_info.evtid = evtid;
- mondata_config_read(d, &mon_info);
+ mondata_config_read(&mon_info);
if (mon_info.mon_config == val)
return;
@@ -1686,7 +1734,7 @@ static void mbm_config_write_domain(struct rdt_resource *r,
* are scoped at the domain level. Writing any of these MSRs
* on one CPU is observed by all the CPUs in the domain.
*/
- smp_call_function_any(&d->hdr.cpu_mask, mon_event_config_write,
+ smp_call_function_any(&d->hdr.cpu_mask, resctrl_arch_mon_event_config_write,
&mon_info, 1);
/*
@@ -1703,7 +1751,6 @@ static void mbm_config_write_domain(struct rdt_resource *r,
static int mon_config_write(struct rdt_resource *r, char *tok, u32 evtid)
{
- struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
char *dom_str = NULL, *id_str;
unsigned long dom_id, val;
struct rdt_mon_domain *d;
@@ -1730,9 +1777,9 @@ next:
}
/* Value from user cannot be more than the supported set of events */
- if ((val & hw_res->mbm_cfg_mask) != val) {
+ if ((val & r->mbm_cfg_mask) != val) {
rdt_last_cmd_printf("Invalid event configuration: max valid mask is 0x%02x\n",
- hw_res->mbm_cfg_mask);
+ r->mbm_cfg_mask);
return -EINVAL;
}
@@ -1750,7 +1797,7 @@ static ssize_t mbm_total_bytes_config_write(struct kernfs_open_file *of,
char *buf, size_t nbytes,
loff_t off)
{
- struct rdt_resource *r = of->kn->parent->priv;
+ struct rdt_resource *r = rdt_kn_parent_priv(of->kn);
int ret;
/* Valid input requires a trailing newline */
@@ -1776,7 +1823,7 @@ static ssize_t mbm_local_bytes_config_write(struct kernfs_open_file *of,
char *buf, size_t nbytes,
loff_t off)
{
- struct rdt_resource *r = of->kn->parent->priv;
+ struct rdt_resource *r = rdt_kn_parent_priv(of->kn);
int ret;
/* Valid input requires a trailing newline */
@@ -2036,6 +2083,28 @@ static struct rftype *rdtgroup_get_rftype_by_name(const char *name)
return NULL;
}
+static void thread_throttle_mode_init(void)
+{
+ enum membw_throttle_mode throttle_mode = THREAD_THROTTLE_UNDEFINED;
+ struct rdt_resource *r_mba, *r_smba;
+
+ r_mba = resctrl_arch_get_resource(RDT_RESOURCE_MBA);
+ if (r_mba->alloc_capable &&
+ r_mba->membw.throttle_mode != THREAD_THROTTLE_UNDEFINED)
+ throttle_mode = r_mba->membw.throttle_mode;
+
+ r_smba = resctrl_arch_get_resource(RDT_RESOURCE_SMBA);
+ if (r_smba->alloc_capable &&
+ r_smba->membw.throttle_mode != THREAD_THROTTLE_UNDEFINED)
+ throttle_mode = r_smba->membw.throttle_mode;
+
+ if (throttle_mode == THREAD_THROTTLE_UNDEFINED)
+ return;
+
+ resctrl_file_fflags_init("thread_throttle_mode",
+ RFTYPE_CTRL_INFO | RFTYPE_RES_MB);
+}
+
void resctrl_file_fflags_init(const char *config, unsigned long fflags)
{
struct rftype *rft;
@@ -2164,6 +2233,20 @@ static int rdtgroup_mkdir_info_resdir(void *priv, char *name,
return ret;
}
+static unsigned long fflags_from_resource(struct rdt_resource *r)
+{
+ switch (r->rid) {
+ case RDT_RESOURCE_L3:
+ case RDT_RESOURCE_L2:
+ return RFTYPE_RES_CACHE;
+ case RDT_RESOURCE_MBA:
+ case RDT_RESOURCE_SMBA:
+ return RFTYPE_RES_MB;
+ }
+
+ return WARN_ON_ONCE(1);
+}
+
static int rdtgroup_create_info_dir(struct kernfs_node *parent_kn)
{
struct resctrl_schema *s;
@@ -2184,14 +2267,14 @@ static int rdtgroup_create_info_dir(struct kernfs_node *parent_kn)
/* loop over enabled controls, these are all alloc_capable */
list_for_each_entry(s, &resctrl_schema_all, list) {
r = s->res;
- fflags = r->fflags | RFTYPE_CTRL_INFO;
+ fflags = fflags_from_resource(r) | RFTYPE_CTRL_INFO;
ret = rdtgroup_mkdir_info_resdir(s, s->name, fflags);
if (ret)
goto out_destroy;
}
for_each_mon_capable_rdt_resource(r) {
- fflags = r->fflags | RFTYPE_MON_INFO;
+ fflags = fflags_from_resource(r) | RFTYPE_MON_INFO;
sprintf(name, "%s_MON", r->name);
ret = rdtgroup_mkdir_info_resdir(r, name, fflags);
if (ret)
@@ -2255,7 +2338,7 @@ static void l2_qos_cfg_update(void *arg)
static inline bool is_mba_linear(void)
{
- return rdt_resources_all[RDT_RESOURCE_MBA].r_resctrl.membw.delay_linear;
+ return resctrl_arch_get_resource(RDT_RESOURCE_MBA)->membw.delay_linear;
}
static int set_cache_qos_cfg(int level, bool enable)
@@ -2345,10 +2428,10 @@ static void mba_sc_domain_destroy(struct rdt_resource *r,
*/
static bool supports_mba_mbps(void)
{
- struct rdt_resource *rmbm = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl;
- struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_MBA].r_resctrl;
+ struct rdt_resource *rmbm = resctrl_arch_get_resource(RDT_RESOURCE_L3);
+ struct rdt_resource *r = resctrl_arch_get_resource(RDT_RESOURCE_MBA);
- return (is_mbm_enabled() &&
+ return (resctrl_is_mbm_enabled() &&
r->alloc_capable && is_mba_linear() &&
r->ctrl_scope == rmbm->mon_scope);
}
@@ -2359,7 +2442,7 @@ static bool supports_mba_mbps(void)
*/
static int set_mba_sc(bool mba_sc)
{
- struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_MBA].r_resctrl;
+ struct rdt_resource *r = resctrl_arch_get_resource(RDT_RESOURCE_MBA);
u32 num_closid = resctrl_arch_get_num_closid(r);
struct rdt_ctrl_domain *d;
unsigned long fflags;
@@ -2440,12 +2523,13 @@ static struct rdtgroup *kernfs_to_rdtgroup(struct kernfs_node *kn)
* resource. "info" and its subdirectories don't
* have rdtgroup structures, so return NULL here.
*/
- if (kn == kn_info || kn->parent == kn_info)
+ if (kn == kn_info ||
+ rcu_access_pointer(kn->__parent) == kn_info)
return NULL;
else
return kn->priv;
} else {
- return kn->parent->priv;
+ return rdt_kn_parent_priv(kn);
}
}
@@ -2596,6 +2680,20 @@ static int schemata_list_add(struct rdt_resource *r, enum resctrl_conf_type type
if (cl > max_name_width)
max_name_width = cl;
+ switch (r->schema_fmt) {
+ case RESCTRL_SCHEMA_BITMAP:
+ s->fmt_str = "%d=%x";
+ break;
+ case RESCTRL_SCHEMA_RANGE:
+ s->fmt_str = "%d=%u";
+ break;
+ }
+
+ if (WARN_ON_ONCE(!s->fmt_str)) {
+ kfree(s);
+ return -EINVAL;
+ }
+
INIT_LIST_HEAD(&s->list);
list_add(&s->list, &resctrl_schema_all);
@@ -2712,8 +2810,8 @@ static int rdt_get_tree(struct fs_context *fc)
if (resctrl_arch_alloc_capable() || resctrl_arch_mon_capable())
resctrl_mounted = true;
- if (is_mbm_enabled()) {
- r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl;
+ if (resctrl_is_mbm_enabled()) {
+ r = resctrl_arch_get_resource(RDT_RESOURCE_L3);
list_for_each_entry(dom, &r->mon_domains, hdr.list)
mbm_setup_overflow_handler(dom, MBM_OVERFLOW_INTERVAL,
RESCTRL_PICK_ANY_CPU);
@@ -2823,7 +2921,7 @@ static int rdt_init_fs_context(struct fs_context *fc)
return 0;
}
-static int reset_all_ctrls(struct rdt_resource *r)
+void resctrl_arch_reset_all_ctrls(struct rdt_resource *r)
{
struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
struct rdt_hw_ctrl_domain *hw_dom;
@@ -2847,12 +2945,12 @@ static int reset_all_ctrls(struct rdt_resource *r)
hw_dom = resctrl_to_arch_ctrl_dom(d);
for (i = 0; i < hw_res->num_closid; i++)
- hw_dom->ctrl_val[i] = r->default_ctrl;
+ hw_dom->ctrl_val[i] = resctrl_get_default_ctrl(r);
msr_param.dom = d;
smp_call_function_any(&d->hdr.cpu_mask, rdt_ctrl_update, &msr_param, 1);
}
- return 0;
+ return;
}
/*
@@ -2971,9 +3069,10 @@ static void rdt_kill_sb(struct super_block *sb)
rdt_disable_ctx();
- /*Put everything back to default values. */
+ /* Put everything back to default values. */
for_each_alloc_capable_rdt_resource(r)
- reset_all_ctrls(r);
+ resctrl_arch_reset_all_ctrls(r);
+
rmdir_all_sub();
rdt_pseudo_lock_release();
rdtgroup_default.mode = RDT_MODE_SHAREABLE;
@@ -3080,7 +3179,7 @@ static int mon_add_all_files(struct kernfs_node *kn, struct rdt_mon_domain *d,
if (ret)
return ret;
- if (!do_sum && is_mbm_event(mevt->evtid))
+ if (!do_sum && resctrl_is_mbm_event(mevt->evtid))
mon_event_read(&rr, r, d, prgrp, &d->hdr.cpu_mask, mevt->evtid, true);
}
@@ -3382,7 +3481,7 @@ static void rdtgroup_init_mba(struct rdt_resource *r, u32 closid)
}
cfg = &d->staged_config[CDP_NONE];
- cfg->new_ctrl = r->default_ctrl;
+ cfg->new_ctrl = resctrl_get_default_ctrl(r);
cfg->have_new_ctrl = true;
}
}
@@ -3664,7 +3763,7 @@ out_unlock:
*/
static bool is_mon_groups(struct kernfs_node *kn, const char *name)
{
- return (!strcmp(kn->name, "mon_groups") &&
+ return (!strcmp(rdt_kn_name(kn), "mon_groups") &&
strcmp(name, "mon_groups"));
}
@@ -3696,14 +3795,21 @@ static int rdtgroup_mkdir(struct kernfs_node *parent_kn, const char *name,
static int rdtgroup_rmdir_mon(struct rdtgroup *rdtgrp, cpumask_var_t tmpmask)
{
struct rdtgroup *prdtgrp = rdtgrp->mon.parent;
+ u32 closid, rmid;
int cpu;
/* Give any tasks back to the parent group */
rdt_move_group_tasks(rdtgrp, prdtgrp, tmpmask);
- /* Update per cpu rmid of the moved CPUs first */
+ /*
+ * Update per cpu closid/rmid of the moved CPUs first.
+ * Note: the closid will not change, but the arch code still needs it.
+ */
+ closid = prdtgrp->closid;
+ rmid = prdtgrp->mon.rmid;
for_each_cpu(cpu, &rdtgrp->cpu_mask)
- per_cpu(pqr_state.default_rmid, cpu) = prdtgrp->mon.rmid;
+ resctrl_arch_set_cpu_default_closid_rmid(cpu, closid, rmid);
+
/*
* Update the MSR on moved CPUs and CPUs which have moved
* task running on them.
@@ -3736,6 +3842,7 @@ static int rdtgroup_ctrl_remove(struct rdtgroup *rdtgrp)
static int rdtgroup_rmdir_ctrl(struct rdtgroup *rdtgrp, cpumask_var_t tmpmask)
{
+ u32 closid, rmid;
int cpu;
/* Give any tasks back to the default group */
@@ -3746,10 +3853,10 @@ static int rdtgroup_rmdir_ctrl(struct rdtgroup *rdtgrp, cpumask_var_t tmpmask)
&rdtgroup_default.cpu_mask, &rdtgrp->cpu_mask);
/* Update per cpu closid and rmid of the moved CPUs first */
- for_each_cpu(cpu, &rdtgrp->cpu_mask) {
- per_cpu(pqr_state.default_closid, cpu) = rdtgroup_default.closid;
- per_cpu(pqr_state.default_rmid, cpu) = rdtgroup_default.mon.rmid;
- }
+ closid = rdtgroup_default.closid;
+ rmid = rdtgroup_default.mon.rmid;
+ for_each_cpu(cpu, &rdtgrp->cpu_mask)
+ resctrl_arch_set_cpu_default_closid_rmid(cpu, closid, rmid);
/*
* Update the MSR on moved CPUs and CPUs which have moved
@@ -3771,9 +3878,18 @@ static int rdtgroup_rmdir_ctrl(struct rdtgroup *rdtgrp, cpumask_var_t tmpmask)
return 0;
}
+static struct kernfs_node *rdt_kn_parent(struct kernfs_node *kn)
+{
+ /*
+ * Valid within the RCU section it was obtained or while rdtgroup_mutex
+ * is held.
+ */
+ return rcu_dereference_check(kn->__parent, lockdep_is_held(&rdtgroup_mutex));
+}
+
static int rdtgroup_rmdir(struct kernfs_node *kn)
{
- struct kernfs_node *parent_kn = kn->parent;
+ struct kernfs_node *parent_kn;
struct rdtgroup *rdtgrp;
cpumask_var_t tmpmask;
int ret = 0;
@@ -3786,6 +3902,7 @@ static int rdtgroup_rmdir(struct kernfs_node *kn)
ret = -EPERM;
goto out;
}
+ parent_kn = rdt_kn_parent(kn);
/*
* If the rdtgroup is a ctrl_mon group and parent directory
@@ -3803,7 +3920,7 @@ static int rdtgroup_rmdir(struct kernfs_node *kn)
ret = rdtgroup_rmdir_ctrl(rdtgrp, tmpmask);
}
} else if (rdtgrp->type == RDTMON_GROUP &&
- is_mon_groups(parent_kn, kn->name)) {
+ is_mon_groups(parent_kn, rdt_kn_name(kn))) {
ret = rdtgroup_rmdir_mon(rdtgrp, tmpmask);
} else {
ret = -EPERM;
@@ -3854,6 +3971,7 @@ static void mongrp_reparent(struct rdtgroup *rdtgrp,
static int rdtgroup_rename(struct kernfs_node *kn,
struct kernfs_node *new_parent, const char *new_name)
{
+ struct kernfs_node *kn_parent;
struct rdtgroup *new_prdtgrp;
struct rdtgroup *rdtgrp;
cpumask_var_t tmpmask;
@@ -3888,8 +4006,9 @@ static int rdtgroup_rename(struct kernfs_node *kn,
goto out;
}
- if (rdtgrp->type != RDTMON_GROUP || !kn->parent ||
- !is_mon_groups(kn->parent, kn->name)) {
+ kn_parent = rdt_kn_parent(kn);
+ if (rdtgrp->type != RDTMON_GROUP || !kn_parent ||
+ !is_mon_groups(kn_parent, rdt_kn_name(kn))) {
rdt_last_cmd_puts("Source must be a MON group\n");
ret = -EPERM;
goto out;
@@ -3950,7 +4069,7 @@ static int rdtgroup_show_options(struct seq_file *seq, struct kernfs_root *kf)
if (resctrl_arch_get_cdp_enabled(RDT_RESOURCE_L2))
seq_puts(seq, ",cdpl2");
- if (is_mba_sc(&rdt_resources_all[RDT_RESOURCE_MBA].r_resctrl))
+ if (is_mba_sc(resctrl_arch_get_resource(RDT_RESOURCE_MBA)))
seq_puts(seq, ",mba_MBps");
if (resctrl_debug)
@@ -4029,9 +4148,9 @@ void resctrl_offline_mon_domain(struct rdt_resource *r, struct rdt_mon_domain *d
if (resctrl_mounted && resctrl_arch_mon_capable())
rmdir_mondata_subdir_allrdtgrp(r, d);
- if (is_mbm_enabled())
+ if (resctrl_is_mbm_enabled())
cancel_delayed_work(&d->mbm_over);
- if (is_llc_occupancy_enabled() && has_busy_rmid(d)) {
+ if (resctrl_arch_is_llc_occupancy_enabled() && has_busy_rmid(d)) {
/*
* When a package is going down, forcefully
* decrement rmid->ebusy. There is no way to know
@@ -4049,17 +4168,30 @@ void resctrl_offline_mon_domain(struct rdt_resource *r, struct rdt_mon_domain *d
mutex_unlock(&rdtgroup_mutex);
}
+/**
+ * domain_setup_mon_state() - Initialise domain monitoring structures.
+ * @r: The resource for the newly online domain.
+ * @d: The newly online domain.
+ *
+ * Allocate monitor resources that belong to this domain.
+ * Called when the first CPU of a domain comes online, regardless of whether
+ * the filesystem is mounted.
+ * During boot this may be called before global allocations have been made by
+ * resctrl_mon_resource_init().
+ *
+ * Returns 0 for success, or -ENOMEM.
+ */
static int domain_setup_mon_state(struct rdt_resource *r, struct rdt_mon_domain *d)
{
u32 idx_limit = resctrl_arch_system_num_rmid_idx();
size_t tsize;
- if (is_llc_occupancy_enabled()) {
+ if (resctrl_arch_is_llc_occupancy_enabled()) {
d->rmid_busy_llc = bitmap_zalloc(idx_limit, GFP_KERNEL);
if (!d->rmid_busy_llc)
return -ENOMEM;
}
- if (is_mbm_total_enabled()) {
+ if (resctrl_arch_is_mbm_total_enabled()) {
tsize = sizeof(*d->mbm_total);
d->mbm_total = kcalloc(idx_limit, tsize, GFP_KERNEL);
if (!d->mbm_total) {
@@ -4067,7 +4199,7 @@ static int domain_setup_mon_state(struct rdt_resource *r, struct rdt_mon_domain
return -ENOMEM;
}
}
- if (is_mbm_local_enabled()) {
+ if (resctrl_arch_is_mbm_local_enabled()) {
tsize = sizeof(*d->mbm_local);
d->mbm_local = kcalloc(idx_limit, tsize, GFP_KERNEL);
if (!d->mbm_local) {
@@ -4106,13 +4238,13 @@ int resctrl_online_mon_domain(struct rdt_resource *r, struct rdt_mon_domain *d)
if (err)
goto out_unlock;
- if (is_mbm_enabled()) {
+ if (resctrl_is_mbm_enabled()) {
INIT_DELAYED_WORK(&d->mbm_over, mbm_handle_overflow);
mbm_setup_overflow_handler(d, MBM_OVERFLOW_INTERVAL,
RESCTRL_PICK_ANY_CPU);
}
- if (is_llc_occupancy_enabled())
+ if (resctrl_arch_is_llc_occupancy_enabled())
INIT_DELAYED_WORK(&d->cqm_limbo, cqm_handle_limbo);
/*
@@ -4148,9 +4280,25 @@ static void clear_childcpus(struct rdtgroup *r, unsigned int cpu)
}
}
+static struct rdt_mon_domain *get_mon_domain_from_cpu(int cpu,
+ struct rdt_resource *r)
+{
+ struct rdt_mon_domain *d;
+
+ lockdep_assert_cpus_held();
+
+ list_for_each_entry(d, &r->mon_domains, hdr.list) {
+ /* Find the domain that contains this CPU */
+ if (cpumask_test_cpu(cpu, &d->hdr.cpu_mask))
+ return d;
+ }
+
+ return NULL;
+}
+
void resctrl_offline_cpu(unsigned int cpu)
{
- struct rdt_resource *l3 = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl;
+ struct rdt_resource *l3 = resctrl_arch_get_resource(RDT_RESOURCE_L3);
struct rdt_mon_domain *d;
struct rdtgroup *rdtgrp;
@@ -4167,12 +4315,12 @@ void resctrl_offline_cpu(unsigned int cpu)
d = get_mon_domain_from_cpu(cpu, l3);
if (d) {
- if (is_mbm_enabled() && cpu == d->mbm_work_cpu) {
+ if (resctrl_is_mbm_enabled() && cpu == d->mbm_work_cpu) {
cancel_delayed_work(&d->mbm_over);
mbm_setup_overflow_handler(d, 0, cpu);
}
- if (is_llc_occupancy_enabled() && cpu == d->cqm_work_cpu &&
- has_busy_rmid(d)) {
+ if (resctrl_arch_is_llc_occupancy_enabled() &&
+ cpu == d->cqm_work_cpu && has_busy_rmid(d)) {
cancel_delayed_work(&d->cqm_limbo);
cqm_setup_limbo_handler(d, 0, cpu);
}
@@ -4183,14 +4331,14 @@ out_unlock:
}
/*
- * rdtgroup_init - rdtgroup initialization
+ * resctrl_init - resctrl filesystem initialization
*
* Setup resctrl file system including set up root, create mount point,
- * register rdtgroup filesystem, and initialize files under root directory.
+ * register resctrl filesystem, and initialize files under root directory.
*
* Return: 0 on success or -errno
*/
-int __init rdtgroup_init(void)
+int __init resctrl_init(void)
{
int ret = 0;
@@ -4199,10 +4347,18 @@ int __init rdtgroup_init(void)
rdtgroup_setup_default();
- ret = sysfs_create_mount_point(fs_kobj, "resctrl");
+ thread_throttle_mode_init();
+
+ ret = resctrl_mon_resource_init();
if (ret)
return ret;
+ ret = sysfs_create_mount_point(fs_kobj, "resctrl");
+ if (ret) {
+ resctrl_mon_resource_exit();
+ return ret;
+ }
+
ret = register_filesystem(&rdt_fs_type);
if (ret)
goto cleanup_mountpoint;
@@ -4234,13 +4390,16 @@ int __init rdtgroup_init(void)
cleanup_mountpoint:
sysfs_remove_mount_point(fs_kobj, "resctrl");
+ resctrl_mon_resource_exit();
return ret;
}
-void __exit rdtgroup_exit(void)
+void __exit resctrl_exit(void)
{
debugfs_remove_recursive(debugfs_resctrl);
unregister_filesystem(&rdt_fs_type);
sysfs_remove_mount_point(fs_kobj, "resctrl");
+
+ resctrl_mon_resource_exit();
}
diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c
index 91639d1e4ec2..c6fefd4585f8 100644
--- a/arch/x86/kernel/dumpstack.c
+++ b/arch/x86/kernel/dumpstack.c
@@ -195,6 +195,7 @@ static void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
printk("%sCall Trace:\n", log_lvl);
unwind_start(&state, task, regs, stack);
+ stack = stack ?: get_stack_pointer(task, regs);
regs = unwind_get_entry_regs(&state, &partial);
/*
@@ -213,9 +214,7 @@ static void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
* - hardirq stack
* - entry stack
*/
- for (stack = stack ?: get_stack_pointer(task, regs);
- stack;
- stack = stack_info.next_sp) {
+ for (; stack; stack = stack_info.next_sp) {
const char *stack_name;
stack = PTR_ALIGN(stack, sizeof(long));
diff --git a/arch/x86/kernel/early_printk.c b/arch/x86/kernel/early_printk.c
index fc1714bad045..611f27e3890c 100644
--- a/arch/x86/kernel/early_printk.c
+++ b/arch/x86/kernel/early_printk.c
@@ -190,7 +190,6 @@ static __init void early_serial_init(char *s)
early_serial_hw_init(divisor);
}
-#ifdef CONFIG_PCI
static __noendbr void mem32_serial_out(unsigned long addr, int offset, int value)
{
u32 __iomem *vaddr = (u32 __iomem *)addr;
@@ -208,6 +207,45 @@ static __noendbr unsigned int mem32_serial_in(unsigned long addr, int offset)
ANNOTATE_NOENDBR_SYM(mem32_serial_in);
/*
+ * early_mmio_serial_init() - Initialize MMIO-based early serial console.
+ * @s: MMIO-based serial specification.
+ */
+static __init void early_mmio_serial_init(char *s)
+{
+ unsigned long baudrate;
+ unsigned long membase;
+ char *e;
+
+ if (*s == ',')
+ s++;
+
+ if (!strncmp(s, "0x", 2)) {
+ /* NB: only 32-bit addresses are supported. */
+ membase = simple_strtoul(s, &e, 16);
+ early_serial_base = (unsigned long)early_ioremap(membase, PAGE_SIZE);
+
+ static_call_update(serial_in, mem32_serial_in);
+ static_call_update(serial_out, mem32_serial_out);
+
+ s += strcspn(s, ",");
+ if (*s == ',')
+ s++;
+ }
+
+ if (!strncmp(s, "nocfg", 5)) {
+ baudrate = 0;
+ } else {
+ baudrate = simple_strtoul(s, &e, 0);
+ if (baudrate == 0 || s == e)
+ baudrate = DEFAULT_BAUD;
+ }
+
+ if (baudrate)
+ early_serial_hw_init(115200 / baudrate);
+}
+
+#ifdef CONFIG_PCI
+/*
* early_pci_serial_init()
*
* This function is invoked when the early_printk param starts with "pciserial"
@@ -351,6 +389,11 @@ static int __init setup_early_printk(char *buf)
keep = (strstr(buf, "keep") != NULL);
while (*buf != '\0') {
+ if (!strncmp(buf, "mmio", 4)) {
+ early_mmio_serial_init(buf + 4);
+ early_console_register(&early_serial_console, keep);
+ buf += 4;
+ }
if (!strncmp(buf, "serial", 6)) {
buf += 6;
early_serial_init(buf);
diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c
index 1b734a9ff088..91d6341f281f 100644
--- a/arch/x86/kernel/fpu/core.c
+++ b/arch/x86/kernel/fpu/core.c
@@ -508,7 +508,7 @@ static inline void fpstate_init_fstate(struct fpstate *fpstate)
/*
* Used in two places:
* 1) Early boot to setup init_fpstate for non XSAVE systems
- * 2) fpu_init_fpstate_user() which is invoked from KVM
+ * 2) fpu_alloc_guest_fpstate() which is invoked from KVM
*/
void fpstate_init_user(struct fpstate *fpstate)
{
diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c
index 9c9faa1634fb..102641fd2172 100644
--- a/arch/x86/kernel/kgdb.c
+++ b/arch/x86/kernel/kgdb.c
@@ -655,7 +655,7 @@ void kgdb_arch_late(void)
if (breakinfo[i].pev)
continue;
breakinfo[i].pev = register_wide_hw_breakpoint(&attr, NULL, NULL);
- if (IS_ERR((void * __force)breakinfo[i].pev)) {
+ if (IS_ERR_PCPU(breakinfo[i].pev)) {
printk(KERN_ERR "kgdb: Could not allocate hw"
"breakpoints\nDisabling the kernel debugger\n");
breakinfo[i].pev = NULL;
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index 97925632c28e..1ccd05d8999f 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -75,6 +75,11 @@ void paravirt_set_sched_clock(u64 (*func)(void))
static_call_update(pv_sched_clock, func);
}
+static noinstr void pv_native_safe_halt(void)
+{
+ native_safe_halt();
+}
+
#ifdef CONFIG_PARAVIRT_XXL
static noinstr void pv_native_write_cr2(unsigned long val)
{
@@ -100,11 +105,6 @@ static noinstr void pv_native_set_debugreg(int regno, unsigned long val)
{
native_set_debugreg(regno, val);
}
-
-static noinstr void pv_native_safe_halt(void)
-{
- native_safe_halt();
-}
#endif
struct pv_info pv_info = {
@@ -161,9 +161,11 @@ struct paravirt_patch_template pv_ops = {
.irq.save_fl = __PV_IS_CALLEE_SAVE(pv_native_save_fl),
.irq.irq_disable = __PV_IS_CALLEE_SAVE(pv_native_irq_disable),
.irq.irq_enable = __PV_IS_CALLEE_SAVE(pv_native_irq_enable),
+#endif /* CONFIG_PARAVIRT_XXL */
+
+ /* Irq HLT ops. */
.irq.safe_halt = pv_native_safe_halt,
.irq.halt = native_halt,
-#endif /* CONFIG_PARAVIRT_XXL */
/* Mmu ops. */
.mmu.flush_tlb_user = native_flush_tlb_local,
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 91f6ff618852..962c3ce39323 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -939,7 +939,7 @@ void __init select_idle_routine(void)
static_call_update(x86_idle, mwait_idle);
} else if (cpu_feature_enabled(X86_FEATURE_TDX_GUEST)) {
pr_info("using TDX aware idle routine\n");
- static_call_update(x86_idle, tdx_safe_halt);
+ static_call_update(x86_idle, tdx_halt);
} else {
static_call_update(x86_idle, default_idle);
}
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index c7164a8de983..9d2a13b37833 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -578,14 +578,13 @@ static void __init memblock_x86_reserve_range_setup_data(void)
static void __init arch_reserve_crashkernel(void)
{
unsigned long long crash_base, crash_size, low_size = 0;
- char *cmdline = boot_command_line;
bool high = false;
int ret;
if (!IS_ENABLED(CONFIG_CRASH_RESERVE))
return;
- ret = parse_crashkernel(cmdline, memblock_phys_mem_size(),
+ ret = parse_crashkernel(boot_command_line, memblock_phys_mem_size(),
&crash_size, &crash_base,
&low_size, &high);
if (ret)
@@ -596,8 +595,7 @@ static void __init arch_reserve_crashkernel(void)
return;
}
- reserve_crashkernel_generic(cmdline, crash_size, crash_base,
- low_size, high);
+ reserve_crashkernel_generic(crash_size, crash_base, low_size, high);
}
static struct resource standard_io_resources[] = {
@@ -1031,8 +1029,6 @@ void __init setup_arch(char **cmdline_p)
max_low_pfn = e820__end_of_low_ram_pfn();
else
max_low_pfn = max_pfn;
-
- high_memory = (void *)__va(max_pfn * PAGE_SIZE - 1) + 1;
#endif
/* Find and reserve MPTABLE area */
@@ -1166,8 +1162,10 @@ void __init setup_arch(char **cmdline_p)
initmem_init();
dma_contiguous_reserve(max_pfn_mapped << PAGE_SHIFT);
- if (boot_cpu_has(X86_FEATURE_GBPAGES))
+ if (boot_cpu_has(X86_FEATURE_GBPAGES)) {
hugetlb_cma_reserve(PUD_SHIFT - PAGE_SHIFT);
+ hugetlb_bootmem_alloc();
+ }
/*
* Reserve memory for crash kernel after SRAT is parsed so that it
diff --git a/arch/x86/kernel/static_call.c b/arch/x86/kernel/static_call.c
index 9e51242ed125..a59c72e77645 100644
--- a/arch/x86/kernel/static_call.c
+++ b/arch/x86/kernel/static_call.c
@@ -158,7 +158,7 @@ void arch_static_call_transform(void *site, void *tramp, void *func, bool tail)
{
mutex_lock(&text_mutex);
- if (tramp) {
+ if (tramp && !site) {
__static_call_validate(tramp, true, true);
__static_call_transform(tramp, __sc_insn(!func, true), func, false);
}
diff --git a/arch/x86/kernel/unwind_orc.c b/arch/x86/kernel/unwind_orc.c
index d4705a348a80..977ee75e047c 100644
--- a/arch/x86/kernel/unwind_orc.c
+++ b/arch/x86/kernel/unwind_orc.c
@@ -476,7 +476,7 @@ bool unwind_next_frame(struct unwind_state *state)
return false;
/* Don't let modules unload while we're reading their ORC data. */
- preempt_disable();
+ guard(rcu)();
/* End-of-stack check for user tasks: */
if (state->regs && user_mode(state->regs))
@@ -669,14 +669,12 @@ bool unwind_next_frame(struct unwind_state *state)
goto err;
}
- preempt_enable();
return true;
err:
state->error = true;
the_end:
- preempt_enable();
state->stack_info.type = STACK_TYPE_UNKNOWN;
return false;
}