diff options
28 files changed, 817 insertions, 291 deletions
diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h index bec227f9500a..9da54d4ee49e 100644 --- a/arch/arm64/include/asm/kvm_asm.h +++ b/arch/arm64/include/asm/kvm_asm.h @@ -81,6 +81,8 @@ enum __kvm_host_smccc_func { __KVM_HOST_SMCCC_FUNC___kvm_timer_set_cntvoff, __KVM_HOST_SMCCC_FUNC___vgic_v3_save_vmcr_aprs, __KVM_HOST_SMCCC_FUNC___vgic_v3_restore_vmcr_aprs, + __KVM_HOST_SMCCC_FUNC___pkvm_reserve_vm, + __KVM_HOST_SMCCC_FUNC___pkvm_unreserve_vm, __KVM_HOST_SMCCC_FUNC___pkvm_init_vm, __KVM_HOST_SMCCC_FUNC___pkvm_init_vcpu, __KVM_HOST_SMCCC_FUNC___pkvm_teardown_vm, diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 2b07f0a27a7d..8ede884f091d 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -252,7 +252,8 @@ struct kvm_protected_vm { pkvm_handle_t handle; struct kvm_hyp_memcache teardown_mc; struct kvm_hyp_memcache stage2_teardown_mc; - bool enabled; + bool is_protected; + bool is_created; }; struct kvm_mpidr_data { @@ -1441,7 +1442,7 @@ struct kvm *kvm_arch_alloc_vm(void); #define __KVM_HAVE_ARCH_FLUSH_REMOTE_TLBS_RANGE -#define kvm_vm_is_protected(kvm) (is_protected_kvm_enabled() && (kvm)->arch.pkvm.enabled) +#define kvm_vm_is_protected(kvm) (is_protected_kvm_enabled() && (kvm)->arch.pkvm.is_protected) #define vcpu_is_protected(vcpu) kvm_vm_is_protected((vcpu)->kvm) diff --git a/arch/arm64/include/asm/kvm_nested.h b/arch/arm64/include/asm/kvm_nested.h index 7fd76f41c296..2be6c3de74e3 100644 --- a/arch/arm64/include/asm/kvm_nested.h +++ b/arch/arm64/include/asm/kvm_nested.h @@ -265,7 +265,7 @@ static inline u64 decode_range_tlbi(u64 val, u64 *range, u16 *asid) return base; } -static inline unsigned int ps_to_output_size(unsigned int ps) +static inline unsigned int ps_to_output_size(unsigned int ps, bool pa52bit) { switch (ps) { case 0: return 32; @@ -273,7 +273,10 @@ static inline unsigned int ps_to_output_size(unsigned int ps) case 2: return 40; case 3: return 42; case 4: return 44; - case 5: + case 5: return 48; + case 6: if (pa52bit) + return 52; + fallthrough; default: return 48; } @@ -285,13 +288,28 @@ enum trans_regime { TR_EL2, }; +struct s1_walk_info; + +struct s1_walk_context { + struct s1_walk_info *wi; + u64 table_ipa; + int level; +}; + +struct s1_walk_filter { + int (*fn)(struct s1_walk_context *, void *); + void *priv; +}; + struct s1_walk_info { + struct s1_walk_filter *filter; u64 baddr; enum trans_regime regime; unsigned int max_oa_bits; unsigned int pgshift; unsigned int txsz; int sl; + u8 sh; bool as_el0; bool hpd; bool e0poe; @@ -299,6 +317,7 @@ struct s1_walk_info { bool pan; bool be; bool s2; + bool pa52bit; }; struct s1_walk_result { @@ -334,6 +353,8 @@ struct s1_walk_result { int __kvm_translate_va(struct kvm_vcpu *vcpu, struct s1_walk_info *wi, struct s1_walk_result *wr, u64 va); +int __kvm_find_s1_desc_level(struct kvm_vcpu *vcpu, u64 va, u64 ipa, + int *level); /* VNCR management */ int kvm_vcpu_allocate_vncr_tlb(struct kvm_vcpu *vcpu); diff --git a/arch/arm64/include/asm/kvm_pkvm.h b/arch/arm64/include/asm/kvm_pkvm.h index 35f9d9478004..0aecd4ac5f45 100644 --- a/arch/arm64/include/asm/kvm_pkvm.h +++ b/arch/arm64/include/asm/kvm_pkvm.h @@ -18,6 +18,7 @@ int pkvm_init_host_vm(struct kvm *kvm); int pkvm_create_hyp_vm(struct kvm *kvm); +bool pkvm_hyp_vm_is_created(struct kvm *kvm); void pkvm_destroy_hyp_vm(struct kvm *kvm); int pkvm_create_hyp_vcpu(struct kvm_vcpu *vcpu); diff --git a/arch/arm64/include/asm/traps.h b/arch/arm64/include/asm/traps.h index e3e8944a71c3..e92e4a0e48fc 100644 --- a/arch/arm64/include/asm/traps.h +++ b/arch/arm64/include/asm/traps.h @@ -36,6 +36,7 @@ int kasan_brk_handler(struct pt_regs *regs, unsigned long esr); int ubsan_brk_handler(struct pt_regs *regs, unsigned long esr); int early_brk64(unsigned long addr, unsigned long esr, struct pt_regs *regs); +void dump_kernel_instr(unsigned long kaddr); /* * Move regs->pc to next instruction and do necessary setup before it diff --git a/arch/arm64/kernel/traps.c b/arch/arm64/kernel/traps.c index f528b6041f6a..83e6d1409e1f 100644 --- a/arch/arm64/kernel/traps.c +++ b/arch/arm64/kernel/traps.c @@ -149,19 +149,18 @@ pstate_check_t * const aarch32_opcode_cond_checks[16] = { int show_unhandled_signals = 0; -static void dump_kernel_instr(const char *lvl, struct pt_regs *regs) +void dump_kernel_instr(unsigned long kaddr) { - unsigned long addr = instruction_pointer(regs); char str[sizeof("00000000 ") * 5 + 2 + 1], *p = str; int i; - if (user_mode(regs)) + if (!is_ttbr1_addr(kaddr)) return; for (i = -4; i < 1; i++) { unsigned int val, bad; - bad = aarch64_insn_read(&((u32 *)addr)[i], &val); + bad = aarch64_insn_read(&((u32 *)kaddr)[i], &val); if (!bad) p += sprintf(p, i == 0 ? "(%08x) " : "%08x ", val); @@ -169,7 +168,7 @@ static void dump_kernel_instr(const char *lvl, struct pt_regs *regs) p += sprintf(p, i == 0 ? "(????????) " : "???????? "); } - printk("%sCode: %s\n", lvl, str); + printk(KERN_EMERG "Code: %s\n", str); } #define S_SMP " SMP" @@ -178,6 +177,7 @@ static int __die(const char *str, long err, struct pt_regs *regs) { static int die_counter; int ret; + unsigned long addr = instruction_pointer(regs); pr_emerg("Internal error: %s: %016lx [#%d] " S_SMP "\n", str, err, ++die_counter); @@ -190,7 +190,10 @@ static int __die(const char *str, long err, struct pt_regs *regs) print_modules(); show_regs(regs); - dump_kernel_instr(KERN_EMERG, regs); + if (user_mode(regs)) + return ret; + + dump_kernel_instr(addr); return ret; } diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index 49485d08ece2..a59b4046617c 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -170,10 +170,6 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) if (ret) return ret; - ret = pkvm_init_host_vm(kvm); - if (ret) - goto err_unshare_kvm; - if (!zalloc_cpumask_var(&kvm->arch.supported_cpus, GFP_KERNEL_ACCOUNT)) { ret = -ENOMEM; goto err_unshare_kvm; @@ -184,6 +180,16 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) if (ret) goto err_free_cpumask; + if (is_protected_kvm_enabled()) { + /* + * If any failures occur after this is successful, make sure to + * call __pkvm_unreserve_vm to unreserve the VM in hyp. + */ + ret = pkvm_init_host_vm(kvm); + if (ret) + goto err_free_cpumask; + } + kvm_vgic_early_init(kvm); kvm_timer_init_vm(kvm); diff --git a/arch/arm64/kvm/at.c b/arch/arm64/kvm/at.c index d71ca4ddc9d1..20bb9af125b1 100644 --- a/arch/arm64/kvm/at.c +++ b/arch/arm64/kvm/at.c @@ -28,9 +28,57 @@ static int get_ia_size(struct s1_walk_info *wi) /* Return true if the IPA is out of the OA range */ static bool check_output_size(u64 ipa, struct s1_walk_info *wi) { + if (wi->pa52bit) + return wi->max_oa_bits < 52 && (ipa & GENMASK_ULL(51, wi->max_oa_bits)); return wi->max_oa_bits < 48 && (ipa & GENMASK_ULL(47, wi->max_oa_bits)); } +static bool has_52bit_pa(struct kvm_vcpu *vcpu, struct s1_walk_info *wi, u64 tcr) +{ + switch (BIT(wi->pgshift)) { + case SZ_64K: + default: /* IMPDEF: treat any other value as 64k */ + if (!kvm_has_feat_enum(vcpu->kvm, ID_AA64MMFR0_EL1, PARANGE, 52)) + return false; + return ((wi->regime == TR_EL2 ? + FIELD_GET(TCR_EL2_PS_MASK, tcr) : + FIELD_GET(TCR_IPS_MASK, tcr)) == 0b0110); + case SZ_16K: + if (!kvm_has_feat(vcpu->kvm, ID_AA64MMFR0_EL1, TGRAN16, 52_BIT)) + return false; + break; + case SZ_4K: + if (!kvm_has_feat(vcpu->kvm, ID_AA64MMFR0_EL1, TGRAN4, 52_BIT)) + return false; + break; + } + + return (tcr & (wi->regime == TR_EL2 ? TCR_EL2_DS : TCR_DS)); +} + +static u64 desc_to_oa(struct s1_walk_info *wi, u64 desc) +{ + u64 addr; + + if (!wi->pa52bit) + return desc & GENMASK_ULL(47, wi->pgshift); + + switch (BIT(wi->pgshift)) { + case SZ_4K: + case SZ_16K: + addr = desc & GENMASK_ULL(49, wi->pgshift); + addr |= FIELD_GET(KVM_PTE_ADDR_51_50_LPA2, desc) << 50; + break; + case SZ_64K: + default: /* IMPDEF: treat any other value as 64k */ + addr = desc & GENMASK_ULL(47, wi->pgshift); + addr |= FIELD_GET(KVM_PTE_ADDR_51_48, desc) << 48; + break; + } + + return addr; +} + /* Return the translation regime that applies to an AT instruction */ static enum trans_regime compute_translation_regime(struct kvm_vcpu *vcpu, u32 op) { @@ -50,21 +98,26 @@ static enum trans_regime compute_translation_regime(struct kvm_vcpu *vcpu, u32 o } } +static u64 effective_tcr2(struct kvm_vcpu *vcpu, enum trans_regime regime) +{ + if (regime == TR_EL10) { + if (vcpu_has_nv(vcpu) && + !(__vcpu_sys_reg(vcpu, HCRX_EL2) & HCRX_EL2_TCR2En)) + return 0; + + return vcpu_read_sys_reg(vcpu, TCR2_EL1); + } + + return vcpu_read_sys_reg(vcpu, TCR2_EL2); +} + static bool s1pie_enabled(struct kvm_vcpu *vcpu, enum trans_regime regime) { if (!kvm_has_s1pie(vcpu->kvm)) return false; - switch (regime) { - case TR_EL2: - case TR_EL20: - return vcpu_read_sys_reg(vcpu, TCR2_EL2) & TCR2_EL2_PIE; - case TR_EL10: - return (__vcpu_sys_reg(vcpu, HCRX_EL2) & HCRX_EL2_TCR2En) && - (__vcpu_sys_reg(vcpu, TCR2_EL1) & TCR2_EL1_PIE); - default: - BUG(); - } + /* Abuse TCR2_EL1_PIE and use it for EL2 as well */ + return effective_tcr2(vcpu, regime) & TCR2_EL1_PIE; } static void compute_s1poe(struct kvm_vcpu *vcpu, struct s1_walk_info *wi) @@ -76,23 +129,11 @@ static void compute_s1poe(struct kvm_vcpu *vcpu, struct s1_walk_info *wi) return; } - switch (wi->regime) { - case TR_EL2: - case TR_EL20: - val = vcpu_read_sys_reg(vcpu, TCR2_EL2); - wi->poe = val & TCR2_EL2_POE; - wi->e0poe = (wi->regime == TR_EL20) && (val & TCR2_EL2_E0POE); - break; - case TR_EL10: - if (__vcpu_sys_reg(vcpu, HCRX_EL2) & HCRX_EL2_TCR2En) { - wi->poe = wi->e0poe = false; - return; - } + val = effective_tcr2(vcpu, wi->regime); - val = __vcpu_sys_reg(vcpu, TCR2_EL1); - wi->poe = val & TCR2_EL1_POE; - wi->e0poe = val & TCR2_EL1_E0POE; - } + /* Abuse TCR2_EL1_* for EL2 */ + wi->poe = val & TCR2_EL1_POE; + wi->e0poe = (wi->regime != TR_EL2) && (val & TCR2_EL1_E0POE); } static int setup_s1_walk(struct kvm_vcpu *vcpu, struct s1_walk_info *wi, @@ -102,14 +143,16 @@ static int setup_s1_walk(struct kvm_vcpu *vcpu, struct s1_walk_info *wi, unsigned int stride, x; bool va55, tbi, lva; - hcr = __vcpu_sys_reg(vcpu, HCR_EL2); - va55 = va & BIT(55); - if (wi->regime == TR_EL2 && va55) - goto addrsz; - - wi->s2 = wi->regime == TR_EL10 && (hcr & (HCR_VM | HCR_DC)); + if (vcpu_has_nv(vcpu)) { + hcr = __vcpu_sys_reg(vcpu, HCR_EL2); + wi->s2 = wi->regime == TR_EL10 && (hcr & (HCR_VM | HCR_DC)); + } else { + WARN_ON_ONCE(wi->regime != TR_EL10); + wi->s2 = false; + hcr = 0; + } switch (wi->regime) { case TR_EL10: @@ -131,6 +174,46 @@ static int setup_s1_walk(struct kvm_vcpu *vcpu, struct s1_walk_info *wi, BUG(); } + /* Someone was silly enough to encode TG0/TG1 differently */ + if (va55 && wi->regime != TR_EL2) { + wi->txsz = FIELD_GET(TCR_T1SZ_MASK, tcr); + tg = FIELD_GET(TCR_TG1_MASK, tcr); + + switch (tg << TCR_TG1_SHIFT) { + case TCR_TG1_4K: + wi->pgshift = 12; break; + case TCR_TG1_16K: + wi->pgshift = 14; break; + case TCR_TG1_64K: + default: /* IMPDEF: treat any other value as 64k */ + wi->pgshift = 16; break; + } + } else { + wi->txsz = FIELD_GET(TCR_T0SZ_MASK, tcr); + tg = FIELD_GET(TCR_TG0_MASK, tcr); + + switch (tg << TCR_TG0_SHIFT) { + case TCR_TG0_4K: + wi->pgshift = 12; break; + case TCR_TG0_16K: + wi->pgshift = 14; break; + case TCR_TG0_64K: + default: /* IMPDEF: treat any other value as 64k */ + wi->pgshift = 16; break; + } + } + + wi->pa52bit = has_52bit_pa(vcpu, wi, tcr); + + ia_bits = get_ia_size(wi); + + /* AArch64.S1StartLevel() */ + stride = wi->pgshift - 3; + wi->sl = 3 - (((ia_bits - 1) - wi->pgshift) / stride); + + if (wi->regime == TR_EL2 && va55) + goto addrsz; + tbi = (wi->regime == TR_EL2 ? FIELD_GET(TCR_EL2_TBI, tcr) : (va55 ? @@ -140,6 +223,12 @@ static int setup_s1_walk(struct kvm_vcpu *vcpu, struct s1_walk_info *wi, if (!tbi && (u64)sign_extend64(va, 55) != va) goto addrsz; + wi->sh = (wi->regime == TR_EL2 ? + FIELD_GET(TCR_EL2_SH0_MASK, tcr) : + (va55 ? + FIELD_GET(TCR_SH1_MASK, tcr) : + FIELD_GET(TCR_SH0_MASK, tcr))); + va = (u64)sign_extend64(va, 55); /* Let's put the MMU disabled case aside immediately */ @@ -194,53 +283,20 @@ static int setup_s1_walk(struct kvm_vcpu *vcpu, struct s1_walk_info *wi, /* R_BVXDG */ wi->hpd |= (wi->poe || wi->e0poe); - /* Someone was silly enough to encode TG0/TG1 differently */ - if (va55) { - wi->txsz = FIELD_GET(TCR_T1SZ_MASK, tcr); - tg = FIELD_GET(TCR_TG1_MASK, tcr); - - switch (tg << TCR_TG1_SHIFT) { - case TCR_TG1_4K: - wi->pgshift = 12; break; - case TCR_TG1_16K: - wi->pgshift = 14; break; - case TCR_TG1_64K: - default: /* IMPDEF: treat any other value as 64k */ - wi->pgshift = 16; break; - } - } else { - wi->txsz = FIELD_GET(TCR_T0SZ_MASK, tcr); - tg = FIELD_GET(TCR_TG0_MASK, tcr); - - switch (tg << TCR_TG0_SHIFT) { - case TCR_TG0_4K: - wi->pgshift = 12; break; - case TCR_TG0_16K: - wi->pgshift = 14; break; - case TCR_TG0_64K: - default: /* IMPDEF: treat any other value as 64k */ - wi->pgshift = 16; break; - } - } - /* R_PLCGL, R_YXNYW */ if (!kvm_has_feat_enum(vcpu->kvm, ID_AA64MMFR2_EL1, ST, 48_47)) { if (wi->txsz > 39) - goto transfault_l0; + goto transfault; } else { if (wi->txsz > 48 || (BIT(wi->pgshift) == SZ_64K && wi->txsz > 47)) - goto transfault_l0; + goto transfault; } /* R_GTJBY, R_SXWGM */ switch (BIT(wi->pgshift)) { case SZ_4K: - lva = kvm_has_feat(vcpu->kvm, ID_AA64MMFR0_EL1, TGRAN4, 52_BIT); - lva &= tcr & (wi->regime == TR_EL2 ? TCR_EL2_DS : TCR_DS); - break; case SZ_16K: - lva = kvm_has_feat(vcpu->kvm, ID_AA64MMFR0_EL1, TGRAN16, 52_BIT); - lva &= tcr & (wi->regime == TR_EL2 ? TCR_EL2_DS : TCR_DS); + lva = wi->pa52bit; break; case SZ_64K: lva = kvm_has_feat(vcpu->kvm, ID_AA64MMFR2_EL1, VARange, 52); @@ -248,38 +304,42 @@ static int setup_s1_walk(struct kvm_vcpu *vcpu, struct s1_walk_info *wi, } if ((lva && wi->txsz < 12) || (!lva && wi->txsz < 16)) - goto transfault_l0; - - ia_bits = get_ia_size(wi); + goto transfault; /* R_YYVYV, I_THCZK */ if ((!va55 && va > GENMASK(ia_bits - 1, 0)) || (va55 && va < GENMASK(63, ia_bits))) - goto transfault_l0; + goto transfault; /* I_ZFSYQ */ if (wi->regime != TR_EL2 && (tcr & (va55 ? TCR_EPD1_MASK : TCR_EPD0_MASK))) - goto transfault_l0; + goto transfault; /* R_BNDVG and following statements */ if (kvm_has_feat(vcpu->kvm, ID_AA64MMFR2_EL1, E0PD, IMP) && wi->as_el0 && (tcr & (va55 ? TCR_E0PD1 : TCR_E0PD0))) - goto transfault_l0; - - /* AArch64.S1StartLevel() */ - stride = wi->pgshift - 3; - wi->sl = 3 - (((ia_bits - 1) - wi->pgshift) / stride); + goto transfault; ps = (wi->regime == TR_EL2 ? FIELD_GET(TCR_EL2_PS_MASK, tcr) : FIELD_GET(TCR_IPS_MASK, tcr)); - wi->max_oa_bits = min(get_kvm_ipa_limit(), ps_to_output_size(ps)); + wi->max_oa_bits = min(get_kvm_ipa_limit(), ps_to_output_size(ps, wi->pa52bit)); /* Compute minimal alignment */ x = 3 + ia_bits - ((3 - wi->sl) * stride + wi->pgshift); wi->baddr = ttbr & TTBRx_EL1_BADDR; + if (wi->pa52bit) { + /* + * Force the alignment on 64 bytes for top-level tables + * smaller than 8 entries, since TTBR.BADDR[5:2] are used to + * store bits [51:48] of the first level of lookup. + */ + x = max(x, 6); + + wi->baddr |= FIELD_GET(GENMASK_ULL(5, 2), ttbr) << 48; + } /* R_VPBBF */ if (check_output_size(wi->baddr, wi)) @@ -289,12 +349,17 @@ static int setup_s1_walk(struct kvm_vcpu *vcpu, struct s1_walk_info *wi, return 0; -addrsz: /* Address Size Fault level 0 */ +addrsz: + /* + * Address Size Fault level 0 to indicate it comes from TTBR. + * yes, this is an oddity. + */ fail_s1_walk(wr, ESR_ELx_FSC_ADDRSZ_L(0), false); return -EFAULT; -transfault_l0: /* Translation Fault level 0 */ - fail_s1_walk(wr, ESR_ELx_FSC_FAULT_L(0), false); +transfault: + /* Translation Fault on start level */ + fail_s1_walk(wr, ESR_ELx_FSC_FAULT_L(wi->sl), false); return -EFAULT; } @@ -339,6 +404,17 @@ static int walk_s1(struct kvm_vcpu *vcpu, struct s1_walk_info *wi, ipa = kvm_s2_trans_output(&s2_trans); } + if (wi->filter) { + ret = wi->filter->fn(&(struct s1_walk_context) + { + .wi = wi, + .table_ipa = baddr, + .level = level, + }, wi->filter->priv); + if (ret) + return ret; + } + ret = kvm_read_guest(vcpu->kvm, ipa, &desc, sizeof(desc)); if (ret) { fail_s1_walk(wr, ESR_ELx_FSC_SEA_TTW(level), false); @@ -369,7 +445,7 @@ static int walk_s1(struct kvm_vcpu *vcpu, struct s1_walk_info *wi, wr->PXNTable |= FIELD_GET(PMD_TABLE_PXN, desc); } - baddr = desc & GENMASK_ULL(47, wi->pgshift); + baddr = desc_to_oa(wi, desc); /* Check for out-of-range OA */ if (check_output_size(baddr, wi)) @@ -386,11 +462,11 @@ static int walk_s1(struct kvm_vcpu *vcpu, struct s1_walk_info *wi, switch (BIT(wi->pgshift)) { case SZ_4K: - valid_block = level == 1 || level == 2; + valid_block = level == 1 || level == 2 || (wi->pa52bit && level == 0); break; case SZ_16K: case SZ_64K: - valid_block = level == 2; + valid_block = level == 2 || (wi->pa52bit && level == 1); break; } @@ -398,7 +474,8 @@ static int walk_s1(struct kvm_vcpu *vcpu, struct s1_walk_info *wi, goto transfault; } - if (check_output_size(desc & GENMASK(47, va_bottom), wi)) + baddr = desc_to_oa(wi, desc); + if (check_output_size(baddr & GENMASK(52, va_bottom), wi)) goto addrsz; if (!(desc & PTE_AF)) { @@ -411,7 +488,7 @@ static int walk_s1(struct kvm_vcpu *vcpu, struct s1_walk_info *wi, wr->failed = false; wr->level = level; wr->desc = desc; - wr->pa = desc & GENMASK(47, va_bottom); + wr->pa = baddr & GENMASK(52, va_bottom); wr->pa |= va & GENMASK_ULL(va_bottom - 1, 0); wr->nG = (wi->regime != TR_EL2) && (desc & PTE_NG); @@ -640,21 +717,36 @@ static u8 combine_s1_s2_attr(u8 s1, u8 s2) #define ATTR_OSH 0b10 #define ATTR_ISH 0b11 -static u8 compute_sh(u8 attr, u64 desc) +static u8 compute_final_sh(u8 attr, u8 sh) { - u8 sh; - /* Any form of device, as well as NC has SH[1:0]=0b10 */ if (MEMATTR_IS_DEVICE(attr) || attr == MEMATTR(NC, NC)) return ATTR_OSH; - sh = FIELD_GET(PTE_SHARED, desc); if (sh == ATTR_RSV) /* Reserved, mapped to NSH */ sh = ATTR_NSH; return sh; } +static u8 compute_s1_sh(struct s1_walk_info *wi, struct s1_walk_result *wr, + u8 attr) +{ + u8 sh; + + /* + * non-52bit and LPA have their basic shareability described in the + * descriptor. LPA2 gets it from the corresponding field in TCR, + * conveniently recorded in the walk info. + */ + if (!wi->pa52bit || BIT(wi->pgshift) == SZ_64K) + sh = FIELD_GET(KVM_PTE_LEAF_ATTR_LO_S1_SH, wr->desc); + else + sh = wi->sh; + + return compute_final_sh(attr, sh); +} + static u8 combine_sh(u8 s1_sh, u8 s2_sh) { if (s1_sh == ATTR_OSH || s2_sh == ATTR_OSH) @@ -668,7 +760,7 @@ static u8 combine_sh(u8 s1_sh, u8 s2_sh) static u64 compute_par_s12(struct kvm_vcpu *vcpu, u64 s1_par, struct kvm_s2_trans *tr) { - u8 s1_parattr, s2_memattr, final_attr; + u8 s1_parattr, s2_memattr, final_attr, s2_sh; u64 par; /* If S2 has failed to translate, report the damage */ @@ -741,17 +833,19 @@ static u64 compute_par_s12(struct kvm_vcpu *vcpu, u64 s1_par, !MEMATTR_IS_DEVICE(final_attr)) final_attr = MEMATTR(NC, NC); + s2_sh = FIELD_GET(KVM_PTE_LEAF_ATTR_LO_S2_SH, tr->desc); + par = FIELD_PREP(SYS_PAR_EL1_ATTR, final_attr); par |= tr->output & GENMASK(47, 12); par |= FIELD_PREP(SYS_PAR_EL1_SH, combine_sh(FIELD_GET(SYS_PAR_EL1_SH, s1_par), - compute_sh(final_attr, tr->desc))); + compute_final_sh(final_attr, s2_sh))); return par; } -static u64 compute_par_s1(struct kvm_vcpu *vcpu, struct s1_walk_result *wr, - enum trans_regime regime) +static u64 compute_par_s1(struct kvm_vcpu *vcpu, struct s1_walk_info *wi, + struct s1_walk_result *wr) { u64 par; @@ -764,9 +858,9 @@ static u64 compute_par_s1(struct kvm_vcpu *vcpu, struct s1_walk_result *wr, } else if (wr->level == S1_MMU_DISABLED) { /* MMU off or HCR_EL2.DC == 1 */ par = SYS_PAR_EL1_NSE; - par |= wr->pa & GENMASK_ULL(47, 12); + par |= wr->pa & SYS_PAR_EL1_PA; - if (regime == TR_EL10 && + if (wi->regime == TR_EL10 && vcpu_has_nv(vcpu) && (__vcpu_sys_reg(vcpu, HCR_EL2) & HCR_DC)) { par |= FIELD_PREP(SYS_PAR_EL1_ATTR, MEMATTR(WbRaWa, WbRaWa)); @@ -781,14 +875,14 @@ static u64 compute_par_s1(struct kvm_vcpu *vcpu, struct s1_walk_result *wr, par = SYS_PAR_EL1_NSE; - mair = (regime == TR_EL10 ? + mair = (wi->regime == TR_EL10 ? vcpu_read_sys_reg(vcpu, MAIR_EL1) : vcpu_read_sys_reg(vcpu, MAIR_EL2)); mair >>= FIELD_GET(PTE_ATTRINDX_MASK, wr->desc) * 8; mair &= 0xff; - sctlr = (regime == TR_EL10 ? + sctlr = (wi->regime == TR_EL10 ? vcpu_read_sys_reg(vcpu, SCTLR_EL1) : vcpu_read_sys_reg(vcpu, SCTLR_EL2)); @@ -797,9 +891,9 @@ static u64 compute_par_s1(struct kvm_vcpu *vcpu, struct s1_walk_result *wr, mair = MEMATTR(NC, NC); par |= FIELD_PREP(SYS_PAR_EL1_ATTR, mair); - par |= wr->pa & GENMASK_ULL(47, 12); + par |= wr->pa & SYS_PAR_EL1_PA; - sh = compute_sh(mair, wr->desc); + sh = compute_s1_sh(wi, wr, mair); par |= FIELD_PREP(SYS_PAR_EL1_SH, sh); } @@ -873,7 +967,7 @@ static void compute_s1_direct_permissions(struct kvm_vcpu *vcpu, wxn = (vcpu_read_sys_reg(vcpu, SCTLR_EL2) & SCTLR_ELx_WXN); break; case TR_EL10: - wxn = (__vcpu_sys_reg(vcpu, SCTLR_EL1) & SCTLR_ELx_WXN); + wxn = (vcpu_read_sys_reg(vcpu, SCTLR_EL1) & SCTLR_ELx_WXN); break; } @@ -1186,7 +1280,7 @@ static u64 handle_at_slow(struct kvm_vcpu *vcpu, u32 op, u64 vaddr) fail_s1_walk(&wr, ESR_ELx_FSC_PERM_L(wr.level), false); compute_par: - return compute_par_s1(vcpu, &wr, wi.regime); + return compute_par_s1(vcpu, &wi, &wr); } /* @@ -1202,7 +1296,7 @@ static u64 __kvm_at_s1e01_fast(struct kvm_vcpu *vcpu, u32 op, u64 vaddr) { struct mmu_config config; struct kvm_s2_mmu *mmu; - bool fail; + bool fail, mmu_cs; u64 par; par = SYS_PAR_EL1_F; @@ -1218,8 +1312,13 @@ static u64 __kvm_at_s1e01_fast(struct kvm_vcpu *vcpu, u32 op, u64 vaddr) * If HCR_EL2.{E2H,TGE} == {1,1}, the MMU context is already * the right one (as we trapped from vEL2). If not, save the * full MMU context. + * + * We are also guaranteed to be in the correct context if + * we're not in a nested VM. */ - if (vcpu_el2_e2h_is_set(vcpu) && vcpu_el2_tge_is_set(vcpu)) + mmu_cs = (vcpu_has_nv(vcpu) && + !(vcpu_el2_e2h_is_set(vcpu) && vcpu_el2_tge_is_set(vcpu))); + if (!mmu_cs) goto skip_mmu_switch; /* @@ -1287,7 +1386,7 @@ skip_mmu_switch: write_sysreg_hcr(HCR_HOST_VHE_FLAGS); - if (!(vcpu_el2_e2h_is_set(vcpu) && vcpu_el2_tge_is_set(vcpu))) + if (mmu_cs) __mmu_config_restore(&config); return par; @@ -1470,3 +1569,68 @@ int __kvm_translate_va(struct kvm_vcpu *vcpu, struct s1_walk_info *wi, return 0; } + +struct desc_match { + u64 ipa; + int level; +}; + +static int match_s1_desc(struct s1_walk_context *ctxt, void *priv) +{ + struct desc_match *dm = priv; + u64 ipa = dm->ipa; + + /* Use S1 granule alignment */ + ipa &= GENMASK(51, ctxt->wi->pgshift); + + /* Not the IPA we're looking for? Continue. */ + if (ipa != ctxt->table_ipa) + return 0; + + /* Note the level and interrupt the walk */ + dm->level = ctxt->level; + return -EINTR; +} + +int __kvm_find_s1_desc_level(struct kvm_vcpu *vcpu, u64 va, u64 ipa, int *level) +{ + struct desc_match dm = { + .ipa = ipa, + }; + struct s1_walk_info wi = { + .filter = &(struct s1_walk_filter){ + .fn = match_s1_desc, + .priv = &dm, + }, + .regime = TR_EL10, + .as_el0 = false, + .pan = false, + }; + struct s1_walk_result wr = {}; + int ret; + + ret = setup_s1_walk(vcpu, &wi, &wr, va); + if (ret) + return ret; + + /* We really expect the S1 MMU to be on here... */ + if (WARN_ON_ONCE(wr.level == S1_MMU_DISABLED)) { + *level = 0; + return 0; + } + + /* Walk the guest's PT, looking for a match along the way */ + ret = walk_s1(vcpu, &wi, &wr, va); + switch (ret) { + case -EINTR: + /* We interrupted the walk on a match, return the level */ + *level = dm.level; + return 0; + case 0: + /* The walk completed, we failed to find the entry */ + return -ENOENT; + default: + /* Any other error... */ + return ret; + } +} diff --git a/arch/arm64/kvm/handle_exit.c b/arch/arm64/kvm/handle_exit.c index a598072f36d2..d449e15680e4 100644 --- a/arch/arm64/kvm/handle_exit.c +++ b/arch/arm64/kvm/handle_exit.c @@ -559,6 +559,9 @@ void __noreturn __cold nvhe_hyp_panic_handler(u64 esr, u64 spsr, /* Dump the nVHE hypervisor backtrace */ kvm_nvhe_dump_backtrace(hyp_offset); + /* Dump the faulting instruction */ + dump_kernel_instr(panic_addr + kaslr_offset()); + /* * Hyp has panicked and we're going to handle that by panicking the * kernel. The kernel offset will be revealed in the panic so we're diff --git a/arch/arm64/kvm/hyp/include/nvhe/pkvm.h b/arch/arm64/kvm/hyp/include/nvhe/pkvm.h index ce31d3b73603..184ad7a39950 100644 --- a/arch/arm64/kvm/hyp/include/nvhe/pkvm.h +++ b/arch/arm64/kvm/hyp/include/nvhe/pkvm.h @@ -29,7 +29,7 @@ struct pkvm_hyp_vcpu { }; /* - * Holds the relevant data for running a protected vm. + * Holds the relevant data for running a vm in protected mode. */ struct pkvm_hyp_vm { struct kvm kvm; @@ -67,6 +67,8 @@ static inline bool pkvm_hyp_vm_is_protected(struct pkvm_hyp_vm *hyp_vm) void pkvm_hyp_vm_table_init(void *tbl); +int __pkvm_reserve_vm(void); +void __pkvm_unreserve_vm(pkvm_handle_t handle); int __pkvm_init_vm(struct kvm *host_kvm, unsigned long vm_hva, unsigned long pgd_hva); int __pkvm_init_vcpu(pkvm_handle_t handle, struct kvm_vcpu *host_vcpu, diff --git a/arch/arm64/kvm/hyp/include/nvhe/trap_handler.h b/arch/arm64/kvm/hyp/include/nvhe/trap_handler.h index 1e6d995968a1..ba5382c12787 100644 --- a/arch/arm64/kvm/hyp/include/nvhe/trap_handler.h +++ b/arch/arm64/kvm/hyp/include/nvhe/trap_handler.h @@ -12,7 +12,8 @@ #include <asm/kvm_host.h> #define cpu_reg(ctxt, r) (ctxt)->regs.regs[r] -#define DECLARE_REG(type, name, ctxt, reg) \ +#define DECLARE_REG(type, name, ctxt, reg) \ + __always_unused int ___check_reg_ ## reg; \ type name = (type)cpu_reg(ctxt, (reg)) #endif /* __ARM64_KVM_NVHE_TRAP_HANDLER_H__ */ diff --git a/arch/arm64/kvm/hyp/nvhe/Makefile b/arch/arm64/kvm/hyp/nvhe/Makefile index 0b0a68b663d4..a244ec25f8c5 100644 --- a/arch/arm64/kvm/hyp/nvhe/Makefile +++ b/arch/arm64/kvm/hyp/nvhe/Makefile @@ -27,6 +27,7 @@ hyp-obj-y := timer-sr.o sysreg-sr.o debug-sr.o switch.o tlb.o hyp-init.o host.o cache.o setup.o mm.o mem_protect.o sys_regs.o pkvm.o stacktrace.o ffa.o hyp-obj-y += ../vgic-v3-sr.o ../aarch32.o ../vgic-v2-cpuif-proxy.o ../entry.o \ ../fpsimd.o ../hyp-entry.o ../exception.o ../pgtable.o +hyp-obj-y += ../../../kernel/smccc-call.o hyp-obj-$(CONFIG_LIST_HARDENED) += list_debug.o hyp-obj-y += $(lib-objs) diff --git a/arch/arm64/kvm/hyp/nvhe/ffa.c b/arch/arm64/kvm/hyp/nvhe/ffa.c index 3369dd0c4009..4e16f9b96f63 100644 --- a/arch/arm64/kvm/hyp/nvhe/ffa.c +++ b/arch/arm64/kvm/hyp/nvhe/ffa.c @@ -71,36 +71,68 @@ static u32 hyp_ffa_version; static bool has_version_negotiated; static hyp_spinlock_t version_lock; -static void ffa_to_smccc_error(struct arm_smccc_res *res, u64 ffa_errno) +static void ffa_to_smccc_error(struct arm_smccc_1_2_regs *res, u64 ffa_errno) { - *res = (struct arm_smccc_res) { + *res = (struct arm_smccc_1_2_regs) { .a0 = FFA_ERROR, .a2 = ffa_errno, }; } -static void ffa_to_smccc_res_prop(struct arm_smccc_res *res, int ret, u64 prop) +static void ffa_to_smccc_res_prop(struct arm_smccc_1_2_regs *res, int ret, u64 prop) { if (ret == FFA_RET_SUCCESS) { - *res = (struct arm_smccc_res) { .a0 = FFA_SUCCESS, - .a2 = prop }; + *res = (struct arm_smccc_1_2_regs) { .a0 = FFA_SUCCESS, + .a2 = prop }; } else { ffa_to_smccc_error(res, ret); } } -static void ffa_to_smccc_res(struct arm_smccc_res *res, int ret) +static void ffa_to_smccc_res(struct arm_smccc_1_2_regs *res, int ret) { ffa_to_smccc_res_prop(res, ret, 0); } static void ffa_set_retval(struct kvm_cpu_context *ctxt, - struct arm_smccc_res *res) + struct arm_smccc_1_2_regs *res) { cpu_reg(ctxt, 0) = res->a0; cpu_reg(ctxt, 1) = res->a1; cpu_reg(ctxt, 2) = res->a2; cpu_reg(ctxt, 3) = res->a3; + cpu_reg(ctxt, 4) = res->a4; + cpu_reg(ctxt, 5) = res->a5; + cpu_reg(ctxt, 6) = res->a6; + cpu_reg(ctxt, 7) = res->a7; + + /* + * DEN0028C 2.6: SMC32/HVC32 call from aarch64 must preserve x8-x30. + * + * In FF-A 1.2, we cannot rely on the function ID sent by the caller to + * detect 32-bit calls because the CPU cycle management interfaces (e.g. + * FFA_MSG_WAIT, FFA_RUN) are 32-bit only but can have 64-bit responses. + * + * FFA-1.3 introduces 64-bit variants of the CPU cycle management + * interfaces. Moreover, FF-A 1.3 clarifies that SMC32 direct requests + * complete with SMC32 direct reponses which *should* allow us use the + * function ID sent by the caller to determine whether to return x8-x17. + * + * Note that we also cannot rely on function IDs in the response. + * + * Given the above, assume SMC64 and send back x0-x17 unconditionally + * as the passthrough code (__kvm_hyp_host_forward_smc) does the same. + */ + cpu_reg(ctxt, 8) = res->a8; + cpu_reg(ctxt, 9) = res->a9; + cpu_reg(ctxt, 10) = res->a10; + cpu_reg(ctxt, 11) = res->a11; + cpu_reg(ctxt, 12) = res->a12; + cpu_reg(ctxt, 13) = res->a13; + cpu_reg(ctxt, 14) = res->a14; + cpu_reg(ctxt, 15) = res->a15; + cpu_reg(ctxt, 16) = res->a16; + cpu_reg(ctxt, 17) = res->a17; } static bool is_ffa_call(u64 func_id) @@ -113,82 +145,92 @@ static bool is_ffa_call(u64 func_id) static int ffa_map_hyp_buffers(u64 ffa_page_count) { - struct arm_smccc_res res; + struct arm_smccc_1_2_regs res; - arm_smccc_1_1_smc(FFA_FN64_RXTX_MAP, - hyp_virt_to_phys(hyp_buffers.tx), - hyp_virt_to_phys(hyp_buffers.rx), - ffa_page_count, - 0, 0, 0, 0, - &res); + arm_smccc_1_2_smc(&(struct arm_smccc_1_2_regs) { + .a0 = FFA_FN64_RXTX_MAP, + .a1 = hyp_virt_to_phys(hyp_buffers.tx), + .a2 = hyp_virt_to_phys(hyp_buffers.rx), + .a3 = ffa_page_count, + }, &res); return res.a0 == FFA_SUCCESS ? FFA_RET_SUCCESS : res.a2; } static int ffa_unmap_hyp_buffers(void) { - struct arm_smccc_res res; + struct arm_smccc_1_2_regs res; - arm_smccc_1_1_smc(FFA_RXTX_UNMAP, - HOST_FFA_ID, - 0, 0, 0, 0, 0, 0, - &res); + arm_smccc_1_2_smc(&(struct arm_smccc_1_2_regs) { + .a0 = FFA_RXTX_UNMAP, + .a1 = HOST_FFA_ID, + }, &res); return res.a0 == FFA_SUCCESS ? FFA_RET_SUCCESS : res.a2; } -static void ffa_mem_frag_tx(struct arm_smccc_res *res, u32 handle_lo, +static void ffa_mem_frag_tx(struct arm_smccc_1_2_regs *res, u32 handle_lo, u32 handle_hi, u32 fraglen, u32 endpoint_id) { - arm_smccc_1_1_smc(FFA_MEM_FRAG_TX, - handle_lo, handle_hi, fraglen, endpoint_id, - 0, 0, 0, - res); + arm_smccc_1_2_smc(&(struct arm_smccc_1_2_regs) { + .a0 = FFA_MEM_FRAG_TX, + .a1 = handle_lo, + .a2 = handle_hi, + .a3 = fraglen, + .a4 = endpoint_id, + }, res); } -static void ffa_mem_frag_rx(struct arm_smccc_res *res, u32 handle_lo, +static void ffa_mem_frag_rx(struct arm_smccc_1_2_regs *res, u32 handle_lo, u32 handle_hi, u32 fragoff) { - arm_smccc_1_1_smc(FFA_MEM_FRAG_RX, - handle_lo, handle_hi, fragoff, HOST_FFA_ID, - 0, 0, 0, - res); + arm_smccc_1_2_smc(&(struct arm_smccc_1_2_regs) { + .a0 = FFA_MEM_FRAG_RX, + .a1 = handle_lo, + .a2 = handle_hi, + .a3 = fragoff, + .a4 = HOST_FFA_ID, + }, res); } -static void ffa_mem_xfer(struct arm_smccc_res *res, u64 func_id, u32 len, +static void ffa_mem_xfer(struct arm_smccc_1_2_regs *res, u64 func_id, u32 len, u32 fraglen) { - arm_smccc_1_1_smc(func_id, len, fraglen, - 0, 0, 0, 0, 0, - res); + arm_smccc_1_2_smc(&(struct arm_smccc_1_2_regs) { + .a0 = func_id, + .a1 = len, + .a2 = fraglen, + }, res); } -static void ffa_mem_reclaim(struct arm_smccc_res *res, u32 handle_lo, +static void ffa_mem_reclaim(struct arm_smccc_1_2_regs *res, u32 handle_lo, u32 handle_hi, u32 flags) { - arm_smccc_1_1_smc(FFA_MEM_RECLAIM, - handle_lo, handle_hi, flags, - 0, 0, 0, 0, - res); + arm_smccc_1_2_smc(&(struct arm_smccc_1_2_regs) { + .a0 = FFA_MEM_RECLAIM, + .a1 = handle_lo, + .a2 = handle_hi, + .a3 = flags, + }, res); } -static void ffa_retrieve_req(struct arm_smccc_res *res, u32 len) +static void ffa_retrieve_req(struct arm_smccc_1_2_regs *res, u32 len) { - arm_smccc_1_1_smc(FFA_FN64_MEM_RETRIEVE_REQ, - len, len, - 0, 0, 0, 0, 0, - res); + arm_smccc_1_2_smc(&(struct arm_smccc_1_2_regs) { + .a0 = FFA_FN64_MEM_RETRIEVE_REQ, + .a1 = len, + .a2 = len, + }, res); } -static void ffa_rx_release(struct arm_smccc_res *res) +static void ffa_rx_release(struct arm_smccc_1_2_regs *res) { - arm_smccc_1_1_smc(FFA_RX_RELEASE, - 0, 0, - 0, 0, 0, 0, 0, - res); + arm_smccc_1_2_smc(&(struct arm_smccc_1_2_regs) { + .a0 = FFA_RX_RELEASE, + }, res); } -static void do_ffa_rxtx_map(struct arm_smccc_res *res, +static void do_ffa_rxtx_map(struct arm_smccc_1_2_regs *res, struct kvm_cpu_context *ctxt) { DECLARE_REG(phys_addr_t, tx, ctxt, 1); @@ -267,7 +309,7 @@ err_unmap: goto out_unlock; } -static void do_ffa_rxtx_unmap(struct arm_smccc_res *res, +static void do_ffa_rxtx_unmap(struct arm_smccc_1_2_regs *res, struct kvm_cpu_context *ctxt) { DECLARE_REG(u32, id, ctxt, 1); @@ -368,7 +410,7 @@ static int ffa_host_unshare_ranges(struct ffa_mem_region_addr_range *ranges, return ret; } -static void do_ffa_mem_frag_tx(struct arm_smccc_res *res, +static void do_ffa_mem_frag_tx(struct arm_smccc_1_2_regs *res, struct kvm_cpu_context *ctxt) { DECLARE_REG(u32, handle_lo, ctxt, 1); @@ -427,7 +469,7 @@ out: } static void __do_ffa_mem_xfer(const u64 func_id, - struct arm_smccc_res *res, + struct arm_smccc_1_2_regs *res, struct kvm_cpu_context *ctxt) { DECLARE_REG(u32, len, ctxt, 1); @@ -521,7 +563,7 @@ err_unshare: __do_ffa_mem_xfer((fid), (res), (ctxt)); \ } while (0); -static void do_ffa_mem_reclaim(struct arm_smccc_res *res, +static void do_ffa_mem_reclaim(struct arm_smccc_1_2_regs *res, struct kvm_cpu_context *ctxt) { DECLARE_REG(u32, handle_lo, ctxt, 1); @@ -628,13 +670,26 @@ static bool ffa_call_supported(u64 func_id) case FFA_RXTX_MAP: case FFA_MEM_DONATE: case FFA_MEM_RETRIEVE_REQ: + /* Optional notification interfaces added in FF-A 1.1 */ + case FFA_NOTIFICATION_BITMAP_CREATE: + case FFA_NOTIFICATION_BITMAP_DESTROY: + case FFA_NOTIFICATION_BIND: + case FFA_NOTIFICATION_UNBIND: + case FFA_NOTIFICATION_SET: + case FFA_NOTIFICATION_GET: + case FFA_NOTIFICATION_INFO_GET: + /* Optional interfaces added in FF-A 1.2 */ + case FFA_MSG_SEND_DIRECT_REQ2: /* Optional per 7.5.1 */ + case FFA_MSG_SEND_DIRECT_RESP2: /* Optional per 7.5.1 */ + case FFA_CONSOLE_LOG: /* Optional per 13.1: not in Table 13.1 */ + case FFA_PARTITION_INFO_GET_REGS: /* Optional for virtual instances per 13.1 */ return false; } return true; } -static bool do_ffa_features(struct arm_smccc_res *res, +static bool do_ffa_features(struct arm_smccc_1_2_regs *res, struct kvm_cpu_context *ctxt) { DECLARE_REG(u32, id, ctxt, 1); @@ -666,21 +721,25 @@ out_handled: static int hyp_ffa_post_init(void) { size_t min_rxtx_sz; - struct arm_smccc_res res; + struct arm_smccc_1_2_regs res; - arm_smccc_1_1_smc(FFA_ID_GET, 0, 0, 0, 0, 0, 0, 0, &res); + arm_smccc_1_2_smc(&(struct arm_smccc_1_2_regs){ + .a0 = FFA_ID_GET, + }, &res); if (res.a0 != FFA_SUCCESS) return -EOPNOTSUPP; if (res.a2 != HOST_FFA_ID) return -EINVAL; - arm_smccc_1_1_smc(FFA_FEATURES, FFA_FN64_RXTX_MAP, - 0, 0, 0, 0, 0, 0, &res); + arm_smccc_1_2_smc(&(struct arm_smccc_1_2_regs){ + .a0 = FFA_FEATURES, + .a1 = FFA_FN64_RXTX_MAP, + }, &res); if (res.a0 != FFA_SUCCESS) return -EOPNOTSUPP; - switch (res.a2) { + switch (res.a2 & FFA_FEAT_RXTX_MIN_SZ_MASK) { case FFA_FEAT_RXTX_MIN_SZ_4K: min_rxtx_sz = SZ_4K; break; @@ -700,7 +759,7 @@ static int hyp_ffa_post_init(void) return 0; } -static void do_ffa_version(struct arm_smccc_res *res, +static void do_ffa_version(struct arm_smccc_1_2_regs *res, struct kvm_cpu_context *ctxt) { DECLARE_REG(u32, ffa_req_version, ctxt, 1); @@ -712,7 +771,10 @@ static void do_ffa_version(struct arm_smccc_res *res, hyp_spin_lock(&version_lock); if (has_version_negotiated) { - res->a0 = hyp_ffa_version; + if (FFA_MINOR_VERSION(ffa_req_version) < FFA_MINOR_VERSION(hyp_ffa_version)) + res->a0 = FFA_RET_NOT_SUPPORTED; + else + res->a0 = hyp_ffa_version; goto unlock; } @@ -721,9 +783,10 @@ static void do_ffa_version(struct arm_smccc_res *res, * first if TEE supports it. */ if (FFA_MINOR_VERSION(ffa_req_version) < FFA_MINOR_VERSION(hyp_ffa_version)) { - arm_smccc_1_1_smc(FFA_VERSION, ffa_req_version, 0, - 0, 0, 0, 0, 0, - res); + arm_smccc_1_2_smc(&(struct arm_smccc_1_2_regs) { + .a0 = FFA_VERSION, + .a1 = ffa_req_version, + }, res); if (res->a0 == FFA_RET_NOT_SUPPORTED) goto unlock; @@ -740,7 +803,7 @@ unlock: hyp_spin_unlock(&version_lock); } -static void do_ffa_part_get(struct arm_smccc_res *res, +static void do_ffa_part_get(struct arm_smccc_1_2_regs *res, struct kvm_cpu_context *ctxt) { DECLARE_REG(u32, uuid0, ctxt, 1); @@ -756,9 +819,14 @@ static void do_ffa_part_get(struct arm_smccc_res *res, goto out_unlock; } - arm_smccc_1_1_smc(FFA_PARTITION_INFO_GET, uuid0, uuid1, - uuid2, uuid3, flags, 0, 0, - res); + arm_smccc_1_2_smc(&(struct arm_smccc_1_2_regs) { + .a0 = FFA_PARTITION_INFO_GET, + .a1 = uuid0, + .a2 = uuid1, + .a3 = uuid2, + .a4 = uuid3, + .a5 = flags, + }, res); if (res->a0 != FFA_SUCCESS) goto out_unlock; @@ -791,7 +859,7 @@ out_unlock: bool kvm_host_ffa_handler(struct kvm_cpu_context *host_ctxt, u32 func_id) { - struct arm_smccc_res res; + struct arm_smccc_1_2_regs res; /* * There's no way we can tell what a non-standard SMC call might @@ -860,13 +928,16 @@ out_handled: int hyp_ffa_init(void *pages) { - struct arm_smccc_res res; + struct arm_smccc_1_2_regs res; void *tx, *rx; if (kvm_host_psci_config.smccc_version < ARM_SMCCC_VERSION_1_2) return 0; - arm_smccc_1_1_smc(FFA_VERSION, FFA_VERSION_1_1, 0, 0, 0, 0, 0, 0, &res); + arm_smccc_1_2_smc(&(struct arm_smccc_1_2_regs) { + .a0 = FFA_VERSION, + .a1 = FFA_VERSION_1_2, + }, &res); if (res.a0 == FFA_RET_NOT_SUPPORTED) return 0; @@ -886,10 +957,10 @@ int hyp_ffa_init(void *pages) if (FFA_MAJOR_VERSION(res.a0) != 1) return -EOPNOTSUPP; - if (FFA_MINOR_VERSION(res.a0) < FFA_MINOR_VERSION(FFA_VERSION_1_1)) + if (FFA_MINOR_VERSION(res.a0) < FFA_MINOR_VERSION(FFA_VERSION_1_2)) hyp_ffa_version = res.a0; else - hyp_ffa_version = FFA_VERSION_1_1; + hyp_ffa_version = FFA_VERSION_1_2; tx = pages; pages += KVM_FFA_MBOX_NR_PAGES * PAGE_SIZE; diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-main.c b/arch/arm64/kvm/hyp/nvhe/hyp-main.c index 3206b2c07f82..29430c031095 100644 --- a/arch/arm64/kvm/hyp/nvhe/hyp-main.c +++ b/arch/arm64/kvm/hyp/nvhe/hyp-main.c @@ -546,6 +546,18 @@ static void handle___pkvm_prot_finalize(struct kvm_cpu_context *host_ctxt) cpu_reg(host_ctxt, 1) = __pkvm_prot_finalize(); } +static void handle___pkvm_reserve_vm(struct kvm_cpu_context *host_ctxt) +{ + cpu_reg(host_ctxt, 1) = __pkvm_reserve_vm(); +} + +static void handle___pkvm_unreserve_vm(struct kvm_cpu_context *host_ctxt) +{ + DECLARE_REG(pkvm_handle_t, handle, host_ctxt, 1); + + __pkvm_unreserve_vm(handle); +} + static void handle___pkvm_init_vm(struct kvm_cpu_context *host_ctxt) { DECLARE_REG(struct kvm *, host_kvm, host_ctxt, 1); @@ -606,6 +618,8 @@ static const hcall_t host_hcall[] = { HANDLE_FUNC(__kvm_timer_set_cntvoff), HANDLE_FUNC(__vgic_v3_save_vmcr_aprs), HANDLE_FUNC(__vgic_v3_restore_vmcr_aprs), + HANDLE_FUNC(__pkvm_reserve_vm), + HANDLE_FUNC(__pkvm_unreserve_vm), HANDLE_FUNC(__pkvm_init_vm), HANDLE_FUNC(__pkvm_init_vcpu), HANDLE_FUNC(__pkvm_teardown_vm), diff --git a/arch/arm64/kvm/hyp/nvhe/pkvm.c b/arch/arm64/kvm/hyp/nvhe/pkvm.c index 338505cb0171..05774aed09cb 100644 --- a/arch/arm64/kvm/hyp/nvhe/pkvm.c +++ b/arch/arm64/kvm/hyp/nvhe/pkvm.c @@ -23,8 +23,8 @@ unsigned int kvm_arm_vmid_bits; unsigned int kvm_host_sve_max_vl; /* - * The currently loaded hyp vCPU for each physical CPU. Used only when - * protected KVM is enabled, but for both protected and non-protected VMs. + * The currently loaded hyp vCPU for each physical CPU. Used in protected mode + * for both protected and non-protected VMs. */ static DEFINE_PER_CPU(struct pkvm_hyp_vcpu *, loaded_hyp_vcpu); @@ -135,7 +135,7 @@ static int pkvm_check_pvm_cpu_features(struct kvm_vcpu *vcpu) { struct kvm *kvm = vcpu->kvm; - /* Protected KVM does not support AArch32 guests. */ + /* No AArch32 support for protected guests. */ if (kvm_has_feat(kvm, ID_AA64PFR0_EL1, EL0, AARCH32) || kvm_has_feat(kvm, ID_AA64PFR0_EL1, EL1, AARCH32)) return -EINVAL; @@ -192,6 +192,11 @@ static int pkvm_vcpu_init_traps(struct pkvm_hyp_vcpu *hyp_vcpu) */ #define HANDLE_OFFSET 0x1000 +/* + * Marks a reserved but not yet used entry in the VM table. + */ +#define RESERVED_ENTRY ((void *)0xa110ca7ed) + static unsigned int vm_handle_to_idx(pkvm_handle_t handle) { return handle - HANDLE_OFFSET; @@ -210,8 +215,8 @@ static pkvm_handle_t idx_to_vm_handle(unsigned int idx) DEFINE_HYP_SPINLOCK(vm_table_lock); /* - * The table of VM entries for protected VMs in hyp. - * Allocated at hyp initialization and setup. + * A table that tracks all VMs in protected mode. + * Allocated during hyp initialization and setup. */ static struct pkvm_hyp_vm **vm_table; @@ -231,6 +236,10 @@ static struct pkvm_hyp_vm *get_vm_by_handle(pkvm_handle_t handle) if (unlikely(idx >= KVM_MAX_PVMS)) return NULL; + /* A reserved entry doesn't represent an initialized VM. */ + if (unlikely(vm_table[idx] == RESERVED_ENTRY)) + return NULL; + return vm_table[idx]; } @@ -401,14 +410,26 @@ static void unpin_host_vcpus(struct pkvm_hyp_vcpu *hyp_vcpus[], } static void init_pkvm_hyp_vm(struct kvm *host_kvm, struct pkvm_hyp_vm *hyp_vm, - unsigned int nr_vcpus) + unsigned int nr_vcpus, pkvm_handle_t handle) { + struct kvm_s2_mmu *mmu = &hyp_vm->kvm.arch.mmu; + int idx = vm_handle_to_idx(handle); + + hyp_vm->kvm.arch.pkvm.handle = handle; + hyp_vm->host_kvm = host_kvm; hyp_vm->kvm.created_vcpus = nr_vcpus; - hyp_vm->kvm.arch.mmu.vtcr = host_mmu.arch.mmu.vtcr; - hyp_vm->kvm.arch.pkvm.enabled = READ_ONCE(host_kvm->arch.pkvm.enabled); + hyp_vm->kvm.arch.pkvm.is_protected = READ_ONCE(host_kvm->arch.pkvm.is_protected); + hyp_vm->kvm.arch.pkvm.is_created = true; hyp_vm->kvm.arch.flags = 0; pkvm_init_features_from_host(hyp_vm, host_kvm); + + /* VMID 0 is reserved for the host */ + atomic64_set(&mmu->vmid.id, idx + 1); + + mmu->vtcr = host_mmu.arch.mmu.vtcr; + mmu->arch = &hyp_vm->kvm.arch; + mmu->pgt = &hyp_vm->pgt; } static int pkvm_vcpu_init_sve(struct pkvm_hyp_vcpu *hyp_vcpu, struct kvm_vcpu *host_vcpu) @@ -480,7 +501,7 @@ done: return ret; } -static int find_free_vm_table_entry(struct kvm *host_kvm) +static int find_free_vm_table_entry(void) { int i; @@ -493,15 +514,13 @@ static int find_free_vm_table_entry(struct kvm *host_kvm) } /* - * Allocate a VM table entry and insert a pointer to the new vm. + * Reserve a VM table entry. * - * Return a unique handle to the protected VM on success, + * Return a unique handle to the VM on success, * negative error code on failure. */ -static pkvm_handle_t insert_vm_table_entry(struct kvm *host_kvm, - struct pkvm_hyp_vm *hyp_vm) +static int allocate_vm_table_entry(void) { - struct kvm_s2_mmu *mmu = &hyp_vm->kvm.arch.mmu; int idx; hyp_assert_lock_held(&vm_table_lock); @@ -514,20 +533,57 @@ static pkvm_handle_t insert_vm_table_entry(struct kvm *host_kvm, if (unlikely(!vm_table)) return -EINVAL; - idx = find_free_vm_table_entry(host_kvm); - if (idx < 0) + idx = find_free_vm_table_entry(); + if (unlikely(idx < 0)) return idx; - hyp_vm->kvm.arch.pkvm.handle = idx_to_vm_handle(idx); + vm_table[idx] = RESERVED_ENTRY; - /* VMID 0 is reserved for the host */ - atomic64_set(&mmu->vmid.id, idx + 1); + return idx; +} - mmu->arch = &hyp_vm->kvm.arch; - mmu->pgt = &hyp_vm->pgt; +static int __insert_vm_table_entry(pkvm_handle_t handle, + struct pkvm_hyp_vm *hyp_vm) +{ + unsigned int idx; + + hyp_assert_lock_held(&vm_table_lock); + + /* + * Initializing protected state might have failed, yet a malicious + * host could trigger this function. Thus, ensure that 'vm_table' + * exists. + */ + if (unlikely(!vm_table)) + return -EINVAL; + + idx = vm_handle_to_idx(handle); + if (unlikely(idx >= KVM_MAX_PVMS)) + return -EINVAL; + + if (unlikely(vm_table[idx] != RESERVED_ENTRY)) + return -EINVAL; vm_table[idx] = hyp_vm; - return hyp_vm->kvm.arch.pkvm.handle; + + return 0; +} + +/* + * Insert a pointer to the initialized VM into the VM table. + * + * Return 0 on success, or negative error code on failure. + */ +static int insert_vm_table_entry(pkvm_handle_t handle, + struct pkvm_hyp_vm *hyp_vm) +{ + int ret; + + hyp_spin_lock(&vm_table_lock); + ret = __insert_vm_table_entry(handle, hyp_vm); + hyp_spin_unlock(&vm_table_lock); + + return ret; } /* @@ -594,10 +650,45 @@ static void unmap_donated_memory_noclear(void *va, size_t size) } /* - * Initialize the hypervisor copy of the protected VM state using the - * memory donated by the host. + * Reserves an entry in the hypervisor for a new VM in protected mode. * - * Unmaps the donated memory from the host at stage 2. + * Return a unique handle to the VM on success, negative error code on failure. + */ +int __pkvm_reserve_vm(void) +{ + int ret; + + hyp_spin_lock(&vm_table_lock); + ret = allocate_vm_table_entry(); + hyp_spin_unlock(&vm_table_lock); + + if (ret < 0) + return ret; + + return idx_to_vm_handle(ret); +} + +/* + * Removes a reserved entry, but only if is hasn't been used yet. + * Otherwise, the VM needs to be destroyed. + */ +void __pkvm_unreserve_vm(pkvm_handle_t handle) +{ + unsigned int idx = vm_handle_to_idx(handle); + + if (unlikely(!vm_table)) + return; + + hyp_spin_lock(&vm_table_lock); + if (likely(idx < KVM_MAX_PVMS && vm_table[idx] == RESERVED_ENTRY)) + remove_vm_table_entry(handle); + hyp_spin_unlock(&vm_table_lock); +} + +/* + * Initialize the hypervisor copy of the VM state using host-donated memory. + * + * Unmap the donated memory from the host at stage 2. * * host_kvm: A pointer to the host's struct kvm. * vm_hva: The host va of the area being donated for the VM state. @@ -606,8 +697,7 @@ static void unmap_donated_memory_noclear(void *va, size_t size) * the VM. Must be page aligned. Its size is implied by the VM's * VTCR. * - * Return a unique handle to the protected VM on success, - * negative error code on failure. + * Return 0 success, negative error code on failure. */ int __pkvm_init_vm(struct kvm *host_kvm, unsigned long vm_hva, unsigned long pgd_hva) @@ -615,6 +705,7 @@ int __pkvm_init_vm(struct kvm *host_kvm, unsigned long vm_hva, struct pkvm_hyp_vm *hyp_vm = NULL; size_t vm_size, pgd_size; unsigned int nr_vcpus; + pkvm_handle_t handle; void *pgd = NULL; int ret; @@ -628,6 +719,12 @@ int __pkvm_init_vm(struct kvm *host_kvm, unsigned long vm_hva, goto err_unpin_kvm; } + handle = READ_ONCE(host_kvm->arch.pkvm.handle); + if (unlikely(handle < HANDLE_OFFSET)) { + ret = -EINVAL; + goto err_unpin_kvm; + } + vm_size = pkvm_get_hyp_vm_size(nr_vcpus); pgd_size = kvm_pgtable_stage2_pgd_size(host_mmu.arch.mmu.vtcr); @@ -641,24 +738,19 @@ int __pkvm_init_vm(struct kvm *host_kvm, unsigned long vm_hva, if (!pgd) goto err_remove_mappings; - init_pkvm_hyp_vm(host_kvm, hyp_vm, nr_vcpus); - - hyp_spin_lock(&vm_table_lock); - ret = insert_vm_table_entry(host_kvm, hyp_vm); - if (ret < 0) - goto err_unlock; + init_pkvm_hyp_vm(host_kvm, hyp_vm, nr_vcpus, handle); ret = kvm_guest_prepare_stage2(hyp_vm, pgd); if (ret) - goto err_remove_vm_table_entry; - hyp_spin_unlock(&vm_table_lock); + goto err_remove_mappings; - return hyp_vm->kvm.arch.pkvm.handle; + /* Must be called last since this publishes the VM. */ + ret = insert_vm_table_entry(handle, hyp_vm); + if (ret) + goto err_remove_mappings; + + return 0; -err_remove_vm_table_entry: - remove_vm_table_entry(hyp_vm->kvm.arch.pkvm.handle); -err_unlock: - hyp_spin_unlock(&vm_table_lock); err_remove_mappings: unmap_donated_memory(hyp_vm, vm_size); unmap_donated_memory(pgd, pgd_size); @@ -668,10 +760,9 @@ err_unpin_kvm: } /* - * Initialize the hypervisor copy of the protected vCPU state using the - * memory donated by the host. + * Initialize the hypervisor copy of the vCPU state using host-donated memory. * - * handle: The handle for the protected vm. + * handle: The hypervisor handle for the vm. * host_vcpu: A pointer to the corresponding host vcpu. * vcpu_hva: The host va of the area being donated for the vcpu state. * Must be page aligned. The size of the area must be equal to diff --git a/arch/arm64/kvm/hyp/nvhe/setup.c b/arch/arm64/kvm/hyp/nvhe/setup.c index a48d3f5a5afb..90bd014e952f 100644 --- a/arch/arm64/kvm/hyp/nvhe/setup.c +++ b/arch/arm64/kvm/hyp/nvhe/setup.c @@ -192,6 +192,7 @@ static int fix_host_ownership_walker(const struct kvm_pgtable_visit_ctx *ctx, enum pkvm_page_state state; struct hyp_page *page; phys_addr_t phys; + enum kvm_pgtable_prot prot; if (!kvm_pte_valid(ctx->old)) return 0; @@ -210,11 +211,18 @@ static int fix_host_ownership_walker(const struct kvm_pgtable_visit_ctx *ctx, * configured in the hypervisor stage-1, and make sure to propagate them * to the hyp_vmemmap state. */ - state = pkvm_getstate(kvm_pgtable_hyp_pte_prot(ctx->old)); + prot = kvm_pgtable_hyp_pte_prot(ctx->old); + state = pkvm_getstate(prot); switch (state) { case PKVM_PAGE_OWNED: set_hyp_state(page, PKVM_PAGE_OWNED); - return host_stage2_set_owner_locked(phys, PAGE_SIZE, PKVM_ID_HYP); + /* hyp text is RO in the host stage-2 to be inspected on panic. */ + if (prot == PAGE_HYP_EXEC) { + set_host_state(page, PKVM_NOPAGE); + return host_stage2_idmap_locked(phys, PAGE_SIZE, KVM_PGTABLE_PROT_R); + } else { + return host_stage2_set_owner_locked(phys, PAGE_SIZE, PKVM_ID_HYP); + } case PKVM_PAGE_SHARED_OWNED: set_hyp_state(page, PKVM_PAGE_SHARED_OWNED); set_host_state(page, PKVM_PAGE_SHARED_BORROWED); diff --git a/arch/arm64/kvm/inject_fault.c b/arch/arm64/kvm/inject_fault.c index 6745f38b64f9..dfcd66c65517 100644 --- a/arch/arm64/kvm/inject_fault.c +++ b/arch/arm64/kvm/inject_fault.c @@ -106,7 +106,30 @@ static void inject_abt64(struct kvm_vcpu *vcpu, bool is_iabt, unsigned long addr { unsigned long cpsr = *vcpu_cpsr(vcpu); bool is_aarch32 = vcpu_mode_is_32bit(vcpu); - u64 esr = 0; + u64 esr = 0, fsc; + int level; + + /* + * If injecting an abort from a failed S1PTW, rewalk the S1 PTs to + * find the failing level. If we can't find it, assume the error was + * transient and restart without changing the state. + */ + if (kvm_vcpu_abt_iss1tw(vcpu)) { + u64 hpfar = kvm_vcpu_get_fault_ipa(vcpu); + int ret; + + if (hpfar == INVALID_GPA) + return; + + ret = __kvm_find_s1_desc_level(vcpu, addr, hpfar, &level); + if (ret) + return; + + WARN_ON_ONCE(level < -1 || level > 3); + fsc = ESR_ELx_FSC_SEA_TTW(level); + } else { + fsc = ESR_ELx_FSC_EXTABT; + } /* This delight is brought to you by FEAT_DoubleFault2. */ if (effective_sctlr2_ease(vcpu)) @@ -133,7 +156,7 @@ static void inject_abt64(struct kvm_vcpu *vcpu, bool is_iabt, unsigned long addr if (!is_iabt) esr |= ESR_ELx_EC_DABT_LOW << ESR_ELx_EC_SHIFT; - esr |= ESR_ELx_FSC_EXTABT; + esr |= fsc; vcpu_write_sys_reg(vcpu, addr, exception_far_elx(vcpu)); vcpu_write_sys_reg(vcpu, esr, exception_esr_elx(vcpu)); diff --git a/arch/arm64/kvm/nested.c b/arch/arm64/kvm/nested.c index 77db81bae86f..cb36974e010a 100644 --- a/arch/arm64/kvm/nested.c +++ b/arch/arm64/kvm/nested.c @@ -349,7 +349,7 @@ static void vtcr_to_walk_info(u64 vtcr, struct s2_walk_info *wi) wi->sl = FIELD_GET(VTCR_EL2_SL0_MASK, vtcr); /* Global limit for now, should eventually be per-VM */ wi->max_oa_bits = min(get_kvm_ipa_limit(), - ps_to_output_size(FIELD_GET(VTCR_EL2_PS_MASK, vtcr))); + ps_to_output_size(FIELD_GET(VTCR_EL2_PS_MASK, vtcr), false)); } int kvm_walk_nested_s2(struct kvm_vcpu *vcpu, phys_addr_t gipa, diff --git a/arch/arm64/kvm/pkvm.c b/arch/arm64/kvm/pkvm.c index 61827cf6fea4..d7a0f69a9982 100644 --- a/arch/arm64/kvm/pkvm.c +++ b/arch/arm64/kvm/pkvm.c @@ -85,16 +85,23 @@ void __init kvm_hyp_reserve(void) hyp_mem_base); } -static void __pkvm_destroy_hyp_vm(struct kvm *host_kvm) +static void __pkvm_destroy_hyp_vm(struct kvm *kvm) { - if (host_kvm->arch.pkvm.handle) { + if (pkvm_hyp_vm_is_created(kvm)) { WARN_ON(kvm_call_hyp_nvhe(__pkvm_teardown_vm, - host_kvm->arch.pkvm.handle)); + kvm->arch.pkvm.handle)); + } else if (kvm->arch.pkvm.handle) { + /* + * The VM could have been reserved but hyp initialization has + * failed. Make sure to unreserve it. + */ + kvm_call_hyp_nvhe(__pkvm_unreserve_vm, kvm->arch.pkvm.handle); } - host_kvm->arch.pkvm.handle = 0; - free_hyp_memcache(&host_kvm->arch.pkvm.teardown_mc); - free_hyp_memcache(&host_kvm->arch.pkvm.stage2_teardown_mc); + kvm->arch.pkvm.handle = 0; + kvm->arch.pkvm.is_created = false; + free_hyp_memcache(&kvm->arch.pkvm.teardown_mc); + free_hyp_memcache(&kvm->arch.pkvm.stage2_teardown_mc); } static int __pkvm_create_hyp_vcpu(struct kvm_vcpu *vcpu) @@ -129,16 +136,16 @@ static int __pkvm_create_hyp_vcpu(struct kvm_vcpu *vcpu) * * Return 0 on success, negative error code on failure. */ -static int __pkvm_create_hyp_vm(struct kvm *host_kvm) +static int __pkvm_create_hyp_vm(struct kvm *kvm) { size_t pgd_sz, hyp_vm_sz; void *pgd, *hyp_vm; int ret; - if (host_kvm->created_vcpus < 1) + if (kvm->created_vcpus < 1) return -EINVAL; - pgd_sz = kvm_pgtable_stage2_pgd_size(host_kvm->arch.mmu.vtcr); + pgd_sz = kvm_pgtable_stage2_pgd_size(kvm->arch.mmu.vtcr); /* * The PGD pages will be reclaimed using a hyp_memcache which implies @@ -152,7 +159,7 @@ static int __pkvm_create_hyp_vm(struct kvm *host_kvm) /* Allocate memory to donate to hyp for vm and vcpu pointers. */ hyp_vm_sz = PAGE_ALIGN(size_add(PKVM_HYP_VM_SIZE, size_mul(sizeof(void *), - host_kvm->created_vcpus))); + kvm->created_vcpus))); hyp_vm = alloc_pages_exact(hyp_vm_sz, GFP_KERNEL_ACCOUNT); if (!hyp_vm) { ret = -ENOMEM; @@ -160,12 +167,12 @@ static int __pkvm_create_hyp_vm(struct kvm *host_kvm) } /* Donate the VM memory to hyp and let hyp initialize it. */ - ret = kvm_call_hyp_nvhe(__pkvm_init_vm, host_kvm, hyp_vm, pgd); - if (ret < 0) + ret = kvm_call_hyp_nvhe(__pkvm_init_vm, kvm, hyp_vm, pgd); + if (ret) goto free_vm; - host_kvm->arch.pkvm.handle = ret; - host_kvm->arch.pkvm.stage2_teardown_mc.flags |= HYP_MEMCACHE_ACCOUNT_STAGE2; + kvm->arch.pkvm.is_created = true; + kvm->arch.pkvm.stage2_teardown_mc.flags |= HYP_MEMCACHE_ACCOUNT_STAGE2; kvm_account_pgtable_pages(pgd, pgd_sz / PAGE_SIZE); return 0; @@ -176,14 +183,19 @@ free_pgd: return ret; } -int pkvm_create_hyp_vm(struct kvm *host_kvm) +bool pkvm_hyp_vm_is_created(struct kvm *kvm) +{ + return READ_ONCE(kvm->arch.pkvm.is_created); +} + +int pkvm_create_hyp_vm(struct kvm *kvm) { int ret = 0; - mutex_lock(&host_kvm->arch.config_lock); - if (!host_kvm->arch.pkvm.handle) - ret = __pkvm_create_hyp_vm(host_kvm); - mutex_unlock(&host_kvm->arch.config_lock); + mutex_lock(&kvm->arch.config_lock); + if (!pkvm_hyp_vm_is_created(kvm)) + ret = __pkvm_create_hyp_vm(kvm); + mutex_unlock(&kvm->arch.config_lock); return ret; } @@ -200,15 +212,31 @@ int pkvm_create_hyp_vcpu(struct kvm_vcpu *vcpu) return ret; } -void pkvm_destroy_hyp_vm(struct kvm *host_kvm) +void pkvm_destroy_hyp_vm(struct kvm *kvm) { - mutex_lock(&host_kvm->arch.config_lock); - __pkvm_destroy_hyp_vm(host_kvm); - mutex_unlock(&host_kvm->arch.config_lock); + mutex_lock(&kvm->arch.config_lock); + __pkvm_destroy_hyp_vm(kvm); + mutex_unlock(&kvm->arch.config_lock); } -int pkvm_init_host_vm(struct kvm *host_kvm) +int pkvm_init_host_vm(struct kvm *kvm) { + int ret; + + if (pkvm_hyp_vm_is_created(kvm)) + return -EINVAL; + + /* VM is already reserved, no need to proceed. */ + if (kvm->arch.pkvm.handle) + return 0; + + /* Reserve the VM in hyp and obtain a hyp handle for the VM. */ + ret = kvm_call_hyp_nvhe(__pkvm_reserve_vm); + if (ret < 0) + return ret; + + kvm->arch.pkvm.handle = ret; + return 0; } diff --git a/arch/arm64/kvm/vgic/vgic-init.c b/arch/arm64/kvm/vgic/vgic-init.c index 1e680ad6e863..ccccb5c04ac1 100644 --- a/arch/arm64/kvm/vgic/vgic-init.c +++ b/arch/arm64/kvm/vgic/vgic-init.c @@ -554,7 +554,6 @@ int vgic_lazy_init(struct kvm *kvm) * Also map the virtual CPU interface into the VM. * v2 calls vgic_init() if not already done. * v3 and derivatives return an error if the VGIC is not initialized. - * vgic_ready() returns true if this function has succeeded. */ int kvm_vgic_map_resources(struct kvm *kvm) { @@ -563,12 +562,12 @@ int kvm_vgic_map_resources(struct kvm *kvm) gpa_t dist_base; int ret = 0; - if (likely(vgic_ready(kvm))) + if (likely(smp_load_acquire(&dist->ready))) return 0; mutex_lock(&kvm->slots_lock); mutex_lock(&kvm->arch.config_lock); - if (vgic_ready(kvm)) + if (dist->ready) goto out; if (!irqchip_in_kernel(kvm)) @@ -594,14 +593,7 @@ int kvm_vgic_map_resources(struct kvm *kvm) goto out_slots; } - /* - * kvm_io_bus_register_dev() guarantees all readers see the new MMIO - * registration before returning through synchronize_srcu(), which also - * implies a full memory barrier. As such, marking the distributor as - * 'ready' here is guaranteed to be ordered after all vCPUs having seen - * a completely configured distributor. - */ - dist->ready = true; + smp_store_release(&dist->ready, true); goto out_slots; out: mutex_unlock(&kvm->arch.config_lock); diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index aa157fe5b7b3..0bdf9405969a 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -5785,6 +5785,13 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu) if (kvm_test_request(KVM_REQ_EVENT, vcpu)) return 1; + /* + * Ensure that any updates to kvm->buses[] observed by the + * previous instruction (emulated or otherwise) are also + * visible to the instruction KVM is about to emulate. + */ + smp_rmb(); + if (!kvm_emulate_instruction(vcpu, 0)) return 0; diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h index 9a6340d9c91e..dd9af8a401d3 100644 --- a/include/kvm/arm_vgic.h +++ b/include/kvm/arm_vgic.h @@ -407,7 +407,6 @@ u64 vgic_v3_get_misr(struct kvm_vcpu *vcpu); #define irqchip_in_kernel(k) (!!((k)->arch.vgic.in_kernel)) #define vgic_initialized(k) ((k)->arch.vgic.initialized) -#define vgic_ready(k) ((k)->arch.vgic.ready) #define vgic_valid_spi(k, i) (((i) >= VGIC_NR_PRIVATE_IRQS) && \ ((i) < (k)->arch.vgic.nr_spis + VGIC_NR_PRIVATE_IRQS)) diff --git a/include/linux/arm_ffa.h b/include/linux/arm_ffa.h index e1634897e159..cd7ee4df9045 100644 --- a/include/linux/arm_ffa.h +++ b/include/linux/arm_ffa.h @@ -128,6 +128,7 @@ #define FFA_FEAT_RXTX_MIN_SZ_4K 0 #define FFA_FEAT_RXTX_MIN_SZ_64K 1 #define FFA_FEAT_RXTX_MIN_SZ_16K 2 +#define FFA_FEAT_RXTX_MIN_SZ_MASK GENMASK(1, 0) /* FFA Bus/Device/Driver related */ struct ffa_device { diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 15656b7fba6c..103be35caf0d 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -206,6 +206,7 @@ struct kvm_io_range { struct kvm_io_bus { int dev_count; int ioeventfd_count; + struct rcu_head rcu; struct kvm_io_range range[]; }; @@ -966,11 +967,15 @@ static inline bool kvm_dirty_log_manual_protect_and_init_set(struct kvm *kvm) return !!(kvm->manual_dirty_log_protect & KVM_DIRTY_LOG_INITIALLY_SET); } +/* + * Get a bus reference under the update-side lock. No long-term SRCU reader + * references are permitted, to avoid stale reads vs concurrent IO + * registrations. + */ static inline struct kvm_io_bus *kvm_get_bus(struct kvm *kvm, enum kvm_bus idx) { - return srcu_dereference_check(kvm->buses[idx], &kvm->srcu, - lockdep_is_held(&kvm->slots_lock) || - !refcount_read(&kvm->users_count)); + return rcu_dereference_protected(kvm->buses[idx], + lockdep_is_held(&kvm->slots_lock)); } static inline struct kvm_vcpu *kvm_get_vcpu(struct kvm *kvm, int i) diff --git a/tools/testing/selftests/kvm/arm64/external_aborts.c b/tools/testing/selftests/kvm/arm64/external_aborts.c index 062bf84cced1..592b26ded779 100644 --- a/tools/testing/selftests/kvm/arm64/external_aborts.c +++ b/tools/testing/selftests/kvm/arm64/external_aborts.c @@ -250,6 +250,47 @@ static void test_serror(void) kvm_vm_free(vm); } +static void expect_sea_s1ptw_handler(struct ex_regs *regs) +{ + u64 esr = read_sysreg(esr_el1); + + GUEST_ASSERT_EQ(regs->pc, expected_abort_pc); + GUEST_ASSERT_EQ(ESR_ELx_EC(esr), ESR_ELx_EC_DABT_CUR); + GUEST_ASSERT_EQ((esr & ESR_ELx_FSC), ESR_ELx_FSC_SEA_TTW(3)); + + GUEST_DONE(); +} + +static noinline void test_s1ptw_abort_guest(void) +{ + extern char test_s1ptw_abort_insn; + + WRITE_ONCE(expected_abort_pc, (u64)&test_s1ptw_abort_insn); + + asm volatile("test_s1ptw_abort_insn:\n\t" + "ldr x0, [%0]\n\t" + : : "r" (MMIO_ADDR) : "x0", "memory"); + + GUEST_FAIL("Load on S1PTW abort should not retire"); +} + +static void test_s1ptw_abort(void) +{ + struct kvm_vcpu *vcpu; + u64 *ptep, bad_pa; + struct kvm_vm *vm = vm_create_with_dabt_handler(&vcpu, test_s1ptw_abort_guest, + expect_sea_s1ptw_handler); + + ptep = virt_get_pte_hva_at_level(vm, MMIO_ADDR, 2); + bad_pa = BIT(vm->pa_bits) - vm->page_size; + + *ptep &= ~GENMASK(47, 12); + *ptep |= bad_pa; + + vcpu_run_expect_done(vcpu); + kvm_vm_free(vm); +} + static void test_serror_emulated_guest(void) { GUEST_ASSERT(!(read_sysreg(isr_el1) & ISR_EL1_A)); @@ -327,4 +368,5 @@ int main(void) test_serror_masked(); test_serror_emulated(); test_mmio_ease(); + test_s1ptw_abort(); } diff --git a/tools/testing/selftests/kvm/include/arm64/processor.h b/tools/testing/selftests/kvm/include/arm64/processor.h index 255fed769a8a..e3e916b1d9c4 100644 --- a/tools/testing/selftests/kvm/include/arm64/processor.h +++ b/tools/testing/selftests/kvm/include/arm64/processor.h @@ -175,6 +175,7 @@ void vm_install_exception_handler(struct kvm_vm *vm, void vm_install_sync_handler(struct kvm_vm *vm, int vector, int ec, handler_fn handler); +uint64_t *virt_get_pte_hva_at_level(struct kvm_vm *vm, vm_vaddr_t gva, int level); uint64_t *virt_get_pte_hva(struct kvm_vm *vm, vm_vaddr_t gva); static inline void cpu_relax(void) diff --git a/tools/testing/selftests/kvm/lib/arm64/processor.c b/tools/testing/selftests/kvm/lib/arm64/processor.c index eb115123d741..bd7480a93f96 100644 --- a/tools/testing/selftests/kvm/lib/arm64/processor.c +++ b/tools/testing/selftests/kvm/lib/arm64/processor.c @@ -185,7 +185,7 @@ void virt_arch_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr) _virt_pg_map(vm, vaddr, paddr, attr_idx); } -uint64_t *virt_get_pte_hva(struct kvm_vm *vm, vm_vaddr_t gva) +uint64_t *virt_get_pte_hva_at_level(struct kvm_vm *vm, vm_vaddr_t gva, int level) { uint64_t *ptep; @@ -195,17 +195,23 @@ uint64_t *virt_get_pte_hva(struct kvm_vm *vm, vm_vaddr_t gva) ptep = addr_gpa2hva(vm, vm->pgd) + pgd_index(vm, gva) * 8; if (!ptep) goto unmapped_gva; + if (level == 0) + return ptep; switch (vm->pgtable_levels) { case 4: ptep = addr_gpa2hva(vm, pte_addr(vm, *ptep)) + pud_index(vm, gva) * 8; if (!ptep) goto unmapped_gva; + if (level == 1) + break; /* fall through */ case 3: ptep = addr_gpa2hva(vm, pte_addr(vm, *ptep)) + pmd_index(vm, gva) * 8; if (!ptep) goto unmapped_gva; + if (level == 2) + break; /* fall through */ case 2: ptep = addr_gpa2hva(vm, pte_addr(vm, *ptep)) + pte_index(vm, gva) * 8; @@ -223,6 +229,11 @@ unmapped_gva: exit(EXIT_FAILURE); } +uint64_t *virt_get_pte_hva(struct kvm_vm *vm, vm_vaddr_t gva) +{ + return virt_get_pte_hva_at_level(vm, gva, 3); +} + vm_paddr_t addr_arch_gva2gpa(struct kvm_vm *vm, vm_vaddr_t gva) { uint64_t *ptep = virt_get_pte_hva(vm, gva); diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 6c07dd423458..bcef324ccbf2 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -1103,6 +1103,14 @@ void __weak kvm_arch_create_vm_debugfs(struct kvm *kvm) { } +/* Called only on cleanup and destruction paths when there are no users. */ +static inline struct kvm_io_bus *kvm_get_bus_for_destruction(struct kvm *kvm, + enum kvm_bus idx) +{ + return rcu_dereference_protected(kvm->buses[idx], + !refcount_read(&kvm->users_count)); +} + static struct kvm *kvm_create_vm(unsigned long type, const char *fdname) { struct kvm *kvm = kvm_arch_alloc_vm(); @@ -1228,7 +1236,7 @@ out_err_no_disable: out_err_no_arch_destroy_vm: WARN_ON_ONCE(!refcount_dec_and_test(&kvm->users_count)); for (i = 0; i < KVM_NR_BUSES; i++) - kfree(kvm_get_bus(kvm, i)); + kfree(kvm_get_bus_for_destruction(kvm, i)); kvm_free_irq_routing(kvm); out_err_no_irq_routing: cleanup_srcu_struct(&kvm->irq_srcu); @@ -1276,7 +1284,7 @@ static void kvm_destroy_vm(struct kvm *kvm) kvm_free_irq_routing(kvm); for (i = 0; i < KVM_NR_BUSES; i++) { - struct kvm_io_bus *bus = kvm_get_bus(kvm, i); + struct kvm_io_bus *bus = kvm_get_bus_for_destruction(kvm, i); if (bus) kvm_io_bus_destroy(bus); @@ -1312,6 +1320,7 @@ static void kvm_destroy_vm(struct kvm *kvm) kvm_free_memslots(kvm, &kvm->__memslots[i][1]); } cleanup_srcu_struct(&kvm->irq_srcu); + srcu_barrier(&kvm->srcu); cleanup_srcu_struct(&kvm->srcu); #ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES xa_destroy(&kvm->mem_attr_array); @@ -5843,6 +5852,18 @@ static int __kvm_io_bus_write(struct kvm_vcpu *vcpu, struct kvm_io_bus *bus, return -EOPNOTSUPP; } +static struct kvm_io_bus *kvm_get_bus_srcu(struct kvm *kvm, enum kvm_bus idx) +{ + /* + * Ensure that any updates to kvm_buses[] observed by the previous vCPU + * machine instruction are also visible to the vCPU machine instruction + * that triggered this call. + */ + smp_mb__after_srcu_read_lock(); + + return srcu_dereference(kvm->buses[idx], &kvm->srcu); +} + int kvm_io_bus_write(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr, int len, const void *val) { @@ -5855,7 +5876,7 @@ int kvm_io_bus_write(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr, .len = len, }; - bus = srcu_dereference(vcpu->kvm->buses[bus_idx], &vcpu->kvm->srcu); + bus = kvm_get_bus_srcu(vcpu->kvm, bus_idx); if (!bus) return -ENOMEM; r = __kvm_io_bus_write(vcpu, bus, &range, val); @@ -5874,7 +5895,7 @@ int kvm_io_bus_write_cookie(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, .len = len, }; - bus = srcu_dereference(vcpu->kvm->buses[bus_idx], &vcpu->kvm->srcu); + bus = kvm_get_bus_srcu(vcpu->kvm, bus_idx); if (!bus) return -ENOMEM; @@ -5924,7 +5945,7 @@ int kvm_io_bus_read(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr, .len = len, }; - bus = srcu_dereference(vcpu->kvm->buses[bus_idx], &vcpu->kvm->srcu); + bus = kvm_get_bus_srcu(vcpu->kvm, bus_idx); if (!bus) return -ENOMEM; r = __kvm_io_bus_read(vcpu, bus, &range, val); @@ -5932,6 +5953,13 @@ int kvm_io_bus_read(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr, } EXPORT_SYMBOL_GPL(kvm_io_bus_read); +static void __free_bus(struct rcu_head *rcu) +{ + struct kvm_io_bus *bus = container_of(rcu, struct kvm_io_bus, rcu); + + kfree(bus); +} + int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr, int len, struct kvm_io_device *dev) { @@ -5970,8 +5998,7 @@ int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr, memcpy(new_bus->range + i + 1, bus->range + i, (bus->dev_count - i) * sizeof(struct kvm_io_range)); rcu_assign_pointer(kvm->buses[bus_idx], new_bus); - synchronize_srcu_expedited(&kvm->srcu); - kfree(bus); + call_srcu(&kvm->srcu, &bus->rcu, __free_bus); return 0; } @@ -6033,7 +6060,7 @@ struct kvm_io_device *kvm_io_bus_get_dev(struct kvm *kvm, enum kvm_bus bus_idx, srcu_idx = srcu_read_lock(&kvm->srcu); - bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu); + bus = kvm_get_bus_srcu(kvm, bus_idx); if (!bus) goto out_unlock; |
