diff options
Diffstat (limited to 'arch/x86/kvm')
-rw-r--r-- | arch/x86/kvm/cpuid.c | 28 | ||||
-rw-r--r-- | arch/x86/kvm/hyperv.c | 20 | ||||
-rw-r--r-- | arch/x86/kvm/ioapic.c | 2 | ||||
-rw-r--r-- | arch/x86/kvm/ioapic.h | 4 | ||||
-rw-r--r-- | arch/x86/kvm/mmu/mmu.c | 30 | ||||
-rw-r--r-- | arch/x86/kvm/mmu/tdp_mmu.c | 35 | ||||
-rw-r--r-- | arch/x86/kvm/svm/avic.c | 2 | ||||
-rw-r--r-- | arch/x86/kvm/svm/nested.c | 10 | ||||
-rw-r--r-- | arch/x86/kvm/svm/sev.c | 45 | ||||
-rw-r--r-- | arch/x86/kvm/svm/svm.c | 26 | ||||
-rw-r--r-- | arch/x86/kvm/svm/svm.h | 6 | ||||
-rw-r--r-- | arch/x86/kvm/svm/svm_onhyperv.h | 2 | ||||
-rw-r--r-- | arch/x86/kvm/trace.h | 15 | ||||
-rw-r--r-- | arch/x86/kvm/vmx/nested.c | 56 | ||||
-rw-r--r-- | arch/x86/kvm/vmx/vmx.h | 2 | ||||
-rw-r--r-- | arch/x86/kvm/x86.c | 17 |
16 files changed, 194 insertions, 106 deletions
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 739be5da3bca..fe03bd978761 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -208,30 +208,6 @@ static void kvm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) kvm_mmu_after_set_cpuid(vcpu); } -static int is_efer_nx(void) -{ - return host_efer & EFER_NX; -} - -static void cpuid_fix_nx_cap(struct kvm_vcpu *vcpu) -{ - int i; - struct kvm_cpuid_entry2 *e, *entry; - - entry = NULL; - for (i = 0; i < vcpu->arch.cpuid_nent; ++i) { - e = &vcpu->arch.cpuid_entries[i]; - if (e->function == 0x80000001) { - entry = e; - break; - } - } - if (entry && cpuid_entry_has(entry, X86_FEATURE_NX) && !is_efer_nx()) { - cpuid_entry_clear(entry, X86_FEATURE_NX); - printk(KERN_INFO "kvm: guest NX capability removed\n"); - } -} - int cpuid_query_maxphyaddr(struct kvm_vcpu *vcpu) { struct kvm_cpuid_entry2 *best; @@ -302,7 +278,6 @@ int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu, vcpu->arch.cpuid_entries = e2; vcpu->arch.cpuid_nent = cpuid->nent; - cpuid_fix_nx_cap(vcpu); kvm_update_cpuid_runtime(vcpu); kvm_vcpu_after_set_cpuid(vcpu); @@ -401,7 +376,6 @@ static __always_inline void kvm_cpu_cap_mask(enum cpuid_leafs leaf, u32 mask) void kvm_set_cpu_caps(void) { - unsigned int f_nx = is_efer_nx() ? F(NX) : 0; #ifdef CONFIG_X86_64 unsigned int f_gbpages = F(GBPAGES); unsigned int f_lm = F(LM); @@ -515,7 +489,7 @@ void kvm_set_cpu_caps(void) F(CX8) | F(APIC) | 0 /* Reserved */ | F(SYSCALL) | F(MTRR) | F(PGE) | F(MCA) | F(CMOV) | F(PAT) | F(PSE36) | 0 /* Reserved */ | - f_nx | 0 /* Reserved */ | F(MMXEXT) | F(MMX) | + F(NX) | 0 /* Reserved */ | F(MMXEXT) | F(MMX) | F(FXSR) | F(FXSR_OPT) | f_gbpages | F(RDTSCP) | 0 /* Reserved */ | f_lm | F(3DNOWEXT) | F(3DNOW) ); diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index b07592ca92f0..41d2a53c5dea 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -1933,7 +1933,7 @@ ret_success: void kvm_hv_set_cpuid(struct kvm_vcpu *vcpu) { struct kvm_cpuid_entry2 *entry; - struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu); + struct kvm_vcpu_hv *hv_vcpu; entry = kvm_find_cpuid_entry(vcpu, HYPERV_CPUID_INTERFACE, 0); if (entry && entry->eax == HYPERV_CPUID_SIGNATURE_EAX) { @@ -2016,6 +2016,7 @@ static void kvm_hv_hypercall_set_result(struct kvm_vcpu *vcpu, u64 result) static int kvm_hv_hypercall_complete(struct kvm_vcpu *vcpu, u64 result) { + trace_kvm_hv_hypercall_done(result); kvm_hv_hypercall_set_result(vcpu, result); ++vcpu->stat.hypercalls; return kvm_skip_emulated_instruction(vcpu); @@ -2139,6 +2140,7 @@ static bool hv_check_hypercall_access(struct kvm_vcpu_hv *hv_vcpu, u16 code) int kvm_hv_hypercall(struct kvm_vcpu *vcpu) { + struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu); struct kvm_hv_hcall hc; u64 ret = HV_STATUS_SUCCESS; @@ -2173,17 +2175,25 @@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu) hc.rep_idx = (hc.param >> HV_HYPERCALL_REP_START_OFFSET) & 0xfff; hc.rep = !!(hc.rep_cnt || hc.rep_idx); - if (hc.fast && is_xmm_fast_hypercall(&hc)) - kvm_hv_hypercall_read_xmm(&hc); - trace_kvm_hv_hypercall(hc.code, hc.fast, hc.rep_cnt, hc.rep_idx, hc.ingpa, hc.outgpa); - if (unlikely(!hv_check_hypercall_access(to_hv_vcpu(vcpu), hc.code))) { + if (unlikely(!hv_check_hypercall_access(hv_vcpu, hc.code))) { ret = HV_STATUS_ACCESS_DENIED; goto hypercall_complete; } + if (hc.fast && is_xmm_fast_hypercall(&hc)) { + if (unlikely(hv_vcpu->enforce_cpuid && + !(hv_vcpu->cpuid_cache.features_edx & + HV_X64_HYPERCALL_XMM_INPUT_AVAILABLE))) { + kvm_queue_exception(vcpu, UD_VECTOR); + return 1; + } + + kvm_hv_hypercall_read_xmm(&hc); + } + switch (hc.code) { case HVCALL_NOTIFY_LONG_SPIN_WAIT: if (unlikely(hc.rep)) { diff --git a/arch/x86/kvm/ioapic.c b/arch/x86/kvm/ioapic.c index 698969e18fe3..ff005fe738a4 100644 --- a/arch/x86/kvm/ioapic.c +++ b/arch/x86/kvm/ioapic.c @@ -96,7 +96,7 @@ static unsigned long ioapic_read_indirect(struct kvm_ioapic *ioapic, static void rtc_irq_eoi_tracking_reset(struct kvm_ioapic *ioapic) { ioapic->rtc_status.pending_eoi = 0; - bitmap_zero(ioapic->rtc_status.dest_map.map, KVM_MAX_VCPU_ID); + bitmap_zero(ioapic->rtc_status.dest_map.map, KVM_MAX_VCPU_ID + 1); } static void kvm_rtc_eoi_tracking_restore_all(struct kvm_ioapic *ioapic); diff --git a/arch/x86/kvm/ioapic.h b/arch/x86/kvm/ioapic.h index 660401700075..11e4065e1617 100644 --- a/arch/x86/kvm/ioapic.h +++ b/arch/x86/kvm/ioapic.h @@ -43,13 +43,13 @@ struct kvm_vcpu; struct dest_map { /* vcpu bitmap where IRQ has been sent */ - DECLARE_BITMAP(map, KVM_MAX_VCPU_ID); + DECLARE_BITMAP(map, KVM_MAX_VCPU_ID + 1); /* * Vector sent to a given vcpu, only valid when * the vcpu's bit in map is set */ - u8 vectors[KVM_MAX_VCPU_ID]; + u8 vectors[KVM_MAX_VCPU_ID + 1]; }; diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 66f7f5bc3482..47b765270239 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -1644,7 +1644,7 @@ static int is_empty_shadow_page(u64 *spt) * aggregate version in order to make the slab shrinker * faster */ -static inline void kvm_mod_used_mmu_pages(struct kvm *kvm, unsigned long nr) +static inline void kvm_mod_used_mmu_pages(struct kvm *kvm, long nr) { kvm->arch.n_used_mmu_pages += nr; percpu_counter_add(&kvm_total_used_mmu_pages, nr); @@ -2535,6 +2535,7 @@ static void kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) int mmu_try_to_unsync_pages(struct kvm_vcpu *vcpu, gfn_t gfn, bool can_unsync) { struct kvm_mmu_page *sp; + bool locked = false; /* * Force write-protection if the page is being tracked. Note, the page @@ -2557,9 +2558,34 @@ int mmu_try_to_unsync_pages(struct kvm_vcpu *vcpu, gfn_t gfn, bool can_unsync) if (sp->unsync) continue; + /* + * TDP MMU page faults require an additional spinlock as they + * run with mmu_lock held for read, not write, and the unsync + * logic is not thread safe. Take the spinklock regardless of + * the MMU type to avoid extra conditionals/parameters, there's + * no meaningful penalty if mmu_lock is held for write. + */ + if (!locked) { + locked = true; + spin_lock(&vcpu->kvm->arch.mmu_unsync_pages_lock); + + /* + * Recheck after taking the spinlock, a different vCPU + * may have since marked the page unsync. A false + * positive on the unprotected check above is not + * possible as clearing sp->unsync _must_ hold mmu_lock + * for write, i.e. unsync cannot transition from 0->1 + * while this CPU holds mmu_lock for read (or write). + */ + if (READ_ONCE(sp->unsync)) + continue; + } + WARN_ON(sp->role.level != PG_LEVEL_4K); kvm_unsync_page(vcpu, sp); } + if (locked) + spin_unlock(&vcpu->kvm->arch.mmu_unsync_pages_lock); /* * We need to ensure that the marking of unsync pages is visible @@ -5537,6 +5563,8 @@ void kvm_mmu_init_vm(struct kvm *kvm) { struct kvm_page_track_notifier_node *node = &kvm->arch.mmu_sp_tracker; + spin_lock_init(&kvm->arch.mmu_unsync_pages_lock); + if (!kvm_mmu_init_tdp_mmu(kvm)) /* * No smp_load/store wrappers needed here as we are in diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index 0853370bd811..d80cb122b5f3 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -43,6 +43,7 @@ void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm) if (!kvm->arch.tdp_mmu_enabled) return; + WARN_ON(!list_empty(&kvm->arch.tdp_mmu_pages)); WARN_ON(!list_empty(&kvm->arch.tdp_mmu_roots)); /* @@ -81,8 +82,6 @@ static void tdp_mmu_free_sp_rcu_callback(struct rcu_head *head) void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root, bool shared) { - gfn_t max_gfn = 1ULL << (shadow_phys_bits - PAGE_SHIFT); - kvm_lockdep_assert_mmu_lock_held(kvm, shared); if (!refcount_dec_and_test(&root->tdp_mmu_root_count)) @@ -94,7 +93,7 @@ void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root, list_del_rcu(&root->link); spin_unlock(&kvm->arch.tdp_mmu_pages_lock); - zap_gfn_range(kvm, root, 0, max_gfn, false, false, shared); + zap_gfn_range(kvm, root, 0, -1ull, false, false, shared); call_rcu(&root->rcu_head, tdp_mmu_free_sp_rcu_callback); } @@ -724,13 +723,29 @@ static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, gfn_t start, gfn_t end, bool can_yield, bool flush, bool shared) { + gfn_t max_gfn_host = 1ULL << (shadow_phys_bits - PAGE_SHIFT); + bool zap_all = (start == 0 && end >= max_gfn_host); struct tdp_iter iter; + /* + * No need to try to step down in the iterator when zapping all SPTEs, + * zapping the top-level non-leaf SPTEs will recurse on their children. + */ + int min_level = zap_all ? root->role.level : PG_LEVEL_4K; + + /* + * Bound the walk at host.MAXPHYADDR, guest accesses beyond that will + * hit a #PF(RSVD) and never get to an EPT Violation/Misconfig / #NPF, + * and so KVM will never install a SPTE for such addresses. + */ + end = min(end, max_gfn_host); + kvm_lockdep_assert_mmu_lock_held(kvm, shared); rcu_read_lock(); - tdp_root_for_each_pte(iter, root, start, end) { + for_each_tdp_pte_min_level(iter, root->spt, root->role.level, + min_level, start, end) { retry: if (can_yield && tdp_mmu_iter_cond_resched(kvm, &iter, flush, shared)) { @@ -744,9 +759,10 @@ retry: /* * If this is a non-last-level SPTE that covers a larger range * than should be zapped, continue, and zap the mappings at a - * lower level. + * lower level, except when zapping all SPTEs. */ - if ((iter.gfn < start || + if (!zap_all && + (iter.gfn < start || iter.gfn + KVM_PAGES_PER_HPAGE(iter.level) > end) && !is_last_spte(iter.old_spte, iter.level)) continue; @@ -794,12 +810,11 @@ bool __kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, int as_id, gfn_t start, void kvm_tdp_mmu_zap_all(struct kvm *kvm) { - gfn_t max_gfn = 1ULL << (shadow_phys_bits - PAGE_SHIFT); bool flush = false; int i; for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) - flush = kvm_tdp_mmu_zap_gfn_range(kvm, i, 0, max_gfn, + flush = kvm_tdp_mmu_zap_gfn_range(kvm, i, 0, -1ull, flush, false); if (flush) @@ -838,7 +853,6 @@ static struct kvm_mmu_page *next_invalidated_root(struct kvm *kvm, */ void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm) { - gfn_t max_gfn = 1ULL << (shadow_phys_bits - PAGE_SHIFT); struct kvm_mmu_page *next_root; struct kvm_mmu_page *root; bool flush = false; @@ -854,8 +868,7 @@ void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm) rcu_read_unlock(); - flush = zap_gfn_range(kvm, root, 0, max_gfn, true, flush, - true); + flush = zap_gfn_range(kvm, root, 0, -1ull, true, flush, true); /* * Put the reference acquired in diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c index 1d01da64c333..a8ad78a2faa1 100644 --- a/arch/x86/kvm/svm/avic.c +++ b/arch/x86/kvm/svm/avic.c @@ -646,7 +646,7 @@ out: void svm_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); - struct vmcb *vmcb = svm->vmcb; + struct vmcb *vmcb = svm->vmcb01.ptr; bool activated = kvm_vcpu_apicv_active(vcpu); if (!enable_apicv) diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index 3bd09c50c98b..61738ff8ef33 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -515,7 +515,7 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm) * Also covers avic_vapic_bar, avic_backing_page, avic_logical_id, * avic_physical_id. */ - WARN_ON(svm->vmcb01.ptr->control.int_ctl & AVIC_ENABLE_MASK); + WARN_ON(kvm_apicv_activated(svm->vcpu.kvm)); /* Copied from vmcb01. msrpm_base can be overwritten later. */ svm->vmcb->control.nested_ctl = svm->vmcb01.ptr->control.nested_ctl; @@ -702,8 +702,8 @@ out: } /* Copy state save area fields which are handled by VMRUN */ -void svm_copy_vmrun_state(struct vmcb_save_area *from_save, - struct vmcb_save_area *to_save) +void svm_copy_vmrun_state(struct vmcb_save_area *to_save, + struct vmcb_save_area *from_save) { to_save->es = from_save->es; to_save->cs = from_save->cs; @@ -722,7 +722,7 @@ void svm_copy_vmrun_state(struct vmcb_save_area *from_save, to_save->cpl = 0; } -void nested_svm_vmloadsave(struct vmcb *from_vmcb, struct vmcb *to_vmcb) +void svm_copy_vmloadsave_state(struct vmcb *to_vmcb, struct vmcb *from_vmcb) { to_vmcb->save.fs = from_vmcb->save.fs; to_vmcb->save.gs = from_vmcb->save.gs; @@ -1385,7 +1385,7 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu, svm->nested.vmcb12_gpa = kvm_state->hdr.svm.vmcb_pa; - svm_copy_vmrun_state(save, &svm->vmcb01.ptr->save); + svm_copy_vmrun_state(&svm->vmcb01.ptr->save, save); nested_load_control_from_vmcb12(svm, ctl); svm_switch_vmcb(svm, &svm->nested.vmcb02); diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 6710d9ee2e4b..7fbce342eec4 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -64,6 +64,7 @@ static DEFINE_MUTEX(sev_bitmap_lock); unsigned int max_sev_asid; static unsigned int min_sev_asid; static unsigned long sev_me_mask; +static unsigned int nr_asids; static unsigned long *sev_asid_bitmap; static unsigned long *sev_reclaim_asid_bitmap; @@ -78,11 +79,11 @@ struct enc_region { /* Called with the sev_bitmap_lock held, or on shutdown */ static int sev_flush_asids(int min_asid, int max_asid) { - int ret, pos, error = 0; + int ret, asid, error = 0; /* Check if there are any ASIDs to reclaim before performing a flush */ - pos = find_next_bit(sev_reclaim_asid_bitmap, max_asid, min_asid); - if (pos >= max_asid) + asid = find_next_bit(sev_reclaim_asid_bitmap, nr_asids, min_asid); + if (asid > max_asid) return -EBUSY; /* @@ -115,15 +116,15 @@ static bool __sev_recycle_asids(int min_asid, int max_asid) /* The flush process will flush all reclaimable SEV and SEV-ES ASIDs */ bitmap_xor(sev_asid_bitmap, sev_asid_bitmap, sev_reclaim_asid_bitmap, - max_sev_asid); - bitmap_zero(sev_reclaim_asid_bitmap, max_sev_asid); + nr_asids); + bitmap_zero(sev_reclaim_asid_bitmap, nr_asids); return true; } static int sev_asid_new(struct kvm_sev_info *sev) { - int pos, min_asid, max_asid, ret; + int asid, min_asid, max_asid, ret; bool retry = true; enum misc_res_type type; @@ -143,11 +144,11 @@ static int sev_asid_new(struct kvm_sev_info *sev) * SEV-enabled guests must use asid from min_sev_asid to max_sev_asid. * SEV-ES-enabled guest can use from 1 to min_sev_asid - 1. */ - min_asid = sev->es_active ? 0 : min_sev_asid - 1; + min_asid = sev->es_active ? 1 : min_sev_asid; max_asid = sev->es_active ? min_sev_asid - 1 : max_sev_asid; again: - pos = find_next_zero_bit(sev_asid_bitmap, max_sev_asid, min_asid); - if (pos >= max_asid) { + asid = find_next_zero_bit(sev_asid_bitmap, max_asid + 1, min_asid); + if (asid > max_asid) { if (retry && __sev_recycle_asids(min_asid, max_asid)) { retry = false; goto again; @@ -157,11 +158,11 @@ again: goto e_uncharge; } - __set_bit(pos, sev_asid_bitmap); + __set_bit(asid, sev_asid_bitmap); mutex_unlock(&sev_bitmap_lock); - return pos + 1; + return asid; e_uncharge: misc_cg_uncharge(type, sev->misc_cg, 1); put_misc_cg(sev->misc_cg); @@ -179,17 +180,16 @@ static int sev_get_asid(struct kvm *kvm) static void sev_asid_free(struct kvm_sev_info *sev) { struct svm_cpu_data *sd; - int cpu, pos; + int cpu; enum misc_res_type type; mutex_lock(&sev_bitmap_lock); - pos = sev->asid - 1; - __set_bit(pos, sev_reclaim_asid_bitmap); + __set_bit(sev->asid, sev_reclaim_asid_bitmap); for_each_possible_cpu(cpu) { sd = per_cpu(svm_data, cpu); - sd->sev_vmcbs[pos] = NULL; + sd->sev_vmcbs[sev->asid] = NULL; } mutex_unlock(&sev_bitmap_lock); @@ -1857,12 +1857,17 @@ void __init sev_hardware_setup(void) min_sev_asid = edx; sev_me_mask = 1UL << (ebx & 0x3f); - /* Initialize SEV ASID bitmaps */ - sev_asid_bitmap = bitmap_zalloc(max_sev_asid, GFP_KERNEL); + /* + * Initialize SEV ASID bitmaps. Allocate space for ASID 0 in the bitmap, + * even though it's never used, so that the bitmap is indexed by the + * actual ASID. + */ + nr_asids = max_sev_asid + 1; + sev_asid_bitmap = bitmap_zalloc(nr_asids, GFP_KERNEL); if (!sev_asid_bitmap) goto out; - sev_reclaim_asid_bitmap = bitmap_zalloc(max_sev_asid, GFP_KERNEL); + sev_reclaim_asid_bitmap = bitmap_zalloc(nr_asids, GFP_KERNEL); if (!sev_reclaim_asid_bitmap) { bitmap_free(sev_asid_bitmap); sev_asid_bitmap = NULL; @@ -1907,7 +1912,7 @@ void sev_hardware_teardown(void) return; /* No need to take sev_bitmap_lock, all VMs have been destroyed. */ - sev_flush_asids(0, max_sev_asid); + sev_flush_asids(1, max_sev_asid); bitmap_free(sev_asid_bitmap); bitmap_free(sev_reclaim_asid_bitmap); @@ -1921,7 +1926,7 @@ int sev_cpu_init(struct svm_cpu_data *sd) if (!sev_enabled) return 0; - sd->sev_vmcbs = kcalloc(max_sev_asid + 1, sizeof(void *), GFP_KERNEL); + sd->sev_vmcbs = kcalloc(nr_asids, sizeof(void *), GFP_KERNEL); if (!sd->sev_vmcbs) return -ENOMEM; diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 664d20f0689c..e8ccab50ebf6 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -1406,8 +1406,6 @@ static int svm_create_vcpu(struct kvm_vcpu *vcpu) goto error_free_vmsa_page; } - svm_vcpu_init_msrpm(vcpu, svm->msrpm); - svm->vmcb01.ptr = page_address(vmcb01_page); svm->vmcb01.pa = __sme_set(page_to_pfn(vmcb01_page) << PAGE_SHIFT); @@ -1419,6 +1417,8 @@ static int svm_create_vcpu(struct kvm_vcpu *vcpu) svm_switch_vmcb(svm, &svm->vmcb01); init_vmcb(vcpu); + svm_vcpu_init_msrpm(vcpu, svm->msrpm); + svm_init_osvw(vcpu); vcpu->arch.microcode_version = 0x01000065; @@ -1568,8 +1568,11 @@ static void svm_set_vintr(struct vcpu_svm *svm) { struct vmcb_control_area *control; - /* The following fields are ignored when AVIC is enabled */ - WARN_ON(kvm_vcpu_apicv_active(&svm->vcpu)); + /* + * The following fields are ignored when AVIC is enabled + */ + WARN_ON(kvm_apicv_activated(svm->vcpu.kvm)); + svm_set_intercept(svm, INTERCEPT_VINTR); /* @@ -2147,11 +2150,12 @@ static int vmload_vmsave_interception(struct kvm_vcpu *vcpu, bool vmload) ret = kvm_skip_emulated_instruction(vcpu); if (vmload) { - nested_svm_vmloadsave(vmcb12, svm->vmcb); + svm_copy_vmloadsave_state(svm->vmcb, vmcb12); svm->sysenter_eip_hi = 0; svm->sysenter_esp_hi = 0; - } else - nested_svm_vmloadsave(svm->vmcb, vmcb12); + } else { + svm_copy_vmloadsave_state(vmcb12, svm->vmcb); + } kvm_vcpu_unmap(vcpu, &map, true); @@ -4344,8 +4348,8 @@ static int svm_enter_smm(struct kvm_vcpu *vcpu, char *smstate) BUILD_BUG_ON(offsetof(struct vmcb, save) != 0x400); - svm_copy_vmrun_state(&svm->vmcb01.ptr->save, - map_save.hva + 0x400); + svm_copy_vmrun_state(map_save.hva + 0x400, + &svm->vmcb01.ptr->save); kvm_vcpu_unmap(vcpu, &map_save, true); } @@ -4393,8 +4397,8 @@ static int svm_leave_smm(struct kvm_vcpu *vcpu, const char *smstate) &map_save) == -EINVAL) return 1; - svm_copy_vmrun_state(map_save.hva + 0x400, - &svm->vmcb01.ptr->save); + svm_copy_vmrun_state(&svm->vmcb01.ptr->save, + map_save.hva + 0x400); kvm_vcpu_unmap(vcpu, &map_save, true); } diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index 7e2090752d8f..bd0fe94c2920 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -464,9 +464,9 @@ void svm_leave_nested(struct vcpu_svm *svm); void svm_free_nested(struct vcpu_svm *svm); int svm_allocate_nested(struct vcpu_svm *svm); int nested_svm_vmrun(struct kvm_vcpu *vcpu); -void svm_copy_vmrun_state(struct vmcb_save_area *from_save, - struct vmcb_save_area *to_save); -void nested_svm_vmloadsave(struct vmcb *from_vmcb, struct vmcb *to_vmcb); +void svm_copy_vmrun_state(struct vmcb_save_area *to_save, + struct vmcb_save_area *from_save); +void svm_copy_vmloadsave_state(struct vmcb *to_vmcb, struct vmcb *from_vmcb); int nested_svm_vmexit(struct vcpu_svm *svm); static inline int nested_svm_simple_vmexit(struct vcpu_svm *svm, u32 exit_code) diff --git a/arch/x86/kvm/svm/svm_onhyperv.h b/arch/x86/kvm/svm/svm_onhyperv.h index 9b9a55abc29f..c53b8bf8d013 100644 --- a/arch/x86/kvm/svm/svm_onhyperv.h +++ b/arch/x86/kvm/svm/svm_onhyperv.h @@ -89,7 +89,7 @@ static inline void svm_hv_vmcb_dirty_nested_enlightenments( * as we mark it dirty unconditionally towards end of vcpu * init phase. */ - if (vmcb && vmcb_is_clean(vmcb, VMCB_HV_NESTED_ENLIGHTENMENTS) && + if (vmcb_is_clean(vmcb, VMCB_HV_NESTED_ENLIGHTENMENTS) && hve->hv_enlightenments_control.msr_bitmap) vmcb_mark_dirty(vmcb, VMCB_HV_NESTED_ENLIGHTENMENTS); } diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h index b484141ea15b..03ebe368333e 100644 --- a/arch/x86/kvm/trace.h +++ b/arch/x86/kvm/trace.h @@ -92,6 +92,21 @@ TRACE_EVENT(kvm_hv_hypercall, __entry->outgpa) ); +TRACE_EVENT(kvm_hv_hypercall_done, + TP_PROTO(u64 result), + TP_ARGS(result), + + TP_STRUCT__entry( + __field(__u64, result) + ), + + TP_fast_assign( + __entry->result = result; + ), + + TP_printk("result 0x%llx", __entry->result) +); + /* * Tracepoint for Xen hypercall. */ diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 1a52134b0c42..b3f77d18eb5a 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -330,6 +330,31 @@ void nested_vmx_free_vcpu(struct kvm_vcpu *vcpu) vcpu_put(vcpu); } +#define EPTP_PA_MASK GENMASK_ULL(51, 12) + +static bool nested_ept_root_matches(hpa_t root_hpa, u64 root_eptp, u64 eptp) +{ + return VALID_PAGE(root_hpa) && + ((root_eptp & EPTP_PA_MASK) == (eptp & EPTP_PA_MASK)); +} + +static void nested_ept_invalidate_addr(struct kvm_vcpu *vcpu, gpa_t eptp, + gpa_t addr) +{ + uint i; + struct kvm_mmu_root_info *cached_root; + + WARN_ON_ONCE(!mmu_is_nested(vcpu)); + + for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) { + cached_root = &vcpu->arch.mmu->prev_roots[i]; + + if (nested_ept_root_matches(cached_root->hpa, cached_root->pgd, + eptp)) + vcpu->arch.mmu->invlpg(vcpu, addr, cached_root->hpa); + } +} + static void nested_ept_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault) { @@ -342,10 +367,22 @@ static void nested_ept_inject_page_fault(struct kvm_vcpu *vcpu, vm_exit_reason = EXIT_REASON_PML_FULL; vmx->nested.pml_full = false; exit_qualification &= INTR_INFO_UNBLOCK_NMI; - } else if (fault->error_code & PFERR_RSVD_MASK) - vm_exit_reason = EXIT_REASON_EPT_MISCONFIG; - else - vm_exit_reason = EXIT_REASON_EPT_VIOLATION; + } else { + if (fault->error_code & PFERR_RSVD_MASK) + vm_exit_reason = EXIT_REASON_EPT_MISCONFIG; + else + vm_exit_reason = EXIT_REASON_EPT_VIOLATION; + + /* + * Although the caller (kvm_inject_emulated_page_fault) would + * have already synced the faulting address in the shadow EPT + * tables for the current EPTP12, we also need to sync it for + * any other cached EPTP02s based on the same EP4TA, since the + * TLB associates mappings to the EP4TA rather than the full EPTP. + */ + nested_ept_invalidate_addr(vcpu, vmcs12->ept_pointer, + fault->address); + } nested_vmx_vmexit(vcpu, vm_exit_reason, 0, exit_qualification); vmcs12->guest_physical_address = fault->address; @@ -5325,14 +5362,6 @@ static int handle_vmptrst(struct kvm_vcpu *vcpu) return nested_vmx_succeed(vcpu); } -#define EPTP_PA_MASK GENMASK_ULL(51, 12) - -static bool nested_ept_root_matches(hpa_t root_hpa, u64 root_eptp, u64 eptp) -{ - return VALID_PAGE(root_hpa) && - ((root_eptp & EPTP_PA_MASK) == (eptp & EPTP_PA_MASK)); -} - /* Emulate the INVEPT instruction */ static int handle_invept(struct kvm_vcpu *vcpu) { @@ -5826,7 +5855,8 @@ static bool nested_vmx_l0_wants_exit(struct kvm_vcpu *vcpu, if (is_nmi(intr_info)) return true; else if (is_page_fault(intr_info)) - return vcpu->arch.apf.host_apf_flags || !enable_ept; + return vcpu->arch.apf.host_apf_flags || + vmx_need_pf_intercept(vcpu); else if (is_debug(intr_info) && vcpu->guest_debug & (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index db88ed4f2121..17a1cb4b059d 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -522,7 +522,7 @@ static inline struct vmcs *alloc_vmcs(bool shadow) static inline bool vmx_has_waitpkg(struct vcpu_vmx *vmx) { - return vmx->secondary_exec_control & + return secondary_exec_controls_get(vmx) & SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE; } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index a4fd10604f72..e5d5c5ed7dd4 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3407,7 +3407,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) return 1; break; case MSR_KVM_ASYNC_PF_ACK: - if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF)) + if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF_INT)) return 1; if (data & 0x1) { vcpu->arch.apf.pageready_pending = false; @@ -3746,7 +3746,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) msr_info->data = vcpu->arch.apf.msr_int_val; break; case MSR_KVM_ASYNC_PF_ACK: - if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF)) + if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF_INT)) return 1; msr_info->data = 0; @@ -4358,8 +4358,17 @@ static int kvm_cpu_accept_dm_intr(struct kvm_vcpu *vcpu) static int kvm_vcpu_ready_for_interrupt_injection(struct kvm_vcpu *vcpu) { - return kvm_arch_interrupt_allowed(vcpu) && - kvm_cpu_accept_dm_intr(vcpu); + /* + * Do not cause an interrupt window exit if an exception + * is pending or an event needs reinjection; userspace + * might want to inject the interrupt manually using KVM_SET_REGS + * or KVM_SET_SREGS. For that to work, we must be at an + * instruction boundary and with no events half-injected. + */ + return (kvm_arch_interrupt_allowed(vcpu) && + kvm_cpu_accept_dm_intr(vcpu) && + !kvm_event_needs_reinjection(vcpu) && + !vcpu->arch.exception.pending); } static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, |