summaryrefslogtreecommitdiff
path: root/arch/x86/kvm/x86.c
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86/kvm/x86.c')
-rw-r--r--arch/x86/kvm/x86.c800
1 files changed, 503 insertions, 297 deletions
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 2686f2edb47c..c1c4e2b05a63 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -790,30 +790,6 @@ bool kvm_require_dr(struct kvm_vcpu *vcpu, int dr)
}
EXPORT_SYMBOL_GPL(kvm_require_dr);
-/*
- * This function will be used to read from the physical memory of the currently
- * running guest. The difference to kvm_vcpu_read_guest_page is that this function
- * can read from guest physical or from the guest's guest physical memory.
- */
-int kvm_read_guest_page_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
- gfn_t ngfn, void *data, int offset, int len,
- u32 access)
-{
- struct x86_exception exception;
- gfn_t real_gfn;
- gpa_t ngpa;
-
- ngpa = gfn_to_gpa(ngfn);
- real_gfn = mmu->translate_gpa(vcpu, ngpa, access, &exception);
- if (real_gfn == UNMAPPED_GVA)
- return -EFAULT;
-
- real_gfn = gpa_to_gfn(real_gfn);
-
- return kvm_vcpu_read_guest_page(vcpu, real_gfn, data, offset, len);
-}
-EXPORT_SYMBOL_GPL(kvm_read_guest_page_mmu);
-
static inline u64 pdptr_rsvd_bits(struct kvm_vcpu *vcpu)
{
return vcpu->arch.reserved_gpa_bits | rsvd_bits(5, 8) | rsvd_bits(1, 2);
@@ -825,34 +801,38 @@ static inline u64 pdptr_rsvd_bits(struct kvm_vcpu *vcpu)
int load_pdptrs(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, unsigned long cr3)
{
gfn_t pdpt_gfn = cr3 >> PAGE_SHIFT;
- unsigned offset = ((cr3 & (PAGE_SIZE-1)) >> 5) << 2;
+ gpa_t real_gpa;
int i;
int ret;
u64 pdpte[ARRAY_SIZE(mmu->pdptrs)];
- ret = kvm_read_guest_page_mmu(vcpu, mmu, pdpt_gfn, pdpte,
- offset * sizeof(u64), sizeof(pdpte),
- PFERR_USER_MASK|PFERR_WRITE_MASK);
- if (ret < 0) {
- ret = 0;
- goto out;
- }
+ /*
+ * If the MMU is nested, CR3 holds an L2 GPA and needs to be translated
+ * to an L1 GPA.
+ */
+ real_gpa = mmu->translate_gpa(vcpu, gfn_to_gpa(pdpt_gfn),
+ PFERR_USER_MASK | PFERR_WRITE_MASK, NULL);
+ if (real_gpa == UNMAPPED_GVA)
+ return 0;
+
+ /* Note the offset, PDPTRs are 32 byte aligned when using PAE paging. */
+ ret = kvm_vcpu_read_guest_page(vcpu, gpa_to_gfn(real_gpa), pdpte,
+ cr3 & GENMASK(11, 5), sizeof(pdpte));
+ if (ret < 0)
+ return 0;
+
for (i = 0; i < ARRAY_SIZE(pdpte); ++i) {
if ((pdpte[i] & PT_PRESENT_MASK) &&
(pdpte[i] & pdptr_rsvd_bits(vcpu))) {
- ret = 0;
- goto out;
+ return 0;
}
}
- ret = 1;
memcpy(mmu->pdptrs, pdpte, sizeof(mmu->pdptrs));
kvm_register_mark_dirty(vcpu, VCPU_EXREG_PDPTR);
vcpu->arch.pdptrs_from_userspace = false;
-out:
-
- return ret;
+ return 1;
}
EXPORT_SYMBOL_GPL(load_pdptrs);
@@ -993,7 +973,7 @@ static int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
/*
* Do not allow the guest to set bits that we do not support
* saving. However, xcr0 bit 0 is always set, even if the
- * emulated CPU does not support XSAVE (see fx_init).
+ * emulated CPU does not support XSAVE (see kvm_vcpu_reset()).
*/
valid_bits = vcpu->arch.guest_supported_xcr0 | XFEATURE_MASK_FP;
if (xcr0 & ~valid_bits)
@@ -1042,9 +1022,28 @@ EXPORT_SYMBOL_GPL(kvm_is_valid_cr4);
void kvm_post_set_cr4(struct kvm_vcpu *vcpu, unsigned long old_cr4, unsigned long cr4)
{
- if (((cr4 ^ old_cr4) & KVM_MMU_CR4_ROLE_BITS) ||
- (!(cr4 & X86_CR4_PCIDE) && (old_cr4 & X86_CR4_PCIDE)))
+ /*
+ * If any role bit is changed, the MMU needs to be reset.
+ *
+ * If CR4.PCIDE is changed 1 -> 0, the guest TLB must be flushed.
+ * If CR4.PCIDE is changed 0 -> 1, there is no need to flush the TLB
+ * according to the SDM; however, stale prev_roots could be reused
+ * incorrectly in the future after a MOV to CR3 with NOFLUSH=1, so we
+ * free them all. KVM_REQ_MMU_RELOAD is fit for the both cases; it
+ * is slow, but changing CR4.PCIDE is a rare case.
+ *
+ * If CR4.PGE is changed, the guest TLB must be flushed.
+ *
+ * Note: resetting MMU is a superset of KVM_REQ_MMU_RELOAD and
+ * KVM_REQ_MMU_RELOAD is a superset of KVM_REQ_TLB_FLUSH_GUEST, hence
+ * the usage of "else if".
+ */
+ if ((cr4 ^ old_cr4) & KVM_MMU_CR4_ROLE_BITS)
kvm_mmu_reset_context(vcpu);
+ else if ((cr4 ^ old_cr4) & X86_CR4_PCIDE)
+ kvm_make_request(KVM_REQ_MMU_RELOAD, vcpu);
+ else if ((cr4 ^ old_cr4) & X86_CR4_PGE)
+ kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu);
}
EXPORT_SYMBOL_GPL(kvm_post_set_cr4);
@@ -1092,6 +1091,18 @@ static void kvm_invalidate_pcid(struct kvm_vcpu *vcpu, unsigned long pcid)
int i;
/*
+ * MOV CR3 and INVPCID are usually not intercepted when using TDP, but
+ * this is reachable when running EPT=1 and unrestricted_guest=0, and
+ * also via the emulator. KVM's TDP page tables are not in the scope of
+ * the invalidation, but the guest's TLB entries need to be flushed as
+ * the CPU may have cached entries in its TLB for the target PCID.
+ */
+ if (unlikely(tdp_enabled)) {
+ kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu);
+ return;
+ }
+
+ /*
* If neither the current CR3 nor any of the prev_roots use the given
* PCID, then nothing needs to be done here because a resync will
* happen anyway before switching to any other CR3.
@@ -1101,6 +1112,14 @@ static void kvm_invalidate_pcid(struct kvm_vcpu *vcpu, unsigned long pcid)
kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
}
+ /*
+ * If PCID is disabled, there is no need to free prev_roots even if the
+ * PCIDs for them are also 0, because MOV to CR3 always flushes the TLB
+ * with PCIDE=0.
+ */
+ if (!kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE))
+ return;
+
for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
if (kvm_get_pcid(vcpu, mmu->prev_roots[i].pgd) == pcid)
roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i);
@@ -1381,6 +1400,7 @@ static const u32 emulated_msrs_all[] = {
MSR_PLATFORM_INFO,
MSR_MISC_FEATURES_ENABLES,
MSR_AMD64_VIRT_SPEC_CTRL,
+ MSR_AMD64_TSC_RATIO,
MSR_IA32_POWER_CTL,
MSR_IA32_UCODE_REV,
@@ -2454,13 +2474,64 @@ static inline bool kvm_check_tsc_unstable(void)
return check_tsc_unstable();
}
+/*
+ * Infers attempts to synchronize the guest's tsc from host writes. Sets the
+ * offset for the vcpu and tracks the TSC matching generation that the vcpu
+ * participates in.
+ */
+static void __kvm_synchronize_tsc(struct kvm_vcpu *vcpu, u64 offset, u64 tsc,
+ u64 ns, bool matched)
+{
+ struct kvm *kvm = vcpu->kvm;
+
+ lockdep_assert_held(&kvm->arch.tsc_write_lock);
+
+ /*
+ * We also track th most recent recorded KHZ, write and time to
+ * allow the matching interval to be extended at each write.
+ */
+ kvm->arch.last_tsc_nsec = ns;
+ kvm->arch.last_tsc_write = tsc;
+ kvm->arch.last_tsc_khz = vcpu->arch.virtual_tsc_khz;
+ kvm->arch.last_tsc_offset = offset;
+
+ vcpu->arch.last_guest_tsc = tsc;
+
+ kvm_vcpu_write_tsc_offset(vcpu, offset);
+
+ if (!matched) {
+ /*
+ * We split periods of matched TSC writes into generations.
+ * For each generation, we track the original measured
+ * nanosecond time, offset, and write, so if TSCs are in
+ * sync, we can match exact offset, and if not, we can match
+ * exact software computation in compute_guest_tsc()
+ *
+ * These values are tracked in kvm->arch.cur_xxx variables.
+ */
+ kvm->arch.cur_tsc_generation++;
+ kvm->arch.cur_tsc_nsec = ns;
+ kvm->arch.cur_tsc_write = tsc;
+ kvm->arch.cur_tsc_offset = offset;
+ kvm->arch.nr_vcpus_matched_tsc = 0;
+ } else if (vcpu->arch.this_tsc_generation != kvm->arch.cur_tsc_generation) {
+ kvm->arch.nr_vcpus_matched_tsc++;
+ }
+
+ /* Keep track of which generation this VCPU has synchronized to */
+ vcpu->arch.this_tsc_generation = kvm->arch.cur_tsc_generation;
+ vcpu->arch.this_tsc_nsec = kvm->arch.cur_tsc_nsec;
+ vcpu->arch.this_tsc_write = kvm->arch.cur_tsc_write;
+
+ kvm_track_tsc_matching(vcpu);
+}
+
static void kvm_synchronize_tsc(struct kvm_vcpu *vcpu, u64 data)
{
struct kvm *kvm = vcpu->kvm;
u64 offset, ns, elapsed;
unsigned long flags;
- bool matched;
- bool already_matched;
+ bool matched = false;
bool synchronizing = false;
raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags);
@@ -2506,51 +2577,10 @@ static void kvm_synchronize_tsc(struct kvm_vcpu *vcpu, u64 data)
offset = kvm_compute_l1_tsc_offset(vcpu, data);
}
matched = true;
- already_matched = (vcpu->arch.this_tsc_generation == kvm->arch.cur_tsc_generation);
- } else {
- /*
- * We split periods of matched TSC writes into generations.
- * For each generation, we track the original measured
- * nanosecond time, offset, and write, so if TSCs are in
- * sync, we can match exact offset, and if not, we can match
- * exact software computation in compute_guest_tsc()
- *
- * These values are tracked in kvm->arch.cur_xxx variables.
- */
- kvm->arch.cur_tsc_generation++;
- kvm->arch.cur_tsc_nsec = ns;
- kvm->arch.cur_tsc_write = data;
- kvm->arch.cur_tsc_offset = offset;
- matched = false;
}
- /*
- * We also track th most recent recorded KHZ, write and time to
- * allow the matching interval to be extended at each write.
- */
- kvm->arch.last_tsc_nsec = ns;
- kvm->arch.last_tsc_write = data;
- kvm->arch.last_tsc_khz = vcpu->arch.virtual_tsc_khz;
-
- vcpu->arch.last_guest_tsc = data;
-
- /* Keep track of which generation this VCPU has synchronized to */
- vcpu->arch.this_tsc_generation = kvm->arch.cur_tsc_generation;
- vcpu->arch.this_tsc_nsec = kvm->arch.cur_tsc_nsec;
- vcpu->arch.this_tsc_write = kvm->arch.cur_tsc_write;
-
- kvm_vcpu_write_tsc_offset(vcpu, offset);
+ __kvm_synchronize_tsc(vcpu, offset, data, ns, matched);
raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags);
-
- raw_spin_lock_irqsave(&kvm->arch.pvclock_gtod_sync_lock, flags);
- if (!matched) {
- kvm->arch.nr_vcpus_matched_tsc = 0;
- } else if (!already_matched) {
- kvm->arch.nr_vcpus_matched_tsc++;
- }
-
- kvm_track_tsc_matching(vcpu);
- raw_spin_unlock_irqrestore(&kvm->arch.pvclock_gtod_sync_lock, flags);
}
static inline void adjust_tsc_offset_guest(struct kvm_vcpu *vcpu,
@@ -2738,6 +2768,7 @@ static void pvclock_update_vm_gtod_copy(struct kvm *kvm)
int vclock_mode;
bool host_tsc_clocksource, vcpus_matched;
+ lockdep_assert_held(&kvm->arch.tsc_write_lock);
vcpus_matched = (ka->nr_vcpus_matched_tsc + 1 ==
atomic_read(&kvm->online_vcpus));
@@ -2762,68 +2793,101 @@ static void pvclock_update_vm_gtod_copy(struct kvm *kvm)
#endif
}
-void kvm_make_mclock_inprogress_request(struct kvm *kvm)
+static void kvm_make_mclock_inprogress_request(struct kvm *kvm)
{
kvm_make_all_cpus_request(kvm, KVM_REQ_MCLOCK_INPROGRESS);
}
-static void kvm_gen_update_masterclock(struct kvm *kvm)
+static void __kvm_start_pvclock_update(struct kvm *kvm)
{
-#ifdef CONFIG_X86_64
- int i;
- struct kvm_vcpu *vcpu;
- struct kvm_arch *ka = &kvm->arch;
- unsigned long flags;
-
- kvm_hv_invalidate_tsc_page(kvm);
+ raw_spin_lock_irq(&kvm->arch.tsc_write_lock);
+ write_seqcount_begin(&kvm->arch.pvclock_sc);
+}
+static void kvm_start_pvclock_update(struct kvm *kvm)
+{
kvm_make_mclock_inprogress_request(kvm);
/* no guest entries from this point */
- raw_spin_lock_irqsave(&ka->pvclock_gtod_sync_lock, flags);
- pvclock_update_vm_gtod_copy(kvm);
- raw_spin_unlock_irqrestore(&ka->pvclock_gtod_sync_lock, flags);
+ __kvm_start_pvclock_update(kvm);
+}
+
+static void kvm_end_pvclock_update(struct kvm *kvm)
+{
+ struct kvm_arch *ka = &kvm->arch;
+ struct kvm_vcpu *vcpu;
+ int i;
+ write_seqcount_end(&ka->pvclock_sc);
+ raw_spin_unlock_irq(&ka->tsc_write_lock);
kvm_for_each_vcpu(i, vcpu, kvm)
kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
/* guest entries allowed */
kvm_for_each_vcpu(i, vcpu, kvm)
kvm_clear_request(KVM_REQ_MCLOCK_INPROGRESS, vcpu);
-#endif
}
-u64 get_kvmclock_ns(struct kvm *kvm)
+static void kvm_update_masterclock(struct kvm *kvm)
+{
+ kvm_hv_invalidate_tsc_page(kvm);
+ kvm_start_pvclock_update(kvm);
+ pvclock_update_vm_gtod_copy(kvm);
+ kvm_end_pvclock_update(kvm);
+}
+
+/* Called within read_seqcount_begin/retry for kvm->pvclock_sc. */
+static void __get_kvmclock(struct kvm *kvm, struct kvm_clock_data *data)
{
struct kvm_arch *ka = &kvm->arch;
struct pvclock_vcpu_time_info hv_clock;
- unsigned long flags;
- u64 ret;
-
- raw_spin_lock_irqsave(&ka->pvclock_gtod_sync_lock, flags);
- if (!ka->use_master_clock) {
- raw_spin_unlock_irqrestore(&ka->pvclock_gtod_sync_lock, flags);
- return get_kvmclock_base_ns() + ka->kvmclock_offset;
- }
-
- hv_clock.tsc_timestamp = ka->master_cycle_now;
- hv_clock.system_time = ka->master_kernel_ns + ka->kvmclock_offset;
- raw_spin_unlock_irqrestore(&ka->pvclock_gtod_sync_lock, flags);
/* both __this_cpu_read() and rdtsc() should be on the same cpu */
get_cpu();
- if (__this_cpu_read(cpu_tsc_khz)) {
+ data->flags = 0;
+ if (ka->use_master_clock && __this_cpu_read(cpu_tsc_khz)) {
+#ifdef CONFIG_X86_64
+ struct timespec64 ts;
+
+ if (kvm_get_walltime_and_clockread(&ts, &data->host_tsc)) {
+ data->realtime = ts.tv_nsec + NSEC_PER_SEC * ts.tv_sec;
+ data->flags |= KVM_CLOCK_REALTIME | KVM_CLOCK_HOST_TSC;
+ } else
+#endif
+ data->host_tsc = rdtsc();
+
+ data->flags |= KVM_CLOCK_TSC_STABLE;
+ hv_clock.tsc_timestamp = ka->master_cycle_now;
+ hv_clock.system_time = ka->master_kernel_ns + ka->kvmclock_offset;
kvm_get_time_scale(NSEC_PER_SEC, __this_cpu_read(cpu_tsc_khz) * 1000LL,
&hv_clock.tsc_shift,
&hv_clock.tsc_to_system_mul);
- ret = __pvclock_read_cycles(&hv_clock, rdtsc());
- } else
- ret = get_kvmclock_base_ns() + ka->kvmclock_offset;
+ data->clock = __pvclock_read_cycles(&hv_clock, data->host_tsc);
+ } else {
+ data->clock = get_kvmclock_base_ns() + ka->kvmclock_offset;
+ }
put_cpu();
+}
- return ret;
+static void get_kvmclock(struct kvm *kvm, struct kvm_clock_data *data)
+{
+ struct kvm_arch *ka = &kvm->arch;
+ unsigned seq;
+
+ do {
+ seq = read_seqcount_begin(&ka->pvclock_sc);
+ __get_kvmclock(kvm, data);
+ } while (read_seqcount_retry(&ka->pvclock_sc, seq));
+}
+
+u64 get_kvmclock_ns(struct kvm *kvm)
+{
+ struct kvm_clock_data data;
+
+ get_kvmclock(kvm, &data);
+ return data.clock;
}
static void kvm_setup_pvclock_page(struct kvm_vcpu *v,
@@ -2888,6 +2952,7 @@ static void kvm_setup_pvclock_page(struct kvm_vcpu *v,
static int kvm_guest_time_update(struct kvm_vcpu *v)
{
unsigned long flags, tgt_tsc_khz;
+ unsigned seq;
struct kvm_vcpu_arch *vcpu = &v->arch;
struct kvm_arch *ka = &v->kvm->arch;
s64 kernel_ns;
@@ -2902,13 +2967,14 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
* If the host uses TSC clock, then passthrough TSC as stable
* to the guest.
*/
- raw_spin_lock_irqsave(&ka->pvclock_gtod_sync_lock, flags);
- use_master_clock = ka->use_master_clock;
- if (use_master_clock) {
- host_tsc = ka->master_cycle_now;
- kernel_ns = ka->master_kernel_ns;
- }
- raw_spin_unlock_irqrestore(&ka->pvclock_gtod_sync_lock, flags);
+ do {
+ seq = read_seqcount_begin(&ka->pvclock_sc);
+ use_master_clock = ka->use_master_clock;
+ if (use_master_clock) {
+ host_tsc = ka->master_cycle_now;
+ kernel_ns = ka->master_kernel_ns;
+ }
+ } while (read_seqcount_retry(&ka->pvclock_sc, seq));
/* Keep irq disabled to prevent changes to the clock */
local_irq_save(flags);
@@ -3179,15 +3245,14 @@ static void kvm_vcpu_flush_tlb_guest(struct kvm_vcpu *vcpu)
++vcpu->stat.tlb_flush;
if (!tdp_enabled) {
- /*
+ /*
* A TLB flush on behalf of the guest is equivalent to
* INVPCID(all), toggling CR4.PGE, etc., which requires
- * a forced sync of the shadow page tables. Unload the
- * entire MMU here and the subsequent load will sync the
- * shadow page tables, and also flush the TLB.
+ * a forced sync of the shadow page tables. Ensure all the
+ * roots are synced and the guest TLB in hardware is clean.
*/
- kvm_mmu_unload(vcpu);
- return;
+ kvm_mmu_sync_roots(vcpu);
+ kvm_mmu_sync_prev_roots(vcpu);
}
static_call(kvm_x86_tlb_flush_guest)(vcpu);
@@ -4028,6 +4093,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
case KVM_CAP_VM_COPY_ENC_CONTEXT_FROM:
case KVM_CAP_SREGS2:
case KVM_CAP_EXIT_ON_EMULATION_FAILURE:
+ case KVM_CAP_VCPU_ATTRIBUTES:
r = 1;
break;
case KVM_CAP_EXIT_HYPERCALL:
@@ -4048,7 +4114,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
r = KVM_SYNC_X86_VALID_FIELDS;
break;
case KVM_CAP_ADJUST_CLOCK:
- r = KVM_CLOCK_TSC_STABLE;
+ r = KVM_CLOCK_VALID_FLAGS;
break;
case KVM_CAP_X86_DISABLE_EXITS:
r |= KVM_X86_DISABLE_EXITS_HLT | KVM_X86_DISABLE_EXITS_PAUSE |
@@ -4077,7 +4143,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
r = KVM_MAX_VCPUS;
break;
case KVM_CAP_MAX_VCPU_ID:
- r = KVM_MAX_VCPU_ID;
+ r = KVM_MAX_VCPU_IDS;
break;
case KVM_CAP_PV_MMU: /* obsolete */
r = 0;
@@ -4775,6 +4841,115 @@ static int kvm_set_guest_paused(struct kvm_vcpu *vcpu)
return 0;
}
+static int kvm_arch_tsc_has_attr(struct kvm_vcpu *vcpu,
+ struct kvm_device_attr *attr)
+{
+ int r;
+
+ switch (attr->attr) {
+ case KVM_VCPU_TSC_OFFSET:
+ r = 0;
+ break;
+ default:
+ r = -ENXIO;
+ }
+
+ return r;
+}
+
+static int kvm_arch_tsc_get_attr(struct kvm_vcpu *vcpu,
+ struct kvm_device_attr *attr)
+{
+ u64 __user *uaddr = (u64 __user *)(unsigned long)attr->addr;
+ int r;
+
+ if ((u64)(unsigned long)uaddr != attr->addr)
+ return -EFAULT;
+
+ switch (attr->attr) {
+ case KVM_VCPU_TSC_OFFSET:
+ r = -EFAULT;
+ if (put_user(vcpu->arch.l1_tsc_offset, uaddr))
+ break;
+ r = 0;
+ break;
+ default:
+ r = -ENXIO;
+ }
+
+ return r;
+}
+
+static int kvm_arch_tsc_set_attr(struct kvm_vcpu *vcpu,
+ struct kvm_device_attr *attr)
+{
+ u64 __user *uaddr = (u64 __user *)(unsigned long)attr->addr;
+ struct kvm *kvm = vcpu->kvm;
+ int r;
+
+ if ((u64)(unsigned long)uaddr != attr->addr)
+ return -EFAULT;
+
+ switch (attr->attr) {
+ case KVM_VCPU_TSC_OFFSET: {
+ u64 offset, tsc, ns;
+ unsigned long flags;
+ bool matched;
+
+ r = -EFAULT;
+ if (get_user(offset, uaddr))
+ break;
+
+ raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags);
+
+ matched = (vcpu->arch.virtual_tsc_khz &&
+ kvm->arch.last_tsc_khz == vcpu->arch.virtual_tsc_khz &&
+ kvm->arch.last_tsc_offset == offset);
+
+ tsc = kvm_scale_tsc(vcpu, rdtsc(), vcpu->arch.l1_tsc_scaling_ratio) + offset;
+ ns = get_kvmclock_base_ns();
+
+ __kvm_synchronize_tsc(vcpu, offset, tsc, ns, matched);
+ raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags);
+
+ r = 0;
+ break;
+ }
+ default:
+ r = -ENXIO;
+ }
+
+ return r;
+}
+
+static int kvm_vcpu_ioctl_device_attr(struct kvm_vcpu *vcpu,
+ unsigned int ioctl,
+ void __user *argp)
+{
+ struct kvm_device_attr attr;
+ int r;
+
+ if (copy_from_user(&attr, argp, sizeof(attr)))
+ return -EFAULT;
+
+ if (attr.group != KVM_VCPU_TSC_CTRL)
+ return -ENXIO;
+
+ switch (ioctl) {
+ case KVM_HAS_DEVICE_ATTR:
+ r = kvm_arch_tsc_has_attr(vcpu, &attr);
+ break;
+ case KVM_GET_DEVICE_ATTR:
+ r = kvm_arch_tsc_get_attr(vcpu, &attr);
+ break;
+ case KVM_SET_DEVICE_ATTR:
+ r = kvm_arch_tsc_set_attr(vcpu, &attr);
+ break;
+ }
+
+ return r;
+}
+
static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu,
struct kvm_enable_cap *cap)
{
@@ -5229,6 +5404,11 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
r = __set_sregs2(vcpu, u.sregs2);
break;
}
+ case KVM_HAS_DEVICE_ATTR:
+ case KVM_GET_DEVICE_ATTR:
+ case KVM_SET_DEVICE_ATTR:
+ r = kvm_vcpu_ioctl_device_attr(vcpu, ioctl, argp);
+ break;
default:
r = -EINVAL;
}
@@ -5712,6 +5892,63 @@ int kvm_arch_pm_notifier(struct kvm *kvm, unsigned long state)
}
#endif /* CONFIG_HAVE_KVM_PM_NOTIFIER */
+static int kvm_vm_ioctl_get_clock(struct kvm *kvm, void __user *argp)
+{
+ struct kvm_clock_data data = { 0 };
+
+ get_kvmclock(kvm, &data);
+ if (copy_to_user(argp, &data, sizeof(data)))
+ return -EFAULT;
+
+ return 0;
+}
+
+static int kvm_vm_ioctl_set_clock(struct kvm *kvm, void __user *argp)
+{
+ struct kvm_arch *ka = &kvm->arch;
+ struct kvm_clock_data data;
+ u64 now_raw_ns;
+
+ if (copy_from_user(&data, argp, sizeof(data)))
+ return -EFAULT;
+
+ /*
+ * Only KVM_CLOCK_REALTIME is used, but allow passing the
+ * result of KVM_GET_CLOCK back to KVM_SET_CLOCK.
+ */
+ if (data.flags & ~KVM_CLOCK_VALID_FLAGS)
+ return -EINVAL;
+
+ kvm_hv_invalidate_tsc_page(kvm);
+ kvm_start_pvclock_update(kvm);
+ pvclock_update_vm_gtod_copy(kvm);
+
+ /*
+ * This pairs with kvm_guest_time_update(): when masterclock is
+ * in use, we use master_kernel_ns + kvmclock_offset to set
+ * unsigned 'system_time' so if we use get_kvmclock_ns() (which
+ * is slightly ahead) here we risk going negative on unsigned
+ * 'system_time' when 'data.clock' is very small.
+ */
+ if (data.flags & KVM_CLOCK_REALTIME) {
+ u64 now_real_ns = ktime_get_real_ns();
+
+ /*
+ * Avoid stepping the kvmclock backwards.
+ */
+ if (now_real_ns > data.realtime)
+ data.clock += now_real_ns - data.realtime;
+ }
+
+ if (ka->use_master_clock)
+ now_raw_ns = ka->master_kernel_ns;
+ else
+ now_raw_ns = get_kvmclock_base_ns();
+ ka->kvmclock_offset = data.clock - now_raw_ns;
+ kvm_end_pvclock_update(kvm);
+ return 0;
+}
+
long kvm_arch_vm_ioctl(struct file *filp,
unsigned int ioctl, unsigned long arg)
{
@@ -5955,60 +6192,12 @@ set_pit2_out:
break;
}
#endif
- case KVM_SET_CLOCK: {
- struct kvm_arch *ka = &kvm->arch;
- struct kvm_clock_data user_ns;
- u64 now_ns;
-
- r = -EFAULT;
- if (copy_from_user(&user_ns, argp, sizeof(user_ns)))
- goto out;
-
- r = -EINVAL;
- if (user_ns.flags)
- goto out;
-
- r = 0;
- /*
- * TODO: userspace has to take care of races with VCPU_RUN, so
- * kvm_gen_update_masterclock() can be cut down to locked
- * pvclock_update_vm_gtod_copy().
- */
- kvm_gen_update_masterclock(kvm);
-
- /*
- * This pairs with kvm_guest_time_update(): when masterclock is
- * in use, we use master_kernel_ns + kvmclock_offset to set
- * unsigned 'system_time' so if we use get_kvmclock_ns() (which
- * is slightly ahead) here we risk going negative on unsigned
- * 'system_time' when 'user_ns.clock' is very small.
- */
- raw_spin_lock_irq(&ka->pvclock_gtod_sync_lock);
- if (kvm->arch.use_master_clock)
- now_ns = ka->master_kernel_ns;
- else
- now_ns = get_kvmclock_base_ns();
- ka->kvmclock_offset = user_ns.clock - now_ns;
- raw_spin_unlock_irq(&ka->pvclock_gtod_sync_lock);
-
- kvm_make_all_cpus_request(kvm, KVM_REQ_CLOCK_UPDATE);
+ case KVM_SET_CLOCK:
+ r = kvm_vm_ioctl_set_clock(kvm, argp);
break;
- }
- case KVM_GET_CLOCK: {
- struct kvm_clock_data user_ns;
- u64 now_ns;
-
- now_ns = get_kvmclock_ns(kvm);
- user_ns.clock = now_ns;
- user_ns.flags = kvm->arch.use_master_clock ? KVM_CLOCK_TSC_STABLE : 0;
- memset(&user_ns.pad, 0, sizeof(user_ns.pad));
-
- r = -EFAULT;
- if (copy_to_user(argp, &user_ns, sizeof(user_ns)))
- goto out;
- r = 0;
+ case KVM_GET_CLOCK:
+ r = kvm_vm_ioctl_get_clock(kvm, argp);
break;
- }
case KVM_MEMORY_ENCRYPT_OP: {
r = -ENOTTY;
if (kvm_x86_ops.mem_enc_op)
@@ -7375,28 +7564,77 @@ void kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip)
}
EXPORT_SYMBOL_GPL(kvm_inject_realmode_interrupt);
-static void prepare_emulation_failure_exit(struct kvm_vcpu *vcpu)
+static void prepare_emulation_failure_exit(struct kvm_vcpu *vcpu, u64 *data,
+ u8 ndata, u8 *insn_bytes, u8 insn_size)
{
- struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
- u32 insn_size = ctxt->fetch.end - ctxt->fetch.data;
struct kvm_run *run = vcpu->run;
+ u64 info[5];
+ u8 info_start;
+
+ /*
+ * Zero the whole array used to retrieve the exit info, as casting to
+ * u32 for select entries will leave some chunks uninitialized.
+ */
+ memset(&info, 0, sizeof(info));
+
+ static_call(kvm_x86_get_exit_info)(vcpu, (u32 *)&info[0], &info[1],
+ &info[2], (u32 *)&info[3],
+ (u32 *)&info[4]);
run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
run->emulation_failure.suberror = KVM_INTERNAL_ERROR_EMULATION;
- run->emulation_failure.ndata = 0;
+
+ /*
+ * There's currently space for 13 entries, but 5 are used for the exit
+ * reason and info. Restrict to 4 to reduce the maintenance burden
+ * when expanding kvm_run.emulation_failure in the future.
+ */
+ if (WARN_ON_ONCE(ndata > 4))
+ ndata = 4;
+
+ /* Always include the flags as a 'data' entry. */
+ info_start = 1;
run->emulation_failure.flags = 0;
if (insn_size) {
- run->emulation_failure.ndata = 3;
+ BUILD_BUG_ON((sizeof(run->emulation_failure.insn_size) +
+ sizeof(run->emulation_failure.insn_bytes) != 16));
+ info_start += 2;
run->emulation_failure.flags |=
KVM_INTERNAL_ERROR_EMULATION_FLAG_INSTRUCTION_BYTES;
run->emulation_failure.insn_size = insn_size;
memset(run->emulation_failure.insn_bytes, 0x90,
sizeof(run->emulation_failure.insn_bytes));
- memcpy(run->emulation_failure.insn_bytes,
- ctxt->fetch.data, insn_size);
+ memcpy(run->emulation_failure.insn_bytes, insn_bytes, insn_size);
}
+
+ memcpy(&run->internal.data[info_start], info, sizeof(info));
+ memcpy(&run->internal.data[info_start + ARRAY_SIZE(info)], data,
+ ndata * sizeof(data[0]));
+
+ run->emulation_failure.ndata = info_start + ARRAY_SIZE(info) + ndata;
+}
+
+static void prepare_emulation_ctxt_failure_exit(struct kvm_vcpu *vcpu)
+{
+ struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
+
+ prepare_emulation_failure_exit(vcpu, NULL, 0, ctxt->fetch.data,
+ ctxt->fetch.end - ctxt->fetch.data);
+}
+
+void __kvm_prepare_emulation_failure_exit(struct kvm_vcpu *vcpu, u64 *data,
+ u8 ndata)
+{
+ prepare_emulation_failure_exit(vcpu, data, ndata, NULL, 0);
}
+EXPORT_SYMBOL_GPL(__kvm_prepare_emulation_failure_exit);
+
+void kvm_prepare_emulation_failure_exit(struct kvm_vcpu *vcpu)
+{
+ __kvm_prepare_emulation_failure_exit(vcpu, NULL, 0);
+}
+EXPORT_SYMBOL_GPL(kvm_prepare_emulation_failure_exit);
static int handle_emulation_failure(struct kvm_vcpu *vcpu, int emulation_type)
{
@@ -7412,16 +7650,14 @@ static int handle_emulation_failure(struct kvm_vcpu *vcpu, int emulation_type)
if (kvm->arch.exit_on_emulation_error ||
(emulation_type & EMULTYPE_SKIP)) {
- prepare_emulation_failure_exit(vcpu);
+ prepare_emulation_ctxt_failure_exit(vcpu);
return 0;
}
kvm_queue_exception(vcpu, UD_VECTOR);
if (!is_guest_mode(vcpu) && static_call(kvm_x86_get_cpl)(vcpu) == 0) {
- vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
- vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
- vcpu->run->internal.ndata = 0;
+ prepare_emulation_ctxt_failure_exit(vcpu);
return 0;
}
@@ -8021,14 +8257,13 @@ static void tsc_khz_changed(void *data)
static void kvm_hyperv_tsc_notifier(void)
{
struct kvm *kvm;
- struct kvm_vcpu *vcpu;
int cpu;
- unsigned long flags;
mutex_lock(&kvm_lock);
list_for_each_entry(kvm, &vm_list, vm_list)
kvm_make_mclock_inprogress_request(kvm);
+ /* no guest entries from this point */
hyperv_stop_tsc_emulation();
/* TSC frequency always matches when on Hyper-V */
@@ -8037,18 +8272,11 @@ static void kvm_hyperv_tsc_notifier(void)
kvm_max_guest_tsc_khz = tsc_khz;
list_for_each_entry(kvm, &vm_list, vm_list) {
- struct kvm_arch *ka = &kvm->arch;
-
- raw_spin_lock_irqsave(&ka->pvclock_gtod_sync_lock, flags);
+ __kvm_start_pvclock_update(kvm);
pvclock_update_vm_gtod_copy(kvm);
- raw_spin_unlock_irqrestore(&ka->pvclock_gtod_sync_lock, flags);
-
- kvm_for_each_vcpu(cpu, vcpu, kvm)
- kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
-
- kvm_for_each_vcpu(cpu, vcpu, kvm)
- kvm_clear_request(KVM_REQ_MCLOCK_INPROGRESS, vcpu);
+ kvm_end_pvclock_update(kvm);
}
+
mutex_unlock(&kvm_lock);
}
#endif
@@ -8289,18 +8517,20 @@ int kvm_arch_init(void *opaque)
int r;
if (kvm_x86_ops.hardware_enable) {
- printk(KERN_ERR "kvm: already loaded the other module\n");
+ pr_err("kvm: already loaded vendor module '%s'\n", kvm_x86_ops.name);
r = -EEXIST;
goto out;
}
if (!ops->cpu_has_kvm_support()) {
- pr_err_ratelimited("kvm: no hardware support\n");
+ pr_err_ratelimited("kvm: no hardware support for '%s'\n",
+ ops->runtime_ops->name);
r = -EOPNOTSUPP;
goto out;
}
if (ops->disabled_by_bios()) {
- pr_err_ratelimited("kvm: disabled by bios\n");
+ pr_err_ratelimited("kvm: support for '%s' disabled by bios\n",
+ ops->runtime_ops->name);
r = -EOPNOTSUPP;
goto out;
}
@@ -8485,7 +8715,7 @@ EXPORT_SYMBOL_GPL(kvm_apicv_activated);
static void kvm_apicv_init(struct kvm *kvm)
{
- mutex_init(&kvm->arch.apicv_update_lock);
+ init_rwsem(&kvm->arch.apicv_update_lock);
if (enable_apicv)
clear_bit(APICV_INHIBIT_REASON_DISABLE,
@@ -9140,14 +9370,7 @@ static void process_smi(struct kvm_vcpu *vcpu)
void kvm_make_scan_ioapic_request_mask(struct kvm *kvm,
unsigned long *vcpu_bitmap)
{
- cpumask_var_t cpus;
-
- zalloc_cpumask_var(&cpus, GFP_ATOMIC);
-
- kvm_make_vcpus_request_mask(kvm, KVM_REQ_SCAN_IOAPIC,
- NULL, vcpu_bitmap, cpus);
-
- free_cpumask_var(cpus);
+ kvm_make_vcpus_request_mask(kvm, KVM_REQ_SCAN_IOAPIC, vcpu_bitmap);
}
void kvm_make_scan_ioapic_request(struct kvm *kvm)
@@ -9162,7 +9385,7 @@ void kvm_vcpu_update_apicv(struct kvm_vcpu *vcpu)
if (!lapic_in_kernel(vcpu))
return;
- mutex_lock(&vcpu->kvm->arch.apicv_update_lock);
+ down_read(&vcpu->kvm->arch.apicv_update_lock);
activate = kvm_apicv_activated(vcpu->kvm);
if (vcpu->arch.apicv_active == activate)
@@ -9182,7 +9405,7 @@ void kvm_vcpu_update_apicv(struct kvm_vcpu *vcpu)
kvm_make_request(KVM_REQ_EVENT, vcpu);
out:
- mutex_unlock(&vcpu->kvm->arch.apicv_update_lock);
+ up_read(&vcpu->kvm->arch.apicv_update_lock);
}
EXPORT_SYMBOL_GPL(kvm_vcpu_update_apicv);
@@ -9190,6 +9413,8 @@ void __kvm_request_apicv_update(struct kvm *kvm, bool activate, ulong bit)
{
unsigned long old, new;
+ lockdep_assert_held_write(&kvm->arch.apicv_update_lock);
+
if (!kvm_x86_ops.check_apicv_inhibit_reasons ||
!static_call(kvm_x86_check_apicv_inhibit_reasons)(bit))
return;
@@ -9203,6 +9428,18 @@ void __kvm_request_apicv_update(struct kvm *kvm, bool activate, ulong bit)
if (!!old != !!new) {
trace_kvm_apicv_update_request(activate, bit);
+ /*
+ * Kick all vCPUs before setting apicv_inhibit_reasons to avoid
+ * false positives in the sanity check WARN in svm_vcpu_run().
+ * This task will wait for all vCPUs to ack the kick IRQ before
+ * updating apicv_inhibit_reasons, and all other vCPUs will
+ * block on acquiring apicv_update_lock so that vCPUs can't
+ * redo svm_vcpu_run() without seeing the new inhibit state.
+ *
+ * Note, holding apicv_update_lock and taking it in the read
+ * side (handling the request) also prevents other vCPUs from
+ * servicing the request with a stale apicv_inhibit_reasons.
+ */
kvm_make_all_cpus_request(kvm, KVM_REQ_APICV_UPDATE);
kvm->arch.apicv_inhibit_reasons = new;
if (new) {
@@ -9216,9 +9453,9 @@ EXPORT_SYMBOL_GPL(__kvm_request_apicv_update);
void kvm_request_apicv_update(struct kvm *kvm, bool activate, ulong bit)
{
- mutex_lock(&kvm->arch.apicv_update_lock);
+ down_write(&kvm->arch.apicv_update_lock);
__kvm_request_apicv_update(kvm, activate, bit);
- mutex_unlock(&kvm->arch.apicv_update_lock);
+ up_write(&kvm->arch.apicv_update_lock);
}
EXPORT_SYMBOL_GPL(kvm_request_apicv_update);
@@ -9330,7 +9567,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
if (kvm_check_request(KVM_REQ_MIGRATE_TIMER, vcpu))
__kvm_migrate_timers(vcpu);
if (kvm_check_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu))
- kvm_gen_update_masterclock(vcpu->kvm);
+ kvm_update_masterclock(vcpu->kvm);
if (kvm_check_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu))
kvm_gen_kvmclock_update(vcpu);
if (kvm_check_request(KVM_REQ_CLOCK_UPDATE, vcpu)) {
@@ -9537,6 +9774,14 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
}
for (;;) {
+ /*
+ * Assert that vCPU vs. VM APICv state is consistent. An APICv
+ * update must kick and wait for all vCPUs before toggling the
+ * per-VM state, and responsing vCPUs must wait for the update
+ * to complete before servicing KVM_REQ_APICV_UPDATE.
+ */
+ WARN_ON_ONCE(kvm_apicv_activated(vcpu->kvm) != kvm_vcpu_apicv_active(vcpu));
+
exit_fastpath = static_call(kvm_x86_run)(vcpu);
if (likely(exit_fastpath != EXIT_FASTPATH_REENTER_GUEST))
break;
@@ -10485,16 +10730,6 @@ static int sync_regs(struct kvm_vcpu *vcpu)
return 0;
}
-static void fx_init(struct kvm_vcpu *vcpu)
-{
- /*
- * Ensure guest xcr0 is valid for loading
- */
- vcpu->arch.xcr0 = XFEATURE_MASK_FP;
-
- vcpu->arch.cr0 |= X86_CR0_ET;
-}
-
int kvm_arch_vcpu_precreate(struct kvm *kvm, unsigned int id)
{
if (kvm_check_tsc_unstable() && atomic_read(&kvm->online_vcpus) != 0)
@@ -10556,8 +10791,6 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
goto free_emulate_ctxt;
}
- fx_init(vcpu);
-
vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu);
vcpu->arch.reserved_gpa_bits = kvm_vcpu_reserved_gpa_bits_raw(vcpu);
@@ -10654,9 +10887,19 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
{
+ struct kvm_cpuid_entry2 *cpuid_0x1;
unsigned long old_cr0 = kvm_read_cr0(vcpu);
unsigned long new_cr0;
- u32 eax, dummy;
+
+ /*
+ * Several of the "set" flows, e.g. ->set_cr0(), read other registers
+ * to handle side effects. RESET emulation hits those flows and relies
+ * on emulated/virtualized registers, including those that are loaded
+ * into hardware, to be zeroed at vCPU creation. Use CRs as a sentinel
+ * to detect improper or missing initialization.
+ */
+ WARN_ON_ONCE(!init_event &&
+ (old_cr0 || kvm_read_cr3(vcpu) || kvm_read_cr4(vcpu)));
kvm_lapic_reset(vcpu, init_event);
@@ -10715,21 +10958,19 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
vcpu->arch.xcr0 = XFEATURE_MASK_FP;
}
+ /* All GPRs except RDX (handled below) are zeroed on RESET/INIT. */
memset(vcpu->arch.regs, 0, sizeof(vcpu->arch.regs));
- vcpu->arch.regs_avail = ~0;
- vcpu->arch.regs_dirty = ~0;
+ kvm_register_mark_dirty(vcpu, VCPU_REGS_RSP);
/*
* Fall back to KVM's default Family/Model/Stepping of 0x600 (P6/Athlon)
* if no CPUID match is found. Note, it's impossible to get a match at
* RESET since KVM emulates RESET before exposing the vCPU to userspace,
- * i.e. it'simpossible for kvm_cpuid() to find a valid entry on RESET.
- * But, go through the motions in case that's ever remedied.
+ * i.e. it's impossible for kvm_find_cpuid_entry() to find a valid entry
+ * on RESET. But, go through the motions in case that's ever remedied.
*/
- eax = 1;
- if (!kvm_cpuid(vcpu, &eax, &dummy, &dummy, &dummy, true))
- eax = 0x600;
- kvm_rdx_write(vcpu, eax);
+ cpuid_0x1 = kvm_find_cpuid_entry(vcpu, 1, 0);
+ kvm_rdx_write(vcpu, cpuid_0x1 ? cpuid_0x1->eax : 0x600);
vcpu->arch.ia32_xss = 0;
@@ -10981,13 +11222,14 @@ void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu)
void kvm_arch_free_vm(struct kvm *kvm)
{
kfree(to_kvm_hv(kvm)->hv_pa_pg);
- vfree(kvm);
+ __kvm_arch_free_vm(kvm);
}
int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
{
int ret;
+ unsigned long flags;
if (type)
return -EINVAL;
@@ -11011,10 +11253,12 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
raw_spin_lock_init(&kvm->arch.tsc_write_lock);
mutex_init(&kvm->arch.apic_map_lock);
- raw_spin_lock_init(&kvm->arch.pvclock_gtod_sync_lock);
-
+ seqcount_raw_spinlock_init(&kvm->arch.pvclock_sc, &kvm->arch.tsc_write_lock);
kvm->arch.kvmclock_offset = -get_kvmclock_base_ns();
+
+ raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags);
pvclock_update_vm_gtod_copy(kvm);
+ raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags);
kvm->arch.guest_can_read_msr_platform_info = true;
@@ -11211,8 +11455,7 @@ void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot)
kvm_page_track_free_memslot(slot);
}
-static int memslot_rmap_alloc(struct kvm_memory_slot *slot,
- unsigned long npages)
+int memslot_rmap_alloc(struct kvm_memory_slot *slot, unsigned long npages)
{
const int sz = sizeof(*slot->arch.rmap[0]);
int i;
@@ -11234,50 +11477,6 @@ static int memslot_rmap_alloc(struct kvm_memory_slot *slot,
return 0;
}
-int alloc_all_memslots_rmaps(struct kvm *kvm)
-{
- struct kvm_memslots *slots;
- struct kvm_memory_slot *slot;
- int r, i;
-
- /*
- * Check if memslots alreday have rmaps early before acquiring
- * the slots_arch_lock below.
- */
- if (kvm_memslots_have_rmaps(kvm))
- return 0;
-
- mutex_lock(&kvm->slots_arch_lock);
-
- /*
- * Read memslots_have_rmaps again, under the slots arch lock,
- * before allocating the rmaps
- */
- if (kvm_memslots_have_rmaps(kvm)) {
- mutex_unlock(&kvm->slots_arch_lock);
- return 0;
- }
-
- for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
- slots = __kvm_memslots(kvm, i);
- kvm_for_each_memslot(slot, slots) {
- r = memslot_rmap_alloc(slot, slot->npages);
- if (r) {
- mutex_unlock(&kvm->slots_arch_lock);
- return r;
- }
- }
- }
-
- /*
- * Ensure that memslots_have_rmaps becomes true strictly after
- * all the rmap pointers are set.
- */
- smp_store_release(&kvm->arch.memslots_have_rmaps, true);
- mutex_unlock(&kvm->slots_arch_lock);
- return 0;
-}
-
static int kvm_alloc_memslot_metadata(struct kvm *kvm,
struct kvm_memory_slot *slot,
unsigned long npages)
@@ -11328,7 +11527,7 @@ static int kvm_alloc_memslot_metadata(struct kvm *kvm,
}
}
- if (kvm_page_track_create_memslot(slot, npages))
+ if (kvm_page_track_create_memslot(kvm, slot, npages))
goto out_free;
return 0;
@@ -11926,6 +12125,15 @@ int kvm_arch_update_irqfd_routing(struct kvm *kvm, unsigned int host_irq,
return static_call(kvm_x86_update_pi_irte)(kvm, host_irq, guest_irq, set);
}
+bool kvm_arch_irqfd_route_changed(struct kvm_kernel_irq_routing_entry *old,
+ struct kvm_kernel_irq_routing_entry *new)
+{
+ if (new->type != KVM_IRQ_ROUTING_MSI)
+ return true;
+
+ return !!memcmp(&old->msi, &new->msi, sizeof(new->msi));
+}
+
bool kvm_vector_hashing_enabled(void)
{
return vector_hashing;
@@ -12007,9 +12215,7 @@ int kvm_handle_memory_failure(struct kvm_vcpu *vcpu, int r,
* doesn't seem to be a real use-case behind such requests, just return
* KVM_EXIT_INTERNAL_ERROR for now.
*/
- vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
- vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
- vcpu->run->internal.ndata = 0;
+ kvm_prepare_emulation_failure_exit(vcpu);
return 0;
}