From 35e6eaa3df55822d0cb1df3bf08e6cb816737131 Mon Sep 17 00:00:00 2001 From: Radim Krčmář Date: Fri, 16 Dec 2016 16:10:01 +0100 Subject: KVM: x86: don't allow kernel irqchip with split irqchip MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Split irqchip cannot be created after creating the kernel irqchip, but we forgot to restrict the other way. This is an API change. Reviewed-by: Paolo Bonzini Reviewed-by: David Hildenbrand Signed-off-by: Radim Krčmář Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 2f22810a7e0c..c72a8d00a1c0 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3961,7 +3961,7 @@ long kvm_arch_vm_ioctl(struct file *filp, mutex_lock(&kvm->lock); r = -EEXIST; - if (kvm->arch.vpic) + if (irqchip_in_kernel(kvm)) goto create_irqchip_unlock; r = -EINVAL; if (kvm->created_vcpus) -- cgit v1.2.3 From 49776faf93f8074bb4990beac04781a9507d3650 Mon Sep 17 00:00:00 2001 From: Radim Krčmář Date: Fri, 16 Dec 2016 16:10:02 +0100 Subject: KVM: x86: decouple irqchip_in_kernel() and pic_irqchip() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit irqchip_in_kernel() tried to save a bit by reusing pic_irqchip(), but it just complicated the code. Add a separate state for the irqchip mode. Reviewed-by: David Hildenbrand [Used Paolo's version of condition in irqchip_in_kernel().] Signed-off-by: Radim Krčmář --- arch/x86/include/asm/kvm_host.h | 8 +++++++- arch/x86/kvm/irq.h | 15 ++++++++------- arch/x86/kvm/x86.c | 5 +++-- 3 files changed, 18 insertions(+), 10 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index a7066dc1a7e9..fc03ab1f6110 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -716,6 +716,12 @@ struct kvm_hv { HV_REFERENCE_TSC_PAGE tsc_ref; }; +enum kvm_irqchip_mode { + KVM_IRQCHIP_NONE, + KVM_IRQCHIP_KERNEL, /* created with KVM_CREATE_IRQCHIP */ + KVM_IRQCHIP_SPLIT, /* created with KVM_CAP_SPLIT_IRQCHIP */ +}; + struct kvm_arch { unsigned int n_used_mmu_pages; unsigned int n_requested_mmu_pages; @@ -788,7 +794,7 @@ struct kvm_arch { u64 disabled_quirks; - bool irqchip_split; + enum kvm_irqchip_mode irqchip_mode; u8 nr_reserved_ioapic_pins; bool disabled_lapic_found; diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h index 035731eb3897..f4965bc2613c 100644 --- a/arch/x86/kvm/irq.h +++ b/arch/x86/kvm/irq.h @@ -93,18 +93,19 @@ static inline int pic_in_kernel(struct kvm *kvm) static inline int irqchip_split(struct kvm *kvm) { - return kvm->arch.irqchip_split; + return kvm->arch.irqchip_mode == KVM_IRQCHIP_SPLIT; } -static inline int irqchip_in_kernel(struct kvm *kvm) +static inline int irqchip_kernel(struct kvm *kvm) { - struct kvm_pic *vpic = pic_irqchip(kvm); - bool ret; + return kvm->arch.irqchip_mode == KVM_IRQCHIP_KERNEL; +} - ret = (vpic != NULL); - ret |= irqchip_split(kvm); +static inline int irqchip_in_kernel(struct kvm *kvm) +{ + bool ret = kvm->arch.irqchip_mode != KVM_IRQCHIP_NONE; - /* Read vpic before kvm->irq_routing. */ + /* Matches with wmb after initializing kvm->irq_routing. */ smp_rmb(); return ret; } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index c72a8d00a1c0..0630ab438bd5 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3894,7 +3894,7 @@ static int kvm_vm_ioctl_enable_cap(struct kvm *kvm, goto split_irqchip_unlock; /* Pairs with irqchip_in_kernel. 
*/ smp_wmb(); - kvm->arch.irqchip_split = true; + kvm->arch.irqchip_mode = KVM_IRQCHIP_SPLIT; kvm->arch.nr_reserved_ioapic_pins = cap->args[0]; r = 0; split_irqchip_unlock: @@ -3988,8 +3988,9 @@ long kvm_arch_vm_ioctl(struct file *filp, mutex_unlock(&kvm->slots_lock); goto create_irqchip_unlock; } - /* Write kvm->irq_routing before kvm->arch.vpic. */ + /* Write kvm->irq_routing before enabling irqchip_in_kernel. */ smp_wmb(); + kvm->arch.irqchip_mode = KVM_IRQCHIP_KERNEL; kvm->arch.vpic = vpic; create_irqchip_unlock: mutex_unlock(&kvm->lock); -- cgit v1.2.3 From 099413664c71fcf9d0099eba4f8a4dd59653d5a3 Mon Sep 17 00:00:00 2001 From: Radim Krčmář Date: Fri, 16 Dec 2016 16:10:03 +0100 Subject: KVM: x86: make pic setup code look like ioapic setup MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We don't treat kvm->arch.vpic specially anymore, so the setup can look like ioapic. This gets a bit more information out of return values. Reviewed-by: Paolo Bonzini Reviewed-by: David Hildenbrand Signed-off-by: Radim Krčmář Signed-off-by: Paolo Bonzini --- arch/x86/kvm/i8259.c | 16 +++++++++++----- arch/x86/kvm/irq.h | 4 ++-- arch/x86/kvm/x86.c | 30 +++++++++++++++--------------- 3 files changed, 28 insertions(+), 22 deletions(-) diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c index 7cc2360f1848..73ea24d4f119 100644 --- a/arch/x86/kvm/i8259.c +++ b/arch/x86/kvm/i8259.c @@ -598,14 +598,14 @@ static const struct kvm_io_device_ops picdev_eclr_ops = { .write = picdev_eclr_write, }; -struct kvm_pic *kvm_create_pic(struct kvm *kvm) +int kvm_pic_init(struct kvm *kvm) { struct kvm_pic *s; int ret; s = kzalloc(sizeof(struct kvm_pic), GFP_KERNEL); if (!s) - return NULL; + return -ENOMEM; spin_lock_init(&s->lock); s->kvm = kvm; s->pics[0].elcr_mask = 0xf8; @@ -635,7 +635,9 @@ struct kvm_pic *kvm_create_pic(struct kvm *kvm) mutex_unlock(&kvm->slots_lock); - return s; + kvm->arch.vpic = s; + + return 0; fail_unreg_1: kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &s->dev_slave); @@ -648,13 +650,17 @@ fail_unlock: kfree(s); - return NULL; + return ret; } -void kvm_destroy_pic(struct kvm_pic *vpic) +void kvm_pic_destroy(struct kvm *kvm) { + struct kvm_pic *vpic = kvm->arch.vpic; + kvm_io_bus_unregister_dev(vpic->kvm, KVM_PIO_BUS, &vpic->dev_master); kvm_io_bus_unregister_dev(vpic->kvm, KVM_PIO_BUS, &vpic->dev_slave); kvm_io_bus_unregister_dev(vpic->kvm, KVM_PIO_BUS, &vpic->dev_eclr); + + kvm->arch.vpic = NULL; kfree(vpic); } diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h index f4965bc2613c..40d5b2cf6061 100644 --- a/arch/x86/kvm/irq.h +++ b/arch/x86/kvm/irq.h @@ -73,8 +73,8 @@ struct kvm_pic { unsigned long irq_states[PIC_NUM_PINS]; }; -struct kvm_pic *kvm_create_pic(struct kvm *kvm); -void kvm_destroy_pic(struct kvm_pic *vpic); +int kvm_pic_init(struct kvm *kvm); +void kvm_pic_destroy(struct kvm *kvm); int kvm_pic_read_irq(struct kvm *kvm); void kvm_pic_update_irq(struct kvm_pic *s); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 0630ab438bd5..05ac71a01f99 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3957,33 +3957,34 @@ long kvm_arch_vm_ioctl(struct file *filp, r = kvm_vm_ioctl_get_nr_mmu_pages(kvm); break; case KVM_CREATE_IRQCHIP: { - struct kvm_pic *vpic; - mutex_lock(&kvm->lock); + r = -EEXIST; if (irqchip_in_kernel(kvm)) goto create_irqchip_unlock; + r = -EINVAL; if (kvm->created_vcpus) goto create_irqchip_unlock; - r = -ENOMEM; - vpic = kvm_create_pic(kvm); - if (vpic) { - r = kvm_ioapic_init(kvm); - if (r) { - 
mutex_lock(&kvm->slots_lock); - kvm_destroy_pic(vpic); - mutex_unlock(&kvm->slots_lock); - goto create_irqchip_unlock; - } - } else + + r = kvm_pic_init(kvm); + if (r) goto create_irqchip_unlock; + + r = kvm_ioapic_init(kvm); + if (r) { + mutex_lock(&kvm->slots_lock); + kvm_pic_destroy(kvm); + mutex_unlock(&kvm->slots_lock); + goto create_irqchip_unlock; + } + r = kvm_setup_default_irq_routing(kvm); if (r) { mutex_lock(&kvm->slots_lock); mutex_lock(&kvm->irq_lock); kvm_ioapic_destroy(kvm); - kvm_destroy_pic(vpic); + kvm_pic_destroy(kvm); mutex_unlock(&kvm->irq_lock); mutex_unlock(&kvm->slots_lock); goto create_irqchip_unlock; @@ -3991,7 +3992,6 @@ long kvm_arch_vm_ioctl(struct file *filp, /* Write kvm->irq_routing before enabling irqchip_in_kernel. */ smp_wmb(); kvm->arch.irqchip_mode = KVM_IRQCHIP_KERNEL; - kvm->arch.vpic = vpic; create_irqchip_unlock: mutex_unlock(&kvm->lock); break; -- cgit v1.2.3 From e5dc48777dcc898210e2f16d80d44718db38cdc3 Mon Sep 17 00:00:00 2001 From: Radim Krčmář Date: Fri, 16 Dec 2016 16:10:04 +0100 Subject: KVM: x86: refactor pic setup in kvm_set_routing_entry MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reviewed-by: Paolo Bonzini Reviewed-by: David Hildenbrand Signed-off-by: Radim Krčmář Signed-off-by: Paolo Bonzini --- arch/x86/kvm/irq_comm.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/arch/x86/kvm/irq_comm.c b/arch/x86/kvm/irq_comm.c index 6c0191615f23..1dfeb185a1e3 100644 --- a/arch/x86/kvm/irq_comm.c +++ b/arch/x86/kvm/irq_comm.c @@ -297,14 +297,12 @@ int kvm_set_routing_entry(struct kvm *kvm, case KVM_IRQ_ROUTING_IRQCHIP: delta = 0; switch (ue->u.irqchip.irqchip) { - case KVM_IRQCHIP_PIC_MASTER: - e->set = kvm_set_pic_irq; - max_pin = PIC_NUM_PINS; - break; case KVM_IRQCHIP_PIC_SLAVE: + delta = 8; + /* fall through */ + case KVM_IRQCHIP_PIC_MASTER: e->set = kvm_set_pic_irq; max_pin = PIC_NUM_PINS; - delta = 8; break; case KVM_IRQCHIP_IOAPIC: max_pin = KVM_IOAPIC_NUM_PINS; -- cgit v1.2.3 From 8231f50d9853274ed104aac86b6b6263ca666c4d Mon Sep 17 00:00:00 2001 From: Radim Krčmář Date: Fri, 16 Dec 2016 16:10:05 +0100 Subject: KVM: x86: prevent setup of invalid routes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The check in kvm_set_pic_irq() and kvm_set_ioapic_irq() was just a temporary measure until the code improved enough for us to do this. This changes APIC in a case when KVM_SET_GSI_ROUTING is called to set up pic and ioapic routes before KVM_CREATE_IRQCHIP. Those rules would get overwritten by KVM_CREATE_IRQCHIP at best, so it is pointless to allow it. Userspaces hopefully noticed that things don't work if they do that and don't do that. Reviewed-by: Paolo Bonzini Reviewed-by: David Hildenbrand Signed-off-by: Radim Krčmář Signed-off-by: Paolo Bonzini --- arch/x86/kvm/irq_comm.c | 19 ++++++------------- 1 file changed, 6 insertions(+), 13 deletions(-) diff --git a/arch/x86/kvm/irq_comm.c b/arch/x86/kvm/irq_comm.c index 1dfeb185a1e3..2639b8d3dce2 100644 --- a/arch/x86/kvm/irq_comm.c +++ b/arch/x86/kvm/irq_comm.c @@ -41,15 +41,6 @@ static int kvm_set_pic_irq(struct kvm_kernel_irq_routing_entry *e, bool line_status) { struct kvm_pic *pic = pic_irqchip(kvm); - - /* - * XXX: rejecting pic routes when pic isn't in use would be better, - * but the default routing table is installed while kvm->arch.vpic is - * NULL and KVM_CREATE_IRQCHIP can race with KVM_IRQ_LINE. 
- */ - if (!pic) - return -1; - return kvm_pic_set_irq(pic, e->irqchip.pin, irq_source_id, level); } @@ -58,10 +49,6 @@ static int kvm_set_ioapic_irq(struct kvm_kernel_irq_routing_entry *e, bool line_status) { struct kvm_ioapic *ioapic = kvm->arch.vioapic; - - if (!ioapic) - return -1; - return kvm_ioapic_set_irq(ioapic, e->irqchip.pin, irq_source_id, level, line_status); } @@ -301,10 +288,16 @@ int kvm_set_routing_entry(struct kvm *kvm, delta = 8; /* fall through */ case KVM_IRQCHIP_PIC_MASTER: + if (!pic_in_kernel(kvm)) + goto out; + e->set = kvm_set_pic_irq; max_pin = PIC_NUM_PINS; break; case KVM_IRQCHIP_IOAPIC: + if (!ioapic_in_kernel(kvm)) + goto out; + max_pin = KVM_IOAPIC_NUM_PINS; e->set = kvm_set_ioapic_irq; break; -- cgit v1.2.3 From 826da32140dada1467f4216410525511393317e8 Mon Sep 17 00:00:00 2001 From: Radim Krčmář Date: Fri, 16 Dec 2016 16:10:06 +0100 Subject: KVM: x86: simplify conditions with split/kernel irqchip MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reviewed-by: Paolo Bonzini Reviewed-by: David Hildenbrand Signed-off-by: Radim Krčmář Signed-off-by: Paolo Bonzini --- arch/x86/kvm/irq_comm.c | 2 +- arch/x86/kvm/x86.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/x86/kvm/irq_comm.c b/arch/x86/kvm/irq_comm.c index 2639b8d3dce2..b96d3893f121 100644 --- a/arch/x86/kvm/irq_comm.c +++ b/arch/x86/kvm/irq_comm.c @@ -400,7 +400,7 @@ int kvm_setup_empty_irq_routing(struct kvm *kvm) void kvm_arch_post_irq_routing_update(struct kvm *kvm) { - if (ioapic_in_kernel(kvm) || !irqchip_in_kernel(kvm)) + if (!irqchip_split(kvm)) return; kvm_make_scan_ioapic_request(kvm); } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 05ac71a01f99..a356d8e12c2f 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4027,7 +4027,7 @@ long kvm_arch_vm_ioctl(struct file *filp, } r = -ENXIO; - if (!irqchip_in_kernel(kvm) || irqchip_split(kvm)) + if (!irqchip_kernel(kvm)) goto get_irqchip_out; r = kvm_vm_ioctl_get_irqchip(kvm, chip); if (r) @@ -4051,7 +4051,7 @@ long kvm_arch_vm_ioctl(struct file *filp, } r = -ENXIO; - if (!irqchip_in_kernel(kvm) || irqchip_split(kvm)) + if (!irqchip_kernel(kvm)) goto set_irqchip_out; r = kvm_vm_ioctl_set_irqchip(kvm, chip); if (r) -- cgit v1.2.3 From f3414bc77419463c0d81eaa2cea7ee4ccb447c7d Mon Sep 17 00:00:00 2001 From: David Matlack Date: Tue, 20 Dec 2016 15:25:57 -0800 Subject: kvm: x86: export maximum number of mmu_page_hash collisions Report the maximum number of mmu_page_hash collisions as a per-VM stat. This will make it easy to identify problems with the mmu_page_hash in the future. Signed-off-by: David Matlack Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/mmu.c | 25 +++++++++++++++++-------- arch/x86/kvm/x86.c | 2 ++ 3 files changed, 20 insertions(+), 8 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index fc03ab1f6110..1bb1ffc0024c 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -821,6 +821,7 @@ struct kvm_vm_stat { ulong mmu_unsync; ulong remote_tlb_flush; ulong lpages; + ulong max_mmu_page_hash_collisions; }; struct kvm_vcpu_stat { diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 7012de4a1fed..45ee7ae88239 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1904,17 +1904,17 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm, * since it has been deleted from active_mmu_pages but still can be found * at hast list. 
* - * for_each_gfn_valid_sp() has skipped that kind of pages. + * for_each_valid_sp() has skipped that kind of pages. */ -#define for_each_gfn_valid_sp(_kvm, _sp, _gfn) \ +#define for_each_valid_sp(_kvm, _sp, _gfn) \ hlist_for_each_entry(_sp, \ &(_kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(_gfn)], hash_link) \ - if ((_sp)->gfn != (_gfn) || is_obsolete_sp((_kvm), (_sp)) \ - || (_sp)->role.invalid) {} else + if (is_obsolete_sp((_kvm), (_sp)) || (_sp)->role.invalid) { \ + } else #define for_each_gfn_indirect_valid_sp(_kvm, _sp, _gfn) \ - for_each_gfn_valid_sp(_kvm, _sp, _gfn) \ - if ((_sp)->role.direct) {} else + for_each_valid_sp(_kvm, _sp, _gfn) \ + if ((_sp)->gfn != (_gfn) || (_sp)->role.direct) {} else /* @sp->gfn should be write-protected at the call site */ static bool __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, @@ -2116,6 +2116,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp; bool need_sync = false; bool flush = false; + int collisions = 0; LIST_HEAD(invalid_list); role = vcpu->arch.mmu.base_role; @@ -2130,7 +2131,12 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1; role.quadrant = quadrant; } - for_each_gfn_valid_sp(vcpu->kvm, sp, gfn) { + for_each_valid_sp(vcpu->kvm, sp, gfn) { + if (sp->gfn != gfn) { + collisions++; + continue; + } + if (!need_sync && sp->unsync) need_sync = true; @@ -2153,7 +2159,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, __clear_sp_write_flooding_count(sp); trace_kvm_mmu_get_page(sp, false); - return sp; + goto out; } ++vcpu->kvm->stat.mmu_cache_miss; @@ -2183,6 +2189,9 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, trace_kvm_mmu_get_page(sp, true); kvm_mmu_flush_or_zap(vcpu, &invalid_list, false, flush); +out: + if (collisions > vcpu->kvm->stat.max_mmu_page_hash_collisions) + vcpu->kvm->stat.max_mmu_page_hash_collisions = collisions; return sp; } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index a356d8e12c2f..4aece8b0a4aa 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -190,6 +190,8 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { { "mmu_unsync", VM_STAT(mmu_unsync) }, { "remote_tlb_flush", VM_STAT(remote_tlb_flush) }, { "largepages", VM_STAT(lpages) }, + { "max_mmu_page_hash_collisions", + VM_STAT(max_mmu_page_hash_collisions) }, { NULL } }; -- cgit v1.2.3 From 114df303a7eeae8b50ebf68229b7e647714a9bea Mon Sep 17 00:00:00 2001 From: David Matlack Date: Mon, 19 Dec 2016 13:58:25 -0800 Subject: kvm: x86: reduce collisions in mmu_page_hash When using two-dimensional paging, the mmu_page_hash (which provides lookups for existing kvm_mmu_page structs), becomes imbalanced; with too many collisions in buckets 0 and 512. This has been seen to cause mmu_lock to be held for multiple milliseconds in kvm_mmu_get_page on VMs with a large amount of RAM mapped with 4K pages. The current hash function uses the lower 10 bits of gfn to index into mmu_page_hash. When doing shadow paging, gfn is the address of the guest page table being shadow. These tables are 4K-aligned, which makes the low bits of gfn a good hash. However, with two-dimensional paging, no guest page tables are being shadowed, so gfn is the base address that is mapped by the table. Thus page tables (level=1) have a 2MB aligned gfn, page directories (level=2) have a 1GB aligned gfn, etc. This means hashes will only differ in their 10th bit. hash_64() provides a better hash. 
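As a rough standalone illustration (not part of this patch), the difference between the two hash functions can be reproduced with a few lines of userspace C. The golden-ratio multiplier is the value hash_64() uses after the hash rework commit cited at the end of this message; it is restated here as an assumption only so the sketch compiles on its own.

#include <stdio.h>
#include <stdint.h>

/* hash_64() multiplier after the hash_32()/hash_64() rework; assumed here
 * so the sketch is self-contained. */
#define GOLDEN_RATIO_64 0x61C8864680B583EBull

/* Old hash: gfn & ((1 << KVM_MMU_HASH_SHIFT) - 1), KVM_MMU_HASH_SHIFT = 10 */
static unsigned old_hash(uint64_t gfn)
{
	return gfn & ((1 << 10) - 1);
}

/* New hash: hash_64(gfn, KVM_MMU_HASH_SHIFT), KVM_MMU_HASH_SHIFT = 12 */
static unsigned new_hash(uint64_t gfn)
{
	return (gfn * GOLDEN_RATIO_64) >> (64 - 12);
}

int main(void)
{
	/*
	 * With TDP, a level-1 kvm_mmu_page maps a 2MB region, so its gfn is
	 * a multiple of 512: the old hash only ever yields buckets 0 and 512,
	 * while the multiplicative hash spreads the same gfns across the table.
	 */
	for (uint64_t gfn = 0; gfn < 8 * 512; gfn += 512)
		printf("gfn %5llu  old bucket %4u  new bucket %4u\n",
		       (unsigned long long)gfn, old_hash(gfn), new_hash(gfn));
	return 0;
}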
For example, on a VM with ~200G (99458 direct=1 kvm_mmu_page structs): hash max_mmu_page_hash_collisions -------------------------------------------- low 10 bits 49847 hash_64 105 perfect 97 While we're changing the hash, increase the table size by 4x to better support large VMs (further reduces number of collisions in 200G VM to 29). Note that hash_64() does not provide a good distribution prior to commit ef703f49a6c5 ("Eliminate bad hash multipliers from hash_32() and hash_64()"). Signed-off-by: David Matlack Change-Id: I5aa6b13c834722813c6cca46b8b1ed6f53368ade Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 2 +- arch/x86/kvm/mmu.c | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 1bb1ffc0024c..7e594a325158 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -115,7 +115,7 @@ static inline gfn_t gfn_to_index(gfn_t gfn, gfn_t base_gfn, int level) #define KVM_PERMILLE_MMU_PAGES 20 #define KVM_MIN_ALLOC_MMU_PAGES 64 -#define KVM_MMU_HASH_SHIFT 10 +#define KVM_MMU_HASH_SHIFT 12 #define KVM_NUM_MMU_PAGES (1 << KVM_MMU_HASH_SHIFT) #define KVM_MIN_FREE_MMU_PAGES 5 #define KVM_REFILL_PAGES 25 diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 45ee7ae88239..3f9fa39f1469 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -37,6 +37,7 @@ #include #include #include +#include #include #include @@ -1713,7 +1714,7 @@ static void kvm_mmu_free_page(struct kvm_mmu_page *sp) static unsigned kvm_page_table_hashfn(gfn_t gfn) { - return gfn & ((1 << KVM_MMU_HASH_SHIFT) - 1); + return hash_64(gfn, KVM_MMU_HASH_SHIFT); } static void mmu_page_add_parent_pte(struct kvm_vcpu *vcpu, -- cgit v1.2.3 From 27959a4415a5a00881a7b9353ab9b1274da2ca47 Mon Sep 17 00:00:00 2001 From: Junaid Shahid Date: Tue, 6 Dec 2016 16:46:10 -0800 Subject: kvm: x86: mmu: Use symbolic constants for EPT Violation Exit Qualifications This change adds some symbolic constants for VM Exit Qualifications related to EPT Violations and updates handle_ept_violation() to use these constants instead of hard-coded numbers. 
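As a condensed restatement (a sketch, not the actual hunk below; it assumes KVM's existing PFERR_*_MASK definitions and the constants added by this patch), the decode now reads as plain bit tests on the exit qualification:

/*
 * Sketch only: turning an EPT-violation exit qualification into a #PF-style
 * error code with the new symbolic constants; this mirrors the
 * handle_ept_violation() hunk below.
 */
static u32 ept_violation_error_code(unsigned long exit_qualification)
{
	u32 error_code = 0;

	if (exit_qualification & EPT_VIOLATION_READ)
		error_code |= PFERR_USER_MASK;		/* read access */
	if (exit_qualification & EPT_VIOLATION_WRITE)
		error_code |= PFERR_WRITE_MASK;		/* write access */
	if (exit_qualification & EPT_VIOLATION_INSTR)
		error_code |= PFERR_FETCH_MASK;		/* instruction fetch */
	if (exit_qualification & (EPT_VIOLATION_READABLE |
				  EPT_VIOLATION_WRITABLE |
				  EPT_VIOLATION_EXECUTABLE))
		error_code |= PFERR_PRESENT_MASK;	/* EPT entry was present */

	return error_code;
}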
Signed-off-by: Junaid Shahid Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/vmx.h | 16 ++++++++++++++++ arch/x86/kvm/vmx.c | 22 ++++++++++++++-------- 2 files changed, 30 insertions(+), 8 deletions(-) diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index 2b5b2d4b924e..25a482fb5241 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -499,6 +499,22 @@ struct vmx_msr_entry { #define ENTRY_FAIL_NMI 3 #define ENTRY_FAIL_VMCS_LINK_PTR 4 +/* + * Exit Qualifications for EPT Violations + */ +#define EPT_VIOLATION_READ_BIT 0 +#define EPT_VIOLATION_WRITE_BIT 1 +#define EPT_VIOLATION_INSTR_BIT 2 +#define EPT_VIOLATION_READABLE_BIT 3 +#define EPT_VIOLATION_WRITABLE_BIT 4 +#define EPT_VIOLATION_EXECUTABLE_BIT 5 +#define EPT_VIOLATION_READ (1 << EPT_VIOLATION_READ_BIT) +#define EPT_VIOLATION_WRITE (1 << EPT_VIOLATION_WRITE_BIT) +#define EPT_VIOLATION_INSTR (1 << EPT_VIOLATION_INSTR_BIT) +#define EPT_VIOLATION_READABLE (1 << EPT_VIOLATION_READABLE_BIT) +#define EPT_VIOLATION_WRITABLE (1 << EPT_VIOLATION_WRITABLE_BIT) +#define EPT_VIOLATION_EXECUTABLE (1 << EPT_VIOLATION_EXECUTABLE_BIT) + /* * VM-instruction error numbers */ diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index a236decb81e4..81159a3878f4 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -6374,14 +6374,20 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu) gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS); trace_kvm_page_fault(gpa, exit_qualification); - /* it is a read fault? */ - error_code = (exit_qualification << 2) & PFERR_USER_MASK; - /* it is a write fault? */ - error_code |= exit_qualification & PFERR_WRITE_MASK; - /* It is a fetch fault? */ - error_code |= (exit_qualification << 2) & PFERR_FETCH_MASK; - /* ept page table is present? */ - error_code |= (exit_qualification & 0x38) != 0; + /* Is it a read fault? */ + error_code = (exit_qualification & EPT_VIOLATION_READ) + ? PFERR_USER_MASK : 0; + /* Is it a write fault? */ + error_code |= (exit_qualification & EPT_VIOLATION_WRITE) + ? PFERR_WRITE_MASK : 0; + /* Is it a fetch fault? */ + error_code |= (exit_qualification & EPT_VIOLATION_INSTR) + ? PFERR_FETCH_MASK : 0; + /* ept page table entry is present? */ + error_code |= (exit_qualification & + (EPT_VIOLATION_READABLE | EPT_VIOLATION_WRITABLE | + EPT_VIOLATION_EXECUTABLE)) + ? PFERR_PRESENT_MASK : 0; vcpu->arch.exit_qualification = exit_qualification; -- cgit v1.2.3 From ea4114bcd3a8c84f0eb0b52e56d348c27ddede2e Mon Sep 17 00:00:00 2001 From: Junaid Shahid Date: Tue, 6 Dec 2016 16:46:11 -0800 Subject: kvm: x86: mmu: Rename spte_is_locklessly_modifiable() This change renames spte_is_locklessly_modifiable() to spte_can_locklessly_be_made_writable() to distinguish it from other forms of lockless modifications. The full set of lockless modifications is covered by spte_has_volatile_bits(). 
Signed-off-by: Junaid Shahid Reviewed-by: Paolo Bonzini Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 3f9fa39f1469..e923f393ac26 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -474,7 +474,7 @@ retry: } #endif -static bool spte_is_locklessly_modifiable(u64 spte) +static bool spte_can_locklessly_be_made_writable(u64 spte) { return (spte & (SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE)) == (SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE); @@ -488,7 +488,7 @@ static bool spte_has_volatile_bits(u64 spte) * also, it can help us to get a stable is_writable_pte() * to ensure tlb flush is not missed. */ - if (spte_is_locklessly_modifiable(spte)) + if (spte_can_locklessly_be_made_writable(spte)) return true; if (!shadow_accessed_mask) @@ -557,7 +557,7 @@ static bool mmu_spte_update(u64 *sptep, u64 new_spte) * we always atomically update it, see the comments in * spte_has_volatile_bits(). */ - if (spte_is_locklessly_modifiable(old_spte) && + if (spte_can_locklessly_be_made_writable(old_spte) && !is_writable_pte(new_spte)) ret = true; @@ -1213,7 +1213,7 @@ static bool spte_write_protect(u64 *sptep, bool pt_protect) u64 spte = *sptep; if (!is_writable_pte(spte) && - !(pt_protect && spte_is_locklessly_modifiable(spte))) + !(pt_protect && spte_can_locklessly_be_made_writable(spte))) return false; rmap_printk("rmap_write_protect: spte %p %llx\n", sptep, *sptep); @@ -2975,7 +2975,7 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level, * Currently, to simplify the code, only the spte write-protected * by dirty-log can be fast fixed. */ - if (!spte_is_locklessly_modifiable(spte)) + if (!spte_can_locklessly_be_made_writable(spte)) goto exit; /* -- cgit v1.2.3 From 97dceba29a6acbb28d16c8c5757ae9f4e1e482ea Mon Sep 17 00:00:00 2001 From: Junaid Shahid Date: Tue, 6 Dec 2016 16:46:12 -0800 Subject: kvm: x86: mmu: Fast Page Fault path retries This change adds retries into the Fast Page Fault path. Without the retries, the code still works, but if a retry does end up being needed, then it will result in a second page fault for the same memory access, which will cause much more overhead compared to just retrying within the original fault. This would be especially useful with the upcoming fast access tracking change, as that would make it more likely for retries to be needed (e.g. due to read and write faults happening on different CPUs at the same time). Signed-off-by: Junaid Shahid Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.c | 124 +++++++++++++++++++++++++++++++---------------------- 1 file changed, 73 insertions(+), 51 deletions(-) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index e923f393ac26..f6d3505c8d18 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -2891,6 +2891,10 @@ static bool page_fault_can_be_fast(u32 error_code) return true; } +/* + * Returns true if the SPTE was fixed successfully. Otherwise, + * someone else modified the SPTE from its original value. + */ static bool fast_pf_fix_direct_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, u64 *sptep, u64 spte) @@ -2917,8 +2921,10 @@ fast_pf_fix_direct_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, * * Compare with set_spte where instead shadow_dirty_mask is set. 
*/ - if (cmpxchg64(sptep, spte, spte | PT_WRITABLE_MASK) == spte) - kvm_vcpu_mark_page_dirty(vcpu, gfn); + if (cmpxchg64(sptep, spte, spte | PT_WRITABLE_MASK) != spte) + return false; + + kvm_vcpu_mark_page_dirty(vcpu, gfn); return true; } @@ -2933,8 +2939,9 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level, { struct kvm_shadow_walk_iterator iterator; struct kvm_mmu_page *sp; - bool ret = false; + bool fault_handled = false; u64 spte = 0ull; + uint retry_count = 0; if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) return false; @@ -2947,62 +2954,77 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level, if (!is_shadow_present_pte(spte) || iterator.level < level) break; - /* - * If the mapping has been changed, let the vcpu fault on the - * same address again. - */ - if (!is_shadow_present_pte(spte)) { - ret = true; - goto exit; - } + do { + /* + * If the mapping has been changed, let the vcpu fault on the + * same address again. + */ + if (!is_shadow_present_pte(spte)) { + fault_handled = true; + break; + } - sp = page_header(__pa(iterator.sptep)); - if (!is_last_spte(spte, sp->role.level)) - goto exit; + sp = page_header(__pa(iterator.sptep)); + if (!is_last_spte(spte, sp->role.level)) + break; - /* - * Check if it is a spurious fault caused by TLB lazily flushed. - * - * Need not check the access of upper level table entries since - * they are always ACC_ALL. - */ - if (is_writable_pte(spte)) { - ret = true; - goto exit; - } + /* + * Check if it is a spurious fault caused by TLB lazily flushed. + * + * Need not check the access of upper level table entries since + * they are always ACC_ALL. + */ + if (is_writable_pte(spte)) { + fault_handled = true; + break; + } - /* - * Currently, to simplify the code, only the spte write-protected - * by dirty-log can be fast fixed. - */ - if (!spte_can_locklessly_be_made_writable(spte)) - goto exit; + /* + * Currently, to simplify the code, only the spte + * write-protected by dirty-log can be fast fixed. + */ + if (!spte_can_locklessly_be_made_writable(spte)) + break; - /* - * Do not fix write-permission on the large spte since we only dirty - * the first page into the dirty-bitmap in fast_pf_fix_direct_spte() - * that means other pages are missed if its slot is dirty-logged. - * - * Instead, we let the slow page fault path create a normal spte to - * fix the access. - * - * See the comments in kvm_arch_commit_memory_region(). - */ - if (sp->role.level > PT_PAGE_TABLE_LEVEL) - goto exit; + /* + * Do not fix write-permission on the large spte since we only + * dirty the first page into the dirty-bitmap in + * fast_pf_fix_direct_spte() that means other pages are missed + * if its slot is dirty-logged. + * + * Instead, we let the slow page fault path create a normal spte + * to fix the access. + * + * See the comments in kvm_arch_commit_memory_region(). + */ + if (sp->role.level > PT_PAGE_TABLE_LEVEL) + break; + + /* + * Currently, fast page fault only works for direct mapping + * since the gfn is not stable for indirect shadow page. See + * Documentation/virtual/kvm/locking.txt to get more detail. + */ + fault_handled = fast_pf_fix_direct_spte(vcpu, sp, + iterator.sptep, spte); + if (fault_handled) + break; + + if (++retry_count > 4) { + printk_once(KERN_WARNING + "kvm: Fast #PF retrying more than 4 times.\n"); + break; + } + + spte = mmu_spte_get_lockless(iterator.sptep); + + } while (true); - /* - * Currently, fast page fault only works for direct mapping since - * the gfn is not stable for indirect shadow page. 
- * See Documentation/virtual/kvm/locking.txt to get more detail. - */ - ret = fast_pf_fix_direct_spte(vcpu, sp, iterator.sptep, spte); -exit: trace_fast_page_fault(vcpu, gva, error_code, iterator.sptep, - spte, ret); + spte, fault_handled); walk_shadow_page_lockless_end(vcpu); - return ret; + return fault_handled; } static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn, -- cgit v1.2.3 From 83ef6c8155c0ecb4c1a7e6bfbe425c85f7cb676d Mon Sep 17 00:00:00 2001 From: Junaid Shahid Date: Tue, 6 Dec 2016 16:46:13 -0800 Subject: kvm: x86: mmu: Refactor accessed/dirty checks in mmu_spte_update/clear This simplifies mmu_spte_update() a little bit. The checks for clearing of accessed and dirty bits are refactored into separate functions, which are used inside both mmu_spte_update() and mmu_spte_clear_track_bits(), as well as kvm_test_age_rmapp(). The new helper functions handle both the case when A/D bits are supported in hardware and the case when they are not. Signed-off-by: Junaid Shahid Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.c | 66 +++++++++++++++++++++++++----------------------------- 1 file changed, 31 insertions(+), 35 deletions(-) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index f6d3505c8d18..cfef95969335 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -504,14 +504,16 @@ static bool spte_has_volatile_bits(u64 spte) return true; } -static bool spte_is_bit_cleared(u64 old_spte, u64 new_spte, u64 bit_mask) +static bool is_accessed_spte(u64 spte) { - return (old_spte & bit_mask) && !(new_spte & bit_mask); + return shadow_accessed_mask ? spte & shadow_accessed_mask + : true; } -static bool spte_is_bit_changed(u64 old_spte, u64 new_spte, u64 bit_mask) +static bool is_dirty_spte(u64 spte) { - return (old_spte & bit_mask) != (new_spte & bit_mask); + return shadow_dirty_mask ? spte & shadow_dirty_mask + : spte & PT_WRITABLE_MASK; } /* Rules for using mmu_spte_set: @@ -534,17 +536,19 @@ static void mmu_spte_set(u64 *sptep, u64 new_spte) * will find a read-only spte, even though the writable spte * might be cached on a CPU's TLB, the return value indicates this * case. + * + * Returns true if the TLB needs to be flushed */ static bool mmu_spte_update(u64 *sptep, u64 new_spte) { u64 old_spte = *sptep; - bool ret = false; + bool flush = false; WARN_ON(!is_shadow_present_pte(new_spte)); if (!is_shadow_present_pte(old_spte)) { mmu_spte_set(sptep, new_spte); - return ret; + return flush; } if (!spte_has_volatile_bits(old_spte)) @@ -552,6 +556,8 @@ static bool mmu_spte_update(u64 *sptep, u64 new_spte) else old_spte = __update_clear_spte_slow(sptep, new_spte); + WARN_ON(spte_to_pfn(old_spte) != spte_to_pfn(new_spte)); + /* * For the spte updated out of mmu-lock is safe, since * we always atomically update it, see the comments in @@ -559,38 +565,31 @@ static bool mmu_spte_update(u64 *sptep, u64 new_spte) */ if (spte_can_locklessly_be_made_writable(old_spte) && !is_writable_pte(new_spte)) - ret = true; - - if (!shadow_accessed_mask) { - /* - * We don't set page dirty when dropping non-writable spte. - * So do it now if the new spte is becoming non-writable. - */ - if (ret) - kvm_set_pfn_dirty(spte_to_pfn(old_spte)); - return ret; - } + flush = true; /* - * Flush TLB when accessed/dirty bits are changed in the page tables, + * Flush TLB when accessed/dirty states are changed in the page tables, * to guarantee consistency between TLB and page tables. 
*/ - if (spte_is_bit_changed(old_spte, new_spte, - shadow_accessed_mask | shadow_dirty_mask)) - ret = true; - if (spte_is_bit_cleared(old_spte, new_spte, shadow_accessed_mask)) + if (is_accessed_spte(old_spte) && !is_accessed_spte(new_spte)) { + flush = true; kvm_set_pfn_accessed(spte_to_pfn(old_spte)); - if (spte_is_bit_cleared(old_spte, new_spte, shadow_dirty_mask)) + } + + if (is_dirty_spte(old_spte) && !is_dirty_spte(new_spte)) { + flush = true; kvm_set_pfn_dirty(spte_to_pfn(old_spte)); + } - return ret; + return flush; } /* * Rules for using mmu_spte_clear_track_bits: * It sets the sptep from present to nonpresent, and track the * state bits, it is used to clear the last level sptep. + * Returns non-zero if the PTE was previously valid. */ static int mmu_spte_clear_track_bits(u64 *sptep) { @@ -614,11 +613,12 @@ static int mmu_spte_clear_track_bits(u64 *sptep) */ WARN_ON(!kvm_is_reserved_pfn(pfn) && !page_count(pfn_to_page(pfn))); - if (!shadow_accessed_mask || old_spte & shadow_accessed_mask) + if (is_accessed_spte(old_spte)) kvm_set_pfn_accessed(pfn); - if (old_spte & (shadow_dirty_mask ? shadow_dirty_mask : - PT_WRITABLE_MASK)) + + if (is_dirty_spte(old_spte)) kvm_set_pfn_dirty(pfn); + return 1; } @@ -1616,7 +1616,6 @@ static int kvm_test_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head, { u64 *sptep; struct rmap_iterator iter; - int young = 0; /* * If there's no access bit in the secondary pte set by the @@ -1626,14 +1625,11 @@ static int kvm_test_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head, if (!shadow_accessed_mask) goto out; - for_each_rmap_spte(rmap_head, &iter, sptep) { - if (*sptep & shadow_accessed_mask) { - young = 1; - break; - } - } + for_each_rmap_spte(rmap_head, &iter, sptep) + if (is_accessed_spte(*sptep)) + return 1; out: - return young; + return 0; } #define RMAP_RECYCLE_THRESHOLD 1000 -- cgit v1.2.3 From f39a058d0ea2f58b9c69cfcf7c93184f33302c98 Mon Sep 17 00:00:00 2001 From: Junaid Shahid Date: Tue, 6 Dec 2016 16:46:14 -0800 Subject: kvm: x86: mmu: Introduce a no-tracking version of mmu_spte_update mmu_spte_update() tracks changes in the accessed/dirty state of the SPTE being updated and calls kvm_set_pfn_accessed/dirty appropriately. However, in some cases (e.g. when aging the SPTE), this shouldn't be done. mmu_spte_update_no_track() is introduced for use in such cases. Signed-off-by: Junaid Shahid Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.c | 40 +++++++++++++++++++++++++++------------- 1 file changed, 27 insertions(+), 13 deletions(-) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index cfef95969335..b8b5259c8ebb 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -528,27 +528,19 @@ static void mmu_spte_set(u64 *sptep, u64 new_spte) __set_spte(sptep, new_spte); } -/* Rules for using mmu_spte_update: - * Update the state bits, it means the mapped pfn is not changed. - * - * Whenever we overwrite a writable spte with a read-only one we - * should flush remote TLBs. Otherwise rmap_write_protect - * will find a read-only spte, even though the writable spte - * might be cached on a CPU's TLB, the return value indicates this - * case. - * - * Returns true if the TLB needs to be flushed +/* + * Update the SPTE (excluding the PFN), but do not track changes in its + * accessed/dirty status. 
*/ -static bool mmu_spte_update(u64 *sptep, u64 new_spte) +static u64 mmu_spte_update_no_track(u64 *sptep, u64 new_spte) { u64 old_spte = *sptep; - bool flush = false; WARN_ON(!is_shadow_present_pte(new_spte)); if (!is_shadow_present_pte(old_spte)) { mmu_spte_set(sptep, new_spte); - return flush; + return old_spte; } if (!spte_has_volatile_bits(old_spte)) @@ -558,6 +550,28 @@ static bool mmu_spte_update(u64 *sptep, u64 new_spte) WARN_ON(spte_to_pfn(old_spte) != spte_to_pfn(new_spte)); + return old_spte; +} + +/* Rules for using mmu_spte_update: + * Update the state bits, it means the mapped pfn is not changed. + * + * Whenever we overwrite a writable spte with a read-only one we + * should flush remote TLBs. Otherwise rmap_write_protect + * will find a read-only spte, even though the writable spte + * might be cached on a CPU's TLB, the return value indicates this + * case. + * + * Returns true if the TLB needs to be flushed + */ +static bool mmu_spte_update(u64 *sptep, u64 new_spte) +{ + bool flush = false; + u64 old_spte = mmu_spte_update_no_track(sptep, new_spte); + + if (!is_shadow_present_pte(old_spte)) + return false; + /* * For the spte updated out of mmu-lock is safe, since * we always atomically update it, see the comments in -- cgit v1.2.3 From 37f0e8fe6b10ee2ab52576caa721ee1282de74a6 Mon Sep 17 00:00:00 2001 From: Junaid Shahid Date: Tue, 6 Dec 2016 16:46:15 -0800 Subject: kvm: x86: mmu: Do not use bit 63 for tracking special SPTEs MMIO SPTEs currently set both bits 62 and 63 to distinguish them as special PTEs. However, bit 63 is used as the SVE bit in Intel EPT PTEs. The SVE bit is ignored for misconfigured PTEs but not necessarily for not-Present PTEs. Since MMIO SPTEs use an EPT misconfiguration, so using bit 63 for them is acceptable. However, the upcoming fast access tracking feature adds another type of special tracking PTE, which uses not-Present PTEs and hence should not set bit 63. In order to use common bits to distinguish both type of special PTEs, we now use only bit 62 as the special bit. Signed-off-by: Junaid Shahid Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 7 +++++++ arch/x86/include/asm/vmx.h | 9 +++++++-- arch/x86/kvm/vmx.c | 6 +++--- 3 files changed, 17 insertions(+), 5 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 7e594a325158..3272a5e4aaad 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -208,6 +208,13 @@ enum { PFERR_WRITE_MASK | \ PFERR_PRESENT_MASK) +/* + * The mask used to denote special SPTEs, which can be either MMIO SPTEs or + * Access Tracking SPTEs. We use bit 62 instead of bit 63 to avoid conflicting + * with the SVE bit in EPT PTEs. 
+ */ +#define SPTE_SPECIAL_MASK (1ULL << 62) + /* apic attention bits */ #define KVM_APIC_CHECK_VAPIC 0 /* diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index 25a482fb5241..fc061cbb46e0 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -467,8 +467,13 @@ enum vmcs_field { #define VMX_EPT_WRITABLE_MASK 0x2ull #define VMX_EPT_EXECUTABLE_MASK 0x4ull #define VMX_EPT_IPAT_BIT (1ull << 6) -#define VMX_EPT_ACCESS_BIT (1ull << 8) -#define VMX_EPT_DIRTY_BIT (1ull << 9) +#define VMX_EPT_ACCESS_BIT (1ull << 8) +#define VMX_EPT_DIRTY_BIT (1ull << 9) + +/* The mask to use to trigger an EPT Misconfiguration in order to track MMIO */ +#define VMX_EPT_MISCONFIG_WX_VALUE (VMX_EPT_WRITABLE_MASK | \ + VMX_EPT_EXECUTABLE_MASK) + #define VMX_EPT_IDENTITY_PAGETABLE_ADDR 0xfffbc000ul diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 81159a3878f4..6f53dedd9b96 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -5236,10 +5236,10 @@ static void ept_set_mmio_spte_mask(void) /* * EPT Misconfigurations can be generated if the value of bits 2:0 * of an EPT paging-structure entry is 110b (write/execute). - * Also, magic bits (0x3ull << 62) is set to quickly identify mmio - * spte. + * Also, special bit (62) is set to quickly identify mmio spte. */ - kvm_mmu_set_mmio_spte_mask((0x3ull << 62) | 0x6ull); + kvm_mmu_set_mmio_spte_mask(SPTE_SPECIAL_MASK | + VMX_EPT_MISCONFIG_WX_VALUE); } #define VMX_XSS_EXIT_BITMAP 0 -- cgit v1.2.3 From f160c7b7bb322bf079a5bb4dd34c58f17553f193 Mon Sep 17 00:00:00 2001 From: Junaid Shahid Date: Tue, 6 Dec 2016 16:46:16 -0800 Subject: kvm: x86: mmu: Lockless access tracking for Intel CPUs without EPT A bits. This change implements lockless access tracking for Intel CPUs without EPT A bits. This is achieved by marking the PTEs as not-present (but not completely clearing them) when clear_flush_young() is called after marking the pages as accessed. When an EPT Violation is generated as a result of the VM accessing those pages, the PTEs are restored to their original values. 
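In outline, the transform is small. Below is a simplified, self-contained sketch of the two steps; the helper names are illustrative (the real ones added by this patch are mark_spte_for_access_track() and fast_pf_fix_direct_spte()), and the bit positions restate what the patch uses: SPTE_SPECIAL_MASK at bit 62, with the saved R/X bits stashed at PT64_SECOND_AVAIL_BITS_SHIFT (assumed to be 52 here).

#include <stdint.h>

/* Bit layout restated from the patch below so the sketch stands alone:
 *   bit 62      - SPTE_SPECIAL_MASK, marks an access-tracked SPTE
 *   bits 0-2    - EPT R/W/X; only R and X are worth saving
 *   bits 52+    - ignored by hardware, used to stash the saved R/X bits
 */
#define SPTE_SPECIAL_MASK	(1ULL << 62)
#define EPT_RWX_MASK		0x7ULL
#define SAVED_RX_MASK		0x5ULL	/* R (bit 0) and X (bit 2) */
#define SAVED_BITS_SHIFT	52

/* clear_flush_young() path: make the SPTE non-present but recoverable. */
static uint64_t mark_for_access_track(uint64_t spte)
{
	spte |= (spte & SAVED_RX_MASK) << SAVED_BITS_SHIFT; /* stash R/X */
	spte &= ~EPT_RWX_MASK;		/* next guest access faults */
	return spte | SPTE_SPECIAL_MASK;	/* recognizable as access-tracked */
}

/* Fast page fault path: undo the transform once the guest touches the page. */
static uint64_t restore_from_access_track(uint64_t spte)
{
	spte |= (spte >> SAVED_BITS_SHIFT) & SAVED_RX_MASK; /* restore R/X */
	spte &= ~(SAVED_RX_MASK << SAVED_BITS_SHIFT);
	return spte & ~SPTE_SPECIAL_MASK;
	/* W was never saved: a later write fault makes the page writable
	 * again through the existing dirty-tracking fast path. */
}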
Signed-off-by: Junaid Shahid Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 3 +- arch/x86/include/asm/vmx.h | 9 +- arch/x86/kvm/mmu.c | 279 ++++++++++++++++++++++++++++++---------- arch/x86/kvm/vmx.c | 26 ++-- arch/x86/kvm/x86.c | 2 +- 5 files changed, 239 insertions(+), 80 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 3272a5e4aaad..99a71d90b6ae 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1064,7 +1064,8 @@ void kvm_mmu_setup(struct kvm_vcpu *vcpu); void kvm_mmu_init_vm(struct kvm *kvm); void kvm_mmu_uninit_vm(struct kvm *kvm); void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask, - u64 dirty_mask, u64 nx_mask, u64 x_mask, u64 p_mask); + u64 dirty_mask, u64 nx_mask, u64 x_mask, u64 p_mask, + u64 acc_track_mask); void kvm_mmu_reset_context(struct kvm_vcpu *vcpu); void kvm_mmu_slot_remove_write_access(struct kvm *kvm, diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index fc061cbb46e0..a22a4790f1ac 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -469,11 +469,14 @@ enum vmcs_field { #define VMX_EPT_IPAT_BIT (1ull << 6) #define VMX_EPT_ACCESS_BIT (1ull << 8) #define VMX_EPT_DIRTY_BIT (1ull << 9) +#define VMX_EPT_RWX_MASK (VMX_EPT_READABLE_MASK | \ + VMX_EPT_WRITABLE_MASK | \ + VMX_EPT_EXECUTABLE_MASK) +#define VMX_EPT_MT_MASK (7ull << VMX_EPT_MT_EPTE_SHIFT) /* The mask to use to trigger an EPT Misconfiguration in order to track MMIO */ -#define VMX_EPT_MISCONFIG_WX_VALUE (VMX_EPT_WRITABLE_MASK | \ - VMX_EPT_EXECUTABLE_MASK) - +#define VMX_EPT_MISCONFIG_WX_VALUE (VMX_EPT_WRITABLE_MASK | \ + VMX_EPT_EXECUTABLE_MASK) #define VMX_EPT_IDENTITY_PAGETABLE_ADDR 0xfffbc000ul diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index b8b5259c8ebb..64821ca3a7c3 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -38,6 +38,7 @@ #include #include #include +#include #include #include @@ -130,6 +131,10 @@ module_param(dbg, bool, 0644); #define ACC_USER_MASK PT_USER_MASK #define ACC_ALL (ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK) +/* The mask for the R/X bits in EPT PTEs */ +#define PT64_EPT_READABLE_MASK 0x1ull +#define PT64_EPT_EXECUTABLE_MASK 0x4ull + #include #define CREATE_TRACE_POINTS @@ -179,6 +184,25 @@ static u64 __read_mostly shadow_dirty_mask; static u64 __read_mostly shadow_mmio_mask; static u64 __read_mostly shadow_present_mask; +/* + * The mask/value to distinguish a PTE that has been marked not-present for + * access tracking purposes. + * The mask would be either 0 if access tracking is disabled, or + * SPTE_SPECIAL_MASK|VMX_EPT_RWX_MASK if access tracking is enabled. + */ +static u64 __read_mostly shadow_acc_track_mask; +static const u64 shadow_acc_track_value = SPTE_SPECIAL_MASK; + +/* + * The mask/shift to use for saving the original R/X bits when marking the PTE + * as not-present for access tracking purposes. We do not save the W bit as the + * PTEs being access tracked also need to be dirty tracked, so the W bit will be + * restored only when a write is attempted to the page. 
+ */ +static const u64 shadow_acc_track_saved_bits_mask = PT64_EPT_READABLE_MASK | + PT64_EPT_EXECUTABLE_MASK; +static const u64 shadow_acc_track_saved_bits_shift = PT64_SECOND_AVAIL_BITS_SHIFT; + static void mmu_spte_set(u64 *sptep, u64 spte); static void mmu_free_roots(struct kvm_vcpu *vcpu); @@ -188,6 +212,12 @@ void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask) } EXPORT_SYMBOL_GPL(kvm_mmu_set_mmio_spte_mask); +static inline bool is_access_track_spte(u64 spte) +{ + /* Always false if shadow_acc_track_mask is zero. */ + return (spte & shadow_acc_track_mask) == shadow_acc_track_value; +} + /* * the low bit of the generation number is always presumed to be zero. * This disables mmio caching during memslot updates. The concept is @@ -285,7 +315,8 @@ static bool check_mmio_spte(struct kvm_vcpu *vcpu, u64 spte) } void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask, - u64 dirty_mask, u64 nx_mask, u64 x_mask, u64 p_mask) + u64 dirty_mask, u64 nx_mask, u64 x_mask, u64 p_mask, + u64 acc_track_mask) { shadow_user_mask = user_mask; shadow_accessed_mask = accessed_mask; @@ -293,9 +324,23 @@ void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask, shadow_nx_mask = nx_mask; shadow_x_mask = x_mask; shadow_present_mask = p_mask; + shadow_acc_track_mask = acc_track_mask; + WARN_ON(shadow_accessed_mask != 0 && shadow_acc_track_mask != 0); } EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes); +void kvm_mmu_clear_all_pte_masks(void) +{ + shadow_user_mask = 0; + shadow_accessed_mask = 0; + shadow_dirty_mask = 0; + shadow_nx_mask = 0; + shadow_x_mask = 0; + shadow_mmio_mask = 0; + shadow_present_mask = 0; + shadow_acc_track_mask = 0; +} + static int is_cpuid_PSE36(void) { return 1; @@ -308,7 +353,7 @@ static int is_nx(struct kvm_vcpu *vcpu) static int is_shadow_present_pte(u64 pte) { - return (pte & 0xFFFFFFFFull) && !is_mmio_spte(pte); + return (pte != 0) && !is_mmio_spte(pte); } static int is_large_pte(u64 pte) @@ -482,32 +527,32 @@ static bool spte_can_locklessly_be_made_writable(u64 spte) static bool spte_has_volatile_bits(u64 spte) { + if (!is_shadow_present_pte(spte)) + return false; + /* * Always atomically update spte if it can be updated * out of mmu-lock, it can ensure dirty bit is not lost, * also, it can help us to get a stable is_writable_pte() * to ensure tlb flush is not missed. */ - if (spte_can_locklessly_be_made_writable(spte)) + if (spte_can_locklessly_be_made_writable(spte) || + is_access_track_spte(spte)) return true; - if (!shadow_accessed_mask) - return false; - - if (!is_shadow_present_pte(spte)) - return false; - - if ((spte & shadow_accessed_mask) && - (!is_writable_pte(spte) || (spte & shadow_dirty_mask))) - return false; + if (shadow_accessed_mask) { + if ((spte & shadow_accessed_mask) == 0 || + (is_writable_pte(spte) && (spte & shadow_dirty_mask) == 0)) + return true; + } - return true; + return false; } static bool is_accessed_spte(u64 spte) { return shadow_accessed_mask ? spte & shadow_accessed_mask - : true; + : !is_access_track_spte(spte); } static bool is_dirty_spte(u64 spte) @@ -651,6 +696,61 @@ static u64 mmu_spte_get_lockless(u64 *sptep) return __get_spte_lockless(sptep); } +static u64 mark_spte_for_access_track(u64 spte) +{ + if (shadow_accessed_mask != 0) + return spte & ~shadow_accessed_mask; + + if (shadow_acc_track_mask == 0 || is_access_track_spte(spte)) + return spte; + + /* + * Verify that the write-protection that we do below will be fixable + * via the fast page fault path. 
Currently, that is always the case, at + * least when using EPT (which is when access tracking would be used). + */ + WARN_ONCE((spte & PT_WRITABLE_MASK) && + !spte_can_locklessly_be_made_writable(spte), + "kvm: Writable SPTE is not locklessly dirty-trackable\n"); + + WARN_ONCE(spte & (shadow_acc_track_saved_bits_mask << + shadow_acc_track_saved_bits_shift), + "kvm: Access Tracking saved bit locations are not zero\n"); + + spte |= (spte & shadow_acc_track_saved_bits_mask) << + shadow_acc_track_saved_bits_shift; + spte &= ~shadow_acc_track_mask; + spte |= shadow_acc_track_value; + + return spte; +} + +/* Returns the Accessed status of the PTE and resets it at the same time. */ +static bool mmu_spte_age(u64 *sptep) +{ + u64 spte = mmu_spte_get_lockless(sptep); + + if (!is_accessed_spte(spte)) + return false; + + if (shadow_accessed_mask) { + clear_bit((ffs(shadow_accessed_mask) - 1), + (unsigned long *)sptep); + } else { + /* + * Capture the dirty status of the page, so that it doesn't get + * lost when the SPTE is marked for access tracking. + */ + if (is_writable_pte(spte)) + kvm_set_pfn_dirty(spte_to_pfn(spte)); + + spte = mark_spte_for_access_track(spte); + mmu_spte_update_no_track(sptep, spte); + } + + return true; +} + static void walk_shadow_page_lockless_begin(struct kvm_vcpu *vcpu) { /* @@ -1435,7 +1535,7 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head, restart: for_each_rmap_spte(rmap_head, &iter, sptep) { rmap_printk("kvm_set_pte_rmapp: spte %p %llx gfn %llx (%d)\n", - sptep, *sptep, gfn, level); + sptep, *sptep, gfn, level); need_flush = 1; @@ -1448,7 +1548,8 @@ restart: new_spte &= ~PT_WRITABLE_MASK; new_spte &= ~SPTE_HOST_WRITEABLE; - new_spte &= ~shadow_accessed_mask; + + new_spte = mark_spte_for_access_track(new_spte); mmu_spte_clear_track_bits(sptep); mmu_spte_set(sptep, new_spte); @@ -1610,15 +1711,8 @@ static int kvm_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head, struct rmap_iterator uninitialized_var(iter); int young = 0; - BUG_ON(!shadow_accessed_mask); - - for_each_rmap_spte(rmap_head, &iter, sptep) { - if (*sptep & shadow_accessed_mask) { - young = 1; - clear_bit((ffs(shadow_accessed_mask) - 1), - (unsigned long *)sptep); - } - } + for_each_rmap_spte(rmap_head, &iter, sptep) + young |= mmu_spte_age(sptep); trace_kvm_age_page(gfn, level, slot, young); return young; @@ -1632,11 +1726,11 @@ static int kvm_test_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head, struct rmap_iterator iter; /* - * If there's no access bit in the secondary pte set by the - * hardware it's up to gup-fast/gup to set the access bit in - * the primary pte or in the page structure. + * If there's no access bit in the secondary pte set by the hardware and + * fast access tracking is also not enabled, it's up to gup-fast/gup to + * set the access bit in the primary pte or in the page structure. */ - if (!shadow_accessed_mask) + if (!shadow_accessed_mask && !shadow_acc_track_mask) goto out; for_each_rmap_spte(rmap_head, &iter, sptep) @@ -1671,7 +1765,7 @@ int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end) * This has some overhead, but not as much as the cost of swapping * out actively used pages or breaking up actively used hugepages. 
*/ - if (!shadow_accessed_mask) + if (!shadow_accessed_mask && !shadow_acc_track_mask) return kvm_handle_hva_range(kvm, start, end, 0, kvm_unmap_rmapp); @@ -2603,6 +2697,9 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, spte |= shadow_dirty_mask; } + if (speculative) + spte = mark_spte_for_access_track(spte); + set_pte: if (mmu_spte_update(sptep, spte)) kvm_flush_remote_tlbs(vcpu->kvm); @@ -2656,7 +2753,7 @@ static bool mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, unsigned pte_access, pgprintk("%s: setting spte %llx\n", __func__, *sptep); pgprintk("instantiating %s PTE (%s) at %llx (%llx) addr %p\n", is_large_pte(*sptep)? "2MB" : "4kB", - *sptep & PT_PRESENT_MASK ?"RW":"R", gfn, + *sptep & PT_WRITABLE_MASK ? "RW" : "R", gfn, *sptep, sptep); if (!was_rmapped && is_large_pte(*sptep)) ++vcpu->kvm->stat.lpages; @@ -2889,16 +2986,28 @@ static bool page_fault_can_be_fast(u32 error_code) if (unlikely(error_code & PFERR_RSVD_MASK)) return false; + /* See if the page fault is due to an NX violation */ + if (unlikely(((error_code & (PFERR_FETCH_MASK | PFERR_PRESENT_MASK)) + == (PFERR_FETCH_MASK | PFERR_PRESENT_MASK)))) + return false; + /* - * #PF can be fast only if the shadow page table is present and it - * is caused by write-protect, that means we just need change the - * W bit of the spte which can be done out of mmu-lock. + * #PF can be fast if: + * 1. The shadow page table entry is not present, which could mean that + * the fault is potentially caused by access tracking (if enabled). + * 2. The shadow page table entry is present and the fault + * is caused by write-protect, that means we just need change the W + * bit of the spte which can be done out of mmu-lock. + * + * However, if access tracking is disabled we know that a non-present + * page must be a genuine page fault where we have to create a new SPTE. + * So, if access tracking is disabled, we return true only for write + * accesses to a present page. */ - if (!(error_code & PFERR_PRESENT_MASK) || - !(error_code & PFERR_WRITE_MASK)) - return false; - return true; + return shadow_acc_track_mask != 0 || + ((error_code & (PFERR_WRITE_MASK | PFERR_PRESENT_MASK)) + == (PFERR_WRITE_MASK | PFERR_PRESENT_MASK)); } /* @@ -2907,17 +3016,26 @@ static bool page_fault_can_be_fast(u32 error_code) */ static bool fast_pf_fix_direct_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, - u64 *sptep, u64 spte) + u64 *sptep, u64 old_spte, + bool remove_write_prot, bool remove_acc_track) { gfn_t gfn; + u64 new_spte = old_spte; WARN_ON(!sp->role.direct); - /* - * The gfn of direct spte is stable since it is calculated - * by sp->gfn. - */ - gfn = kvm_mmu_page_get_gfn(sp, sptep - sp->spt); + if (remove_acc_track) { + u64 saved_bits = (old_spte >> shadow_acc_track_saved_bits_shift) + & shadow_acc_track_saved_bits_mask; + + new_spte &= ~shadow_acc_track_mask; + new_spte &= ~(shadow_acc_track_saved_bits_mask << + shadow_acc_track_saved_bits_shift); + new_spte |= saved_bits; + } + + if (remove_write_prot) + new_spte |= PT_WRITABLE_MASK; /* * Theoretically we could also set dirty bit (and flush TLB) here in @@ -2931,10 +3049,17 @@ fast_pf_fix_direct_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, * * Compare with set_spte where instead shadow_dirty_mask is set. */ - if (cmpxchg64(sptep, spte, spte | PT_WRITABLE_MASK) != spte) + if (cmpxchg64(sptep, old_spte, new_spte) != old_spte) return false; - kvm_vcpu_mark_page_dirty(vcpu, gfn); + if (remove_write_prot) { + /* + * The gfn of direct spte is stable since it is + * calculated by sp->gfn. 
+ */ + gfn = kvm_mmu_page_get_gfn(sp, sptep - sp->spt); + kvm_vcpu_mark_page_dirty(vcpu, gfn); + } return true; } @@ -2965,35 +3090,55 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level, break; do { - /* - * If the mapping has been changed, let the vcpu fault on the - * same address again. - */ - if (!is_shadow_present_pte(spte)) { - fault_handled = true; - break; - } + bool remove_write_prot = false; + bool remove_acc_track; sp = page_header(__pa(iterator.sptep)); if (!is_last_spte(spte, sp->role.level)) break; /* - * Check if it is a spurious fault caused by TLB lazily flushed. + * Check whether the memory access that caused the fault would + * still cause it if it were to be performed right now. If not, + * then this is a spurious fault caused by TLB lazily flushed, + * or some other CPU has already fixed the PTE after the + * current CPU took the fault. * * Need not check the access of upper level table entries since * they are always ACC_ALL. */ - if (is_writable_pte(spte)) { - fault_handled = true; - break; + + if (error_code & PFERR_FETCH_MASK) { + if ((spte & (shadow_x_mask | shadow_nx_mask)) + == shadow_x_mask) { + fault_handled = true; + break; + } + } else if (error_code & PFERR_WRITE_MASK) { + if (is_writable_pte(spte)) { + fault_handled = true; + break; + } + + /* + * Currently, to simplify the code, write-protection can + * be removed in the fast path only if the SPTE was + * write-protected for dirty-logging. + */ + remove_write_prot = + spte_can_locklessly_be_made_writable(spte); + } else { + /* Fault was on Read access */ + if (spte & PT_PRESENT_MASK) { + fault_handled = true; + break; + } } - /* - * Currently, to simplify the code, only the spte - * write-protected by dirty-log can be fast fixed. - */ - if (!spte_can_locklessly_be_made_writable(spte)) + remove_acc_track = is_access_track_spte(spte); + + /* Verify that the fault can be handled in the fast path */ + if (!remove_acc_track && !remove_write_prot) break; /* @@ -3007,7 +3152,7 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level, * * See the comments in kvm_arch_commit_memory_region(). */ - if (sp->role.level > PT_PAGE_TABLE_LEVEL) + if (sp->role.level > PT_PAGE_TABLE_LEVEL && remove_write_prot) break; /* @@ -3016,7 +3161,9 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level, * Documentation/virtual/kvm/locking.txt to get more detail. */ fault_handled = fast_pf_fix_direct_spte(vcpu, sp, - iterator.sptep, spte); + iterator.sptep, spte, + remove_write_prot, + remove_acc_track); if (fault_handled) break; @@ -5105,6 +5252,8 @@ static void mmu_destroy_caches(void) int kvm_mmu_module_init(void) { + kvm_mmu_clear_all_pte_masks(); + pte_list_desc_cache = kmem_cache_create("pte_list_desc", sizeof(struct pte_list_desc), 0, 0, NULL); diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 6f53dedd9b96..d2fe3a51876c 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -6578,6 +6578,19 @@ static void wakeup_handler(void) spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, cpu)); } +void vmx_enable_tdp(void) +{ + kvm_mmu_set_mask_ptes(VMX_EPT_READABLE_MASK, + enable_ept_ad_bits ? VMX_EPT_ACCESS_BIT : 0ull, + enable_ept_ad_bits ? VMX_EPT_DIRTY_BIT : 0ull, + 0ull, VMX_EPT_EXECUTABLE_MASK, + cpu_has_vmx_ept_execute_only() ? 0ull : VMX_EPT_READABLE_MASK, + enable_ept_ad_bits ? 
0ull : SPTE_SPECIAL_MASK | VMX_EPT_RWX_MASK); + + ept_set_mmio_spte_mask(); + kvm_enable_tdp(); +} + static __init int hardware_setup(void) { int r = -ENOMEM, i, msr; @@ -6703,16 +6716,9 @@ static __init int hardware_setup(void) /* SELF-IPI */ vmx_disable_intercept_msr_x2apic(0x83f, MSR_TYPE_W, true); - if (enable_ept) { - kvm_mmu_set_mask_ptes(VMX_EPT_READABLE_MASK, - (enable_ept_ad_bits) ? VMX_EPT_ACCESS_BIT : 0ull, - (enable_ept_ad_bits) ? VMX_EPT_DIRTY_BIT : 0ull, - 0ull, VMX_EPT_EXECUTABLE_MASK, - cpu_has_vmx_ept_execute_only() ? - 0ull : VMX_EPT_READABLE_MASK); - ept_set_mmio_spte_mask(); - kvm_enable_tdp(); - } else + if (enable_ept) + vmx_enable_tdp(); + else kvm_disable_tdp(); update_ple_window_actual_max(); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 4aece8b0a4aa..c3ee5e29ea2a 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -6025,7 +6025,7 @@ int kvm_arch_init(void *opaque) kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK, PT_DIRTY_MASK, PT64_NX_MASK, 0, - PT_PRESENT_MASK); + PT_PRESENT_MASK, 0); kvm_timer_init(); perf_register_guest_info_callbacks(&kvm_guest_cbs); -- cgit v1.2.3 From 63dbe14d39b0505e3260bed92e5f4905f49c09d9 Mon Sep 17 00:00:00 2001 From: Junaid Shahid Date: Tue, 6 Dec 2016 16:46:17 -0800 Subject: kvm: x86: mmu: Update documentation for fast page fault mechanism Add a brief description of the lockless access tracking mechanism to the documentation of fast page faults in locking.txt. Signed-off-by: Junaid Shahid Signed-off-by: Paolo Bonzini --- Documentation/virtual/kvm/locking.txt | 31 +++++++++++++++++++++++++++---- 1 file changed, 27 insertions(+), 4 deletions(-) diff --git a/Documentation/virtual/kvm/locking.txt b/Documentation/virtual/kvm/locking.txt index fd013bf4115b..1bb8bcaf8497 100644 --- a/Documentation/virtual/kvm/locking.txt +++ b/Documentation/virtual/kvm/locking.txt @@ -26,9 +26,16 @@ sections. Fast page fault: Fast page fault is the fast path which fixes the guest page fault out of -the mmu-lock on x86. Currently, the page fault can be fast only if the -shadow page table is present and it is caused by write-protect, that means -we just need change the W bit of the spte. +the mmu-lock on x86. Currently, the page fault can be fast in one of the +following two cases: + +1. Access Tracking: The SPTE is not present, but it is marked for access +tracking i.e. the SPTE_SPECIAL_MASK is set. That means we need to +restore the saved R/X bits. This is described in more detail later below. + +2. Write-Protection: The SPTE is present and the fault is +caused by write-protect. That means we just need to change the W bit of the +spte. What we use to avoid all the race is the SPTE_HOST_WRITEABLE bit and SPTE_MMU_WRITEABLE bit on the spte: @@ -38,7 +45,8 @@ SPTE_MMU_WRITEABLE bit on the spte: page write-protection. On fast page fault path, we will use cmpxchg to atomically set the spte W -bit if spte.SPTE_HOST_WRITEABLE = 1 and spte.SPTE_WRITE_PROTECT = 1, this +bit if spte.SPTE_HOST_WRITEABLE = 1 and spte.SPTE_WRITE_PROTECT = 1, or +restore the saved R/X bits if VMX_EPT_TRACK_ACCESS mask is set, or both. This is safe because whenever changing these bits can be detected by cmpxchg. But we need carefully check these cases: @@ -142,6 +150,21 @@ Since the spte is "volatile" if it can be updated out of mmu-lock, we always atomically update the spte, the race caused by fast page fault can be avoided, See the comments in spte_has_volatile_bits() and mmu_spte_update(). 
+Lockless Access Tracking: + +This is used for Intel CPUs that are using EPT but do not support the EPT A/D +bits. In this case, when the KVM MMU notifier is called to track accesses to a +page (via kvm_mmu_notifier_clear_flush_young), it marks the PTE as not-present +by clearing the RWX bits in the PTE and storing the original R & X bits in +some unused/ignored bits. In addition, the SPTE_SPECIAL_MASK is also set on the +PTE (using the ignored bit 62). When the VM tries to access the page later on, +a fault is generated and the fast page fault mechanism described above is used +to atomically restore the PTE to a Present state. The W bit is not saved when +the PTE is marked for access tracking and during restoration to the Present +state, the W bit is set depending on whether or not it was a write access. If +it wasn't, then the W bit will remain clear until a write access happens, at +which time it will be set using the Dirty tracking mechanism described above. + 3. Reference ------------ -- cgit v1.2.3 From f98a3efb284a7950745d6c95be489193e6d4c657 Mon Sep 17 00:00:00 2001 From: Radim Krčmář Date: Thu, 15 Dec 2016 18:06:45 +0100 Subject: KVM: x86: use delivery to self in hyperv synic MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Interrupt to self can be sent without knowing the APIC ID. Reviewed-by: David Hildenbrand Signed-off-by: Radim Krčmář Signed-off-by: Paolo Bonzini --- arch/x86/kvm/hyperv.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index 1572c35b4f1a..08b27e0c7b71 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -305,13 +305,13 @@ static int synic_set_irq(struct kvm_vcpu_hv_synic *synic, u32 sint) return -ENOENT; memset(&irq, 0, sizeof(irq)); - irq.dest_id = kvm_apic_id(vcpu->arch.apic); + irq.shorthand = APIC_DEST_SELF; irq.dest_mode = APIC_DEST_PHYSICAL; irq.delivery_mode = APIC_DM_FIXED; irq.vector = vector; irq.level = 1; - ret = kvm_irq_delivery_to_apic(vcpu->kvm, NULL, &irq, NULL); + ret = kvm_irq_delivery_to_apic(vcpu->kvm, vcpu->arch.apic, &irq, NULL); trace_kvm_hv_synic_set_irq(vcpu->vcpu_id, sint, irq.vector, ret); return ret; } -- cgit v1.2.3 From 6e50043912d9c9c119e3c9c5378869d019df70a9 Mon Sep 17 00:00:00 2001 From: Radim Krčmář Date: Thu, 15 Dec 2016 18:06:46 +0100 Subject: KVM: x86: replace kvm_apic_id with kvm_{x,x2}apic_id MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There were three calls sites: - recalculate_apic_map and kvm_apic_match_physical_addr, where it would only complicate implementation of x2APIC hotplug; - in apic_debug, where it was still somewhat preserved, but keeping the old function just for apic_debug was not worth it Reviewed-by: David Hildenbrand Signed-off-by: Radim Krčmář Signed-off-by: Paolo Bonzini --- arch/x86/kvm/lapic.c | 31 ++++++++++++++++++++++--------- arch/x86/kvm/lapic.h | 11 ----------- 2 files changed, 22 insertions(+), 20 deletions(-) diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 5fe290c1b7d8..7c142f0fe9fd 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -115,6 +115,16 @@ static inline int apic_enabled(struct kvm_lapic *apic) (LVT_MASK | APIC_MODE_MASK | APIC_INPUT_POLARITY | \ APIC_LVT_REMOTE_IRR | APIC_LVT_LEVEL_TRIGGER) +static inline u8 kvm_xapic_id(struct kvm_lapic *apic) +{ + return kvm_lapic_get_reg(apic, APIC_ID) >> 24; +} + +static inline u32 kvm_x2apic_id(struct kvm_lapic *apic) +{ + return apic->vcpu->vcpu_id; 
+} + static inline bool kvm_apic_map_get_logical_dest(struct kvm_apic_map *map, u32 dest_id, struct kvm_lapic ***cluster, u16 *mask) { switch (map->mode) { @@ -159,13 +169,13 @@ static void recalculate_apic_map(struct kvm *kvm) struct kvm_apic_map *new, *old = NULL; struct kvm_vcpu *vcpu; int i; - u32 max_id = 255; + u32 max_id = 255; /* enough space for any xAPIC ID */ mutex_lock(&kvm->arch.apic_map_lock); kvm_for_each_vcpu(i, vcpu, kvm) if (kvm_apic_present(vcpu)) - max_id = max(max_id, kvm_apic_id(vcpu->arch.apic)); + max_id = max(max_id, kvm_x2apic_id(vcpu->arch.apic)); new = kvm_kvzalloc(sizeof(struct kvm_apic_map) + sizeof(struct kvm_lapic *) * ((u64)max_id + 1)); @@ -184,12 +194,13 @@ static void recalculate_apic_map(struct kvm *kvm) if (!kvm_apic_present(vcpu)) continue; - aid = kvm_apic_id(apic); - ldr = kvm_lapic_get_reg(apic, APIC_LDR); - + aid = apic_x2apic_mode(apic) ? kvm_x2apic_id(apic) + : kvm_xapic_id(apic); if (aid <= new->max_apic_id) new->phys_map[aid] = apic; + ldr = kvm_lapic_get_reg(apic, APIC_LDR); + if (apic_x2apic_mode(apic)) { new->mode |= KVM_APIC_MODE_X2APIC; } else if (ldr) { @@ -250,6 +261,8 @@ static inline void kvm_apic_set_x2apic_id(struct kvm_lapic *apic, u32 id) { u32 ldr = ((id >> 4) << 16) | (1 << (id & 0xf)); + WARN_ON_ONCE(id != apic->vcpu->vcpu_id); + kvm_lapic_set_reg(apic, APIC_ID, id); kvm_lapic_set_reg(apic, APIC_LDR, ldr); recalculate_apic_map(apic->vcpu->kvm); @@ -591,9 +604,9 @@ static bool kvm_apic_match_physical_addr(struct kvm_lapic *apic, u32 mda) return true; if (apic_x2apic_mode(apic)) - return mda == kvm_apic_id(apic); + return mda == kvm_x2apic_id(apic); - return mda == SET_APIC_DEST_FIELD(kvm_apic_id(apic)); + return mda == SET_APIC_DEST_FIELD(kvm_xapic_id(apic)); } static bool kvm_apic_match_logical_addr(struct kvm_lapic *apic, u32 mda) @@ -1907,9 +1920,9 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event) vcpu->arch.apic_arb_prio = 0; vcpu->arch.apic_attention = 0; - apic_debug("%s: vcpu=%p, id=%d, base_msr=" + apic_debug("%s: vcpu=%p, id=0x%x, base_msr=" "0x%016" PRIx64 ", base_address=0x%0lx.\n", __func__, - vcpu, kvm_apic_id(apic), + vcpu, kvm_lapic_get_reg(apic, APIC_ID), vcpu->arch.apic_base, apic->base_address); } diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index e0c80233b3e1..cb16e6fd2330 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -202,17 +202,6 @@ static inline int kvm_lapic_latched_init(struct kvm_vcpu *vcpu) return lapic_in_kernel(vcpu) && test_bit(KVM_APIC_INIT, &vcpu->arch.apic->pending_events); } -static inline u32 kvm_apic_id(struct kvm_lapic *apic) -{ - /* To avoid a race between apic_base and following APIC_ID update when - * switching to x2apic_mode, the x2apic mode returns initial x2apic id. - */ - if (apic_x2apic_mode(apic)) - return apic->vcpu->vcpu_id; - - return kvm_lapic_get_reg(apic, APIC_ID) >> 24; -} - bool kvm_apic_pending_eoi(struct kvm_vcpu *vcpu, int vector); void wait_lapic_expire(struct kvm_vcpu *vcpu); -- cgit v1.2.3 From b4535b58ae0df8b7cf0fe92a0c23aa3cf862e3cc Mon Sep 17 00:00:00 2001 From: Radim Krčmář Date: Thu, 15 Dec 2016 18:06:47 +0100 Subject: KVM: x86: make interrupt delivery fast and slow path behave the same MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Slow path tried to prevent IPIs from x2APIC VCPUs from being delivered to xAPIC VCPUs and vice-versa. Make slow path behave like fast path, which never distinguished that. 
Reviewed-by: David Hildenbrand Signed-off-by: Radim Krčmář Signed-off-by: Paolo Bonzini --- arch/x86/kvm/lapic.c | 20 ++++++++------------ 1 file changed, 8 insertions(+), 12 deletions(-) diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 7c142f0fe9fd..3ebef53d20a0 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -592,10 +592,8 @@ static void apic_set_tpr(struct kvm_lapic *apic, u32 tpr) static bool kvm_apic_broadcast(struct kvm_lapic *apic, u32 mda) { - if (apic_x2apic_mode(apic)) - return mda == X2APIC_BROADCAST; - - return GET_APIC_DEST_FIELD(mda) == APIC_BROADCAST; + return mda == (apic_x2apic_mode(apic) ? + X2APIC_BROADCAST : APIC_BROADCAST); } static bool kvm_apic_match_physical_addr(struct kvm_lapic *apic, u32 mda) @@ -606,7 +604,7 @@ static bool kvm_apic_match_physical_addr(struct kvm_lapic *apic, u32 mda) if (apic_x2apic_mode(apic)) return mda == kvm_x2apic_id(apic); - return mda == SET_APIC_DEST_FIELD(kvm_xapic_id(apic)); + return mda == kvm_xapic_id(apic); } static bool kvm_apic_match_logical_addr(struct kvm_lapic *apic, u32 mda) @@ -623,7 +621,6 @@ static bool kvm_apic_match_logical_addr(struct kvm_lapic *apic, u32 mda) && (logical_id & mda & 0xffff) != 0; logical_id = GET_APIC_LOGICAL_ID(logical_id); - mda = GET_APIC_DEST_FIELD(mda); switch (kvm_lapic_get_reg(apic, APIC_DFR)) { case APIC_DFR_FLAT: @@ -640,9 +637,9 @@ static bool kvm_apic_match_logical_addr(struct kvm_lapic *apic, u32 mda) /* The KVM local APIC implementation has two quirks: * - * - the xAPIC MDA stores the destination at bits 24-31, while this - * is not true of struct kvm_lapic_irq's dest_id field. This is - * just a quirk in the API and is not problematic. + * - Real hardware delivers interrupts destined to x2APIC ID > 0xff to LAPICs + * in xAPIC mode if the "destination & 0xff" matches its xAPIC ID. + * KVM doesn't do that aliasing. * * - in-kernel IOAPIC messages have to be delivered directly to * x2APIC, because the kernel does not support interrupt remapping. @@ -658,13 +655,12 @@ static u32 kvm_apic_mda(struct kvm_vcpu *vcpu, unsigned int dest_id, struct kvm_lapic *source, struct kvm_lapic *target) { bool ipi = source != NULL; - bool x2apic_mda = apic_x2apic_mode(ipi ? source : target); if (!vcpu->kvm->arch.x2apic_broadcast_quirk_disabled && - !ipi && dest_id == APIC_BROADCAST && x2apic_mda) + !ipi && dest_id == APIC_BROADCAST && apic_x2apic_mode(target)) return X2APIC_BROADCAST; - return x2apic_mda ? dest_id : SET_APIC_DEST_FIELD(dest_id); + return dest_id; } bool kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source, -- cgit v1.2.3 From 5bd5db385b3e13c702365574c0b7350c6ea45e84 Mon Sep 17 00:00:00 2001 From: Radim Krčmář Date: Thu, 15 Dec 2016 18:06:48 +0100 Subject: KVM: x86: allow hotplug of VCPU with APIC ID over 0xff MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit LAPIC after reset is in xAPIC mode, which poses a problem for hotplug of VCPUs with high APIC ID, because reset VCPU is waiting for INIT/SIPI, but there is no way to uniquely address it using xAPIC. From many possible options, we chose the one that also works on real hardware: accepting interrupts addressed to LAPIC's x2APIC ID even in xAPIC mode. KVM intentionally differs from real hardware, because real hardware (Knights Landing) does just "x2apic_id & 0xff" to decide whether to accept the interrupt in xAPIC mode and it can deliver one interrupt to more than one physical destination, e.g. 0x123 to 0x123 and 0x23. 
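As an illustration only (not part of the patch): the sketch below is a stand-alone user-space model in plain C that restates the two matching rules described above. struct model_lapic and both helpers are invented for this example; they are not KVM data structures.

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    /* Toy model only -- not the KVM structures. */
    struct model_lapic {
            uint32_t x2apic_id;     /* equals vcpu_id, read-only */
            uint8_t  xapic_id;      /* xAPIC ID register, wraps for IDs > 0xff */
            bool     x2apic_mode;
    };

    /* The rule this patch adopts, as described above. */
    static bool kvm_style_match(const struct model_lapic *apic, uint32_t mda)
    {
            if (apic->x2apic_mode)
                    return mda == apic->x2apic_id;
            /* Hotplug hack: accept the full x2APIC ID even in xAPIC mode. */
            if (apic->x2apic_id > 0xff && mda == apic->x2apic_id)
                    return true;
            return mda == apic->xapic_id;
    }

    /* What the message says Knights Landing does in xAPIC mode. */
    static bool knl_style_match(const struct model_lapic *apic, uint32_t mda)
    {
            if (apic->x2apic_mode)
                    return mda == apic->x2apic_id;
            return (mda & 0xff) == (apic->x2apic_id & 0xff);
    }

    int main(void)
    {
            struct model_lapic hot = { .x2apic_id = 0x123, .xapic_id = 0x23 };
            struct model_lapic low = { .x2apic_id = 0x23,  .xapic_id = 0x23 };

            /* 0x123 reaches the hotplugged VCPU under both rules, but only
             * the KL rule also aliases it onto the VCPU with ID 0x23. */
            printf("hot: kvm=%d knl=%d   low: kvm=%d knl=%d\n",
                   kvm_style_match(&hot, 0x123), knl_style_match(&hot, 0x123),
                   kvm_style_match(&low, 0x123), knl_style_match(&low, 0x123));
            return 0;
    }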
Fixes: 682f732ecf73 ("KVM: x86: bump MAX_VCPUS to 288") Reviewed-by: David Hildenbrand Signed-off-by: Radim Krčmář Signed-off-by: Paolo Bonzini --- arch/x86/kvm/lapic.c | 30 +++++++++++++++++++++++++----- 1 file changed, 25 insertions(+), 5 deletions(-) diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 3ebef53d20a0..7e9ac4606279 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -189,15 +189,26 @@ static void recalculate_apic_map(struct kvm *kvm) struct kvm_lapic *apic = vcpu->arch.apic; struct kvm_lapic **cluster; u16 mask; - u32 ldr, aid; + u32 ldr; + u8 xapic_id; + u32 x2apic_id; if (!kvm_apic_present(vcpu)) continue; - aid = apic_x2apic_mode(apic) ? kvm_x2apic_id(apic) - : kvm_xapic_id(apic); - if (aid <= new->max_apic_id) - new->phys_map[aid] = apic; + xapic_id = kvm_xapic_id(apic); + x2apic_id = kvm_x2apic_id(apic); + + /* Hotplug hack: see kvm_apic_match_physical_addr(), ... */ + if ((apic_x2apic_mode(apic) || x2apic_id > 0xff) && + x2apic_id <= new->max_apic_id) + new->phys_map[x2apic_id] = apic; + /* + * ... xAPIC ID of VCPUs with APIC ID > 0xff will wrap-around, + * prevent them from masking VCPUs with APIC ID <= 0xff. + */ + if (!apic_x2apic_mode(apic) && !new->phys_map[xapic_id]) + new->phys_map[xapic_id] = apic; ldr = kvm_lapic_get_reg(apic, APIC_LDR); @@ -604,6 +615,15 @@ static bool kvm_apic_match_physical_addr(struct kvm_lapic *apic, u32 mda) if (apic_x2apic_mode(apic)) return mda == kvm_x2apic_id(apic); + /* + * Hotplug hack: Make LAPIC in xAPIC mode also accept interrupts as if + * it were in x2APIC mode. Hotplugged VCPUs start in xAPIC mode and + * this allows unique addressing of VCPUs with APIC ID over 0xff. + * The 0xff condition is needed because writeable xAPIC ID. + */ + if (kvm_x2apic_id(apic) > 0xff && mda == kvm_x2apic_id(apic)) + return true; + return mda == kvm_xapic_id(apic); } -- cgit v1.2.3 From 0f89b207b04a1a399e19d35293658e3a571da3d7 Mon Sep 17 00:00:00 2001 From: Tom Lendacky Date: Wed, 14 Dec 2016 14:59:23 -0500 Subject: kvm: svm: Use the hardware provided GPA instead of page walk When a guest causes a NPF which requires emulation, KVM sometimes walks the guest page tables to translate the GVA to a GPA. This is unnecessary most of the time on AMD hardware since the hardware provides the GPA in EXITINFO2. The only exception cases involve string operations involving rep or operations that use two memory locations. With rep, the GPA will only be the value of the initial NPF and with dual memory locations we won't know which memory address was translated into EXITINFO2. 
Signed-off-by: Tom Lendacky Reviewed-by: Borislav Petkov Signed-off-by: Brijesh Singh Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_emulate.h | 1 + arch/x86/include/asm/kvm_host.h | 3 +++ arch/x86/kvm/emulate.c | 20 +++++++++++++---- arch/x86/kvm/svm.c | 2 ++ arch/x86/kvm/x86.c | 45 +++++++++++++++++++++++++++++--------- 5 files changed, 57 insertions(+), 14 deletions(-) diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h index e9cd7befcb76..3e8c287090e4 100644 --- a/arch/x86/include/asm/kvm_emulate.h +++ b/arch/x86/include/asm/kvm_emulate.h @@ -441,5 +441,6 @@ int emulator_task_switch(struct x86_emulate_ctxt *ctxt, int emulate_int_real(struct x86_emulate_ctxt *ctxt, int irq); void emulator_invalidate_register_cache(struct x86_emulate_ctxt *ctxt); void emulator_writeback_register_cache(struct x86_emulate_ctxt *ctxt); +bool emulator_can_use_gpa(struct x86_emulate_ctxt *ctxt); #endif /* _ASM_X86_KVM_X86_EMULATE_H */ diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 99a71d90b6ae..0419e114f27b 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -675,6 +675,9 @@ struct kvm_vcpu_arch { int pending_ioapic_eoi; int pending_external_vector; + + /* GPA available (AMD only) */ + bool gpa_available; }; struct kvm_lpage_info { diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 56628a44668b..2b8349a2b14b 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -173,6 +173,7 @@ #define NearBranch ((u64)1 << 52) /* Near branches */ #define No16 ((u64)1 << 53) /* No 16 bit operand */ #define IncSP ((u64)1 << 54) /* SP is incremented before ModRM calc */ +#define TwoMemOp ((u64)1 << 55) /* Instruction has two memory operand */ #define DstXacc (DstAccLo | SrcAccHi | SrcWrite) @@ -4256,7 +4257,7 @@ static const struct opcode group1[] = { }; static const struct opcode group1A[] = { - I(DstMem | SrcNone | Mov | Stack | IncSP, em_pop), N, N, N, N, N, N, N, + I(DstMem | SrcNone | Mov | Stack | IncSP | TwoMemOp, em_pop), N, N, N, N, N, N, N, }; static const struct opcode group2[] = { @@ -4294,7 +4295,7 @@ static const struct opcode group5[] = { I(SrcMemFAddr | ImplicitOps, em_call_far), I(SrcMem | NearBranch, em_jmp_abs), I(SrcMemFAddr | ImplicitOps, em_jmp_far), - I(SrcMem | Stack, em_push), D(Undefined), + I(SrcMem | Stack | TwoMemOp, em_push), D(Undefined), }; static const struct opcode group6[] = { @@ -4514,8 +4515,8 @@ static const struct opcode opcode_table[256] = { /* 0xA0 - 0xA7 */ I2bv(DstAcc | SrcMem | Mov | MemAbs, em_mov), I2bv(DstMem | SrcAcc | Mov | MemAbs | PageTable, em_mov), - I2bv(SrcSI | DstDI | Mov | String, em_mov), - F2bv(SrcSI | DstDI | String | NoWrite, em_cmp_r), + I2bv(SrcSI | DstDI | Mov | String | TwoMemOp, em_mov), + F2bv(SrcSI | DstDI | String | NoWrite | TwoMemOp, em_cmp_r), /* 0xA8 - 0xAF */ F2bv(DstAcc | SrcImm | NoWrite, em_test), I2bv(SrcAcc | DstDI | Mov | String, em_mov), @@ -5629,3 +5630,14 @@ void emulator_writeback_register_cache(struct x86_emulate_ctxt *ctxt) { writeback_registers(ctxt); } + +bool emulator_can_use_gpa(struct x86_emulate_ctxt *ctxt) +{ + if (ctxt->rep_prefix && (ctxt->d & String)) + return false; + + if (ctxt->d & TwoMemOp) + return false; + + return true; +} diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 08a4d3ab3455..d0414f054bdf 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -4182,6 +4182,8 @@ static int handle_exit(struct kvm_vcpu *vcpu) trace_kvm_exit(exit_code, vcpu, KVM_ISA_SVM); + 
vcpu->arch.gpa_available = (exit_code == SVM_EXIT_NPF); + if (!is_cr_intercept(svm, INTERCEPT_CR0_WRITE)) vcpu->arch.cr0 = svm->vmcb->save.cr0; if (npt_enabled) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index c3ee5e29ea2a..edff19d1df97 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4462,6 +4462,21 @@ out: } EXPORT_SYMBOL_GPL(kvm_write_guest_virt_system); +static int vcpu_is_mmio_gpa(struct kvm_vcpu *vcpu, unsigned long gva, + gpa_t gpa, bool write) +{ + /* For APIC access vmexit */ + if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE) + return 1; + + if (vcpu_match_mmio_gpa(vcpu, gpa)) { + trace_vcpu_match_mmio(gva, gpa, write, true); + return 1; + } + + return 0; +} + static int vcpu_mmio_gva_to_gpa(struct kvm_vcpu *vcpu, unsigned long gva, gpa_t *gpa, struct x86_exception *exception, bool write) @@ -4488,16 +4503,7 @@ static int vcpu_mmio_gva_to_gpa(struct kvm_vcpu *vcpu, unsigned long gva, if (*gpa == UNMAPPED_GVA) return -1; - /* For APIC access vmexit */ - if ((*gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE) - return 1; - - if (vcpu_match_mmio_gpa(vcpu, *gpa)) { - trace_vcpu_match_mmio(gva, *gpa, write, true); - return 1; - } - - return 0; + return vcpu_is_mmio_gpa(vcpu, gva, *gpa, write); } int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa, @@ -4594,6 +4600,22 @@ static int emulator_read_write_onepage(unsigned long addr, void *val, int handled, ret; bool write = ops->write; struct kvm_mmio_fragment *frag; + struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt; + + /* + * If the exit was due to a NPF we may already have a GPA. + * If the GPA is present, use it to avoid the GVA to GPA table walk. + * Note, this cannot be used on string operations since string + * operation using rep will only have the initial GPA from the NPF + * occurred. + */ + if (vcpu->arch.gpa_available && + emulator_can_use_gpa(ctxt) && + vcpu_is_mmio_gpa(vcpu, addr, exception->address, write) && + (addr & ~PAGE_MASK) == (exception->address & ~PAGE_MASK)) { + gpa = exception->address; + goto mmio; + } ret = vcpu_mmio_gva_to_gpa(vcpu, addr, &gpa, exception, write); @@ -5610,6 +5632,9 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, } restart: + /* Save the faulting GPA (cr2) in the address field */ + ctxt->exception.address = cr2; + r = x86_emulate_insn(ctxt); if (r == EMULATION_INTERCEPTED) -- cgit v1.2.3 From 0f1e261ead16ce09169bf2d223d4c8803576f85e Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Sat, 17 Dec 2016 16:05:19 +0100 Subject: KVM: x86: add VCPU stat for KVM_REQ_EVENT processing This statistic can be useful to estimate the cost of an IRQ injection scenario, by comparing it with irq_injections. For example the stat shows that sti;hlt triggers more KVM_REQ_EVENT than sti;nop. 
Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/x86.c | 2 ++ 2 files changed, 3 insertions(+) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 0419e114f27b..417502cf42b6 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -861,6 +861,7 @@ struct kvm_vcpu_stat { u64 hypercalls; u64 irq_injections; u64 nmi_injections; + u64 req_event; }; struct x86_instruction_info; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index edff19d1df97..b02af6285887 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -180,6 +180,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { { "insn_emulation_fail", VCPU_STAT(insn_emulation_fail) }, { "irq_injections", VCPU_STAT(irq_injections) }, { "nmi_injections", VCPU_STAT(nmi_injections) }, + { "req_event", VCPU_STAT(req_event) }, { "mmu_shadow_zapped", VM_STAT(mmu_shadow_zapped) }, { "mmu_pte_write", VM_STAT(mmu_pte_write) }, { "mmu_pte_updated", VM_STAT(mmu_pte_updated) }, @@ -6756,6 +6757,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) } if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) { + ++vcpu->stat.req_event; kvm_apic_accept_events(vcpu); if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) { r = 1; -- cgit v1.2.3 From eb90f3417a0cc4880e979ccc84e41890d410ea5b Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Sun, 18 Dec 2016 14:02:21 +0100 Subject: KVM: vmx: speed up TPR below threshold vmexits Since we're already in VCPU context, all we have to do here is recompute the PPR value. That will in turn generate a KVM_REQ_EVENT if necessary. Reviewed-by: Roman Kagan Signed-off-by: Paolo Bonzini --- arch/x86/kvm/lapic.c | 6 ++++++ arch/x86/kvm/lapic.h | 1 + arch/x86/kvm/vmx.c | 2 +- 3 files changed, 8 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 7e9ac4606279..6b1d3a76c1d0 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -595,6 +595,12 @@ static void apic_update_ppr(struct kvm_lapic *apic) } } +void kvm_apic_update_ppr(struct kvm_vcpu *vcpu) +{ + apic_update_ppr(vcpu->arch.apic); +} +EXPORT_SYMBOL_GPL(kvm_apic_update_ppr); + static void apic_set_tpr(struct kvm_lapic *apic, u32 tpr) { kvm_lapic_set_reg(apic, APIC_TASKPRI, tpr); diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index cb16e6fd2330..5b5b1ba644cb 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -73,6 +73,7 @@ bool kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source, void __kvm_apic_update_irr(u32 *pir, void *regs); void kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir); +void kvm_apic_update_ppr(struct kvm_vcpu *vcpu); int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq, struct dest_map *dest_map); int kvm_apic_local_deliver(struct kvm_lapic *apic, int lvt_type); diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index d2fe3a51876c..94fda2010f5f 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -6152,7 +6152,7 @@ static int handle_wrmsr(struct kvm_vcpu *vcpu) static int handle_tpr_below_threshold(struct kvm_vcpu *vcpu) { - kvm_make_request(KVM_REQ_EVENT, vcpu); + kvm_apic_update_ppr(vcpu); return 1; } -- cgit v1.2.3 From b3c045d33218fe291b04d30e24b6eab0431987e6 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Sun, 18 Dec 2016 21:47:54 +0100 Subject: KVM: lapic: remove unnecessary KVM_REQ_EVENT on PPR update PPR needs to be updated whenever on every IRR read because we may have missed TPR writes that _increased_ PPR. 
However, these writes need not generate KVM_REQ_EVENT, because either KVM_REQ_EVENT has been set already in __apic_accept_irq, or we are going to process the interrupt right away. Reviewed-by: Roman Kagan Signed-off-by: Paolo Bonzini --- arch/x86/kvm/lapic.c | 36 ++++++++++++++++++++++++------------ 1 file changed, 24 insertions(+), 12 deletions(-) diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 6b1d3a76c1d0..a878e33119a3 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -570,7 +570,15 @@ static void pv_eoi_clr_pending(struct kvm_vcpu *vcpu) __clear_bit(KVM_APIC_PV_EOI_PENDING, &vcpu->arch.apic_attention); } -static void apic_update_ppr(struct kvm_lapic *apic) +static int apic_has_interrupt_for_ppr(struct kvm_lapic *apic, u32 ppr) +{ + int highest_irr = apic_find_highest_irr(apic); + if (highest_irr == -1 || (highest_irr & 0xF0) <= ppr) + return -1; + return highest_irr; +} + +static bool __apic_update_ppr(struct kvm_lapic *apic, u32 *new_ppr) { u32 tpr, isrv, ppr, old_ppr; int isr; @@ -588,11 +596,19 @@ static void apic_update_ppr(struct kvm_lapic *apic) apic_debug("vlapic %p, ppr 0x%x, isr 0x%x, isrv 0x%x", apic, ppr, isr, isrv); - if (old_ppr != ppr) { + *new_ppr = ppr; + if (old_ppr != ppr) kvm_lapic_set_reg(apic, APIC_PROCPRI, ppr); - if (ppr < old_ppr) - kvm_make_request(KVM_REQ_EVENT, apic->vcpu); - } + + return ppr < old_ppr; +} + +static void apic_update_ppr(struct kvm_lapic *apic) +{ + u32 ppr; + + if (__apic_update_ppr(apic, &ppr)) + kvm_make_request(KVM_REQ_EVENT, apic->vcpu); } void kvm_apic_update_ppr(struct kvm_vcpu *vcpu) @@ -2056,17 +2072,13 @@ nomem: int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu) { struct kvm_lapic *apic = vcpu->arch.apic; - int highest_irr; + u32 ppr; if (!apic_enabled(apic)) return -1; - apic_update_ppr(apic); - highest_irr = apic_find_highest_irr(apic); - if ((highest_irr == -1) || - ((highest_irr & 0xF0) <= kvm_lapic_get_reg(apic, APIC_PROCPRI))) - return -1; - return highest_irr; + __apic_update_ppr(apic, &ppr); + return apic_has_interrupt_for_ppr(apic, ppr); } int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu) -- cgit v1.2.3 From 26fbbee5815e9352187ac18f0aa53534f62567ff Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Sun, 18 Dec 2016 13:54:58 +0100 Subject: KVM: lapic: do not set KVM_REQ_EVENT unnecessarily on PPR update On PPR update, we set KVM_REQ_EVENT unconditionally anytime PPR is lowered. But we can take into account IRR here already. Reviewed-by: Roman Kagan Signed-off-by: Paolo Bonzini --- arch/x86/kvm/lapic.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index a878e33119a3..457fb206647d 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -607,7 +607,8 @@ static void apic_update_ppr(struct kvm_lapic *apic) { u32 ppr; - if (__apic_update_ppr(apic, &ppr)) + if (__apic_update_ppr(apic, &ppr) && + apic_has_interrupt_for_ppr(apic, ppr) != -1) kvm_make_request(KVM_REQ_EVENT, apic->vcpu); } -- cgit v1.2.3 From 4d82d12b39132e820b9ac4aa058ccc733db98917 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Sun, 18 Dec 2016 21:43:41 +0100 Subject: KVM: lapic: do not scan IRR when delivering an interrupt On interrupt delivery the PPR can only grow (except for auto-EOI), so it is impossible that non-auto-EOI interrupt delivery results in KVM_REQ_EVENT. We can therefore use __apic_update_ppr. 
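A short worked example of the "PPR can only grow" argument, using toy values and a simplified PPR rule that only compares priority classes (this is not the kernel's exact computation):

    #include <assert.h>
    #include <stdint.h>

    /* Simplified xAPIC rule: PPR is TPR when its class is at least the
     * class of the highest in-service vector, else that class. */
    static uint8_t model_ppr(uint8_t tpr, uint8_t isrv)
    {
            return ((tpr & 0xf0) >= (isrv & 0xf0)) ? tpr : (isrv & 0xf0);
    }

    int main(void)
    {
            uint8_t tpr = 0x20, old_isrv = 0x35, vec = 0x51;
            uint8_t old_ppr = model_ppr(tpr, old_isrv);

            /* The vector was deliverable, so its class exceeds the old PPR. */
            assert((vec & 0xf0) > old_ppr);

            /* Once it is in ISR it is the new highest in-service vector,
             * and the recomputed PPR cannot be lower than before. */
            assert(model_ppr(tpr, vec) >= old_ppr);
            return 0;
    }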
Signed-off-by: Paolo Bonzini --- arch/x86/kvm/lapic.c | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 457fb206647d..10a745faa659 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -2115,6 +2115,7 @@ int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu) { int vector = kvm_apic_has_interrupt(vcpu); struct kvm_lapic *apic = vcpu->arch.apic; + u32 ppr; if (vector == -1) return -1; @@ -2126,13 +2127,23 @@ int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu) * because the process would deliver it through the IDT. */ - apic_set_isr(vector, apic); - apic_update_ppr(apic); apic_clear_irr(vector, apic); - if (test_bit(vector, vcpu_to_synic(vcpu)->auto_eoi_bitmap)) { - apic_clear_isr(vector, apic); + /* + * For auto-EOI interrupts, there might be another pending + * interrupt above PPR, so check whether to raise another + * KVM_REQ_EVENT. + */ apic_update_ppr(apic); + } else { + /* + * For normal interrupts, PPR has been raised and there cannot + * be a higher-priority pending interrupt---except if there was + * a concurrent interrupt injection, but that would have + * triggered KVM_REQ_EVENT already. + */ + apic_set_isr(vector, apic); + __apic_update_ppr(apic, &ppr); } return vector; -- cgit v1.2.3 From 21e7fbe7db2a983c046a05f12419d88c554a0f5a Mon Sep 17 00:00:00 2001 From: Jim Mattson Date: Thu, 22 Dec 2016 15:49:55 -0800 Subject: kvm: nVMX: Reorder error checks for emulated VMXON MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Checks on the operand to VMXON are performed after the check for legacy mode operation and the #GP checks, according to the pseudo-code in Intel's SDM. Signed-off-by: Jim Mattson Reviewed-by: David Hildenbrand Signed-off-by: Radim Krčmář --- arch/x86/kvm/vmx.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 94fda2010f5f..4e691035a32d 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -7180,9 +7180,6 @@ static int handle_vmon(struct kvm_vcpu *vcpu) return 1; } - if (nested_vmx_check_vmptr(vcpu, EXIT_REASON_VMON, NULL)) - return 1; - if (vmx->nested.vmxon) { nested_vmx_failValid(vcpu, VMXERR_VMXON_IN_VMX_ROOT_OPERATION); return kvm_skip_emulated_instruction(vcpu); @@ -7194,6 +7191,9 @@ static int handle_vmon(struct kvm_vcpu *vcpu) return 1; } + if (nested_vmx_check_vmptr(vcpu, EXIT_REASON_VMON, NULL)) + return 1; + if (cpu_has_vmx_msr_bitmap()) { vmx->nested.msr_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL); -- cgit v1.2.3 From a17f32270af1e1054bbc8858b0f27226a2c859ba Mon Sep 17 00:00:00 2001 From: Piotr Luc Date: Tue, 10 Jan 2017 18:34:03 +0100 Subject: kvm: x86: Expose Intel VPOPCNTDQ feature to guest MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Vector population count instructions for dwords and qwords are to be used in future Intel Xeon & Xeon Phi processors. The bit 14 of CPUID[level:0x07, ECX] indicates that the new instructions are supported by a processor. The spec can be found in the Intel Software Developer Manual (SDM) or in the Instruction Set Extensions Programming Reference (ISE). Signed-off-by: Piotr Luc Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Thomas Gleixner Cc: Ingo Molnar Cc: H. 
Peter Anvin Cc: x86@kernel.org Cc: kvm@vger.kernel.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Radim Krčmář --- arch/x86/kvm/cpuid.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index e85f6bd7b9d5..09c2ac741567 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -383,7 +383,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, /* cpuid 7.0.ecx*/ const u32 kvm_cpuid_7_0_ecx_x86_features = - F(AVX512VBMI) | F(PKU) | 0 /*OSPKE*/; + F(AVX512VBMI) | F(PKU) | 0 /*OSPKE*/ | F(AVX512_VPOPCNTDQ); /* cpuid 7.0.edx*/ const u32 kvm_cpuid_7_0_edx_x86_features = -- cgit v1.2.3 From 0b4c208d443ba2af82b4c70f99ca8df31e9a0020 Mon Sep 17 00:00:00 2001 From: Jim Mattson Date: Tue, 20 Dec 2016 16:34:50 -0800 Subject: Revert "KVM: nested VMX: disable perf cpuid reporting" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This reverts commit bc6134942dbbf31c25e9bd7c876be5da81c9e1ce. A CPUID instruction executed in VMX non-root mode always causes a VM-exit, regardless of the leaf being queried. Fixes: bc6134942dbb ("KVM: nested VMX: disable perf cpuid reporting") Signed-off-by: Jim Mattson [The issue solved by bc6134942dbb has been resolved with ff651cb613b4 ("KVM: nVMX: Add nested msr load/restore algorithm").] Signed-off-by: Radim Krčmář --- arch/x86/kvm/cpuid.c | 6 ------ arch/x86/kvm/vmx.c | 2 -- 2 files changed, 8 deletions(-) diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 09c2ac741567..c0e2036217ad 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -861,12 +861,6 @@ void kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx, u32 *ecx, u32 *edx) if (!best) best = check_cpuid_limit(vcpu, function, index); - /* - * Perfmon not yet supported for L2 guest. - */ - if (is_guest_mode(vcpu) && function == 0xa) - best = NULL; - if (best) { *eax = best->eax; *ebx = best->ebx; diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 4e691035a32d..c7bafa1457e2 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -8203,8 +8203,6 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu) case EXIT_REASON_TASK_SWITCH: return true; case EXIT_REASON_CPUID: - if (kvm_register_read(vcpu, VCPU_REGS_RAX) == 0xa) - return false; return true; case EXIT_REASON_HLT: return nested_cpu_has(vmcs12, CPU_BASED_HLT_EXITING); -- cgit v1.2.3 From 3deda5e50c893be38c1b6b3a73f8f8fb5560baa4 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Tue, 20 Dec 2016 14:02:29 +1100 Subject: KVM: PPC: Book3S HV: Don't try to signal cpu -1 If the target vcpu for kvmppc_fast_vcpu_kick_hv() is not running on any CPU, then we will have vcpu->arch.thread_cpu == -1, and as it happens, kvmppc_fast_vcpu_kick_hv will call kvmppc_ipi_thread with -1 as the cpu argument. Although this is not meaningful, in the past, before commit 1704a81ccebc ("KVM: PPC: Book3S HV: Use msgsnd for IPIs to other cores on POWER9", 2016-11-18), it was harmless because CPU -1 is not in the same core as any real CPU thread. On a POWER9, however, we don't do the "same core" check, so we were trying to do a msgsnd to thread -1, which is invalid. To avoid this, we add a check to see that vcpu->arch.thread_cpu is >= 0 before calling kvmppc_ipi_thread() with it. Since vcpu->arch.thread_vcpu can change asynchronously, we use READ_ONCE to ensure that the value we check is the same value that we use as the argument to kvmppc_ipi_thread(). 
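The shape of the fix, as a user-space sketch: read the racy field once, then make both the bounds check and the call use that same snapshot. C11 atomics stand in for the kernel's READ_ONCE, and the helper names are invented.

    #include <stdatomic.h>
    #include <stdbool.h>

    /* Updated asynchronously elsewhere; -1 means "not running anywhere". */
    static _Atomic int thread_cpu = -1;

    static bool send_ipi(int cpu)           /* stand-in for the real IPI */
    {
            return cpu >= 0;
    }

    static void kick_vcpu(void)
    {
            /* One load; the check and the call both use this snapshot. */
            int cpu = atomic_load_explicit(&thread_cpu, memory_order_relaxed);

            if (cpu >= 0 && send_ipi(cpu))
                    return;

            /* ... fall back to the slower wakeup path ... */
    }

    int main(void)
    {
            kick_vcpu();
            return 0;
    }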
Fixes: 1704a81ccebc ("KVM: PPC: Book3S HV: Use msgsnd for IPIs to other cores on POWER9") Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/book3s_hv.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index ec34e39471a7..8d9cc07b1e9c 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -182,7 +182,8 @@ static void kvmppc_fast_vcpu_kick_hv(struct kvm_vcpu *vcpu) ++vcpu->stat.halt_wakeup; } - if (kvmppc_ipi_thread(vcpu->arch.thread_cpu)) + cpu = READ_ONCE(vcpu->arch.thread_cpu); + if (cpu >= 0 && kvmppc_ipi_thread(cpu)) return; /* CPU points to the first thread of the core */ -- cgit v1.2.3 From 5efa6605151b84029edeb2e07f2d2d74b52c106f Mon Sep 17 00:00:00 2001 From: Li Zhong Date: Fri, 11 Nov 2016 12:57:32 +0800 Subject: KVM: PPC: Book 3S: XICS cleanup: remove XICS_RM_REJECT Commit b0221556dbd3 ("KVM: PPC: Book3S HV: Move virtual mode ICP functions to real-mode") removed the setting of the XICS_RM_REJECT flag. And since that commit, nothing else sets the flag any more, so we can remove the flag and the remaining code that handles it, including the counter that counts how many times it get set. Signed-off-by: Li Zhong Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/book3s_xics.c | 12 +++--------- arch/powerpc/kvm/book3s_xics.h | 2 -- 2 files changed, 3 insertions(+), 11 deletions(-) diff --git a/arch/powerpc/kvm/book3s_xics.c b/arch/powerpc/kvm/book3s_xics.c index 20dff102a06f..debda779a240 100644 --- a/arch/powerpc/kvm/book3s_xics.c +++ b/arch/powerpc/kvm/book3s_xics.c @@ -832,10 +832,6 @@ int kvmppc_xics_rm_complete(struct kvm_vcpu *vcpu, u32 hcall) icp->n_rm_check_resend++; icp_check_resend(xics, icp->rm_resend_icp); } - if (icp->rm_action & XICS_RM_REJECT) { - icp->n_rm_reject++; - icp_deliver_irq(xics, icp, icp->rm_reject); - } if (icp->rm_action & XICS_RM_NOTIFY_EOI) { icp->n_rm_notify_eoi++; kvm_notify_acked_irq(vcpu->kvm, 0, icp->rm_eoied_irq); @@ -920,7 +916,7 @@ static int xics_debug_show(struct seq_file *m, void *private) int icsid, i; unsigned long flags; unsigned long t_rm_kick_vcpu, t_rm_check_resend; - unsigned long t_rm_reject, t_rm_notify_eoi; + unsigned long t_rm_notify_eoi; unsigned long t_reject, t_check_resend; if (!kvm) @@ -929,7 +925,6 @@ static int xics_debug_show(struct seq_file *m, void *private) t_rm_kick_vcpu = 0; t_rm_notify_eoi = 0; t_rm_check_resend = 0; - t_rm_reject = 0; t_check_resend = 0; t_reject = 0; @@ -952,14 +947,13 @@ static int xics_debug_show(struct seq_file *m, void *private) t_rm_kick_vcpu += icp->n_rm_kick_vcpu; t_rm_notify_eoi += icp->n_rm_notify_eoi; t_rm_check_resend += icp->n_rm_check_resend; - t_rm_reject += icp->n_rm_reject; t_check_resend += icp->n_check_resend; t_reject += icp->n_reject; } - seq_printf(m, "ICP Guest->Host totals: kick_vcpu=%lu check_resend=%lu reject=%lu notify_eoi=%lu\n", + seq_printf(m, "ICP Guest->Host totals: kick_vcpu=%lu check_resend=%lu notify_eoi=%lu\n", t_rm_kick_vcpu, t_rm_check_resend, - t_rm_reject, t_rm_notify_eoi); + t_rm_notify_eoi); seq_printf(m, "ICP Real Mode totals: check_resend=%lu resend=%lu\n", t_check_resend, t_reject); for (icsid = 0; icsid <= KVMPPC_XICS_MAX_ICS_ID; icsid++) { diff --git a/arch/powerpc/kvm/book3s_xics.h b/arch/powerpc/kvm/book3s_xics.h index 2a50320b55ca..1d5fac80b706 100644 --- a/arch/powerpc/kvm/book3s_xics.h +++ b/arch/powerpc/kvm/book3s_xics.h @@ -73,7 +73,6 @@ struct kvmppc_icp { */ #define XICS_RM_KICK_VCPU 0x1 #define XICS_RM_CHECK_RESEND 0x2 -#define XICS_RM_REJECT 
0x4 #define XICS_RM_NOTIFY_EOI 0x8 u32 rm_action; struct kvm_vcpu *rm_kick_target; @@ -84,7 +83,6 @@ struct kvmppc_icp { /* Counters for each reason we exited real mode */ unsigned long n_rm_kick_vcpu; unsigned long n_rm_check_resend; - unsigned long n_rm_reject; unsigned long n_rm_notify_eoi; /* Counters for handling ICP processing in real mode */ unsigned long n_check_resend; -- cgit v1.2.3 From 37451bc95dee0e666927d6ffdda302dbbaaae6fa Mon Sep 17 00:00:00 2001 From: Li Zhong Date: Fri, 11 Nov 2016 12:57:33 +0800 Subject: KVM: PPC: Book 3S: XICS: correct the real mode ICP rejecting counter Some counters are added in Commit 6e0365b78273 ("KVM: PPC: Book3S HV: Add ICP real mode counters"), to provide some performance statistics to determine whether further optimizing is needed for real mode functions. The n_reject counter counts how many times ICP rejects an irq because of priority in real mode. The redelivery of an lsi that is still asserted after eoi doesn't fall into this category, so the increasement there is removed. Also, it needs to be increased in icp_rm_deliver_irq() if it rejects another one. Signed-off-by: Li Zhong Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/book3s_hv_rm_xics.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/kvm/book3s_hv_rm_xics.c b/arch/powerpc/kvm/book3s_hv_rm_xics.c index 06edc4366639..9f6c8fe88db6 100644 --- a/arch/powerpc/kvm/book3s_hv_rm_xics.c +++ b/arch/powerpc/kvm/book3s_hv_rm_xics.c @@ -380,6 +380,7 @@ static void icp_rm_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp, */ if (reject && reject != XICS_IPI) { arch_spin_unlock(&ics->lock); + icp->n_reject++; new_irq = reject; goto again; } @@ -711,10 +712,8 @@ int kvmppc_rm_h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr) state = &ics->irq_state[src]; /* Still asserted, resend it */ - if (state->asserted) { - icp->n_reject++; + if (state->asserted) icp_rm_deliver_irq(xics, icp, irq); - } if (!hlist_empty(&vcpu->kvm->irq_ack_notifier_list)) { icp->rm_action |= XICS_RM_NOTIFY_EOI; -- cgit v1.2.3 From bf5a71d53835110d46d33eb5335713ffdbff9ab6 Mon Sep 17 00:00:00 2001 From: Li Zhong Date: Fri, 11 Nov 2016 12:57:34 +0800 Subject: KVM: PPC: Book 3S: XICS: Fix potential issue with duplicate IRQ resends It is possible that in the following order, one irq is resent twice: CPU 1 CPU 2 ics_check_resend() lock ics_lock see resend set unlock ics_lock /* change affinity of the irq */ kvmppc_xics_set_xive() write_xive() lock ics_lock see resend set unlock ics_lock icp_deliver_irq() /* resend */ icp_deliver_irq() /* resend again */ It doesn't have any user-visible effect at present, but needs to be avoided when the following patch implementing the P/Q stuff is applied. This patch clears the resend flag before releasing the ics lock, when we know we will do a re-delivery after checking the flag, or setting the flag. 
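The shape of the race and of the fix, as a pthread-mutex toy. The real code holds the ICS arch spinlock; everything below is invented for illustration.

    #include <pthread.h>
    #include <stdbool.h>

    /* Toy model of one interrupt source -- not the KVM structures. */
    struct toy_irq {
            pthread_mutex_t lock;
            bool resend;
    };

    /* Racy shape (before the patch): two callers can both see the flag,
     * drop the lock, and each deliver the interrupt once. */
    static bool should_resend_racy(struct toy_irq *irq)
    {
            bool do_it;

            pthread_mutex_lock(&irq->lock);
            do_it = irq->resend;            /* flag is cleared only later */
            pthread_mutex_unlock(&irq->lock);
            return do_it;
    }

    /* Fixed shape: whoever observes the flag also clears it while still
     * holding the lock, so exactly one caller wins the redelivery. */
    static bool should_resend_fixed(struct toy_irq *irq)
    {
            bool do_it;

            pthread_mutex_lock(&irq->lock);
            do_it = irq->resend;
            irq->resend = false;
            pthread_mutex_unlock(&irq->lock);
            return do_it;
    }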
Signed-off-by: Li Zhong Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/book3s_hv_rm_xics.c | 3 +++ arch/powerpc/kvm/book3s_xics.c | 4 ++++ 2 files changed, 7 insertions(+) diff --git a/arch/powerpc/kvm/book3s_hv_rm_xics.c b/arch/powerpc/kvm/book3s_hv_rm_xics.c index 9f6c8fe88db6..16349c9e86ba 100644 --- a/arch/powerpc/kvm/book3s_hv_rm_xics.c +++ b/arch/powerpc/kvm/book3s_hv_rm_xics.c @@ -52,6 +52,8 @@ static void ics_rm_check_resend(struct kvmppc_xics *xics, if (!state->resend) continue; + state->resend = 0; + arch_spin_unlock(&ics->lock); icp_rm_deliver_irq(xics, icp, state->number); arch_spin_lock(&ics->lock); @@ -400,6 +402,7 @@ static void icp_rm_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp, */ smp_mb(); if (!icp->state.need_resend) { + state->resend = 0; arch_spin_unlock(&ics->lock); goto again; } diff --git a/arch/powerpc/kvm/book3s_xics.c b/arch/powerpc/kvm/book3s_xics.c index debda779a240..cdfb4ed73601 100644 --- a/arch/powerpc/kvm/book3s_xics.c +++ b/arch/powerpc/kvm/book3s_xics.c @@ -125,6 +125,8 @@ static void ics_check_resend(struct kvmppc_xics *xics, struct kvmppc_ics *ics, if (!state->resend) continue; + state->resend = 0; + XICS_DBG("resend %#x prio %#x\n", state->number, state->priority); @@ -155,6 +157,7 @@ static bool write_xive(struct kvmppc_xics *xics, struct kvmppc_ics *ics, deliver = false; if ((state->masked_pending || state->resend) && priority != MASKED) { state->masked_pending = 0; + state->resend = 0; deliver = true; } @@ -488,6 +491,7 @@ static void icp_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp, */ smp_mb(); if (!icp->state.need_resend) { + state->resend = 0; arch_spin_unlock(&ics->lock); local_irq_restore(flags); goto again; -- cgit v1.2.3 From 17d48610ae0fa218aa386b16a538c792991a3652 Mon Sep 17 00:00:00 2001 From: Li Zhong Date: Fri, 11 Nov 2016 12:57:35 +0800 Subject: KVM: PPC: Book 3S: XICS: Implement ICS P/Q states This patch implements P(Presented)/Q(Queued) states for ICS irqs. When the interrupt is presented, set P. Present if P was not set. If P is already set, don't present again, set Q. When the interrupt is EOI'ed, move Q into P (and clear Q). If it is set, re-present. The asserted flag used by LSI is also incorporated into the P bit. When the irq state is saved, P/Q bits are also saved, they need some qemu modifications to be recognized and passed around to be restored. KVM_XICS_PENDING bit set and saved should also indicate KVM_XICS_PRESENTED bit set and saved. But it is possible some old code doesn't have/recognize the P bit, so when we restore, we set P for PENDING bit, too. The idea and much of the code come from Ben. 
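Before reading the diff it may help to see the two P/Q transitions in isolation. The sketch below is a stand-alone user-space restatement for a single MSI-style source, using C11 atomics in place of the kernel's cmpxchg(); the names and the single-source simplification are invented for this example.

    #include <stdatomic.h>
    #include <stdbool.h>
    #include <stdint.h>

    #define P_BIT 1u        /* presented */
    #define Q_BIT 2u        /* queued    */

    /* One MSI-style source; toy model of the pq_state word. */
    static _Atomic uint32_t pq_state;

    /* Interrupt arrives: the old state shifts left and P is set.  Present
     * the irq only when the result is exactly P=1, Q=0. */
    static bool toy_trigger(void)
    {
            uint32_t pq_old, pq_new;

            do {
                    pq_old = atomic_load(&pq_state);
                    pq_new = ((pq_old << 1) & (P_BIT | Q_BIT)) | P_BIT;
            } while (!atomic_compare_exchange_weak(&pq_state, &pq_old, pq_new));

            return pq_new == P_BIT;
    }

    /* EOI: Q moves into P (and Q is cleared).  Re-present the irq when
     * the bit that landed in P is set. */
    static bool toy_eoi(void)
    {
            uint32_t pq_old, pq_new;

            do {
                    pq_old = atomic_load(&pq_state);
                    pq_new = pq_old >> 1;
            } while (!atomic_compare_exchange_weak(&pq_state, &pq_old, pq_new));

            return pq_new & P_BIT;
    }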
Signed-off-by: Li Zhong Signed-off-by: Paul Mackerras --- arch/powerpc/include/uapi/asm/kvm.h | 2 + arch/powerpc/kvm/book3s_hv_rm_xics.c | 100 +++++++++++++++++++--------- arch/powerpc/kvm/book3s_xics.c | 125 ++++++++++++++++++++++++----------- arch/powerpc/kvm/book3s_xics.h | 5 +- 4 files changed, 161 insertions(+), 71 deletions(-) diff --git a/arch/powerpc/include/uapi/asm/kvm.h b/arch/powerpc/include/uapi/asm/kvm.h index 3603b6f51b11..e3db3a50127b 100644 --- a/arch/powerpc/include/uapi/asm/kvm.h +++ b/arch/powerpc/include/uapi/asm/kvm.h @@ -613,5 +613,7 @@ struct kvm_get_htab_header { #define KVM_XICS_LEVEL_SENSITIVE (1ULL << 40) #define KVM_XICS_MASKED (1ULL << 41) #define KVM_XICS_PENDING (1ULL << 42) +#define KVM_XICS_PRESENTED (1ULL << 43) +#define KVM_XICS_QUEUED (1ULL << 44) #endif /* __LINUX_KVM_POWERPC_H */ diff --git a/arch/powerpc/kvm/book3s_hv_rm_xics.c b/arch/powerpc/kvm/book3s_hv_rm_xics.c index 16349c9e86ba..30f82c79de5d 100644 --- a/arch/powerpc/kvm/book3s_hv_rm_xics.c +++ b/arch/powerpc/kvm/book3s_hv_rm_xics.c @@ -672,51 +672,39 @@ int kvmppc_rm_h_cppr(struct kvm_vcpu *vcpu, unsigned long cppr) return check_too_hard(xics, icp); } -int kvmppc_rm_h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr) +static int ics_rm_eoi(struct kvm_vcpu *vcpu, u32 irq) { struct kvmppc_xics *xics = vcpu->kvm->arch.xics; struct kvmppc_icp *icp = vcpu->arch.icp; struct kvmppc_ics *ics; struct ics_irq_state *state; - u32 irq = xirr & 0x00ffffff; u16 src; - - if (!xics || !xics->real_mode) - return H_TOO_HARD; + u32 pq_old, pq_new; /* - * ICP State: EOI - * - * Note: If EOI is incorrectly used by SW to lower the CPPR - * value (ie more favored), we do not check for rejection of - * a pending interrupt, this is a SW error and PAPR sepcifies - * that we don't have to deal with it. + * ICS EOI handling: For LSI, if P bit is still set, we need to + * resend it. * - * The sending of an EOI to the ICS is handled after the - * CPPR update - * - * ICP State: Down_CPPR which we handle - * in a separate function as it's shared with H_CPPR. + * For MSI, we move Q bit into P (and clear Q). If it is set, + * resend it. */ - icp_rm_down_cppr(xics, icp, xirr >> 24); - /* IPIs have no EOI */ - if (irq == XICS_IPI) - goto bail; - /* - * EOI handling: If the interrupt is still asserted, we need to - * resend it. We can take a lockless "peek" at the ICS state here. 
- * - * "Message" interrupts will never have "asserted" set - */ ics = kvmppc_xics_find_ics(xics, irq, &src); if (!ics) goto bail; + state = &ics->irq_state[src]; - /* Still asserted, resend it */ - if (state->asserted) - icp_rm_deliver_irq(xics, icp, irq); + if (state->lsi) + pq_new = state->pq_state; + else + do { + pq_old = state->pq_state; + pq_new = pq_old >> 1; + } while (cmpxchg(&state->pq_state, pq_old, pq_new) != pq_old); + + if (pq_new & PQ_PRESENTED) + icp_rm_deliver_irq(xics, NULL, irq); if (!hlist_empty(&vcpu->kvm->irq_ack_notifier_list)) { icp->rm_action |= XICS_RM_NOTIFY_EOI; @@ -737,10 +725,43 @@ int kvmppc_rm_h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr) state->intr_cpu = -1; } } + bail: return check_too_hard(xics, icp); } +int kvmppc_rm_h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr) +{ + struct kvmppc_xics *xics = vcpu->kvm->arch.xics; + struct kvmppc_icp *icp = vcpu->arch.icp; + u32 irq = xirr & 0x00ffffff; + + if (!xics || !xics->real_mode) + return H_TOO_HARD; + + /* + * ICP State: EOI + * + * Note: If EOI is incorrectly used by SW to lower the CPPR + * value (ie more favored), we do not check for rejection of + * a pending interrupt, this is a SW error and PAPR specifies + * that we don't have to deal with it. + * + * The sending of an EOI to the ICS is handled after the + * CPPR update + * + * ICP State: Down_CPPR which we handle + * in a separate function as it's shared with H_CPPR. + */ + icp_rm_down_cppr(xics, icp, xirr >> 24); + + /* IPIs have no EOI */ + if (irq == XICS_IPI) + return check_too_hard(xics, icp); + + return ics_rm_eoi(vcpu, irq); +} + unsigned long eoi_rc; static void icp_eoi(struct irq_chip *c, u32 hwirq, __be32 xirr, bool *again) @@ -827,14 +848,33 @@ long kvmppc_deliver_irq_passthru(struct kvm_vcpu *vcpu, { struct kvmppc_xics *xics; struct kvmppc_icp *icp; + struct kvmppc_ics *ics; + struct ics_irq_state *state; u32 irq; + u16 src; + u32 pq_old, pq_new; irq = irq_map->v_hwirq; xics = vcpu->kvm->arch.xics; icp = vcpu->arch.icp; kvmppc_rm_handle_irq_desc(irq_map->desc); - icp_rm_deliver_irq(xics, icp, irq); + + ics = kvmppc_xics_find_ics(xics, irq, &src); + if (!ics) + return 2; + + state = &ics->irq_state[src]; + + /* only MSIs register bypass producers, so it must be MSI here */ + do { + pq_old = state->pq_state; + pq_new = ((pq_old << 1) & 3) | PQ_PRESENTED; + } while (cmpxchg(&state->pq_state, pq_old, pq_new) != pq_old); + + /* Test P=1, Q=0, this is the only case where we present */ + if (pq_new == PQ_PRESENTED) + icp_rm_deliver_irq(xics, icp, irq); /* EOI the interrupt */ icp_eoi(irq_desc_get_chip(irq_map->desc), irq_map->r_hwirq, xirr, diff --git a/arch/powerpc/kvm/book3s_xics.c b/arch/powerpc/kvm/book3s_xics.c index cdfb4ed73601..c7620622c846 100644 --- a/arch/powerpc/kvm/book3s_xics.c +++ b/arch/powerpc/kvm/book3s_xics.c @@ -75,6 +75,7 @@ static int ics_deliver_irq(struct kvmppc_xics *xics, u32 irq, u32 level) struct ics_irq_state *state; struct kvmppc_ics *ics; u16 src; + u32 pq_old, pq_new; XICS_DBG("ics deliver %#x (level: %d)\n", irq, level); @@ -87,25 +88,41 @@ static int ics_deliver_irq(struct kvmppc_xics *xics, u32 irq, u32 level) if (!state->exists) return -EINVAL; + if (level == KVM_INTERRUPT_SET_LEVEL || level == KVM_INTERRUPT_SET) + level = 1; + else if (level == KVM_INTERRUPT_UNSET) + level = 0; /* - * We set state->asserted locklessly. This should be fine as - * we are the only setter, thus concurrent access is undefined - * to begin with. + * Take other values the same as 1, consistent with original code. 
+ * maybe WARN here? */ - if ((level == 1 && state->lsi) || level == KVM_INTERRUPT_SET_LEVEL) - state->asserted = 1; - else if (level == 0 || level == KVM_INTERRUPT_UNSET) { - state->asserted = 0; + + if (!state->lsi && level == 0) /* noop for MSI */ return 0; - } + + do { + pq_old = state->pq_state; + if (state->lsi) { + if (level) { + if (pq_old & PQ_PRESENTED) + /* Setting already set LSI ... */ + return 0; + + pq_new = PQ_PRESENTED; + } else + pq_new = 0; + } else + pq_new = ((pq_old << 1) & 3) | PQ_PRESENTED; + } while (cmpxchg(&state->pq_state, pq_old, pq_new) != pq_old); + + /* Test P=1, Q=0, this is the only case where we present */ + if (pq_new == PQ_PRESENTED) + icp_deliver_irq(xics, NULL, irq); /* Record which CPU this arrived on for passed-through interrupts */ if (state->host_irq) state->intr_cpu = raw_smp_processor_id(); - /* Attempt delivery */ - icp_deliver_irq(xics, NULL, irq); - return 0; } @@ -768,14 +785,51 @@ static noinline void kvmppc_h_cppr(struct kvm_vcpu *vcpu, unsigned long cppr) icp_deliver_irq(xics, icp, reject); } -static noinline int kvmppc_h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr) +static int ics_eoi(struct kvm_vcpu *vcpu, u32 irq) { struct kvmppc_xics *xics = vcpu->kvm->arch.xics; struct kvmppc_icp *icp = vcpu->arch.icp; struct kvmppc_ics *ics; struct ics_irq_state *state; - u32 irq = xirr & 0x00ffffff; u16 src; + u32 pq_old, pq_new; + + /* + * ICS EOI handling: For LSI, if P bit is still set, we need to + * resend it. + * + * For MSI, we move Q bit into P (and clear Q). If it is set, + * resend it. + */ + + ics = kvmppc_xics_find_ics(xics, irq, &src); + if (!ics) { + XICS_DBG("ios_eoi: IRQ 0x%06x not found !\n", irq); + return H_PARAMETER; + } + state = &ics->irq_state[src]; + + if (state->lsi) + pq_new = state->pq_state; + else + do { + pq_old = state->pq_state; + pq_new = pq_old >> 1; + } while (cmpxchg(&state->pq_state, pq_old, pq_new) != pq_old); + + if (pq_new & PQ_PRESENTED) + icp_deliver_irq(xics, icp, irq); + + kvm_notify_acked_irq(vcpu->kvm, 0, irq); + + return H_SUCCESS; +} + +static noinline int kvmppc_h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr) +{ + struct kvmppc_xics *xics = vcpu->kvm->arch.xics; + struct kvmppc_icp *icp = vcpu->arch.icp; + u32 irq = xirr & 0x00ffffff; XICS_DBG("h_eoi vcpu %d eoi %#lx\n", vcpu->vcpu_id, xirr); @@ -798,26 +852,8 @@ static noinline int kvmppc_h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr) /* IPIs have no EOI */ if (irq == XICS_IPI) return H_SUCCESS; - /* - * EOI handling: If the interrupt is still asserted, we need to - * resend it. We can take a lockless "peek" at the ICS state here. 
- * - * "Message" interrupts will never have "asserted" set - */ - ics = kvmppc_xics_find_ics(xics, irq, &src); - if (!ics) { - XICS_DBG("h_eoi: IRQ 0x%06x not found !\n", irq); - return H_PARAMETER; - } - state = &ics->irq_state[src]; - - /* Still asserted, resend it */ - if (state->asserted) - icp_deliver_irq(xics, icp, irq); - - kvm_notify_acked_irq(vcpu->kvm, 0, irq); - return H_SUCCESS; + return ics_eoi(vcpu, irq); } int kvmppc_xics_rm_complete(struct kvm_vcpu *vcpu, u32 hcall) @@ -975,9 +1011,9 @@ static int xics_debug_show(struct seq_file *m, void *private) for (i = 0; i < KVMPPC_XICS_IRQ_PER_ICS; i++) { struct ics_irq_state *irq = &ics->irq_state[i]; - seq_printf(m, "irq 0x%06x: server %#x prio %#x save prio %#x asserted %d resend %d masked pending %d\n", + seq_printf(m, "irq 0x%06x: server %#x prio %#x save prio %#x pq_state %d resend %d masked pending %d\n", irq->number, irq->server, irq->priority, - irq->saved_priority, irq->asserted, + irq->saved_priority, irq->pq_state, irq->resend, irq->masked_pending); } @@ -1196,10 +1232,17 @@ static int xics_get_source(struct kvmppc_xics *xics, long irq, u64 addr) val |= prio << KVM_XICS_PRIORITY_SHIFT; if (irqp->lsi) { val |= KVM_XICS_LEVEL_SENSITIVE; - if (irqp->asserted) + if (irqp->pq_state & PQ_PRESENTED) val |= KVM_XICS_PENDING; } else if (irqp->masked_pending || irqp->resend) val |= KVM_XICS_PENDING; + + if (irqp->pq_state & PQ_PRESENTED) + val |= KVM_XICS_PRESENTED; + + if (irqp->pq_state & PQ_QUEUED) + val |= KVM_XICS_QUEUED; + ret = 0; } arch_spin_unlock(&ics->lock); @@ -1251,12 +1294,14 @@ static int xics_set_source(struct kvmppc_xics *xics, long irq, u64 addr) irqp->resend = 0; irqp->masked_pending = 0; irqp->lsi = 0; - irqp->asserted = 0; - if (val & KVM_XICS_LEVEL_SENSITIVE) { + irqp->pq_state = 0; + if (val & KVM_XICS_LEVEL_SENSITIVE) irqp->lsi = 1; - if (val & KVM_XICS_PENDING) - irqp->asserted = 1; - } + /* If PENDING, set P in case P is not saved because of old code */ + if (val & KVM_XICS_PRESENTED || val & KVM_XICS_PENDING) + irqp->pq_state |= PQ_PRESENTED; + if (val & KVM_XICS_QUEUED) + irqp->pq_state |= PQ_QUEUED; irqp->exists = 1; arch_spin_unlock(&ics->lock); local_irq_restore(flags); diff --git a/arch/powerpc/kvm/book3s_xics.h b/arch/powerpc/kvm/book3s_xics.h index 1d5fac80b706..ec5474cf70c6 100644 --- a/arch/powerpc/kvm/book3s_xics.h +++ b/arch/powerpc/kvm/book3s_xics.h @@ -31,16 +31,19 @@ /* Priority value to use for disabling an interrupt */ #define MASKED 0xff +#define PQ_PRESENTED 1 +#define PQ_QUEUED 2 + /* State for one irq source */ struct ics_irq_state { u32 number; u32 server; + u32 pq_state; u8 priority; u8 saved_priority; u8 resend; u8 masked_pending; u8 lsi; /* level-sensitive interrupt */ - u8 asserted; /* Only for LSI */ u8 exists; int intr_cpu; u32 host_irq; -- cgit v1.2.3 From 21acd0e4df04f02176e773468658c3cebff096bb Mon Sep 17 00:00:00 2001 From: Li Zhong Date: Fri, 11 Nov 2016 12:57:36 +0800 Subject: KVM: PPC: Book 3S: XICS: Don't lock twice when checking for resend This patch improves the code that takes lock twice to check the resend flag and do the actual resending, by checking the resend flag locklessly, and add a boolean parameter check_resend to icp_[rm_]deliver_irq(), so the resend flag can be checked in the lock when doing the delivery. We need make sure when we clear the ics's bit in the icp's resend_map, we don't miss the resend flag of the irqs that set the bit. 
It could be ordered through the barrier in test_and_clear_bit(), and a newly added wmb between setting irq's resend flag, and icp's resend_map. Signed-off-by: Li Zhong Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/book3s_hv_rm_xics.c | 40 ++++++++++++------------ arch/powerpc/kvm/book3s_xics.c | 59 +++++++++++++++++------------------- 2 files changed, 48 insertions(+), 51 deletions(-) diff --git a/arch/powerpc/kvm/book3s_hv_rm_xics.c b/arch/powerpc/kvm/book3s_hv_rm_xics.c index 30f82c79de5d..44cfdd281fa1 100644 --- a/arch/powerpc/kvm/book3s_hv_rm_xics.c +++ b/arch/powerpc/kvm/book3s_hv_rm_xics.c @@ -35,7 +35,7 @@ int kvm_irq_bypass = 1; EXPORT_SYMBOL(kvm_irq_bypass); static void icp_rm_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp, - u32 new_irq); + u32 new_irq, bool check_resend); static int xics_opal_rm_set_server(unsigned int hw_irq, int server_cpu); /* -- ICS routines -- */ @@ -44,22 +44,12 @@ static void ics_rm_check_resend(struct kvmppc_xics *xics, { int i; - arch_spin_lock(&ics->lock); - for (i = 0; i < KVMPPC_XICS_IRQ_PER_ICS; i++) { struct ics_irq_state *state = &ics->irq_state[i]; - - if (!state->resend) - continue; - - state->resend = 0; - - arch_spin_unlock(&ics->lock); - icp_rm_deliver_irq(xics, icp, state->number); - arch_spin_lock(&ics->lock); + if (state->resend) + icp_rm_deliver_irq(xics, icp, state->number, true); } - arch_spin_unlock(&ics->lock); } /* -- ICP routines -- */ @@ -292,7 +282,7 @@ static bool icp_rm_try_to_deliver(struct kvmppc_icp *icp, u32 irq, u8 priority, } static void icp_rm_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp, - u32 new_irq) + u32 new_irq, bool check_resend) { struct ics_irq_state *state; struct kvmppc_ics *ics; @@ -337,6 +327,10 @@ static void icp_rm_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp, } } + if (check_resend) + if (!state->resend) + goto out; + /* Clear the resend bit of that interrupt */ state->resend = 0; @@ -384,6 +378,7 @@ static void icp_rm_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp, arch_spin_unlock(&ics->lock); icp->n_reject++; new_irq = reject; + check_resend = 0; goto again; } } else { @@ -391,9 +386,15 @@ static void icp_rm_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp, * We failed to deliver the interrupt we need to set the * resend map bit and mark the ICS state as needing a resend */ - set_bit(ics->icsid, icp->resend_map); state->resend = 1; + /* + * Make sure when checking resend, we don't miss the resend + * if resend_map bit is seen and cleared. 
+ */ + smp_wmb(); + set_bit(ics->icsid, icp->resend_map); + /* * If the need_resend flag got cleared in the ICP some time * between icp_rm_try_to_deliver() atomic update and now, then @@ -404,6 +405,7 @@ static void icp_rm_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp, if (!icp->state.need_resend) { state->resend = 0; arch_spin_unlock(&ics->lock); + check_resend = 0; goto again; } } @@ -598,7 +600,7 @@ int kvmppc_rm_h_ipi(struct kvm_vcpu *vcpu, unsigned long server, /* Handle reject in real mode */ if (reject && reject != XICS_IPI) { this_icp->n_reject++; - icp_rm_deliver_irq(xics, icp, reject); + icp_rm_deliver_irq(xics, icp, reject, false); } /* Handle resends in real mode */ @@ -666,7 +668,7 @@ int kvmppc_rm_h_cppr(struct kvm_vcpu *vcpu, unsigned long cppr) */ if (reject && reject != XICS_IPI) { icp->n_reject++; - icp_rm_deliver_irq(xics, icp, reject); + icp_rm_deliver_irq(xics, icp, reject, false); } bail: return check_too_hard(xics, icp); @@ -704,7 +706,7 @@ static int ics_rm_eoi(struct kvm_vcpu *vcpu, u32 irq) } while (cmpxchg(&state->pq_state, pq_old, pq_new) != pq_old); if (pq_new & PQ_PRESENTED) - icp_rm_deliver_irq(xics, NULL, irq); + icp_rm_deliver_irq(xics, NULL, irq, false); if (!hlist_empty(&vcpu->kvm->irq_ack_notifier_list)) { icp->rm_action |= XICS_RM_NOTIFY_EOI; @@ -874,7 +876,7 @@ long kvmppc_deliver_irq_passthru(struct kvm_vcpu *vcpu, /* Test P=1, Q=0, this is the only case where we present */ if (pq_new == PQ_PRESENTED) - icp_rm_deliver_irq(xics, icp, irq); + icp_rm_deliver_irq(xics, icp, irq, false); /* EOI the interrupt */ icp_eoi(irq_desc_get_chip(irq_map->desc), irq_map->r_hwirq, xirr, diff --git a/arch/powerpc/kvm/book3s_xics.c b/arch/powerpc/kvm/book3s_xics.c index c7620622c846..e48803e2918d 100644 --- a/arch/powerpc/kvm/book3s_xics.c +++ b/arch/powerpc/kvm/book3s_xics.c @@ -63,7 +63,7 @@ /* -- ICS routines -- */ static void icp_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp, - u32 new_irq); + u32 new_irq, bool check_resend); /* * Return value ideally indicates how the interrupt was handled, but no @@ -117,7 +117,7 @@ static int ics_deliver_irq(struct kvmppc_xics *xics, u32 irq, u32 level) /* Test P=1, Q=0, this is the only case where we present */ if (pq_new == PQ_PRESENTED) - icp_deliver_irq(xics, NULL, irq); + icp_deliver_irq(xics, NULL, irq, false); /* Record which CPU this arrived on for passed-through interrupts */ if (state->host_irq) @@ -131,31 +131,14 @@ static void ics_check_resend(struct kvmppc_xics *xics, struct kvmppc_ics *ics, { int i; - unsigned long flags; - - local_irq_save(flags); - arch_spin_lock(&ics->lock); - for (i = 0; i < KVMPPC_XICS_IRQ_PER_ICS; i++) { struct ics_irq_state *state = &ics->irq_state[i]; - - if (!state->resend) - continue; - - state->resend = 0; - - XICS_DBG("resend %#x prio %#x\n", state->number, - state->priority); - - arch_spin_unlock(&ics->lock); - local_irq_restore(flags); - icp_deliver_irq(xics, icp, state->number); - local_irq_save(flags); - arch_spin_lock(&ics->lock); + if (state->resend) { + XICS_DBG("resend %#x prio %#x\n", state->number, + state->priority); + icp_deliver_irq(xics, icp, state->number, true); + } } - - arch_spin_unlock(&ics->lock); - local_irq_restore(flags); } static bool write_xive(struct kvmppc_xics *xics, struct kvmppc_ics *ics, @@ -209,7 +192,7 @@ int kvmppc_xics_set_xive(struct kvm *kvm, u32 irq, u32 server, u32 priority) state->masked_pending, state->resend); if (write_xive(xics, ics, state, server, priority, priority)) - icp_deliver_irq(xics, icp, irq); + 
icp_deliver_irq(xics, icp, irq, false); return 0; } @@ -262,7 +245,7 @@ int kvmppc_xics_int_on(struct kvm *kvm, u32 irq) if (write_xive(xics, ics, state, state->server, state->saved_priority, state->saved_priority)) - icp_deliver_irq(xics, icp, irq); + icp_deliver_irq(xics, icp, irq, false); return 0; } @@ -396,7 +379,7 @@ static bool icp_try_to_deliver(struct kvmppc_icp *icp, u32 irq, u8 priority, } static void icp_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp, - u32 new_irq) + u32 new_irq, bool check_resend) { struct ics_irq_state *state; struct kvmppc_ics *ics; @@ -442,6 +425,10 @@ static void icp_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp, } } + if (check_resend) + if (!state->resend) + goto out; + /* Clear the resend bit of that interrupt */ state->resend = 0; @@ -490,6 +477,7 @@ static void icp_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp, arch_spin_unlock(&ics->lock); local_irq_restore(flags); new_irq = reject; + check_resend = 0; goto again; } } else { @@ -497,9 +485,15 @@ static void icp_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp, * We failed to deliver the interrupt we need to set the * resend map bit and mark the ICS state as needing a resend */ - set_bit(ics->icsid, icp->resend_map); state->resend = 1; + /* + * Make sure when checking resend, we don't miss the resend + * if resend_map bit is seen and cleared. + */ + smp_wmb(); + set_bit(ics->icsid, icp->resend_map); + /* * If the need_resend flag got cleared in the ICP some time * between icp_try_to_deliver() atomic update and now, then @@ -511,6 +505,7 @@ static void icp_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp, state->resend = 0; arch_spin_unlock(&ics->lock); local_irq_restore(flags); + check_resend = 0; goto again; } } @@ -702,7 +697,7 @@ static noinline int kvmppc_h_ipi(struct kvm_vcpu *vcpu, unsigned long server, /* Handle reject */ if (reject && reject != XICS_IPI) - icp_deliver_irq(xics, icp, reject); + icp_deliver_irq(xics, icp, reject, false); /* Handle resend */ if (resend) @@ -782,7 +777,7 @@ static noinline void kvmppc_h_cppr(struct kvm_vcpu *vcpu, unsigned long cppr) * attempt (see comments in icp_deliver_irq). */ if (reject && reject != XICS_IPI) - icp_deliver_irq(xics, icp, reject); + icp_deliver_irq(xics, icp, reject, false); } static int ics_eoi(struct kvm_vcpu *vcpu, u32 irq) @@ -818,7 +813,7 @@ static int ics_eoi(struct kvm_vcpu *vcpu, u32 irq) } while (cmpxchg(&state->pq_state, pq_old, pq_new) != pq_old); if (pq_new & PQ_PRESENTED) - icp_deliver_irq(xics, icp, irq); + icp_deliver_irq(xics, icp, irq, false); kvm_notify_acked_irq(vcpu->kvm, 0, irq); @@ -1307,7 +1302,7 @@ static int xics_set_source(struct kvmppc_xics *xics, long irq, u64 addr) local_irq_restore(flags); if (val & KVM_XICS_PENDING) - icp_deliver_irq(xics, NULL, irqp->number); + icp_deliver_irq(xics, NULL, irqp->number, false); return 0; } -- cgit v1.2.3 From d3918e7fd4a27564f93ec46d0359a9739c5deb8d Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Thu, 22 Dec 2016 04:29:25 +1000 Subject: KVM: PPC: Book3S: Change interrupt call to reduce scratch space use on HV Change the calling convention to put the trap number together with CR in two halves of r12, which frees up HSTATE_SCRATCH2 in the HV handler. The 64-bit PR handler entry translates the calling convention back to match the previous call convention (i.e., shared with 32-bit), for simplicity. 
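For reference, a minimal C sketch of the r12 encoding (the helper names are illustrative only and do not exist in the kernel; they simply restate the sldi/ori packing and the srdi/clrldi unpacking in the assembly below):

#include <stdint.h>

/* Pack the guest CR into the high half and the trap number into the
 * low half of r12, as done by "sldi r12,r9,32; ori r12,r12,(n)". */
static inline uint64_t pack_cr_trap(uint32_t guest_cr, uint32_t trap)
{
	return ((uint64_t)guest_cr << 32) | trap;
}

/* Recover CR on the exit path ("srdi r4,r12,32")... */
static inline uint32_t unpack_cr(uint64_t r12)
{
	return (uint32_t)(r12 >> 32);
}

/* ...and the trap number ("clrldi r12,r12,32"). */
static inline uint32_t unpack_trap(uint64_t r12)
{
	return (uint32_t)(r12 & 0xffffffffULL);
}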
Signed-off-by: Nicholas Piggin Acked-by: Paul Mackerras Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/exception-64s.h | 24 +++++++++++------------- arch/powerpc/kvm/book3s_hv_rmhandlers.S | 16 +++++++++------- arch/powerpc/kvm/book3s_segment.S | 25 ++++++++++++++++++------- 3 files changed, 38 insertions(+), 27 deletions(-) diff --git a/arch/powerpc/include/asm/exception-64s.h b/arch/powerpc/include/asm/exception-64s.h index 9a3eee661297..a02a268bde6b 100644 --- a/arch/powerpc/include/asm/exception-64s.h +++ b/arch/powerpc/include/asm/exception-64s.h @@ -233,7 +233,7 @@ END_FTR_SECTION_NESTED(ftr,ftr,943) #endif -#define __KVM_HANDLER_PROLOG(area, n) \ +#define __KVM_HANDLER(area, h, n) \ BEGIN_FTR_SECTION_NESTED(947) \ ld r10,area+EX_CFAR(r13); \ std r10,HSTATE_CFAR(r13); \ @@ -243,30 +243,28 @@ END_FTR_SECTION_NESTED(ftr,ftr,943) std r10,HSTATE_PPR(r13); \ END_FTR_SECTION_NESTED(CPU_FTR_HAS_PPR,CPU_FTR_HAS_PPR,948); \ ld r10,area+EX_R10(r13); \ - stw r9,HSTATE_SCRATCH1(r13); \ - ld r9,area+EX_R9(r13); \ std r12,HSTATE_SCRATCH0(r13); \ - -#define __KVM_HANDLER(area, h, n) \ - __KVM_HANDLER_PROLOG(area, n) \ - li r12,n; \ + sldi r12,r9,32; \ + ori r12,r12,(n); \ + ld r9,area+EX_R9(r13); \ b kvmppc_interrupt #define __KVM_HANDLER_SKIP(area, h, n) \ cmpwi r10,KVM_GUEST_MODE_SKIP; \ - ld r10,area+EX_R10(r13); \ beq 89f; \ - stw r9,HSTATE_SCRATCH1(r13); \ BEGIN_FTR_SECTION_NESTED(948) \ - ld r9,area+EX_PPR(r13); \ - std r9,HSTATE_PPR(r13); \ + ld r10,area+EX_PPR(r13); \ + std r10,HSTATE_PPR(r13); \ END_FTR_SECTION_NESTED(CPU_FTR_HAS_PPR,CPU_FTR_HAS_PPR,948); \ - ld r9,area+EX_R9(r13); \ + ld r10,area+EX_R10(r13); \ std r12,HSTATE_SCRATCH0(r13); \ - li r12,n; \ + sldi r12,r9,32; \ + ori r12,r12,(n); \ + ld r9,area+EX_R9(r13); \ b kvmppc_interrupt; \ 89: mtocrf 0x80,r9; \ ld r9,area+EX_R9(r13); \ + ld r10,area+EX_R10(r13); \ b kvmppc_skip_##h##interrupt #ifdef CONFIG_KVM_BOOK3S_64_HANDLER diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S index 9338a818e05c..11882aac8216 100644 --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S @@ -1057,19 +1057,18 @@ hdec_soon: kvmppc_interrupt_hv: /* * Register contents: - * R12 = interrupt vector + * R12 = (guest CR << 32) | interrupt vector * R13 = PACA - * guest CR, R12 saved in shadow VCPU SCRATCH1/0 + * guest R12 saved in shadow VCPU SCRATCH0 * guest R13 saved in SPRN_SCRATCH0 */ - std r9, HSTATE_SCRATCH2(r13) - + std r9, HSTATE_SCRATCH1(r13) lbz r9, HSTATE_IN_GUEST(r13) cmpwi r9, KVM_GUEST_MODE_HOST_HV beq kvmppc_bad_host_intr #ifdef CONFIG_KVM_BOOK3S_PR_POSSIBLE cmpwi r9, KVM_GUEST_MODE_GUEST - ld r9, HSTATE_SCRATCH2(r13) + ld r9, HSTATE_SCRATCH1(r13) beq kvmppc_interrupt_pr #endif /* We're now back in the host but in guest MMU context */ @@ -1089,13 +1088,14 @@ kvmppc_interrupt_hv: std r6, VCPU_GPR(R6)(r9) std r7, VCPU_GPR(R7)(r9) std r8, VCPU_GPR(R8)(r9) - ld r0, HSTATE_SCRATCH2(r13) + ld r0, HSTATE_SCRATCH1(r13) std r0, VCPU_GPR(R9)(r9) std r10, VCPU_GPR(R10)(r9) std r11, VCPU_GPR(R11)(r9) ld r3, HSTATE_SCRATCH0(r13) - lwz r4, HSTATE_SCRATCH1(r13) std r3, VCPU_GPR(R12)(r9) + /* CR is in the high half of r12 */ + srdi r4, r12, 32 stw r4, VCPU_CR(r9) BEGIN_FTR_SECTION ld r3, HSTATE_CFAR(r13) @@ -1114,6 +1114,8 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR) mfspr r11, SPRN_SRR1 std r10, VCPU_SRR0(r9) std r11, VCPU_SRR1(r9) + /* trap is in the low half of r12, clear CR from the high half */ + clrldi r12, r12, 32 andi. r0, r12, 2 /* need to read HSRR0/1? 
*/ beq 1f mfspr r10, SPRN_HSRR0 diff --git a/arch/powerpc/kvm/book3s_segment.S b/arch/powerpc/kvm/book3s_segment.S index ca8f174289bb..68e45080cf93 100644 --- a/arch/powerpc/kvm/book3s_segment.S +++ b/arch/powerpc/kvm/book3s_segment.S @@ -167,20 +167,31 @@ kvmppc_handler_trampoline_enter_end: * * *****************************************************************************/ -.global kvmppc_handler_trampoline_exit -kvmppc_handler_trampoline_exit: - .global kvmppc_interrupt_pr kvmppc_interrupt_pr: + /* 64-bit entry. Register usage at this point: + * + * SPRG_SCRATCH0 = guest R13 + * R12 = (guest CR << 32) | exit handler id + * R13 = PACA + * HSTATE.SCRATCH0 = guest R12 + */ +#ifdef CONFIG_PPC64 + /* Match 32-bit entry */ + rotldi r12, r12, 32 /* Flip R12 halves for stw */ + stw r12, HSTATE_SCRATCH1(r13) /* CR is now in the low half */ + srdi r12, r12, 32 /* shift trap into low half */ +#endif +.global kvmppc_handler_trampoline_exit +kvmppc_handler_trampoline_exit: /* Register usage at this point: * - * SPRG_SCRATCH0 = guest R13 - * R12 = exit handler id - * R13 = shadow vcpu (32-bit) or PACA (64-bit) + * SPRG_SCRATCH0 = guest R13 + * R12 = exit handler id + * R13 = shadow vcpu (32-bit) or PACA (64-bit) * HSTATE.SCRATCH0 = guest R12 * HSTATE.SCRATCH1 = guest CR - * */ /* Save registers */ -- cgit v1.2.3 From 7ede531773ea69fa56b02a873ed83ce3507eb8d5 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Thu, 22 Dec 2016 04:29:26 +1000 Subject: KVM: PPC: Book3S: Move 64-bit KVM interrupt handler out from alt section A subsequent patch to make KVM handlers relocation-safe makes them unusable from within alt section "else" cases (due to the way fixed addresses are taken from within fixed section head code). Stop open-coding the KVM handlers, and add them both as normal. A more optimal fix may be to allow some level of alternate feature patching in the exception macros themselves, but for now this will do. The TRAMP_KVM handlers must be moved to the "virt" fixed section area (name is arbitrary) in order to be closer to .text and avoid the dreaded "relocation truncated to fit" error. 
Signed-off-by: Nicholas Piggin Acked-by: Paul Mackerras Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/head-64.h | 2 +- arch/powerpc/kernel/exceptions-64s.S | 6 ++---- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/arch/powerpc/include/asm/head-64.h b/arch/powerpc/include/asm/head-64.h index fca7033839a9..9bd81619d090 100644 --- a/arch/powerpc/include/asm/head-64.h +++ b/arch/powerpc/include/asm/head-64.h @@ -218,7 +218,7 @@ name: #ifdef CONFIG_KVM_BOOK3S_64_HANDLER #define TRAMP_KVM_BEGIN(name) \ - TRAMP_REAL_BEGIN(name) + TRAMP_VIRT_BEGIN(name) #else #define TRAMP_KVM_BEGIN(name) #endif diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index d39d6118c6e9..89b4f122aec6 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -717,13 +717,9 @@ hardware_interrupt_hv: BEGIN_FTR_SECTION _MASKABLE_EXCEPTION_PSERIES(0x500, hardware_interrupt_common, EXC_HV, SOFTEN_TEST_HV) -do_kvm_H0x500: - KVM_HANDLER(PACA_EXGEN, EXC_HV, 0x502) FTR_SECTION_ELSE _MASKABLE_EXCEPTION_PSERIES(0x500, hardware_interrupt_common, EXC_STD, SOFTEN_TEST_PR) -do_kvm_0x500: - KVM_HANDLER(PACA_EXGEN, EXC_STD, 0x500) ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206) EXC_REAL_END(hardware_interrupt, 0x500, 0x600) @@ -737,6 +733,8 @@ hardware_interrupt_relon_hv: ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE) EXC_VIRT_END(hardware_interrupt, 0x4500, 0x4600) +TRAMP_KVM(PACA_EXGEN, 0x500) +TRAMP_KVM_HV(PACA_EXGEN, 0x500) EXC_COMMON_ASYNC(hardware_interrupt_common, 0x500, do_IRQ) -- cgit v1.2.3 From 8464c8842de2249061d3c5abc2ccce1bbbd10e7a Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Tue, 6 Dec 2016 20:42:05 +1100 Subject: KVM: PPC: Book3S HV: Fix H_PROD to actually wake the target vcpu The H_PROD hypercall is supposed to wake up an idle vcpu. We have an implementation, but because Linux doesn't use it except when doing cpu hotplug, it was never tested properly. AIX does use it, and reported it broken. It turns out we were waking the wrong vcpu (the one doing H_PROD, not the target of the prod) and we weren't handling the case where the target needs an IPI to wake it. Fix it by using the existing kvmppc_fast_vcpu_kick_hv() function, which is intended for this kind of thing, and by using the target vcpu not the current vcpu. We were also not looking at the prodded flag when checking whether a ceded vcpu should wake up, so this adds checks for the prodded flag alongside the checks for pending exceptions. 
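For reference, a minimal stand-alone sketch of the corrected wake-up condition (the struct and helper below are simplified stand-ins for the real vcpu fields, not KVM code):

#include <stdbool.h>

/* Simplified stand-in for the vcpu state involved in H_PROD/H_CEDE. */
struct vcpu_wake_state {
	unsigned long pending_exceptions;
	int ceded;
	int prodded;		/* set by H_PROD on the target vcpu */
};

/* A ceded vcpu must be woken if it has pending exceptions or has been
 * prodded; only then does it stop counting as ceded. */
static bool vcpu_needs_wakeup(const struct vcpu_wake_state *v)
{
	return v->pending_exceptions || v->prodded;
}

The same predicate shows up twice in the patch: once when deciding whether the vcore can keep blocking, and once when counting how many runnable vcpus are still ceded.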
Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/book3s_hv.c | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 8d9cc07b1e9c..856cc9d38efd 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -774,12 +774,8 @@ int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu) } tvcpu->arch.prodded = 1; smp_mb(); - if (vcpu->arch.ceded) { - if (swait_active(&vcpu->wq)) { - swake_up(&vcpu->wq); - vcpu->stat.halt_wakeup++; - } - } + if (tvcpu->arch.ceded) + kvmppc_fast_vcpu_kick_hv(tvcpu); break; case H_CONFER: target = kvmppc_get_gpr(vcpu, 4); @@ -2621,7 +2617,8 @@ static int kvmppc_vcore_check_block(struct kvmppc_vcore *vc) int i; for_each_runnable_thread(i, vcpu, vc) { - if (vcpu->arch.pending_exceptions || !vcpu->arch.ceded) + if (vcpu->arch.pending_exceptions || !vcpu->arch.ceded || + vcpu->arch.prodded) return 1; } @@ -2807,7 +2804,7 @@ static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) break; n_ceded = 0; for_each_runnable_thread(i, v, vc) { - if (!v->arch.pending_exceptions) + if (!v->arch.pending_exceptions && !v->arch.prodded) n_ceded += v->arch.ceded; else v->arch.ceded = 0; -- cgit v1.2.3 From fcd4f3c6d150357a02af8526e69bfebb82dd5d46 Mon Sep 17 00:00:00 2001 From: Thomas Huth Date: Wed, 25 Jan 2017 13:27:22 +0100 Subject: KVM: PPC: Book3S PR: Refactor program interrupt related code into separate function The function kvmppc_handle_exit_pr() is quite huge and thus hard to read, and even contains a "spaghetti-code"-like goto between the different case labels of the big switch statement. This can be made much more readable by moving the code related to injecting program interrupts / instruction emulation into a separate function instead. Signed-off-by: Thomas Huth Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/book3s_pr.c | 130 +++++++++++++++++++++---------------------- 1 file changed, 65 insertions(+), 65 deletions(-) diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c index 1482961ceb4d..d4dfc0ca2a44 100644 --- a/arch/powerpc/kvm/book3s_pr.c +++ b/arch/powerpc/kvm/book3s_pr.c @@ -902,6 +902,69 @@ static void kvmppc_clear_debug(struct kvm_vcpu *vcpu) } } +static int kvmppc_exit_pr_progint(struct kvm_run *run, struct kvm_vcpu *vcpu, + unsigned int exit_nr) +{ + enum emulation_result er; + ulong flags; + u32 last_inst; + int emul, r; + + /* + * shadow_srr1 only contains valid flags if we came here via a program + * exception. The other exceptions (emulation assist, FP unavailable, + * etc.) do not provide flags in SRR1, so use an illegal-instruction + * exception when injecting a program interrupt into the guest. 
+ */ + if (exit_nr == BOOK3S_INTERRUPT_PROGRAM) + flags = vcpu->arch.shadow_srr1 & 0x1f0000ull; + else + flags = SRR1_PROGILL; + + emul = kvmppc_get_last_inst(vcpu, INST_GENERIC, &last_inst); + if (emul != EMULATE_DONE) + return RESUME_GUEST; + + if (kvmppc_get_msr(vcpu) & MSR_PR) { +#ifdef EXIT_DEBUG + pr_info("Userspace triggered 0x700 exception at\n 0x%lx (0x%x)\n", + kvmppc_get_pc(vcpu), last_inst); +#endif + if ((last_inst & 0xff0007ff) != (INS_DCBZ & 0xfffffff7)) { + kvmppc_core_queue_program(vcpu, flags); + return RESUME_GUEST; + } + } + + vcpu->stat.emulated_inst_exits++; + er = kvmppc_emulate_instruction(run, vcpu); + switch (er) { + case EMULATE_DONE: + r = RESUME_GUEST_NV; + break; + case EMULATE_AGAIN: + r = RESUME_GUEST; + break; + case EMULATE_FAIL: + pr_crit("%s: emulation at %lx failed (%08x)\n", + __func__, kvmppc_get_pc(vcpu), last_inst); + kvmppc_core_queue_program(vcpu, flags); + r = RESUME_GUEST; + break; + case EMULATE_DO_MMIO: + run->exit_reason = KVM_EXIT_MMIO; + r = RESUME_HOST_NV; + break; + case EMULATE_EXIT_USER: + r = RESUME_HOST_NV; + break; + default: + BUG(); + } + + return r; +} + int kvmppc_handle_exit_pr(struct kvm_run *run, struct kvm_vcpu *vcpu, unsigned int exit_nr) { @@ -1044,71 +1107,8 @@ int kvmppc_handle_exit_pr(struct kvm_run *run, struct kvm_vcpu *vcpu, break; case BOOK3S_INTERRUPT_PROGRAM: case BOOK3S_INTERRUPT_H_EMUL_ASSIST: - { - enum emulation_result er; - ulong flags; - u32 last_inst; - int emul; - -program_interrupt: - /* - * shadow_srr1 only contains valid flags if we came here via - * a program exception. The other exceptions (emulation assist, - * FP unavailable, etc.) do not provide flags in SRR1, so use - * an illegal-instruction exception when injecting a program - * interrupt into the guest. - */ - if (exit_nr == BOOK3S_INTERRUPT_PROGRAM) - flags = vcpu->arch.shadow_srr1 & 0x1f0000ull; - else - flags = SRR1_PROGILL; - - emul = kvmppc_get_last_inst(vcpu, INST_GENERIC, &last_inst); - if (emul != EMULATE_DONE) { - r = RESUME_GUEST; - break; - } - - if (kvmppc_get_msr(vcpu) & MSR_PR) { -#ifdef EXIT_DEBUG - pr_info("Userspace triggered 0x700 exception at\n 0x%lx (0x%x)\n", - kvmppc_get_pc(vcpu), last_inst); -#endif - if ((last_inst & 0xff0007ff) != - (INS_DCBZ & 0xfffffff7)) { - kvmppc_core_queue_program(vcpu, flags); - r = RESUME_GUEST; - break; - } - } - - vcpu->stat.emulated_inst_exits++; - er = kvmppc_emulate_instruction(run, vcpu); - switch (er) { - case EMULATE_DONE: - r = RESUME_GUEST_NV; - break; - case EMULATE_AGAIN: - r = RESUME_GUEST; - break; - case EMULATE_FAIL: - printk(KERN_CRIT "%s: emulation at %lx failed (%08x)\n", - __func__, kvmppc_get_pc(vcpu), last_inst); - kvmppc_core_queue_program(vcpu, flags); - r = RESUME_GUEST; - break; - case EMULATE_DO_MMIO: - run->exit_reason = KVM_EXIT_MMIO; - r = RESUME_HOST_NV; - break; - case EMULATE_EXIT_USER: - r = RESUME_HOST_NV; - break; - default: - BUG(); - } + r = kvmppc_exit_pr_progint(run, vcpu, exit_nr); break; - } case BOOK3S_INTERRUPT_SYSCALL: { u32 last_sc; @@ -1185,7 +1185,7 @@ program_interrupt: emul = kvmppc_get_last_inst(vcpu, INST_GENERIC, &last_inst); if (emul == EMULATE_DONE) - goto program_interrupt; + r = kvmppc_exit_pr_progint(run, vcpu, exit_nr); else r = RESUME_GUEST; -- cgit v1.2.3 From ab22a4733fe919d22bc2957680506ed17e40941e Mon Sep 17 00:00:00 2001 From: Junaid Shahid Date: Wed, 21 Dec 2016 20:29:28 -0800 Subject: kvm: x86: mmu: Rename EPT_VIOLATION_READ/WRITE/INSTR constants Rename the EPT_VIOLATION_READ/WRITE/INSTR constants to 
EPT_VIOLATION_ACC_READ/WRITE/INSTR to more clearly indicate that these signify the type of the memory access as opposed to the permissions granted by the PTE. Signed-off-by: Junaid Shahid Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/vmx.h | 12 ++++++------ arch/x86/kvm/vmx.c | 6 +++--- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index a22a4790f1ac..cc54b7026567 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -510,15 +510,15 @@ struct vmx_msr_entry { /* * Exit Qualifications for EPT Violations */ -#define EPT_VIOLATION_READ_BIT 0 -#define EPT_VIOLATION_WRITE_BIT 1 -#define EPT_VIOLATION_INSTR_BIT 2 +#define EPT_VIOLATION_ACC_READ_BIT 0 +#define EPT_VIOLATION_ACC_WRITE_BIT 1 +#define EPT_VIOLATION_ACC_INSTR_BIT 2 #define EPT_VIOLATION_READABLE_BIT 3 #define EPT_VIOLATION_WRITABLE_BIT 4 #define EPT_VIOLATION_EXECUTABLE_BIT 5 -#define EPT_VIOLATION_READ (1 << EPT_VIOLATION_READ_BIT) -#define EPT_VIOLATION_WRITE (1 << EPT_VIOLATION_WRITE_BIT) -#define EPT_VIOLATION_INSTR (1 << EPT_VIOLATION_INSTR_BIT) +#define EPT_VIOLATION_ACC_READ (1 << EPT_VIOLATION_ACC_READ_BIT) +#define EPT_VIOLATION_ACC_WRITE (1 << EPT_VIOLATION_ACC_WRITE_BIT) +#define EPT_VIOLATION_ACC_INSTR (1 << EPT_VIOLATION_ACC_INSTR_BIT) #define EPT_VIOLATION_READABLE (1 << EPT_VIOLATION_READABLE_BIT) #define EPT_VIOLATION_WRITABLE (1 << EPT_VIOLATION_WRITABLE_BIT) #define EPT_VIOLATION_EXECUTABLE (1 << EPT_VIOLATION_EXECUTABLE_BIT) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index c7bafa1457e2..81c301def1af 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -6375,13 +6375,13 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu) trace_kvm_page_fault(gpa, exit_qualification); /* Is it a read fault? */ - error_code = (exit_qualification & EPT_VIOLATION_READ) + error_code = (exit_qualification & EPT_VIOLATION_ACC_READ) ? PFERR_USER_MASK : 0; /* Is it a write fault? */ - error_code |= (exit_qualification & EPT_VIOLATION_WRITE) + error_code |= (exit_qualification & EPT_VIOLATION_ACC_WRITE) ? PFERR_WRITE_MASK : 0; /* Is it a fetch fault? */ - error_code |= (exit_qualification & EPT_VIOLATION_INSTR) + error_code |= (exit_qualification & EPT_VIOLATION_ACC_INSTR) ? PFERR_FETCH_MASK : 0; /* ept page table entry is present? */ error_code |= (exit_qualification & -- cgit v1.2.3 From 312b616b30d87581b88d3db54c14ed89610cc97b Mon Sep 17 00:00:00 2001 From: Junaid Shahid Date: Wed, 21 Dec 2016 20:29:29 -0800 Subject: kvm: x86: mmu: Set SPTE_SPECIAL_MASK within mmu.c Instead of the caller including the SPTE_SPECIAL_MASK in the masks being supplied to kvm_mmu_set_mmio_spte_mask() and kvm_mmu_set_mask_ptes(), those functions now themselves include the SPTE_SPECIAL_MASK. Note that bit 63 is now reset in the default MMIO mask. 
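For reference, a free-standing sketch of the new division of responsibility (simplified versions of kvm_mmu_set_mmio_spte_mask() and the acc-track handling in kvm_mmu_set_mask_ptes(); the bit-62 value comes from the "special bit (62)" comment removed below):

#include <stdint.h>

#define SPTE_SPECIAL_MASK	(1ULL << 62)	/* marks MMIO/special SPTEs */

static uint64_t shadow_mmio_mask;
static uint64_t shadow_acc_track_mask;

/* Callers now pass only the architecture-specific bits; the special
 * bit is folded in here instead of at every call site. */
static void set_mmio_spte_mask(uint64_t mmio_mask)
{
	shadow_mmio_mask = mmio_mask | SPTE_SPECIAL_MASK;
}

static void set_acc_track_mask(uint64_t acc_track_mask)
{
	if (acc_track_mask != 0)
		acc_track_mask |= SPTE_SPECIAL_MASK;
	shadow_acc_track_mask = acc_track_mask;
}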
Signed-off-by: Junaid Shahid Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.c | 5 ++++- arch/x86/kvm/vmx.c | 6 ++---- arch/x86/kvm/x86.c | 3 --- 3 files changed, 6 insertions(+), 8 deletions(-) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 64821ca3a7c3..e3312e22e8db 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -208,7 +208,7 @@ static void mmu_free_roots(struct kvm_vcpu *vcpu); void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask) { - shadow_mmio_mask = mmio_mask; + shadow_mmio_mask = mmio_mask | SPTE_SPECIAL_MASK; } EXPORT_SYMBOL_GPL(kvm_mmu_set_mmio_spte_mask); @@ -318,6 +318,9 @@ void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask, u64 dirty_mask, u64 nx_mask, u64 x_mask, u64 p_mask, u64 acc_track_mask) { + if (acc_track_mask != 0) + acc_track_mask |= SPTE_SPECIAL_MASK; + shadow_user_mask = user_mask; shadow_accessed_mask = accessed_mask; shadow_dirty_mask = dirty_mask; diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 81c301def1af..d850d5d36182 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -5236,10 +5236,8 @@ static void ept_set_mmio_spte_mask(void) /* * EPT Misconfigurations can be generated if the value of bits 2:0 * of an EPT paging-structure entry is 110b (write/execute). - * Also, special bit (62) is set to quickly identify mmio spte. */ - kvm_mmu_set_mmio_spte_mask(SPTE_SPECIAL_MASK | - VMX_EPT_MISCONFIG_WX_VALUE); + kvm_mmu_set_mmio_spte_mask(VMX_EPT_MISCONFIG_WX_VALUE); } #define VMX_XSS_EXIT_BITMAP 0 @@ -6585,7 +6583,7 @@ void vmx_enable_tdp(void) enable_ept_ad_bits ? VMX_EPT_DIRTY_BIT : 0ull, 0ull, VMX_EPT_EXECUTABLE_MASK, cpu_has_vmx_ept_execute_only() ? 0ull : VMX_EPT_READABLE_MASK, - enable_ept_ad_bits ? 0ull : SPTE_SPECIAL_MASK | VMX_EPT_RWX_MASK); + enable_ept_ad_bits ? 0ull : VMX_EPT_RWX_MASK); ept_set_mmio_spte_mask(); kvm_enable_tdp(); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 6e2c71ea0627..4fd4d4f35caf 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -5952,9 +5952,6 @@ static void kvm_set_mmio_spte_mask(void) /* Mask the reserved physical address bits. */ mask = rsvd_bits(maxphyaddr, 51); - /* Bit 62 is always reserved for 32bit host. */ - mask |= 0x3ull << 62; - /* Set the present bit. */ mask |= 1ull; -- cgit v1.2.3 From 20d65236d01cdbe14a88f0e2c0f985669f8c41fc Mon Sep 17 00:00:00 2001 From: Junaid Shahid Date: Wed, 21 Dec 2016 20:29:31 -0800 Subject: kvm: x86: mmu: Update comment in mark_spte_for_access_track Reword the comment to hopefully make it more clear. Signed-off-by: Junaid Shahid Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index e3312e22e8db..e13041ac7cdf 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -708,9 +708,9 @@ static u64 mark_spte_for_access_track(u64 spte) return spte; /* - * Verify that the write-protection that we do below will be fixable - * via the fast page fault path. Currently, that is always the case, at - * least when using EPT (which is when access tracking would be used). + * Making an Access Tracking PTE will result in removal of write access + * from the PTE. So, verify that we will be able to restore the write + * access in the fast page fault path later on. 
*/ WARN_ONCE((spte & PT_WRITABLE_MASK) && !spte_can_locklessly_be_made_writable(spte), -- cgit v1.2.3 From d162f30a7cebe9731fd331419b3a14089d0b41e3 Mon Sep 17 00:00:00 2001 From: Junaid Shahid Date: Wed, 21 Dec 2016 20:29:30 -0800 Subject: kvm: x86: mmu: Move pgtbl walk inside retry loop in fast_page_fault Redo the page table walk in fast_page_fault when retrying so that we are working on the latest PTE even if the hierarchy changes. Signed-off-by: Junaid Shahid Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index e13041ac7cdf..437d16274701 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -3088,14 +3088,16 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level, return false; walk_shadow_page_lockless_begin(vcpu); - for_each_shadow_entry_lockless(vcpu, gva, iterator, spte) - if (!is_shadow_present_pte(spte) || iterator.level < level) - break; do { bool remove_write_prot = false; bool remove_acc_track; + for_each_shadow_entry_lockless(vcpu, gva, iterator, spte) + if (!is_shadow_present_pte(spte) || + iterator.level < level) + break; + sp = page_header(__pa(iterator.sptep)); if (!is_last_spte(spte, sp->role.level)) break; @@ -3176,8 +3178,6 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level, break; } - spte = mmu_spte_get_lockless(iterator.sptep); - } while (true); trace_fast_page_fault(vcpu, gva, error_code, iterator.sptep, -- cgit v1.2.3 From d3e328f2cb01f6f09259a5810baae3edf5416076 Mon Sep 17 00:00:00 2001 From: Junaid Shahid Date: Wed, 21 Dec 2016 20:29:32 -0800 Subject: kvm: x86: mmu: Verify that restored PTE has needed perms in fast page fault Before fast page fault restores an access track PTE back to a regular PTE, it now also verifies that the restored PTE would grant the necessary permissions for the faulting access to succeed. If not, it falls back to the slow page fault path. Signed-off-by: Junaid Shahid Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.c | 127 ++++++++++++++++++++++++++++------------------------- 1 file changed, 68 insertions(+), 59 deletions(-) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 437d16274701..2fd7586aad4d 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -373,6 +373,11 @@ static int is_last_spte(u64 pte, int level) return 0; } +static bool is_executable_pte(u64 spte) +{ + return (spte & (shadow_x_mask | shadow_nx_mask)) == shadow_x_mask; +} + static kvm_pfn_t spte_to_pfn(u64 pte) { return (pte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT; @@ -728,6 +733,23 @@ static u64 mark_spte_for_access_track(u64 spte) return spte; } +/* Restore an acc-track PTE back to a regular PTE */ +static u64 restore_acc_track_spte(u64 spte) +{ + u64 new_spte = spte; + u64 saved_bits = (spte >> shadow_acc_track_saved_bits_shift) + & shadow_acc_track_saved_bits_mask; + + WARN_ON_ONCE(!is_access_track_spte(spte)); + + new_spte &= ~shadow_acc_track_mask; + new_spte &= ~(shadow_acc_track_saved_bits_mask << + shadow_acc_track_saved_bits_shift); + new_spte |= saved_bits; + + return new_spte; +} + /* Returns the Accessed status of the PTE and resets it at the same time. 
*/ static bool mmu_spte_age(u64 *sptep) { @@ -3019,27 +3041,12 @@ static bool page_fault_can_be_fast(u32 error_code) */ static bool fast_pf_fix_direct_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, - u64 *sptep, u64 old_spte, - bool remove_write_prot, bool remove_acc_track) + u64 *sptep, u64 old_spte, u64 new_spte) { gfn_t gfn; - u64 new_spte = old_spte; WARN_ON(!sp->role.direct); - if (remove_acc_track) { - u64 saved_bits = (old_spte >> shadow_acc_track_saved_bits_shift) - & shadow_acc_track_saved_bits_mask; - - new_spte &= ~shadow_acc_track_mask; - new_spte &= ~(shadow_acc_track_saved_bits_mask << - shadow_acc_track_saved_bits_shift); - new_spte |= saved_bits; - } - - if (remove_write_prot) - new_spte |= PT_WRITABLE_MASK; - /* * Theoretically we could also set dirty bit (and flush TLB) here in * order to eliminate unnecessary PML logging. See comments in @@ -3055,7 +3062,7 @@ fast_pf_fix_direct_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, if (cmpxchg64(sptep, old_spte, new_spte) != old_spte) return false; - if (remove_write_prot) { + if (is_writable_pte(new_spte) && !is_writable_pte(old_spte)) { /* * The gfn of direct spte is stable since it is * calculated by sp->gfn. @@ -3067,6 +3074,18 @@ fast_pf_fix_direct_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, return true; } +static bool is_access_allowed(u32 fault_err_code, u64 spte) +{ + if (fault_err_code & PFERR_FETCH_MASK) + return is_executable_pte(spte); + + if (fault_err_code & PFERR_WRITE_MASK) + return is_writable_pte(spte); + + /* Fault was on Read access */ + return spte & PT_PRESENT_MASK; +} + /* * Return value: * - true: let the vcpu to access on the same address again. @@ -3090,8 +3109,7 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level, walk_shadow_page_lockless_begin(vcpu); do { - bool remove_write_prot = false; - bool remove_acc_track; + u64 new_spte; for_each_shadow_entry_lockless(vcpu, gva, iterator, spte) if (!is_shadow_present_pte(spte) || @@ -3112,52 +3130,44 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level, * Need not check the access of upper level table entries since * they are always ACC_ALL. */ + if (is_access_allowed(error_code, spte)) { + fault_handled = true; + break; + } - if (error_code & PFERR_FETCH_MASK) { - if ((spte & (shadow_x_mask | shadow_nx_mask)) - == shadow_x_mask) { - fault_handled = true; - break; - } - } else if (error_code & PFERR_WRITE_MASK) { - if (is_writable_pte(spte)) { - fault_handled = true; - break; - } + new_spte = spte; + + if (is_access_track_spte(spte)) + new_spte = restore_acc_track_spte(new_spte); + + /* + * Currently, to simplify the code, write-protection can + * be removed in the fast path only if the SPTE was + * write-protected for dirty-logging or access tracking. + */ + if ((error_code & PFERR_WRITE_MASK) && + spte_can_locklessly_be_made_writable(spte)) + { + new_spte |= PT_WRITABLE_MASK; /* - * Currently, to simplify the code, write-protection can - * be removed in the fast path only if the SPTE was - * write-protected for dirty-logging. + * Do not fix write-permission on the large spte. Since + * we only dirty the first page into the dirty-bitmap in + * fast_pf_fix_direct_spte(), other pages are missed + * if its slot has dirty logging enabled. + * + * Instead, we let the slow page fault path create a + * normal spte to fix the access. + * + * See the comments in kvm_arch_commit_memory_region(). 
*/ - remove_write_prot = - spte_can_locklessly_be_made_writable(spte); - } else { - /* Fault was on Read access */ - if (spte & PT_PRESENT_MASK) { - fault_handled = true; + if (sp->role.level > PT_PAGE_TABLE_LEVEL) break; - } } - remove_acc_track = is_access_track_spte(spte); - /* Verify that the fault can be handled in the fast path */ - if (!remove_acc_track && !remove_write_prot) - break; - - /* - * Do not fix write-permission on the large spte since we only - * dirty the first page into the dirty-bitmap in - * fast_pf_fix_direct_spte() that means other pages are missed - * if its slot is dirty-logged. - * - * Instead, we let the slow page fault path create a normal spte - * to fix the access. - * - * See the comments in kvm_arch_commit_memory_region(). - */ - if (sp->role.level > PT_PAGE_TABLE_LEVEL && remove_write_prot) + if (new_spte == spte || + !is_access_allowed(error_code, new_spte)) break; /* @@ -3167,8 +3177,7 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level, */ fault_handled = fast_pf_fix_direct_spte(vcpu, sp, iterator.sptep, spte, - remove_write_prot, - remove_acc_track); + new_spte); if (fault_handled) break; -- cgit v1.2.3 From a679c547d19ded9b7d1a88e17cf5e5b69ac619b6 Mon Sep 17 00:00:00 2001 From: Christian Borntraeger Date: Thu, 15 Dec 2016 15:58:14 +0100 Subject: KVM: s390: gaccess: add ESOP2 handling When we access guest memory and run into a protection exception, we need to pass the exception data to the guest. ESOP2 provides detailed information about all protection exceptions which ESOP1 only partially provided. The gaccess changes make sure, that the guest always gets all available information. Signed-off-by: Janosch Frank Reviewed-by: David Hildenbrand Signed-off-by: Christian Borntraeger --- arch/s390/kvm/gaccess.c | 12 +++++++++--- arch/s390/tools/gen_facilities.c | 1 + 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/arch/s390/kvm/gaccess.c b/arch/s390/kvm/gaccess.c index 4aa8a7e2a1da..6e94705efd4e 100644 --- a/arch/s390/kvm/gaccess.c +++ b/arch/s390/kvm/gaccess.c @@ -465,7 +465,9 @@ static int ar_translation(struct kvm_vcpu *vcpu, union asce *asce, ar_t ar, struct trans_exc_code_bits { unsigned long addr : 52; /* Translation-exception Address */ unsigned long fsi : 2; /* Access Exception Fetch/Store Indication */ - unsigned long : 6; + unsigned long : 2; + unsigned long b56 : 1; + unsigned long : 3; unsigned long b60 : 1; unsigned long b61 : 1; unsigned long as : 2; /* ASCE Identifier */ @@ -497,14 +499,18 @@ static int trans_exc(struct kvm_vcpu *vcpu, int code, unsigned long gva, switch (code) { case PGM_PROTECTION: switch (prot) { + case PROT_TYPE_LA: + tec->b56 = 1; + break; + case PROT_TYPE_KEYC: + tec->b60 = 1; + break; case PROT_TYPE_ALC: tec->b60 = 1; /* FALL THROUGH */ case PROT_TYPE_DAT: tec->b61 = 1; break; - default: /* LA and KEYC set b61 to 0, other params undefined */ - return code; } /* FALL THROUGH */ case PGM_ASCE_TYPE: diff --git a/arch/s390/tools/gen_facilities.c b/arch/s390/tools/gen_facilities.c index 8cc53b1e6d03..b6ac8dfee001 100644 --- a/arch/s390/tools/gen_facilities.c +++ b/arch/s390/tools/gen_facilities.c @@ -80,6 +80,7 @@ static struct facility_def facility_defs[] = { 76, /* msa extension 3 */ 77, /* msa extension 4 */ 78, /* enhanced-DAT 2 */ + 131, /* enhanced-SOP 2 and side-effect */ -1 /* END */ } }, -- cgit v1.2.3 From cd1836f583d78bdd15ef748f4d85bf007569c7ad Mon Sep 17 00:00:00 2001 From: Janosch Frank Date: Thu, 4 Aug 2016 09:57:36 +0200 Subject: KVM: s390: 
instruction-execution-protection support The new Instruction Execution Protection needs to be enabled before the guest can use it. Therefore we pass the IEP facility bit to the guest and enable IEP interpretation. Signed-off-by: Janosch Frank Reviewed-by: David Hildenbrand Signed-off-by: Christian Borntraeger --- arch/s390/kvm/kvm-s390.c | 2 ++ arch/s390/kvm/vsie.c | 3 +++ arch/s390/tools/gen_facilities.c | 1 + 3 files changed, 6 insertions(+) diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index bec71e902be3..e5130818fb03 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -1938,6 +1938,8 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) if (test_kvm_facility(vcpu->kvm, 8) && sclp.has_pfmfi) vcpu->arch.sie_block->ecb2 |= 0x08; + if (test_kvm_facility(vcpu->kvm, 130)) + vcpu->arch.sie_block->ecb2 |= 0x20; vcpu->arch.sie_block->eca = 0x1002000U; if (sclp.has_cei) vcpu->arch.sie_block->eca |= 0x80000000U; diff --git a/arch/s390/kvm/vsie.c b/arch/s390/kvm/vsie.c index d8673e243f13..ed62c6d57d93 100644 --- a/arch/s390/kvm/vsie.c +++ b/arch/s390/kvm/vsie.c @@ -324,6 +324,9 @@ static int shadow_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) /* Run-time-Instrumentation */ if (test_kvm_facility(vcpu->kvm, 64)) scb_s->ecb3 |= scb_o->ecb3 & 0x01U; + /* Instruction Execution Prevention */ + if (test_kvm_facility(vcpu->kvm, 130)) + scb_s->ecb2 |= scb_o->ecb2 & 0x20U; if (test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_SIIF)) scb_s->eca |= scb_o->eca & 0x00000001U; if (test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_IB)) diff --git a/arch/s390/tools/gen_facilities.c b/arch/s390/tools/gen_facilities.c index b6ac8dfee001..0cf802de52a1 100644 --- a/arch/s390/tools/gen_facilities.c +++ b/arch/s390/tools/gen_facilities.c @@ -80,6 +80,7 @@ static struct facility_def facility_defs[] = { 76, /* msa extension 3 */ 77, /* msa extension 4 */ 78, /* enhanced-DAT 2 */ + 130, /* instruction-execution-protection */ 131, /* enhanced-SOP 2 and side-effect */ -1 /* END */ } -- cgit v1.2.3 From d051ae531324fb5130366d47e05bf8eadeb95535 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Tue, 13 Dec 2016 14:25:32 +0100 Subject: KVM: s390: get rid of bogus cc initialization The plo inline assembly has a cc output operand that is always written to and is also as such an operand declared. Therefore the compiler is free to omit the rather pointless and misleading initialization. Get rid of this. Signed-off-by: Heiko Carstens Signed-off-by: Christian Borntraeger --- arch/s390/kvm/kvm-s390.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index e5130818fb03..4f74511015b8 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -217,7 +217,7 @@ static void allow_cpu_feat(unsigned long nr) static inline int plo_test_bit(unsigned char nr) { register unsigned long r0 asm("0") = (unsigned long) nr | 0x100; - int cc = 3; /* subfunction not available */ + int cc; asm volatile( /* Parameter registers are ignored for "test bit" */ -- cgit v1.2.3 From 27f67f8727843fbbbcd05a003183af79693759e9 Mon Sep 17 00:00:00 2001 From: Christian Borntraeger Date: Fri, 9 Dec 2016 12:44:40 +0100 Subject: KVM: s390: Get rid of ar_t sparse with __CHECK_ENDIAN__ shows that ar_t was never properly used across KVM on s390. 
We can now: - fix all places - do not make ar_t special Since ar_t is just used as a register number (no endianness issues for u8), and all other register numbers are also just plain int variables, let's just use u8, which matches the __u8 in the userspace ABI for the memop ioctl. Signed-off-by: Christian Borntraeger Acked-by: Janosch Frank Reviewed-by: Cornelia Huck --- arch/s390/kvm/gaccess.c | 14 +++++++------- arch/s390/kvm/gaccess.h | 10 +++++----- arch/s390/kvm/kvm-s390.h | 10 ++++------ arch/s390/kvm/priv.c | 30 +++++++++++++++--------------- 4 files changed, 31 insertions(+), 33 deletions(-) diff --git a/arch/s390/kvm/gaccess.c b/arch/s390/kvm/gaccess.c index 6e94705efd4e..4492c9363178 100644 --- a/arch/s390/kvm/gaccess.c +++ b/arch/s390/kvm/gaccess.c @@ -373,7 +373,7 @@ void ipte_unlock(struct kvm_vcpu *vcpu) ipte_unlock_simple(vcpu); } -static int ar_translation(struct kvm_vcpu *vcpu, union asce *asce, ar_t ar, +static int ar_translation(struct kvm_vcpu *vcpu, union asce *asce, u8 ar, enum gacc_mode mode) { union alet alet; @@ -487,7 +487,7 @@ enum prot_type { }; static int trans_exc(struct kvm_vcpu *vcpu, int code, unsigned long gva, - ar_t ar, enum gacc_mode mode, enum prot_type prot) + u8 ar, enum gacc_mode mode, enum prot_type prot) { struct kvm_s390_pgm_info *pgm = &vcpu->arch.pgm; struct trans_exc_code_bits *tec; @@ -545,7 +545,7 @@ static int trans_exc(struct kvm_vcpu *vcpu, int code, unsigned long gva, } static int get_vcpu_asce(struct kvm_vcpu *vcpu, union asce *asce, - unsigned long ga, ar_t ar, enum gacc_mode mode) + unsigned long ga, u8 ar, enum gacc_mode mode) { int rc; struct psw_bits psw = psw_bits(vcpu->arch.sie_block->gpsw); @@ -777,7 +777,7 @@ static int low_address_protection_enabled(struct kvm_vcpu *vcpu, return 1; } -static int guest_page_range(struct kvm_vcpu *vcpu, unsigned long ga, ar_t ar, +static int guest_page_range(struct kvm_vcpu *vcpu, unsigned long ga, u8 ar, unsigned long *pages, unsigned long nr_pages, const union asce asce, enum gacc_mode mode) { @@ -809,7 +809,7 @@ static int guest_page_range(struct kvm_vcpu *vcpu, unsigned long ga, ar_t ar, return 0; } -int access_guest(struct kvm_vcpu *vcpu, unsigned long ga, ar_t ar, void *data, +int access_guest(struct kvm_vcpu *vcpu, unsigned long ga, u8 ar, void *data, unsigned long len, enum gacc_mode mode) { psw_t *psw = &vcpu->arch.sie_block->gpsw; @@ -883,7 +883,7 @@ int access_guest_real(struct kvm_vcpu *vcpu, unsigned long gra, * Note: The IPTE lock is not taken during this function, so the caller * has to take care of this. 
*/ -int guest_translate_address(struct kvm_vcpu *vcpu, unsigned long gva, ar_t ar, +int guest_translate_address(struct kvm_vcpu *vcpu, unsigned long gva, u8 ar, unsigned long *gpa, enum gacc_mode mode) { psw_t *psw = &vcpu->arch.sie_block->gpsw; @@ -916,7 +916,7 @@ int guest_translate_address(struct kvm_vcpu *vcpu, unsigned long gva, ar_t ar, /** * check_gva_range - test a range of guest virtual addresses for accessibility */ -int check_gva_range(struct kvm_vcpu *vcpu, unsigned long gva, ar_t ar, +int check_gva_range(struct kvm_vcpu *vcpu, unsigned long gva, u8 ar, unsigned long length, enum gacc_mode mode) { unsigned long gpa; diff --git a/arch/s390/kvm/gaccess.h b/arch/s390/kvm/gaccess.h index 8756569ad938..5c9cc18f3b4a 100644 --- a/arch/s390/kvm/gaccess.h +++ b/arch/s390/kvm/gaccess.h @@ -162,11 +162,11 @@ enum gacc_mode { }; int guest_translate_address(struct kvm_vcpu *vcpu, unsigned long gva, - ar_t ar, unsigned long *gpa, enum gacc_mode mode); -int check_gva_range(struct kvm_vcpu *vcpu, unsigned long gva, ar_t ar, + u8 ar, unsigned long *gpa, enum gacc_mode mode); +int check_gva_range(struct kvm_vcpu *vcpu, unsigned long gva, u8 ar, unsigned long length, enum gacc_mode mode); -int access_guest(struct kvm_vcpu *vcpu, unsigned long ga, ar_t ar, void *data, +int access_guest(struct kvm_vcpu *vcpu, unsigned long ga, u8 ar, void *data, unsigned long len, enum gacc_mode mode); int access_guest_real(struct kvm_vcpu *vcpu, unsigned long gra, @@ -218,7 +218,7 @@ int access_guest_real(struct kvm_vcpu *vcpu, unsigned long gra, * if data has been changed in guest space in case of an exception. */ static inline __must_check -int write_guest(struct kvm_vcpu *vcpu, unsigned long ga, ar_t ar, void *data, +int write_guest(struct kvm_vcpu *vcpu, unsigned long ga, u8 ar, void *data, unsigned long len) { return access_guest(vcpu, ga, ar, data, len, GACC_STORE); @@ -238,7 +238,7 @@ int write_guest(struct kvm_vcpu *vcpu, unsigned long ga, ar_t ar, void *data, * data will be copied from guest space to kernel space. 
*/ static inline __must_check -int read_guest(struct kvm_vcpu *vcpu, unsigned long ga, ar_t ar, void *data, +int read_guest(struct kvm_vcpu *vcpu, unsigned long ga, u8 ar, void *data, unsigned long len) { return access_guest(vcpu, ga, ar, data, len, GACC_FETCH); diff --git a/arch/s390/kvm/kvm-s390.h b/arch/s390/kvm/kvm-s390.h index 3a4e97f1a9e6..22a0a7ceffad 100644 --- a/arch/s390/kvm/kvm-s390.h +++ b/arch/s390/kvm/kvm-s390.h @@ -86,9 +86,7 @@ static inline void kvm_s390_set_prefix(struct kvm_vcpu *vcpu, u32 prefix) kvm_make_request(KVM_REQ_MMU_RELOAD, vcpu); } -typedef u8 __bitwise ar_t; - -static inline u64 kvm_s390_get_base_disp_s(struct kvm_vcpu *vcpu, ar_t *ar) +static inline u64 kvm_s390_get_base_disp_s(struct kvm_vcpu *vcpu, u8 *ar) { u32 base2 = vcpu->arch.sie_block->ipb >> 28; u32 disp2 = ((vcpu->arch.sie_block->ipb & 0x0fff0000) >> 16); @@ -101,7 +99,7 @@ static inline u64 kvm_s390_get_base_disp_s(struct kvm_vcpu *vcpu, ar_t *ar) static inline void kvm_s390_get_base_disp_sse(struct kvm_vcpu *vcpu, u64 *address1, u64 *address2, - ar_t *ar_b1, ar_t *ar_b2) + u8 *ar_b1, u8 *ar_b2) { u32 base1 = (vcpu->arch.sie_block->ipb & 0xf0000000) >> 28; u32 disp1 = (vcpu->arch.sie_block->ipb & 0x0fff0000) >> 16; @@ -125,7 +123,7 @@ static inline void kvm_s390_get_regs_rre(struct kvm_vcpu *vcpu, int *r1, int *r2 *r2 = (vcpu->arch.sie_block->ipb & 0x000f0000) >> 16; } -static inline u64 kvm_s390_get_base_disp_rsy(struct kvm_vcpu *vcpu, ar_t *ar) +static inline u64 kvm_s390_get_base_disp_rsy(struct kvm_vcpu *vcpu, u8 *ar) { u32 base2 = vcpu->arch.sie_block->ipb >> 28; u32 disp2 = ((vcpu->arch.sie_block->ipb & 0x0fff0000) >> 16) + @@ -140,7 +138,7 @@ static inline u64 kvm_s390_get_base_disp_rsy(struct kvm_vcpu *vcpu, ar_t *ar) return (base2 ? vcpu->run->s.regs.gprs[base2] : 0) + (long)(int)disp2; } -static inline u64 kvm_s390_get_base_disp_rs(struct kvm_vcpu *vcpu, ar_t *ar) +static inline u64 kvm_s390_get_base_disp_rs(struct kvm_vcpu *vcpu, u8 *ar) { u32 base2 = vcpu->arch.sie_block->ipb >> 28; u32 disp2 = ((vcpu->arch.sie_block->ipb & 0x0fff0000) >> 16); diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c index e18435355c16..1ecc1cffdf7c 100644 --- a/arch/s390/kvm/priv.c +++ b/arch/s390/kvm/priv.c @@ -54,7 +54,7 @@ int kvm_s390_handle_aa(struct kvm_vcpu *vcpu) static int handle_set_clock(struct kvm_vcpu *vcpu) { int rc; - ar_t ar; + u8 ar; u64 op2, val; if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE) @@ -79,7 +79,7 @@ static int handle_set_prefix(struct kvm_vcpu *vcpu) u64 operand2; u32 address; int rc; - ar_t ar; + u8 ar; vcpu->stat.instruction_spx++; @@ -117,7 +117,7 @@ static int handle_store_prefix(struct kvm_vcpu *vcpu) u64 operand2; u32 address; int rc; - ar_t ar; + u8 ar; vcpu->stat.instruction_stpx++; @@ -147,7 +147,7 @@ static int handle_store_cpu_address(struct kvm_vcpu *vcpu) u16 vcpu_id = vcpu->vcpu_id; u64 ga; int rc; - ar_t ar; + u8 ar; vcpu->stat.instruction_stap++; @@ -380,7 +380,7 @@ static int handle_tpi(struct kvm_vcpu *vcpu) u32 tpi_data[3]; int rc; u64 addr; - ar_t ar; + u8 ar; addr = kvm_s390_get_base_disp_s(vcpu, &ar); if (addr & 3) @@ -548,7 +548,7 @@ int kvm_s390_handle_lpsw(struct kvm_vcpu *vcpu) psw_compat_t new_psw; u64 addr; int rc; - ar_t ar; + u8 ar; if (gpsw->mask & PSW_MASK_PSTATE) return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP); @@ -575,7 +575,7 @@ static int handle_lpswe(struct kvm_vcpu *vcpu) psw_t new_psw; u64 addr; int rc; - ar_t ar; + u8 ar; if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE) return 
kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP); @@ -597,7 +597,7 @@ static int handle_stidp(struct kvm_vcpu *vcpu) u64 stidp_data = vcpu->kvm->arch.model.cpuid; u64 operand2; int rc; - ar_t ar; + u8 ar; vcpu->stat.instruction_stidp++; @@ -644,7 +644,7 @@ static void handle_stsi_3_2_2(struct kvm_vcpu *vcpu, struct sysinfo_3_2_2 *mem) ASCEBC(mem->vm[0].cpi, 16); } -static void insert_stsi_usr_data(struct kvm_vcpu *vcpu, u64 addr, ar_t ar, +static void insert_stsi_usr_data(struct kvm_vcpu *vcpu, u64 addr, u8 ar, u8 fc, u8 sel1, u16 sel2) { vcpu->run->exit_reason = KVM_EXIT_S390_STSI; @@ -663,7 +663,7 @@ static int handle_stsi(struct kvm_vcpu *vcpu) unsigned long mem = 0; u64 operand2; int rc = 0; - ar_t ar; + u8 ar; vcpu->stat.instruction_stsi++; VCPU_EVENT(vcpu, 3, "STSI: fc: %u sel1: %u sel2: %u", fc, sel1, sel2); @@ -970,7 +970,7 @@ int kvm_s390_handle_lctl(struct kvm_vcpu *vcpu) int reg, rc, nr_regs; u32 ctl_array[16]; u64 ga; - ar_t ar; + u8 ar; vcpu->stat.instruction_lctl++; @@ -1009,7 +1009,7 @@ int kvm_s390_handle_stctl(struct kvm_vcpu *vcpu) int reg, rc, nr_regs; u32 ctl_array[16]; u64 ga; - ar_t ar; + u8 ar; vcpu->stat.instruction_stctl++; @@ -1043,7 +1043,7 @@ static int handle_lctlg(struct kvm_vcpu *vcpu) int reg, rc, nr_regs; u64 ctl_array[16]; u64 ga; - ar_t ar; + u8 ar; vcpu->stat.instruction_lctlg++; @@ -1081,7 +1081,7 @@ static int handle_stctg(struct kvm_vcpu *vcpu) int reg, rc, nr_regs; u64 ctl_array[16]; u64 ga; - ar_t ar; + u8 ar; vcpu->stat.instruction_stctg++; @@ -1132,7 +1132,7 @@ static int handle_tprot(struct kvm_vcpu *vcpu) unsigned long hva, gpa; int ret = 0, cc = 0; bool writable; - ar_t ar; + u8 ar; vcpu->stat.instruction_tprot++; -- cgit v1.2.3 From 53743aa7f14671dea6f3567ddca2f7d97454f3fe Mon Sep 17 00:00:00 2001 From: Maxim Samoylov Date: Wed, 10 Feb 2016 10:31:23 +0100 Subject: KVM: s390: Introduce Vector Enhancements facility 1 to the guest We can directly forward the vector enhancement facility 1 to the guest if available and VX is requested by user space. Please note that user space will have to take care of the final state of the facility bit when migrating to older machines. Reviewed-by: David Hildenbrand Signed-off-by: Maxim Samoylov Signed-off-by: Christian Borntraeger --- arch/s390/kvm/kvm-s390.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 4f74511015b8..1fd4b854efdc 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -505,6 +505,10 @@ static int kvm_vm_ioctl_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap) } else if (MACHINE_HAS_VX) { set_kvm_facility(kvm->arch.model.fac_mask, 129); set_kvm_facility(kvm->arch.model.fac_list, 129); + if (test_facility(135)) { + set_kvm_facility(kvm->arch.model.fac_mask, 135); + set_kvm_facility(kvm->arch.model.fac_list, 135); + } r = 0; } else r = -EINVAL; -- cgit v1.2.3 From 2f87d942be9d0f86e44fbcbd473264c26c7f1809 Mon Sep 17 00:00:00 2001 From: Guenther Hutzl Date: Fri, 3 Jun 2016 14:37:17 +0200 Subject: KVM: s390: Introduce BCD Vector Instructions to the guest We can directly forward the vector BCD instructions to the guest if available and VX is requested by user space. Please note that user space will have to take care of the final state of the facility bit when migrating to older machines. 
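For reference, a simplified sketch of the facility-forwarding pattern used by this patch and the previous one (the bitmap layout and helpers below are illustrative only; the real code uses set_kvm_facility() on MSB-first s390 facility lists):

#include <stdbool.h>

/* Illustrative facility bitmaps; the bit numbering here is plain
 * LSB-first and does not match the real s390 facility-list layout. */
struct guest_cpu_model {
	unsigned long fac_mask[4];
	unsigned long fac_list[4];
};

static void set_facility_bit(unsigned long *map, unsigned int nr)
{
	map[nr / (8 * sizeof(unsigned long))] |=
		1UL << (nr % (8 * sizeof(unsigned long)));
}

/* When user space asks for vector support (facility 129), dependent
 * facilities are forwarded only if the host has them: 134 is vector
 * BCD, 135 is vector enhancements facility 1. */
static void forward_vx_facilities(struct guest_cpu_model *m,
				  bool host_has_134, bool host_has_135)
{
	set_facility_bit(m->fac_mask, 129);
	set_facility_bit(m->fac_list, 129);
	if (host_has_134) {
		set_facility_bit(m->fac_mask, 134);
		set_facility_bit(m->fac_list, 134);
	}
	if (host_has_135) {
		set_facility_bit(m->fac_mask, 135);
		set_facility_bit(m->fac_list, 135);
	}
}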
Signed-off-by: Guenther Hutzl Reviewed-by: Christian Borntraeger Reviewed-by: David Hildenbrand Signed-off-by: Christian Borntraeger --- arch/s390/kvm/kvm-s390.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 1fd4b854efdc..69401b8d4521 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -505,6 +505,10 @@ static int kvm_vm_ioctl_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap) } else if (MACHINE_HAS_VX) { set_kvm_facility(kvm->arch.model.fac_mask, 129); set_kvm_facility(kvm->arch.model.fac_list, 129); + if (test_facility(134)) { + set_kvm_facility(kvm->arch.model.fac_mask, 134); + set_kvm_facility(kvm->arch.model.fac_list, 134); + } if (test_facility(135)) { set_kvm_facility(kvm->arch.model.fac_mask, 135); set_kvm_facility(kvm->arch.model.fac_list, 135); -- cgit v1.2.3 From f41711788c9c281a61c8cf3222dca8a0e74a4fb3 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Tue, 24 May 2016 12:33:52 +0200 Subject: KVM: s390: guestdbg: filter i-fetch events on icpts We already filter PER events reported via icpt code 8. For icpt code 4 and 56, this is still missing. So let's properly detect if we have a debugging event and if we have to inject a PER i-fetch event into the guest at all. Signed-off-by: David Hildenbrand Reviewed-by: Christian Borntraeger Cc: David Hildenbrand Signed-off-by: Christian Borntraeger --- arch/s390/kvm/guestdbg.c | 24 +++++++++++++++++++----- 1 file changed, 19 insertions(+), 5 deletions(-) diff --git a/arch/s390/kvm/guestdbg.c b/arch/s390/kvm/guestdbg.c index d7c6a7f53ced..a2077833ab01 100644 --- a/arch/s390/kvm/guestdbg.c +++ b/arch/s390/kvm/guestdbg.c @@ -388,14 +388,13 @@ void kvm_s390_prepare_debug_exit(struct kvm_vcpu *vcpu) #define per_write_wp_event(code) \ (code & (PER_CODE_STORE | PER_CODE_STORE_REAL)) -static int debug_exit_required(struct kvm_vcpu *vcpu) +static int debug_exit_required(struct kvm_vcpu *vcpu, u8 perc, + unsigned long peraddr) { - u8 perc = vcpu->arch.sie_block->perc; struct kvm_debug_exit_arch *debug_exit = &vcpu->run->debug.arch; struct kvm_hw_wp_info_arch *wp_info = NULL; struct kvm_hw_bp_info_arch *bp_info = NULL; unsigned long addr = vcpu->arch.sie_block->gpsw.addr; - unsigned long peraddr = vcpu->arch.sie_block->peraddr; if (guestdbg_hw_bp_enabled(vcpu)) { if (per_write_wp_event(perc) && @@ -442,6 +441,8 @@ exit_required: int kvm_s390_handle_per_ifetch_icpt(struct kvm_vcpu *vcpu) { + const u64 cr10 = vcpu->arch.sie_block->gcr[10]; + const u64 cr11 = vcpu->arch.sie_block->gcr[11]; const u8 ilen = kvm_s390_get_ilen(vcpu); struct kvm_s390_pgm_info pgm_info = { .code = PGM_PER, @@ -454,7 +455,19 @@ int kvm_s390_handle_per_ifetch_icpt(struct kvm_vcpu *vcpu) * instruction generated a PER i-fetch event. PER address therefore * points at the previous PSW address (could be an EXECUTE function). 
*/ - return kvm_s390_inject_prog_irq(vcpu, &pgm_info); + if (!guestdbg_enabled(vcpu)) + return kvm_s390_inject_prog_irq(vcpu, &pgm_info); + + if (debug_exit_required(vcpu, pgm_info.per_code, pgm_info.per_address)) + vcpu->guest_debug |= KVM_GUESTDBG_EXIT_PENDING; + + if (!guest_per_enabled(vcpu) || + !(vcpu->arch.sie_block->gcr[9] & PER_EVENT_IFETCH)) + return 0; + + if (in_addr_range(pgm_info.per_address, cr10, cr11)) + return kvm_s390_inject_prog_irq(vcpu, &pgm_info); + return 0; } static void filter_guest_per_event(struct kvm_vcpu *vcpu) @@ -500,7 +513,8 @@ void kvm_s390_handle_per_event(struct kvm_vcpu *vcpu) { int new_as; - if (debug_exit_required(vcpu)) + if (debug_exit_required(vcpu, vcpu->arch.sie_block->perc, + vcpu->arch.sie_block->peraddr)) vcpu->guest_debug |= KVM_GUESTDBG_EXIT_PENDING; filter_guest_per_event(vcpu); -- cgit v1.2.3 From 3fa8cad7402cfe982a60d746609e89aafe15d131 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Tue, 24 May 2016 12:00:49 +0200 Subject: KVM: s390: prepare to read random guest instructions We will have to read instructions not residing at the current PSW address. Reviewed-by: Eric Farman Signed-off-by: David Hildenbrand Cc: David Hildenbrand Signed-off-by: Christian Borntraeger --- arch/s390/kvm/gaccess.h | 9 +++++---- arch/s390/kvm/kvm-s390.c | 2 +- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/arch/s390/kvm/gaccess.h b/arch/s390/kvm/gaccess.h index 5c9cc18f3b4a..7ce47fd36f28 100644 --- a/arch/s390/kvm/gaccess.h +++ b/arch/s390/kvm/gaccess.h @@ -247,10 +247,11 @@ int read_guest(struct kvm_vcpu *vcpu, unsigned long ga, u8 ar, void *data, /** * read_guest_instr - copy instruction data from guest space to kernel space * @vcpu: virtual cpu + * @ga: guest address * @data: destination address in kernel space * @len: number of bytes to copy * - * Copy @len bytes from the current psw address (guest space) to @data (kernel + * Copy @len bytes from the given address (guest space) to @data (kernel * space). * * The behaviour of read_guest_instr is identical to read_guest, except that @@ -258,10 +259,10 @@ int read_guest(struct kvm_vcpu *vcpu, unsigned long ga, u8 ar, void *data, * address-space mode. */ static inline __must_check -int read_guest_instr(struct kvm_vcpu *vcpu, void *data, unsigned long len) +int read_guest_instr(struct kvm_vcpu *vcpu, unsigned long ga, void *data, + unsigned long len) { - return access_guest(vcpu, vcpu->arch.sie_block->gpsw.addr, 0, data, len, - GACC_IFETCH); + return access_guest(vcpu, ga, 0, data, len, GACC_IFETCH); } /** diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 69401b8d4521..66e73f4ed64b 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -2588,7 +2588,7 @@ static int vcpu_post_run_fault_in_sie(struct kvm_vcpu *vcpu) * to look up the current opcode to get the length of the instruction * to be able to forward the PSW. */ - rc = read_guest_instr(vcpu, &opcode, 1); + rc = read_guest_instr(vcpu, vcpu->arch.sie_block->gpsw.addr, &opcode, 1); ilen = insn_length(opcode); if (rc < 0) { return rc; -- cgit v1.2.3 From a69cbe81b2f38437113c05019a134a4731a3aa78 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Tue, 24 May 2016 12:40:11 +0200 Subject: KVM: s390: guestdbg: filter PER i-fetch on EXECUTE properly When we get a PER i-fetch event on an EXECUTE or EXECUTE RELATIVE LONG instruction, because the executed instruction generated a PER i-fetch event, then the PER address points at the EXECUTE function, not the fetched one. 
Therefore, when filtering PER events, we have to take care of the really fetched instruction, which we can only get by reading in guest virtual memory. For icpt code 4 and 56, we directly have additional information about an EXECUTE instruction at hand. For icpt code 8, we always have to read in guest virtual memory. Signed-off-by: David Hildenbrand Reviewed-by: Christian Borntraeger Signed-off-by: Christian Borntraeger [small fixes] --- arch/s390/kvm/guestdbg.c | 98 ++++++++++++++++++++++++++++++++++++++++++----- arch/s390/kvm/intercept.c | 4 +- arch/s390/kvm/kvm-s390.h | 2 +- 3 files changed, 93 insertions(+), 11 deletions(-) diff --git a/arch/s390/kvm/guestdbg.c b/arch/s390/kvm/guestdbg.c index a2077833ab01..23d9a4e12da1 100644 --- a/arch/s390/kvm/guestdbg.c +++ b/arch/s390/kvm/guestdbg.c @@ -436,6 +436,64 @@ exit_required: return 1; } +static int per_fetched_addr(struct kvm_vcpu *vcpu, unsigned long *addr) +{ + u8 exec_ilen = 0; + u16 opcode[3]; + int rc; + + if (vcpu->arch.sie_block->icptcode == ICPT_PROGI) { + /* PER address references the fetched or the execute instr */ + *addr = vcpu->arch.sie_block->peraddr; + /* + * Manually detect if we have an EXECUTE instruction. As + * instructions are always 2 byte aligned we can read the + * first two bytes unconditionally + */ + rc = read_guest_instr(vcpu, *addr, &opcode, 2); + if (rc) + return rc; + if (opcode[0] >> 8 == 0x44) + exec_ilen = 4; + if ((opcode[0] & 0xff0f) == 0xc600) + exec_ilen = 6; + } else { + /* instr was suppressed, calculate the responsible instr */ + *addr = __rewind_psw(vcpu->arch.sie_block->gpsw, + kvm_s390_get_ilen(vcpu)); + if (vcpu->arch.sie_block->icptstatus & 0x01) { + exec_ilen = (vcpu->arch.sie_block->icptstatus & 0x60) >> 4; + if (!exec_ilen) + exec_ilen = 4; + } + } + + if (exec_ilen) { + /* read the complete EXECUTE instr to detect the fetched addr */ + rc = read_guest_instr(vcpu, *addr, &opcode, exec_ilen); + if (rc) + return rc; + if (exec_ilen == 6) { + /* EXECUTE RELATIVE LONG - RIL-b format */ + s32 rl = *((s32 *) (opcode + 1)); + + /* rl is a _signed_ 32 bit value specifying halfwords */ + *addr += (u64)(s64) rl * 2; + } else { + /* EXECUTE - RX-a format */ + u32 base = (opcode[1] & 0xf000) >> 12; + u32 disp = opcode[1] & 0x0fff; + u32 index = opcode[0] & 0x000f; + + *addr = base ? vcpu->run->s.regs.gprs[base] : 0; + *addr += index ? 
vcpu->run->s.regs.gprs[index] : 0; + *addr += disp; + } + *addr = kvm_s390_logical_to_effective(vcpu, *addr); + } + return 0; +} + #define guest_per_enabled(vcpu) \ (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PER) @@ -449,6 +507,8 @@ int kvm_s390_handle_per_ifetch_icpt(struct kvm_vcpu *vcpu) .per_code = PER_CODE_IFETCH, .per_address = __rewind_psw(vcpu->arch.sie_block->gpsw, ilen), }; + unsigned long fetched_addr; + int rc; /* * The PSW points to the next instruction, therefore the intercepted @@ -465,21 +525,29 @@ int kvm_s390_handle_per_ifetch_icpt(struct kvm_vcpu *vcpu) !(vcpu->arch.sie_block->gcr[9] & PER_EVENT_IFETCH)) return 0; - if (in_addr_range(pgm_info.per_address, cr10, cr11)) + rc = per_fetched_addr(vcpu, &fetched_addr); + if (rc < 0) + return rc; + if (rc) + /* instruction-fetching exceptions */ + return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); + + if (in_addr_range(fetched_addr, cr10, cr11)) return kvm_s390_inject_prog_irq(vcpu, &pgm_info); return 0; } -static void filter_guest_per_event(struct kvm_vcpu *vcpu) +static int filter_guest_per_event(struct kvm_vcpu *vcpu) { const u8 perc = vcpu->arch.sie_block->perc; - u64 peraddr = vcpu->arch.sie_block->peraddr; u64 addr = vcpu->arch.sie_block->gpsw.addr; u64 cr9 = vcpu->arch.sie_block->gcr[9]; u64 cr10 = vcpu->arch.sie_block->gcr[10]; u64 cr11 = vcpu->arch.sie_block->gcr[11]; /* filter all events, demanded by the guest */ u8 guest_perc = perc & (cr9 >> 24) & PER_CODE_MASK; + unsigned long fetched_addr; + int rc; if (!guest_per_enabled(vcpu)) guest_perc = 0; @@ -491,9 +559,17 @@ static void filter_guest_per_event(struct kvm_vcpu *vcpu) guest_perc &= ~PER_CODE_BRANCH; /* filter "instruction-fetching" events */ - if (guest_perc & PER_CODE_IFETCH && - !in_addr_range(peraddr, cr10, cr11)) - guest_perc &= ~PER_CODE_IFETCH; + if (guest_perc & PER_CODE_IFETCH) { + rc = per_fetched_addr(vcpu, &fetched_addr); + if (rc < 0) + return rc; + /* + * Don't inject an irq on exceptions. This would make handling + * on icpt code 8 very complex (as PSW was already rewound). 
+ */ + if (rc || !in_addr_range(fetched_addr, cr10, cr11)) + guest_perc &= ~PER_CODE_IFETCH; + } /* All other PER events will be given to the guest */ /* TODO: Check altered address/address space */ @@ -502,6 +578,7 @@ static void filter_guest_per_event(struct kvm_vcpu *vcpu) if (!guest_perc) vcpu->arch.sie_block->iprcc &= ~PGM_PER; + return 0; } #define pssec(vcpu) (vcpu->arch.sie_block->gcr[1] & _ASCE_SPACE_SWITCH) @@ -509,15 +586,17 @@ static void filter_guest_per_event(struct kvm_vcpu *vcpu) #define old_ssec(vcpu) ((vcpu->arch.sie_block->tecmc >> 31) & 0x1) #define old_as_is_home(vcpu) !(vcpu->arch.sie_block->tecmc & 0xffff) -void kvm_s390_handle_per_event(struct kvm_vcpu *vcpu) +int kvm_s390_handle_per_event(struct kvm_vcpu *vcpu) { - int new_as; + int rc, new_as; if (debug_exit_required(vcpu, vcpu->arch.sie_block->perc, vcpu->arch.sie_block->peraddr)) vcpu->guest_debug |= KVM_GUESTDBG_EXIT_PENDING; - filter_guest_per_event(vcpu); + rc = filter_guest_per_event(vcpu); + if (rc) + return rc; /* * Only RP, SAC, SACF, PT, PTI, PR, PC instructions can trigger @@ -546,4 +625,5 @@ void kvm_s390_handle_per_event(struct kvm_vcpu *vcpu) (pssec(vcpu) || old_ssec(vcpu))) vcpu->arch.sie_block->iprcc = PGM_SPACE_SWITCH; } + return 0; } diff --git a/arch/s390/kvm/intercept.c b/arch/s390/kvm/intercept.c index 7a27eebab28a..8b13f7098c61 100644 --- a/arch/s390/kvm/intercept.c +++ b/arch/s390/kvm/intercept.c @@ -238,7 +238,9 @@ static int handle_prog(struct kvm_vcpu *vcpu) vcpu->stat.exit_program_interruption++; if (guestdbg_enabled(vcpu) && per_event(vcpu)) { - kvm_s390_handle_per_event(vcpu); + rc = kvm_s390_handle_per_event(vcpu); + if (rc) + return rc; /* the interrupt might have been filtered out completely */ if (vcpu->arch.sie_block->iprcc == 0) return 0; diff --git a/arch/s390/kvm/kvm-s390.h b/arch/s390/kvm/kvm-s390.h index 22a0a7ceffad..af9fa91a0c91 100644 --- a/arch/s390/kvm/kvm-s390.h +++ b/arch/s390/kvm/kvm-s390.h @@ -377,7 +377,7 @@ int kvm_s390_import_bp_data(struct kvm_vcpu *vcpu, void kvm_s390_clear_bp_data(struct kvm_vcpu *vcpu); void kvm_s390_prepare_debug_exit(struct kvm_vcpu *vcpu); int kvm_s390_handle_per_ifetch_icpt(struct kvm_vcpu *vcpu); -void kvm_s390_handle_per_event(struct kvm_vcpu *vcpu); +int kvm_s390_handle_per_event(struct kvm_vcpu *vcpu); /* support for Basic/Extended SCA handling */ static inline union ipte_control *kvm_s390_get_ipte_control(struct kvm *kvm) -- cgit v1.2.3 From 4bead2a423ea5268b0ab3cba058e215c65ee2cbd Mon Sep 17 00:00:00 2001 From: Janosch Frank Date: Fri, 27 Jan 2017 10:23:59 +0100 Subject: KVM: s390: Fix RRBE return code not being CC reset_guest_reference_bit needs to return the CC, so we can set it in the guest PSW when emulating RRBE. Right now it only returns 0. Let's fix that. 
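For context, the value returned here ends up as the condition code in the guest PSW; a hedged sketch of how the RRBE emulation is expected to consume it (simplified, not necessarily the exact handler in priv.c):

	rc = reset_guest_reference_bit(current->mm, vmaddr);
	if (rc < 0)
		return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
	kvm_s390_set_psw_cc(vcpu, rc);	/* CC 0..3 lands in the guest PSW */
	return 0;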
Signed-off-by: Janosch Frank Reviewed-by: Christian Borntraeger Signed-off-by: Christian Borntraeger --- arch/s390/mm/pgtable.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c index 7a1897c51c54..f70db837ddc4 100644 --- a/arch/s390/mm/pgtable.c +++ b/arch/s390/mm/pgtable.c @@ -741,7 +741,7 @@ int reset_guest_reference_bit(struct mm_struct *mm, unsigned long addr) pgste_set_unlock(ptep, new); pte_unmap_unlock(ptep, ptl); - return 0; + return cc; } EXPORT_SYMBOL(reset_guest_reference_bit); -- cgit v1.2.3 From a8c39dd77cb9fad0d0e5c5e6581851bdcbc1e6f6 Mon Sep 17 00:00:00 2001 From: Christian Borntraeger Date: Wed, 18 Jan 2017 16:01:02 +0100 Subject: KVM: s390: Add debug logging to basic cpu model interface Let's log something for changes in facilities, cpuid and ibc now that we have a cpu model in QEMU. All of these calls are pretty rare, so we will not spill the log, but they will help to understand potential guest issues, for example if some instructions are fenced off. As the s390 debug feature has a limited number of parameters and strings must not go away, we limit the facility printing to 3 double words, instead of building that list dynamically. This should be enough for several years. If we ever exceed 3 double words then the logging will be incomplete but no functional impact will happen. Signed-off-by: Christian Borntraeger --- arch/s390/kvm/kvm-s390.c | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 4078ba630689..dabd3b15bf11 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -829,6 +829,13 @@ static int kvm_s390_set_processor(struct kvm *kvm, struct kvm_device_attr *attr) } memcpy(kvm->arch.model.fac_list, proc->fac_list, S390_ARCH_FAC_LIST_SIZE_BYTE); + VM_EVENT(kvm, 3, "SET: guest ibc: 0x%4.4x, guest cpuid: 0x%16.16llx", + kvm->arch.model.ibc, + kvm->arch.model.cpuid); + VM_EVENT(kvm, 3, "SET: guest faclist: 0x%16.16llx.%16.16llx.%16.16llx", + kvm->arch.model.fac_list[0], + kvm->arch.model.fac_list[1], + kvm->arch.model.fac_list[2]); } else ret = -EFAULT; kfree(proc); @@ -902,6 +909,13 @@ static int kvm_s390_get_processor(struct kvm *kvm, struct kvm_device_attr *attr) proc->ibc = kvm->arch.model.ibc; memcpy(&proc->fac_list, kvm->arch.model.fac_list, S390_ARCH_FAC_LIST_SIZE_BYTE); + VM_EVENT(kvm, 3, "GET: guest ibc: 0x%4.4x, guest cpuid: 0x%16.16llx", + kvm->arch.model.ibc, + kvm->arch.model.cpuid); + VM_EVENT(kvm, 3, "GET: guest faclist: 0x%16.16llx.%16.16llx.%16.16llx", + kvm->arch.model.fac_list[0], + kvm->arch.model.fac_list[1], + kvm->arch.model.fac_list[2]); if (copy_to_user((void __user *)attr->addr, proc, sizeof(*proc))) ret = -EFAULT; kfree(proc); @@ -925,6 +939,17 @@ static int kvm_s390_get_machine(struct kvm *kvm, struct kvm_device_attr *attr) S390_ARCH_FAC_LIST_SIZE_BYTE); memcpy((unsigned long *)&mach->fac_list, S390_lowcore.stfle_fac_list, sizeof(S390_lowcore.stfle_fac_list)); + VM_EVENT(kvm, 3, "GET: host ibc: 0x%4.4x, host cpuid: 0x%16.16llx", + kvm->arch.model.ibc, + kvm->arch.model.cpuid); + VM_EVENT(kvm, 3, "GET: host facmask: 0x%16.16llx.%16.16llx.%16.16llx", + mach->fac_mask[0], + mach->fac_mask[1], + mach->fac_mask[2]); + VM_EVENT(kvm, 3, "GET: host faclist: 0x%16.16llx.%16.16llx.%16.16llx", + mach->fac_list[0], + mach->fac_list[1], + mach->fac_list[2]); if (copy_to_user((void __user *)attr->addr, mach, sizeof(*mach))) ret = -EFAULT; kfree(mach); -- cgit v1.2.3 From 
a97a65d53d9f53b6897dc1b2aed381bc1707136b Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Fri, 27 Jan 2017 14:00:34 +1000 Subject: KVM: PPC: Book3S: 64-bit CONFIG_RELOCATABLE support for interrupts 64-bit Book3S exception handlers must find the dynamic kernel base to add to the target address when branching beyond __end_interrupts, in order to support kernel running at non-0 physical address. Support this in KVM by branching with CTR, similarly to regular interrupt handlers. The guest CTR saved in HSTATE_SCRATCH1 and restored after the branch. Without this, the host kernel hangs and crashes randomly when it is running at a non-0 address and a KVM guest is started. Signed-off-by: Nicholas Piggin Acked-by: Paul Mackerras Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/exception-64s.h | 45 +++++++++++++++++++++++++++++--- arch/powerpc/kernel/exceptions-64s.S | 2 +- arch/powerpc/kvm/book3s_hv_rmhandlers.S | 12 ++++++--- arch/powerpc/kvm/book3s_segment.S | 7 +++++ 4 files changed, 58 insertions(+), 8 deletions(-) diff --git a/arch/powerpc/include/asm/exception-64s.h b/arch/powerpc/include/asm/exception-64s.h index a02a268bde6b..9a5dbfb2d9f2 100644 --- a/arch/powerpc/include/asm/exception-64s.h +++ b/arch/powerpc/include/asm/exception-64s.h @@ -97,6 +97,15 @@ ld reg,PACAKBASE(r13); \ ori reg,reg,(ABS_ADDR(label))@l; +/* + * Branches from unrelocated code (e.g., interrupts) to labels outside + * head-y require >64K offsets. + */ +#define __LOAD_FAR_HANDLER(reg, label) \ + ld reg,PACAKBASE(r13); \ + ori reg,reg,(ABS_ADDR(label))@l; \ + addis reg,reg,(ABS_ADDR(label))@h; + /* Exception register prefixes */ #define EXC_HV H #define EXC_STD @@ -227,12 +236,40 @@ END_FTR_SECTION_NESTED(ftr,ftr,943) mtctr reg; \ bctr +/* + * KVM requires __LOAD_FAR_HANDLER. + * + * __BRANCH_TO_KVM_EXIT branches are also a special case because they + * explicitly use r9 then reload it from PACA before branching. Hence + * the double-underscore. 
+ */ +#define __BRANCH_TO_KVM_EXIT(area, label) \ + mfctr r9; \ + std r9,HSTATE_SCRATCH1(r13); \ + __LOAD_FAR_HANDLER(r9, label); \ + mtctr r9; \ + ld r9,area+EX_R9(r13); \ + bctr + +#define BRANCH_TO_KVM(reg, label) \ + __LOAD_FAR_HANDLER(reg, label); \ + mtctr reg; \ + bctr + #else #define BRANCH_TO_COMMON(reg, label) \ b label +#define BRANCH_TO_KVM(reg, label) \ + b label + +#define __BRANCH_TO_KVM_EXIT(area, label) \ + ld r9,area+EX_R9(r13); \ + b label + #endif + #define __KVM_HANDLER(area, h, n) \ BEGIN_FTR_SECTION_NESTED(947) \ ld r10,area+EX_CFAR(r13); \ @@ -246,8 +283,8 @@ END_FTR_SECTION_NESTED(ftr,ftr,943) std r12,HSTATE_SCRATCH0(r13); \ sldi r12,r9,32; \ ori r12,r12,(n); \ - ld r9,area+EX_R9(r13); \ - b kvmppc_interrupt + /* This reloads r9 before branching to kvmppc_interrupt */ \ + __BRANCH_TO_KVM_EXIT(area, kvmppc_interrupt) #define __KVM_HANDLER_SKIP(area, h, n) \ cmpwi r10,KVM_GUEST_MODE_SKIP; \ @@ -260,8 +297,8 @@ END_FTR_SECTION_NESTED(ftr,ftr,943) std r12,HSTATE_SCRATCH0(r13); \ sldi r12,r9,32; \ ori r12,r12,(n); \ - ld r9,area+EX_R9(r13); \ - b kvmppc_interrupt; \ + /* This reloads r9 before branching to kvmppc_interrupt */ \ + __BRANCH_TO_KVM_EXIT(area, kvmppc_interrupt); \ 89: mtocrf 0x80,r9; \ ld r9,area+EX_R9(r13); \ ld r10,area+EX_R10(r13); \ diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index 89b4f122aec6..65a2559eeb7f 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -142,7 +142,7 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300) lbz r0,HSTATE_HWTHREAD_REQ(r13) cmpwi r0,0 beq 1f - b kvm_start_guest + BRANCH_TO_KVM(r10, kvm_start_guest) 1: #endif diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S index 11882aac8216..264ac9ad4585 100644 --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S @@ -1060,15 +1060,16 @@ kvmppc_interrupt_hv: * R12 = (guest CR << 32) | interrupt vector * R13 = PACA * guest R12 saved in shadow VCPU SCRATCH0 + * guest CTR saved in shadow VCPU SCRATCH1 if RELOCATABLE * guest R13 saved in SPRN_SCRATCH0 */ - std r9, HSTATE_SCRATCH1(r13) + std r9, HSTATE_SCRATCH2(r13) lbz r9, HSTATE_IN_GUEST(r13) cmpwi r9, KVM_GUEST_MODE_HOST_HV beq kvmppc_bad_host_intr #ifdef CONFIG_KVM_BOOK3S_PR_POSSIBLE cmpwi r9, KVM_GUEST_MODE_GUEST - ld r9, HSTATE_SCRATCH1(r13) + ld r9, HSTATE_SCRATCH2(r13) beq kvmppc_interrupt_pr #endif /* We're now back in the host but in guest MMU context */ @@ -1088,7 +1089,7 @@ kvmppc_interrupt_hv: std r6, VCPU_GPR(R6)(r9) std r7, VCPU_GPR(R7)(r9) std r8, VCPU_GPR(R8)(r9) - ld r0, HSTATE_SCRATCH1(r13) + ld r0, HSTATE_SCRATCH2(r13) std r0, VCPU_GPR(R9)(r9) std r10, VCPU_GPR(R10)(r9) std r11, VCPU_GPR(R11)(r9) @@ -1151,7 +1152,12 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR) 11: stw r3,VCPU_HEIR(r9) /* these are volatile across C function calls */ +#ifdef CONFIG_RELOCATABLE + ld r3, HSTATE_SCRATCH1(r13) + mtctr r3 +#else mfctr r3 +#endif mfxer r4 std r3, VCPU_CTR(r9) std r4, VCPU_XER(r9) diff --git a/arch/powerpc/kvm/book3s_segment.S b/arch/powerpc/kvm/book3s_segment.S index 68e45080cf93..2a2b96d53999 100644 --- a/arch/powerpc/kvm/book3s_segment.S +++ b/arch/powerpc/kvm/book3s_segment.S @@ -175,9 +175,16 @@ kvmppc_interrupt_pr: * R12 = (guest CR << 32) | exit handler id * R13 = PACA * HSTATE.SCRATCH0 = guest R12 + * HSTATE.SCRATCH1 = guest CTR if RELOCATABLE */ #ifdef CONFIG_PPC64 /* Match 32-bit entry */ +#ifdef CONFIG_RELOCATABLE + std r9, HSTATE_SCRATCH2(r13) + ld r9, 
HSTATE_SCRATCH1(r13) + mtctr r9 + ld r9, HSTATE_SCRATCH2(r13) +#endif rotldi r12, r12, 32 /* Flip R12 halves for stw */ stw r12, HSTATE_SCRATCH1(r13) /* CR is now in the low half */ srdi r12, r12, 32 /* shift trap into low half */ -- cgit v1.2.3 From 18569c1f134e1c5c88228f043c09678ae6052b7c Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Mon, 30 Jan 2017 21:21:34 +1100 Subject: powerpc/64: Don't try to use radix MMU under a hypervisor Currently, if the kernel is running on a POWER9 processor under a hypervisor, it will try to use the radix MMU even though it doesn't have the necessary code to use radix under a hypervisor (it doesn't negotiate use of radix, and it doesn't do the H_REGISTER_PROC_TBL hcall). The result is that the guest kernel will crash when it tries to turn on the MMU. This fixes it by looking for the /chosen/ibm,architecture-vec-5 property, and if it exists, clears the radix MMU feature bit, before we decide whether to initialize for radix or HPT. This property is created by the hypervisor as a result of the guest calling the ibm,client-architecture-support method to indicate its capabilities, so it will indicate whether the hypervisor agreed to us using radix. Systems without a hypervisor may have this property also (for example, skiboot creates it), so we check the HV bit in the MSR to see whether we are running as a guest or not. If we are in hypervisor mode, then we can do whatever we like including using the radix MMU. The reason for using this property is that in future, when we have support for using radix under a hypervisor, we will need to check this property to see whether the hypervisor agreed to us using radix. Fixes: 2bfd65e45e87 ("powerpc/mm/radix: Add radix callbacks for early init routines") Cc: stable@vger.kernel.org # v4.7+ Signed-off-by: Paul Mackerras Signed-off-by: Michael Ellerman --- arch/powerpc/mm/init_64.c | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c index 93abf8a9813d..4d9481ec2468 100644 --- a/arch/powerpc/mm/init_64.c +++ b/arch/powerpc/mm/init_64.c @@ -42,6 +42,8 @@ #include #include #include +#include +#include #include #include @@ -344,12 +346,43 @@ static int __init parse_disable_radix(char *p) } early_param("disable_radix", parse_disable_radix); +/* + * If we're running under a hypervisor, we currently can't do radix + * since we don't have the code to do the H_REGISTER_PROC_TBL hcall. + * We tell that we're running under a hypervisor by looking for the + * /chosen/ibm,architecture-vec-5 property. + */ +static void early_check_vec5(void) +{ + unsigned long root, chosen; + int size; + const u8 *vec5; + + root = of_get_flat_dt_root(); + chosen = of_get_flat_dt_subnode_by_name(root, "chosen"); + if (chosen == -FDT_ERR_NOTFOUND) + return; + vec5 = of_get_flat_dt_prop(chosen, "ibm,architecture-vec-5", &size); + if (!vec5) + return; + cur_cpu_spec->mmu_features &= ~MMU_FTR_TYPE_RADIX; +} + void __init mmu_early_init_devtree(void) { /* Disable radix mode based on kernel command line. */ if (disable_radix) cur_cpu_spec->mmu_features &= ~MMU_FTR_TYPE_RADIX; + /* + * Check /chosen/ibm,architecture-vec-5 if running as a guest. + * When running bare-metal, we can use radix if we like + * even though the ibm,architecture-vec-5 property created by + * skiboot doesn't have the necessary bits set. 
+ */ + if (early_radix_enabled() && !(mfmsr() & MSR_HV)) + early_check_vec5(); + if (early_radix_enabled()) radix__early_init_devtree(); else -- cgit v1.2.3 From 3f4ab2f83b4e443c66549206eb88a9fa5a85d647 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Mon, 30 Jan 2017 21:21:35 +1100 Subject: powerpc/pseries: Fixes for the "ibm,architecture-vec-5" options This fixes the byte index values for some of the option bits in the "ibm,architecture-vec-5" property. The "platform facilities options" bits are in byte 17 not byte 14, so the upper 8 bits of their definitions need to be 0x11 not 0x0E. The "sub processor support" option is in byte 21 not byte 15. Note none of these options are actually looked up in "ibm,architecture-vec-5" at this time, so there is no bug. When checking whether option bits are set, we should check that the offset of the byte being checked is less than the vector length that we got from the hypervisor. Signed-off-by: Paul Mackerras Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/prom.h | 8 ++++---- arch/powerpc/platforms/pseries/firmware.c | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/arch/powerpc/include/asm/prom.h b/arch/powerpc/include/asm/prom.h index 5e57705b4759..e6d83d0fada7 100644 --- a/arch/powerpc/include/asm/prom.h +++ b/arch/powerpc/include/asm/prom.h @@ -151,10 +151,10 @@ struct of_drconf_cell { #define OV5_XCMO 0x0440 /* Page Coalescing */ #define OV5_TYPE1_AFFINITY 0x0580 /* Type 1 NUMA affinity */ #define OV5_PRRN 0x0540 /* Platform Resource Reassignment */ -#define OV5_PFO_HW_RNG 0x0E80 /* PFO Random Number Generator */ -#define OV5_PFO_HW_842 0x0E40 /* PFO Compression Accelerator */ -#define OV5_PFO_HW_ENCR 0x0E20 /* PFO Encryption Accelerator */ -#define OV5_SUB_PROCESSORS 0x0F01 /* 1,2,or 4 Sub-Processors supported */ +#define OV5_PFO_HW_RNG 0x1180 /* PFO Random Number Generator */ +#define OV5_PFO_HW_842 0x1140 /* PFO Compression Accelerator */ +#define OV5_PFO_HW_ENCR 0x1120 /* PFO Encryption Accelerator */ +#define OV5_SUB_PROCESSORS 0x1501 /* 1,2,or 4 Sub-Processors supported */ /* Option Vector 6: IBM PAPR hints */ #define OV6_LINUX 0x02 /* Linux is our OS */ diff --git a/arch/powerpc/platforms/pseries/firmware.c b/arch/powerpc/platforms/pseries/firmware.c index ea7f09bd73b1..7d67623203b8 100644 --- a/arch/powerpc/platforms/pseries/firmware.c +++ b/arch/powerpc/platforms/pseries/firmware.c @@ -126,7 +126,7 @@ static void __init fw_vec5_feature_init(const char *vec5, unsigned long len) index = OV5_INDX(vec5_fw_features_table[i].feature); feat = OV5_FEAT(vec5_fw_features_table[i].feature); - if (vec5[index] & feat) + if (index < len && (vec5[index] & feat)) powerpc_firmware_features |= vec5_fw_features_table[i].val; } -- cgit v1.2.3 From cc3d2940133d24000e2866b21e03ce32adfead0a Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Mon, 30 Jan 2017 21:21:36 +1100 Subject: powerpc/64: Enable use of radix MMU under hypervisor on POWER9 To use radix as a guest, we first need to tell the hypervisor, via the ibm,client-architecture-support call, that we support POWER9 and architecture v3.00, and that we can do either radix or hash and that we would like to choose later using an hcall (the H_REGISTER_PROC_TBL hcall). Then we need to check whether the hypervisor agreed to us using radix. We need to do this very early on in the kernel boot process before any of the MMU initialization is done. If the hypervisor doesn't agree, we can't use radix and therefore clear the radix MMU feature bit. 
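Both this check and the byte-index fixes in the previous patch rely on the same OV5_* encoding: each constant packs a byte offset into ibm,architecture-vec-5 in its upper byte and a bit mask in its lower byte, which is why moving an option from byte 14 to byte 17 turns 0x0E.. into 0x11... The helpers that split the two halves have roughly this shape (illustrative; fw_vec5_feature_init() above uses them):

#define OV5_INDX(x)	((x) >> 8)	/* byte offset within the option vector */
#define OV5_FEAT(x)	((x) & 0xff)	/* bit mask within that byte */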
Later, when we have set up our process table, which points to the radix tree for each process, we need to install that using the H_REGISTER_PROC_TBL hcall. Signed-off-by: Paul Mackerras Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/book3s/64/mmu.h | 6 ++++++ arch/powerpc/include/asm/hvcall.h | 11 +++++++++++ arch/powerpc/include/asm/prom.h | 9 +++++++++ arch/powerpc/kernel/prom_init.c | 18 +++++++++++++++++- arch/powerpc/mm/init_64.c | 12 +++++++----- arch/powerpc/mm/pgtable-radix.c | 2 ++ arch/powerpc/platforms/pseries/lpar.c | 29 +++++++++++++++++++++++++++++ 7 files changed, 81 insertions(+), 6 deletions(-) diff --git a/arch/powerpc/include/asm/book3s/64/mmu.h b/arch/powerpc/include/asm/book3s/64/mmu.h index 8afb0e00f7d9..cea522c3bcae 100644 --- a/arch/powerpc/include/asm/book3s/64/mmu.h +++ b/arch/powerpc/include/asm/book3s/64/mmu.h @@ -138,5 +138,11 @@ static inline void setup_initial_memory_limit(phys_addr_t first_memblock_base, extern int (*register_process_table)(unsigned long base, unsigned long page_size, unsigned long tbl_size); +#ifdef CONFIG_PPC_PSERIES +extern void radix_init_pseries(void); +#else +static inline void radix_init_pseries(void) { }; +#endif + #endif /* __ASSEMBLY__ */ #endif /* _ASM_POWERPC_BOOK3S_64_MMU_H_ */ diff --git a/arch/powerpc/include/asm/hvcall.h b/arch/powerpc/include/asm/hvcall.h index 77ff1ba99d1f..54d11b3a6bf7 100644 --- a/arch/powerpc/include/asm/hvcall.h +++ b/arch/powerpc/include/asm/hvcall.h @@ -276,6 +276,7 @@ #define H_GET_MPP_X 0x314 #define H_SET_MODE 0x31C #define H_CLEAR_HPT 0x358 +#define H_REGISTER_PROC_TBL 0x37C #define H_SIGNAL_SYS_RESET 0x380 #define MAX_HCALL_OPCODE H_SIGNAL_SYS_RESET @@ -313,6 +314,16 @@ #define H_SIGNAL_SYS_RESET_ALL_OTHERS -2 /* >= 0 values are CPU number */ +/* Flag values used in H_REGISTER_PROC_TBL hcall */ +#define PROC_TABLE_OP_MASK 0x18 +#define PROC_TABLE_DEREG 0x10 +#define PROC_TABLE_NEW 0x18 +#define PROC_TABLE_TYPE_MASK 0x06 +#define PROC_TABLE_HPT_SLB 0x00 +#define PROC_TABLE_HPT_PT 0x02 +#define PROC_TABLE_RADIX 0x04 +#define PROC_TABLE_GTSE 0x01 + #ifndef __ASSEMBLY__ /** diff --git a/arch/powerpc/include/asm/prom.h b/arch/powerpc/include/asm/prom.h index e6d83d0fada7..8af2546ea593 100644 --- a/arch/powerpc/include/asm/prom.h +++ b/arch/powerpc/include/asm/prom.h @@ -121,6 +121,8 @@ struct of_drconf_cell { #define OV1_PPC_2_06 0x02 /* set if we support PowerPC 2.06 */ #define OV1_PPC_2_07 0x01 /* set if we support PowerPC 2.07 */ +#define OV1_PPC_3_00 0x80 /* set if we support PowerPC 3.00 */ + /* Option vector 2: Open Firmware options supported */ #define OV2_REAL_MODE 0x20 /* set if we want OF in real mode */ @@ -155,6 +157,13 @@ struct of_drconf_cell { #define OV5_PFO_HW_842 0x1140 /* PFO Compression Accelerator */ #define OV5_PFO_HW_ENCR 0x1120 /* PFO Encryption Accelerator */ #define OV5_SUB_PROCESSORS 0x1501 /* 1,2,or 4 Sub-Processors supported */ +#define OV5_XIVE_EXPLOIT 0x1701 /* XIVE exploitation supported */ +#define OV5_MMU_RADIX_300 0x1880 /* ISA v3.00 radix MMU supported */ +#define OV5_MMU_HASH_300 0x1840 /* ISA v3.00 hash MMU supported */ +#define OV5_MMU_SEGM_RADIX 0x1820 /* radix mode (no segmentation) */ +#define OV5_MMU_PROC_TBL 0x1810 /* hcall selects SLB or proc table */ +#define OV5_MMU_SLB 0x1800 /* always use SLB */ +#define OV5_MMU_GTSE 0x1808 /* Guest translation shootdown */ /* Option Vector 6: IBM PAPR hints */ #define OV6_LINUX 0x02 /* Linux is our OS */ diff --git a/arch/powerpc/kernel/prom_init.c b/arch/powerpc/kernel/prom_init.c index 
ec47a939cbdd..358d43f8f84f 100644 --- a/arch/powerpc/kernel/prom_init.c +++ b/arch/powerpc/kernel/prom_init.c @@ -649,6 +649,7 @@ static void __init early_cmdline_parse(void) struct option_vector1 { u8 byte1; u8 arch_versions; + u8 arch_versions3; } __packed; struct option_vector2 { @@ -691,6 +692,9 @@ struct option_vector5 { u8 reserved2; __be16 reserved3; u8 subprocessors; + u8 byte22; + u8 intarch; + u8 mmu; } __packed; struct option_vector6 { @@ -700,7 +704,7 @@ struct option_vector6 { } __packed; struct ibm_arch_vec { - struct { u32 mask, val; } pvrs[10]; + struct { u32 mask, val; } pvrs[12]; u8 num_vectors; @@ -749,6 +753,14 @@ struct ibm_arch_vec __cacheline_aligned ibm_architecture_vec = { .mask = cpu_to_be32(0xffff0000), /* POWER8 */ .val = cpu_to_be32(0x004d0000), }, + { + .mask = cpu_to_be32(0xffff0000), /* POWER9 */ + .val = cpu_to_be32(0x004e0000), + }, + { + .mask = cpu_to_be32(0xffffffff), /* all 3.00-compliant */ + .val = cpu_to_be32(0x0f000005), + }, { .mask = cpu_to_be32(0xffffffff), /* all 2.07-compliant */ .val = cpu_to_be32(0x0f000004), @@ -774,6 +786,7 @@ struct ibm_arch_vec __cacheline_aligned ibm_architecture_vec = { .byte1 = 0, .arch_versions = OV1_PPC_2_00 | OV1_PPC_2_01 | OV1_PPC_2_02 | OV1_PPC_2_03 | OV1_PPC_2_04 | OV1_PPC_2_05 | OV1_PPC_2_06 | OV1_PPC_2_07, + .arch_versions3 = OV1_PPC_3_00, }, .vec2_len = VECTOR_LENGTH(sizeof(struct option_vector2)), @@ -836,6 +849,9 @@ struct ibm_arch_vec __cacheline_aligned ibm_architecture_vec = { .reserved2 = 0, .reserved3 = 0, .subprocessors = 1, + .intarch = 0, + .mmu = OV5_FEAT(OV5_MMU_RADIX_300) | OV5_FEAT(OV5_MMU_HASH_300) | + OV5_FEAT(OV5_MMU_PROC_TBL) | OV5_FEAT(OV5_MMU_GTSE), }, /* option vector 6: IBM PAPR hints */ diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c index 4d9481ec2468..10c9a545a646 100644 --- a/arch/powerpc/mm/init_64.c +++ b/arch/powerpc/mm/init_64.c @@ -347,10 +347,9 @@ static int __init parse_disable_radix(char *p) early_param("disable_radix", parse_disable_radix); /* - * If we're running under a hypervisor, we currently can't do radix - * since we don't have the code to do the H_REGISTER_PROC_TBL hcall. - * We tell that we're running under a hypervisor by looking for the - * /chosen/ibm,architecture-vec-5 property. + * If we're running under a hypervisor, we need to check the contents of + * /chosen/ibm,architecture-vec-5 to see if the hypervisor is willing to do + * radix. If not, we clear the radix feature bit so we fall back to hash. 
*/ static void early_check_vec5(void) { @@ -365,7 +364,10 @@ static void early_check_vec5(void) vec5 = of_get_flat_dt_prop(chosen, "ibm,architecture-vec-5", &size); if (!vec5) return; - cur_cpu_spec->mmu_features &= ~MMU_FTR_TYPE_RADIX; + if (size <= OV5_INDX(OV5_MMU_RADIX_300) || + !(vec5[OV5_INDX(OV5_MMU_RADIX_300)] & OV5_FEAT(OV5_MMU_RADIX_300))) + /* Hypervisor doesn't support radix */ + cur_cpu_spec->mmu_features &= ~MMU_FTR_TYPE_RADIX; } void __init mmu_early_init_devtree(void) diff --git a/arch/powerpc/mm/pgtable-radix.c b/arch/powerpc/mm/pgtable-radix.c index cfa53ccc8baf..94323c4ececc 100644 --- a/arch/powerpc/mm/pgtable-radix.c +++ b/arch/powerpc/mm/pgtable-radix.c @@ -401,6 +401,8 @@ void __init radix__early_init_mmu(void) mtspr(SPRN_LPCR, lpcr | LPCR_UPRT | LPCR_HR); radix_init_partition_table(); radix_init_amor(); + } else { + radix_init_pseries(); } memblock_set_current_limit(MEMBLOCK_ALLOC_ANYWHERE); diff --git a/arch/powerpc/platforms/pseries/lpar.c b/arch/powerpc/platforms/pseries/lpar.c index 5dc1c3c6e716..0587655aea69 100644 --- a/arch/powerpc/platforms/pseries/lpar.c +++ b/arch/powerpc/platforms/pseries/lpar.c @@ -609,6 +609,29 @@ static int __init disable_bulk_remove(char *str) __setup("bulk_remove=", disable_bulk_remove); +/* Actually only used for radix, so far */ +static int pseries_lpar_register_process_table(unsigned long base, + unsigned long page_size, unsigned long table_size) +{ + long rc; + unsigned long flags = PROC_TABLE_NEW; + + if (radix_enabled()) + flags |= PROC_TABLE_RADIX | PROC_TABLE_GTSE; + for (;;) { + rc = plpar_hcall_norets(H_REGISTER_PROC_TBL, flags, base, + page_size, table_size); + if (!H_IS_LONG_BUSY(rc)) + break; + mdelay(get_longbusy_msecs(rc)); + } + if (rc != H_SUCCESS) { + pr_err("Failed to register process table (rc=%ld)\n", rc); + BUG(); + } + return rc; +} + void __init hpte_init_pseries(void) { mmu_hash_ops.hpte_invalidate = pSeries_lpar_hpte_invalidate; @@ -622,6 +645,12 @@ void __init hpte_init_pseries(void) mmu_hash_ops.hugepage_invalidate = pSeries_lpar_hugepage_invalidate; } +void radix_init_pseries(void) +{ + pr_info("Using radix MMU under hypervisor\n"); + register_process_table = pseries_lpar_register_process_table; +} + #ifdef CONFIG_PPC_SMLPAR #define CMO_FREE_HINT_DEFAULT 1 static int cmo_free_hint_flag = CMO_FREE_HINT_DEFAULT; -- cgit v1.2.3 From dbcbfee0c81c7938e40d7d6bc659a5191f490b50 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Mon, 30 Jan 2017 21:21:37 +1100 Subject: powerpc/64: More definitions for POWER9 This adds definitions for bits in the DSISR register which are used by POWER9 for various translation-related exception conditions, and for some more bits in the partition table entry that will be needed by KVM. 
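One non-obvious detail: the radix-tree-size (RTS) field in the first doubleword of a partition- or process-table entry is split across two bit ranges, so consumers have to reassemble it. A hedged sketch using the RTS1_*/RTS2_* masks added below (per ISA v3.00 the tree then maps 2^(RTS+31) bytes of address space):

static unsigned int patb_radix_bits(unsigned long dword0)
{
	/* place the 2 high RTS bits above the 3 low RTS bits */
	unsigned int rts = ((dword0 & RTS1_MASK) >> (RTS1_SHIFT - 3)) |
			   ((dword0 & RTS2_MASK) >> RTS2_SHIFT);

	return rts + 31;	/* effective-address bits covered by the tree */
}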
Signed-off-by: Paul Mackerras Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/book3s/64/mmu.h | 12 +++++++++++- arch/powerpc/include/asm/reg.h | 4 ++++ 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/include/asm/book3s/64/mmu.h b/arch/powerpc/include/asm/book3s/64/mmu.h index cea522c3bcae..d73e9dfa5237 100644 --- a/arch/powerpc/include/asm/book3s/64/mmu.h +++ b/arch/powerpc/include/asm/book3s/64/mmu.h @@ -44,10 +44,20 @@ struct patb_entry { }; extern struct patb_entry *partition_tb; +/* Bits in patb0 field */ #define PATB_HR (1UL << 63) -#define PATB_GR (1UL << 63) #define RPDB_MASK 0x0ffffffffffff00fUL #define RPDB_SHIFT (1UL << 8) +#define RTS1_SHIFT 61 /* top 2 bits of radix tree size */ +#define RTS1_MASK (3UL << RTS1_SHIFT) +#define RTS2_SHIFT 5 /* bottom 3 bits of radix tree size */ +#define RTS2_MASK (7UL << RTS2_SHIFT) +#define RPDS_MASK 0x1f /* root page dir. size field */ + +/* Bits in patb1 field */ +#define PATB_GR (1UL << 63) /* guest uses radix; must match HR */ +#define PRTS_MASK 0x1f /* process table size field */ + /* * Limit process table to PAGE_SIZE table. This * also limit the max pid we can support. diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h index 0d4531aa2052..aa44a83ad3ec 100644 --- a/arch/powerpc/include/asm/reg.h +++ b/arch/powerpc/include/asm/reg.h @@ -274,10 +274,14 @@ #define SPRN_DSISR 0x012 /* Data Storage Interrupt Status Register */ #define DSISR_NOHPTE 0x40000000 /* no translation found */ #define DSISR_PROTFAULT 0x08000000 /* protection fault */ +#define DSISR_BADACCESS 0x04000000 /* bad access to CI or G */ #define DSISR_ISSTORE 0x02000000 /* access was a store */ #define DSISR_DABRMATCH 0x00400000 /* hit data breakpoint */ #define DSISR_NOSEGMENT 0x00200000 /* SLB miss */ #define DSISR_KEYFAULT 0x00200000 /* Key fault */ +#define DSISR_UNSUPP_MMU 0x00080000 /* Unsupported MMU config */ +#define DSISR_SET_RC 0x00040000 /* Failed setting of R/C bits */ +#define DSISR_PGDIRFAULT 0x00020000 /* Fault on page directory */ #define SPRN_TBRL 0x10C /* Time Base Read Lower Register (user, R/O) */ #define SPRN_TBRU 0x10D /* Time Base Read Upper Register (user, R/O) */ #define SPRN_CIR 0x11B /* Chip Information Register (hyper, R/0) */ -- cgit v1.2.3 From ba9b399aee6fb70cbe988f0750d6dd9f6677293b Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Mon, 30 Jan 2017 21:21:38 +1100 Subject: powerpc/64: Export pgtable_cache and pgtable_cache_add for KVM This exports the pgtable_cache array and the pgtable_cache_add function so that HV KVM can use them for allocating radix page tables for guests. Signed-off-by: Paul Mackerras Signed-off-by: Michael Ellerman --- arch/powerpc/mm/init-common.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/mm/init-common.c b/arch/powerpc/mm/init-common.c index a175cd82ae8c..2be5dc242832 100644 --- a/arch/powerpc/mm/init-common.c +++ b/arch/powerpc/mm/init-common.c @@ -41,6 +41,7 @@ static void pmd_ctor(void *addr) } struct kmem_cache *pgtable_cache[MAX_PGTABLE_INDEX_SIZE]; +EXPORT_SYMBOL_GPL(pgtable_cache); /* used by kvm_hv module */ /* * Create a kmem_cache() for pagetables. 
This is not used for PTE @@ -82,7 +83,7 @@ void pgtable_cache_add(unsigned shift, void (*ctor)(void *)) pgtable_cache[shift - 1] = new; pr_debug("Allocated pgtable cache for order %d\n", shift); } - +EXPORT_SYMBOL_GPL(pgtable_cache_add); /* used by kvm_hv module */ void pgtable_cache_init(void) { -- cgit v1.2.3 From 16ed141677c5a1a796408e74ccd0a6f6554c3f21 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Mon, 30 Jan 2017 21:21:39 +1100 Subject: powerpc/64: Make type of partition table flush depend on partition type When changing a partition table entry on POWER9, we do a particular form of the tlbie instruction which flushes all TLBs and caches of the partition table for a given logical partition ID (LPID). This instruction has a field in the instruction word, labelled R (radix), which should be 1 if the partition was previously a radix partition and 0 if it was a HPT partition. This implements that logic. Signed-off-by: Paul Mackerras Signed-off-by: Michael Ellerman --- arch/powerpc/mm/pgtable_64.c | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c index 8bca7f58afc4..d6b5e5cde412 100644 --- a/arch/powerpc/mm/pgtable_64.c +++ b/arch/powerpc/mm/pgtable_64.c @@ -454,13 +454,23 @@ void __init mmu_partition_table_init(void) void mmu_partition_table_set_entry(unsigned int lpid, unsigned long dw0, unsigned long dw1) { + unsigned long old = be64_to_cpu(partition_tb[lpid].patb0); + partition_tb[lpid].patb0 = cpu_to_be64(dw0); partition_tb[lpid].patb1 = cpu_to_be64(dw1); - /* Global flush of TLBs and partition table caches for this lpid */ + /* + * Global flush of TLBs and partition table caches for this lpid. + * The type of flush (hash or radix) depends on what the previous + * use of this partition ID was, not the new use. + */ asm volatile("ptesync" : : : "memory"); - asm volatile(PPC_TLBIE_5(%0,%1,2,0,0) : : - "r" (TLBIEL_INVAL_SET_LPID), "r" (lpid)); + if (old & PATB_HR) + asm volatile(PPC_TLBIE_5(%0,%1,2,0,1) : : + "r" (TLBIEL_INVAL_SET_LPID), "r" (lpid)); + else + asm volatile(PPC_TLBIE_5(%0,%1,2,0,0) : : + "r" (TLBIEL_INVAL_SET_LPID), "r" (lpid)); asm volatile("eieio; tlbsync; ptesync" : : : "memory"); } EXPORT_SYMBOL_GPL(mmu_partition_table_set_entry); -- cgit v1.2.3 From bc3551257af837fc603d295e59f9e32953525b98 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Mon, 30 Jan 2017 21:21:40 +1100 Subject: powerpc/64: Allow for relocation-on interrupts from guest to host With host and guest both using radix translation, it is feasible for the host to take interrupts that come from the guest with relocation on, and that is in fact what the POWER9 hardware will do when LPCR[AIL] = 3. All such interrupts use HSRR0/1 not SRR0/1 except for system call with LEV=1 (hcall). Therefore this adds the KVM tests to the _HV variants of the relocation-on interrupt handlers, and adds the KVM test to the relocation-on system call entry point. We also instantiate the relocation-on versions of the hypervisor data storage and instruction interrupt handlers, since these can occur with relocation on in radix guests. 
Signed-off-by: Paul Mackerras Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/exception-64s.h | 10 +++--- arch/powerpc/kernel/exceptions-64s.S | 53 +++++++++++++++++--------------- 2 files changed, 34 insertions(+), 29 deletions(-) diff --git a/arch/powerpc/include/asm/exception-64s.h b/arch/powerpc/include/asm/exception-64s.h index 9a5dbfb2d9f2..8fa09fa500f0 100644 --- a/arch/powerpc/include/asm/exception-64s.h +++ b/arch/powerpc/include/asm/exception-64s.h @@ -428,12 +428,12 @@ END_FTR_SECTION_NESTED(ftr,ftr,943) EXCEPTION_RELON_PROLOG_PSERIES_1(label, EXC_STD) #define STD_RELON_EXCEPTION_HV(loc, vec, label) \ - /* No guest interrupts come through here */ \ SET_SCRATCH0(r13); /* save r13 */ \ - EXCEPTION_RELON_PROLOG_PSERIES(PACA_EXGEN, label, EXC_HV, NOTEST, vec); + EXCEPTION_RELON_PROLOG_PSERIES(PACA_EXGEN, label, \ + EXC_HV, KVMTEST_HV, vec); #define STD_RELON_EXCEPTION_HV_OOL(vec, label) \ - EXCEPTION_PROLOG_1(PACA_EXGEN, NOTEST, vec); \ + EXCEPTION_PROLOG_1(PACA_EXGEN, KVMTEST_HV, vec); \ EXCEPTION_RELON_PROLOG_PSERIES_1(label, EXC_HV) /* This associate vector numbers with bits in paca->irq_happened */ @@ -510,10 +510,10 @@ END_FTR_SECTION_NESTED(ftr,ftr,943) #define MASKABLE_RELON_EXCEPTION_HV(loc, vec, label) \ _MASKABLE_RELON_EXCEPTION_PSERIES(vec, label, \ - EXC_HV, SOFTEN_NOTEST_HV) + EXC_HV, SOFTEN_TEST_HV) #define MASKABLE_RELON_EXCEPTION_HV_OOL(vec, label) \ - EXCEPTION_PROLOG_1(PACA_EXGEN, SOFTEN_NOTEST_HV, vec); \ + EXCEPTION_PROLOG_1(PACA_EXGEN, SOFTEN_TEST_HV, vec); \ EXCEPTION_PROLOG_PSERIES_1(label, EXC_HV) /* diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index 65a2559eeb7f..34a04a5fa468 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -830,6 +830,31 @@ EXC_VIRT(trap_0b, 0x4b00, 0x4c00, 0xb00) TRAMP_KVM(PACA_EXGEN, 0xb00) EXC_COMMON(trap_0b_common, 0xb00, unknown_exception) +#ifdef CONFIG_KVM_BOOK3S_64_HANDLER + /* + * If CONFIG_KVM_BOOK3S_64_HANDLER is set, save the PPR (on systems + * that support it) before changing to HMT_MEDIUM. That allows the KVM + * code to save that value into the guest state (it is the guest's PPR + * value). Otherwise just change to HMT_MEDIUM as userspace has + * already saved the PPR. + */ +#define SYSCALL_KVMTEST \ + SET_SCRATCH0(r13); \ + GET_PACA(r13); \ + std r9,PACA_EXGEN+EX_R9(r13); \ + OPT_GET_SPR(r9, SPRN_PPR, CPU_FTR_HAS_PPR); \ + HMT_MEDIUM; \ + std r10,PACA_EXGEN+EX_R10(r13); \ + OPT_SAVE_REG_TO_PACA(PACA_EXGEN+EX_PPR, r9, CPU_FTR_HAS_PPR); \ + mfcr r9; \ + KVMTEST_PR(0xc00); \ + GET_SCRATCH0(r13) + +#else +#define SYSCALL_KVMTEST \ + HMT_MEDIUM +#endif + #define LOAD_SYSCALL_HANDLER(reg) \ __LOAD_HANDLER(reg, system_call_common) @@ -883,34 +908,14 @@ END_FTR_SECTION_IFSET(CPU_FTR_REAL_LE) \ #endif EXC_REAL_BEGIN(system_call, 0xc00, 0xd00) - /* - * If CONFIG_KVM_BOOK3S_64_HANDLER is set, save the PPR (on systems - * that support it) before changing to HMT_MEDIUM. That allows the KVM - * code to save that value into the guest state (it is the guest's PPR - * value). Otherwise just change to HMT_MEDIUM as userspace has - * already saved the PPR. 
- */ -#ifdef CONFIG_KVM_BOOK3S_64_HANDLER - SET_SCRATCH0(r13) - GET_PACA(r13) - std r9,PACA_EXGEN+EX_R9(r13) - OPT_GET_SPR(r9, SPRN_PPR, CPU_FTR_HAS_PPR); - HMT_MEDIUM; - std r10,PACA_EXGEN+EX_R10(r13) - OPT_SAVE_REG_TO_PACA(PACA_EXGEN+EX_PPR, r9, CPU_FTR_HAS_PPR); - mfcr r9 - KVMTEST_PR(0xc00) - GET_SCRATCH0(r13) -#else - HMT_MEDIUM; -#endif + SYSCALL_KVMTEST SYSCALL_PSERIES_1 SYSCALL_PSERIES_2_RFID SYSCALL_PSERIES_3 EXC_REAL_END(system_call, 0xc00, 0xd00) EXC_VIRT_BEGIN(system_call, 0x4c00, 0x4d00) - HMT_MEDIUM + SYSCALL_KVMTEST SYSCALL_PSERIES_1 SYSCALL_PSERIES_2_DIRECT SYSCALL_PSERIES_3 @@ -925,7 +930,7 @@ TRAMP_KVM(PACA_EXGEN, 0xd00) EXC_COMMON(single_step_common, 0xd00, single_step_exception) EXC_REAL_OOL_HV(h_data_storage, 0xe00, 0xe20) -EXC_VIRT_NONE(0x4e00, 0x4e20) +EXC_VIRT_OOL_HV(h_data_storage, 0x4e00, 0x4e20, 0xe00) TRAMP_KVM_HV_SKIP(PACA_EXGEN, 0xe00) EXC_COMMON_BEGIN(h_data_storage_common) mfspr r10,SPRN_HDAR @@ -941,7 +946,7 @@ EXC_COMMON_BEGIN(h_data_storage_common) EXC_REAL_OOL_HV(h_instr_storage, 0xe20, 0xe40) -EXC_VIRT_NONE(0x4e20, 0x4e40) +EXC_VIRT_OOL_HV(h_instr_storage, 0x4e20, 0x4e40, 0xe20) TRAMP_KVM_HV(PACA_EXGEN, 0xe20) EXC_COMMON(h_instr_storage_common, 0xe20, unknown_exception) -- cgit v1.2.3 From c92701322711682de89b2bd0f32affad040b6e86 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Mon, 30 Jan 2017 21:21:41 +1100 Subject: KVM: PPC: Book3S HV: Add userspace interfaces for POWER9 MMU This adds two capabilities and two ioctls to allow userspace to find out about and configure the POWER9 MMU in a guest. The two capabilities tell userspace whether KVM can support a guest using the radix MMU, or using the hashed page table (HPT) MMU with a process table and segment tables. (Note that the MMUs in the POWER9 processor cores do not use the process and segment tables when in HPT mode, but the nest MMU does). The KVM_PPC_CONFIGURE_V3_MMU ioctl allows userspace to specify whether a guest will use the radix MMU or the HPT MMU, and to specify the size and location (in guest space) of the process table. The KVM_PPC_GET_RMMU_INFO ioctl gives userspace information about the radix MMU. It returns a list of supported radix tree geometries (base page size and number of bits indexed at each level of the radix tree) and the encoding used to specify the various page sizes for the TLB invalidate entry instruction. Initially, both capabilities return 0 and the ioctls return -EINVAL, until the necessary infrastructure for them to operate correctly is added. Signed-off-by: Paul Mackerras Signed-off-by: Michael Ellerman --- Documentation/virtual/kvm/api.txt | 83 +++++++++++++++++++++++++++++++++++++ arch/powerpc/include/asm/kvm_ppc.h | 2 + arch/powerpc/include/uapi/asm/kvm.h | 20 +++++++++ arch/powerpc/kvm/book3s_hv.c | 13 ++++++ arch/powerpc/kvm/powerpc.c | 32 ++++++++++++++ include/uapi/linux/kvm.h | 6 +++ 6 files changed, 156 insertions(+) diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index 03145b7cafaa..4470671b0c26 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -3201,6 +3201,71 @@ struct kvm_reinject_control { pit_reinject = 0 (!reinject mode) is recommended, unless running an old operating system that uses the PIT for timing (e.g. Linux 2.4.x). 
+4.99 KVM_PPC_CONFIGURE_V3_MMU + +Capability: KVM_CAP_PPC_RADIX_MMU or KVM_CAP_PPC_HASH_MMU_V3 +Architectures: ppc +Type: vm ioctl +Parameters: struct kvm_ppc_mmuv3_cfg (in) +Returns: 0 on success, + -EFAULT if struct kvm_ppc_mmuv3_cfg cannot be read, + -EINVAL if the configuration is invalid + +This ioctl controls whether the guest will use radix or HPT (hashed +page table) translation, and sets the pointer to the process table for +the guest. + +struct kvm_ppc_mmuv3_cfg { + __u64 flags; + __u64 process_table; +}; + +There are two bits that can be set in flags; KVM_PPC_MMUV3_RADIX and +KVM_PPC_MMUV3_GTSE. KVM_PPC_MMUV3_RADIX, if set, configures the guest +to use radix tree translation, and if clear, to use HPT translation. +KVM_PPC_MMUV3_GTSE, if set and if KVM permits it, configures the guest +to be able to use the global TLB and SLB invalidation instructions; +if clear, the guest may not use these instructions. + +The process_table field specifies the address and size of the guest +process table, which is in the guest's space. This field is formatted +as the second doubleword of the partition table entry, as defined in +the Power ISA V3.00, Book III section 5.7.6.1. + +4.100 KVM_PPC_GET_RMMU_INFO + +Capability: KVM_CAP_PPC_RADIX_MMU +Architectures: ppc +Type: vm ioctl +Parameters: struct kvm_ppc_rmmu_info (out) +Returns: 0 on success, + -EFAULT if struct kvm_ppc_rmmu_info cannot be written, + -EINVAL if no useful information can be returned + +This ioctl returns a structure containing two things: (a) a list +containing supported radix tree geometries, and (b) a list that maps +page sizes to put in the "AP" (actual page size) field for the tlbie +(TLB invalidate entry) instruction. + +struct kvm_ppc_rmmu_info { + struct kvm_ppc_radix_geom { + __u8 page_shift; + __u8 level_bits[4]; + __u8 pad[3]; + } geometries[8]; + __u32 ap_encodings[8]; +}; + +The geometries[] field gives up to 8 supported geometries for the +radix page table, in terms of the log base 2 of the smallest page +size, and the number of bits indexed at each level of the tree, from +the PTE level up to the PGD level in that order. Any unused entries +will have 0 in the page_shift field. + +The ap_encodings gives the supported page sizes and their AP field +encodings, encoded with the AP value in the top 3 bits and the log +base 2 of the page size in the bottom 6 bits. + 5. The kvm_run structure ------------------------ @@ -3942,3 +4007,21 @@ In order to use SynIC, it has to be activated by setting this capability via KVM_ENABLE_CAP ioctl on the vcpu fd. Note that this will disable the use of APIC hardware virtualization even if supported by the CPU, as it's incompatible with SynIC auto-EOI behavior. + +8.3 KVM_CAP_PPC_RADIX_MMU + +Architectures: ppc + +This capability, if KVM_CHECK_EXTENSION indicates that it is +available, means that that the kernel can support guests using the +radix MMU defined in Power ISA V3.00 (as implemented in the POWER9 +processor). + +8.4 KVM_CAP_PPC_HASH_MMU_V3 + +Architectures: ppc + +This capability, if KVM_CHECK_EXTENSION indicates that it is +available, means that that the kernel can support guests using the +hashed page table MMU defined in Power ISA V3.00 (as implemented in +the POWER9 processor), including in-memory segment tables. 
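As a usage illustration only (not part of the patch), a minimal user-space sketch of the negotiation described above, assuming a VM fd obtained from /dev/kvm and kernel headers containing the uapi additions below (the capability constant in the header is spelled KVM_CAP_PPC_MMU_RADIX):

#include <sys/ioctl.h>
#include <linux/kvm.h>

static int configure_radix_mmu(int vmfd, __u64 process_table)
{
	struct kvm_ppc_rmmu_info info;
	struct kvm_ppc_mmuv3_cfg cfg = {
		.flags = KVM_PPC_MMUV3_RADIX | KVM_PPC_MMUV3_GTSE,
		.process_table = process_table,	/* patb1-style doubleword */
	};

	if (ioctl(vmfd, KVM_CHECK_EXTENSION, KVM_CAP_PPC_MMU_RADIX) <= 0)
		return -1;	/* host cannot run radix guests */
	if (ioctl(vmfd, KVM_PPC_GET_RMMU_INFO, &info) < 0)
		return -1;	/* no geometry / AP-encoding information */
	return ioctl(vmfd, KVM_PPC_CONFIGURE_V3_MMU, &cfg);
}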
diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index 2da67bf1f2ec..48c760f89590 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h @@ -291,6 +291,8 @@ struct kvmppc_ops { struct irq_bypass_producer *); void (*irq_bypass_del_producer)(struct irq_bypass_consumer *, struct irq_bypass_producer *); + int (*configure_mmu)(struct kvm *kvm, struct kvm_ppc_mmuv3_cfg *cfg); + int (*get_rmmu_info)(struct kvm *kvm, struct kvm_ppc_rmmu_info *info); }; extern struct kvmppc_ops *kvmppc_hv_ops; diff --git a/arch/powerpc/include/uapi/asm/kvm.h b/arch/powerpc/include/uapi/asm/kvm.h index 3603b6f51b11..cc0908b6c2a0 100644 --- a/arch/powerpc/include/uapi/asm/kvm.h +++ b/arch/powerpc/include/uapi/asm/kvm.h @@ -413,6 +413,26 @@ struct kvm_get_htab_header { __u16 n_invalid; }; +/* For KVM_PPC_CONFIGURE_V3_MMU */ +struct kvm_ppc_mmuv3_cfg { + __u64 flags; + __u64 process_table; /* second doubleword of partition table entry */ +}; + +/* Flag values for KVM_PPC_CONFIGURE_V3_MMU */ +#define KVM_PPC_MMUV3_RADIX 1 /* 1 = radix mode, 0 = HPT */ +#define KVM_PPC_MMUV3_GTSE 2 /* global translation shootdown enb. */ + +/* For KVM_PPC_GET_RMMU_INFO */ +struct kvm_ppc_rmmu_info { + struct kvm_ppc_radix_geom { + __u8 page_shift; + __u8 level_bits[4]; + __u8 pad[3]; + } geometries[8]; + __u32 ap_encodings[8]; +}; + /* Per-vcpu XICS interrupt controller state */ #define KVM_REG_PPC_ICP_STATE (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x8c) diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index ec34e39471a7..5f08ed070ae5 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -3657,6 +3657,17 @@ static void init_default_hcalls(void) } } +/* dummy implementations for now */ +static int kvmhv_configure_mmu(struct kvm *kvm, struct kvm_ppc_mmuv3_cfg *cfg) +{ + return -EINVAL; +} + +static int kvmhv_get_rmmu_info(struct kvm *kvm, struct kvm_ppc_rmmu_info *info) +{ + return -EINVAL; +} + static struct kvmppc_ops kvm_ops_hv = { .get_sregs = kvm_arch_vcpu_ioctl_get_sregs_hv, .set_sregs = kvm_arch_vcpu_ioctl_set_sregs_hv, @@ -3694,6 +3705,8 @@ static struct kvmppc_ops kvm_ops_hv = { .irq_bypass_add_producer = kvmppc_irq_bypass_add_producer_hv, .irq_bypass_del_producer = kvmppc_irq_bypass_del_producer_hv, #endif + .configure_mmu = kvmhv_configure_mmu, + .get_rmmu_info = kvmhv_get_rmmu_info, }; static int kvm_init_subcore_bitmap(void) diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index cd892dec7cb6..38c0d154c01e 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -565,6 +565,13 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_PPC_HWRNG: r = kvmppc_hwrng_present(); break; + case KVM_CAP_PPC_MMU_RADIX: + r = !!(0 && hv_enabled && radix_enabled()); + break; + case KVM_CAP_PPC_MMU_HASH_V3: + r = !!(0 && hv_enabled && !radix_enabled() && + cpu_has_feature(CPU_FTR_ARCH_300)); + break; #endif case KVM_CAP_SYNC_MMU: #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE @@ -1468,6 +1475,31 @@ long kvm_arch_vm_ioctl(struct file *filp, r = kvm_vm_ioctl_rtas_define_token(kvm, argp); break; } + case KVM_PPC_CONFIGURE_V3_MMU: { + struct kvm *kvm = filp->private_data; + struct kvm_ppc_mmuv3_cfg cfg; + + r = -EINVAL; + if (!kvm->arch.kvm_ops->configure_mmu) + goto out; + r = -EFAULT; + if (copy_from_user(&cfg, argp, sizeof(cfg))) + goto out; + r = kvm->arch.kvm_ops->configure_mmu(kvm, &cfg); + break; + } + case KVM_PPC_GET_RMMU_INFO: { + struct kvm *kvm = filp->private_data; + struct 
kvm_ppc_rmmu_info info; + + r = -EINVAL; + if (!kvm->arch.kvm_ops->get_rmmu_info) + goto out; + r = kvm->arch.kvm_ops->get_rmmu_info(kvm, &info); + if (r >= 0 && copy_to_user(argp, &info, sizeof(info))) + r = -EFAULT; + break; + } default: { struct kvm *kvm = filp->private_data; r = kvm->arch.kvm_ops->arch_vm_ioctl(filp, ioctl, arg); diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index cac48eda1075..e0035808c814 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -871,6 +871,8 @@ struct kvm_ppc_smmu_info { #define KVM_CAP_S390_USER_INSTR0 130 #define KVM_CAP_MSI_DEVID 131 #define KVM_CAP_PPC_HTM 132 +#define KVM_CAP_PPC_MMU_RADIX 134 +#define KVM_CAP_PPC_MMU_HASH_V3 135 #ifdef KVM_CAP_IRQ_ROUTING @@ -1187,6 +1189,10 @@ struct kvm_s390_ucas_mapping { #define KVM_ARM_SET_DEVICE_ADDR _IOW(KVMIO, 0xab, struct kvm_arm_device_addr) /* Available with KVM_CAP_PPC_RTAS */ #define KVM_PPC_RTAS_DEFINE_TOKEN _IOW(KVMIO, 0xac, struct kvm_rtas_token_args) +/* Available with KVM_CAP_PPC_RADIX_MMU or KVM_CAP_PPC_HASH_MMU_V3 */ +#define KVM_PPC_CONFIGURE_V3_MMU _IOW(KVMIO, 0xaf, struct kvm_ppc_mmuv3_cfg) +/* Available with KVM_CAP_PPC_RADIX_MMU */ +#define KVM_PPC_GET_RMMU_INFO _IOW(KVMIO, 0xb0, struct kvm_ppc_rmmu_info) /* ioctl for vm fd */ #define KVM_CREATE_DEVICE _IOWR(KVMIO, 0xe0, struct kvm_create_device) -- cgit v1.2.3 From 468808bd35c4aa3cf7d9fde0ebb010270038734b Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Mon, 30 Jan 2017 21:21:42 +1100 Subject: KVM: PPC: Book3S HV: Set process table for HPT guests on POWER9 This adds the implementation of the KVM_PPC_CONFIGURE_V3_MMU ioctl for HPT guests on POWER9. With this, we can return 1 for the KVM_CAP_PPC_MMU_HASH_V3 capability. Signed-off-by: Paul Mackerras Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/kvm_host.h | 1 + arch/powerpc/kvm/book3s_hv.c | 35 +++++++++++++++++++++++++++++++---- arch/powerpc/kvm/powerpc.c | 2 +- 3 files changed, 33 insertions(+), 5 deletions(-) diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index e59b172666cd..944532dc4a57 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -264,6 +264,7 @@ struct kvm_arch { atomic_t hpte_mod_interest; cpumask_t need_tlb_flush; int hpt_cma_alloc; + u64 process_table; struct dentry *debugfs_dir; struct dentry *htab_dentry; #endif /* CONFIG_KVM_BOOK3S_HV_POSSIBLE */ diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 5f08ed070ae5..bb2854314ba4 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -3091,8 +3091,8 @@ static void kvmppc_setup_partition_table(struct kvm *kvm) /* HTABSIZE and HTABORG fields */ dw0 |= kvm->arch.sdr1; - /* Second dword has GR=0; other fields are unused since UPRT=0 */ - dw1 = 0; + /* Second dword as set by userspace */ + dw1 = kvm->arch.process_table; mmu_partition_table_set_entry(kvm->arch.lpid, dw0, dw1); } @@ -3657,10 +3657,37 @@ static void init_default_hcalls(void) } } -/* dummy implementations for now */ static int kvmhv_configure_mmu(struct kvm *kvm, struct kvm_ppc_mmuv3_cfg *cfg) { - return -EINVAL; + unsigned long lpcr; + + /* If not on a POWER9, reject it */ + if (!cpu_has_feature(CPU_FTR_ARCH_300)) + return -ENODEV; + + /* If any unknown flags set, reject it */ + if (cfg->flags & ~(KVM_PPC_MMUV3_RADIX | KVM_PPC_MMUV3_GTSE)) + return -EINVAL; + + /* We can't do radix yet */ + if (cfg->flags & KVM_PPC_MMUV3_RADIX) + return -EINVAL; + + /* GR (guest radix) bit in 
process_table field must match */ + if (cfg->process_table & PATB_GR) + return -EINVAL; + + /* Process table size field must be reasonable, i.e. <= 24 */ + if ((cfg->process_table & PRTS_MASK) > 24) + return -EINVAL; + + kvm->arch.process_table = cfg->process_table; + kvmppc_setup_partition_table(kvm); + + lpcr = (cfg->flags & KVM_PPC_MMUV3_GTSE) ? LPCR_GTSE : 0; + kvmppc_update_lpcr(kvm, lpcr, LPCR_GTSE); + + return 0; } static int kvmhv_get_rmmu_info(struct kvm *kvm, struct kvm_ppc_rmmu_info *info) diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index 38c0d154c01e..1476a480745e 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -569,7 +569,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) r = !!(0 && hv_enabled && radix_enabled()); break; case KVM_CAP_PPC_MMU_HASH_V3: - r = !!(0 && hv_enabled && !radix_enabled() && + r = !!(hv_enabled && !radix_enabled() && cpu_has_feature(CPU_FTR_ARCH_300)); break; #endif -- cgit v1.2.3 From ef8c640cb9cc865a461827b698fcc55b0ecaa600 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Mon, 30 Jan 2017 21:21:43 +1100 Subject: KVM: PPC: Book3S HV: Use ASDR for HPT guests on POWER9 POWER9 adds a register called ASDR (Access Segment Descriptor Register), which is set by hypervisor data/instruction storage interrupts to contain the segment descriptor for the address being accessed, assuming the guest is using HPT translation. (For radix guests, it contains the guest real address of the access.) Thus, for HPT guests on POWER9, we can use this register rather than looking up the SLB with the slbfee. instruction. Signed-off-by: Paul Mackerras Signed-off-by: Michael Ellerman --- arch/powerpc/kvm/book3s_hv_rmhandlers.S | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S index 264ac9ad4585..01f4392a284d 100644 --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S @@ -1715,6 +1715,10 @@ kvmppc_hdsi: /* HPTE not found fault or protection fault? */ andis. r0, r6, (DSISR_NOHPTE | DSISR_PROTFAULT)@h beq 1f /* if not, send it to the guest */ +BEGIN_FTR_SECTION + mfspr r5, SPRN_ASDR /* on POWER9, use ASDR to get VSID */ + b 4f +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300) andi. r0, r11, MSR_DR /* data relocation enabled? */ beq 3f clrrdi r0, r4, 28 @@ -1791,6 +1795,10 @@ fast_interrupt_c_return: kvmppc_hisi: andis. r0, r11, SRR1_ISI_NOPT@h beq 1f +BEGIN_FTR_SECTION + mfspr r5, SPRN_ASDR /* on POWER9, use ASDR to get VSID */ + b 4f +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300) andi. r0, r11, MSR_IR /* instruction relocation enabled? */ beq 3f clrrdi r0, r10, 28 -- cgit v1.2.3 From 9e04ba69beec372ddf857c700ff922e95f50b0d0 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Mon, 30 Jan 2017 21:21:44 +1100 Subject: KVM: PPC: Book3S HV: Add basic infrastructure for radix guests This adds a field in struct kvm_arch and an inline helper to indicate whether a guest is a radix guest or not, plus a new file to contain the radix MMU code, which currently contains just a translate function which knows how to traverse the guest page tables to translate an address. 
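As a rough stand-alone illustration of the arithmetic that walk performs (not part of the patch; the 52-bit address-space size and the 13/9/9/9 level widths follow the code added below, the rest of the program is invented for the example):

        /*
         * User-space sketch of the index arithmetic in the new radix
         * translate function: a 52-bit effective address is consumed
         * 13/9/9/9 bits per level (13/9/9/5 for 64k base pages), and
         * whatever remains is the offset within the page.  No guest
         * memory is read here.
         */
        #include <stdio.h>

        static const int level_bits[4] = { 13, 9, 9, 9 };  /* 4k base pages */

        int main(void)
        {
                unsigned long eaddr = 0x0000aabbccdde123ul;
                unsigned long offset = 52;      /* 52-bit space supported */
                int i;

                for (i = 0; i < 4; ++i) {
                        offset -= level_bits[i];
                        printf("level %d index: 0x%lx\n", 3 - i,
                               (eaddr >> offset) & ((1ul << level_bits[i]) - 1));
                }
                printf("page offset:   0x%lx (page size 2^%lu)\n",
                       eaddr & ((1ul << offset) - 1), offset);
                return 0;
        }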
Signed-off-by: Paul Mackerras Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/kvm_book3s.h | 3 + arch/powerpc/include/asm/kvm_book3s_64.h | 6 ++ arch/powerpc/include/asm/kvm_host.h | 2 + arch/powerpc/kvm/Makefile | 3 +- arch/powerpc/kvm/book3s_64_mmu_hv.c | 10 ++- arch/powerpc/kvm/book3s_64_mmu_radix.c | 139 +++++++++++++++++++++++++++++++ 6 files changed, 160 insertions(+), 3 deletions(-) create mode 100644 arch/powerpc/kvm/book3s_64_mmu_radix.c diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h index 5cf306ae0ac3..7adfcc03a35f 100644 --- a/arch/powerpc/include/asm/kvm_book3s.h +++ b/arch/powerpc/include/asm/kvm_book3s.h @@ -182,6 +182,9 @@ extern void kvmppc_mmu_hpte_sysexit(void); extern int kvmppc_mmu_hv_init(void); extern int kvmppc_book3s_hcall_implemented(struct kvm *kvm, unsigned long hc); +extern int kvmppc_mmu_radix_xlate(struct kvm_vcpu *vcpu, gva_t eaddr, + struct kvmppc_pte *gpte, bool data, bool iswrite); + /* XXX remove this export when load_last_inst() is generic */ extern int kvmppc_ld(struct kvm_vcpu *vcpu, ulong *eaddr, int size, void *ptr, bool data); extern void kvmppc_book3s_queue_irqprio(struct kvm_vcpu *vcpu, unsigned int vec); diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h b/arch/powerpc/include/asm/kvm_book3s_64.h index 848292176908..0db010cc4e65 100644 --- a/arch/powerpc/include/asm/kvm_book3s_64.h +++ b/arch/powerpc/include/asm/kvm_book3s_64.h @@ -36,6 +36,12 @@ static inline void svcpu_put(struct kvmppc_book3s_shadow_vcpu *svcpu) #endif #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE + +static inline bool kvm_is_radix(struct kvm *kvm) +{ + return kvm->arch.radix; +} + #define KVM_DEFAULT_HPT_ORDER 24 /* 16MB HPT by default */ #endif diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index 944532dc4a57..fb73518bd03b 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -264,6 +264,8 @@ struct kvm_arch { atomic_t hpte_mod_interest; cpumask_t need_tlb_flush; int hpt_cma_alloc; + u8 radix; + pgd_t *pgtable; u64 process_table; struct dentry *debugfs_dir; struct dentry *htab_dentry; diff --git a/arch/powerpc/kvm/Makefile b/arch/powerpc/kvm/Makefile index 7dd89b79d038..b87ccde2137a 100644 --- a/arch/powerpc/kvm/Makefile +++ b/arch/powerpc/kvm/Makefile @@ -70,7 +70,8 @@ endif kvm-hv-y += \ book3s_hv.o \ book3s_hv_interrupts.o \ - book3s_64_mmu_hv.o + book3s_64_mmu_hv.o \ + book3s_64_mmu_radix.o kvm-book3s_64-builtin-xics-objs-$(CONFIG_KVM_XICS) := \ book3s_hv_rm_xics.o diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c index b795dd1ac2ef..c208bf3b252f 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_hv.c +++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c @@ -119,6 +119,9 @@ long kvmppc_alloc_reset_hpt(struct kvm *kvm, u32 *htab_orderp) long err = -EBUSY; long order; + if (kvm_is_radix(kvm)) + return -EINVAL; + mutex_lock(&kvm->lock); if (kvm->arch.hpte_setup_done) { kvm->arch.hpte_setup_done = 0; @@ -157,7 +160,7 @@ void kvmppc_free_hpt(struct kvm *kvm) if (kvm->arch.hpt_cma_alloc) kvm_release_hpt(virt_to_page(kvm->arch.hpt_virt), 1 << (kvm->arch.hpt_order - PAGE_SHIFT)); - else + else if (kvm->arch.hpt_virt) free_pages(kvm->arch.hpt_virt, kvm->arch.hpt_order - PAGE_SHIFT); } @@ -1675,7 +1678,10 @@ void kvmppc_mmu_book3s_hv_init(struct kvm_vcpu *vcpu) vcpu->arch.slb_nr = 32; /* POWER7/POWER8 */ - mmu->xlate = kvmppc_mmu_book3s_64_hv_xlate; + if (kvm_is_radix(vcpu->kvm)) + mmu->xlate = kvmppc_mmu_radix_xlate; + else + 
mmu->xlate = kvmppc_mmu_book3s_64_hv_xlate; mmu->reset_msr = kvmppc_mmu_book3s_64_hv_reset_msr; vcpu->arch.hflags |= BOOK3S_HFLAG_SLB; diff --git a/arch/powerpc/kvm/book3s_64_mmu_radix.c b/arch/powerpc/kvm/book3s_64_mmu_radix.c new file mode 100644 index 000000000000..9091407fbfd4 --- /dev/null +++ b/arch/powerpc/kvm/book3s_64_mmu_radix.c @@ -0,0 +1,139 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, as + * published by the Free Software Foundation. + * + * Copyright 2016 Paul Mackerras, IBM Corp. + */ + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +/* + * Supported radix tree geometry. + * Like p9, we support either 5 or 9 bits at the first (lowest) level, + * for a page size of 64k or 4k. + */ +static int p9_supported_radix_bits[4] = { 5, 9, 9, 13 }; + +int kvmppc_mmu_radix_xlate(struct kvm_vcpu *vcpu, gva_t eaddr, + struct kvmppc_pte *gpte, bool data, bool iswrite) +{ + struct kvm *kvm = vcpu->kvm; + u32 pid; + int ret, level, ps; + __be64 prte, rpte; + unsigned long root, pte, index; + unsigned long rts, bits, offset; + unsigned long gpa; + unsigned long proc_tbl_size; + + /* Work out effective PID */ + switch (eaddr >> 62) { + case 0: + pid = vcpu->arch.pid; + break; + case 3: + pid = 0; + break; + default: + return -EINVAL; + } + proc_tbl_size = 1 << ((kvm->arch.process_table & PRTS_MASK) + 12); + if (pid * 16 >= proc_tbl_size) + return -EINVAL; + + /* Read partition table to find root of tree for effective PID */ + ret = kvm_read_guest(kvm, kvm->arch.process_table + pid * 16, + &prte, sizeof(prte)); + if (ret) + return ret; + + root = be64_to_cpu(prte); + rts = ((root & RTS1_MASK) >> (RTS1_SHIFT - 3)) | + ((root & RTS2_MASK) >> RTS2_SHIFT); + bits = root & RPDS_MASK; + root = root & RPDB_MASK; + + /* P9 DD1 interprets RTS (radix tree size) differently */ + offset = rts + 31; + if (cpu_has_feature(CPU_FTR_POWER9_DD1)) + offset -= 3; + + /* current implementations only support 52-bit space */ + if (offset != 52) + return -EINVAL; + + for (level = 3; level >= 0; --level) { + if (level && bits != p9_supported_radix_bits[level]) + return -EINVAL; + if (level == 0 && !(bits == 5 || bits == 9)) + return -EINVAL; + offset -= bits; + index = (eaddr >> offset) & ((1UL << bits) - 1); + /* check that low bits of page table base are zero */ + if (root & ((1UL << (bits + 3)) - 1)) + return -EINVAL; + ret = kvm_read_guest(kvm, root + index * 8, + &rpte, sizeof(rpte)); + if (ret) + return ret; + pte = __be64_to_cpu(rpte); + if (!(pte & _PAGE_PRESENT)) + return -ENOENT; + if (pte & _PAGE_PTE) + break; + bits = pte & 0x1f; + root = pte & 0x0fffffffffffff00ul; + } + /* need a leaf at lowest level; 512GB pages not supported */ + if (level < 0 || level == 3) + return -EINVAL; + + /* offset is now log base 2 of the page size */ + gpa = pte & 0x01fffffffffff000ul; + if (gpa & ((1ul << offset) - 1)) + return -EINVAL; + gpa += eaddr & ((1ul << offset) - 1); + for (ps = MMU_PAGE_4K; ps < MMU_PAGE_COUNT; ++ps) + if (offset == mmu_psize_defs[ps].shift) + break; + gpte->page_size = ps; + + gpte->eaddr = eaddr; + gpte->raddr = gpa; + + /* Work out permissions */ + gpte->may_read = !!(pte & _PAGE_READ); + gpte->may_write = !!(pte & _PAGE_WRITE); + gpte->may_execute = !!(pte & _PAGE_EXEC); + if (kvmppc_get_msr(vcpu) & MSR_PR) { + if (pte & _PAGE_PRIVILEGED) { + gpte->may_read = 0; + gpte->may_write = 0; + gpte->may_execute = 0; + } + } else { + if (!(pte & 
_PAGE_PRIVILEGED)) { + /* Check AMR/IAMR to see if strict mode is in force */ + if (vcpu->arch.amr & (1ul << 62)) + gpte->may_read = 0; + if (vcpu->arch.amr & (1ul << 63)) + gpte->may_write = 0; + if (vcpu->arch.iamr & (1ul << 62)) + gpte->may_execute = 0; + } + } + + return 0; +} + -- cgit v1.2.3 From f4c51f841d2ac7d36cacb84efbc383190861f87c Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Mon, 30 Jan 2017 21:21:45 +1100 Subject: KVM: PPC: Book3S HV: Modify guest entry/exit paths to handle radix guests This adds code to branch around the parts that radix guests don't need - clearing and loading the SLB with the guest SLB contents, saving the guest SLB contents on exit, and restoring the host SLB contents. Since the host is now using radix, we need to save and restore the host value for the PID register. On hypervisor data/instruction storage interrupts, we don't do the guest HPT lookup on radix, but just save the guest physical address for the fault (from the ASDR register) in the vcpu struct. Signed-off-by: Paul Mackerras Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/kvm_host.h | 1 + arch/powerpc/kernel/asm-offsets.c | 2 ++ arch/powerpc/kvm/book3s_hv_rmhandlers.S | 57 ++++++++++++++++++++++++++------- 3 files changed, 49 insertions(+), 11 deletions(-) diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index fb73518bd03b..da1421a4d6f2 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -606,6 +606,7 @@ struct kvm_vcpu_arch { ulong fault_dar; u32 fault_dsisr; unsigned long intr_msr; + ulong fault_gpa; /* guest real address of page fault (POWER9) */ #endif #ifdef CONFIG_BOOKE diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c index 0601e6a7297c..3afa0ad9837f 100644 --- a/arch/powerpc/kernel/asm-offsets.c +++ b/arch/powerpc/kernel/asm-offsets.c @@ -498,6 +498,7 @@ int main(void) DEFINE(KVM_NEED_FLUSH, offsetof(struct kvm, arch.need_tlb_flush.bits)); DEFINE(KVM_ENABLED_HCALLS, offsetof(struct kvm, arch.enabled_hcalls)); DEFINE(KVM_VRMA_SLB_V, offsetof(struct kvm, arch.vrma_slb_v)); + DEFINE(KVM_RADIX, offsetof(struct kvm, arch.radix)); DEFINE(VCPU_DSISR, offsetof(struct kvm_vcpu, arch.shregs.dsisr)); DEFINE(VCPU_DAR, offsetof(struct kvm_vcpu, arch.shregs.dar)); DEFINE(VCPU_VPA, offsetof(struct kvm_vcpu, arch.vpa.pinned_addr)); @@ -537,6 +538,7 @@ int main(void) DEFINE(VCPU_SLB_NR, offsetof(struct kvm_vcpu, arch.slb_nr)); DEFINE(VCPU_FAULT_DSISR, offsetof(struct kvm_vcpu, arch.fault_dsisr)); DEFINE(VCPU_FAULT_DAR, offsetof(struct kvm_vcpu, arch.fault_dar)); + DEFINE(VCPU_FAULT_GPA, offsetof(struct kvm_vcpu, arch.fault_gpa)); DEFINE(VCPU_INTR_MSR, offsetof(struct kvm_vcpu, arch.intr_msr)); DEFINE(VCPU_LAST_INST, offsetof(struct kvm_vcpu, arch.last_inst)); DEFINE(VCPU_TRAP, offsetof(struct kvm_vcpu, arch.trap)); diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S index 01f4392a284d..7fc7a9221509 100644 --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S @@ -518,6 +518,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) /* Stack frame offsets */ #define STACK_SLOT_TID (112-16) #define STACK_SLOT_PSSCR (112-24) +#define STACK_SLOT_PID (112-32) .global kvmppc_hv_entry kvmppc_hv_entry: @@ -530,6 +531,7 @@ kvmppc_hv_entry: * R1 = host R1 * R2 = TOC * all other volatile GPRS = free + * Does not preserve non-volatile GPRs or CR fields */ mflr r0 std r0, PPC_LR_STKOFF(r1) @@ -549,32 +551,38 @@ 
kvmppc_hv_entry: bl kvmhv_start_timing 1: #endif - /* Clear out SLB */ + + /* Use cr7 as an indication of radix mode */ + ld r5, HSTATE_KVM_VCORE(r13) + ld r9, VCORE_KVM(r5) /* pointer to struct kvm */ + lbz r0, KVM_RADIX(r9) + cmpwi cr7, r0, 0 + + /* Clear out SLB if hash */ + bne cr7, 2f li r6,0 slbmte r6,r6 slbia ptesync - +2: /* * POWER7/POWER8 host -> guest partition switch code. * We don't have to lock against concurrent tlbies, * but we do have to coordinate across hardware threads. */ /* Set bit in entry map iff exit map is zero. */ - ld r5, HSTATE_KVM_VCORE(r13) li r7, 1 lbz r6, HSTATE_PTID(r13) sld r7, r7, r6 - addi r9, r5, VCORE_ENTRY_EXIT -21: lwarx r3, 0, r9 + addi r8, r5, VCORE_ENTRY_EXIT +21: lwarx r3, 0, r8 cmpwi r3, 0x100 /* any threads starting to exit? */ bge secondary_too_late /* if so we're too late to the party */ or r3, r3, r7 - stwcx. r3, 0, r9 + stwcx. r3, 0, r8 bne 21b /* Primary thread switches to guest partition. */ - ld r9,VCORE_KVM(r5) /* pointer to struct kvm */ cmpwi r6,0 bne 10f lwz r7,KVM_LPID(r9) @@ -658,7 +666,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) beq kvmppc_primary_no_guest kvmppc_got_guest: - /* Load up guest SLB entries */ + /* Load up guest SLB entries (N.B. slb_max will be 0 for radix) */ lwz r5,VCPU_SLB_MAX(r4) cmpwi r5,0 beq 9f @@ -696,8 +704,10 @@ kvmppc_got_guest: BEGIN_FTR_SECTION mfspr r5, SPRN_TIDR mfspr r6, SPRN_PSSCR + mfspr r7, SPRN_PID std r5, STACK_SLOT_TID(r1) std r6, STACK_SLOT_PSSCR(r1) + std r7, STACK_SLOT_PID(r1) END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300) BEGIN_FTR_SECTION @@ -1293,11 +1303,15 @@ mc_cont: mtspr SPRN_CTRLT,r6 4: /* Read the guest SLB and save it away */ + ld r5, VCPU_KVM(r9) + lbz r0, KVM_RADIX(r5) + cmpwi r0, 0 + li r5, 0 + bne 3f /* for radix, save 0 entries */ lwz r0,VCPU_SLB_NR(r9) /* number of entries in SLB */ mtctr r0 li r6,0 addi r7,r9,VCPU_SLB - li r5,0 1: slbmfee r8,r6 andis. r0,r8,SLB_ESID_V@h beq 2f @@ -1309,7 +1323,7 @@ mc_cont: addi r5,r5,1 2: addi r6,r6,1 bdnz 1b - stw r5,VCPU_SLB_MAX(r9) +3: stw r5,VCPU_SLB_MAX(r9) /* * Save the guest PURR/SPURR @@ -1558,8 +1572,10 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) BEGIN_FTR_SECTION ld r5, STACK_SLOT_TID(r1) ld r6, STACK_SLOT_PSSCR(r1) + ld r7, STACK_SLOT_PID(r1) mtspr SPRN_TIDR, r5 mtspr SPRN_PSSCR, r6 + mtspr SPRN_PID, r7 END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300) /* @@ -1671,6 +1687,9 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) isync /* load host SLB entries */ +BEGIN_MMU_FTR_SECTION + b 0f +END_MMU_FTR_SECTION_IFSET(MMU_FTR_TYPE_RADIX) ld r8,PACA_SLBSHADOWPTR(r13) .rept SLB_NUM_BOLTED @@ -1683,7 +1702,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) slbmte r6,r5 1: addi r8,r8,16 .endr - +0: #ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING /* Finish timing, if we have a vcpu */ ld r4, HSTATE_KVM_VCPU(r13) @@ -1710,8 +1729,12 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) * reflect the HDSI to the guest as a DSI. */ kvmppc_hdsi: + ld r3, VCPU_KVM(r9) + lbz r0, KVM_RADIX(r3) + cmpwi r0, 0 mfspr r4, SPRN_HDAR mfspr r6, SPRN_HDSISR + bne .Lradix_hdsi /* on radix, just save DAR/DSISR/ASDR */ /* HPTE not found fault or protection fault? */ andis. 
r0, r6, (DSISR_NOHPTE | DSISR_PROTFAULT)@h beq 1f /* if not, send it to the guest */ @@ -1788,11 +1811,23 @@ fast_interrupt_c_return: stb r0, HSTATE_IN_GUEST(r13) b guest_exit_cont +.Lradix_hdsi: + std r4, VCPU_FAULT_DAR(r9) + stw r6, VCPU_FAULT_DSISR(r9) +.Lradix_hisi: + mfspr r5, SPRN_ASDR + std r5, VCPU_FAULT_GPA(r9) + b guest_exit_cont + /* * Similarly for an HISI, reflect it to the guest as an ISI unless * it is an HPTE not found fault for a page that we have paged out. */ kvmppc_hisi: + ld r3, VCPU_KVM(r9) + lbz r0, KVM_RADIX(r3) + cmpwi r0, 0 + bne .Lradix_hisi /* for radix, just save ASDR */ andis. r0, r11, SRR1_ISI_NOPT@h beq 1f BEGIN_FTR_SECTION -- cgit v1.2.3 From 5a319350a46572d073042a3194676099dd2c135d Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Mon, 30 Jan 2017 21:21:46 +1100 Subject: KVM: PPC: Book3S HV: Page table construction and page faults for radix guests This adds the code to construct the second-level ("partition-scoped" in architecturese) page tables for guests using the radix MMU. Apart from the PGD level, which is allocated when the guest is created, the rest of the tree is all constructed in response to hypervisor page faults. As well as hypervisor page faults for missing pages, we also get faults for reference/change (RC) bits needing to be set, as well as various other error conditions. For now, we only set the R or C bit in the guest page table if the same bit is set in the host PTE for the backing page. This code can take advantage of the guest being backed with either transparent or ordinary 2MB huge pages, and insert 2MB page entries into the guest page tables. There is no support for 1GB huge pages yet. Signed-off-by: Paul Mackerras Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/kvm_book3s.h | 8 + arch/powerpc/kvm/book3s.c | 1 + arch/powerpc/kvm/book3s_64_mmu_hv.c | 7 +- arch/powerpc/kvm/book3s_64_mmu_radix.c | 385 +++++++++++++++++++++++++++++++++ arch/powerpc/kvm/book3s_hv.c | 17 +- 5 files changed, 415 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h index 7adfcc03a35f..ff5cd5c5ce8d 100644 --- a/arch/powerpc/include/asm/kvm_book3s.h +++ b/arch/powerpc/include/asm/kvm_book3s.h @@ -170,6 +170,8 @@ extern int kvmppc_book3s_hv_page_fault(struct kvm_run *run, unsigned long status); extern long kvmppc_hv_find_lock_hpte(struct kvm *kvm, gva_t eaddr, unsigned long slb_v, unsigned long valid); +extern int kvmppc_hv_emulate_mmio(struct kvm_run *run, struct kvm_vcpu *vcpu, + unsigned long gpa, gva_t ea, int is_store); extern void kvmppc_mmu_hpte_cache_map(struct kvm_vcpu *vcpu, struct hpte_cache *pte); extern struct hpte_cache *kvmppc_mmu_hpte_cache_next(struct kvm_vcpu *vcpu); @@ -182,8 +184,14 @@ extern void kvmppc_mmu_hpte_sysexit(void); extern int kvmppc_mmu_hv_init(void); extern int kvmppc_book3s_hcall_implemented(struct kvm *kvm, unsigned long hc); +extern int kvmppc_book3s_radix_page_fault(struct kvm_run *run, + struct kvm_vcpu *vcpu, + unsigned long ea, unsigned long dsisr); extern int kvmppc_mmu_radix_xlate(struct kvm_vcpu *vcpu, gva_t eaddr, struct kvmppc_pte *gpte, bool data, bool iswrite); +extern void kvmppc_free_radix(struct kvm *kvm); +extern int kvmppc_radix_init(void); +extern void kvmppc_radix_exit(void); /* XXX remove this export when load_last_inst() is generic */ extern int kvmppc_ld(struct kvm_vcpu *vcpu, ulong *eaddr, int size, void *ptr, bool data); diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c index 019f008775b9..b6b5c185bd92 
100644 --- a/arch/powerpc/kvm/book3s.c +++ b/arch/powerpc/kvm/book3s.c @@ -239,6 +239,7 @@ void kvmppc_core_queue_data_storage(struct kvm_vcpu *vcpu, ulong dar, kvmppc_set_dsisr(vcpu, flags); kvmppc_book3s_queue_irqprio(vcpu, BOOK3S_INTERRUPT_DATA_STORAGE); } +EXPORT_SYMBOL_GPL(kvmppc_core_queue_data_storage); /* used by kvm_hv */ void kvmppc_core_queue_inst_storage(struct kvm_vcpu *vcpu, ulong flags) { diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c index c208bf3b252f..57690c22716d 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_hv.c +++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c @@ -395,8 +395,8 @@ static int instruction_is_store(unsigned int instr) return (instr & mask) != 0; } -static int kvmppc_hv_emulate_mmio(struct kvm_run *run, struct kvm_vcpu *vcpu, - unsigned long gpa, gva_t ea, int is_store) +int kvmppc_hv_emulate_mmio(struct kvm_run *run, struct kvm_vcpu *vcpu, + unsigned long gpa, gva_t ea, int is_store) { u32 last_inst; @@ -461,6 +461,9 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu, unsigned long rcbits; long mmio_update; + if (kvm_is_radix(kvm)) + return kvmppc_book3s_radix_page_fault(run, vcpu, ea, dsisr); + /* * Real-mode code has already searched the HPT and found the * entry we're interested in. Lock the entry and check that diff --git a/arch/powerpc/kvm/book3s_64_mmu_radix.c b/arch/powerpc/kvm/book3s_64_mmu_radix.c index 9091407fbfd4..865ea9bca364 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_radix.c +++ b/arch/powerpc/kvm/book3s_64_mmu_radix.c @@ -137,3 +137,388 @@ int kvmppc_mmu_radix_xlate(struct kvm_vcpu *vcpu, gva_t eaddr, return 0; } +#ifdef CONFIG_PPC_64K_PAGES +#define MMU_BASE_PSIZE MMU_PAGE_64K +#else +#define MMU_BASE_PSIZE MMU_PAGE_4K +#endif + +static void kvmppc_radix_tlbie_page(struct kvm *kvm, unsigned long addr, + unsigned int pshift) +{ + int psize = MMU_BASE_PSIZE; + + if (pshift >= PMD_SHIFT) + psize = MMU_PAGE_2M; + addr &= ~0xfffUL; + addr |= mmu_psize_defs[psize].ap << 5; + asm volatile("ptesync": : :"memory"); + asm volatile(PPC_TLBIE_5(%0, %1, 0, 0, 1) + : : "r" (addr), "r" (kvm->arch.lpid) : "memory"); + asm volatile("ptesync": : :"memory"); +} + +void kvmppc_radix_update_pte(struct kvm *kvm, pte_t *ptep, unsigned long clr, + unsigned long set, unsigned long addr, + unsigned int shift) +{ + if (!(clr & _PAGE_PRESENT) && cpu_has_feature(CPU_FTR_POWER9_DD1) && + pte_present(*ptep)) { + /* have to invalidate it first */ + __radix_pte_update(ptep, _PAGE_PRESENT, 0); + kvmppc_radix_tlbie_page(kvm, addr, shift); + set |= _PAGE_PRESENT; + } + __radix_pte_update(ptep, clr, set); +} + +void kvmppc_radix_set_pte_at(struct kvm *kvm, unsigned long addr, + pte_t *ptep, pte_t pte) +{ + radix__set_pte_at(kvm->mm, addr, ptep, pte, 0); +} + +static struct kmem_cache *kvm_pte_cache; + +static pte_t *kvmppc_pte_alloc(void) +{ + return kmem_cache_alloc(kvm_pte_cache, GFP_KERNEL); +} + +static void kvmppc_pte_free(pte_t *ptep) +{ + kmem_cache_free(kvm_pte_cache, ptep); +} + +static int kvmppc_create_pte(struct kvm *kvm, pte_t pte, unsigned long gpa, + unsigned int level, unsigned long mmu_seq) +{ + pgd_t *pgd; + pud_t *pud, *new_pud = NULL; + pmd_t *pmd, *new_pmd = NULL; + pte_t *ptep, *new_ptep = NULL; + int ret; + + /* Traverse the guest's 2nd-level tree, allocate new levels needed */ + pgd = kvm->arch.pgtable + pgd_index(gpa); + pud = NULL; + if (pgd_present(*pgd)) + pud = pud_offset(pgd, gpa); + else + new_pud = pud_alloc_one(kvm->mm, gpa); + + pmd = NULL; + if (pud && pud_present(*pud)) + pmd = 
pmd_offset(pud, gpa); + else + new_pmd = pmd_alloc_one(kvm->mm, gpa); + + if (level == 0 && !(pmd && pmd_present(*pmd))) + new_ptep = kvmppc_pte_alloc(); + + /* Check if we might have been invalidated; let the guest retry if so */ + spin_lock(&kvm->mmu_lock); + ret = -EAGAIN; + if (mmu_notifier_retry(kvm, mmu_seq)) + goto out_unlock; + + /* Now traverse again under the lock and change the tree */ + ret = -ENOMEM; + if (pgd_none(*pgd)) { + if (!new_pud) + goto out_unlock; + pgd_populate(kvm->mm, pgd, new_pud); + new_pud = NULL; + } + pud = pud_offset(pgd, gpa); + if (pud_none(*pud)) { + if (!new_pmd) + goto out_unlock; + pud_populate(kvm->mm, pud, new_pmd); + new_pmd = NULL; + } + pmd = pmd_offset(pud, gpa); + if (pmd_large(*pmd)) { + /* Someone else has instantiated a large page here; retry */ + ret = -EAGAIN; + goto out_unlock; + } + if (level == 1 && !pmd_none(*pmd)) { + /* + * There's a page table page here, but we wanted + * to install a large page. Tell the caller and let + * it try installing a normal page if it wants. + */ + ret = -EBUSY; + goto out_unlock; + } + if (level == 0) { + if (pmd_none(*pmd)) { + if (!new_ptep) + goto out_unlock; + pmd_populate(kvm->mm, pmd, new_ptep); + new_ptep = NULL; + } + ptep = pte_offset_kernel(pmd, gpa); + if (pte_present(*ptep)) { + /* PTE was previously valid, so invalidate it */ + kvmppc_radix_update_pte(kvm, ptep, _PAGE_PRESENT, + 0, gpa, 0); + kvmppc_radix_tlbie_page(kvm, gpa, 0); + } + kvmppc_radix_set_pte_at(kvm, gpa, ptep, pte); + } else { + kvmppc_radix_set_pte_at(kvm, gpa, pmdp_ptep(pmd), pte); + } + ret = 0; + + out_unlock: + spin_unlock(&kvm->mmu_lock); + if (new_pud) + pud_free(kvm->mm, new_pud); + if (new_pmd) + pmd_free(kvm->mm, new_pmd); + if (new_ptep) + kvmppc_pte_free(new_ptep); + return ret; +} + +int kvmppc_book3s_radix_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu, + unsigned long ea, unsigned long dsisr) +{ + struct kvm *kvm = vcpu->kvm; + unsigned long mmu_seq, pte_size; + unsigned long gpa, gfn, hva, pfn; + struct kvm_memory_slot *memslot; + struct page *page = NULL, *pages[1]; + long ret, npages, ok; + unsigned int writing; + struct vm_area_struct *vma; + unsigned long flags; + pte_t pte, *ptep; + unsigned long pgflags; + unsigned int shift, level; + + /* Check for unusual errors */ + if (dsisr & DSISR_UNSUPP_MMU) { + pr_err("KVM: Got unsupported MMU fault\n"); + return -EFAULT; + } + if (dsisr & DSISR_BADACCESS) { + /* Reflect to the guest as DSI */ + pr_err("KVM: Got radix HV page fault with DSISR=%lx\n", dsisr); + kvmppc_core_queue_data_storage(vcpu, ea, dsisr); + return RESUME_GUEST; + } + + /* Translate the logical address and get the page */ + gpa = vcpu->arch.fault_gpa & ~0xfffUL; + gpa &= ~0xF000000000000000ul; + gfn = gpa >> PAGE_SHIFT; + if (!(dsisr & DSISR_PGDIRFAULT)) + gpa |= ea & 0xfff; + memslot = gfn_to_memslot(kvm, gfn); + + /* No memslot means it's an emulated MMIO region */ + if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID)) { + if (dsisr & (DSISR_PGDIRFAULT | DSISR_BADACCESS | + DSISR_SET_RC)) { + /* + * Bad address in guest page table tree, or other + * unusual error - reflect it to the guest as DSI. 
+ */ + kvmppc_core_queue_data_storage(vcpu, ea, dsisr); + return RESUME_GUEST; + } + return kvmppc_hv_emulate_mmio(run, vcpu, gpa, ea, + dsisr & DSISR_ISSTORE); + } + + /* used to check for invalidations in progress */ + mmu_seq = kvm->mmu_notifier_seq; + smp_rmb(); + + writing = (dsisr & DSISR_ISSTORE) != 0; + hva = gfn_to_hva_memslot(memslot, gfn); + if (dsisr & DSISR_SET_RC) { + /* + * Need to set an R or C bit in the 2nd-level tables; + * if the relevant bits aren't already set in the linux + * page tables, fall through to do the gup_fast to + * set them in the linux page tables too. + */ + ok = 0; + pgflags = _PAGE_ACCESSED; + if (writing) + pgflags |= _PAGE_DIRTY; + local_irq_save(flags); + ptep = __find_linux_pte_or_hugepte(current->mm->pgd, hva, + NULL, NULL); + if (ptep) { + pte = READ_ONCE(*ptep); + if (pte_present(pte) && + (pte_val(pte) & pgflags) == pgflags) + ok = 1; + } + local_irq_restore(flags); + if (ok) { + spin_lock(&kvm->mmu_lock); + if (mmu_notifier_retry(vcpu->kvm, mmu_seq)) { + spin_unlock(&kvm->mmu_lock); + return RESUME_GUEST; + } + ptep = __find_linux_pte_or_hugepte(kvm->arch.pgtable, + gpa, NULL, &shift); + if (ptep && pte_present(*ptep)) { + kvmppc_radix_update_pte(kvm, ptep, 0, pgflags, + gpa, shift); + spin_unlock(&kvm->mmu_lock); + return RESUME_GUEST; + } + spin_unlock(&kvm->mmu_lock); + } + } + + ret = -EFAULT; + pfn = 0; + pte_size = PAGE_SIZE; + pgflags = _PAGE_READ | _PAGE_EXEC; + level = 0; + npages = get_user_pages_fast(hva, 1, writing, pages); + if (npages < 1) { + /* Check if it's an I/O mapping */ + down_read(¤t->mm->mmap_sem); + vma = find_vma(current->mm, hva); + if (vma && vma->vm_start <= hva && hva < vma->vm_end && + (vma->vm_flags & VM_PFNMAP)) { + pfn = vma->vm_pgoff + + ((hva - vma->vm_start) >> PAGE_SHIFT); + pgflags = pgprot_val(vma->vm_page_prot); + } + up_read(¤t->mm->mmap_sem); + if (!pfn) + return -EFAULT; + } else { + page = pages[0]; + pfn = page_to_pfn(page); + if (PageHuge(page)) { + page = compound_head(page); + pte_size <<= compound_order(page); + /* See if we can insert a 2MB large-page PTE here */ + if (pte_size >= PMD_SIZE && + (gpa & PMD_MASK & PAGE_MASK) == + (hva & PMD_MASK & PAGE_MASK)) { + level = 1; + pfn &= ~((PMD_SIZE >> PAGE_SHIFT) - 1); + } + } + /* See if we can provide write access */ + if (writing) { + /* + * We assume gup_fast has set dirty on the host PTE. + */ + pgflags |= _PAGE_WRITE; + } else { + local_irq_save(flags); + ptep = __find_linux_pte_or_hugepte(current->mm->pgd, + hva, NULL, NULL); + if (ptep && pte_write(*ptep) && pte_dirty(*ptep)) + pgflags |= _PAGE_WRITE; + local_irq_restore(flags); + } + } + + /* + * Compute the PTE value that we need to insert. + */ + pgflags |= _PAGE_PRESENT | _PAGE_PTE | _PAGE_ACCESSED; + if (pgflags & _PAGE_WRITE) + pgflags |= _PAGE_DIRTY; + pte = pfn_pte(pfn, __pgprot(pgflags)); + + /* Allocate space in the tree and write the PTE */ + ret = kvmppc_create_pte(kvm, pte, gpa, level, mmu_seq); + if (ret == -EBUSY) { + /* + * There's already a PMD where wanted to install a large page; + * for now, fall back to installing a small page. 
+ */ + level = 0; + pfn |= gfn & ((PMD_SIZE >> PAGE_SHIFT) - 1); + pte = pfn_pte(pfn, __pgprot(pgflags)); + ret = kvmppc_create_pte(kvm, pte, gpa, level, mmu_seq); + } + if (ret == 0 || ret == -EAGAIN) + ret = RESUME_GUEST; + + if (page) { + /* + * We drop pages[0] here, not page because page might + * have been set to the head page of a compound, but + * we have to drop the reference on the correct tail + * page to match the get inside gup() + */ + put_page(pages[0]); + } + return ret; +} + +void kvmppc_free_radix(struct kvm *kvm) +{ + unsigned long ig, iu, im; + pte_t *pte; + pmd_t *pmd; + pud_t *pud; + pgd_t *pgd; + + if (!kvm->arch.pgtable) + return; + pgd = kvm->arch.pgtable; + for (ig = 0; ig < PTRS_PER_PGD; ++ig, ++pgd) { + if (!pgd_present(*pgd)) + continue; + pud = pud_offset(pgd, 0); + for (iu = 0; iu < PTRS_PER_PUD; ++iu, ++pud) { + if (!pud_present(*pud)) + continue; + pmd = pmd_offset(pud, 0); + for (im = 0; im < PTRS_PER_PMD; ++im, ++pmd) { + if (pmd_huge(*pmd)) { + pmd_clear(pmd); + continue; + } + if (!pmd_present(*pmd)) + continue; + pte = pte_offset_map(pmd, 0); + memset(pte, 0, sizeof(long) << PTE_INDEX_SIZE); + kvmppc_pte_free(pte); + pmd_clear(pmd); + } + pmd_free(kvm->mm, pmd_offset(pud, 0)); + pud_clear(pud); + } + pud_free(kvm->mm, pud_offset(pgd, 0)); + pgd_clear(pgd); + } + pgd_free(kvm->mm, kvm->arch.pgtable); +} + +static void pte_ctor(void *addr) +{ + memset(addr, 0, PTE_TABLE_SIZE); +} + +int kvmppc_radix_init(void) +{ + unsigned long size = sizeof(void *) << PTE_INDEX_SIZE; + + kvm_pte_cache = kmem_cache_create("kvm-pte", size, size, 0, pte_ctor); + if (!kvm_pte_cache) + return -ENOMEM; + return 0; +} + +void kvmppc_radix_exit(void) +{ + kmem_cache_destroy(kvm_pte_cache); +} diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index bb2854314ba4..d50251f9a3c9 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -3356,7 +3356,10 @@ static void kvmppc_core_destroy_vm_hv(struct kvm *kvm) kvmppc_free_vcores(kvm); - kvmppc_free_hpt(kvm); + if (kvm_is_radix(kvm)) + kvmppc_free_radix(kvm); + else + kvmppc_free_hpt(kvm); kvmppc_free_pimap(kvm); } @@ -3768,6 +3771,11 @@ static int kvm_init_subcore_bitmap(void) return 0; } +static int kvmppc_radix_possible(void) +{ + return cpu_has_feature(CPU_FTR_ARCH_300) && radix_enabled(); +} + static int kvmppc_book3s_init_hv(void) { int r; @@ -3807,12 +3815,19 @@ static int kvmppc_book3s_init_hv(void) init_vcore_lists(); r = kvmppc_mmu_hv_init(); + if (r) + return r; + + if (kvmppc_radix_possible()) + r = kvmppc_radix_init(); return r; } static void kvmppc_book3s_exit_hv(void) { kvmppc_free_host_rm_ops(); + if (kvmppc_radix_possible()) + kvmppc_radix_exit(); kvmppc_hv_ops = NULL; } -- cgit v1.2.3 From 01756099e0a5f431bbada9693d566269acfb51f9 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Mon, 30 Jan 2017 21:21:47 +1100 Subject: KVM: PPC: Book3S HV: MMU notifier callbacks for radix guests This adapts our implementations of the MMU notifier callbacks (unmap_hva, unmap_hva_range, age_hva, test_age_hva, set_spte_hva) to call radix functions when the guest is using radix. These implementations are much simpler than for HPT guests because we have only one PTE to deal with, so we don't need to traverse rmap chains. 
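A minimal sketch of the handler-selection pattern used here (the types are simplified stand-ins, not the kernel's; only the shape of picking one per-gfn handler per notifier call is meant to carry over):

        #include <stdio.h>

        struct kvm { int radix; };

        /* one per-gfn handler type, chosen once per notifier call */
        typedef int (*hva_handler_fn)(struct kvm *kvm, unsigned long gfn);

        static int unmap_radix(struct kvm *kvm, unsigned long gfn)
        {
                printf("radix: clear the single PTE for gfn %lu\n", gfn);
                return 0;
        }

        static int unmap_rmapp(struct kvm *kvm, unsigned long gfn)
        {
                printf("HPT: walk the rmap chain for gfn %lu\n", gfn);
                return 0;
        }

        static int handle_gfn_range(struct kvm *kvm, unsigned long start,
                                    unsigned long end, hva_handler_fn handler)
        {
                int ret = 0;
                unsigned long gfn;

                for (gfn = start; gfn < end; ++gfn)
                        ret |= handler(kvm, gfn);
                return ret;
        }

        int main(void)
        {
                struct kvm kvm = { .radix = 1 };
                hva_handler_fn handler;

                handler = kvm.radix ? unmap_radix : unmap_rmapp;
                return handle_gfn_range(&kvm, 16, 20, handler);
        }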
Signed-off-by: Paul Mackerras Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/kvm_book3s.h | 6 ++++ arch/powerpc/kvm/book3s_64_mmu_hv.c | 64 +++++++++++++++++++++++----------- arch/powerpc/kvm/book3s_64_mmu_radix.c | 54 ++++++++++++++++++++++++++++ 3 files changed, 103 insertions(+), 21 deletions(-) diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h index ff5cd5c5ce8d..952cc4b954a1 100644 --- a/arch/powerpc/include/asm/kvm_book3s.h +++ b/arch/powerpc/include/asm/kvm_book3s.h @@ -192,6 +192,12 @@ extern int kvmppc_mmu_radix_xlate(struct kvm_vcpu *vcpu, gva_t eaddr, extern void kvmppc_free_radix(struct kvm *kvm); extern int kvmppc_radix_init(void); extern void kvmppc_radix_exit(void); +extern int kvm_unmap_radix(struct kvm *kvm, struct kvm_memory_slot *memslot, + unsigned long gfn); +extern int kvm_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot, + unsigned long gfn); +extern int kvm_test_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot, + unsigned long gfn); /* XXX remove this export when load_last_inst() is generic */ extern int kvmppc_ld(struct kvm_vcpu *vcpu, ulong *eaddr, int size, void *ptr, bool data); diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c index 57690c22716d..088c82bb7ba4 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_hv.c +++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c @@ -701,12 +701,13 @@ static void kvmppc_rmap_reset(struct kvm *kvm) srcu_read_unlock(&kvm->srcu, srcu_idx); } +typedef int (*hva_handler_fn)(struct kvm *kvm, struct kvm_memory_slot *memslot, + unsigned long gfn); + static int kvm_handle_hva_range(struct kvm *kvm, unsigned long start, unsigned long end, - int (*handler)(struct kvm *kvm, - unsigned long *rmapp, - unsigned long gfn)) + hva_handler_fn handler) { int ret; int retval = 0; @@ -731,9 +732,7 @@ static int kvm_handle_hva_range(struct kvm *kvm, gfn_end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, memslot); for (; gfn < gfn_end; ++gfn) { - gfn_t gfn_offset = gfn - memslot->base_gfn; - - ret = handler(kvm, &memslot->arch.rmap[gfn_offset], gfn); + ret = handler(kvm, memslot, gfn); retval |= ret; } } @@ -742,20 +741,21 @@ static int kvm_handle_hva_range(struct kvm *kvm, } static int kvm_handle_hva(struct kvm *kvm, unsigned long hva, - int (*handler)(struct kvm *kvm, unsigned long *rmapp, - unsigned long gfn)) + hva_handler_fn handler) { return kvm_handle_hva_range(kvm, hva, hva + 1, handler); } -static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp, +static int kvm_unmap_rmapp(struct kvm *kvm, struct kvm_memory_slot *memslot, unsigned long gfn) { struct revmap_entry *rev = kvm->arch.revmap; unsigned long h, i, j; __be64 *hptep; unsigned long ptel, psize, rcbits; + unsigned long *rmapp; + rmapp = &memslot->arch.rmap[gfn - memslot->base_gfn]; for (;;) { lock_rmap(rmapp); if (!(*rmapp & KVMPPC_RMAP_PRESENT)) { @@ -816,26 +816,36 @@ static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp, int kvm_unmap_hva_hv(struct kvm *kvm, unsigned long hva) { - kvm_handle_hva(kvm, hva, kvm_unmap_rmapp); + hva_handler_fn handler; + + handler = kvm_is_radix(kvm) ? kvm_unmap_radix : kvm_unmap_rmapp; + kvm_handle_hva(kvm, hva, handler); return 0; } int kvm_unmap_hva_range_hv(struct kvm *kvm, unsigned long start, unsigned long end) { - kvm_handle_hva_range(kvm, start, end, kvm_unmap_rmapp); + hva_handler_fn handler; + + handler = kvm_is_radix(kvm) ? 
kvm_unmap_radix : kvm_unmap_rmapp; + kvm_handle_hva_range(kvm, start, end, handler); return 0; } void kvmppc_core_flush_memslot_hv(struct kvm *kvm, struct kvm_memory_slot *memslot) { - unsigned long *rmapp; unsigned long gfn; unsigned long n; + unsigned long *rmapp; - rmapp = memslot->arch.rmap; gfn = memslot->base_gfn; - for (n = memslot->npages; n; --n) { + rmapp = memslot->arch.rmap; + for (n = memslot->npages; n; --n, ++gfn) { + if (kvm_is_radix(kvm)) { + kvm_unmap_radix(kvm, memslot, gfn); + continue; + } /* * Testing the present bit without locking is OK because * the memslot has been marked invalid already, and hence @@ -843,20 +853,21 @@ void kvmppc_core_flush_memslot_hv(struct kvm *kvm, * thus the present bit can't go from 0 to 1. */ if (*rmapp & KVMPPC_RMAP_PRESENT) - kvm_unmap_rmapp(kvm, rmapp, gfn); + kvm_unmap_rmapp(kvm, memslot, gfn); ++rmapp; - ++gfn; } } -static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp, +static int kvm_age_rmapp(struct kvm *kvm, struct kvm_memory_slot *memslot, unsigned long gfn) { struct revmap_entry *rev = kvm->arch.revmap; unsigned long head, i, j; __be64 *hptep; int ret = 0; + unsigned long *rmapp; + rmapp = &memslot->arch.rmap[gfn - memslot->base_gfn]; retry: lock_rmap(rmapp); if (*rmapp & KVMPPC_RMAP_REFERENCED) { @@ -904,17 +915,22 @@ static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp, int kvm_age_hva_hv(struct kvm *kvm, unsigned long start, unsigned long end) { - return kvm_handle_hva_range(kvm, start, end, kvm_age_rmapp); + hva_handler_fn handler; + + handler = kvm_is_radix(kvm) ? kvm_age_radix : kvm_age_rmapp; + return kvm_handle_hva_range(kvm, start, end, handler); } -static int kvm_test_age_rmapp(struct kvm *kvm, unsigned long *rmapp, +static int kvm_test_age_rmapp(struct kvm *kvm, struct kvm_memory_slot *memslot, unsigned long gfn) { struct revmap_entry *rev = kvm->arch.revmap; unsigned long head, i, j; unsigned long *hp; int ret = 1; + unsigned long *rmapp; + rmapp = &memslot->arch.rmap[gfn - memslot->base_gfn]; if (*rmapp & KVMPPC_RMAP_REFERENCED) return 1; @@ -940,12 +956,18 @@ static int kvm_test_age_rmapp(struct kvm *kvm, unsigned long *rmapp, int kvm_test_age_hva_hv(struct kvm *kvm, unsigned long hva) { - return kvm_handle_hva(kvm, hva, kvm_test_age_rmapp); + hva_handler_fn handler; + + handler = kvm_is_radix(kvm) ? kvm_test_age_radix : kvm_test_age_rmapp; + return kvm_handle_hva(kvm, hva, handler); } void kvm_set_spte_hva_hv(struct kvm *kvm, unsigned long hva, pte_t pte) { - kvm_handle_hva(kvm, hva, kvm_unmap_rmapp); + hva_handler_fn handler; + + handler = kvm_is_radix(kvm) ? 
kvm_unmap_radix : kvm_unmap_rmapp; + kvm_handle_hva(kvm, hva, handler); } static int vcpus_running(struct kvm *kvm) diff --git a/arch/powerpc/kvm/book3s_64_mmu_radix.c b/arch/powerpc/kvm/book3s_64_mmu_radix.c index 865ea9bca364..69cabadc121a 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_radix.c +++ b/arch/powerpc/kvm/book3s_64_mmu_radix.c @@ -463,6 +463,60 @@ int kvmppc_book3s_radix_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu, return ret; } +/* Called with kvm->lock held */ +int kvm_unmap_radix(struct kvm *kvm, struct kvm_memory_slot *memslot, + unsigned long gfn) +{ + pte_t *ptep; + unsigned long gpa = gfn << PAGE_SHIFT; + unsigned int shift; + + ptep = __find_linux_pte_or_hugepte(kvm->arch.pgtable, gpa, + NULL, &shift); + if (ptep && pte_present(*ptep)) { + kvmppc_radix_update_pte(kvm, ptep, _PAGE_PRESENT, 0, + gpa, shift); + kvmppc_radix_tlbie_page(kvm, gpa, shift); + } + return 0; +} + +/* Called with kvm->lock held */ +int kvm_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot, + unsigned long gfn) +{ + pte_t *ptep; + unsigned long gpa = gfn << PAGE_SHIFT; + unsigned int shift; + int ref = 0; + + ptep = __find_linux_pte_or_hugepte(kvm->arch.pgtable, gpa, + NULL, &shift); + if (ptep && pte_present(*ptep) && pte_young(*ptep)) { + kvmppc_radix_update_pte(kvm, ptep, _PAGE_ACCESSED, 0, + gpa, shift); + /* XXX need to flush tlb here? */ + ref = 1; + } + return ref; +} + +/* Called with kvm->lock held */ +int kvm_test_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot, + unsigned long gfn) +{ + pte_t *ptep; + unsigned long gpa = gfn << PAGE_SHIFT; + unsigned int shift; + int ref = 0; + + ptep = __find_linux_pte_or_hugepte(kvm->arch.pgtable, gpa, + NULL, &shift); + if (ptep && pte_present(*ptep) && pte_young(*ptep)) + ref = 1; + return ref; +} + void kvmppc_free_radix(struct kvm *kvm) { unsigned long ig, iu, im; -- cgit v1.2.3 From 8f7b79b8379a85fb8dd0c3f42d9f452ec5552161 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Mon, 30 Jan 2017 21:21:48 +1100 Subject: KVM: PPC: Book3S HV: Implement dirty page logging for radix guests This adds code to keep track of dirty pages when requested (that is, when memslot->dirty_bitmap is non-NULL) for radix guests. We use the dirty bits in the PTEs in the second-level (partition-scoped) page tables, together with a bitmap of pages that were dirty when their PTE was invalidated (e.g., when the page was paged out). This bitmap is stored in the first half of the memslot->dirty_bitmap area, and kvm_vm_ioctl_get_dirty_log_hv() now uses the second half for the bitmap that gets returned to userspace. 
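A stand-alone sketch of that two-half layout (the page count, word sizes and the plain assignment standing in for xchg() are simplifications for the example):

        #include <stdio.h>
        #include <string.h>

        #define NPAGES          128ul
        #define BITS_PER_LONG   (8 * sizeof(unsigned long))

        int main(void)
        {
                /* the memslot's dirty_bitmap area holds two halves */
                unsigned long bitmap[2 * NPAGES / BITS_PER_LONG] = { 0 };
                size_t n = sizeof(bitmap) / 2;          /* bytes per half */
                unsigned long *accum = bitmap;          /* radix accumulates here */
                unsigned long *buf = bitmap + n / sizeof(unsigned long);
                size_t i;

                accum[0] = 0x5;  /* pages 0 and 2 dirtied when their PTEs went away */

                /* GET_DIRTY_LOG: harvest accumulated bits into the scratch half */
                memset(buf, 0, n);
                for (i = 0; i < n / sizeof(unsigned long); ++i) {
                        buf[i] |= accum[i];     /* the kernel uses xchg(&accum[i], 0) */
                        accum[i] = 0;
                }
                /* buf would also collect dirty bits from the 2nd-level PTEs,
                 * then be copied out to userspace */
                printf("word 0 returned to userspace: %#lx\n", buf[0]);
                return 0;
        }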
Signed-off-by: Paul Mackerras Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/kvm_book3s.h | 7 ++- arch/powerpc/kvm/book3s_64_mmu_hv.c | 28 ++++----- arch/powerpc/kvm/book3s_64_mmu_radix.c | 111 ++++++++++++++++++++++++++++++--- arch/powerpc/kvm/book3s_hv.c | 31 +++++++-- 4 files changed, 144 insertions(+), 33 deletions(-) diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h index 952cc4b954a1..57dc407cec4a 100644 --- a/arch/powerpc/include/asm/kvm_book3s.h +++ b/arch/powerpc/include/asm/kvm_book3s.h @@ -198,6 +198,8 @@ extern int kvm_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot, unsigned long gfn); extern int kvm_test_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot, unsigned long gfn); +extern long kvmppc_hv_get_dirty_log_radix(struct kvm *kvm, + struct kvm_memory_slot *memslot, unsigned long *map); /* XXX remove this export when load_last_inst() is generic */ extern int kvmppc_ld(struct kvm_vcpu *vcpu, ulong *eaddr, int size, void *ptr, bool data); @@ -228,8 +230,11 @@ extern long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags, extern long kvmppc_do_h_remove(struct kvm *kvm, unsigned long flags, unsigned long pte_index, unsigned long avpn, unsigned long *hpret); -extern long kvmppc_hv_get_dirty_log(struct kvm *kvm, +extern long kvmppc_hv_get_dirty_log_hpt(struct kvm *kvm, struct kvm_memory_slot *memslot, unsigned long *map); +extern void kvmppc_harvest_vpa_dirty(struct kvmppc_vpa *vpa, + struct kvm_memory_slot *memslot, + unsigned long *map); extern void kvmppc_update_lpcr(struct kvm *kvm, unsigned long lpcr, unsigned long mask); extern void kvmppc_set_fscr(struct kvm_vcpu *vcpu, u64 fscr); diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c index 088c82bb7ba4..c9e587a2849d 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_hv.c +++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c @@ -1068,7 +1068,7 @@ static int kvm_test_clear_dirty_npages(struct kvm *kvm, unsigned long *rmapp) return npages_dirty; } -static void harvest_vpa_dirty(struct kvmppc_vpa *vpa, +void kvmppc_harvest_vpa_dirty(struct kvmppc_vpa *vpa, struct kvm_memory_slot *memslot, unsigned long *map) { @@ -1086,12 +1086,11 @@ static void harvest_vpa_dirty(struct kvmppc_vpa *vpa, __set_bit_le(gfn - memslot->base_gfn, map); } -long kvmppc_hv_get_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot, - unsigned long *map) +long kvmppc_hv_get_dirty_log_hpt(struct kvm *kvm, + struct kvm_memory_slot *memslot, unsigned long *map) { unsigned long i, j; unsigned long *rmapp; - struct kvm_vcpu *vcpu; preempt_disable(); rmapp = memslot->arch.rmap; @@ -1107,15 +1106,6 @@ long kvmppc_hv_get_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot, __set_bit_le(j, map); ++rmapp; } - - /* Harvest dirty bits from VPA and DTL updates */ - /* Note: we never modify the SLB shadow buffer areas */ - kvm_for_each_vcpu(i, vcpu, kvm) { - spin_lock(&vcpu->arch.vpa_update_lock); - harvest_vpa_dirty(&vcpu->arch.vpa, memslot, map); - harvest_vpa_dirty(&vcpu->arch.dtl, memslot, map); - spin_unlock(&vcpu->arch.vpa_update_lock); - } preempt_enable(); return 0; } @@ -1170,10 +1160,14 @@ void kvmppc_unpin_guest_page(struct kvm *kvm, void *va, unsigned long gpa, srcu_idx = srcu_read_lock(&kvm->srcu); memslot = gfn_to_memslot(kvm, gfn); if (memslot) { - rmap = &memslot->arch.rmap[gfn - memslot->base_gfn]; - lock_rmap(rmap); - *rmap |= KVMPPC_RMAP_CHANGED; - unlock_rmap(rmap); + if (!kvm_is_radix(kvm)) { + rmap = &memslot->arch.rmap[gfn - 
memslot->base_gfn]; + lock_rmap(rmap); + *rmap |= KVMPPC_RMAP_CHANGED; + unlock_rmap(rmap); + } else if (memslot->dirty_bitmap) { + mark_page_dirty(kvm, gfn); + } } srcu_read_unlock(&kvm->srcu, srcu_idx); } diff --git a/arch/powerpc/kvm/book3s_64_mmu_radix.c b/arch/powerpc/kvm/book3s_64_mmu_radix.c index 69cabadc121a..125cc7ce1525 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_radix.c +++ b/arch/powerpc/kvm/book3s_64_mmu_radix.c @@ -158,18 +158,21 @@ static void kvmppc_radix_tlbie_page(struct kvm *kvm, unsigned long addr, asm volatile("ptesync": : :"memory"); } -void kvmppc_radix_update_pte(struct kvm *kvm, pte_t *ptep, unsigned long clr, - unsigned long set, unsigned long addr, - unsigned int shift) +unsigned long kvmppc_radix_update_pte(struct kvm *kvm, pte_t *ptep, + unsigned long clr, unsigned long set, + unsigned long addr, unsigned int shift) { + unsigned long old = 0; + if (!(clr & _PAGE_PRESENT) && cpu_has_feature(CPU_FTR_POWER9_DD1) && pte_present(*ptep)) { /* have to invalidate it first */ - __radix_pte_update(ptep, _PAGE_PRESENT, 0); + old = __radix_pte_update(ptep, _PAGE_PRESENT, 0); kvmppc_radix_tlbie_page(kvm, addr, shift); set |= _PAGE_PRESENT; + old &= _PAGE_PRESENT; } - __radix_pte_update(ptep, clr, set); + return __radix_pte_update(ptep, clr, set) | old; } void kvmppc_radix_set_pte_at(struct kvm *kvm, unsigned long addr, @@ -197,6 +200,7 @@ static int kvmppc_create_pte(struct kvm *kvm, pte_t pte, unsigned long gpa, pud_t *pud, *new_pud = NULL; pmd_t *pmd, *new_pmd = NULL; pte_t *ptep, *new_ptep = NULL; + unsigned long old; int ret; /* Traverse the guest's 2nd-level tree, allocate new levels needed */ @@ -262,9 +266,11 @@ static int kvmppc_create_pte(struct kvm *kvm, pte_t pte, unsigned long gpa, ptep = pte_offset_kernel(pmd, gpa); if (pte_present(*ptep)) { /* PTE was previously valid, so invalidate it */ - kvmppc_radix_update_pte(kvm, ptep, _PAGE_PRESENT, - 0, gpa, 0); + old = kvmppc_radix_update_pte(kvm, ptep, _PAGE_PRESENT, + 0, gpa, 0); kvmppc_radix_tlbie_page(kvm, gpa, 0); + if (old & _PAGE_DIRTY) + mark_page_dirty(kvm, gpa >> PAGE_SHIFT); } kvmppc_radix_set_pte_at(kvm, gpa, ptep, pte); } else { @@ -463,6 +469,26 @@ int kvmppc_book3s_radix_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu, return ret; } +static void mark_pages_dirty(struct kvm *kvm, struct kvm_memory_slot *memslot, + unsigned long gfn, unsigned int order) +{ + unsigned long i, limit; + unsigned long *dp; + + if (!memslot->dirty_bitmap) + return; + limit = 1ul << order; + if (limit < BITS_PER_LONG) { + for (i = 0; i < limit; ++i) + mark_page_dirty(kvm, gfn + i); + return; + } + dp = memslot->dirty_bitmap + (gfn - memslot->base_gfn); + limit /= BITS_PER_LONG; + for (i = 0; i < limit; ++i) + *dp++ = ~0ul; +} + /* Called with kvm->lock held */ int kvm_unmap_radix(struct kvm *kvm, struct kvm_memory_slot *memslot, unsigned long gfn) @@ -470,13 +496,21 @@ int kvm_unmap_radix(struct kvm *kvm, struct kvm_memory_slot *memslot, pte_t *ptep; unsigned long gpa = gfn << PAGE_SHIFT; unsigned int shift; + unsigned long old; ptep = __find_linux_pte_or_hugepte(kvm->arch.pgtable, gpa, NULL, &shift); if (ptep && pte_present(*ptep)) { - kvmppc_radix_update_pte(kvm, ptep, _PAGE_PRESENT, 0, - gpa, shift); + old = kvmppc_radix_update_pte(kvm, ptep, _PAGE_PRESENT, 0, + gpa, shift); kvmppc_radix_tlbie_page(kvm, gpa, shift); + if (old & _PAGE_DIRTY) { + if (!shift) + mark_page_dirty(kvm, gfn); + else + mark_pages_dirty(kvm, memslot, + gfn, shift - PAGE_SHIFT); + } } return 0; } @@ -517,6 +551,65 @@ int 
kvm_test_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot, return ref; } +/* Returns the number of PAGE_SIZE pages that are dirty */ +static int kvm_radix_test_clear_dirty(struct kvm *kvm, + struct kvm_memory_slot *memslot, int pagenum) +{ + unsigned long gfn = memslot->base_gfn + pagenum; + unsigned long gpa = gfn << PAGE_SHIFT; + pte_t *ptep; + unsigned int shift; + int ret = 0; + + ptep = __find_linux_pte_or_hugepte(kvm->arch.pgtable, gpa, + NULL, &shift); + if (ptep && pte_present(*ptep) && pte_dirty(*ptep)) { + ret = 1; + if (shift) + ret = 1 << (shift - PAGE_SHIFT); + kvmppc_radix_update_pte(kvm, ptep, _PAGE_DIRTY, 0, + gpa, shift); + kvmppc_radix_tlbie_page(kvm, gpa, shift); + } + return ret; +} + +long kvmppc_hv_get_dirty_log_radix(struct kvm *kvm, + struct kvm_memory_slot *memslot, unsigned long *map) +{ + unsigned long i, j; + unsigned long n, *p; + int npages; + + /* + * Radix accumulates dirty bits in the first half of the + * memslot's dirty_bitmap area, for when pages are paged + * out or modified by the host directly. Pick up these + * bits and add them to the map. + */ + n = kvm_dirty_bitmap_bytes(memslot) / sizeof(long); + p = memslot->dirty_bitmap; + for (i = 0; i < n; ++i) + map[i] |= xchg(&p[i], 0); + + for (i = 0; i < memslot->npages; i = j) { + npages = kvm_radix_test_clear_dirty(kvm, memslot, i); + + /* + * Note that if npages > 0 then i must be a multiple of npages, + * since huge pages are only used to back the guest at guest + * real addresses that are a multiple of their size. + * Since we have at most one PTE covering any given guest + * real address, if npages > 1 we can skip to i + npages. + */ + j = i + 1; + if (npages) + for (j = i; npages; ++j, --npages) + __set_bit_le(j, map); + } + return 0; +} + void kvmppc_free_radix(struct kvm *kvm) { unsigned long ig, iu, im; diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index d50251f9a3c9..401e4cc8a91f 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -2961,8 +2961,10 @@ static int kvm_vm_ioctl_get_dirty_log_hv(struct kvm *kvm, { struct kvm_memslots *slots; struct kvm_memory_slot *memslot; - int r; + int i, r; unsigned long n; + unsigned long *buf; + struct kvm_vcpu *vcpu; mutex_lock(&kvm->slots_lock); @@ -2976,15 +2978,32 @@ static int kvm_vm_ioctl_get_dirty_log_hv(struct kvm *kvm, if (!memslot->dirty_bitmap) goto out; + /* + * Use second half of bitmap area because radix accumulates + * bits in the first half. 
+ */ n = kvm_dirty_bitmap_bytes(memslot); - memset(memslot->dirty_bitmap, 0, n); + buf = memslot->dirty_bitmap + n / sizeof(long); + memset(buf, 0, n); - r = kvmppc_hv_get_dirty_log(kvm, memslot, memslot->dirty_bitmap); + if (kvm_is_radix(kvm)) + r = kvmppc_hv_get_dirty_log_radix(kvm, memslot, buf); + else + r = kvmppc_hv_get_dirty_log_hpt(kvm, memslot, buf); if (r) goto out; + /* Harvest dirty bits from VPA and DTL updates */ + /* Note: we never modify the SLB shadow buffer areas */ + kvm_for_each_vcpu(i, vcpu, kvm) { + spin_lock(&vcpu->arch.vpa_update_lock); + kvmppc_harvest_vpa_dirty(&vcpu->arch.vpa, memslot, buf); + kvmppc_harvest_vpa_dirty(&vcpu->arch.dtl, memslot, buf); + spin_unlock(&vcpu->arch.vpa_update_lock); + } + r = -EFAULT; - if (copy_to_user(log->dirty_bitmap, memslot->dirty_bitmap, n)) + if (copy_to_user(log->dirty_bitmap, buf, n)) goto out; r = 0; @@ -3037,7 +3056,7 @@ static void kvmppc_core_commit_memory_region_hv(struct kvm *kvm, if (npages) atomic64_inc(&kvm->arch.mmio_update); - if (npages && old->npages) { + if (npages && old->npages && !kvm_is_radix(kvm)) { /* * If modifying a memslot, reset all the rmap dirty bits. * If this is a new memslot, we don't need to do anything @@ -3046,7 +3065,7 @@ static void kvmppc_core_commit_memory_region_hv(struct kvm *kvm, */ slots = kvm_memslots(kvm); memslot = id_to_memslot(slots, mem->slot); - kvmppc_hv_get_dirty_log(kvm, memslot, NULL); + kvmppc_hv_get_dirty_log_hpt(kvm, memslot, NULL); } } -- cgit v1.2.3 From 65dae5403a162fe6ef7cd8b2835de9d23c303891 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Mon, 30 Jan 2017 21:21:49 +1100 Subject: KVM: PPC: Book3S HV: Make HPT-specific hypercalls return error in radix mode If the guest is in radix mode, then it doesn't have a hashed page table (HPT), so all of the hypercalls that manipulate the HPT can't work and should return an error. This adds checks to make them return H_FUNCTION ("function not supported"). 
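The check is the same in every handler; a minimal sketch (the kvm struct and the hypercall body are stand-ins invented for the example, only the early-return guard reflects the patch):

        #include <stdio.h>

        #define H_SUCCESS       0l
        #define H_FUNCTION      (-2l)   /* "function not supported" */

        struct kvm { int radix; };

        static int kvm_is_radix(struct kvm *kvm)
        {
                return kvm->radix;
        }

        static long h_enter(struct kvm *kvm)
        {
                if (kvm_is_radix(kvm))          /* no HPT to manipulate */
                        return H_FUNCTION;
                /* ... normal HPT insertion would follow ... */
                return H_SUCCESS;
        }

        int main(void)
        {
                struct kvm radix_guest = { .radix = 1 };
                struct kvm hpt_guest = { .radix = 0 };

                printf("radix guest H_ENTER: %ld\n", h_enter(&radix_guest));
                printf("HPT guest H_ENTER:   %ld\n", h_enter(&hpt_guest));
                return 0;
        }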
Signed-off-by: Paul Mackerras Signed-off-by: Michael Ellerman --- arch/powerpc/kvm/book3s_hv_rm_mmu.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c index 9ef3c4be952f..6c1ac3d21b91 100644 --- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c +++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c @@ -182,6 +182,8 @@ long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags, unsigned long mmu_seq; unsigned long rcbits, irq_flags = 0; + if (kvm_is_radix(kvm)) + return H_FUNCTION; psize = hpte_page_size(pteh, ptel); if (!psize) return H_PARAMETER; @@ -458,6 +460,8 @@ long kvmppc_do_h_remove(struct kvm *kvm, unsigned long flags, struct revmap_entry *rev; u64 pte, orig_pte, pte_r; + if (kvm_is_radix(kvm)) + return H_FUNCTION; if (pte_index >= kvm->arch.hpt_npte) return H_PARAMETER; hpte = (__be64 *)(kvm->arch.hpt_virt + (pte_index << 4)); @@ -529,6 +533,8 @@ long kvmppc_h_bulk_remove(struct kvm_vcpu *vcpu) struct revmap_entry *rev, *revs[4]; u64 hp0, hp1; + if (kvm_is_radix(kvm)) + return H_FUNCTION; global = global_invalidates(kvm, 0); for (i = 0; i < 4 && ret == H_SUCCESS; ) { n = 0; @@ -642,6 +648,8 @@ long kvmppc_h_protect(struct kvm_vcpu *vcpu, unsigned long flags, unsigned long v, r, rb, mask, bits; u64 pte_v, pte_r; + if (kvm_is_radix(kvm)) + return H_FUNCTION; if (pte_index >= kvm->arch.hpt_npte) return H_PARAMETER; @@ -711,6 +719,8 @@ long kvmppc_h_read(struct kvm_vcpu *vcpu, unsigned long flags, int i, n = 1; struct revmap_entry *rev = NULL; + if (kvm_is_radix(kvm)) + return H_FUNCTION; if (pte_index >= kvm->arch.hpt_npte) return H_PARAMETER; if (flags & H_READ_4) { @@ -750,6 +760,8 @@ long kvmppc_h_clear_ref(struct kvm_vcpu *vcpu, unsigned long flags, unsigned long *rmap; long ret = H_NOT_FOUND; + if (kvm_is_radix(kvm)) + return H_FUNCTION; if (pte_index >= kvm->arch.hpt_npte) return H_PARAMETER; @@ -796,6 +808,8 @@ long kvmppc_h_clear_mod(struct kvm_vcpu *vcpu, unsigned long flags, unsigned long *rmap; long ret = H_NOT_FOUND; + if (kvm_is_radix(kvm)) + return H_FUNCTION; if (pte_index >= kvm->arch.hpt_npte) return H_PARAMETER; -- cgit v1.2.3 From a29ebeaf5575d03eef178bb87c425a1e46cae1ca Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Mon, 30 Jan 2017 21:21:50 +1100 Subject: KVM: PPC: Book3S HV: Invalidate TLB on radix guest vcpu movement With radix, the guest can do TLB invalidations itself using the tlbie (global) and tlbiel (local) TLB invalidation instructions. Linux guests use local TLB invalidations for translations that have only ever been accessed on one vcpu. However, that doesn't mean that the translations have only been accessed on one physical cpu (pcpu) since vcpus can move around from one pcpu to another. Thus a tlbiel might leave behind stale TLB entries on a pcpu where the vcpu previously ran, and if that task then moves back to that previous pcpu, it could see those stale TLB entries and thus access memory incorrectly. The usual symptom of this is random segfaults in userspace programs in the guest. To cope with this, we detect when a vcpu is about to start executing on a thread in a core that is a different core from the last time it executed. If that is the case, then we mark the core as needing a TLB flush and then send an interrupt to any thread in the core that is currently running a vcpu from the same guest. This will get those vcpus out of the guest, and the first one to re-enter the guest will do the TLB flush. 
The reason for interrupting the vcpus executing on the old core is to cope with the following scenario: CPU 0 CPU 1 CPU 4 (core 0) (core 0) (core 1) VCPU 0 runs task X VCPU 1 runs core 0 TLB gets entries from task X VCPU 0 moves to CPU 4 VCPU 0 runs task X Unmap pages of task X tlbiel (still VCPU 1) task X moves to VCPU 1 task X runs task X sees stale TLB entries That is, as soon as the VCPU starts executing on the new core, it could unmap and tlbiel some page table entries, and then the task could migrate to one of the VCPUs running on the old core and potentially see stale TLB entries. Since the TLB is shared between all the threads in a core, we only use the bit of kvm->arch.need_tlb_flush corresponding to the first thread in the core. To ensure that we don't have a window where we can miss a flush, this moves the clearing of the bit from before the actual flush to after it. This way, two threads might both do the flush, but we prevent the situation where one thread can enter the guest before the flush is finished. Signed-off-by: Paul Mackerras Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/kvm_host.h | 2 ++ arch/powerpc/kvm/book3s_hv.c | 45 +++++++++++++++++++++++++++++++++ arch/powerpc/kvm/book3s_hv_rm_mmu.c | 11 ++++++-- arch/powerpc/kvm/book3s_hv_rmhandlers.S | 38 +++++++++++++++++++--------- 4 files changed, 82 insertions(+), 14 deletions(-) diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index da1421a4d6f2..b2dbeac3f450 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -263,6 +263,7 @@ struct kvm_arch { unsigned long hpt_mask; atomic_t hpte_mod_interest; cpumask_t need_tlb_flush; + cpumask_t cpu_in_guest; int hpt_cma_alloc; u8 radix; pgd_t *pgtable; @@ -661,6 +662,7 @@ struct kvm_vcpu_arch { int state; int ptid; int thread_cpu; + int prev_cpu; bool timer_running; wait_queue_head_t cpu_run; diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 401e4cc8a91f..50c230e83f9b 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -1821,6 +1821,7 @@ static struct kvm_vcpu *kvmppc_core_vcpu_create_hv(struct kvm *kvm, vcpu->arch.vcore = vcore; vcpu->arch.ptid = vcpu->vcpu_id - vcore->first_vcpuid; vcpu->arch.thread_cpu = -1; + vcpu->arch.prev_cpu = -1; vcpu->arch.cpu_type = KVM_CPU_3S_64; kvmppc_sanity_check(vcpu); @@ -1950,11 +1951,33 @@ static void kvmppc_release_hwthread(int cpu) tpaca->kvm_hstate.kvm_split_mode = NULL; } +static void do_nothing(void *x) +{ +} + +static void radix_flush_cpu(struct kvm *kvm, int cpu, struct kvm_vcpu *vcpu) +{ + int i; + + cpu = cpu_first_thread_sibling(cpu); + cpumask_set_cpu(cpu, &kvm->arch.need_tlb_flush); + /* + * Make sure setting of bit in need_tlb_flush precedes + * testing of cpu_in_guest bits. The matching barrier on + * the other side is the first smp_mb() in kvmppc_run_core(). 
+ */ + smp_mb(); + for (i = 0; i < threads_per_core; ++i) + if (cpumask_test_cpu(cpu + i, &kvm->arch.cpu_in_guest)) + smp_call_function_single(cpu + i, do_nothing, NULL, 1); +} + static void kvmppc_start_thread(struct kvm_vcpu *vcpu, struct kvmppc_vcore *vc) { int cpu; struct paca_struct *tpaca; struct kvmppc_vcore *mvc = vc->master_vcore; + struct kvm *kvm = vc->kvm; cpu = vc->pcpu; if (vcpu) { @@ -1965,6 +1988,27 @@ static void kvmppc_start_thread(struct kvm_vcpu *vcpu, struct kvmppc_vcore *vc) cpu += vcpu->arch.ptid; vcpu->cpu = mvc->pcpu; vcpu->arch.thread_cpu = cpu; + + /* + * With radix, the guest can do TLB invalidations itself, + * and it could choose to use the local form (tlbiel) if + * it is invalidating a translation that has only ever been + * used on one vcpu. However, that doesn't mean it has + * only ever been used on one physical cpu, since vcpus + * can move around between pcpus. To cope with this, when + * a vcpu moves from one pcpu to another, we need to tell + * any vcpus running on the same core as this vcpu previously + * ran to flush the TLB. The TLB is shared between threads, + * so we use a single bit in .need_tlb_flush for all 4 threads. + */ + if (kvm_is_radix(kvm) && vcpu->arch.prev_cpu != cpu) { + if (vcpu->arch.prev_cpu >= 0 && + cpu_first_thread_sibling(vcpu->arch.prev_cpu) != + cpu_first_thread_sibling(cpu)) + radix_flush_cpu(kvm, vcpu->arch.prev_cpu, vcpu); + vcpu->arch.prev_cpu = cpu; + } + cpumask_set_cpu(cpu, &kvm->arch.cpu_in_guest); } tpaca = &paca[cpu]; tpaca->kvm_hstate.kvm_vcpu = vcpu; @@ -2552,6 +2596,7 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc) kvmppc_release_hwthread(pcpu + i); if (sip && sip->napped[i]) kvmppc_ipi_thread(pcpu + i); + cpumask_clear_cpu(pcpu + i, &vc->kvm->arch.cpu_in_guest); } kvmppc_set_host_core(pcpu); diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c index 6c1ac3d21b91..b095afcd4309 100644 --- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c +++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c @@ -43,6 +43,7 @@ static void *real_vmalloc_addr(void *x) static int global_invalidates(struct kvm *kvm, unsigned long flags) { int global; + int cpu; /* * If there is only one vcore, and it's currently running, @@ -60,8 +61,14 @@ static int global_invalidates(struct kvm *kvm, unsigned long flags) /* any other core might now have stale TLB entries... */ smp_wmb(); cpumask_setall(&kvm->arch.need_tlb_flush); - cpumask_clear_cpu(local_paca->kvm_hstate.kvm_vcore->pcpu, - &kvm->arch.need_tlb_flush); + cpu = local_paca->kvm_hstate.kvm_vcore->pcpu; + /* + * On POWER9, threads are independent but the TLB is shared, + * so use the bit for the first thread to represent the core. + */ + if (cpu_has_feature(CPU_FTR_ARCH_300)) + cpu = cpu_first_thread_sibling(cpu); + cpumask_clear_cpu(cpu, &kvm->arch.need_tlb_flush); } return global; diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S index 7fc7a9221509..dcc67a87d688 100644 --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S @@ -598,30 +598,44 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300) /* See if we need to flush the TLB */ lhz r6,PACAPACAINDEX(r13) /* test_bit(cpu, need_tlb_flush) */ +BEGIN_FTR_SECTION + /* + * On POWER9, individual threads can come in here, but the + * TLB is shared between the 4 threads in a core, hence + * invalidating on one thread invalidates for all. + * Thus we make all 4 threads use the same bit here. 
+ */ + clrrdi r6,r6,2 +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300) clrldi r7,r6,64-6 /* extract bit number (6 bits) */ srdi r6,r6,6 /* doubleword number */ sldi r6,r6,3 /* address offset */ add r6,r6,r9 addi r6,r6,KVM_NEED_FLUSH /* dword in kvm->arch.need_tlb_flush */ - li r0,1 - sld r0,r0,r7 + li r8,1 + sld r8,r8,r7 ld r7,0(r6) - and. r7,r7,r0 + and. r7,r7,r8 beq 22f -23: ldarx r7,0,r6 /* if set, clear the bit */ - andc r7,r7,r0 - stdcx. r7,0,r6 - bne 23b /* Flush the TLB of any entries for this LPID */ - lwz r6,KVM_TLB_SETS(r9) - li r0,0 /* RS for P9 version of tlbiel */ - mtctr r6 + lwz r0,KVM_TLB_SETS(r9) + mtctr r0 li r7,0x800 /* IS field = 0b10 */ ptesync -28: tlbiel r7 + li r0,0 /* RS for P9 version of tlbiel */ + bne cr7, 29f +28: tlbiel r7 /* On P9, rs=0, RIC=0, PRS=0, R=0 */ addi r7,r7,0x1000 bdnz 28b - ptesync + b 30f +29: PPC_TLBIEL(7,0,2,1,1) /* for radix, RIC=2, PRS=1, R=1 */ + addi r7,r7,0x1000 + bdnz 29b +30: ptesync +23: ldarx r7,0,r6 /* clear the bit after TLB flushed */ + andc r7,r7,r8 + stdcx. r7,0,r6 + bne 23b /* Add timebase offset onto timebase */ 22: ld r8,VCORE_TB_OFFSET(r5) -- cgit v1.2.3 From 53af3ba2e8195f504d6a3a0667ccb5e7d4c57599 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Mon, 30 Jan 2017 21:21:51 +1100 Subject: KVM: PPC: Book3S HV: Allow guest exit path to have MMU on If we allow LPCR[AIL] to be set for radix guests, then interrupts from the guest to the host can be delivered by the hardware with relocation on, and thus the code path starting at kvmppc_interrupt_hv can be executed in virtual mode (MMU on) for radix guests (previously it was only ever executed in real mode). Most of the code is indifferent to whether the MMU is on or off, but the calls to OPAL that use the real-mode OPAL entry code need to be switched to use the virtual-mode code instead. The affected calls are the calls to the OPAL XICS emulation functions in kvmppc_read_one_intr() and related functions. We test the MSR[IR] bit to detect whether we are in real or virtual mode, and call the opal_rm_* or opal_* function as appropriate. The other place that depends on the MMU being off is the optimization where the guest exit code jumps to the external interrupt vector or hypervisor doorbell interrupt vector, or returns to its caller (which is __kvmppc_vcore_entry). If the MMU is on and we are returning to the caller, then we don't need to use an rfid instruction since the MMU is already on; a simple blr suffices. If there is an external or hypervisor doorbell interrupt to handle, we branch to the relocation-on version of the interrupt vector. Signed-off-by: Paul Mackerras Signed-off-by: Michael Ellerman --- arch/powerpc/kvm/book3s_hv_builtin.c | 38 ++++++++++++++++++++++----------- arch/powerpc/kvm/book3s_hv_rm_xics.c | 8 +++---- arch/powerpc/kvm/book3s_hv_rmhandlers.S | 29 +++++++++++++++++++++++++ 3 files changed, 58 insertions(+), 17 deletions(-) diff --git a/arch/powerpc/kvm/book3s_hv_builtin.c b/arch/powerpc/kvm/book3s_hv_builtin.c index 5bb24be0b346..fe08fea54b70 100644 --- a/arch/powerpc/kvm/book3s_hv_builtin.c +++ b/arch/powerpc/kvm/book3s_hv_builtin.c @@ -29,6 +29,11 @@ #include #include +static bool in_realmode(void) +{ + return !(mfmsr() & MSR_IR); +} + #define KVM_CMA_CHUNK_ORDER 18 /* @@ -200,7 +205,6 @@ static inline void rm_writeb(unsigned long paddr, u8 val) /* * Send an interrupt or message to another CPU. - * This can only be called in real mode. * The caller needs to include any barrier needed to order writes * to memory vs. the IPI/message. 
*/ @@ -226,7 +230,9 @@ void kvmhv_rm_send_ipi(int cpu) /* Else poke the target with an IPI */ xics_phys = paca[cpu].kvm_hstate.xics_phys; - if (xics_phys) + if (!in_realmode()) + opal_int_set_mfrr(get_hard_smp_processor_id(cpu), IPI_PRIORITY); + else if (xics_phys) rm_writeb(xics_phys + XICS_MFRR, IPI_PRIORITY); else opal_rm_int_set_mfrr(get_hard_smp_processor_id(cpu), @@ -412,14 +418,15 @@ static long kvmppc_read_one_intr(bool *again) /* Now read the interrupt from the ICP */ xics_phys = local_paca->kvm_hstate.xics_phys; - if (!xics_phys) { - /* Use OPAL to read the XIRR */ + rc = 0; + if (!in_realmode()) + rc = opal_int_get_xirr(&xirr, false); + else if (!xics_phys) rc = opal_rm_int_get_xirr(&xirr, false); - if (rc < 0) - return 1; - } else { + else xirr = _lwzcix(xics_phys + XICS_XIRR); - } + if (rc < 0) + return 1; /* * Save XIRR for later. Since we get control in reverse endian @@ -445,15 +452,19 @@ static long kvmppc_read_one_intr(bool *again) * If it is an IPI, clear the MFRR and EOI it. */ if (xisr == XICS_IPI) { - if (xics_phys) { + rc = 0; + if (!in_realmode()) { + opal_int_set_mfrr(hard_smp_processor_id(), 0xff); + rc = opal_int_eoi(h_xirr); + } else if (xics_phys) { _stbcix(xics_phys + XICS_MFRR, 0xff); _stwcix(xics_phys + XICS_XIRR, xirr); } else { opal_rm_int_set_mfrr(hard_smp_processor_id(), 0xff); rc = opal_rm_int_eoi(h_xirr); - /* If rc > 0, there is another interrupt pending */ - *again = rc > 0; } + /* If rc > 0, there is another interrupt pending */ + *again = rc > 0; /* * Need to ensure side effects of above stores @@ -471,7 +482,10 @@ static long kvmppc_read_one_intr(bool *again) /* We raced with the host, * we need to resend that IPI, bummer */ - if (xics_phys) + if (!in_realmode()) + opal_int_set_mfrr(hard_smp_processor_id(), + IPI_PRIORITY); + else if (xics_phys) _stbcix(xics_phys + XICS_MFRR, IPI_PRIORITY); else opal_rm_int_set_mfrr(hard_smp_processor_id(), diff --git a/arch/powerpc/kvm/book3s_hv_rm_xics.c b/arch/powerpc/kvm/book3s_hv_rm_xics.c index 06edc4366639..7e2eb3e865b3 100644 --- a/arch/powerpc/kvm/book3s_hv_rm_xics.c +++ b/arch/powerpc/kvm/book3s_hv_rm_xics.c @@ -70,11 +70,9 @@ static inline void icp_send_hcore_msg(int hcore, struct kvm_vcpu *vcpu) hcpu = hcore << threads_shift; kvmppc_host_rm_ops_hv->rm_core[hcore].rm_data = vcpu; smp_muxed_ipi_set_message(hcpu, PPC_MSG_RM_HOST_ACTION); - if (paca[hcpu].kvm_hstate.xics_phys) - icp_native_cause_ipi_rm(hcpu); - else - opal_rm_int_set_mfrr(get_hard_smp_processor_id(hcpu), - IPI_PRIORITY); + kvmppc_set_host_ipi(hcpu, 1); + smp_mb(); + kvmhv_rm_send_ipi(hcpu); } #else static inline void icp_send_hcore_msg(int hcore, struct kvm_vcpu *vcpu) { } diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S index dcc67a87d688..46c1c1fe55c8 100644 --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S @@ -148,6 +148,15 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) addi r1, r1, 112 ld r7, HSTATE_HOST_MSR(r13) + /* + * If we came back from the guest via a relocation-on interrupt, + * we will be in virtual mode at this point, which makes it a + * little easier to get back to the caller. + */ + mfmsr r0 + andi. r0, r0, MSR_IR /* in real mode? 
*/ + bne .Lvirt_return + cmpwi cr1, r12, BOOK3S_INTERRUPT_MACHINE_CHECK cmpwi r12, BOOK3S_INTERRUPT_EXTERNAL beq 11f @@ -181,6 +190,26 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) mtspr SPRN_HSRR1, r7 ba 0xe80 + /* Virtual-mode return - can't get here for HMI or machine check */ +.Lvirt_return: + cmpwi r12, BOOK3S_INTERRUPT_EXTERNAL + beq 16f + cmpwi r12, BOOK3S_INTERRUPT_H_DOORBELL + beq 17f + andi. r0, r7, MSR_EE /* were interrupts hard-enabled? */ + beq 18f + mtmsrd r7, 1 /* if so then re-enable them */ +18: mtlr r8 + blr + +16: mtspr SPRN_HSRR0, r8 /* jump to reloc-on external vector */ + mtspr SPRN_HSRR1, r7 + b exc_virt_0x4500_hardware_interrupt + +17: mtspr SPRN_HSRR0, r8 + mtspr SPRN_HSRR1, r7 + b exc_virt_0x4e80_h_doorbell + kvmppc_primary_no_guest: /* We handle this much like a ceded vcpu */ /* put the HDEC into the DEC, since HDEC interrupts don't wake us */ -- cgit v1.2.3 From f11f6f79b606fb54bb388d0ea652ed889b2fdf86 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Mon, 30 Jan 2017 21:21:52 +1100 Subject: KVM: PPC: Book3S HV: Invalidate ERAT on guest entry/exit for POWER9 DD1 On POWER9 DD1, we need to invalidate the ERAT (effective to real address translation cache) when changing the PIDR register, which we do as part of guest entry and exit. Signed-off-by: Paul Mackerras Signed-off-by: Michael Ellerman --- arch/powerpc/kvm/book3s_hv_rmhandlers.S | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S index 46c1c1fe55c8..47414a6fe2dd 100644 --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S @@ -876,6 +876,9 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S) mtspr SPRN_BESCR, r6 mtspr SPRN_PID, r7 mtspr SPRN_WORT, r8 +BEGIN_FTR_SECTION + PPC_INVALIDATE_ERAT +END_FTR_SECTION_IFSET(CPU_FTR_POWER9_DD1) BEGIN_FTR_SECTION /* POWER8-only registers */ ld r5, VCPU_TCSCR(r4) @@ -1620,6 +1623,9 @@ BEGIN_FTR_SECTION mtspr SPRN_PSSCR, r6 mtspr SPRN_PID, r7 END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300) +BEGIN_FTR_SECTION + PPC_INVALIDATE_ERAT +END_FTR_SECTION_IFSET(CPU_FTR_POWER9_DD1) /* * POWER7/POWER8 guest -> host partition switch code. -- cgit v1.2.3 From 8cf4ecc0ca9bd9bdc9b4ca0a99f7445a1e74afed Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Mon, 30 Jan 2017 21:21:53 +1100 Subject: KVM: PPC: Book3S HV: Enable radix guest support This adds a few last pieces of the support for radix guests: * Implement the backends for the KVM_PPC_CONFIGURE_V3_MMU and KVM_PPC_GET_RMMU_INFO ioctls for radix guests * On POWER9, allow secondary threads to be on/off-lined while guests are running. * Set up LPCR and the partition table entry for radix guests. * Don't allocate the rmap array in the kvm_memory_slot structure on radix. * Don't try to initialize the HPT for radix guests, since they don't have an HPT. * Take out the code that prevents the HV KVM module from initializing on radix hosts. At this stage, we only support radix guests if the host is running in radix mode, and only support HPT guests if the host is running in HPT mode. Thus a guest cannot switch from one mode to the other, which enables some simplifications. 
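As a rough userspace-side illustration of what this enables (a hedged sketch, not part of this series: error handling is minimal and it assumes a POWER9 radix host running a kernel with these patches), a VMM can now probe for radix support and query the supported radix geometries:

	#include <fcntl.h>
	#include <stdio.h>
	#include <sys/ioctl.h>
	#include <linux/kvm.h>

	int main(void)
	{
		int kvm = open("/dev/kvm", O_RDWR);
		int vm = ioctl(kvm, KVM_CREATE_VM, 0);
		struct kvm_ppc_rmmu_info info;

		if (ioctl(kvm, KVM_CHECK_EXTENSION, KVM_CAP_PPC_MMU_RADIX) <= 0) {
			fprintf(stderr, "radix MMU not supported by KVM\n");
			return 1;
		}
		/* supported radix tree geometries and AP/page-size encodings */
		if (ioctl(vm, KVM_PPC_GET_RMMU_INFO, &info) == 0)
			printf("first geometry: page shift %u\n",
			       info.geometries[0].page_shift);
		return 0;
	}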
Signed-off-by: Paul Mackerras Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/kvm_book3s.h | 2 + arch/powerpc/kvm/book3s_64_mmu_hv.c | 1 - arch/powerpc/kvm/book3s_64_mmu_radix.c | 45 +++++++++++++++++ arch/powerpc/kvm/book3s_hv.c | 88 ++++++++++++++++++++++++---------- arch/powerpc/kvm/powerpc.c | 2 +- 5 files changed, 111 insertions(+), 27 deletions(-) diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h index 57dc407cec4a..2bf35017ffc0 100644 --- a/arch/powerpc/include/asm/kvm_book3s.h +++ b/arch/powerpc/include/asm/kvm_book3s.h @@ -189,6 +189,7 @@ extern int kvmppc_book3s_radix_page_fault(struct kvm_run *run, unsigned long ea, unsigned long dsisr); extern int kvmppc_mmu_radix_xlate(struct kvm_vcpu *vcpu, gva_t eaddr, struct kvmppc_pte *gpte, bool data, bool iswrite); +extern int kvmppc_init_vm_radix(struct kvm *kvm); extern void kvmppc_free_radix(struct kvm *kvm); extern int kvmppc_radix_init(void); extern void kvmppc_radix_exit(void); @@ -200,6 +201,7 @@ extern int kvm_test_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot, unsigned long gfn); extern long kvmppc_hv_get_dirty_log_radix(struct kvm *kvm, struct kvm_memory_slot *memslot, unsigned long *map); +extern int kvmhv_get_rmmu_info(struct kvm *kvm, struct kvm_ppc_rmmu_info *info); /* XXX remove this export when load_last_inst() is generic */ extern int kvmppc_ld(struct kvm_vcpu *vcpu, ulong *eaddr, int size, void *ptr, bool data); diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c index c9e587a2849d..9df3d940acec 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_hv.c +++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c @@ -155,7 +155,6 @@ long kvmppc_alloc_reset_hpt(struct kvm *kvm, u32 *htab_orderp) void kvmppc_free_hpt(struct kvm *kvm) { - kvmppc_free_lpid(kvm->arch.lpid); vfree(kvm->arch.revmap); if (kvm->arch.hpt_cma_alloc) kvm_release_hpt(virt_to_page(kvm->arch.hpt_virt), diff --git a/arch/powerpc/kvm/book3s_64_mmu_radix.c b/arch/powerpc/kvm/book3s_64_mmu_radix.c index 125cc7ce1525..4344651f408c 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_radix.c +++ b/arch/powerpc/kvm/book3s_64_mmu_radix.c @@ -610,6 +610,51 @@ long kvmppc_hv_get_dirty_log_radix(struct kvm *kvm, return 0; } +static void add_rmmu_ap_encoding(struct kvm_ppc_rmmu_info *info, + int psize, int *indexp) +{ + if (!mmu_psize_defs[psize].shift) + return; + info->ap_encodings[*indexp] = mmu_psize_defs[psize].shift | + (mmu_psize_defs[psize].ap << 29); + ++(*indexp); +} + +int kvmhv_get_rmmu_info(struct kvm *kvm, struct kvm_ppc_rmmu_info *info) +{ + int i; + + if (!radix_enabled()) + return -EINVAL; + memset(info, 0, sizeof(*info)); + + /* 4k page size */ + info->geometries[0].page_shift = 12; + info->geometries[0].level_bits[0] = 9; + for (i = 1; i < 4; ++i) + info->geometries[0].level_bits[i] = p9_supported_radix_bits[i]; + /* 64k page size */ + info->geometries[1].page_shift = 16; + for (i = 0; i < 4; ++i) + info->geometries[1].level_bits[i] = p9_supported_radix_bits[i]; + + i = 0; + add_rmmu_ap_encoding(info, MMU_PAGE_4K, &i); + add_rmmu_ap_encoding(info, MMU_PAGE_64K, &i); + add_rmmu_ap_encoding(info, MMU_PAGE_2M, &i); + add_rmmu_ap_encoding(info, MMU_PAGE_1G, &i); + + return 0; +} + +int kvmppc_init_vm_radix(struct kvm *kvm) +{ + kvm->arch.pgtable = pgd_alloc(kvm->mm); + if (!kvm->arch.pgtable) + return -ENOMEM; + return 0; +} + void kvmppc_free_radix(struct kvm *kvm) { unsigned long ig, iu, im; diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 
50c230e83f9b..e4a79679342e 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -1135,7 +1135,7 @@ static void kvmppc_set_lpcr(struct kvm_vcpu *vcpu, u64 new_lpcr, /* * Userspace can only modify DPFD (default prefetch depth), * ILE (interrupt little-endian) and TC (translation control). - * On POWER8 userspace can also modify AIL (alt. interrupt loc.) + * On POWER8 and POWER9 userspace can also modify AIL (alt. interrupt loc.). */ mask = LPCR_DPFD | LPCR_ILE | LPCR_TC; if (cpu_has_feature(CPU_FTR_ARCH_207S)) @@ -2922,7 +2922,7 @@ static int kvmppc_vcpu_run_hv(struct kvm_run *run, struct kvm_vcpu *vcpu) smp_mb(); /* On the first time here, set up HTAB and VRMA */ - if (!vcpu->kvm->arch.hpte_setup_done) { + if (!kvm_is_radix(vcpu->kvm) && !vcpu->kvm->arch.hpte_setup_done) { r = kvmppc_hv_setup_htab_rma(vcpu); if (r) goto out; @@ -2984,6 +2984,13 @@ static int kvm_vm_ioctl_get_smmu_info_hv(struct kvm *kvm, { struct kvm_ppc_one_seg_page_size *sps; + /* + * Since we don't yet support HPT guests on a radix host, + * return an error if the host uses radix. + */ + if (radix_enabled()) + return -EINVAL; + info->flags = KVM_PPC_PAGE_SIZES_REAL; if (mmu_has_feature(MMU_FTR_1T_SEGMENT)) info->flags |= KVM_PPC_1T_SEGMENTS; @@ -3069,6 +3076,15 @@ static void kvmppc_core_free_memslot_hv(struct kvm_memory_slot *free, static int kvmppc_core_create_memslot_hv(struct kvm_memory_slot *slot, unsigned long npages) { + /* + * For now, if radix_enabled() then we only support radix guests, + * and in that case we don't need the rmap array. + */ + if (radix_enabled()) { + slot->arch.rmap = NULL; + return 0; + } + slot->arch.rmap = vzalloc(npages * sizeof(*slot->arch.rmap)); if (!slot->arch.rmap) return -ENOMEM; @@ -3149,14 +3165,20 @@ static void kvmppc_setup_partition_table(struct kvm *kvm) { unsigned long dw0, dw1; - /* PS field - page size for VRMA */ - dw0 = ((kvm->arch.vrma_slb_v & SLB_VSID_L) >> 1) | - ((kvm->arch.vrma_slb_v & SLB_VSID_LP) << 1); - /* HTABSIZE and HTABORG fields */ - dw0 |= kvm->arch.sdr1; + if (!kvm_is_radix(kvm)) { + /* PS field - page size for VRMA */ + dw0 = ((kvm->arch.vrma_slb_v & SLB_VSID_L) >> 1) | + ((kvm->arch.vrma_slb_v & SLB_VSID_LP) << 1); + /* HTABSIZE and HTABORG fields */ + dw0 |= kvm->arch.sdr1; - /* Second dword as set by userspace */ - dw1 = kvm->arch.process_table; + /* Second dword as set by userspace */ + dw1 = kvm->arch.process_table; + } else { + dw0 = PATB_HR | radix__get_tree_size() | + __pa(kvm->arch.pgtable) | RADIX_PGD_INDEX_SIZE; + dw1 = PATB_GR | kvm->arch.process_table; + } mmu_partition_table_set_entry(kvm->arch.lpid, dw0, dw1); } @@ -3326,6 +3348,7 @@ static int kvmppc_core_init_vm_hv(struct kvm *kvm) { unsigned long lpcr, lpid; char buf[32]; + int ret; /* Allocate the guest's logical partition ID */ @@ -3373,13 +3396,30 @@ static int kvmppc_core_init_vm_hv(struct kvm *kvm) lpcr |= LPCR_HVICE; } + /* + * For now, if the host uses radix, the guest must be radix. + */ + if (radix_enabled()) { + kvm->arch.radix = 1; + lpcr &= ~LPCR_VPM1; + lpcr |= LPCR_UPRT | LPCR_GTSE | LPCR_HR; + ret = kvmppc_init_vm_radix(kvm); + if (ret) { + kvmppc_free_lpid(kvm->arch.lpid); + return ret; + } + kvmppc_setup_partition_table(kvm); + } + kvm->arch.lpcr = lpcr; /* * Work out how many sets the TLB has, for the use of * the TLB invalidation loop in book3s_hv_rmhandlers.S. 
*/ - if (cpu_has_feature(CPU_FTR_ARCH_300)) + if (kvm_is_radix(kvm)) + kvm->arch.tlb_sets = POWER9_TLB_SETS_RADIX; /* 128 */ + else if (cpu_has_feature(CPU_FTR_ARCH_300)) kvm->arch.tlb_sets = POWER9_TLB_SETS_HASH; /* 256 */ else if (cpu_has_feature(CPU_FTR_ARCH_207S)) kvm->arch.tlb_sets = POWER8_TLB_SETS; /* 512 */ @@ -3389,8 +3429,11 @@ static int kvmppc_core_init_vm_hv(struct kvm *kvm) /* * Track that we now have a HV mode VM active. This blocks secondary * CPU threads from coming online. + * On POWER9, we only need to do this for HPT guests on a radix + * host, which is not yet supported. */ - kvm_hv_vm_activated(); + if (!cpu_has_feature(CPU_FTR_ARCH_300)) + kvm_hv_vm_activated(); /* * Create a debugfs directory for the VM @@ -3416,10 +3459,13 @@ static void kvmppc_core_destroy_vm_hv(struct kvm *kvm) { debugfs_remove_recursive(kvm->arch.debugfs_dir); - kvm_hv_vm_deactivated(); + if (!cpu_has_feature(CPU_FTR_ARCH_300)) + kvm_hv_vm_deactivated(); kvmppc_free_vcores(kvm); + kvmppc_free_lpid(kvm->arch.lpid); + if (kvm_is_radix(kvm)) kvmppc_free_radix(kvm); else @@ -3452,11 +3498,6 @@ static int kvmppc_core_check_processor_compat_hv(void) if (!cpu_has_feature(CPU_FTR_HVMODE) || !cpu_has_feature(CPU_FTR_ARCH_206)) return -EIO; - /* - * Disable KVM for Power9 in radix mode. - */ - if (cpu_has_feature(CPU_FTR_ARCH_300) && radix_enabled()) - return -EIO; return 0; } @@ -3727,6 +3768,7 @@ static void init_default_hcalls(void) static int kvmhv_configure_mmu(struct kvm *kvm, struct kvm_ppc_mmuv3_cfg *cfg) { unsigned long lpcr; + int radix; /* If not on a POWER9, reject it */ if (!cpu_has_feature(CPU_FTR_ARCH_300)) @@ -3736,12 +3778,13 @@ static int kvmhv_configure_mmu(struct kvm *kvm, struct kvm_ppc_mmuv3_cfg *cfg) if (cfg->flags & ~(KVM_PPC_MMUV3_RADIX | KVM_PPC_MMUV3_GTSE)) return -EINVAL; - /* We can't do radix yet */ - if (cfg->flags & KVM_PPC_MMUV3_RADIX) + /* We can't change a guest to/from radix yet */ + radix = !!(cfg->flags & KVM_PPC_MMUV3_RADIX); + if (radix != kvm_is_radix(kvm)) return -EINVAL; /* GR (guest radix) bit in process_table field must match */ - if (cfg->process_table & PATB_GR) + if (!!(cfg->process_table & PATB_GR) != radix) return -EINVAL; /* Process table size field must be reasonable, i.e. <= 24 */ @@ -3757,11 +3800,6 @@ static int kvmhv_configure_mmu(struct kvm *kvm, struct kvm_ppc_mmuv3_cfg *cfg) return 0; } -static int kvmhv_get_rmmu_info(struct kvm *kvm, struct kvm_ppc_rmmu_info *info) -{ - return -EINVAL; -} - static struct kvmppc_ops kvm_ops_hv = { .get_sregs = kvm_arch_vcpu_ioctl_get_sregs_hv, .set_sregs = kvm_arch_vcpu_ioctl_set_sregs_hv, diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index 1476a480745e..40a5b2d75ed1 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -566,7 +566,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) r = kvmppc_hwrng_present(); break; case KVM_CAP_PPC_MMU_RADIX: - r = !!(0 && hv_enabled && radix_enabled()); + r = !!(hv_enabled && radix_enabled()); break; case KVM_CAP_PPC_MMU_HASH_V3: r = !!(hv_enabled && !radix_enabled() && -- cgit v1.2.3 From ccc4df4e2c3825919456c13b153d2a67bbf328dc Mon Sep 17 00:00:00 2001 From: David Gibson Date: Tue, 20 Dec 2016 16:48:57 +1100 Subject: Documentation: Correct duplicate section number in kvm/api.txt Both KVM_CREATE_SPAPR_TCE_64 and KVM_REINJECT_CONTROL have section number 4.98 in Documentation/virtual/kvm/api.txt, presumably due to a naive merge. This corrects the duplication. 
[paulus@ozlabs.org - correct section numbers for following sections, KVM_PPC_CONFIGURE_V3_MMU and KVM_PPC_GET_RMMU_INFO, as well.] Signed-off-by: David Gibson Signed-off-by: Paul Mackerras --- Documentation/virtual/kvm/api.txt | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index 4470671b0c26..aca994a90355 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -3177,7 +3177,7 @@ of IOMMU pages. The rest of functionality is identical to KVM_CREATE_SPAPR_TCE. -4.98 KVM_REINJECT_CONTROL +4.99 KVM_REINJECT_CONTROL Capability: KVM_CAP_REINJECT_CONTROL Architectures: x86 @@ -3201,7 +3201,7 @@ struct kvm_reinject_control { pit_reinject = 0 (!reinject mode) is recommended, unless running an old operating system that uses the PIT for timing (e.g. Linux 2.4.x). -4.99 KVM_PPC_CONFIGURE_V3_MMU +4.100 KVM_PPC_CONFIGURE_V3_MMU Capability: KVM_CAP_PPC_RADIX_MMU or KVM_CAP_PPC_HASH_MMU_V3 Architectures: ppc @@ -3232,7 +3232,7 @@ process table, which is in the guest's space. This field is formatted as the second doubleword of the partition table entry, as defined in the Power ISA V3.00, Book III section 5.7.6.1. -4.100 KVM_PPC_GET_RMMU_INFO +4.101 KVM_PPC_GET_RMMU_INFO Capability: KVM_CAP_PPC_RADIX_MMU Architectures: ppc -- cgit v1.2.3 From ef1ead0c3b1dfb43d33caa4f50c8d214f86b6bc8 Mon Sep 17 00:00:00 2001 From: David Gibson Date: Tue, 20 Dec 2016 16:48:58 +1100 Subject: KVM: PPC: Book3S HV: HPT resizing documentation and reserved numbers This adds a new powerpc-specific KVM_CAP_SPAPR_RESIZE_HPT capability to advertise whether KVM is capable of handling the PAPR extensions for resizing the hashed page table during guest runtime. It also adds definitions for two new VM ioctl()s to implement this extension, and documentation of the same. Note that, HPT resizing is already possible with KVM PR without kernel modification, since the HPT is managed within userspace (qemu). The capability defined here will only be set where an in-kernel implementation of resizing is necessary, i.e. for KVM HV. To determine if the userspace resize implementation can be used, it's necessary to check KVM_CAP_PPC_ALLOC_HTAB. Unfortunately older kernels incorrectly set KVM_CAP_PPC_ALLOC_HTAB even with KVM PR. If userspace it want to support resizing with KVM PR on such kernels, it will need a workaround. Signed-off-by: David Gibson Signed-off-by: Paul Mackerras --- Documentation/virtual/kvm/api.txt | 95 +++++++++++++++++++++++++++++++++++++++ include/uapi/linux/kvm.h | 11 +++++ 2 files changed, 106 insertions(+) diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index aca994a90355..64f217af0416 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -3266,6 +3266,101 @@ The ap_encodings gives the supported page sizes and their AP field encodings, encoded with the AP value in the top 3 bits and the log base 2 of the page size in the bottom 6 bits. 
+4.102 KVM_PPC_RESIZE_HPT_PREPARE + +Capability: KVM_CAP_SPAPR_RESIZE_HPT +Architectures: powerpc +Type: vm ioctl +Parameters: struct kvm_ppc_resize_hpt (in) +Returns: 0 on successful completion, + >0 if a new HPT is being prepared, the value is an estimated + number of milliseconds until preparation is complete + -EFAULT if struct kvm_reinject_control cannot be read, + -EINVAL if the supplied shift or flags are invalid + -ENOMEM if unable to allocate the new HPT + -ENOSPC if there was a hash collision when moving existing + HPT entries to the new HPT + -EIO on other error conditions + +Used to implement the PAPR extension for runtime resizing of a guest's +Hashed Page Table (HPT). Specifically this starts, stops or monitors +the preparation of a new potential HPT for the guest, essentially +implementing the H_RESIZE_HPT_PREPARE hypercall. + +If called with shift > 0 when there is no pending HPT for the guest, +this begins preparation of a new pending HPT of size 2^(shift) bytes. +It then returns a positive integer with the estimated number of +milliseconds until preparation is complete. + +If called when there is a pending HPT whose size does not match that +requested in the parameters, discards the existing pending HPT and +creates a new one as above. + +If called when there is a pending HPT of the size requested, will: + * If preparation of the pending HPT is already complete, return 0 + * If preparation of the pending HPT has failed, return an error + code, then discard the pending HPT. + * If preparation of the pending HPT is still in progress, return an + estimated number of milliseconds until preparation is complete. + +If called with shift == 0, discards any currently pending HPT and +returns 0 (i.e. cancels any in-progress preparation). + +flags is reserved for future expansion, currently setting any bits in +flags will result in an -EINVAL. + +Normally this will be called repeatedly with the same parameters until +it returns <= 0. The first call will initiate preparation, subsequent +ones will monitor preparation until it completes or fails. + +struct kvm_ppc_resize_hpt { + __u64 flags; + __u32 shift; + __u32 pad; +}; + +4.103 KVM_PPC_RESIZE_HPT_COMMIT + +Capability: KVM_CAP_SPAPR_RESIZE_HPT +Architectures: powerpc +Type: vm ioctl +Parameters: struct kvm_ppc_resize_hpt (in) +Returns: 0 on successful completion, + -EFAULT if struct kvm_reinject_control cannot be read, + -EINVAL if the supplied shift or flags are invalid + -ENXIO is there is no pending HPT, or the pending HPT doesn't + have the requested size + -EBUSY if the pending HPT is not fully prepared + -ENOSPC if there was a hash collision when moving existing + HPT entries to the new HPT + -EIO on other error conditions + +Used to implement the PAPR extension for runtime resizing of a guest's +Hashed Page Table (HPT). Specifically this requests that the guest be +transferred to working with the new HPT, essentially implementing the +H_RESIZE_HPT_COMMIT hypercall. + +This should only be called after KVM_PPC_RESIZE_HPT_PREPARE has +returned 0 with the same parameters. In other cases +KVM_PPC_RESIZE_HPT_COMMIT will return an error (usually -ENXIO or +-EBUSY, though others may be possible if the preparation was started, +but failed). + +This will have undefined effects on the guest if it has not already +placed itself in a quiescent state where no vcpu will make MMU enabled +memory accesses. + +On succsful completion, the pending HPT will become the guest's active +HPT and the previous HPT will be discarded. 
+ +On failure, the guest will still be operating on its previous HPT. + +struct kvm_ppc_resize_hpt { + __u64 flags; + __u32 shift; + __u32 pad; +}; + 5. The kvm_run structure ------------------------ diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index e0035808c814..7964b970b9ad 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -685,6 +685,13 @@ struct kvm_ppc_smmu_info { struct kvm_ppc_one_seg_page_size sps[KVM_PPC_PAGE_SIZES_MAX_SZ]; }; +/* for KVM_PPC_RESIZE_HPT_{PREPARE,COMMIT} */ +struct kvm_ppc_resize_hpt { + __u64 flags; + __u32 shift; + __u32 pad; +}; + #define KVMIO 0xAE /* machine type bits, to be used as argument to KVM_CREATE_VM */ @@ -871,6 +878,7 @@ struct kvm_ppc_smmu_info { #define KVM_CAP_S390_USER_INSTR0 130 #define KVM_CAP_MSI_DEVID 131 #define KVM_CAP_PPC_HTM 132 +#define KVM_CAP_SPAPR_RESIZE_HPT 133 #define KVM_CAP_PPC_MMU_RADIX 134 #define KVM_CAP_PPC_MMU_HASH_V3 135 @@ -1189,6 +1197,9 @@ struct kvm_s390_ucas_mapping { #define KVM_ARM_SET_DEVICE_ADDR _IOW(KVMIO, 0xab, struct kvm_arm_device_addr) /* Available with KVM_CAP_PPC_RTAS */ #define KVM_PPC_RTAS_DEFINE_TOKEN _IOW(KVMIO, 0xac, struct kvm_rtas_token_args) +/* Available with KVM_CAP_SPAPR_RESIZE_HPT */ +#define KVM_PPC_RESIZE_HPT_PREPARE _IOR(KVMIO, 0xad, struct kvm_ppc_resize_hpt) +#define KVM_PPC_RESIZE_HPT_COMMIT _IOR(KVMIO, 0xae, struct kvm_ppc_resize_hpt) /* Available with KVM_CAP_PPC_RADIX_MMU or KVM_CAP_PPC_HASH_MMU_V3 */ #define KVM_PPC_CONFIGURE_V3_MMU _IOW(KVMIO, 0xaf, struct kvm_ppc_mmuv3_cfg) /* Available with KVM_CAP_PPC_RADIX_MMU */ -- cgit v1.2.3 From db9a290d9c3c596e5325e2a42133594435e5de46 Mon Sep 17 00:00:00 2001 From: David Gibson Date: Tue, 20 Dec 2016 16:48:59 +1100 Subject: KVM: PPC: Book3S HV: Rename kvm_alloc_hpt() for clarity The difference between kvm_alloc_hpt() and kvmppc_alloc_hpt() is not at all obvious from the name. In practice kvmppc_alloc_hpt() allocates an HPT by whatever means, and calls kvm_alloc_hpt() which will attempt to allocate it with CMA only. To make this less confusing, rename kvm_alloc_hpt() to kvm_alloc_hpt_cma(). Similarly, kvm_release_hpt() is renamed kvm_free_hpt_cma(). 
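Looking back at the KVM_PPC_RESIZE_HPT_PREPARE/COMMIT documentation added two patches above, a hedged userspace sketch of the documented protocol follows (vm_fd and new_shift are assumed to exist, and per that documentation the guest must already have quiesced itself before the commit step):

	#include <sys/ioctl.h>
	#include <unistd.h>
	#include <linux/kvm.h>

	/* Returns 0 on success.  new_shift is log2 of the new HPT size in
	 * bytes, e.g. 26 for a 64 MiB HPT. */
	static int resize_hpt(int vm_fd, unsigned int new_shift)
	{
		struct kvm_ppc_resize_hpt rhpt = { .flags = 0, .shift = new_shift };
		int rc;

		do {
			rc = ioctl(vm_fd, KVM_PPC_RESIZE_HPT_PREPARE, &rhpt);
			if (rc > 0)	/* estimated ms until the new HPT is ready */
				usleep(rc * 1000);
		} while (rc > 0);

		if (rc < 0)		/* preparation failed or was rejected */
			return rc;

		return ioctl(vm_fd, KVM_PPC_RESIZE_HPT_COMMIT, &rhpt);
	}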
Signed-off-by: David Gibson Reviewed-by: Thomas Huth Signed-off-by: Paul Mackerras --- arch/powerpc/include/asm/kvm_ppc.h | 4 ++-- arch/powerpc/kvm/book3s_64_mmu_hv.c | 8 ++++---- arch/powerpc/kvm/book3s_hv_builtin.c | 8 ++++---- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index 48c760f89590..6fad1f12e9ec 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h @@ -186,8 +186,8 @@ extern long kvmppc_h_stuff_tce(struct kvm_vcpu *vcpu, unsigned long tce_value, unsigned long npages); extern long kvmppc_h_get_tce(struct kvm_vcpu *vcpu, unsigned long liobn, unsigned long ioba); -extern struct page *kvm_alloc_hpt(unsigned long nr_pages); -extern void kvm_release_hpt(struct page *page, unsigned long nr_pages); +extern struct page *kvm_alloc_hpt_cma(unsigned long nr_pages); +extern void kvm_free_hpt_cma(struct page *page, unsigned long nr_pages); extern int kvmppc_core_init_vm(struct kvm *kvm); extern void kvmppc_core_destroy_vm(struct kvm *kvm); extern void kvmppc_core_free_memslot(struct kvm *kvm, diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c index 9df3d940acec..16f278417c69 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_hv.c +++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c @@ -62,7 +62,7 @@ long kvmppc_alloc_hpt(struct kvm *kvm, u32 *htab_orderp) } kvm->arch.hpt_cma_alloc = 0; - page = kvm_alloc_hpt(1ul << (order - PAGE_SHIFT)); + page = kvm_alloc_hpt_cma(1ul << (order - PAGE_SHIFT)); if (page) { hpt = (unsigned long)pfn_to_kaddr(page_to_pfn(page)); memset((void *)hpt, 0, (1ul << order)); @@ -108,7 +108,7 @@ long kvmppc_alloc_hpt(struct kvm *kvm, u32 *htab_orderp) out_freehpt: if (kvm->arch.hpt_cma_alloc) - kvm_release_hpt(page, 1 << (order - PAGE_SHIFT)); + kvm_free_hpt_cma(page, 1 << (order - PAGE_SHIFT)); else free_pages(hpt, order - PAGE_SHIFT); return -ENOMEM; @@ -157,8 +157,8 @@ void kvmppc_free_hpt(struct kvm *kvm) { vfree(kvm->arch.revmap); if (kvm->arch.hpt_cma_alloc) - kvm_release_hpt(virt_to_page(kvm->arch.hpt_virt), - 1 << (kvm->arch.hpt_order - PAGE_SHIFT)); + kvm_free_hpt_cma(virt_to_page(kvm->arch.hpt_virt), + 1 << (kvm->arch.hpt_order - PAGE_SHIFT)); else if (kvm->arch.hpt_virt) free_pages(kvm->arch.hpt_virt, kvm->arch.hpt_order - PAGE_SHIFT); diff --git a/arch/powerpc/kvm/book3s_hv_builtin.c b/arch/powerpc/kvm/book3s_hv_builtin.c index fe08fea54b70..96e7e609f621 100644 --- a/arch/powerpc/kvm/book3s_hv_builtin.c +++ b/arch/powerpc/kvm/book3s_hv_builtin.c @@ -57,19 +57,19 @@ static int __init early_parse_kvm_cma_resv(char *p) } early_param("kvm_cma_resv_ratio", early_parse_kvm_cma_resv); -struct page *kvm_alloc_hpt(unsigned long nr_pages) +struct page *kvm_alloc_hpt_cma(unsigned long nr_pages) { VM_BUG_ON(order_base_2(nr_pages) < KVM_CMA_CHUNK_ORDER - PAGE_SHIFT); return cma_alloc(kvm_cma, nr_pages, order_base_2(HPT_ALIGN_PAGES)); } -EXPORT_SYMBOL_GPL(kvm_alloc_hpt); +EXPORT_SYMBOL_GPL(kvm_alloc_hpt_cma); -void kvm_release_hpt(struct page *page, unsigned long nr_pages) +void kvm_free_hpt_cma(struct page *page, unsigned long nr_pages) { cma_release(kvm_cma, page, nr_pages); } -EXPORT_SYMBOL_GPL(kvm_release_hpt); +EXPORT_SYMBOL_GPL(kvm_free_hpt_cma); /** * kvm_cma_reserve() - reserve area for kvm hash pagetable -- cgit v1.2.3 From 3f9d4f5a5f35e402e91bedf0c15e29cef187a29d Mon Sep 17 00:00:00 2001 From: David Gibson Date: Tue, 20 Dec 2016 16:49:00 +1100 Subject: KVM: PPC: Book3S HV: Gather HPT related variables into sub-structure 
Currently, the powerpc kvm_arch structure contains a number of variables tracking the state of the guest's hashed page table (HPT) in KVM HV. This patch gathers them all together into a single kvm_hpt_info substructure. This makes life more convenient for the upcoming HPT resizing implementation. Signed-off-by: David Gibson Signed-off-by: Paul Mackerras --- arch/powerpc/include/asm/kvm_host.h | 20 +++++--- arch/powerpc/kvm/book3s_64_mmu_hv.c | 92 ++++++++++++++++++------------------- arch/powerpc/kvm/book3s_hv.c | 2 +- arch/powerpc/kvm/book3s_hv_rm_mmu.c | 62 ++++++++++++------------- 4 files changed, 92 insertions(+), 84 deletions(-) diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index b2dbeac3f450..ea6f0c659936 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -241,12 +241,24 @@ struct kvm_arch_memory_slot { #endif /* CONFIG_KVM_BOOK3S_HV_POSSIBLE */ }; +struct kvm_hpt_info { + /* Host virtual (linear mapping) address of guest HPT */ + unsigned long virt; + /* Array of reverse mapping entries for each guest HPTE */ + struct revmap_entry *rev; + unsigned long npte; + unsigned long mask; + /* Guest HPT size is 2**(order) bytes */ + u32 order; + /* 1 if HPT allocated with CMA, 0 otherwise */ + int cma; +}; + struct kvm_arch { unsigned int lpid; #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE unsigned int tlb_sets; - unsigned long hpt_virt; - struct revmap_entry *revmap; + struct kvm_hpt_info hpt; atomic64_t mmio_update; unsigned int host_lpid; unsigned long host_lpcr; @@ -256,15 +268,11 @@ struct kvm_arch { unsigned long lpcr; unsigned long vrma_slb_v; int hpte_setup_done; - u32 hpt_order; atomic_t vcpus_running; u32 online_vcores; - unsigned long hpt_npte; - unsigned long hpt_mask; atomic_t hpte_mod_interest; cpumask_t need_tlb_flush; cpumask_t cpu_in_guest; - int hpt_cma_alloc; u8 radix; pgd_t *pgtable; u64 process_table; diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c index 16f278417c69..2af63ce129bc 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_hv.c +++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c @@ -61,12 +61,12 @@ long kvmppc_alloc_hpt(struct kvm *kvm, u32 *htab_orderp) order = PPC_MIN_HPT_ORDER; } - kvm->arch.hpt_cma_alloc = 0; + kvm->arch.hpt.cma = 0; page = kvm_alloc_hpt_cma(1ul << (order - PAGE_SHIFT)); if (page) { hpt = (unsigned long)pfn_to_kaddr(page_to_pfn(page)); memset((void *)hpt, 0, (1ul << order)); - kvm->arch.hpt_cma_alloc = 1; + kvm->arch.hpt.cma = 1; } /* Lastly try successively smaller sizes from the page allocator */ @@ -81,22 +81,22 @@ long kvmppc_alloc_hpt(struct kvm *kvm, u32 *htab_orderp) if (!hpt) return -ENOMEM; - kvm->arch.hpt_virt = hpt; - kvm->arch.hpt_order = order; + kvm->arch.hpt.virt = hpt; + kvm->arch.hpt.order = order; /* HPTEs are 2**4 bytes long */ - kvm->arch.hpt_npte = 1ul << (order - 4); + kvm->arch.hpt.npte = 1ul << (order - 4); /* 128 (2**7) bytes in each HPTEG */ - kvm->arch.hpt_mask = (1ul << (order - 7)) - 1; + kvm->arch.hpt.mask = (1ul << (order - 7)) - 1; atomic64_set(&kvm->arch.mmio_update, 0); /* Allocate reverse map array */ - rev = vmalloc(sizeof(struct revmap_entry) * kvm->arch.hpt_npte); + rev = vmalloc(sizeof(struct revmap_entry) * kvm->arch.hpt.npte); if (!rev) { pr_err("kvmppc_alloc_hpt: Couldn't alloc reverse map array\n"); goto out_freehpt; } - kvm->arch.revmap = rev; + kvm->arch.hpt.rev = rev; kvm->arch.sdr1 = __pa(hpt) | (order - 18); pr_info("KVM guest htab at %lx (order %ld), LPID %x\n", @@ -107,7 +107,7 @@ long 
kvmppc_alloc_hpt(struct kvm *kvm, u32 *htab_orderp) return 0; out_freehpt: - if (kvm->arch.hpt_cma_alloc) + if (kvm->arch.hpt.cma) kvm_free_hpt_cma(page, 1 << (order - PAGE_SHIFT)); else free_pages(hpt, order - PAGE_SHIFT); @@ -132,10 +132,10 @@ long kvmppc_alloc_reset_hpt(struct kvm *kvm, u32 *htab_orderp) goto out; } } - if (kvm->arch.hpt_virt) { - order = kvm->arch.hpt_order; + if (kvm->arch.hpt.virt) { + order = kvm->arch.hpt.order; /* Set the entire HPT to 0, i.e. invalid HPTEs */ - memset((void *)kvm->arch.hpt_virt, 0, 1ul << order); + memset((void *)kvm->arch.hpt.virt, 0, 1ul << order); /* * Reset all the reverse-mapping chains for all memslots */ @@ -155,13 +155,13 @@ long kvmppc_alloc_reset_hpt(struct kvm *kvm, u32 *htab_orderp) void kvmppc_free_hpt(struct kvm *kvm) { - vfree(kvm->arch.revmap); - if (kvm->arch.hpt_cma_alloc) - kvm_free_hpt_cma(virt_to_page(kvm->arch.hpt_virt), - 1 << (kvm->arch.hpt_order - PAGE_SHIFT)); - else if (kvm->arch.hpt_virt) - free_pages(kvm->arch.hpt_virt, - kvm->arch.hpt_order - PAGE_SHIFT); + vfree(kvm->arch.hpt.rev); + if (kvm->arch.hpt.cma) + kvm_free_hpt_cma(virt_to_page(kvm->arch.hpt.virt), + 1 << (kvm->arch.hpt.order - PAGE_SHIFT)); + else if (kvm->arch.hpt.virt) + free_pages(kvm->arch.hpt.virt, + kvm->arch.hpt.order - PAGE_SHIFT); } /* Bits in first HPTE dword for pagesize 4k, 64k or 16M */ @@ -196,8 +196,8 @@ void kvmppc_map_vrma(struct kvm_vcpu *vcpu, struct kvm_memory_slot *memslot, if (npages > 1ul << (40 - porder)) npages = 1ul << (40 - porder); /* Can't use more than 1 HPTE per HPTEG */ - if (npages > kvm->arch.hpt_mask + 1) - npages = kvm->arch.hpt_mask + 1; + if (npages > kvm->arch.hpt.mask + 1) + npages = kvm->arch.hpt.mask + 1; hp0 = HPTE_V_1TB_SEG | (VRMA_VSID << (40 - 16)) | HPTE_V_BOLTED | hpte0_pgsize_encoding(psize); @@ -207,7 +207,7 @@ void kvmppc_map_vrma(struct kvm_vcpu *vcpu, struct kvm_memory_slot *memslot, for (i = 0; i < npages; ++i) { addr = i << porder; /* can't use hpt_hash since va > 64 bits */ - hash = (i ^ (VRMA_VSID ^ (VRMA_VSID << 25))) & kvm->arch.hpt_mask; + hash = (i ^ (VRMA_VSID ^ (VRMA_VSID << 25))) & kvm->arch.hpt.mask; /* * We assume that the hash table is empty and no * vcpus are using it at this stage. 
Since we create @@ -340,11 +340,11 @@ static int kvmppc_mmu_book3s_64_hv_xlate(struct kvm_vcpu *vcpu, gva_t eaddr, preempt_enable(); return -ENOENT; } - hptep = (__be64 *)(kvm->arch.hpt_virt + (index << 4)); + hptep = (__be64 *)(kvm->arch.hpt.virt + (index << 4)); v = orig_v = be64_to_cpu(hptep[0]) & ~HPTE_V_HVLOCK; if (cpu_has_feature(CPU_FTR_ARCH_300)) v = hpte_new_to_old_v(v, be64_to_cpu(hptep[1])); - gr = kvm->arch.revmap[index].guest_rpte; + gr = kvm->arch.hpt.rev[index].guest_rpte; unlock_hpte(hptep, orig_v); preempt_enable(); @@ -485,8 +485,8 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu, } } index = vcpu->arch.pgfault_index; - hptep = (__be64 *)(kvm->arch.hpt_virt + (index << 4)); - rev = &kvm->arch.revmap[index]; + hptep = (__be64 *)(kvm->arch.hpt.virt + (index << 4)); + rev = &kvm->arch.hpt.rev[index]; preempt_disable(); while (!try_lock_hpte(hptep, HPTE_V_HVLOCK)) cpu_relax(); @@ -748,7 +748,7 @@ static int kvm_handle_hva(struct kvm *kvm, unsigned long hva, static int kvm_unmap_rmapp(struct kvm *kvm, struct kvm_memory_slot *memslot, unsigned long gfn) { - struct revmap_entry *rev = kvm->arch.revmap; + struct revmap_entry *rev = kvm->arch.hpt.rev; unsigned long h, i, j; __be64 *hptep; unsigned long ptel, psize, rcbits; @@ -768,7 +768,7 @@ static int kvm_unmap_rmapp(struct kvm *kvm, struct kvm_memory_slot *memslot, * rmap chain lock. */ i = *rmapp & KVMPPC_RMAP_INDEX; - hptep = (__be64 *) (kvm->arch.hpt_virt + (i << 4)); + hptep = (__be64 *) (kvm->arch.hpt.virt + (i << 4)); if (!try_lock_hpte(hptep, HPTE_V_HVLOCK)) { /* unlock rmap before spinning on the HPTE lock */ unlock_rmap(rmapp); @@ -860,7 +860,7 @@ void kvmppc_core_flush_memslot_hv(struct kvm *kvm, static int kvm_age_rmapp(struct kvm *kvm, struct kvm_memory_slot *memslot, unsigned long gfn) { - struct revmap_entry *rev = kvm->arch.revmap; + struct revmap_entry *rev = kvm->arch.hpt.rev; unsigned long head, i, j; __be64 *hptep; int ret = 0; @@ -880,7 +880,7 @@ static int kvm_age_rmapp(struct kvm *kvm, struct kvm_memory_slot *memslot, i = head = *rmapp & KVMPPC_RMAP_INDEX; do { - hptep = (__be64 *) (kvm->arch.hpt_virt + (i << 4)); + hptep = (__be64 *) (kvm->arch.hpt.virt + (i << 4)); j = rev[i].forw; /* If this HPTE isn't referenced, ignore it */ @@ -923,7 +923,7 @@ int kvm_age_hva_hv(struct kvm *kvm, unsigned long start, unsigned long end) static int kvm_test_age_rmapp(struct kvm *kvm, struct kvm_memory_slot *memslot, unsigned long gfn) { - struct revmap_entry *rev = kvm->arch.revmap; + struct revmap_entry *rev = kvm->arch.hpt.rev; unsigned long head, i, j; unsigned long *hp; int ret = 1; @@ -940,7 +940,7 @@ static int kvm_test_age_rmapp(struct kvm *kvm, struct kvm_memory_slot *memslot, if (*rmapp & KVMPPC_RMAP_PRESENT) { i = head = *rmapp & KVMPPC_RMAP_INDEX; do { - hp = (unsigned long *)(kvm->arch.hpt_virt + (i << 4)); + hp = (unsigned long *)(kvm->arch.hpt.virt + (i << 4)); j = rev[i].forw; if (be64_to_cpu(hp[1]) & HPTE_R_R) goto out; @@ -980,7 +980,7 @@ static int vcpus_running(struct kvm *kvm) */ static int kvm_test_clear_dirty_npages(struct kvm *kvm, unsigned long *rmapp) { - struct revmap_entry *rev = kvm->arch.revmap; + struct revmap_entry *rev = kvm->arch.hpt.rev; unsigned long head, i, j; unsigned long n; unsigned long v, r; @@ -1005,7 +1005,7 @@ static int kvm_test_clear_dirty_npages(struct kvm *kvm, unsigned long *rmapp) i = head = *rmapp & KVMPPC_RMAP_INDEX; do { unsigned long hptep1; - hptep = (__be64 *) (kvm->arch.hpt_virt + (i << 4)); + hptep = (__be64 *) (kvm->arch.hpt.virt + 
(i << 4)); j = rev[i].forw; /* @@ -1311,8 +1311,8 @@ static ssize_t kvm_htab_read(struct file *file, char __user *buf, flags = ctx->flags; i = ctx->index; - hptp = (__be64 *)(kvm->arch.hpt_virt + (i * HPTE_SIZE)); - revp = kvm->arch.revmap + i; + hptp = (__be64 *)(kvm->arch.hpt.virt + (i * HPTE_SIZE)); + revp = kvm->arch.hpt.rev + i; lbuf = (unsigned long __user *)buf; nb = 0; @@ -1327,7 +1327,7 @@ static ssize_t kvm_htab_read(struct file *file, char __user *buf, /* Skip uninteresting entries, i.e. clean on not-first pass */ if (!first_pass) { - while (i < kvm->arch.hpt_npte && + while (i < kvm->arch.hpt.npte && !hpte_dirty(revp, hptp)) { ++i; hptp += 2; @@ -1337,7 +1337,7 @@ static ssize_t kvm_htab_read(struct file *file, char __user *buf, hdr.index = i; /* Grab a series of valid entries */ - while (i < kvm->arch.hpt_npte && + while (i < kvm->arch.hpt.npte && hdr.n_valid < 0xffff && nb + HPTE_SIZE < count && record_hpte(flags, hptp, hpte, revp, 1, first_pass)) { @@ -1353,7 +1353,7 @@ static ssize_t kvm_htab_read(struct file *file, char __user *buf, ++revp; } /* Now skip invalid entries while we can */ - while (i < kvm->arch.hpt_npte && + while (i < kvm->arch.hpt.npte && hdr.n_invalid < 0xffff && record_hpte(flags, hptp, hpte, revp, 0, first_pass)) { /* found an invalid entry */ @@ -1374,7 +1374,7 @@ static ssize_t kvm_htab_read(struct file *file, char __user *buf, } /* Check if we've wrapped around the hash table */ - if (i >= kvm->arch.hpt_npte) { + if (i >= kvm->arch.hpt.npte) { i = 0; ctx->first_pass = 0; break; @@ -1433,11 +1433,11 @@ static ssize_t kvm_htab_write(struct file *file, const char __user *buf, err = -EINVAL; i = hdr.index; - if (i >= kvm->arch.hpt_npte || - i + hdr.n_valid + hdr.n_invalid > kvm->arch.hpt_npte) + if (i >= kvm->arch.hpt.npte || + i + hdr.n_valid + hdr.n_invalid > kvm->arch.hpt.npte) break; - hptp = (__be64 *)(kvm->arch.hpt_virt + (i * HPTE_SIZE)); + hptp = (__be64 *)(kvm->arch.hpt.virt + (i * HPTE_SIZE)); lbuf = (unsigned long __user *)buf; for (j = 0; j < hdr.n_valid; ++j) { __be64 hpte_v; @@ -1624,8 +1624,8 @@ static ssize_t debugfs_htab_read(struct file *file, char __user *buf, kvm = p->kvm; i = p->hpt_index; - hptp = (__be64 *)(kvm->arch.hpt_virt + (i * HPTE_SIZE)); - for (; len != 0 && i < kvm->arch.hpt_npte; ++i, hptp += 2) { + hptp = (__be64 *)(kvm->arch.hpt.virt + (i * HPTE_SIZE)); + for (; len != 0 && i < kvm->arch.hpt.npte; ++i, hptp += 2) { if (!(be64_to_cpu(hptp[0]) & (HPTE_V_VALID | HPTE_V_ABSENT))) continue; @@ -1635,7 +1635,7 @@ static ssize_t debugfs_htab_read(struct file *file, char __user *buf, cpu_relax(); v = be64_to_cpu(hptp[0]) & ~HPTE_V_HVLOCK; hr = be64_to_cpu(hptp[1]); - gr = kvm->arch.revmap[i].guest_rpte; + gr = kvm->arch.hpt.rev[i].guest_rpte; unlock_hpte(hptp, v); preempt_enable(); diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index bdf281cc88c0..02607128a4d4 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -3197,7 +3197,7 @@ static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu) goto out; /* another vcpu beat us to it */ /* Allocate hashed page table (if not done already) and reset it */ - if (!kvm->arch.hpt_virt) { + if (!kvm->arch.hpt.virt) { err = kvmppc_alloc_hpt(kvm, NULL); if (err) { pr_err("KVM: Couldn't alloc HPT\n"); diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c index b095afcd4309..175748acc9a1 100644 --- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c +++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c @@ -86,10 +86,10 @@ void 
kvmppc_add_revmap_chain(struct kvm *kvm, struct revmap_entry *rev, if (*rmap & KVMPPC_RMAP_PRESENT) { i = *rmap & KVMPPC_RMAP_INDEX; - head = &kvm->arch.revmap[i]; + head = &kvm->arch.hpt.rev[i]; if (realmode) head = real_vmalloc_addr(head); - tail = &kvm->arch.revmap[head->back]; + tail = &kvm->arch.hpt.rev[head->back]; if (realmode) tail = real_vmalloc_addr(tail); rev->forw = i; @@ -154,8 +154,8 @@ static void remove_revmap_chain(struct kvm *kvm, long pte_index, lock_rmap(rmap); head = *rmap & KVMPPC_RMAP_INDEX; - next = real_vmalloc_addr(&kvm->arch.revmap[rev->forw]); - prev = real_vmalloc_addr(&kvm->arch.revmap[rev->back]); + next = real_vmalloc_addr(&kvm->arch.hpt.rev[rev->forw]); + prev = real_vmalloc_addr(&kvm->arch.hpt.rev[rev->back]); next->back = rev->back; prev->forw = rev->forw; if (head == pte_index) { @@ -292,11 +292,11 @@ long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags, /* Find and lock the HPTEG slot to use */ do_insert: - if (pte_index >= kvm->arch.hpt_npte) + if (pte_index >= kvm->arch.hpt.npte) return H_PARAMETER; if (likely((flags & H_EXACT) == 0)) { pte_index &= ~7UL; - hpte = (__be64 *)(kvm->arch.hpt_virt + (pte_index << 4)); + hpte = (__be64 *)(kvm->arch.hpt.virt + (pte_index << 4)); for (i = 0; i < 8; ++i) { if ((be64_to_cpu(*hpte) & HPTE_V_VALID) == 0 && try_lock_hpte(hpte, HPTE_V_HVLOCK | HPTE_V_VALID | @@ -327,7 +327,7 @@ long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags, } pte_index += i; } else { - hpte = (__be64 *)(kvm->arch.hpt_virt + (pte_index << 4)); + hpte = (__be64 *)(kvm->arch.hpt.virt + (pte_index << 4)); if (!try_lock_hpte(hpte, HPTE_V_HVLOCK | HPTE_V_VALID | HPTE_V_ABSENT)) { /* Lock the slot and check again */ @@ -344,7 +344,7 @@ long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags, } /* Save away the guest's idea of the second HPTE dword */ - rev = &kvm->arch.revmap[pte_index]; + rev = &kvm->arch.hpt.rev[pte_index]; if (realmode) rev = real_vmalloc_addr(rev); if (rev) { @@ -469,9 +469,9 @@ long kvmppc_do_h_remove(struct kvm *kvm, unsigned long flags, if (kvm_is_radix(kvm)) return H_FUNCTION; - if (pte_index >= kvm->arch.hpt_npte) + if (pte_index >= kvm->arch.hpt.npte) return H_PARAMETER; - hpte = (__be64 *)(kvm->arch.hpt_virt + (pte_index << 4)); + hpte = (__be64 *)(kvm->arch.hpt.virt + (pte_index << 4)); while (!try_lock_hpte(hpte, HPTE_V_HVLOCK)) cpu_relax(); pte = orig_pte = be64_to_cpu(hpte[0]); @@ -487,7 +487,7 @@ long kvmppc_do_h_remove(struct kvm *kvm, unsigned long flags, return H_NOT_FOUND; } - rev = real_vmalloc_addr(&kvm->arch.revmap[pte_index]); + rev = real_vmalloc_addr(&kvm->arch.hpt.rev[pte_index]); v = pte & ~HPTE_V_HVLOCK; if (v & HPTE_V_VALID) { hpte[0] &= ~cpu_to_be64(HPTE_V_VALID); @@ -557,13 +557,13 @@ long kvmppc_h_bulk_remove(struct kvm_vcpu *vcpu) break; } if (req != 1 || flags == 3 || - pte_index >= kvm->arch.hpt_npte) { + pte_index >= kvm->arch.hpt.npte) { /* parameter error */ args[j] = ((0xa0 | flags) << 56) + pte_index; ret = H_PARAMETER; break; } - hp = (__be64 *) (kvm->arch.hpt_virt + (pte_index << 4)); + hp = (__be64 *) (kvm->arch.hpt.virt + (pte_index << 4)); /* to avoid deadlock, don't spin except for first */ if (!try_lock_hpte(hp, HPTE_V_HVLOCK)) { if (n) @@ -600,7 +600,7 @@ long kvmppc_h_bulk_remove(struct kvm_vcpu *vcpu) } args[j] = ((0x80 | flags) << 56) + pte_index; - rev = real_vmalloc_addr(&kvm->arch.revmap[pte_index]); + rev = real_vmalloc_addr(&kvm->arch.hpt.rev[pte_index]); note_hpte_modification(kvm, rev); if (!(hp0 & HPTE_V_VALID)) { @@ -657,10 +657,10 @@ long 
kvmppc_h_protect(struct kvm_vcpu *vcpu, unsigned long flags, if (kvm_is_radix(kvm)) return H_FUNCTION; - if (pte_index >= kvm->arch.hpt_npte) + if (pte_index >= kvm->arch.hpt.npte) return H_PARAMETER; - hpte = (__be64 *)(kvm->arch.hpt_virt + (pte_index << 4)); + hpte = (__be64 *)(kvm->arch.hpt.virt + (pte_index << 4)); while (!try_lock_hpte(hpte, HPTE_V_HVLOCK)) cpu_relax(); v = pte_v = be64_to_cpu(hpte[0]); @@ -680,7 +680,7 @@ long kvmppc_h_protect(struct kvm_vcpu *vcpu, unsigned long flags, /* Update guest view of 2nd HPTE dword */ mask = HPTE_R_PP0 | HPTE_R_PP | HPTE_R_N | HPTE_R_KEY_HI | HPTE_R_KEY_LO; - rev = real_vmalloc_addr(&kvm->arch.revmap[pte_index]); + rev = real_vmalloc_addr(&kvm->arch.hpt.rev[pte_index]); if (rev) { r = (rev->guest_rpte & ~mask) | bits; rev->guest_rpte = r; @@ -728,15 +728,15 @@ long kvmppc_h_read(struct kvm_vcpu *vcpu, unsigned long flags, if (kvm_is_radix(kvm)) return H_FUNCTION; - if (pte_index >= kvm->arch.hpt_npte) + if (pte_index >= kvm->arch.hpt.npte) return H_PARAMETER; if (flags & H_READ_4) { pte_index &= ~3; n = 4; } - rev = real_vmalloc_addr(&kvm->arch.revmap[pte_index]); + rev = real_vmalloc_addr(&kvm->arch.hpt.rev[pte_index]); for (i = 0; i < n; ++i, ++pte_index) { - hpte = (__be64 *)(kvm->arch.hpt_virt + (pte_index << 4)); + hpte = (__be64 *)(kvm->arch.hpt.virt + (pte_index << 4)); v = be64_to_cpu(hpte[0]) & ~HPTE_V_HVLOCK; r = be64_to_cpu(hpte[1]); if (cpu_has_feature(CPU_FTR_ARCH_300)) { @@ -769,11 +769,11 @@ long kvmppc_h_clear_ref(struct kvm_vcpu *vcpu, unsigned long flags, if (kvm_is_radix(kvm)) return H_FUNCTION; - if (pte_index >= kvm->arch.hpt_npte) + if (pte_index >= kvm->arch.hpt.npte) return H_PARAMETER; - rev = real_vmalloc_addr(&kvm->arch.revmap[pte_index]); - hpte = (__be64 *)(kvm->arch.hpt_virt + (pte_index << 4)); + rev = real_vmalloc_addr(&kvm->arch.hpt.rev[pte_index]); + hpte = (__be64 *)(kvm->arch.hpt.virt + (pte_index << 4)); while (!try_lock_hpte(hpte, HPTE_V_HVLOCK)) cpu_relax(); v = be64_to_cpu(hpte[0]); @@ -817,11 +817,11 @@ long kvmppc_h_clear_mod(struct kvm_vcpu *vcpu, unsigned long flags, if (kvm_is_radix(kvm)) return H_FUNCTION; - if (pte_index >= kvm->arch.hpt_npte) + if (pte_index >= kvm->arch.hpt.npte) return H_PARAMETER; - rev = real_vmalloc_addr(&kvm->arch.revmap[pte_index]); - hpte = (__be64 *)(kvm->arch.hpt_virt + (pte_index << 4)); + rev = real_vmalloc_addr(&kvm->arch.hpt.rev[pte_index]); + hpte = (__be64 *)(kvm->arch.hpt.virt + (pte_index << 4)); while (!try_lock_hpte(hpte, HPTE_V_HVLOCK)) cpu_relax(); v = be64_to_cpu(hpte[0]); @@ -970,7 +970,7 @@ long kvmppc_hv_find_lock_hpte(struct kvm *kvm, gva_t eaddr, unsigned long slb_v, somask = (1UL << 28) - 1; vsid = (slb_v & ~SLB_VSID_B) >> SLB_VSID_SHIFT; } - hash = (vsid ^ ((eaddr & somask) >> pshift)) & kvm->arch.hpt_mask; + hash = (vsid ^ ((eaddr & somask) >> pshift)) & kvm->arch.hpt.mask; avpn = slb_v & ~(somask >> 16); /* also includes B */ avpn |= (eaddr & somask) >> 16; @@ -981,7 +981,7 @@ long kvmppc_hv_find_lock_hpte(struct kvm *kvm, gva_t eaddr, unsigned long slb_v, val |= avpn; for (;;) { - hpte = (__be64 *)(kvm->arch.hpt_virt + (hash << 7)); + hpte = (__be64 *)(kvm->arch.hpt.virt + (hash << 7)); for (i = 0; i < 16; i += 2) { /* Read the PTE racily */ @@ -1017,7 +1017,7 @@ long kvmppc_hv_find_lock_hpte(struct kvm *kvm, gva_t eaddr, unsigned long slb_v, if (val & HPTE_V_SECONDARY) break; val |= HPTE_V_SECONDARY; - hash = hash ^ kvm->arch.hpt_mask; + hash = hash ^ kvm->arch.hpt.mask; } return -1; } @@ -1066,14 +1066,14 @@ long kvmppc_hpte_hv_fault(struct 
kvm_vcpu *vcpu, unsigned long addr, return status; /* there really was no HPTE */ return 0; /* for prot fault, HPTE disappeared */ } - hpte = (__be64 *)(kvm->arch.hpt_virt + (index << 4)); + hpte = (__be64 *)(kvm->arch.hpt.virt + (index << 4)); v = orig_v = be64_to_cpu(hpte[0]) & ~HPTE_V_HVLOCK; r = be64_to_cpu(hpte[1]); if (cpu_has_feature(CPU_FTR_ARCH_300)) { v = hpte_new_to_old_v(v, r); r = hpte_new_to_old_r(r); } - rev = real_vmalloc_addr(&kvm->arch.revmap[index]); + rev = real_vmalloc_addr(&kvm->arch.hpt.rev[index]); gr = rev->guest_rpte; unlock_hpte(hpte, orig_v); -- cgit v1.2.3 From 3d089f84c6f9b7b0eda993142d73961a44b553d2 Mon Sep 17 00:00:00 2001 From: David Gibson Date: Tue, 20 Dec 2016 16:49:01 +1100 Subject: KVM: PPC: Book3S HV: Don't store values derivable from HPT order Currently the kvm_hpt_info structure stores the hashed page table's order, and also the number of HPTEs it contains and a mask for its size. The last two can be easily derived from the order, so remove them and just calculate them as necessary with a couple of helper inlines. Signed-off-by: David Gibson Reviewed-by: Thomas Huth Signed-off-by: Paul Mackerras --- arch/powerpc/include/asm/kvm_book3s_64.h | 12 ++++++++++++ arch/powerpc/include/asm/kvm_host.h | 2 -- arch/powerpc/kvm/book3s_64_mmu_hv.c | 28 +++++++++++++--------------- arch/powerpc/kvm/book3s_hv_rm_mmu.c | 18 +++++++++--------- 4 files changed, 34 insertions(+), 26 deletions(-) diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h b/arch/powerpc/include/asm/kvm_book3s_64.h index 0db010cc4e65..ecc3b33bcc59 100644 --- a/arch/powerpc/include/asm/kvm_book3s_64.h +++ b/arch/powerpc/include/asm/kvm_book3s_64.h @@ -356,6 +356,18 @@ extern void kvmppc_mmu_debugfs_init(struct kvm *kvm); extern void kvmhv_rm_send_ipi(int cpu); +static inline unsigned long kvmppc_hpt_npte(struct kvm_hpt_info *hpt) +{ + /* HPTEs are 2**4 bytes long */ + return 1UL << (hpt->order - 4); +} + +static inline unsigned long kvmppc_hpt_mask(struct kvm_hpt_info *hpt) +{ + /* 128 (2**7) bytes in each HPTEG */ + return (1UL << (hpt->order - 7)) - 1; +} + #endif /* CONFIG_KVM_BOOK3S_HV_POSSIBLE */ #endif /* __ASM_KVM_BOOK3S_64_H__ */ diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index ea6f0c659936..0aa0f22d775a 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -246,8 +246,6 @@ struct kvm_hpt_info { unsigned long virt; /* Array of reverse mapping entries for each guest HPTE */ struct revmap_entry *rev; - unsigned long npte; - unsigned long mask; /* Guest HPT size is 2**(order) bytes */ u32 order; /* 1 if HPT allocated with CMA, 0 otherwise */ diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c index 2af63ce129bc..d89995ef6c7b 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_hv.c +++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c @@ -83,15 +83,11 @@ long kvmppc_alloc_hpt(struct kvm *kvm, u32 *htab_orderp) kvm->arch.hpt.virt = hpt; kvm->arch.hpt.order = order; - /* HPTEs are 2**4 bytes long */ - kvm->arch.hpt.npte = 1ul << (order - 4); - /* 128 (2**7) bytes in each HPTEG */ - kvm->arch.hpt.mask = (1ul << (order - 7)) - 1; atomic64_set(&kvm->arch.mmio_update, 0); /* Allocate reverse map array */ - rev = vmalloc(sizeof(struct revmap_entry) * kvm->arch.hpt.npte); + rev = vmalloc(sizeof(struct revmap_entry) * kvmppc_hpt_npte(&kvm->arch.hpt)); if (!rev) { pr_err("kvmppc_alloc_hpt: Couldn't alloc reverse map array\n"); goto out_freehpt; @@ -196,8 +192,8 @@ void kvmppc_map_vrma(struct kvm_vcpu 
*vcpu, struct kvm_memory_slot *memslot, if (npages > 1ul << (40 - porder)) npages = 1ul << (40 - porder); /* Can't use more than 1 HPTE per HPTEG */ - if (npages > kvm->arch.hpt.mask + 1) - npages = kvm->arch.hpt.mask + 1; + if (npages > kvmppc_hpt_mask(&kvm->arch.hpt) + 1) + npages = kvmppc_hpt_mask(&kvm->arch.hpt) + 1; hp0 = HPTE_V_1TB_SEG | (VRMA_VSID << (40 - 16)) | HPTE_V_BOLTED | hpte0_pgsize_encoding(psize); @@ -207,7 +203,8 @@ void kvmppc_map_vrma(struct kvm_vcpu *vcpu, struct kvm_memory_slot *memslot, for (i = 0; i < npages; ++i) { addr = i << porder; /* can't use hpt_hash since va > 64 bits */ - hash = (i ^ (VRMA_VSID ^ (VRMA_VSID << 25))) & kvm->arch.hpt.mask; + hash = (i ^ (VRMA_VSID ^ (VRMA_VSID << 25))) + & kvmppc_hpt_mask(&kvm->arch.hpt); /* * We assume that the hash table is empty and no * vcpus are using it at this stage. Since we create @@ -1327,7 +1324,7 @@ static ssize_t kvm_htab_read(struct file *file, char __user *buf, /* Skip uninteresting entries, i.e. clean on not-first pass */ if (!first_pass) { - while (i < kvm->arch.hpt.npte && + while (i < kvmppc_hpt_npte(&kvm->arch.hpt) && !hpte_dirty(revp, hptp)) { ++i; hptp += 2; @@ -1337,7 +1334,7 @@ static ssize_t kvm_htab_read(struct file *file, char __user *buf, hdr.index = i; /* Grab a series of valid entries */ - while (i < kvm->arch.hpt.npte && + while (i < kvmppc_hpt_npte(&kvm->arch.hpt) && hdr.n_valid < 0xffff && nb + HPTE_SIZE < count && record_hpte(flags, hptp, hpte, revp, 1, first_pass)) { @@ -1353,7 +1350,7 @@ static ssize_t kvm_htab_read(struct file *file, char __user *buf, ++revp; } /* Now skip invalid entries while we can */ - while (i < kvm->arch.hpt.npte && + while (i < kvmppc_hpt_npte(&kvm->arch.hpt) && hdr.n_invalid < 0xffff && record_hpte(flags, hptp, hpte, revp, 0, first_pass)) { /* found an invalid entry */ @@ -1374,7 +1371,7 @@ static ssize_t kvm_htab_read(struct file *file, char __user *buf, } /* Check if we've wrapped around the hash table */ - if (i >= kvm->arch.hpt.npte) { + if (i >= kvmppc_hpt_npte(&kvm->arch.hpt)) { i = 0; ctx->first_pass = 0; break; @@ -1433,8 +1430,8 @@ static ssize_t kvm_htab_write(struct file *file, const char __user *buf, err = -EINVAL; i = hdr.index; - if (i >= kvm->arch.hpt.npte || - i + hdr.n_valid + hdr.n_invalid > kvm->arch.hpt.npte) + if (i >= kvmppc_hpt_npte(&kvm->arch.hpt) || + i + hdr.n_valid + hdr.n_invalid > kvmppc_hpt_npte(&kvm->arch.hpt)) break; hptp = (__be64 *)(kvm->arch.hpt.virt + (i * HPTE_SIZE)); @@ -1625,7 +1622,8 @@ static ssize_t debugfs_htab_read(struct file *file, char __user *buf, kvm = p->kvm; i = p->hpt_index; hptp = (__be64 *)(kvm->arch.hpt.virt + (i * HPTE_SIZE)); - for (; len != 0 && i < kvm->arch.hpt.npte; ++i, hptp += 2) { + for (; len != 0 && i < kvmppc_hpt_npte(&kvm->arch.hpt); + ++i, hptp += 2) { if (!(be64_to_cpu(hptp[0]) & (HPTE_V_VALID | HPTE_V_ABSENT))) continue; diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c index 175748acc9a1..6fca970373ee 100644 --- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c +++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c @@ -292,7 +292,7 @@ long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags, /* Find and lock the HPTEG slot to use */ do_insert: - if (pte_index >= kvm->arch.hpt.npte) + if (pte_index >= kvmppc_hpt_npte(&kvm->arch.hpt)) return H_PARAMETER; if (likely((flags & H_EXACT) == 0)) { pte_index &= ~7UL; @@ -469,7 +469,7 @@ long kvmppc_do_h_remove(struct kvm *kvm, unsigned long flags, if (kvm_is_radix(kvm)) return H_FUNCTION; - if (pte_index >= kvm->arch.hpt.npte) + if 
(pte_index >= kvmppc_hpt_npte(&kvm->arch.hpt)) return H_PARAMETER; hpte = (__be64 *)(kvm->arch.hpt.virt + (pte_index << 4)); while (!try_lock_hpte(hpte, HPTE_V_HVLOCK)) @@ -557,7 +557,7 @@ long kvmppc_h_bulk_remove(struct kvm_vcpu *vcpu) break; } if (req != 1 || flags == 3 || - pte_index >= kvm->arch.hpt.npte) { + pte_index >= kvmppc_hpt_npte(&kvm->arch.hpt)) { /* parameter error */ args[j] = ((0xa0 | flags) << 56) + pte_index; ret = H_PARAMETER; @@ -657,7 +657,7 @@ long kvmppc_h_protect(struct kvm_vcpu *vcpu, unsigned long flags, if (kvm_is_radix(kvm)) return H_FUNCTION; - if (pte_index >= kvm->arch.hpt.npte) + if (pte_index >= kvmppc_hpt_npte(&kvm->arch.hpt)) return H_PARAMETER; hpte = (__be64 *)(kvm->arch.hpt.virt + (pte_index << 4)); @@ -728,7 +728,7 @@ long kvmppc_h_read(struct kvm_vcpu *vcpu, unsigned long flags, if (kvm_is_radix(kvm)) return H_FUNCTION; - if (pte_index >= kvm->arch.hpt.npte) + if (pte_index >= kvmppc_hpt_npte(&kvm->arch.hpt)) return H_PARAMETER; if (flags & H_READ_4) { pte_index &= ~3; @@ -769,7 +769,7 @@ long kvmppc_h_clear_ref(struct kvm_vcpu *vcpu, unsigned long flags, if (kvm_is_radix(kvm)) return H_FUNCTION; - if (pte_index >= kvm->arch.hpt.npte) + if (pte_index >= kvmppc_hpt_npte(&kvm->arch.hpt)) return H_PARAMETER; rev = real_vmalloc_addr(&kvm->arch.hpt.rev[pte_index]); @@ -817,7 +817,7 @@ long kvmppc_h_clear_mod(struct kvm_vcpu *vcpu, unsigned long flags, if (kvm_is_radix(kvm)) return H_FUNCTION; - if (pte_index >= kvm->arch.hpt.npte) + if (pte_index >= kvmppc_hpt_npte(&kvm->arch.hpt)) return H_PARAMETER; rev = real_vmalloc_addr(&kvm->arch.hpt.rev[pte_index]); @@ -970,7 +970,7 @@ long kvmppc_hv_find_lock_hpte(struct kvm *kvm, gva_t eaddr, unsigned long slb_v, somask = (1UL << 28) - 1; vsid = (slb_v & ~SLB_VSID_B) >> SLB_VSID_SHIFT; } - hash = (vsid ^ ((eaddr & somask) >> pshift)) & kvm->arch.hpt.mask; + hash = (vsid ^ ((eaddr & somask) >> pshift)) & kvmppc_hpt_mask(&kvm->arch.hpt); avpn = slb_v & ~(somask >> 16); /* also includes B */ avpn |= (eaddr & somask) >> 16; @@ -1017,7 +1017,7 @@ long kvmppc_hv_find_lock_hpte(struct kvm *kvm, gva_t eaddr, unsigned long slb_v, if (val & HPTE_V_SECONDARY) break; val |= HPTE_V_SECONDARY; - hash = hash ^ kvm->arch.hpt.mask; + hash = hash ^ kvmppc_hpt_mask(&kvm->arch.hpt); } return -1; } -- cgit v1.2.3 From aae0777f1e8224b4fbb78b2c692060852ee750c8 Mon Sep 17 00:00:00 2001 From: David Gibson Date: Tue, 20 Dec 2016 16:49:02 +1100 Subject: KVM: PPC: Book3S HV: Split HPT allocation from activation Currently, kvmppc_alloc_hpt() both allocates a new hashed page table (HPT) and sets it up as the active page table for a VM. For the upcoming HPT resize implementation we're going to want to allocate HPTs separately from activating them. So, split the allocation itself out into kvmppc_allocate_hpt() and perform the activation with a new kvmppc_set_hpt() function. Likewise we split kvmppc_free_hpt(), which just frees the HPT, from kvmppc_release_hpt() which unsets it as an active HPT, then frees it. We also move the logic to fall back to smaller HPT sizes if the first try fails into the single caller which used that behaviour, kvmppc_hv_setup_htab_rma(). This introduces a slight semantic change, in that previously if the initial attempt at CMA allocation failed, we would fall back to attempting smaller sizes with the page allocator. Now, we try first CMA, then the page allocator at each size. As far as I can tell this change should be harmless. To match, we make kvmppc_free_hpt() just free the actual HPT itself. 
The call to kvmppc_free_lpid() that was there, we move to the single caller. Signed-off-by: David Gibson Signed-off-by: Paul Mackerras --- arch/powerpc/include/asm/kvm_book3s_64.h | 4 ++ arch/powerpc/include/asm/kvm_ppc.h | 5 +- arch/powerpc/kvm/book3s_64_mmu_hv.c | 97 ++++++++++++++++---------------- arch/powerpc/kvm/book3s_hv.c | 17 +++++- 4 files changed, 68 insertions(+), 55 deletions(-) diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h b/arch/powerpc/include/asm/kvm_book3s_64.h index ecc3b33bcc59..d9b48f5bb606 100644 --- a/arch/powerpc/include/asm/kvm_book3s_64.h +++ b/arch/powerpc/include/asm/kvm_book3s_64.h @@ -22,6 +22,10 @@ #include +/* Power architecture requires HPT is at least 256kiB, at most 64TiB */ +#define PPC_MIN_HPT_ORDER 18 +#define PPC_MAX_HPT_ORDER 46 + #ifdef CONFIG_KVM_BOOK3S_PR_POSSIBLE static inline struct kvmppc_book3s_shadow_vcpu *svcpu_get(struct kvm_vcpu *vcpu) { diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index 6fad1f12e9ec..ba61dec72089 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h @@ -155,9 +155,10 @@ extern void kvmppc_core_destroy_mmu(struct kvm_vcpu *vcpu); extern int kvmppc_kvm_pv(struct kvm_vcpu *vcpu); extern void kvmppc_map_magic(struct kvm_vcpu *vcpu); -extern long kvmppc_alloc_hpt(struct kvm *kvm, u32 *htab_orderp); +extern int kvmppc_allocate_hpt(struct kvm_hpt_info *info, u32 order); +extern void kvmppc_set_hpt(struct kvm *kvm, struct kvm_hpt_info *info); extern long kvmppc_alloc_reset_hpt(struct kvm *kvm, u32 *htab_orderp); -extern void kvmppc_free_hpt(struct kvm *kvm); +extern void kvmppc_free_hpt(struct kvm_hpt_info *info); extern long kvmppc_prepare_vrma(struct kvm *kvm, struct kvm_userspace_memory_region *mem); extern void kvmppc_map_vrma(struct kvm_vcpu *vcpu, diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c index d89995ef6c7b..62d132a3cec5 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_hv.c +++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c @@ -40,74 +40,66 @@ #include "trace_hv.h" -/* Power architecture requires HPT is at least 256kB */ -#define PPC_MIN_HPT_ORDER 18 - static long kvmppc_virtmode_do_h_enter(struct kvm *kvm, unsigned long flags, long pte_index, unsigned long pteh, unsigned long ptel, unsigned long *pte_idx_ret); static void kvmppc_rmap_reset(struct kvm *kvm); -long kvmppc_alloc_hpt(struct kvm *kvm, u32 *htab_orderp) +int kvmppc_allocate_hpt(struct kvm_hpt_info *info, u32 order) { unsigned long hpt = 0; - struct revmap_entry *rev; + int cma = 0; struct page *page = NULL; - long order = KVM_DEFAULT_HPT_ORDER; + struct revmap_entry *rev; + unsigned long npte; - if (htab_orderp) { - order = *htab_orderp; - if (order < PPC_MIN_HPT_ORDER) - order = PPC_MIN_HPT_ORDER; - } + if ((order < PPC_MIN_HPT_ORDER) || (order > PPC_MAX_HPT_ORDER)) + return -EINVAL; - kvm->arch.hpt.cma = 0; page = kvm_alloc_hpt_cma(1ul << (order - PAGE_SHIFT)); if (page) { hpt = (unsigned long)pfn_to_kaddr(page_to_pfn(page)); memset((void *)hpt, 0, (1ul << order)); - kvm->arch.hpt.cma = 1; + cma = 1; } - /* Lastly try successively smaller sizes from the page allocator */ - /* Only do this if userspace didn't specify a size via ioctl */ - while (!hpt && order > PPC_MIN_HPT_ORDER && !htab_orderp) { - hpt = __get_free_pages(GFP_KERNEL|__GFP_ZERO|__GFP_REPEAT| - __GFP_NOWARN, order - PAGE_SHIFT); - if (!hpt) - --order; - } + if (!hpt) + hpt = __get_free_pages(GFP_KERNEL|__GFP_ZERO|__GFP_REPEAT + |__GFP_NOWARN, order - PAGE_SHIFT); if (!hpt) return 
-ENOMEM; - kvm->arch.hpt.virt = hpt; - kvm->arch.hpt.order = order; - - atomic64_set(&kvm->arch.mmio_update, 0); + /* HPTEs are 2**4 bytes long */ + npte = 1ul << (order - 4); /* Allocate reverse map array */ - rev = vmalloc(sizeof(struct revmap_entry) * kvmppc_hpt_npte(&kvm->arch.hpt)); + rev = vmalloc(sizeof(struct revmap_entry) * npte); if (!rev) { - pr_err("kvmppc_alloc_hpt: Couldn't alloc reverse map array\n"); - goto out_freehpt; + pr_err("kvmppc_allocate_hpt: Couldn't alloc reverse map array\n"); + if (cma) + kvm_free_hpt_cma(page, 1 << (order - PAGE_SHIFT)); + else + free_pages(hpt, order - PAGE_SHIFT); + return -ENOMEM; } - kvm->arch.hpt.rev = rev; - kvm->arch.sdr1 = __pa(hpt) | (order - 18); - pr_info("KVM guest htab at %lx (order %ld), LPID %x\n", - hpt, order, kvm->arch.lpid); + info->order = order; + info->virt = hpt; + info->cma = cma; + info->rev = rev; - if (htab_orderp) - *htab_orderp = order; return 0; +} - out_freehpt: - if (kvm->arch.hpt.cma) - kvm_free_hpt_cma(page, 1 << (order - PAGE_SHIFT)); - else - free_pages(hpt, order - PAGE_SHIFT); - return -ENOMEM; +void kvmppc_set_hpt(struct kvm *kvm, struct kvm_hpt_info *info) +{ + atomic64_set(&kvm->arch.mmio_update, 0); + kvm->arch.hpt = *info; + kvm->arch.sdr1 = __pa(info->virt) | (info->order - 18); + + pr_info("KVM guest htab at %lx (order %ld), LPID %x\n", + info->virt, (long)info->order, kvm->arch.lpid); } long kvmppc_alloc_reset_hpt(struct kvm *kvm, u32 *htab_orderp) @@ -141,23 +133,28 @@ long kvmppc_alloc_reset_hpt(struct kvm *kvm, u32 *htab_orderp) *htab_orderp = order; err = 0; } else { - err = kvmppc_alloc_hpt(kvm, htab_orderp); - order = *htab_orderp; + struct kvm_hpt_info info; + + err = kvmppc_allocate_hpt(&info, *htab_orderp); + if (err < 0) + goto out; + kvmppc_set_hpt(kvm, &info); } out: mutex_unlock(&kvm->lock); return err; } -void kvmppc_free_hpt(struct kvm *kvm) +void kvmppc_free_hpt(struct kvm_hpt_info *info) { - vfree(kvm->arch.hpt.rev); - if (kvm->arch.hpt.cma) - kvm_free_hpt_cma(virt_to_page(kvm->arch.hpt.virt), - 1 << (kvm->arch.hpt.order - PAGE_SHIFT)); - else if (kvm->arch.hpt.virt) - free_pages(kvm->arch.hpt.virt, - kvm->arch.hpt.order - PAGE_SHIFT); + vfree(info->rev); + if (info->cma) + kvm_free_hpt_cma(virt_to_page(info->virt), + 1 << (info->order - PAGE_SHIFT)); + else if (info->virt) + free_pages(info->virt, info->order - PAGE_SHIFT); + info->virt = 0; + info->order = 0; } /* Bits in first HPTE dword for pagesize 4k, 64k or 16M */ diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 02607128a4d4..19987e4343c3 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -3198,11 +3198,22 @@ static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu) /* Allocate hashed page table (if not done already) and reset it */ if (!kvm->arch.hpt.virt) { - err = kvmppc_alloc_hpt(kvm, NULL); - if (err) { + int order = KVM_DEFAULT_HPT_ORDER; + struct kvm_hpt_info info; + + err = kvmppc_allocate_hpt(&info, order); + /* If we get here, it means userspace didn't specify a + * size explicitly. So, try successively smaller + * sizes if the default failed. 
*/ + while ((err == -ENOMEM) && --order >= PPC_MIN_HPT_ORDER) + err = kvmppc_allocate_hpt(&info, order); + + if (err < 0) { pr_err("KVM: Couldn't alloc HPT\n"); goto out; } + + kvmppc_set_hpt(kvm, &info); } /* Look up the memslot for guest physical address 0 */ @@ -3467,7 +3478,7 @@ static void kvmppc_core_destroy_vm_hv(struct kvm *kvm) if (kvm_is_radix(kvm)) kvmppc_free_radix(kvm); else - kvmppc_free_hpt(kvm); + kvmppc_free_hpt(&kvm->arch.hpt); kvmppc_free_pimap(kvm); } -- cgit v1.2.3 From f98a8bf9ee201b7e22fc05e27150b1e481d4949f Mon Sep 17 00:00:00 2001 From: David Gibson Date: Tue, 20 Dec 2016 16:49:03 +1100 Subject: KVM: PPC: Book3S HV: Allow KVM_PPC_ALLOCATE_HTAB ioctl() to change HPT size The KVM_PPC_ALLOCATE_HTAB ioctl() is used to set the size of hashed page table (HPT) that userspace expects a guest VM to have, and is also used to clear that HPT when necessary (e.g. guest reboot). At present, once the ioctl() is called for the first time, the HPT size can never be changed thereafter - it will be cleared but always sized as from the first call. With upcoming HPT resize implementation, we're going to need to allow userspace to resize the HPT at reset (to change it back to the default size if the guest changed it). So, we need to allow this ioctl() to change the HPT size. This patch also updates Documentation/virtual/kvm/api.txt to reflect the new behaviour. In fact the documentation was already slightly incorrect since 572abd5 "KVM: PPC: Book3S HV: Don't fall back to smaller HPT size in allocation ioctl" Signed-off-by: David Gibson Signed-off-by: Paul Mackerras --- Documentation/virtual/kvm/api.txt | 14 ++++++++------ arch/powerpc/include/asm/kvm_ppc.h | 2 +- arch/powerpc/kvm/book3s_64_mmu_hv.c | 29 ++++++++++++++++------------- arch/powerpc/kvm/book3s_hv.c | 5 +---- 4 files changed, 26 insertions(+), 24 deletions(-) diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index 64f217af0416..f1945d8cbccb 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -2443,18 +2443,20 @@ are, it will do nothing and return an EBUSY error. The parameter is a pointer to a 32-bit unsigned integer variable containing the order (log base 2) of the desired size of the hash table, which must be between 18 and 46. On successful return from the -ioctl, it will have been updated with the order of the hash table that -was allocated. +ioctl, the value will not be changed by the kernel. If no hash table has been allocated when any vcpu is asked to run (with the KVM_RUN ioctl), the host kernel will allocate a default-sized hash table (16 MB). If this ioctl is called when a hash table has already been allocated, -the kernel will clear out the existing hash table (zero all HPTEs) and -return the hash table order in the parameter. (If the guest is using -the virtualized real-mode area (VRMA) facility, the kernel will -re-create the VMRA HPTEs on the next KVM_RUN of any vcpu.) +with a different order from the existing hash table, the existing hash +table will be freed and a new one allocated. If this is ioctl is +called when a hash table has already been allocated of the same order +as specified, the kernel will clear out the existing hash table (zero +all HPTEs). In either case, if the guest is using the virtualized +real-mode area (VRMA) facility, the kernel will re-create the VMRA +HPTEs on the next KVM_RUN of any vcpu. 
4.77 KVM_S390_INTERRUPT diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index ba61dec72089..cf3ef8d75910 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h @@ -157,7 +157,7 @@ extern void kvmppc_map_magic(struct kvm_vcpu *vcpu); extern int kvmppc_allocate_hpt(struct kvm_hpt_info *info, u32 order); extern void kvmppc_set_hpt(struct kvm *kvm, struct kvm_hpt_info *info); -extern long kvmppc_alloc_reset_hpt(struct kvm *kvm, u32 *htab_orderp); +extern long kvmppc_alloc_reset_hpt(struct kvm *kvm, int order); extern void kvmppc_free_hpt(struct kvm_hpt_info *info); extern long kvmppc_prepare_vrma(struct kvm *kvm, struct kvm_userspace_memory_region *mem); diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c index 62d132a3cec5..3a607faf0f9f 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_hv.c +++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c @@ -102,10 +102,10 @@ void kvmppc_set_hpt(struct kvm *kvm, struct kvm_hpt_info *info) info->virt, (long)info->order, kvm->arch.lpid); } -long kvmppc_alloc_reset_hpt(struct kvm *kvm, u32 *htab_orderp) +long kvmppc_alloc_reset_hpt(struct kvm *kvm, int order) { long err = -EBUSY; - long order; + struct kvm_hpt_info info; if (kvm_is_radix(kvm)) return -EINVAL; @@ -120,8 +120,9 @@ long kvmppc_alloc_reset_hpt(struct kvm *kvm, u32 *htab_orderp) goto out; } } - if (kvm->arch.hpt.virt) { - order = kvm->arch.hpt.order; + if (kvm->arch.hpt.order == order) { + /* We already have a suitable HPT */ + /* Set the entire HPT to 0, i.e. invalid HPTEs */ memset((void *)kvm->arch.hpt.virt, 0, 1ul << order); /* @@ -130,17 +131,19 @@ long kvmppc_alloc_reset_hpt(struct kvm *kvm, u32 *htab_orderp) kvmppc_rmap_reset(kvm); /* Ensure that each vcpu will flush its TLB on next entry. */ cpumask_setall(&kvm->arch.need_tlb_flush); - *htab_orderp = order; err = 0; - } else { - struct kvm_hpt_info info; - - err = kvmppc_allocate_hpt(&info, *htab_orderp); - if (err < 0) - goto out; - kvmppc_set_hpt(kvm, &info); + goto out; } - out: + + if (kvm->arch.hpt.virt) + kvmppc_free_hpt(&kvm->arch.hpt); + + err = kvmppc_allocate_hpt(&info, order); + if (err < 0) + goto out; + kvmppc_set_hpt(kvm, &info); + +out: mutex_unlock(&kvm->lock); return err; } diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 19987e4343c3..fbc901746304 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -3704,12 +3704,9 @@ static long kvm_arch_vm_ioctl_hv(struct file *filp, r = -EFAULT; if (get_user(htab_order, (u32 __user *)argp)) break; - r = kvmppc_alloc_reset_hpt(kvm, &htab_order); + r = kvmppc_alloc_reset_hpt(kvm, htab_order); if (r) break; - r = -EFAULT; - if (put_user(htab_order, (u32 __user *)argp)) - break; r = 0; break; } -- cgit v1.2.3 From 639e459768845924705933db9142baef545ff5fc Mon Sep 17 00:00:00 2001 From: David Gibson Date: Tue, 20 Dec 2016 16:49:04 +1100 Subject: KVM: PPC: Book3S HV: Create kvmppc_unmap_hpte_helper() The kvm_unmap_rmapp() function, called from certain MMU notifiers, is used to force all guest mappings of a particular host page to be set ABSENT, and removed from the reverse mappings. For HPT resizing, we will have some cases where we want to set just a single guest HPTE ABSENT and remove its reverse mappings. To prepare with this, we split out the logic from kvm_unmap_rmapp() to evict a single HPTE, moving it to a new helper function. 
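As a hedged sketch (not the kernel code itself), the single-entry caller that HPT resizing needs can then take the same HPTE and rmap locks the bulk path holds and invoke the helper for just one index, rather than walking a whole rmap chain. The wrapper function below is hypothetical and assumes it lives in book3s_64_mmu_hv.c, where the helper and the locking primitives used in this series are visible:

    /* Illustrative only: 'hptep', 'rmapp', 'idx' and 'gfn' identify a
     * single guest HPTE; try_lock_hpte(), lock_rmap() and friends are
     * the helpers already used elsewhere in this file. */
    static void example_evict_one_hpte(struct kvm *kvm, __be64 *hptep,
                                       unsigned long *rmapp,
                                       unsigned long idx, unsigned long gfn)
    {
            while (!try_lock_hpte(hptep, HPTE_V_HVLOCK))
                    cpu_relax();
            lock_rmap(rmapp);
            kvmppc_unmap_hpte(kvm, idx, rmapp, gfn);
            unlock_rmap(rmapp);
            __unlock_hpte(hptep, be64_to_cpu(hptep[0]));
    }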
Signed-off-by: David Gibson Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/book3s_64_mmu_hv.c | 77 +++++++++++++++++++++---------------- 1 file changed, 44 insertions(+), 33 deletions(-) diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c index 3a607faf0f9f..6d70989686a7 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_hv.c +++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c @@ -742,13 +742,53 @@ static int kvm_handle_hva(struct kvm *kvm, unsigned long hva, return kvm_handle_hva_range(kvm, hva, hva + 1, handler); } +/* Must be called with both HPTE and rmap locked */ +static void kvmppc_unmap_hpte(struct kvm *kvm, unsigned long i, + unsigned long *rmapp, unsigned long gfn) +{ + __be64 *hptep = (__be64 *) (kvm->arch.hpt.virt + (i << 4)); + struct revmap_entry *rev = kvm->arch.hpt.rev; + unsigned long j, h; + unsigned long ptel, psize, rcbits; + + j = rev[i].forw; + if (j == i) { + /* chain is now empty */ + *rmapp &= ~(KVMPPC_RMAP_PRESENT | KVMPPC_RMAP_INDEX); + } else { + /* remove i from chain */ + h = rev[i].back; + rev[h].forw = j; + rev[j].back = h; + rev[i].forw = rev[i].back = i; + *rmapp = (*rmapp & ~KVMPPC_RMAP_INDEX) | j; + } + + /* Now check and modify the HPTE */ + ptel = rev[i].guest_rpte; + psize = hpte_page_size(be64_to_cpu(hptep[0]), ptel); + if ((be64_to_cpu(hptep[0]) & HPTE_V_VALID) && + hpte_rpn(ptel, psize) == gfn) { + hptep[0] |= cpu_to_be64(HPTE_V_ABSENT); + kvmppc_invalidate_hpte(kvm, hptep, i); + hptep[1] &= ~cpu_to_be64(HPTE_R_KEY_HI | HPTE_R_KEY_LO); + /* Harvest R and C */ + rcbits = be64_to_cpu(hptep[1]) & (HPTE_R_R | HPTE_R_C); + *rmapp |= rcbits << KVMPPC_RMAP_RC_SHIFT; + if (rcbits & HPTE_R_C) + kvmppc_update_rmap_change(rmapp, psize); + if (rcbits & ~rev[i].guest_rpte) { + rev[i].guest_rpte = ptel | rcbits; + note_hpte_modification(kvm, &rev[i]); + } + } +} + static int kvm_unmap_rmapp(struct kvm *kvm, struct kvm_memory_slot *memslot, unsigned long gfn) { - struct revmap_entry *rev = kvm->arch.hpt.rev; - unsigned long h, i, j; + unsigned long i; __be64 *hptep; - unsigned long ptel, psize, rcbits; unsigned long *rmapp; rmapp = &memslot->arch.rmap[gfn - memslot->base_gfn]; @@ -773,37 +813,8 @@ static int kvm_unmap_rmapp(struct kvm *kvm, struct kvm_memory_slot *memslot, cpu_relax(); continue; } - j = rev[i].forw; - if (j == i) { - /* chain is now empty */ - *rmapp &= ~(KVMPPC_RMAP_PRESENT | KVMPPC_RMAP_INDEX); - } else { - /* remove i from chain */ - h = rev[i].back; - rev[h].forw = j; - rev[j].back = h; - rev[i].forw = rev[i].back = i; - *rmapp = (*rmapp & ~KVMPPC_RMAP_INDEX) | j; - } - /* Now check and modify the HPTE */ - ptel = rev[i].guest_rpte; - psize = hpte_page_size(be64_to_cpu(hptep[0]), ptel); - if ((be64_to_cpu(hptep[0]) & HPTE_V_VALID) && - hpte_rpn(ptel, psize) == gfn) { - hptep[0] |= cpu_to_be64(HPTE_V_ABSENT); - kvmppc_invalidate_hpte(kvm, hptep, i); - hptep[1] &= ~cpu_to_be64(HPTE_R_KEY_HI | HPTE_R_KEY_LO); - /* Harvest R and C */ - rcbits = be64_to_cpu(hptep[1]) & (HPTE_R_R | HPTE_R_C); - *rmapp |= rcbits << KVMPPC_RMAP_RC_SHIFT; - if (rcbits & HPTE_R_C) - kvmppc_update_rmap_change(rmapp, psize); - if (rcbits & ~rev[i].guest_rpte) { - rev[i].guest_rpte = ptel | rcbits; - note_hpte_modification(kvm, &rev[i]); - } - } + kvmppc_unmap_hpte(kvm, i, rmapp, gfn); unlock_rmap(rmapp); __unlock_hpte(hptep, be64_to_cpu(hptep[0])); } -- cgit v1.2.3 From 5e9859699aba74c0e297645e7d1734cd4b964de7 Mon Sep 17 00:00:00 2001 From: David Gibson Date: Tue, 20 Dec 2016 16:49:05 +1100 Subject: KVM: PPC: Book3S HV: Outline of KVM-HV HPT 
resizing implementation This adds a not yet working outline of the HPT resizing PAPR extension. Specifically it adds the necessary ioctl() functions, their basic steps, the work function which will handle preparation for the resize, and synchronization between these, the guest page fault path and guest HPT update path. The actual guts of the implementation isn't here yet, so for now the calls will always fail. Signed-off-by: David Gibson Signed-off-by: Paul Mackerras --- arch/powerpc/include/asm/kvm_host.h | 3 + arch/powerpc/include/asm/kvm_ppc.h | 4 + arch/powerpc/kvm/book3s_64_mmu_hv.c | 191 ++++++++++++++++++++++++++++++++++++ arch/powerpc/kvm/book3s_hv.c | 25 +++++ 4 files changed, 223 insertions(+) diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index 0aa0f22d775a..7bba8f415627 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -252,6 +252,8 @@ struct kvm_hpt_info { int cma; }; +struct kvm_resize_hpt; + struct kvm_arch { unsigned int lpid; #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE @@ -276,6 +278,7 @@ struct kvm_arch { u64 process_table; struct dentry *debugfs_dir; struct dentry *htab_dentry; + struct kvm_resize_hpt *resize_hpt; /* protected by kvm->lock */ #endif /* CONFIG_KVM_BOOK3S_HV_POSSIBLE */ #ifdef CONFIG_KVM_BOOK3S_PR_POSSIBLE struct mutex hpt_mutex; diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index cf3ef8d75910..dd11c4c8c56a 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h @@ -215,6 +215,10 @@ extern void kvmppc_bookehv_exit(void); extern int kvmppc_prepare_to_enter(struct kvm_vcpu *vcpu); extern int kvm_vm_ioctl_get_htab_fd(struct kvm *kvm, struct kvm_get_htab_fd *); +extern long kvm_vm_ioctl_resize_hpt_prepare(struct kvm *kvm, + struct kvm_ppc_resize_hpt *rhpt); +extern long kvm_vm_ioctl_resize_hpt_commit(struct kvm *kvm, + struct kvm_ppc_resize_hpt *rhpt); int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, struct kvm_interrupt *irq); diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c index 6d70989686a7..323287fc0c01 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_hv.c +++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c @@ -40,9 +40,34 @@ #include "trace_hv.h" +//#define DEBUG_RESIZE_HPT 1 + +#ifdef DEBUG_RESIZE_HPT +#define resize_hpt_debug(resize, ...) \ + do { \ + printk(KERN_DEBUG "RESIZE HPT %p: ", resize); \ + printk(__VA_ARGS__); \ + } while (0) +#else +#define resize_hpt_debug(resize, ...) 
\ + do { } while (0) +#endif + static long kvmppc_virtmode_do_h_enter(struct kvm *kvm, unsigned long flags, long pte_index, unsigned long pteh, unsigned long ptel, unsigned long *pte_idx_ret); + +struct kvm_resize_hpt { + /* These fields read-only after init */ + struct kvm *kvm; + struct work_struct work; + u32 order; + + /* These fields protected by kvm->lock */ + int error; + bool prepare_done; +}; + static void kvmppc_rmap_reset(struct kvm *kvm); int kvmppc_allocate_hpt(struct kvm_hpt_info *info, u32 order) @@ -1179,6 +1204,172 @@ void kvmppc_unpin_guest_page(struct kvm *kvm, void *va, unsigned long gpa, srcu_read_unlock(&kvm->srcu, srcu_idx); } +/* + * HPT resizing + */ +static int resize_hpt_allocate(struct kvm_resize_hpt *resize) +{ + return 0; +} + +static int resize_hpt_rehash(struct kvm_resize_hpt *resize) +{ + return -EIO; +} + +static void resize_hpt_pivot(struct kvm_resize_hpt *resize) +{ +} + +static void resize_hpt_release(struct kvm *kvm, struct kvm_resize_hpt *resize) +{ + BUG_ON(kvm->arch.resize_hpt != resize); + kvm->arch.resize_hpt = NULL; + kfree(resize); +} + +static void resize_hpt_prepare_work(struct work_struct *work) +{ + struct kvm_resize_hpt *resize = container_of(work, + struct kvm_resize_hpt, + work); + struct kvm *kvm = resize->kvm; + int err; + + resize_hpt_debug(resize, "resize_hpt_prepare_work(): order = %d\n", + resize->order); + + err = resize_hpt_allocate(resize); + + mutex_lock(&kvm->lock); + + resize->error = err; + resize->prepare_done = true; + + mutex_unlock(&kvm->lock); +} + +long kvm_vm_ioctl_resize_hpt_prepare(struct kvm *kvm, + struct kvm_ppc_resize_hpt *rhpt) +{ + unsigned long flags = rhpt->flags; + unsigned long shift = rhpt->shift; + struct kvm_resize_hpt *resize; + int ret; + + if (flags != 0) + return -EINVAL; + + if (shift && ((shift < 18) || (shift > 46))) + return -EINVAL; + + mutex_lock(&kvm->lock); + + resize = kvm->arch.resize_hpt; + + if (resize) { + if (resize->order == shift) { + /* Suitable resize in progress */ + if (resize->prepare_done) { + ret = resize->error; + if (ret != 0) + resize_hpt_release(kvm, resize); + } else { + ret = 100; /* estimated time in ms */ + } + + goto out; + } + + /* not suitable, cancel it */ + resize_hpt_release(kvm, resize); + } + + ret = 0; + if (!shift) + goto out; /* nothing to do */ + + /* start new resize */ + + resize = kzalloc(sizeof(*resize), GFP_KERNEL); + resize->order = shift; + resize->kvm = kvm; + INIT_WORK(&resize->work, resize_hpt_prepare_work); + kvm->arch.resize_hpt = resize; + + schedule_work(&resize->work); + + ret = 100; /* estimated time in ms */ + +out: + mutex_unlock(&kvm->lock); + return ret; +} + +static void resize_hpt_boot_vcpu(void *opaque) +{ + /* Nothing to do, just force a KVM exit */ +} + +long kvm_vm_ioctl_resize_hpt_commit(struct kvm *kvm, + struct kvm_ppc_resize_hpt *rhpt) +{ + unsigned long flags = rhpt->flags; + unsigned long shift = rhpt->shift; + struct kvm_resize_hpt *resize; + long ret; + + if (flags != 0) + return -EINVAL; + + if (shift && ((shift < 18) || (shift > 46))) + return -EINVAL; + + mutex_lock(&kvm->lock); + + resize = kvm->arch.resize_hpt; + + /* This shouldn't be possible */ + ret = -EIO; + if (WARN_ON(!kvm->arch.hpte_setup_done)) + goto out_no_hpt; + + /* Stop VCPUs from running while we mess with the HPT */ + kvm->arch.hpte_setup_done = 0; + smp_mb(); + + /* Boot all CPUs out of the guest so they re-read + * hpte_setup_done */ + on_each_cpu(resize_hpt_boot_vcpu, NULL, 1); + + ret = -ENXIO; + if (!resize || (resize->order != shift)) + goto out; + 
+ ret = -EBUSY; + if (!resize->prepare_done) + goto out; + + ret = resize->error; + if (ret != 0) + goto out; + + ret = resize_hpt_rehash(resize); + if (ret != 0) + goto out; + + resize_hpt_pivot(resize); + +out: + /* Let VCPUs run again */ + kvm->arch.hpte_setup_done = 1; + smp_mb(); +out_no_hpt: + resize_hpt_release(kvm, resize); + mutex_unlock(&kvm->lock); + return ret; +} + /* * Functions for reading and writing the hash table via reads and * writes on a file descriptor. diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index fbc901746304..1e107ece4e37 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -3422,6 +3422,9 @@ static int kvmppc_core_init_vm_hv(struct kvm *kvm) kvm->arch.lpcr = lpcr; + /* Initialization for future HPT resizes */ + kvm->arch.resize_hpt = NULL; + /* * Work out how many sets the TLB has, for the use of * the TLB invalidation loop in book3s_hv_rmhandlers.S. @@ -3721,6 +3724,28 @@ static long kvm_arch_vm_ioctl_hv(struct file *filp, break; } + case KVM_PPC_RESIZE_HPT_PREPARE: { + struct kvm_ppc_resize_hpt rhpt; + + r = -EFAULT; + if (copy_from_user(&rhpt, argp, sizeof(rhpt))) + break; + + r = kvm_vm_ioctl_resize_hpt_prepare(kvm, &rhpt); + break; + } + + case KVM_PPC_RESIZE_HPT_COMMIT: { + struct kvm_ppc_resize_hpt rhpt; + + r = -EFAULT; + if (copy_from_user(&rhpt, argp, sizeof(rhpt))) + break; + + r = kvm_vm_ioctl_resize_hpt_commit(kvm, &rhpt); + break; + } + default: r = -ENOTTY; } -- cgit v1.2.3 From b5baa68773150772c275b4af1bb31327200cfc05 Mon Sep 17 00:00:00 2001 From: David Gibson Date: Tue, 20 Dec 2016 16:49:06 +1100 Subject: KVM: PPC: Book3S HV: KVM-HV HPT resizing implementation This adds the "guts" of the implementation for the HPT resizing PAPR extension. It has the code to allocate and clear a new HPT, rehash an existing HPT's entries into it, and accomplish the switchover for a KVM guest from the old HPT to the new one. 
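The rehash step relies on simple arithmetic tying the HPT order to its geometry: 16-byte HPTEs, 128-byte HPTE groups (8 HPTEs each) and a per-order hash mask. A small standalone sketch of that index math follows; the example orders and index are made up, and the real code recomputes the hash from the HPTE contents rather than from the old index:

    #include <stdio.h>

    #define HPTES_PER_GROUP 8UL             /* 128-byte HPTEG / 16-byte HPTE */

    static unsigned long hpt_npte(unsigned int order)
    {
            return 1UL << (order - 4);      /* HPTEs are 2**4 bytes long */
    }

    static unsigned long hpt_mask(unsigned int order)
    {
            return (1UL << (order - 7)) - 1; /* 2**7 bytes per HPTEG */
    }

    int main(void)
    {
            unsigned int old_order = 19, new_order = 18;  /* shrinking HPT */
            unsigned long idx = 30000;                    /* example old index */
            unsigned long pteg = idx / HPTES_PER_GROUP;
            /* Simplification: treat the old PTEG number as the hash; the
             * kernel derives the full hash from the VSID and offset. */
            unsigned long hash = pteg;
            unsigned long new_pteg = hash & hpt_mask(new_order);
            unsigned long new_idx = new_pteg * HPTES_PER_GROUP +
                                    (idx % HPTES_PER_GROUP);

            printf("old HPT: %lu HPTEs, new HPT: %lu HPTEs\n",
                   hpt_npte(old_order), hpt_npte(new_order));
            printf("old idx %lu -> new idx %lu\n", idx, new_idx);
            return 0;
    }

When the table shrinks, two old entries can map to the same new slot, which is why the rehash code has to handle collisions (and gives up only when two bolted entries collide).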
Signed-off-by: David Gibson Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/book3s_64_mmu_hv.c | 188 +++++++++++++++++++++++++++++++++++- 1 file changed, 187 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c index 323287fc0c01..013552f05182 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_hv.c +++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c @@ -66,6 +66,10 @@ struct kvm_resize_hpt { /* These fields protected by kvm->lock */ int error; bool prepare_done; + + /* Private to the work thread, until prepare_done is true, + * then protected by kvm->resize_hpt_sem */ + struct kvm_hpt_info hpt; }; static void kvmppc_rmap_reset(struct kvm *kvm); @@ -1209,21 +1213,203 @@ void kvmppc_unpin_guest_page(struct kvm *kvm, void *va, unsigned long gpa, */ static int resize_hpt_allocate(struct kvm_resize_hpt *resize) { + int rc; + + rc = kvmppc_allocate_hpt(&resize->hpt, resize->order); + if (rc < 0) + return rc; + + resize_hpt_debug(resize, "resize_hpt_allocate(): HPT @ 0x%lx\n", + resize->hpt.virt); + return 0; } +static unsigned long resize_hpt_rehash_hpte(struct kvm_resize_hpt *resize, + unsigned long idx) +{ + struct kvm *kvm = resize->kvm; + struct kvm_hpt_info *old = &kvm->arch.hpt; + struct kvm_hpt_info *new = &resize->hpt; + unsigned long old_hash_mask = (1ULL << (old->order - 7)) - 1; + unsigned long new_hash_mask = (1ULL << (new->order - 7)) - 1; + __be64 *hptep, *new_hptep; + unsigned long vpte, rpte, guest_rpte; + int ret; + struct revmap_entry *rev; + unsigned long apsize, psize, avpn, pteg, hash; + unsigned long new_idx, new_pteg, replace_vpte; + + hptep = (__be64 *)(old->virt + (idx << 4)); + + /* Guest is stopped, so new HPTEs can't be added or faulted + * in, only unmapped or altered by host actions. 
So, it's + * safe to check this before we take the HPTE lock */ + vpte = be64_to_cpu(hptep[0]); + if (!(vpte & HPTE_V_VALID) && !(vpte & HPTE_V_ABSENT)) + return 0; /* nothing to do */ + + while (!try_lock_hpte(hptep, HPTE_V_HVLOCK)) + cpu_relax(); + + vpte = be64_to_cpu(hptep[0]); + + ret = 0; + if (!(vpte & HPTE_V_VALID) && !(vpte & HPTE_V_ABSENT)) + /* Nothing to do */ + goto out; + + /* Unmap */ + rev = &old->rev[idx]; + guest_rpte = rev->guest_rpte; + + ret = -EIO; + apsize = hpte_page_size(vpte, guest_rpte); + if (!apsize) + goto out; + + if (vpte & HPTE_V_VALID) { + unsigned long gfn = hpte_rpn(guest_rpte, apsize); + int srcu_idx = srcu_read_lock(&kvm->srcu); + struct kvm_memory_slot *memslot = + __gfn_to_memslot(kvm_memslots(kvm), gfn); + + if (memslot) { + unsigned long *rmapp; + rmapp = &memslot->arch.rmap[gfn - memslot->base_gfn]; + + lock_rmap(rmapp); + kvmppc_unmap_hpte(kvm, idx, rmapp, gfn); + unlock_rmap(rmapp); + } + + srcu_read_unlock(&kvm->srcu, srcu_idx); + } + + /* Reload PTE after unmap */ + vpte = be64_to_cpu(hptep[0]); + + BUG_ON(vpte & HPTE_V_VALID); + BUG_ON(!(vpte & HPTE_V_ABSENT)); + + ret = 0; + if (!(vpte & HPTE_V_BOLTED)) + goto out; + + rpte = be64_to_cpu(hptep[1]); + psize = hpte_base_page_size(vpte, rpte); + avpn = HPTE_V_AVPN_VAL(vpte) & ~((psize - 1) >> 23); + pteg = idx / HPTES_PER_GROUP; + if (vpte & HPTE_V_SECONDARY) + pteg = ~pteg; + + if (!(vpte & HPTE_V_1TB_SEG)) { + unsigned long offset, vsid; + + /* We only have 28 - 23 bits of offset in avpn */ + offset = (avpn & 0x1f) << 23; + vsid = avpn >> 5; + /* We can find more bits from the pteg value */ + if (psize < (1ULL << 23)) + offset |= ((vsid ^ pteg) & old_hash_mask) * psize; + + hash = vsid ^ (offset / psize); + } else { + unsigned long offset, vsid; + + /* We only have 40 - 23 bits of seg_off in avpn */ + offset = (avpn & 0x1ffff) << 23; + vsid = avpn >> 17; + if (psize < (1ULL << 23)) + offset |= ((vsid ^ (vsid << 25) ^ pteg) & old_hash_mask) * psize; + + hash = vsid ^ (vsid << 25) ^ (offset / psize); + } + + new_pteg = hash & new_hash_mask; + if (vpte & HPTE_V_SECONDARY) { + BUG_ON(~pteg != (hash & old_hash_mask)); + new_pteg = ~new_pteg; + } else { + BUG_ON(pteg != (hash & old_hash_mask)); + } + + new_idx = new_pteg * HPTES_PER_GROUP + (idx % HPTES_PER_GROUP); + new_hptep = (__be64 *)(new->virt + (new_idx << 4)); + + replace_vpte = be64_to_cpu(new_hptep[0]); + + if (replace_vpte & (HPTE_V_VALID | HPTE_V_ABSENT)) { + BUG_ON(new->order >= old->order); + + if (replace_vpte & HPTE_V_BOLTED) { + if (vpte & HPTE_V_BOLTED) + /* Bolted collision, nothing we can do */ + ret = -ENOSPC; + /* Discard the new HPTE */ + goto out; + } + + /* Discard the previous HPTE */ + } + + new_hptep[1] = cpu_to_be64(rpte); + new->rev[new_idx].guest_rpte = guest_rpte; + /* No need for a barrier, since new HPT isn't active */ + new_hptep[0] = cpu_to_be64(vpte); + unlock_hpte(new_hptep, vpte); + +out: + unlock_hpte(hptep, vpte); + return ret; +} + static int resize_hpt_rehash(struct kvm_resize_hpt *resize) { - return -EIO; + struct kvm *kvm = resize->kvm; + unsigned long i; + int rc; + + for (i = 0; i < kvmppc_hpt_npte(&kvm->arch.hpt); i++) { + rc = resize_hpt_rehash_hpte(resize, i); + if (rc != 0) + return rc; + } + + return 0; } static void resize_hpt_pivot(struct kvm_resize_hpt *resize) { + struct kvm *kvm = resize->kvm; + struct kvm_hpt_info hpt_tmp; + + /* Exchange the pending tables in the resize structure with + * the active tables */ + + resize_hpt_debug(resize, "resize_hpt_pivot()\n"); + + 
spin_lock(&kvm->mmu_lock); + asm volatile("ptesync" : : : "memory"); + + hpt_tmp = kvm->arch.hpt; + kvmppc_set_hpt(kvm, &resize->hpt); + resize->hpt = hpt_tmp; + + spin_unlock(&kvm->mmu_lock); + + synchronize_srcu_expedited(&kvm->srcu); + + resize_hpt_debug(resize, "resize_hpt_pivot() done\n"); } static void resize_hpt_release(struct kvm *kvm, struct kvm_resize_hpt *resize) { BUG_ON(kvm->arch.resize_hpt != resize); + + if (resize->hpt.virt) + kvmppc_free_hpt(&resize->hpt); + kvm->arch.resize_hpt = NULL; kfree(resize); } -- cgit v1.2.3 From 050f23390f6bdbfa7dd2800884d32490489851b7 Mon Sep 17 00:00:00 2001 From: David Gibson Date: Tue, 20 Dec 2016 16:49:07 +1100 Subject: KVM: PPC: Book3S HV: Advertise availablity of HPT resizing on KVM HV This updates the KVM_CAP_SPAPR_RESIZE_HPT capability to advertise the presence of in-kernel HPT resizing on KVM HV. Signed-off-by: David Gibson Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/powerpc.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index 40a5b2d75ed1..2b3e4e620078 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -612,6 +612,9 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_SPAPR_MULTITCE: r = 1; break; + case KVM_CAP_SPAPR_RESIZE_HPT: + r = !!hv_enabled; + break; #endif case KVM_CAP_PPC_HTM: r = cpu_has_feature(CPU_FTR_TM_COMP) && -- cgit v1.2.3 From 5a6da5f78431f6b172078eb5bd524187833f360b Mon Sep 17 00:00:00 2001 From: Markus Elfring Date: Thu, 19 Jan 2017 11:10:26 +0100 Subject: MIPS: KVM: Return directly after a failed copy_from_user() in kvm_arch_vcpu_ioctl() * Return directly after a call of the function "copy_from_user" failed in a case block. * Delete the jump label "out" which became unnecessary with this refactoring. Signed-off-by: Markus Elfring Reviewed-by: Paolo Bonzini Signed-off-by: James Hogan --- arch/mips/kvm/mips.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c index 29ec9ab3fd55..7999ef4d1147 100644 --- a/arch/mips/kvm/mips.c +++ b/arch/mips/kvm/mips.c @@ -1152,10 +1152,8 @@ long kvm_arch_vcpu_ioctl(struct file *filp, unsigned int ioctl, { struct kvm_mips_interrupt irq; - r = -EFAULT; if (copy_from_user(&irq, argp, sizeof(irq))) - goto out; - + return -EFAULT; kvm_debug("[%d] %s: irq: %d\n", vcpu->vcpu_id, __func__, irq.irq); @@ -1165,17 +1163,14 @@ long kvm_arch_vcpu_ioctl(struct file *filp, unsigned int ioctl, case KVM_ENABLE_CAP: { struct kvm_enable_cap cap; - r = -EFAULT; if (copy_from_user(&cap, argp, sizeof(cap))) - goto out; + return -EFAULT; r = kvm_vcpu_ioctl_enable_cap(vcpu, &cap); break; } default: r = -ENOIOCTLCMD; } - -out: return r; } -- cgit v1.2.3 From 814f91bf3ea0962e4f802324766bf301ef6f5431 Mon Sep 17 00:00:00 2001 From: James Hogan Date: Thu, 2 Feb 2017 01:21:35 +0000 Subject: MIPS: Move pgd_alloc() out of header MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit pgd_alloc() references init_mm which is not exported to modules. In order for KVM to be able to use pgd_alloc() to allocate GVA page tables, move pgd_alloc() into a new pgtable.c file and export it to modules. 
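A hedged sketch of what the export enables: module code (such as the KVM GVA page table work this prepares for) can now allocate and free a pgd through the ordinary pgalloc API. The wrapper functions below are hypothetical; only pgd_alloc() and pgd_free() come from this interface:

    #include <linux/mm.h>
    #include <asm/pgalloc.h>

    static pgd_t *example_alloc_guest_pgd(struct mm_struct *mm)
    {
            pgd_t *pgd = pgd_alloc(mm);    /* now an exported, out-of-line call */

            if (!pgd)
                    return NULL;
            /* ... populate entries for guest virtual addresses ... */
            return pgd;
    }

    static void example_free_guest_pgd(struct mm_struct *mm, pgd_t *pgd)
    {
            pgd_free(mm, pgd);
    }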
Signed-off-by: James Hogan Acked-by: Ralf Baechle Cc: Ralf Baechle Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org --- arch/mips/include/asm/pgalloc.h | 16 +--------------- arch/mips/mm/Makefile | 2 +- arch/mips/mm/pgtable.c | 25 +++++++++++++++++++++++++ 3 files changed, 27 insertions(+), 16 deletions(-) create mode 100644 arch/mips/mm/pgtable.c diff --git a/arch/mips/include/asm/pgalloc.h b/arch/mips/include/asm/pgalloc.h index a03e86969f78..a8705f6c8180 100644 --- a/arch/mips/include/asm/pgalloc.h +++ b/arch/mips/include/asm/pgalloc.h @@ -43,21 +43,7 @@ static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd) * Initialize a new pgd / pmd table with invalid pointers. */ extern void pgd_init(unsigned long page); - -static inline pgd_t *pgd_alloc(struct mm_struct *mm) -{ - pgd_t *ret, *init; - - ret = (pgd_t *) __get_free_pages(GFP_KERNEL, PGD_ORDER); - if (ret) { - init = pgd_offset(&init_mm, 0UL); - pgd_init((unsigned long)ret); - memcpy(ret + USER_PTRS_PER_PGD, init + USER_PTRS_PER_PGD, - (PTRS_PER_PGD - USER_PTRS_PER_PGD) * sizeof(pgd_t)); - } - - return ret; -} +extern pgd_t *pgd_alloc(struct mm_struct *mm); static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd) { diff --git a/arch/mips/mm/Makefile b/arch/mips/mm/Makefile index b4c64bd3f723..b4cc8811a664 100644 --- a/arch/mips/mm/Makefile +++ b/arch/mips/mm/Makefile @@ -4,7 +4,7 @@ obj-y += cache.o dma-default.o extable.o fault.o \ gup.o init.o mmap.o page.o page-funcs.o \ - tlbex.o tlbex-fault.o tlb-funcs.o + pgtable.o tlbex.o tlbex-fault.o tlb-funcs.o ifdef CONFIG_CPU_MICROMIPS obj-y += uasm-micromips.o diff --git a/arch/mips/mm/pgtable.c b/arch/mips/mm/pgtable.c new file mode 100644 index 000000000000..05560b042d82 --- /dev/null +++ b/arch/mips/mm/pgtable.c @@ -0,0 +1,25 @@ +/* + * This file is subject to the terms and conditions of the GNU General Public + * License. See the file "COPYING" in the main directory of this archive + * for more details. + */ +#include +#include +#include +#include + +pgd_t *pgd_alloc(struct mm_struct *mm) +{ + pgd_t *ret, *init; + + ret = (pgd_t *) __get_free_pages(GFP_KERNEL, PGD_ORDER); + if (ret) { + init = pgd_offset(&init_mm, 0UL); + pgd_init((unsigned long)ret); + memcpy(ret + USER_PTRS_PER_PGD, init + USER_PTRS_PER_PGD, + (PTRS_PER_PGD - USER_PTRS_PER_PGD) * sizeof(pgd_t)); + } + + return ret; +} +EXPORT_SYMBOL_GPL(pgd_alloc); -- cgit v1.2.3 From ccf015166ddbbd4c43a16d6871ea15f1fcf51ccc Mon Sep 17 00:00:00 2001 From: James Hogan Date: Fri, 16 Oct 2015 16:33:13 +0100 Subject: MIPS: Export pgd/pmd symbols for KVM MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Export pmd_init(), invalid_pmd_table and tlbmiss_handler_setup_pgd to GPL kernel modules so that MIPS KVM can use the inline page table management functions and switch between page tables: - pmd_init() will be used directly by KVM to initialise newly allocated pmd tables with invalid lower level table pointers. - invalid_pmd_table is used by pud_present(), pud_none(), and pud_clear(), which KVM will use to test and clear pud entries. - tlbmiss_handler_setup_pgd() will be called by KVM entry code to switch to the appropriate GVA page tables. 
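A hedged sketch of the first use case in the list above, from a module's point of view. The allocation flags, PMD_ORDER usage and visibility of invalid_pte_table are assumed from the usual MIPS pmd_alloc_one() pattern (64-bit, non-folded pmd), not from this patch; only pmd_init() is newly exported here:

    #include <linux/gfp.h>
    #include <asm/pgalloc.h>
    #include <asm/pgtable.h>

    static pmd_t *example_alloc_guest_pmd(void)
    {
            pmd_t *pmd = (pmd_t *)__get_free_pages(GFP_KERNEL, PMD_ORDER);

            /* Fill the new table with invalid lower-level pointers, as
             * described above. */
            if (pmd)
                    pmd_init((unsigned long)pmd,
                             (unsigned long)invalid_pte_table);
            return pmd;
    }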
Signed-off-by: James Hogan Acked-by: Ralf Baechle Cc: Ralf Baechle Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org --- arch/mips/mm/init.c | 1 + arch/mips/mm/pgtable-64.c | 2 ++ arch/mips/mm/tlbex.c | 5 ++++- 3 files changed, 7 insertions(+), 1 deletion(-) diff --git a/arch/mips/mm/init.c b/arch/mips/mm/init.c index e86ebcf5c071..653569bc0da7 100644 --- a/arch/mips/mm/init.c +++ b/arch/mips/mm/init.c @@ -538,5 +538,6 @@ unsigned long pgd_current[NR_CPUS]; pgd_t swapper_pg_dir[_PTRS_PER_PGD] __section(.bss..swapper_pg_dir); #ifndef __PAGETABLE_PMD_FOLDED pmd_t invalid_pmd_table[PTRS_PER_PMD] __page_aligned_bss; +EXPORT_SYMBOL_GPL(invalid_pmd_table); #endif pte_t invalid_pte_table[PTRS_PER_PTE] __page_aligned_bss; diff --git a/arch/mips/mm/pgtable-64.c b/arch/mips/mm/pgtable-64.c index ce4473e7c0d2..0ae7b28b4db5 100644 --- a/arch/mips/mm/pgtable-64.c +++ b/arch/mips/mm/pgtable-64.c @@ -6,6 +6,7 @@ * Copyright (C) 1999, 2000 by Silicon Graphics * Copyright (C) 2003 by Ralf Baechle */ +#include #include #include #include @@ -60,6 +61,7 @@ void pmd_init(unsigned long addr, unsigned long pagetable) p[-1] = pagetable; } while (p != end); } +EXPORT_SYMBOL_GPL(pmd_init); #endif pmd_t mk_pmd(struct page *page, pgprot_t prot) diff --git a/arch/mips/mm/tlbex.c b/arch/mips/mm/tlbex.c index 55ce39606cb8..dc7bb1506103 100644 --- a/arch/mips/mm/tlbex.c +++ b/arch/mips/mm/tlbex.c @@ -22,6 +22,7 @@ */ #include +#include #include #include #include @@ -1536,7 +1537,9 @@ static void build_loongson3_tlb_refill_handler(void) extern u32 handle_tlbl[], handle_tlbl_end[]; extern u32 handle_tlbs[], handle_tlbs_end[]; extern u32 handle_tlbm[], handle_tlbm_end[]; -extern u32 tlbmiss_handler_setup_pgd_start[], tlbmiss_handler_setup_pgd[]; +extern u32 tlbmiss_handler_setup_pgd_start[]; +extern u32 tlbmiss_handler_setup_pgd[]; +EXPORT_SYMBOL_GPL(tlbmiss_handler_setup_pgd); extern u32 tlbmiss_handler_setup_pgd_end[]; static void build_setup_pgd(void) -- cgit v1.2.3 From 93a93c2461f9416a58c7d1e32fec201af8cc3aad Mon Sep 17 00:00:00 2001 From: James Hogan Date: Sat, 10 Sep 2016 23:53:57 +0100 Subject: MIPS: uasm: Add include guards in asm/uasm.h MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add include guards in asm/uasm.h to allow it to be safely used by a new header asm/tlbex.h in the next patch to expose TLB exception building functions for KVM to use. Signed-off-by: James Hogan Acked-by: Ralf Baechle Cc: Ralf Baechle Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org --- arch/mips/include/asm/uasm.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/arch/mips/include/asm/uasm.h b/arch/mips/include/asm/uasm.h index f7929f65f7ca..e9a9e2ade1d2 100644 --- a/arch/mips/include/asm/uasm.h +++ b/arch/mips/include/asm/uasm.h @@ -9,6 +9,9 @@ * Copyright (C) 2012, 2013 MIPS Technologies, Inc. All rights reserved. 
*/ +#ifndef __ASM_UASM_H +#define __ASM_UASM_H + #include #ifdef CONFIG_EXPORT_UASM @@ -309,3 +312,5 @@ void uasm_il_bltz(u32 **p, struct uasm_reloc **r, unsigned int reg, int lid); void uasm_il_bne(u32 **p, struct uasm_reloc **r, unsigned int reg1, unsigned int reg2, int lid); void uasm_il_bnez(u32 **p, struct uasm_reloc **r, unsigned int reg, int lid); + +#endif /* __ASM_UASM_H */ -- cgit v1.2.3 From 722b45443146f425453525d3c2270ff2733f5dc4 Mon Sep 17 00:00:00 2001 From: James Hogan Date: Sat, 10 Sep 2016 23:55:07 +0100 Subject: MIPS: Export some tlbex internals for KVM to use MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Export to TLB exception code generating functions so that KVM can construct a fast TLB refill handler for guest context without reinventing the wheel quite so much. Signed-off-by: James Hogan Acked-by: Ralf Baechle Cc: Ralf Baechle Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org --- arch/mips/include/asm/tlbex.h | 26 ++++++++++++++++++++++++++ arch/mips/mm/tlbex.c | 33 ++++++++++++++++----------------- 2 files changed, 42 insertions(+), 17 deletions(-) create mode 100644 arch/mips/include/asm/tlbex.h diff --git a/arch/mips/include/asm/tlbex.h b/arch/mips/include/asm/tlbex.h new file mode 100644 index 000000000000..53050e9dd2c9 --- /dev/null +++ b/arch/mips/include/asm/tlbex.h @@ -0,0 +1,26 @@ +#ifndef __ASM_TLBEX_H +#define __ASM_TLBEX_H + +#include + +/* + * Write random or indexed TLB entry, and care about the hazards from + * the preceding mtc0 and for the following eret. + */ +enum tlb_write_entry { + tlb_random, + tlb_indexed +}; + +extern int pgd_reg; + +void build_get_pmde64(u32 **p, struct uasm_label **l, struct uasm_reloc **r, + unsigned int tmp, unsigned int ptr); +void build_get_pgde32(u32 **p, unsigned int tmp, unsigned int ptr); +void build_get_ptep(u32 **p, unsigned int tmp, unsigned int ptr); +void build_update_entries(u32 **p, unsigned int tmp, unsigned int ptep); +void build_tlb_write_entry(u32 **p, struct uasm_label **l, + struct uasm_reloc **r, + enum tlb_write_entry wmode); + +#endif /* __ASM_TLBEX_H */ diff --git a/arch/mips/mm/tlbex.c b/arch/mips/mm/tlbex.c index dc7bb1506103..2465f83c79c3 100644 --- a/arch/mips/mm/tlbex.c +++ b/arch/mips/mm/tlbex.c @@ -35,6 +35,7 @@ #include #include #include +#include static int mips_xpa_disabled; @@ -345,7 +346,8 @@ static int allocate_kscratch(void) } static int scratch_reg; -static int pgd_reg; +int pgd_reg; +EXPORT_SYMBOL_GPL(pgd_reg); enum vmalloc64_mode {not_refill, refill_scratch, refill_noscratch}; static struct work_registers build_get_work_registers(u32 **p) @@ -497,15 +499,9 @@ static void __maybe_unused build_tlb_probe_entry(u32 **p) } } -/* - * Write random or indexed TLB entry, and care about the hazards from - * the preceding mtc0 and for the following eret. - */ -enum tlb_write_entry { tlb_random, tlb_indexed }; - -static void build_tlb_write_entry(u32 **p, struct uasm_label **l, - struct uasm_reloc **r, - enum tlb_write_entry wmode) +void build_tlb_write_entry(u32 **p, struct uasm_label **l, + struct uasm_reloc **r, + enum tlb_write_entry wmode) { void(*tlbw)(u32 **) = NULL; @@ -628,6 +624,7 @@ static void build_tlb_write_entry(u32 **p, struct uasm_label **l, break; } } +EXPORT_SYMBOL_GPL(build_tlb_write_entry); static __maybe_unused void build_convert_pte_to_entrylo(u32 **p, unsigned int reg) @@ -782,9 +779,8 @@ static void build_huge_handler_tail(u32 **p, struct uasm_reloc **r, * TMP and PTR are scratch. 
* TMP will be clobbered, PTR will hold the pmd entry. */ -static void -build_get_pmde64(u32 **p, struct uasm_label **l, struct uasm_reloc **r, - unsigned int tmp, unsigned int ptr) +void build_get_pmde64(u32 **p, struct uasm_label **l, struct uasm_reloc **r, + unsigned int tmp, unsigned int ptr) { #ifndef CONFIG_MIPS_PGD_C0_CONTEXT long pgdc = (long)pgd_current; @@ -860,6 +856,7 @@ build_get_pmde64(u32 **p, struct uasm_label **l, struct uasm_reloc **r, uasm_i_daddu(p, ptr, ptr, tmp); /* add in pmd offset */ #endif } +EXPORT_SYMBOL_GPL(build_get_pmde64); /* * BVADDR is the faulting address, PTR is scratch. @@ -935,8 +932,7 @@ build_get_pgd_vmalloc64(u32 **p, struct uasm_label **l, struct uasm_reloc **r, * TMP and PTR are scratch. * TMP will be clobbered, PTR will hold the pgd entry. */ -static void __maybe_unused -build_get_pgde32(u32 **p, unsigned int tmp, unsigned int ptr) +void build_get_pgde32(u32 **p, unsigned int tmp, unsigned int ptr) { if (pgd_reg != -1) { /* pgd is in pgd_reg */ @@ -961,6 +957,7 @@ build_get_pgde32(u32 **p, unsigned int tmp, unsigned int ptr) uasm_i_sll(p, tmp, tmp, PGD_T_LOG2); uasm_i_addu(p, ptr, ptr, tmp); /* add in pgd offset */ } +EXPORT_SYMBOL_GPL(build_get_pgde32); #endif /* !CONFIG_64BIT */ @@ -990,7 +987,7 @@ static void build_adjust_context(u32 **p, unsigned int ctx) uasm_i_andi(p, ctx, ctx, mask); } -static void build_get_ptep(u32 **p, unsigned int tmp, unsigned int ptr) +void build_get_ptep(u32 **p, unsigned int tmp, unsigned int ptr) { /* * Bug workaround for the Nevada. It seems as if under certain @@ -1014,8 +1011,9 @@ static void build_get_ptep(u32 **p, unsigned int tmp, unsigned int ptr) build_adjust_context(p, tmp); UASM_i_ADDU(p, ptr, ptr, tmp); /* add in offset */ } +EXPORT_SYMBOL_GPL(build_get_ptep); -static void build_update_entries(u32 **p, unsigned int tmp, unsigned int ptep) +void build_update_entries(u32 **p, unsigned int tmp, unsigned int ptep) { int pte_off_even = 0; int pte_off_odd = sizeof(pte_t); @@ -1064,6 +1062,7 @@ static void build_update_entries(u32 **p, unsigned int tmp, unsigned int ptep) UASM_i_MTC0(p, 0, C0_ENTRYLO1); UASM_i_MTC0(p, ptep, C0_ENTRYLO1); /* load it */ } +EXPORT_SYMBOL_GPL(build_update_entries); struct mips_huge_tlb_info { int huge_pte; -- cgit v1.2.3 From 7170bdc777556dad87f92e6a73a6cc557bf3290e Mon Sep 17 00:00:00 2001 From: James Hogan Date: Mon, 28 Nov 2016 16:38:01 +0000 Subject: MIPS: Add return errors to protected cache ops MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The protected cache ops contain no out of line fixup code to return an error code in the event of a fault, with the cache op being skipped in that case. For KVM however we'd like to detect this case as page faulting will be disabled so it could happen during normal operation if the GVA page tables were flushed, and need to be handled by the caller. Add the out-of-line fixup code to load the error value -EFAULT into the return variable, and adapt the protected cache line functions to pass the error back to the caller. 
Signed-off-by: James Hogan Acked-by: Ralf Baechle Cc: Ralf Baechle Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org --- arch/mips/include/asm/r4kcache.h | 55 +++++++++++++++++++++++++--------------- 1 file changed, 35 insertions(+), 20 deletions(-) diff --git a/arch/mips/include/asm/r4kcache.h b/arch/mips/include/asm/r4kcache.h index b42b513007a2..7227c158cbf8 100644 --- a/arch/mips/include/asm/r4kcache.h +++ b/arch/mips/include/asm/r4kcache.h @@ -147,49 +147,64 @@ static inline void flush_scache_line(unsigned long addr) } #define protected_cache_op(op,addr) \ +({ \ + int __err = 0; \ __asm__ __volatile__( \ " .set push \n" \ " .set noreorder \n" \ " .set "MIPS_ISA_ARCH_LEVEL" \n" \ - "1: cache %0, (%1) \n" \ + "1: cache %1, (%2) \n" \ "2: .set pop \n" \ + " .section .fixup,\"ax\" \n" \ + "3: li %0, %3 \n" \ + " j 2b \n" \ + " .previous \n" \ " .section __ex_table,\"a\" \n" \ - " "STR(PTR)" 1b, 2b \n" \ + " "STR(PTR)" 1b, 3b \n" \ " .previous" \ - : \ - : "i" (op), "r" (addr)) + : "+r" (__err) \ + : "i" (op), "r" (addr), "i" (-EFAULT)); \ + __err; \ +}) + #define protected_cachee_op(op,addr) \ +({ \ + int __err = 0; \ __asm__ __volatile__( \ " .set push \n" \ " .set noreorder \n" \ " .set mips0 \n" \ " .set eva \n" \ - "1: cachee %0, (%1) \n" \ + "1: cachee %1, (%2) \n" \ "2: .set pop \n" \ + " .section .fixup,\"ax\" \n" \ + "3: li %0, %3 \n" \ + " j 2b \n" \ + " .previous \n" \ " .section __ex_table,\"a\" \n" \ - " "STR(PTR)" 1b, 2b \n" \ + " "STR(PTR)" 1b, 3b \n" \ " .previous" \ - : \ - : "i" (op), "r" (addr)) + : "+r" (__err) \ + : "i" (op), "r" (addr), "i" (-EFAULT)); \ + __err; \ +}) /* * The next two are for badland addresses like signal trampolines. */ -static inline void protected_flush_icache_line(unsigned long addr) +static inline int protected_flush_icache_line(unsigned long addr) { switch (boot_cpu_type()) { case CPU_LOONGSON2: - protected_cache_op(Hit_Invalidate_I_Loongson2, addr); - break; + return protected_cache_op(Hit_Invalidate_I_Loongson2, addr); default: #ifdef CONFIG_EVA - protected_cachee_op(Hit_Invalidate_I, addr); + return protected_cachee_op(Hit_Invalidate_I, addr); #else - protected_cache_op(Hit_Invalidate_I, addr); + return protected_cache_op(Hit_Invalidate_I, addr); #endif - break; } } @@ -199,21 +214,21 @@ static inline void protected_flush_icache_line(unsigned long addr) * caches. We're talking about one cacheline unnecessarily getting invalidated * here so the penalty isn't overly hard. 
*/ -static inline void protected_writeback_dcache_line(unsigned long addr) +static inline int protected_writeback_dcache_line(unsigned long addr) { #ifdef CONFIG_EVA - protected_cachee_op(Hit_Writeback_Inv_D, addr); + return protected_cachee_op(Hit_Writeback_Inv_D, addr); #else - protected_cache_op(Hit_Writeback_Inv_D, addr); + return protected_cache_op(Hit_Writeback_Inv_D, addr); #endif } -static inline void protected_writeback_scache_line(unsigned long addr) +static inline int protected_writeback_scache_line(unsigned long addr) { #ifdef CONFIG_EVA - protected_cachee_op(Hit_Writeback_Inv_SD, addr); + return protected_cachee_op(Hit_Writeback_Inv_SD, addr); #else - protected_cache_op(Hit_Writeback_Inv_SD, addr); + return protected_cache_op(Hit_Writeback_Inv_SD, addr); #endif } -- cgit v1.2.3 From 00104b4171491794b53f8d6cc255f539e8bf18b4 Mon Sep 17 00:00:00 2001 From: James Hogan Date: Wed, 4 Jan 2017 22:05:22 +0000 Subject: KVM: MIPS: Drop partial KVM_NMI implementation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit MIPS incompletely implements the KVM_NMI ioctl to supposedly perform a CPU reset, but all it actually does is invalidate the ASIDs. It doesn't expose the KVM_CAP_USER_NMI capability which is supposed to indicate the presence of the KVM_NMI ioctl, and no user software actually uses it on MIPS. Since this is dead code that would technically need updating for GVA page table handling in upcoming patches, remove it now. If we wanted to implement NMI injection later it can always be done properly along with the KVM_CAP_USER_NMI capability, and if we wanted to implement a proper CPU reset it would be better done with a separate ioctl. Signed-off-by: James Hogan Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: Ralf Baechle Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org --- arch/mips/kvm/mips.c | 16 ---------------- 1 file changed, 16 deletions(-) diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c index 7999ef4d1147..f9e305f7ad71 100644 --- a/arch/mips/kvm/mips.c +++ b/arch/mips/kvm/mips.c @@ -63,18 +63,6 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { {NULL} }; -static int kvm_mips_reset_vcpu(struct kvm_vcpu *vcpu) -{ - int i; - - for_each_possible_cpu(i) { - vcpu->arch.guest_kernel_asid[i] = 0; - vcpu->arch.guest_user_asid[i] = 0; - } - - return 0; -} - /* * XXXKYMA: We are simulatoring a processor that has the WII bit set in * Config7, so we are "runnable" if interrupts are pending @@ -1144,10 +1132,6 @@ long kvm_arch_vcpu_ioctl(struct file *filp, unsigned int ioctl, return -E2BIG; return kvm_mips_copy_reg_indices(vcpu, user_list->reg); } - case KVM_NMI: - /* Treat the NMI as a CPU reset */ - r = kvm_mips_reset_vcpu(vcpu); - break; case KVM_INTERRUPT: { struct kvm_mips_interrupt irq; -- cgit v1.2.3 From 1534b3964901ff55465a9f582838204193354966 Mon Sep 17 00:00:00 2001 From: James Hogan Date: Fri, 13 Mar 2015 15:54:08 +0000 Subject: KVM: MIPS/MMU: Simplify ASID restoration MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit KVM T&E uses an ASID for guest kernel mode and an ASID for guest user mode. The current ASID is saved when the guest is scheduled out, and restored when scheduling back in, with checks for whether the ASID needs to be regenerated. This isn't really necessary as the ASID can be easily determined by the current guest mode, so lets simplify it to just read the required ASID from guest_kernel_asid or guest_user_asid even if the ASID hasn't been regenerated. 
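The simplification can be summarised in a small helper, shown here purely as a sketch (the patch itself open-codes this in kvm_arch_vcpu_load(), see the diff below): the ASID to restore is a pure function of the current guest mode, so no preempt_entryhi snapshot is needed.

	static unsigned long kvm_vcpu_asid_to_restore(struct kvm_vcpu *vcpu, int cpu)
	{
		unsigned long asid_mask = cpu_asid_mask(&cpu_data[cpu]);

		if (KVM_GUEST_KERNEL_MODE(vcpu))
			return vcpu->arch.guest_kernel_asid[cpu] & asid_mask;
		else
			return vcpu->arch.guest_user_asid[cpu] & asid_mask;
	}
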
Signed-off-by: James Hogan Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: Ralf Baechle Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org --- arch/mips/include/asm/kvm_host.h | 3 --- arch/mips/kvm/mmu.c | 46 +++++++++++----------------------------- 2 files changed, 12 insertions(+), 37 deletions(-) diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h index bebec370324f..7cc53e44b42e 100644 --- a/arch/mips/include/asm/kvm_host.h +++ b/arch/mips/include/asm/kvm_host.h @@ -318,9 +318,6 @@ struct kvm_vcpu_arch { /* Bitmask of pending exceptions to be cleared */ unsigned long pending_exceptions_clr; - /* Save/Restore the entryhi register when are are preempted/scheduled back in */ - unsigned long preempt_entryhi; - /* S/W Based TLB for guest */ struct kvm_mips_tlb guest_tlb[KVM_MIPS_GUEST_TLB_SIZE]; diff --git a/arch/mips/kvm/mmu.c b/arch/mips/kvm/mmu.c index 3b677c851be0..e1698a66253b 100644 --- a/arch/mips/kvm/mmu.c +++ b/arch/mips/kvm/mmu.c @@ -237,7 +237,6 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) { unsigned long asid_mask = cpu_asid_mask(&cpu_data[cpu]); unsigned long flags; - int newasid = 0; kvm_debug("%s: vcpu %p, cpu: %d\n", __func__, vcpu, cpu); @@ -250,7 +249,6 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) kvm_get_new_mmu_context(&vcpu->arch.guest_kernel_mm, cpu, vcpu); vcpu->arch.guest_kernel_asid[cpu] = vcpu->arch.guest_kernel_mm.context.asid[cpu]; - newasid++; kvm_debug("[%d]: cpu_context: %#lx\n", cpu, cpu_context(cpu, current->mm)); @@ -263,7 +261,6 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) kvm_get_new_mmu_context(&vcpu->arch.guest_user_mm, cpu, vcpu); vcpu->arch.guest_user_asid[cpu] = vcpu->arch.guest_user_mm.context.asid[cpu]; - newasid++; kvm_debug("[%d]: cpu_context: %#lx\n", cpu, cpu_context(cpu, current->mm)); @@ -282,35 +279,18 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) kvm_mips_migrate_count(vcpu); } - if (!newasid) { - /* - * If we preempted while the guest was executing, then reload - * the pre-empted ASID - */ - if (current->flags & PF_VCPU) { - write_c0_entryhi(vcpu->arch. - preempt_entryhi & asid_mask); - ehb(); - } - } else { - /* New ASIDs were allocated for the VM */ - - /* - * Were we in guest context? If so then the pre-empted ASID is - * no longer valid, we need to set it to what it should be based - * on the mode of the Guest (Kernel/User) - */ - if (current->flags & PF_VCPU) { - if (KVM_GUEST_KERNEL_MODE(vcpu)) - write_c0_entryhi(vcpu->arch. - guest_kernel_asid[cpu] & - asid_mask); - else - write_c0_entryhi(vcpu->arch. 
- guest_user_asid[cpu] & - asid_mask); - ehb(); - } + /* + * If we preempted while the guest was executing, then reload the ASID + * based on the mode of the Guest (Kernel/User) + */ + if (current->flags & PF_VCPU) { + if (KVM_GUEST_KERNEL_MODE(vcpu)) + write_c0_entryhi(vcpu->arch.guest_kernel_asid[cpu] & + asid_mask); + else + write_c0_entryhi(vcpu->arch.guest_user_asid[cpu] & + asid_mask); + ehb(); } /* restore guest state to registers */ @@ -329,8 +309,6 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) local_irq_save(flags); cpu = smp_processor_id(); - - vcpu->arch.preempt_entryhi = read_c0_entryhi(); vcpu->arch.last_sched_cpu = cpu; /* save guest state in registers */ -- cgit v1.2.3 From a60b8438bdba4a4b77c90b6c1b22804150b4f244 Mon Sep 17 00:00:00 2001 From: James Hogan Date: Sat, 12 Nov 2016 00:00:13 +0000 Subject: KVM: MIPS: Convert get/set_regs -> vcpu_load/put MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Convert the get_regs() and set_regs() callbacks to vcpu_load() and vcpu_put(), which provide a cpu argument and more closely match the kvm_arch_vcpu_load() / kvm_arch_vcpu_put() that they are called by. This is in preparation for moving ASID management into the implementations. Signed-off-by: James Hogan Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: Ralf Baechle Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org --- arch/mips/include/asm/kvm_host.h | 4 ++-- arch/mips/kvm/mmu.c | 4 ++-- arch/mips/kvm/trap_emul.c | 12 ++++++------ 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h index 7cc53e44b42e..1c70b5224151 100644 --- a/arch/mips/include/asm/kvm_host.h +++ b/arch/mips/include/asm/kvm_host.h @@ -539,8 +539,8 @@ struct kvm_mips_callbacks { const struct kvm_one_reg *reg, s64 *v); int (*set_one_reg)(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg, s64 v); - int (*vcpu_get_regs)(struct kvm_vcpu *vcpu); - int (*vcpu_set_regs)(struct kvm_vcpu *vcpu); + int (*vcpu_load)(struct kvm_vcpu *vcpu, int cpu); + int (*vcpu_put)(struct kvm_vcpu *vcpu, int cpu); }; extern struct kvm_mips_callbacks *kvm_mips_callbacks; int kvm_mips_emulation_init(struct kvm_mips_callbacks **install_callbacks); diff --git a/arch/mips/kvm/mmu.c b/arch/mips/kvm/mmu.c index e1698a66253b..ed46528611f4 100644 --- a/arch/mips/kvm/mmu.c +++ b/arch/mips/kvm/mmu.c @@ -294,7 +294,7 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) } /* restore guest state to registers */ - kvm_mips_callbacks->vcpu_set_regs(vcpu); + kvm_mips_callbacks->vcpu_load(vcpu, cpu); local_irq_restore(flags); @@ -312,7 +312,7 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) vcpu->arch.last_sched_cpu = cpu; /* save guest state in registers */ - kvm_mips_callbacks->vcpu_get_regs(vcpu); + kvm_mips_callbacks->vcpu_put(vcpu, cpu); if (((cpu_context(cpu, current->mm) ^ asid_cache(cpu)) & asid_version_mask(cpu))) { diff --git a/arch/mips/kvm/trap_emul.c b/arch/mips/kvm/trap_emul.c index 3b20441f2beb..c0ee51465913 100644 --- a/arch/mips/kvm/trap_emul.c +++ b/arch/mips/kvm/trap_emul.c @@ -633,15 +633,15 @@ static int kvm_trap_emul_set_one_reg(struct kvm_vcpu *vcpu, return ret; } -static int kvm_trap_emul_vcpu_get_regs(struct kvm_vcpu *vcpu) +static int kvm_trap_emul_vcpu_load(struct kvm_vcpu *vcpu, int cpu) { - kvm_lose_fpu(vcpu); - return 0; } -static int kvm_trap_emul_vcpu_set_regs(struct kvm_vcpu *vcpu) +static int kvm_trap_emul_vcpu_put(struct kvm_vcpu *vcpu, int cpu) { + kvm_lose_fpu(vcpu); + return 0; } @@ -675,8 
+675,8 @@ static struct kvm_mips_callbacks kvm_trap_emul_callbacks = { .copy_reg_indices = kvm_trap_emul_copy_reg_indices, .get_one_reg = kvm_trap_emul_get_one_reg, .set_one_reg = kvm_trap_emul_set_one_reg, - .vcpu_get_regs = kvm_trap_emul_vcpu_get_regs, - .vcpu_set_regs = kvm_trap_emul_vcpu_set_regs, + .vcpu_load = kvm_trap_emul_vcpu_load, + .vcpu_put = kvm_trap_emul_vcpu_put, }; int kvm_mips_emulation_init(struct kvm_mips_callbacks **install_callbacks) -- cgit v1.2.3 From 1581ff3dbf698abba00f39039cc5bd854400b664 Mon Sep 17 00:00:00 2001 From: James Hogan Date: Wed, 16 Nov 2016 23:48:56 +0000 Subject: KVM: MIPS/MMU: Move preempt/ASID handling to implementation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The MIPS KVM host and guest GVA ASIDs may need regenerating when scheduling a process in guest context, which is done from the kvm_arch_vcpu_load() / kvm_arch_vcpu_put() functions in mmu.c. However this is a fairly implementation specific detail. VZ for example may use GuestIDs instead of normal ASIDs to distinguish mappings belonging to different guests, and even on VZ without GuestID the root TLB will be used differently to trap & emulate. Trap & emulate GVA ASIDs only relate to the user part of the full address space, so can be left active during guest exit handling (guest context) to allow guest instructions to be easily read and translated. VZ root ASIDs however are for GPA mappings so can't be left active during normal kernel code. They also aren't useful for accessing guest virtual memory, and we should have CP0_BadInstr[P] registers available to provide encodings of trapping guest instructions anyway. Therefore move the ASID preemption handling into the implementation callback. Signed-off-by: James Hogan Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: Ralf Baechle Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org --- arch/mips/kvm/mmu.c | 51 ------------------------------------------ arch/mips/kvm/trap_emul.c | 56 +++++++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 54 insertions(+), 53 deletions(-) diff --git a/arch/mips/kvm/mmu.c b/arch/mips/kvm/mmu.c index ed46528611f4..df013538113f 100644 --- a/arch/mips/kvm/mmu.c +++ b/arch/mips/kvm/mmu.c @@ -235,39 +235,12 @@ static void kvm_mips_migrate_count(struct kvm_vcpu *vcpu) /* Restore ASID once we are scheduled back after preemption */ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) { - unsigned long asid_mask = cpu_asid_mask(&cpu_data[cpu]); unsigned long flags; kvm_debug("%s: vcpu %p, cpu: %d\n", __func__, vcpu, cpu); - /* Allocate new kernel and user ASIDs if needed */ - local_irq_save(flags); - if ((vcpu->arch.guest_kernel_asid[cpu] ^ asid_cache(cpu)) & - asid_version_mask(cpu)) { - kvm_get_new_mmu_context(&vcpu->arch.guest_kernel_mm, cpu, vcpu); - vcpu->arch.guest_kernel_asid[cpu] = - vcpu->arch.guest_kernel_mm.context.asid[cpu]; - - kvm_debug("[%d]: cpu_context: %#lx\n", cpu, - cpu_context(cpu, current->mm)); - kvm_debug("[%d]: Allocated new ASID for Guest Kernel: %#x\n", - cpu, vcpu->arch.guest_kernel_asid[cpu]); - } - - if ((vcpu->arch.guest_user_asid[cpu] ^ asid_cache(cpu)) & - asid_version_mask(cpu)) { - kvm_get_new_mmu_context(&vcpu->arch.guest_user_mm, cpu, vcpu); - vcpu->arch.guest_user_asid[cpu] = - vcpu->arch.guest_user_mm.context.asid[cpu]; - - kvm_debug("[%d]: cpu_context: %#lx\n", cpu, - cpu_context(cpu, current->mm)); - kvm_debug("[%d]: Allocated new ASID for Guest User: %#x\n", cpu, - vcpu->arch.guest_user_asid[cpu]); - } - if 
(vcpu->arch.last_sched_cpu != cpu) { kvm_debug("[%d->%d]KVM VCPU[%d] switch\n", vcpu->arch.last_sched_cpu, cpu, vcpu->vcpu_id); @@ -279,25 +252,10 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) kvm_mips_migrate_count(vcpu); } - /* - * If we preempted while the guest was executing, then reload the ASID - * based on the mode of the Guest (Kernel/User) - */ - if (current->flags & PF_VCPU) { - if (KVM_GUEST_KERNEL_MODE(vcpu)) - write_c0_entryhi(vcpu->arch.guest_kernel_asid[cpu] & - asid_mask); - else - write_c0_entryhi(vcpu->arch.guest_user_asid[cpu] & - asid_mask); - ehb(); - } - /* restore guest state to registers */ kvm_mips_callbacks->vcpu_load(vcpu, cpu); local_irq_restore(flags); - } /* ASID can change if another task is scheduled during preemption */ @@ -314,15 +272,6 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) /* save guest state in registers */ kvm_mips_callbacks->vcpu_put(vcpu, cpu); - if (((cpu_context(cpu, current->mm) ^ asid_cache(cpu)) & - asid_version_mask(cpu))) { - kvm_debug("%s: Dropping MMU Context: %#lx\n", __func__, - cpu_context(cpu, current->mm)); - drop_mmu_context(current->mm, cpu); - } - write_c0_entryhi(cpu_asid(cpu, current->mm)); - ehb(); - local_irq_restore(flags); } diff --git a/arch/mips/kvm/trap_emul.c b/arch/mips/kvm/trap_emul.c index c0ee51465913..494a90221b5e 100644 --- a/arch/mips/kvm/trap_emul.c +++ b/arch/mips/kvm/trap_emul.c @@ -11,9 +11,9 @@ #include #include -#include - #include +#include +#include #include "interrupt.h" @@ -635,6 +635,49 @@ static int kvm_trap_emul_set_one_reg(struct kvm_vcpu *vcpu, static int kvm_trap_emul_vcpu_load(struct kvm_vcpu *vcpu, int cpu) { + unsigned long asid_mask = cpu_asid_mask(&cpu_data[cpu]); + + /* Allocate new kernel and user ASIDs if needed */ + + if ((vcpu->arch.guest_kernel_asid[cpu] ^ asid_cache(cpu)) & + asid_version_mask(cpu)) { + kvm_get_new_mmu_context(&vcpu->arch.guest_kernel_mm, cpu, vcpu); + vcpu->arch.guest_kernel_asid[cpu] = + vcpu->arch.guest_kernel_mm.context.asid[cpu]; + + kvm_debug("[%d]: cpu_context: %#lx\n", cpu, + cpu_context(cpu, current->mm)); + kvm_debug("[%d]: Allocated new ASID for Guest Kernel: %#x\n", + cpu, vcpu->arch.guest_kernel_asid[cpu]); + } + + if ((vcpu->arch.guest_user_asid[cpu] ^ asid_cache(cpu)) & + asid_version_mask(cpu)) { + kvm_get_new_mmu_context(&vcpu->arch.guest_user_mm, cpu, vcpu); + vcpu->arch.guest_user_asid[cpu] = + vcpu->arch.guest_user_mm.context.asid[cpu]; + + kvm_debug("[%d]: cpu_context: %#lx\n", cpu, + cpu_context(cpu, current->mm)); + kvm_debug("[%d]: Allocated new ASID for Guest User: %#x\n", cpu, + vcpu->arch.guest_user_asid[cpu]); + } + + /* + * Were we in guest context? 
If so then the pre-empted ASID is + * no longer valid, we need to set it to what it should be based + * on the mode of the Guest (Kernel/User) + */ + if (current->flags & PF_VCPU) { + if (KVM_GUEST_KERNEL_MODE(vcpu)) + write_c0_entryhi(vcpu->arch.guest_kernel_asid[cpu] & + asid_mask); + else + write_c0_entryhi(vcpu->arch.guest_user_asid[cpu] & + asid_mask); + ehb(); + } + return 0; } @@ -642,6 +685,15 @@ static int kvm_trap_emul_vcpu_put(struct kvm_vcpu *vcpu, int cpu) { kvm_lose_fpu(vcpu); + if (((cpu_context(cpu, current->mm) ^ asid_cache(cpu)) & + asid_version_mask(cpu))) { + kvm_debug("%s: Dropping MMU Context: %#lx\n", __func__, + cpu_context(cpu, current->mm)); + drop_mmu_context(current->mm, cpu); + } + write_c0_entryhi(cpu_asid(cpu, current->mm)); + ehb(); + return 0; } -- cgit v1.2.3 From c550d53934d821dbdd867ca314d417f2e918c72c Mon Sep 17 00:00:00 2001 From: James Hogan Date: Tue, 11 Oct 2016 23:14:39 +0100 Subject: KVM: MIPS: Remove duplicated ASIDs from vcpu MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The kvm_vcpu_arch structure contains both mm_structs for allocating MMU contexts (primarily the ASID) but it also copies the resulting ASIDs into guest_{user,kernel}_asid[] arrays which are referenced from uasm generated code. This duplication doesn't seem to serve any purpose, and it gets in the way of generalising the ASID handling across guest kernel/user modes, so lets just extract the ASID straight out of the mm_struct on demand, and in fact there are convenient cpu_context() and cpu_asid() macros for doing so. To reduce the verbosity of this code we do also add kern_mm and user_mm local variables where the kernel and user mm_structs are used. Signed-off-by: James Hogan Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: Ralf Baechle Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org --- arch/mips/include/asm/kvm_host.h | 4 +--- arch/mips/kvm/emulate.c | 13 +++++++------ arch/mips/kvm/entry.c | 22 +++++++++++++--------- arch/mips/kvm/mips.c | 8 +++----- arch/mips/kvm/mmu.c | 8 ++++---- arch/mips/kvm/tlb.c | 8 ++++---- arch/mips/kvm/trap_emul.c | 29 ++++++++++++----------------- 7 files changed, 44 insertions(+), 48 deletions(-) diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h index 1c70b5224151..923f81dc6115 100644 --- a/arch/mips/include/asm/kvm_host.h +++ b/arch/mips/include/asm/kvm_host.h @@ -321,9 +321,7 @@ struct kvm_vcpu_arch { /* S/W Based TLB for guest */ struct kvm_mips_tlb guest_tlb[KVM_MIPS_GUEST_TLB_SIZE]; - /* Cached guest kernel/user ASIDs */ - u32 guest_user_asid[NR_CPUS]; - u32 guest_kernel_asid[NR_CPUS]; + /* Guest kernel/user [partial] mm */ struct mm_struct guest_kernel_mm, guest_user_mm; /* Guest ASID of last user mode execution */ diff --git a/arch/mips/kvm/emulate.c b/arch/mips/kvm/emulate.c index aa0937423e28..060acc5b3378 100644 --- a/arch/mips/kvm/emulate.c +++ b/arch/mips/kvm/emulate.c @@ -856,6 +856,8 @@ enum emulation_result kvm_mips_emul_tlbr(struct kvm_vcpu *vcpu) static void kvm_mips_invalidate_guest_tlb(struct kvm_vcpu *vcpu, struct kvm_mips_tlb *tlb) { + struct mm_struct *kern_mm = &vcpu->arch.guest_kernel_mm; + struct mm_struct *user_mm = &vcpu->arch.guest_user_mm; int cpu, i; bool user; @@ -879,8 +881,8 @@ static void kvm_mips_invalidate_guest_tlb(struct kvm_vcpu *vcpu, if (i == cpu) continue; if (user) - vcpu->arch.guest_user_asid[i] = 0; - vcpu->arch.guest_kernel_asid[i] = 0; + cpu_context(i, user_mm) = 0; + cpu_context(i, kern_mm) = 0; } preempt_enable(); @@ -1056,6 
+1058,7 @@ enum emulation_result kvm_mips_emulate_CP0(union mips_instruction inst, struct kvm_vcpu *vcpu) { struct mips_coproc *cop0 = vcpu->arch.cop0; + struct mm_struct *kern_mm = &vcpu->arch.guest_kernel_mm; enum emulation_result er = EMULATE_DONE; u32 rt, rd, sel; unsigned long curr_pc; @@ -1178,13 +1181,11 @@ enum emulation_result kvm_mips_emulate_CP0(union mips_instruction inst, */ preempt_disable(); cpu = smp_processor_id(); - kvm_get_new_mmu_context(&vcpu->arch.guest_kernel_mm, + kvm_get_new_mmu_context(kern_mm, cpu, vcpu); - vcpu->arch.guest_kernel_asid[cpu] = - vcpu->arch.guest_kernel_mm.context.asid[cpu]; for_each_possible_cpu(i) if (i != cpu) - vcpu->arch.guest_kernel_asid[i] = 0; + cpu_context(i, kern_mm) = 0; preempt_enable(); } kvm_write_c0_guest_entryhi(cop0, diff --git a/arch/mips/kvm/entry.c b/arch/mips/kvm/entry.c index e92fb190e2d6..f81888704caa 100644 --- a/arch/mips/kvm/entry.c +++ b/arch/mips/kvm/entry.c @@ -12,6 +12,7 @@ */ #include +#include #include #include #include @@ -286,23 +287,26 @@ static void *kvm_mips_build_enter_guest(void *addr) uasm_i_andi(&p, T0, T0, KSU_USER | ST0_ERL | ST0_EXL); uasm_i_xori(&p, T0, T0, KSU_USER); uasm_il_bnez(&p, &r, T0, label_kernel_asid); - UASM_i_ADDIU(&p, T1, K1, - offsetof(struct kvm_vcpu_arch, guest_kernel_asid)); + UASM_i_ADDIU(&p, T1, K1, offsetof(struct kvm_vcpu_arch, + guest_kernel_mm.context.asid)); /* else user */ - UASM_i_ADDIU(&p, T1, K1, - offsetof(struct kvm_vcpu_arch, guest_user_asid)); + UASM_i_ADDIU(&p, T1, K1, offsetof(struct kvm_vcpu_arch, + guest_user_mm.context.asid)); uasm_l_kernel_asid(&l, p); /* t1: contains the base of the ASID array, need to get the cpu id */ /* smp_processor_id */ uasm_i_lw(&p, T2, offsetof(struct thread_info, cpu), GP); - /* x4 */ - uasm_i_sll(&p, T2, T2, 2); + /* index the ASID array */ + uasm_i_sll(&p, T2, T2, ilog2(sizeof(long))); UASM_i_ADDU(&p, T3, T1, T2); - uasm_i_lw(&p, K0, 0, T3); + UASM_i_LW(&p, K0, 0, T3); #ifdef CONFIG_MIPS_ASID_BITS_VARIABLE - /* x sizeof(struct cpuinfo_mips)/4 */ - uasm_i_addiu(&p, T3, ZERO, sizeof(struct cpuinfo_mips)/4); + /* + * reuse ASID array offset + * cpuinfo_mips is a multiple of sizeof(long) + */ + uasm_i_addiu(&p, T3, ZERO, sizeof(struct cpuinfo_mips)/sizeof(long)); uasm_i_mul(&p, T2, T2, T3); UASM_i_LA_mostly(&p, AT, (long)&cpu_data[0].asid_mask); diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c index f9e305f7ad71..85bc54f35695 100644 --- a/arch/mips/kvm/mips.c +++ b/arch/mips/kvm/mips.c @@ -413,6 +413,7 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, /* Must be called with preemption disabled, just before entering guest */ static void kvm_mips_check_asids(struct kvm_vcpu *vcpu) { + struct mm_struct *user_mm = &vcpu->arch.guest_user_mm; struct mips_coproc *cop0 = vcpu->arch.cop0; int i, cpu = smp_processor_id(); unsigned int gasid; @@ -426,13 +427,10 @@ static void kvm_mips_check_asids(struct kvm_vcpu *vcpu) if (!KVM_GUEST_KERNEL_MODE(vcpu)) { gasid = kvm_read_c0_guest_entryhi(cop0) & KVM_ENTRYHI_ASID; if (gasid != vcpu->arch.last_user_gasid) { - kvm_get_new_mmu_context(&vcpu->arch.guest_user_mm, cpu, - vcpu); - vcpu->arch.guest_user_asid[cpu] = - vcpu->arch.guest_user_mm.context.asid[cpu]; + kvm_get_new_mmu_context(user_mm, cpu, vcpu); for_each_possible_cpu(i) if (i != cpu) - vcpu->arch.guest_user_asid[cpu] = 0; + cpu_context(i, user_mm) = 0; vcpu->arch.last_user_gasid = gasid; } } diff --git a/arch/mips/kvm/mmu.c b/arch/mips/kvm/mmu.c index df013538113f..27d6d0dbfeb4 100644 --- a/arch/mips/kvm/mmu.c +++ 
b/arch/mips/kvm/mmu.c @@ -15,18 +15,18 @@ static u32 kvm_mips_get_kernel_asid(struct kvm_vcpu *vcpu) { + struct mm_struct *kern_mm = &vcpu->arch.guest_kernel_mm; int cpu = smp_processor_id(); - return vcpu->arch.guest_kernel_asid[cpu] & - cpu_asid_mask(&cpu_data[cpu]); + return cpu_asid(cpu, kern_mm); } static u32 kvm_mips_get_user_asid(struct kvm_vcpu *vcpu) { + struct mm_struct *user_mm = &vcpu->arch.guest_user_mm; int cpu = smp_processor_id(); - return vcpu->arch.guest_user_asid[cpu] & - cpu_asid_mask(&cpu_data[cpu]); + return cpu_asid(cpu, user_mm); } static int kvm_mips_map_page(struct kvm *kvm, gfn_t gfn) diff --git a/arch/mips/kvm/tlb.c b/arch/mips/kvm/tlb.c index 254377d8e0b9..ba490130b5e7 100644 --- a/arch/mips/kvm/tlb.c +++ b/arch/mips/kvm/tlb.c @@ -38,18 +38,18 @@ EXPORT_SYMBOL_GPL(kvm_mips_instance); static u32 kvm_mips_get_kernel_asid(struct kvm_vcpu *vcpu) { + struct mm_struct *kern_mm = &vcpu->arch.guest_kernel_mm; int cpu = smp_processor_id(); - return vcpu->arch.guest_kernel_asid[cpu] & - cpu_asid_mask(&cpu_data[cpu]); + return cpu_asid(cpu, kern_mm); } static u32 kvm_mips_get_user_asid(struct kvm_vcpu *vcpu) { + struct mm_struct *user_mm = &vcpu->arch.guest_user_mm; int cpu = smp_processor_id(); - return vcpu->arch.guest_user_asid[cpu] & - cpu_asid_mask(&cpu_data[cpu]); + return cpu_asid(cpu, user_mm); } inline u32 kvm_mips_get_commpage_asid(struct kvm_vcpu *vcpu) diff --git a/arch/mips/kvm/trap_emul.c b/arch/mips/kvm/trap_emul.c index 494a90221b5e..c7854d32fd64 100644 --- a/arch/mips/kvm/trap_emul.c +++ b/arch/mips/kvm/trap_emul.c @@ -635,32 +635,29 @@ static int kvm_trap_emul_set_one_reg(struct kvm_vcpu *vcpu, static int kvm_trap_emul_vcpu_load(struct kvm_vcpu *vcpu, int cpu) { - unsigned long asid_mask = cpu_asid_mask(&cpu_data[cpu]); + struct mm_struct *kern_mm = &vcpu->arch.guest_kernel_mm; + struct mm_struct *user_mm = &vcpu->arch.guest_user_mm; /* Allocate new kernel and user ASIDs if needed */ - if ((vcpu->arch.guest_kernel_asid[cpu] ^ asid_cache(cpu)) & + if ((cpu_context(cpu, kern_mm) ^ asid_cache(cpu)) & asid_version_mask(cpu)) { - kvm_get_new_mmu_context(&vcpu->arch.guest_kernel_mm, cpu, vcpu); - vcpu->arch.guest_kernel_asid[cpu] = - vcpu->arch.guest_kernel_mm.context.asid[cpu]; + kvm_get_new_mmu_context(kern_mm, cpu, vcpu); kvm_debug("[%d]: cpu_context: %#lx\n", cpu, cpu_context(cpu, current->mm)); - kvm_debug("[%d]: Allocated new ASID for Guest Kernel: %#x\n", - cpu, vcpu->arch.guest_kernel_asid[cpu]); + kvm_debug("[%d]: Allocated new ASID for Guest Kernel: %#lx\n", + cpu, cpu_context(cpu, kern_mm)); } - if ((vcpu->arch.guest_user_asid[cpu] ^ asid_cache(cpu)) & + if ((cpu_context(cpu, user_mm) ^ asid_cache(cpu)) & asid_version_mask(cpu)) { - kvm_get_new_mmu_context(&vcpu->arch.guest_user_mm, cpu, vcpu); - vcpu->arch.guest_user_asid[cpu] = - vcpu->arch.guest_user_mm.context.asid[cpu]; + kvm_get_new_mmu_context(user_mm, cpu, vcpu); kvm_debug("[%d]: cpu_context: %#lx\n", cpu, cpu_context(cpu, current->mm)); - kvm_debug("[%d]: Allocated new ASID for Guest User: %#x\n", cpu, - vcpu->arch.guest_user_asid[cpu]); + kvm_debug("[%d]: Allocated new ASID for Guest User: %#lx\n", + cpu, cpu_context(cpu, user_mm)); } /* @@ -670,11 +667,9 @@ static int kvm_trap_emul_vcpu_load(struct kvm_vcpu *vcpu, int cpu) */ if (current->flags & PF_VCPU) { if (KVM_GUEST_KERNEL_MODE(vcpu)) - write_c0_entryhi(vcpu->arch.guest_kernel_asid[cpu] & - asid_mask); + write_c0_entryhi(cpu_asid(cpu, kern_mm)); else - write_c0_entryhi(vcpu->arch.guest_user_asid[cpu] & - asid_mask); + 
write_c0_entryhi(cpu_asid(cpu, user_mm)); ehb(); } -- cgit v1.2.3 From a2c046e40ff16ef6c20d534b0d77d526bc02a684 Mon Sep 17 00:00:00 2001 From: James Hogan Date: Fri, 18 Nov 2016 13:14:37 +0000 Subject: KVM: MIPS: Add vcpu_run() & vcpu_reenter() callbacks MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add implementation callbacks for entering the guest (vcpu_run()) and reentering the guest (vcpu_reenter()), allowing implementation specific operations to be performed before entering the guest or after returning to the host without cluttering kvm_arch_vcpu_ioctl_run(). This allows the T&E specific lazy user GVA flush to be moved into trap_emul.c, along with disabling of the HTW. We also move kvm_mips_deliver_interrupts() as VZ will need to restore the guest timer state prior to delivering interrupts. Signed-off-by: James Hogan Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: Ralf Baechle Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org --- arch/mips/include/asm/kvm_host.h | 2 ++ arch/mips/kvm/mips.c | 43 ++--------------------------------- arch/mips/kvm/trap_emul.c | 48 ++++++++++++++++++++++++++++++++++++++++ 3 files changed, 52 insertions(+), 41 deletions(-) diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h index 923f81dc6115..9f319375835a 100644 --- a/arch/mips/include/asm/kvm_host.h +++ b/arch/mips/include/asm/kvm_host.h @@ -539,6 +539,8 @@ struct kvm_mips_callbacks { const struct kvm_one_reg *reg, s64 v); int (*vcpu_load)(struct kvm_vcpu *vcpu, int cpu); int (*vcpu_put)(struct kvm_vcpu *vcpu, int cpu); + int (*vcpu_run)(struct kvm_run *run, struct kvm_vcpu *vcpu); + void (*vcpu_reenter)(struct kvm_run *run, struct kvm_vcpu *vcpu); }; extern struct kvm_mips_callbacks *kvm_mips_callbacks; int kvm_mips_emulation_init(struct kvm_mips_callbacks **install_callbacks); diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c index 85bc54f35695..1733877d8a53 100644 --- a/arch/mips/kvm/mips.c +++ b/arch/mips/kvm/mips.c @@ -410,32 +410,6 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, return -ENOIOCTLCMD; } -/* Must be called with preemption disabled, just before entering guest */ -static void kvm_mips_check_asids(struct kvm_vcpu *vcpu) -{ - struct mm_struct *user_mm = &vcpu->arch.guest_user_mm; - struct mips_coproc *cop0 = vcpu->arch.cop0; - int i, cpu = smp_processor_id(); - unsigned int gasid; - - /* - * Lazy host ASID regeneration for guest user mode. - * If the guest ASID has changed since the last guest usermode - * execution, regenerate the host ASID so as to invalidate stale TLB - * entries. 
- */ - if (!KVM_GUEST_KERNEL_MODE(vcpu)) { - gasid = kvm_read_c0_guest_entryhi(cop0) & KVM_ENTRYHI_ASID; - if (gasid != vcpu->arch.last_user_gasid) { - kvm_get_new_mmu_context(user_mm, cpu, vcpu); - for_each_possible_cpu(i) - if (i != cpu) - cpu_context(i, user_mm) = 0; - vcpu->arch.last_user_gasid = gasid; - } - } -} - int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run) { int r = 0; @@ -453,25 +427,12 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run) lose_fpu(1); local_irq_disable(); - /* Check if we have any exceptions/interrupts pending */ - kvm_mips_deliver_interrupts(vcpu, - kvm_read_c0_guest_cause(vcpu->arch.cop0)); - guest_enter_irqoff(); - - /* Disable hardware page table walking while in guest */ - htw_stop(); - trace_kvm_enter(vcpu); - kvm_mips_check_asids(vcpu); + r = kvm_mips_callbacks->vcpu_run(run, vcpu); - r = vcpu->arch.vcpu_run(run, vcpu); trace_kvm_out(vcpu); - - /* Re-enable HTW before enabling interrupts */ - htw_start(); - guest_exit_irqoff(); local_irq_enable(); @@ -1570,7 +1531,7 @@ skip_emul: if (ret == RESUME_GUEST) { trace_kvm_reenter(vcpu); - kvm_mips_check_asids(vcpu); + kvm_mips_callbacks->vcpu_reenter(run, vcpu); /* * If FPU / MSA are enabled (i.e. the guest's FPU / MSA context diff --git a/arch/mips/kvm/trap_emul.c b/arch/mips/kvm/trap_emul.c index c7854d32fd64..92734d095c94 100644 --- a/arch/mips/kvm/trap_emul.c +++ b/arch/mips/kvm/trap_emul.c @@ -692,6 +692,52 @@ static int kvm_trap_emul_vcpu_put(struct kvm_vcpu *vcpu, int cpu) return 0; } +static void kvm_trap_emul_vcpu_reenter(struct kvm_run *run, + struct kvm_vcpu *vcpu) +{ + struct mm_struct *user_mm = &vcpu->arch.guest_user_mm; + struct mips_coproc *cop0 = vcpu->arch.cop0; + int i, cpu = smp_processor_id(); + unsigned int gasid; + + /* + * Lazy host ASID regeneration for guest user mode. + * If the guest ASID has changed since the last guest usermode + * execution, regenerate the host ASID so as to invalidate stale TLB + * entries. 
+ */ + if (!KVM_GUEST_KERNEL_MODE(vcpu)) { + gasid = kvm_read_c0_guest_entryhi(cop0) & KVM_ENTRYHI_ASID; + if (gasid != vcpu->arch.last_user_gasid) { + kvm_get_new_mmu_context(user_mm, cpu, vcpu); + for_each_possible_cpu(i) + if (i != cpu) + cpu_context(i, user_mm) = 0; + vcpu->arch.last_user_gasid = gasid; + } + } +} + +static int kvm_trap_emul_vcpu_run(struct kvm_run *run, struct kvm_vcpu *vcpu) +{ + int r; + + /* Check if we have any exceptions/interrupts pending */ + kvm_mips_deliver_interrupts(vcpu, + kvm_read_c0_guest_cause(vcpu->arch.cop0)); + + kvm_trap_emul_vcpu_reenter(run, vcpu); + + /* Disable hardware page table walking while in guest */ + htw_stop(); + + r = vcpu->arch.vcpu_run(run, vcpu); + + htw_start(); + + return r; +} + static struct kvm_mips_callbacks kvm_trap_emul_callbacks = { /* exit handlers */ .handle_cop_unusable = kvm_trap_emul_handle_cop_unusable, @@ -724,6 +770,8 @@ static struct kvm_mips_callbacks kvm_trap_emul_callbacks = { .set_one_reg = kvm_trap_emul_set_one_reg, .vcpu_load = kvm_trap_emul_vcpu_load, .vcpu_put = kvm_trap_emul_vcpu_put, + .vcpu_run = kvm_trap_emul_vcpu_run, + .vcpu_reenter = kvm_trap_emul_vcpu_reenter, }; int kvm_mips_emulation_init(struct kvm_mips_callbacks **install_callbacks) -- cgit v1.2.3 From 91cdee5710d5fe8f81915307b5ff38d364fbde33 Mon Sep 17 00:00:00 2001 From: James Hogan Date: Fri, 18 Nov 2016 13:25:24 +0000 Subject: KVM: MIPS/T&E: Restore host asid on return to host MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We only need the guest ASID loaded while in guest context, i.e. while running guest code and while handling guest exits. We load the guest ASID when entering the guest, however we restore the host ASID later than necessary, when the VCPU state is saved i.e. vcpu_put() or slightly earlier if preempted after returning to the host. This mismatch is both unpleasant and causes redundant host ASID restores in kvm_trap_emul_vcpu_put(). Lets explicitly restore the host ASID when returning to the host, and don't bother restoring the host ASID on context switch in unless we're already in guest context. 
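The host-ASID restore step that this patch performs both in vcpu_put() (when still in guest context) and at the end of vcpu_run() boils down to the following sequence, shown here as a condensed sketch of the hunks below rather than a new helper added by the patch:

	/* Restore the normal Linux process memory map for this CPU. */
	if ((cpu_context(cpu, current->mm) ^ asid_cache(cpu)) &
	    asid_version_mask(cpu))
		get_new_mmu_context(current->mm, cpu);	/* host ASID was stale */
	write_c0_entryhi(cpu_asid(cpu, current->mm));
	ehb();
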
Signed-off-by: James Hogan Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: Ralf Baechle Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org --- arch/mips/kvm/trap_emul.c | 27 ++++++++++++++++++++------- 1 file changed, 20 insertions(+), 7 deletions(-) diff --git a/arch/mips/kvm/trap_emul.c b/arch/mips/kvm/trap_emul.c index 92734d095c94..3e1dbcbcea85 100644 --- a/arch/mips/kvm/trap_emul.c +++ b/arch/mips/kvm/trap_emul.c @@ -680,14 +680,17 @@ static int kvm_trap_emul_vcpu_put(struct kvm_vcpu *vcpu, int cpu) { kvm_lose_fpu(vcpu); - if (((cpu_context(cpu, current->mm) ^ asid_cache(cpu)) & - asid_version_mask(cpu))) { - kvm_debug("%s: Dropping MMU Context: %#lx\n", __func__, - cpu_context(cpu, current->mm)); - drop_mmu_context(current->mm, cpu); + if (current->flags & PF_VCPU) { + /* Restore normal Linux process memory map */ + if (((cpu_context(cpu, current->mm) ^ asid_cache(cpu)) & + asid_version_mask(cpu))) { + kvm_debug("%s: Dropping MMU Context: %#lx\n", __func__, + cpu_context(cpu, current->mm)); + get_new_mmu_context(current->mm, cpu); + } + write_c0_entryhi(cpu_asid(cpu, current->mm)); + ehb(); } - write_c0_entryhi(cpu_asid(cpu, current->mm)); - ehb(); return 0; } @@ -720,6 +723,7 @@ static void kvm_trap_emul_vcpu_reenter(struct kvm_run *run, static int kvm_trap_emul_vcpu_run(struct kvm_run *run, struct kvm_vcpu *vcpu) { + int cpu; int r; /* Check if we have any exceptions/interrupts pending */ @@ -733,6 +737,15 @@ static int kvm_trap_emul_vcpu_run(struct kvm_run *run, struct kvm_vcpu *vcpu) r = vcpu->arch.vcpu_run(run, vcpu); + /* We may have migrated while handling guest exits */ + cpu = smp_processor_id(); + + /* Restore normal Linux process memory map */ + if (((cpu_context(cpu, current->mm) ^ asid_cache(cpu)) & + asid_version_mask(cpu))) + get_new_mmu_context(current->mm, cpu); + write_c0_entryhi(cpu_asid(cpu, current->mm)); + htw_start(); return r; -- cgit v1.2.3 From a7ebb2e410f8be2b061557fd85241d75a1094221 Mon Sep 17 00:00:00 2001 From: James Hogan Date: Tue, 15 Nov 2016 00:06:05 +0000 Subject: KVM: MIPS/T&E: active_mm = init_mm in guest context MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Set init_mm as the active_mm and update mm_cpumask(current->mm) to reflect that it isn't active when in guest context. This prevents cache management code from attempting cache flushes on host virtual addresses while in guest context, for example due to a cache management IPIs or later when writing of dynamically translated code hits copy on write. We do this using helpers in static kernel code to avoid having to export init_mm to modules. 
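Usage-wise the two helpers bracket the guest-context window, roughly as below; this is condensed from the trap & emulate changes in this patch and is illustrative only:

	int cpu = smp_processor_id();

	kvm_mips_suspend_mm(cpu);		/* active_mm = &init_mm */
	r = vcpu->arch.vcpu_run(run, vcpu);	/* guest context */
	cpu = smp_processor_id();		/* we may have migrated on exit */
	kvm_mips_resume_mm(cpu);		/* active_mm = current->mm again */
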
Signed-off-by: James Hogan Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: Ralf Baechle Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org --- arch/mips/include/asm/kvm_host.h | 4 ++++ arch/mips/kvm/tlb.c | 35 +++++++++++++++++++++++++++++++++++ arch/mips/kvm/trap_emul.c | 12 +++++++++++- 3 files changed, 50 insertions(+), 1 deletion(-) diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h index 9f319375835a..95320b7964a6 100644 --- a/arch/mips/include/asm/kvm_host.h +++ b/arch/mips/include/asm/kvm_host.h @@ -607,6 +607,10 @@ extern int kvm_mips_host_tlb_inv(struct kvm_vcpu *vcpu, unsigned long entryhi); extern int kvm_mips_guest_tlb_lookup(struct kvm_vcpu *vcpu, unsigned long entryhi); extern int kvm_mips_host_tlb_lookup(struct kvm_vcpu *vcpu, unsigned long vaddr); + +void kvm_mips_suspend_mm(int cpu); +void kvm_mips_resume_mm(int cpu); + extern unsigned long kvm_mips_translate_guest_kseg0_to_hpa(struct kvm_vcpu *vcpu, unsigned long gva); extern void kvm_get_new_mmu_context(struct mm_struct *mm, unsigned long cpu, diff --git a/arch/mips/kvm/tlb.c b/arch/mips/kvm/tlb.c index ba490130b5e7..6c1f894b8754 100644 --- a/arch/mips/kvm/tlb.c +++ b/arch/mips/kvm/tlb.c @@ -382,3 +382,38 @@ void kvm_local_flush_tlb_all(void) local_irq_restore(flags); } EXPORT_SYMBOL_GPL(kvm_local_flush_tlb_all); + +/** + * kvm_mips_suspend_mm() - Suspend the active mm. + * @cpu The CPU we're running on. + * + * Suspend the active_mm, ready for a switch to a KVM guest virtual address + * space. This is left active for the duration of guest context, including time + * with interrupts enabled, so we need to be careful not to confuse e.g. cache + * management IPIs. + * + * kvm_mips_resume_mm() should be called before context switching to a different + * process so we don't need to worry about reference counting. + * + * This needs to be in static kernel code to avoid exporting init_mm. + */ +void kvm_mips_suspend_mm(int cpu) +{ + cpumask_clear_cpu(cpu, mm_cpumask(current->active_mm)); + current->active_mm = &init_mm; +} +EXPORT_SYMBOL_GPL(kvm_mips_suspend_mm); + +/** + * kvm_mips_resume_mm() - Resume the current process mm. + * @cpu The CPU we're running on. + * + * Resume the mm of the current process, after a switch back from a KVM guest + * virtual address space (see kvm_mips_suspend_mm()). 
+ */ +void kvm_mips_resume_mm(int cpu) +{ + cpumask_set_cpu(cpu, mm_cpumask(current->mm)); + current->active_mm = current->mm; +} +EXPORT_SYMBOL_GPL(kvm_mips_resume_mm); diff --git a/arch/mips/kvm/trap_emul.c b/arch/mips/kvm/trap_emul.c index 3e1dbcbcea85..9cfe4d2a283c 100644 --- a/arch/mips/kvm/trap_emul.c +++ b/arch/mips/kvm/trap_emul.c @@ -670,6 +670,7 @@ static int kvm_trap_emul_vcpu_load(struct kvm_vcpu *vcpu, int cpu) write_c0_entryhi(cpu_asid(cpu, kern_mm)); else write_c0_entryhi(cpu_asid(cpu, user_mm)); + kvm_mips_suspend_mm(cpu); ehb(); } @@ -689,6 +690,7 @@ static int kvm_trap_emul_vcpu_put(struct kvm_vcpu *vcpu, int cpu) get_new_mmu_context(current->mm, cpu); } write_c0_entryhi(cpu_asid(cpu, current->mm)); + kvm_mips_resume_mm(cpu); ehb(); } @@ -723,7 +725,7 @@ static void kvm_trap_emul_vcpu_reenter(struct kvm_run *run, static int kvm_trap_emul_vcpu_run(struct kvm_run *run, struct kvm_vcpu *vcpu) { - int cpu; + int cpu = smp_processor_id(); int r; /* Check if we have any exceptions/interrupts pending */ @@ -735,6 +737,13 @@ static int kvm_trap_emul_vcpu_run(struct kvm_run *run, struct kvm_vcpu *vcpu) /* Disable hardware page table walking while in guest */ htw_stop(); + /* + * While in guest context we're in the guest's address space, not the + * host process address space, so we need to be careful not to confuse + * e.g. cache management IPIs. + */ + kvm_mips_suspend_mm(cpu); + r = vcpu->arch.vcpu_run(run, vcpu); /* We may have migrated while handling guest exits */ @@ -745,6 +754,7 @@ static int kvm_trap_emul_vcpu_run(struct kvm_run *run, struct kvm_vcpu *vcpu) asid_version_mask(cpu))) get_new_mmu_context(current->mm, cpu); write_c0_entryhi(cpu_asid(cpu, current->mm)); + kvm_mips_resume_mm(cpu); htw_start(); -- cgit v1.2.3 From 630766b3694f0574f903863ef5c3cc4f2bbe736f Mon Sep 17 00:00:00 2001 From: James Hogan Date: Thu, 8 Sep 2016 23:00:24 +0100 Subject: KVM: MIPS: Wire up vcpu uninit MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Wire up a vcpu uninit implementation callback. This will be used for the clean up of GVA->HPA page tables. 
Signed-off-by: James Hogan Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: Ralf Baechle Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org --- arch/mips/include/asm/kvm_host.h | 2 +- arch/mips/kvm/mips.c | 5 +++++ arch/mips/kvm/trap_emul.c | 5 +++++ 3 files changed, 11 insertions(+), 1 deletion(-) diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h index 95320b7964a6..fea538fc5331 100644 --- a/arch/mips/include/asm/kvm_host.h +++ b/arch/mips/include/asm/kvm_host.h @@ -519,6 +519,7 @@ struct kvm_mips_callbacks { int (*handle_msa_disabled)(struct kvm_vcpu *vcpu); int (*vm_init)(struct kvm *kvm); int (*vcpu_init)(struct kvm_vcpu *vcpu); + void (*vcpu_uninit)(struct kvm_vcpu *vcpu); int (*vcpu_setup)(struct kvm_vcpu *vcpu); gpa_t (*gva_to_gpa)(gva_t gva); void (*queue_timer_int)(struct kvm_vcpu *vcpu); @@ -765,7 +766,6 @@ static inline void kvm_arch_memslots_updated(struct kvm *kvm, struct kvm_memslot static inline void kvm_arch_flush_shadow_all(struct kvm *kvm) {} static inline void kvm_arch_flush_shadow_memslot(struct kvm *kvm, struct kvm_memory_slot *slot) {} -static inline void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) {} static inline void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) {} static inline void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu) {} static inline void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu) {} diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c index 1733877d8a53..011710a68c6b 100644 --- a/arch/mips/kvm/mips.c +++ b/arch/mips/kvm/mips.c @@ -1345,6 +1345,11 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) return 0; } +void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) +{ + kvm_mips_callbacks->vcpu_uninit(vcpu); +} + int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu, struct kvm_translation *tr) { diff --git a/arch/mips/kvm/trap_emul.c b/arch/mips/kvm/trap_emul.c index 9cfe4d2a283c..07540cf2b557 100644 --- a/arch/mips/kvm/trap_emul.c +++ b/arch/mips/kvm/trap_emul.c @@ -440,6 +440,10 @@ static int kvm_trap_emul_vcpu_init(struct kvm_vcpu *vcpu) return 0; } +static void kvm_trap_emul_vcpu_uninit(struct kvm_vcpu *vcpu) +{ +} + static int kvm_trap_emul_vcpu_setup(struct kvm_vcpu *vcpu) { struct mips_coproc *cop0 = vcpu->arch.cop0; @@ -779,6 +783,7 @@ static struct kvm_mips_callbacks kvm_trap_emul_callbacks = { .vm_init = kvm_trap_emul_vm_init, .vcpu_init = kvm_trap_emul_vcpu_init, + .vcpu_uninit = kvm_trap_emul_vcpu_uninit, .vcpu_setup = kvm_trap_emul_vcpu_setup, .gva_to_gpa = kvm_trap_emul_gva_to_gpa_cb, .queue_timer_int = kvm_mips_queue_timer_int_cb, -- cgit v1.2.3 From f7f1427dc0c67e21ba9ec2200b7c8853535b3842 Mon Sep 17 00:00:00 2001 From: James Hogan Date: Thu, 8 Sep 2016 22:57:03 +0100 Subject: KVM: MIPS/T&E: Allocate GVA -> HPA page tables MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Allocate GVA -> HPA page tables for guest kernel and guest user mode on each VCPU, to allow for fast path TLB refill handling to be added later. In the process kvm_arch_vcpu_init() needs updating to pass on any error from the vcpu_init() callback. 
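The allocation itself follows the usual both-or-neither pattern; the snippet below is condensed from the vcpu_init() hunk in the diff that follows, for readability:

	kern_mm->pgd = pgd_alloc(kern_mm);	/* guest kernel mode GVA->HPA */
	if (!kern_mm->pgd)
		return -ENOMEM;

	user_mm->pgd = pgd_alloc(user_mm);	/* guest user mode GVA->HPA */
	if (!user_mm->pgd) {
		pgd_free(kern_mm, kern_mm->pgd);	/* roll back on failure */
		return -ENOMEM;
	}
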
Signed-off-by: James Hogan Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: Ralf Baechle Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org --- arch/mips/kvm/mips.c | 7 +++++- arch/mips/kvm/trap_emul.c | 63 +++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 69 insertions(+), 1 deletion(-) diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c index 011710a68c6b..7cf85fa1f658 100644 --- a/arch/mips/kvm/mips.c +++ b/arch/mips/kvm/mips.c @@ -1338,7 +1338,12 @@ static enum hrtimer_restart kvm_mips_comparecount_wakeup(struct hrtimer *timer) int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) { - kvm_mips_callbacks->vcpu_init(vcpu); + int err; + + err = kvm_mips_callbacks->vcpu_init(vcpu); + if (err) + return err; + hrtimer_init(&vcpu->arch.comparecount_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); vcpu->arch.comparecount_timer.function = kvm_mips_comparecount_wakeup; diff --git a/arch/mips/kvm/trap_emul.c b/arch/mips/kvm/trap_emul.c index 07540cf2b557..183150a963ec 100644 --- a/arch/mips/kvm/trap_emul.c +++ b/arch/mips/kvm/trap_emul.c @@ -14,6 +14,7 @@ #include #include #include +#include #include "interrupt.h" @@ -435,13 +436,75 @@ static int kvm_trap_emul_vm_init(struct kvm *kvm) static int kvm_trap_emul_vcpu_init(struct kvm_vcpu *vcpu) { + struct mm_struct *kern_mm = &vcpu->arch.guest_kernel_mm; + struct mm_struct *user_mm = &vcpu->arch.guest_user_mm; + vcpu->arch.kscratch_enabled = 0xfc; + /* + * Allocate GVA -> HPA page tables. + * MIPS doesn't use the mm_struct pointer argument. + */ + kern_mm->pgd = pgd_alloc(kern_mm); + if (!kern_mm->pgd) + return -ENOMEM; + + user_mm->pgd = pgd_alloc(user_mm); + if (!user_mm->pgd) { + pgd_free(kern_mm, kern_mm->pgd); + return -ENOMEM; + } + return 0; } +static void kvm_mips_emul_free_gva_pt(pgd_t *pgd) +{ + /* Don't free host kernel page tables copied from init_mm.pgd */ + const unsigned long end = 0x80000000; + unsigned long pgd_va, pud_va, pmd_va; + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + int i, j, k; + + for (i = 0; i < USER_PTRS_PER_PGD; i++) { + if (pgd_none(pgd[i])) + continue; + + pgd_va = (unsigned long)i << PGDIR_SHIFT; + if (pgd_va >= end) + break; + pud = pud_offset(pgd + i, 0); + for (j = 0; j < PTRS_PER_PUD; j++) { + if (pud_none(pud[j])) + continue; + + pud_va = pgd_va | ((unsigned long)j << PUD_SHIFT); + if (pud_va >= end) + break; + pmd = pmd_offset(pud + j, 0); + for (k = 0; k < PTRS_PER_PMD; k++) { + if (pmd_none(pmd[k])) + continue; + + pmd_va = pud_va | (k << PMD_SHIFT); + if (pmd_va >= end) + break; + pte = pte_offset(pmd + k, 0); + pte_free_kernel(NULL, pte); + } + pmd_free(NULL, pmd); + } + pud_free(NULL, pud); + } + pgd_free(NULL, pgd); +} + static void kvm_trap_emul_vcpu_uninit(struct kvm_vcpu *vcpu) { + kvm_mips_emul_free_gva_pt(vcpu->arch.guest_kernel_mm.pgd); + kvm_mips_emul_free_gva_pt(vcpu->arch.guest_user_mm.pgd); } static int kvm_trap_emul_vcpu_setup(struct kvm_vcpu *vcpu) -- cgit v1.2.3 From 7faa6eec6991715d6c1d85c192738dcac405ab89 Mon Sep 17 00:00:00 2001 From: James Hogan Date: Fri, 7 Oct 2016 23:58:53 +0100 Subject: KVM: MIPS/T&E: Activate GVA page tables in guest context MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Activate the GVA page tables when in guest context. This will allow the normal Linux TLB refill handler to fill from it when guest memory is read, as well as preventing accidental reading from user memory. 
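At C level the activation amounts to the following, mirroring what the uasm-generated guest entry code now also does; this is condensed from the vcpu_load()/vcpu_put() hunks in the diff below and is illustrative only:

	/* Entering guest context: point the TLB refill handler at the GVA pgd. */
	mm = KVM_GUEST_KERNEL_MODE(vcpu) ? kern_mm : user_mm;
	write_c0_entryhi(cpu_asid(cpu, mm));
	TLBMISS_HANDLER_SETUP_PGD(mm->pgd);

	/* Leaving guest context: switch back to the host process page tables. */
	write_c0_entryhi(cpu_asid(cpu, current->mm));
	TLBMISS_HANDLER_SETUP_PGD(current->mm->pgd);
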
Signed-off-by: James Hogan Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: Ralf Baechle Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org --- arch/mips/include/asm/mmu_context.h | 4 +++- arch/mips/kvm/entry.c | 16 +++++++++++++++- arch/mips/kvm/trap_emul.c | 10 ++++++---- 3 files changed, 24 insertions(+), 6 deletions(-) diff --git a/arch/mips/include/asm/mmu_context.h b/arch/mips/include/asm/mmu_context.h index ddd57ade1aa8..16eb8521398e 100644 --- a/arch/mips/include/asm/mmu_context.h +++ b/arch/mips/include/asm/mmu_context.h @@ -29,9 +29,11 @@ do { \ } \ } while (0) +extern void tlbmiss_handler_setup_pgd(unsigned long); + +/* Note: This is also implemented with uasm in arch/mips/kvm/entry.c */ #define TLBMISS_HANDLER_SETUP_PGD(pgd) \ do { \ - extern void tlbmiss_handler_setup_pgd(unsigned long); \ tlbmiss_handler_setup_pgd((unsigned long)(pgd)); \ htw_set_pwbase((unsigned long)pgd); \ } while (0) diff --git a/arch/mips/kvm/entry.c b/arch/mips/kvm/entry.c index f81888704caa..f683d123172c 100644 --- a/arch/mips/kvm/entry.c +++ b/arch/mips/kvm/entry.c @@ -13,6 +13,7 @@ #include #include +#include #include #include #include @@ -316,7 +317,20 @@ static void *kvm_mips_build_enter_guest(void *addr) #else uasm_i_andi(&p, K0, K0, MIPS_ENTRYHI_ASID); #endif - uasm_i_mtc0(&p, K0, C0_ENTRYHI); + + /* + * Set up KVM T&E GVA pgd. + * This does roughly the same as TLBMISS_HANDLER_SETUP_PGD(): + * - call tlbmiss_handler_setup_pgd(mm->pgd) + * - but skips write into CP0_PWBase for now + */ + UASM_i_LW(&p, A0, (int)offsetof(struct mm_struct, pgd) - + (int)offsetof(struct mm_struct, context.asid), T1); + + UASM_i_LA(&p, T9, (unsigned long)tlbmiss_handler_setup_pgd); + uasm_i_jalr(&p, RA, T9); + uasm_i_mtc0(&p, K0, C0_ENTRYHI); + uasm_i_ehb(&p); /* Disable RDHWR access */ diff --git a/arch/mips/kvm/trap_emul.c b/arch/mips/kvm/trap_emul.c index 183150a963ec..f39d427649dc 100644 --- a/arch/mips/kvm/trap_emul.c +++ b/arch/mips/kvm/trap_emul.c @@ -704,6 +704,7 @@ static int kvm_trap_emul_vcpu_load(struct kvm_vcpu *vcpu, int cpu) { struct mm_struct *kern_mm = &vcpu->arch.guest_kernel_mm; struct mm_struct *user_mm = &vcpu->arch.guest_user_mm; + struct mm_struct *mm; /* Allocate new kernel and user ASIDs if needed */ @@ -733,10 +734,9 @@ static int kvm_trap_emul_vcpu_load(struct kvm_vcpu *vcpu, int cpu) * on the mode of the Guest (Kernel/User) */ if (current->flags & PF_VCPU) { - if (KVM_GUEST_KERNEL_MODE(vcpu)) - write_c0_entryhi(cpu_asid(cpu, kern_mm)); - else - write_c0_entryhi(cpu_asid(cpu, user_mm)); + mm = KVM_GUEST_KERNEL_MODE(vcpu) ? 
kern_mm : user_mm; + write_c0_entryhi(cpu_asid(cpu, mm)); + TLBMISS_HANDLER_SETUP_PGD(mm->pgd); kvm_mips_suspend_mm(cpu); ehb(); } @@ -757,6 +757,7 @@ static int kvm_trap_emul_vcpu_put(struct kvm_vcpu *vcpu, int cpu) get_new_mmu_context(current->mm, cpu); } write_c0_entryhi(cpu_asid(cpu, current->mm)); + TLBMISS_HANDLER_SETUP_PGD(current->mm->pgd); kvm_mips_resume_mm(cpu); ehb(); } @@ -821,6 +822,7 @@ static int kvm_trap_emul_vcpu_run(struct kvm_run *run, struct kvm_vcpu *vcpu) asid_version_mask(cpu))) get_new_mmu_context(current->mm, cpu); write_c0_entryhi(cpu_asid(cpu, current->mm)); + TLBMISS_HANDLER_SETUP_PGD(current->mm->pgd); kvm_mips_resume_mm(cpu); htw_start(); -- cgit v1.2.3 From 29b500b54ef379f1f3227b633dd477a4dd3cd62b Mon Sep 17 00:00:00 2001 From: James Hogan Date: Fri, 11 Nov 2016 14:08:32 +0000 Subject: KVM: MIPS: Support NetLogic KScratch registers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit tlbex.c uses the implementation dependent $22 CP0 register group on NetLogic cores, with the help of the c0_kscratch() helper. Allow these registers to be allocated by the KVM entry code too instead of assuming KScratch registers are all $31, which will also allow pgd_reg to be handled since it is allocated that way. We also drop the masking of kscratch_mask with 0xfc, as it is redundant for the standard KScratch registers (Config4.KScrExist won't have the low 2 bits set anyway), and apparently not necessary for NetLogic. Signed-off-by: James Hogan Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: Ralf Baechle Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org --- arch/mips/kvm/entry.c | 25 ++++++++++++++++++++----- 1 file changed, 20 insertions(+), 5 deletions(-) diff --git a/arch/mips/kvm/entry.c b/arch/mips/kvm/entry.c index f683d123172c..7424d3d566ff 100644 --- a/arch/mips/kvm/entry.c +++ b/arch/mips/kvm/entry.c @@ -91,6 +91,21 @@ static void *kvm_mips_build_ret_from_exit(void *addr); static void *kvm_mips_build_ret_to_guest(void *addr); static void *kvm_mips_build_ret_to_host(void *addr); +/* + * The version of this function in tlbex.c uses current_cpu_type(), but for KVM + * we assume symmetry. + */ +static int c0_kscratch(void) +{ + switch (boot_cpu_type()) { + case CPU_XLP: + case CPU_XLR: + return 22; + default: + return 31; + } +} + /** * kvm_mips_entry_setup() - Perform global setup for entry code. * @@ -105,18 +120,18 @@ int kvm_mips_entry_setup(void) * We prefer to use KScratchN registers if they are available over the * defaults above, which may not work on all cores. 
*/ - unsigned int kscratch_mask = cpu_data[0].kscratch_mask & 0xfc; + unsigned int kscratch_mask = cpu_data[0].kscratch_mask; /* Pick a scratch register for storing VCPU */ if (kscratch_mask) { - scratch_vcpu[0] = 31; + scratch_vcpu[0] = c0_kscratch(); scratch_vcpu[1] = ffs(kscratch_mask) - 1; kscratch_mask &= ~BIT(scratch_vcpu[1]); } /* Pick a scratch register to use as a temp for saving state */ if (kscratch_mask) { - scratch_tmp[0] = 31; + scratch_tmp[0] = c0_kscratch(); scratch_tmp[1] = ffs(kscratch_mask) - 1; kscratch_mask &= ~BIT(scratch_tmp[1]); } @@ -132,7 +147,7 @@ static void kvm_mips_build_save_scratch(u32 **p, unsigned int tmp, UASM_i_SW(p, tmp, offsetof(struct pt_regs, cp0_epc), frame); /* Save the temp scratch register value in cp0_cause of stack frame */ - if (scratch_tmp[0] == 31) { + if (scratch_tmp[0] == c0_kscratch()) { UASM_i_MFC0(p, tmp, scratch_tmp[0], scratch_tmp[1]); UASM_i_SW(p, tmp, offsetof(struct pt_regs, cp0_cause), frame); } @@ -148,7 +163,7 @@ static void kvm_mips_build_restore_scratch(u32 **p, unsigned int tmp, UASM_i_LW(p, tmp, offsetof(struct pt_regs, cp0_epc), frame); UASM_i_MTC0(p, tmp, scratch_vcpu[0], scratch_vcpu[1]); - if (scratch_tmp[0] == 31) { + if (scratch_tmp[0] == c0_kscratch()) { UASM_i_LW(p, tmp, offsetof(struct pt_regs, cp0_cause), frame); UASM_i_MTC0(p, tmp, scratch_tmp[0], scratch_tmp[1]); } -- cgit v1.2.3 From a7cfa7ac1236937dac431845596a39ba27364a00 Mon Sep 17 00:00:00 2001 From: James Hogan Date: Sat, 10 Sep 2016 23:56:46 +0100 Subject: KVM: MIPS: Add fast path TLB refill handler MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Use functions from the general MIPS TLB exception vector generation code (tlbex.c) to construct a fast path TLB refill handler similar to the general one, but cut down and capable of preserving K0 and K1. Signed-off-by: James Hogan Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: Ralf Baechle Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org --- arch/mips/include/asm/kvm_host.h | 1 + arch/mips/kvm/entry.c | 78 ++++++++++++++++++++++++++++++++++++++++ arch/mips/kvm/mips.c | 8 +++-- 3 files changed, 84 insertions(+), 3 deletions(-) diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h index fea538fc5331..80928ffa0150 100644 --- a/arch/mips/include/asm/kvm_host.h +++ b/arch/mips/include/asm/kvm_host.h @@ -554,6 +554,7 @@ extern int kvm_mips_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu); /* Building of entry/exception code */ int kvm_mips_entry_setup(void); void *kvm_mips_build_vcpu_run(void *addr); +void *kvm_mips_build_tlb_refill_exception(void *addr, void *handler); void *kvm_mips_build_exception(void *addr, void *handler); void *kvm_mips_build_exit(void *addr); diff --git a/arch/mips/kvm/entry.c b/arch/mips/kvm/entry.c index 7424d3d566ff..1ae33e0e675c 100644 --- a/arch/mips/kvm/entry.c +++ b/arch/mips/kvm/entry.c @@ -16,6 +16,7 @@ #include #include #include +#include #include /* Register names */ @@ -122,6 +123,9 @@ int kvm_mips_entry_setup(void) */ unsigned int kscratch_mask = cpu_data[0].kscratch_mask; + if (pgd_reg != -1) + kscratch_mask &= ~BIT(pgd_reg); + /* Pick a scratch register for storing VCPU */ if (kscratch_mask) { scratch_vcpu[0] = c0_kscratch(); @@ -380,6 +384,80 @@ static void *kvm_mips_build_enter_guest(void *addr) return p; } +/** + * kvm_mips_build_tlb_refill_exception() - Assemble TLB refill handler. + * @addr: Address to start writing code. + * @handler: Address of common handler (within range of @addr). 
+ * + * Assemble TLB refill exception fast path handler for guest execution. + * + * Returns: Next address after end of written function. + */ +void *kvm_mips_build_tlb_refill_exception(void *addr, void *handler) +{ + u32 *p = addr; + struct uasm_label labels[2]; + struct uasm_reloc relocs[2]; + struct uasm_label *l = labels; + struct uasm_reloc *r = relocs; + + memset(labels, 0, sizeof(labels)); + memset(relocs, 0, sizeof(relocs)); + + /* Save guest k1 into scratch register */ + UASM_i_MTC0(&p, K1, scratch_tmp[0], scratch_tmp[1]); + + /* Get the VCPU pointer from the VCPU scratch register */ + UASM_i_MFC0(&p, K1, scratch_vcpu[0], scratch_vcpu[1]); + + /* Save guest k0 into VCPU structure */ + UASM_i_SW(&p, K0, offsetof(struct kvm_vcpu, arch.gprs[K0]), K1); + + /* + * Some of the common tlbex code uses current_cpu_type(). For KVM we + * assume symmetry and just disable preemption to silence the warning. + */ + preempt_disable(); + + /* + * Now for the actual refill bit. A lot of this can be common with the + * Linux TLB refill handler, however we don't need to handle so many + * cases. We only need to handle user mode refills, and user mode runs + * with 32-bit addressing. + * + * Therefore the branch to label_vmalloc generated by build_get_pmde64() + * that isn't resolved should never actually get taken and is harmless + * to leave in place for now. + */ + +#ifdef CONFIG_64BIT + build_get_pmde64(&p, &l, &r, K0, K1); /* get pmd in K1 */ +#else + build_get_pgde32(&p, K0, K1); /* get pgd in K1 */ +#endif + + /* we don't support huge pages yet */ + + build_get_ptep(&p, K0, K1); + build_update_entries(&p, K0, K1); + build_tlb_write_entry(&p, &l, &r, tlb_random); + + preempt_enable(); + + /* Get the VCPU pointer from the VCPU scratch register again */ + UASM_i_MFC0(&p, K1, scratch_vcpu[0], scratch_vcpu[1]); + + /* Restore the guest's k0/k1 registers */ + UASM_i_LW(&p, K0, offsetof(struct kvm_vcpu, arch.gprs[K0]), K1); + uasm_i_ehb(&p); + UASM_i_MFC0(&p, K1, scratch_tmp[0], scratch_tmp[1]); + + /* Jump to guest */ + uasm_i_eret(&p); + + return p; +} + /** * kvm_mips_build_exception() - Assemble first level guest exception handler. * @addr: Address to start writing code. 
diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c index 7cf85fa1f658..a687864de428 100644 --- a/arch/mips/kvm/mips.c +++ b/arch/mips/kvm/mips.c @@ -264,7 +264,7 @@ static inline void dump_handler(const char *symbol, void *start, void *end) struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id) { int err, size; - void *gebase, *p, *handler; + void *gebase, *p, *handler, *refill_start, *refill_end; int i; struct kvm_vcpu *vcpu = kzalloc(sizeof(struct kvm_vcpu), GFP_KERNEL); @@ -317,8 +317,9 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id) /* Build guest exception vectors dynamically in unmapped memory */ handler = gebase + 0x2000; - /* TLB Refill, EXL = 0 */ - kvm_mips_build_exception(gebase, handler); + /* TLB refill */ + refill_start = gebase; + refill_end = kvm_mips_build_tlb_refill_exception(refill_start, handler); /* General Exception Entry point */ kvm_mips_build_exception(gebase + 0x180, handler); @@ -344,6 +345,7 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id) pr_debug("#include \n"); pr_debug("\n"); dump_handler("kvm_vcpu_run", vcpu->arch.vcpu_run, p); + dump_handler("kvm_tlb_refill", refill_start, refill_end); dump_handler("kvm_gen_exc", gebase + 0x180, gebase + 0x200); dump_handler("kvm_exit", gebase + 0x2000, vcpu->arch.vcpu_run); -- cgit v1.2.3 From f3a8603f098fd2c68311d945a6531d1e3b62271c Mon Sep 17 00:00:00 2001 From: James Hogan Date: Fri, 7 Oct 2016 22:01:05 +0100 Subject: KVM: MIPS/TLB: Fix off-by-one in TLB invalidate MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit kvm_mips_host_tlb_inv() uses the TLBP instruction to probe the host TLB for an entry matching the given guest virtual address, and determines whether a match was found based on whether CP0_Index > 0. This is technically incorrect as an index of 0 (with the high bit clear) is a perfectly valid TLB index. This is harmless at the moment due to the use of at least 1 wired TLB entry for the KVM commpage, however we will soon be ridding ourselves of that particular wired entry so lets fix the condition in case the entry needing invalidation does land at TLB index 0. Signed-off-by: James Hogan Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: Ralf Baechle Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org --- arch/mips/kvm/tlb.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/mips/kvm/tlb.c b/arch/mips/kvm/tlb.c index 6c1f894b8754..4bf82613d440 100644 --- a/arch/mips/kvm/tlb.c +++ b/arch/mips/kvm/tlb.c @@ -282,7 +282,7 @@ int kvm_mips_host_tlb_inv(struct kvm_vcpu *vcpu, unsigned long va) if (idx >= current_cpu_data.tlbsize) BUG(); - if (idx > 0) { + if (idx >= 0) { write_c0_entryhi(UNIQUE_ENTRYHI(idx)); write_c0_entrylo0(0); write_c0_entrylo1(0); @@ -297,7 +297,7 @@ int kvm_mips_host_tlb_inv(struct kvm_vcpu *vcpu, unsigned long va) local_irq_restore(flags); - if (idx > 0) + if (idx >= 0) kvm_debug("%s: Invalidated entryhi %#lx @ idx %d\n", __func__, (va & VPN2_MASK) | kvm_mips_get_user_asid(vcpu), idx); -- cgit v1.2.3 From 57e3869cfaaec712f6ea1855ab7ba868f6f306ed Mon Sep 17 00:00:00 2001 From: James Hogan Date: Sat, 8 Oct 2016 00:15:52 +0100 Subject: KVM: MIPS/TLB: Generalise host TLB invalidate to kernel ASID MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Refactor kvm_mips_host_tlb_inv() to also be able to invalidate any matching TLB entry in the kernel ASID rather than assuming only the TLB entries in the user ASID can change. 
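In sketch form, the interface becomes the following, with the two call sites choosing the flags differently (condensed from the hunks further down; the comments here are editorial, not from the patch):

int kvm_mips_host_tlb_inv(struct kvm_vcpu *vcpu, unsigned long va,
                          bool user, bool kernel);

/* TLBWI/TLBWR emulation: guest kernel may have used the mapping in either
 * ASID, guest user only if the replaced entry lies in guest USeg.
 */
user = tlb->tlb_hi < KVM_GUEST_KSEG0;
kvm_mips_host_tlb_inv(vcpu, tlb->tlb_hi, user, true);

/* TLBMOD fault: only the currently active guest mode can hold the stale
 * host TLB entry, since only one page table was updated.
 */
kernel = KVM_GUEST_KERNEL_MODE(vcpu);
kvm_mips_host_tlb_inv(vcpu, vcpu->arch.host_cp0_badvaddr, !kernel, kernel);
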
Two new bool user/kernel arguments allow the caller to indicate whether the mapping should affect each of the ASIDs for guest user/kernel mode. - kvm_mips_invalidate_guest_tlb() (used by TLBWI/TLBWR emulation) can now invalidate any corresponding TLB entry in both the kernel ASID (guest kernel may have accessed any guest mapping), and the user ASID if the entry being replaced is in guest USeg (where guest user may also have accessed it). - The tlbmod fault handler (and the KSeg0 / TLB mapped / commpage fault handlers in later patches) can now invalidate the corresponding TLB entry in whichever ASID is currently active, since only a single page table will have been updated anyway. Signed-off-by: James Hogan Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: Ralf Baechle Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org --- arch/mips/include/asm/kvm_host.h | 3 ++- arch/mips/kvm/emulate.c | 6 ++++-- arch/mips/kvm/tlb.c | 40 ++++++++++++++++++++++++++++++---------- 3 files changed, 36 insertions(+), 13 deletions(-) diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h index 80928ffa0150..fb2ea578c193 100644 --- a/arch/mips/include/asm/kvm_host.h +++ b/arch/mips/include/asm/kvm_host.h @@ -604,7 +604,8 @@ extern int kvm_mips_host_tlb_write(struct kvm_vcpu *vcpu, unsigned long entryhi, unsigned long entrylo1, int flush_dcache_mask); extern void kvm_mips_flush_host_tlb(int skip_kseg0); -extern int kvm_mips_host_tlb_inv(struct kvm_vcpu *vcpu, unsigned long entryhi); +extern int kvm_mips_host_tlb_inv(struct kvm_vcpu *vcpu, unsigned long entryhi, + bool user, bool kernel); extern int kvm_mips_guest_tlb_lookup(struct kvm_vcpu *vcpu, unsigned long entryhi); diff --git a/arch/mips/kvm/emulate.c b/arch/mips/kvm/emulate.c index 060acc5b3378..611b8996ca0c 100644 --- a/arch/mips/kvm/emulate.c +++ b/arch/mips/kvm/emulate.c @@ -873,7 +873,7 @@ static void kvm_mips_invalidate_guest_tlb(struct kvm_vcpu *vcpu, * Probe the shadow host TLB for the entry being overwritten, if one * matches, invalidate it */ - kvm_mips_host_tlb_inv(vcpu, tlb->tlb_hi); + kvm_mips_host_tlb_inv(vcpu, tlb->tlb_hi, user, true); /* Invalidate the whole ASID on other CPUs */ cpu = smp_processor_id(); @@ -2100,13 +2100,15 @@ enum emulation_result kvm_mips_handle_tlbmod(u32 cause, u32 *opc, struct mips_coproc *cop0 = vcpu->arch.cop0; unsigned long entryhi = (vcpu->arch.host_cp0_badvaddr & VPN2_MASK) | (kvm_read_c0_guest_entryhi(cop0) & KVM_ENTRYHI_ASID); + bool kernel = KVM_GUEST_KERNEL_MODE(vcpu); int index; /* If address not in the guest TLB, then we are in trouble */ index = kvm_mips_guest_tlb_lookup(vcpu, entryhi); if (index < 0) { /* XXXKYMA Invalidate and retry */ - kvm_mips_host_tlb_inv(vcpu, vcpu->arch.host_cp0_badvaddr); + kvm_mips_host_tlb_inv(vcpu, vcpu->arch.host_cp0_badvaddr, + !kernel, kernel); kvm_err("%s: host got TLBMOD for %#lx but entry not present in Guest TLB\n", __func__, entryhi); kvm_mips_dump_guest_tlbs(vcpu); diff --git a/arch/mips/kvm/tlb.c b/arch/mips/kvm/tlb.c index 4bf82613d440..06ee9a1d78a5 100644 --- a/arch/mips/kvm/tlb.c +++ b/arch/mips/kvm/tlb.c @@ -263,16 +263,11 @@ int kvm_mips_host_tlb_lookup(struct kvm_vcpu *vcpu, unsigned long vaddr) } EXPORT_SYMBOL_GPL(kvm_mips_host_tlb_lookup); -int kvm_mips_host_tlb_inv(struct kvm_vcpu *vcpu, unsigned long va) +static int _kvm_mips_host_tlb_inv(unsigned long entryhi) { int idx; - unsigned long flags, old_entryhi; - - local_irq_save(flags); - - old_entryhi = read_c0_entryhi(); - write_c0_entryhi((va & VPN2_MASK) | kvm_mips_get_user_asid(vcpu)); + 
write_c0_entryhi(entryhi); mtc0_tlbw_hazard(); tlb_probe(); @@ -292,14 +287,39 @@ int kvm_mips_host_tlb_inv(struct kvm_vcpu *vcpu, unsigned long va) tlbw_use_hazard(); } + return idx; +} + +int kvm_mips_host_tlb_inv(struct kvm_vcpu *vcpu, unsigned long va, + bool user, bool kernel) +{ + int idx_user, idx_kernel; + unsigned long flags, old_entryhi; + + local_irq_save(flags); + + old_entryhi = read_c0_entryhi(); + + if (user) + idx_user = _kvm_mips_host_tlb_inv((va & VPN2_MASK) | + kvm_mips_get_user_asid(vcpu)); + if (kernel) + idx_kernel = _kvm_mips_host_tlb_inv((va & VPN2_MASK) | + kvm_mips_get_kernel_asid(vcpu)); + write_c0_entryhi(old_entryhi); mtc0_tlbw_hazard(); local_irq_restore(flags); - if (idx >= 0) - kvm_debug("%s: Invalidated entryhi %#lx @ idx %d\n", __func__, - (va & VPN2_MASK) | kvm_mips_get_user_asid(vcpu), idx); + if (user && idx_user >= 0) + kvm_debug("%s: Invalidated guest user entryhi %#lx @ idx %d\n", + __func__, (va & VPN2_MASK) | + kvm_mips_get_user_asid(vcpu), idx_user); + if (kernel && idx_kernel >= 0) + kvm_debug("%s: Invalidated guest kernel entryhi %#lx @ idx %d\n", + __func__, (va & VPN2_MASK) | + kvm_mips_get_kernel_asid(vcpu), idx_kernel); return 0; } -- cgit v1.2.3 From a31b50d741bd85a127d5ef2c21c0788041bc41a9 Mon Sep 17 00:00:00 2001 From: James Hogan Date: Fri, 16 Dec 2016 15:57:00 +0000 Subject: KVM: MIPS/MMU: Invalidate GVA PTs on ASID changes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Implement invalidation of large ranges of virtual addresses from GVA page tables in response to a guest ASID change (immediately for guest kernel page table, lazily for guest user page table). We iterate through a range of page tables invalidating entries and freeing fully invalidated tables. To minimise overhead the exact ranges invalidated depends on the flags argument to kvm_mips_flush_gva_pt(), which also allows it to be used in future KVM_CAP_SYNC_MMU patches in response to GPA changes, which unlike guest TLB mapping changes affects guest KSeg0 mappings. Signed-off-by: James Hogan Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: Ralf Baechle Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org --- arch/mips/include/asm/kvm_host.h | 17 +++++ arch/mips/kvm/emulate.c | 11 ++++ arch/mips/kvm/mmu.c | 134 +++++++++++++++++++++++++++++++++++++++ arch/mips/kvm/trap_emul.c | 5 +- 4 files changed, 165 insertions(+), 2 deletions(-) diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h index fb2ea578c193..f5145dcab319 100644 --- a/arch/mips/include/asm/kvm_host.h +++ b/arch/mips/include/asm/kvm_host.h @@ -614,6 +614,23 @@ extern int kvm_mips_host_tlb_lookup(struct kvm_vcpu *vcpu, unsigned long vaddr); void kvm_mips_suspend_mm(int cpu); void kvm_mips_resume_mm(int cpu); +/* MMU handling */ + +/** + * enum kvm_mips_flush - Types of MMU flushes. + * @KMF_USER: Flush guest user virtual memory mappings. + * Guest USeg only. + * @KMF_KERN: Flush guest kernel virtual memory mappings. + * Guest USeg and KSeg2/3. + * @KMF_GPA: Flush guest physical memory mappings. + * Also includes KSeg0 if KMF_KERN is set. 
+ */ +enum kvm_mips_flush { + KMF_USER = 0x0, + KMF_KERN = 0x1, + KMF_GPA = 0x2, +}; +void kvm_mips_flush_gva_pt(pgd_t *pgd, enum kvm_mips_flush flags); extern unsigned long kvm_mips_translate_guest_kseg0_to_hpa(struct kvm_vcpu *vcpu, unsigned long gva); extern void kvm_get_new_mmu_context(struct mm_struct *mm, unsigned long cpu, diff --git a/arch/mips/kvm/emulate.c b/arch/mips/kvm/emulate.c index 611b8996ca0c..1d399396e486 100644 --- a/arch/mips/kvm/emulate.c +++ b/arch/mips/kvm/emulate.c @@ -1171,6 +1171,17 @@ enum emulation_result kvm_mips_emulate_CP0(union mips_instruction inst, & KVM_ENTRYHI_ASID, nasid); + /* + * Flush entries from the GVA page + * tables. + * Guest user page table will get + * flushed lazily on re-entry to guest + * user if the guest ASID actually + * changes. + */ + kvm_mips_flush_gva_pt(kern_mm->pgd, + KMF_KERN); + /* * Regenerate/invalidate kernel MMU * context. diff --git a/arch/mips/kvm/mmu.c b/arch/mips/kvm/mmu.c index 27d6d0dbfeb4..09146b62552f 100644 --- a/arch/mips/kvm/mmu.c +++ b/arch/mips/kvm/mmu.c @@ -12,6 +12,7 @@ #include #include #include +#include static u32 kvm_mips_get_kernel_asid(struct kvm_vcpu *vcpu) { @@ -80,6 +81,139 @@ unsigned long kvm_mips_translate_guest_kseg0_to_hpa(struct kvm_vcpu *vcpu, return (kvm->arch.guest_pmap[gfn] << PAGE_SHIFT) + offset; } +/* + * kvm_mips_flush_gva_{pte,pmd,pud,pgd,pt}. + * Flush a range of guest physical address space from the VM's GPA page tables. + */ + +static bool kvm_mips_flush_gva_pte(pte_t *pte, unsigned long start_gva, + unsigned long end_gva) +{ + int i_min = __pte_offset(start_gva); + int i_max = __pte_offset(end_gva); + bool safe_to_remove = (i_min == 0 && i_max == PTRS_PER_PTE - 1); + int i; + + /* + * There's no freeing to do, so there's no point clearing individual + * entries unless only part of the last level page table needs flushing. 
+ */ + if (safe_to_remove) + return true; + + for (i = i_min; i <= i_max; ++i) { + if (!pte_present(pte[i])) + continue; + + set_pte(pte + i, __pte(0)); + } + return false; +} + +static bool kvm_mips_flush_gva_pmd(pmd_t *pmd, unsigned long start_gva, + unsigned long end_gva) +{ + pte_t *pte; + unsigned long end = ~0ul; + int i_min = __pmd_offset(start_gva); + int i_max = __pmd_offset(end_gva); + bool safe_to_remove = (i_min == 0 && i_max == PTRS_PER_PMD - 1); + int i; + + for (i = i_min; i <= i_max; ++i, start_gva = 0) { + if (!pmd_present(pmd[i])) + continue; + + pte = pte_offset(pmd + i, 0); + if (i == i_max) + end = end_gva; + + if (kvm_mips_flush_gva_pte(pte, start_gva, end)) { + pmd_clear(pmd + i); + pte_free_kernel(NULL, pte); + } else { + safe_to_remove = false; + } + } + return safe_to_remove; +} + +static bool kvm_mips_flush_gva_pud(pud_t *pud, unsigned long start_gva, + unsigned long end_gva) +{ + pmd_t *pmd; + unsigned long end = ~0ul; + int i_min = __pud_offset(start_gva); + int i_max = __pud_offset(end_gva); + bool safe_to_remove = (i_min == 0 && i_max == PTRS_PER_PUD - 1); + int i; + + for (i = i_min; i <= i_max; ++i, start_gva = 0) { + if (!pud_present(pud[i])) + continue; + + pmd = pmd_offset(pud + i, 0); + if (i == i_max) + end = end_gva; + + if (kvm_mips_flush_gva_pmd(pmd, start_gva, end)) { + pud_clear(pud + i); + pmd_free(NULL, pmd); + } else { + safe_to_remove = false; + } + } + return safe_to_remove; +} + +static bool kvm_mips_flush_gva_pgd(pgd_t *pgd, unsigned long start_gva, + unsigned long end_gva) +{ + pud_t *pud; + unsigned long end = ~0ul; + int i_min = pgd_index(start_gva); + int i_max = pgd_index(end_gva); + bool safe_to_remove = (i_min == 0 && i_max == PTRS_PER_PGD - 1); + int i; + + for (i = i_min; i <= i_max; ++i, start_gva = 0) { + if (!pgd_present(pgd[i])) + continue; + + pud = pud_offset(pgd + i, 0); + if (i == i_max) + end = end_gva; + + if (kvm_mips_flush_gva_pud(pud, start_gva, end)) { + pgd_clear(pgd + i); + pud_free(NULL, pud); + } else { + safe_to_remove = false; + } + } + return safe_to_remove; +} + +void kvm_mips_flush_gva_pt(pgd_t *pgd, enum kvm_mips_flush flags) +{ + if (flags & KMF_GPA) { + /* all of guest virtual address space could be affected */ + if (flags & KMF_KERN) + /* useg, kseg0, seg2/3 */ + kvm_mips_flush_gva_pgd(pgd, 0, 0x7fffffff); + else + /* useg */ + kvm_mips_flush_gva_pgd(pgd, 0, 0x3fffffff); + } else { + /* useg */ + kvm_mips_flush_gva_pgd(pgd, 0, 0x3fffffff); + + /* kseg2/3 */ + if (flags & KMF_KERN) + kvm_mips_flush_gva_pgd(pgd, 0x60000000, 0x7fffffff); + } +} + /* XXXKYMA: Must be called with interrupts disabled */ int kvm_mips_handle_kseg0_tlb_fault(unsigned long badvaddr, struct kvm_vcpu *vcpu) diff --git a/arch/mips/kvm/trap_emul.c b/arch/mips/kvm/trap_emul.c index f39d427649dc..6a56e48f4bfa 100644 --- a/arch/mips/kvm/trap_emul.c +++ b/arch/mips/kvm/trap_emul.c @@ -774,14 +774,15 @@ static void kvm_trap_emul_vcpu_reenter(struct kvm_run *run, unsigned int gasid; /* - * Lazy host ASID regeneration for guest user mode. + * Lazy host ASID regeneration / PT flush for guest user mode. * If the guest ASID has changed since the last guest usermode * execution, regenerate the host ASID so as to invalidate stale TLB - * entries. + * entries and flush GVA PT entries too. 
*/ if (!KVM_GUEST_KERNEL_MODE(vcpu)) { gasid = kvm_read_c0_guest_entryhi(cop0) & KVM_ENTRYHI_ASID; if (gasid != vcpu->arch.last_user_gasid) { + kvm_mips_flush_gva_pt(user_mm->pgd, KMF_USER); kvm_get_new_mmu_context(user_mm, cpu, vcpu); for_each_possible_cpu(i) if (i != cpu) -- cgit v1.2.3 From aba8592950f1c698bb9c1b42d4f4dab07a145674 Mon Sep 17 00:00:00 2001 From: James Hogan Date: Fri, 16 Dec 2016 15:57:00 +0000 Subject: KVM: MIPS/MMU: Invalidate stale GVA PTEs on TLBW MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Implement invalidation of specific pairs of GVA page table entries in one or both of the GVA page tables. This is used when existing mappings are replaced in the guest TLB by emulated TLBWI/TLBWR instructions. Due to the sharing of page tables in the host kernel range, we should be careful not to allow host pages to be invalidated. Add a helper kvm_mips_walk_pgd() which can be used when walking of either GPA (future patches) or GVA page tables is needed, optionally with allocation of page tables along the way when they don't exist. GPA page table walking will need to be protected by the kvm->mmu_lock, so we also add a small MMU page cache in each KVM VCPU, like that found for other architectures but smaller. This allows enough pages to be pre-allocated to handle a single fault without holding the lock, allowing the helper to run with the lock held without having to handle allocation failures. Using the same mechanism for GVA allows the same code to be used, and allows it to use the same cache of allocated pages if the GPA walk didn't need to allocate any new tables. Signed-off-by: James Hogan Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: Ralf Baechle Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org --- arch/mips/include/asm/kvm_host.h | 17 +++++++ arch/mips/kvm/emulate.c | 6 +++ arch/mips/kvm/mips.c | 1 + arch/mips/kvm/mmu.c | 95 ++++++++++++++++++++++++++++++++++++++++ 4 files changed, 119 insertions(+) diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h index f5145dcab319..40aab4f5007c 100644 --- a/arch/mips/include/asm/kvm_host.h +++ b/arch/mips/include/asm/kvm_host.h @@ -261,6 +261,17 @@ struct kvm_mips_tlb { long tlb_lo[2]; }; +#define KVM_NR_MEM_OBJS 4 + +/* + * We don't want allocation failures within the mmu code, so we preallocate + * enough memory for a single page fault in a cache. 
+ */ +struct kvm_mmu_memory_cache { + int nobjs; + void *objects[KVM_NR_MEM_OBJS]; +}; + #define KVM_MIPS_AUX_FPU 0x1 #define KVM_MIPS_AUX_MSA 0x2 @@ -327,6 +338,9 @@ struct kvm_vcpu_arch { /* Guest ASID of last user mode execution */ unsigned int last_user_gasid; + /* Cache some mmu pages needed inside spinlock regions */ + struct kvm_mmu_memory_cache mmu_page_cache; + int last_sched_cpu; /* WAIT executed */ @@ -631,6 +645,9 @@ enum kvm_mips_flush { KMF_GPA = 0x2, }; void kvm_mips_flush_gva_pt(pgd_t *pgd, enum kvm_mips_flush flags); +void kvm_mmu_free_memory_caches(struct kvm_vcpu *vcpu); +void kvm_trap_emul_invalidate_gva(struct kvm_vcpu *vcpu, unsigned long addr, + bool user); extern unsigned long kvm_mips_translate_guest_kseg0_to_hpa(struct kvm_vcpu *vcpu, unsigned long gva); extern void kvm_get_new_mmu_context(struct mm_struct *mm, unsigned long cpu, diff --git a/arch/mips/kvm/emulate.c b/arch/mips/kvm/emulate.c index 1d399396e486..19eaeda6975c 100644 --- a/arch/mips/kvm/emulate.c +++ b/arch/mips/kvm/emulate.c @@ -864,11 +864,17 @@ static void kvm_mips_invalidate_guest_tlb(struct kvm_vcpu *vcpu, /* No need to flush for entries which are already invalid */ if (!((tlb->tlb_lo[0] | tlb->tlb_lo[1]) & ENTRYLO_V)) return; + /* Don't touch host kernel page tables or TLB mappings */ + if ((unsigned long)tlb->tlb_hi > 0x7fffffff) + return; /* User address space doesn't need flushing for KSeg2/3 changes */ user = tlb->tlb_hi < KVM_GUEST_KSEG0; preempt_disable(); + /* Invalidate page table entries */ + kvm_trap_emul_invalidate_gva(vcpu, tlb->tlb_hi & VPN2_MASK, user); + /* * Probe the shadow host TLB for the entry being overwritten, if one * matches, invalidate it diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c index a687864de428..c369fdd19fbc 100644 --- a/arch/mips/kvm/mips.c +++ b/arch/mips/kvm/mips.c @@ -396,6 +396,7 @@ void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu) kvm_mips_dump_stats(vcpu); + kvm_mmu_free_memory_caches(vcpu); kfree(vcpu->arch.guest_ebase); kfree(vcpu->arch.kseg0_commpage); kfree(vcpu); diff --git a/arch/mips/kvm/mmu.c b/arch/mips/kvm/mmu.c index 09146b62552f..dbf2b55ee874 100644 --- a/arch/mips/kvm/mmu.c +++ b/arch/mips/kvm/mmu.c @@ -14,6 +14,26 @@ #include #include +static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc) +{ + while (mc->nobjs) + free_page((unsigned long)mc->objects[--mc->nobjs]); +} + +static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc) +{ + void *p; + + BUG_ON(!mc || !mc->nobjs); + p = mc->objects[--mc->nobjs]; + return p; +} + +void kvm_mmu_free_memory_caches(struct kvm_vcpu *vcpu) +{ + mmu_free_memory_cache(&vcpu->arch.mmu_page_cache); +} + static u32 kvm_mips_get_kernel_asid(struct kvm_vcpu *vcpu) { struct mm_struct *kern_mm = &vcpu->arch.guest_kernel_mm; @@ -30,6 +50,56 @@ static u32 kvm_mips_get_user_asid(struct kvm_vcpu *vcpu) return cpu_asid(cpu, user_mm); } +/** + * kvm_mips_walk_pgd() - Walk page table with optional allocation. + * @pgd: Page directory pointer. + * @addr: Address to index page table using. + * @cache: MMU page cache to allocate new page tables from, or NULL. + * + * Walk the page tables pointed to by @pgd to find the PTE corresponding to the + * address @addr. If page tables don't exist for @addr, they will be created + * from the MMU cache if @cache is not NULL. + * + * Returns: Pointer to pte_t corresponding to @addr. + * NULL if a page table doesn't exist for @addr and !@cache. + * NULL if a page table allocation failed. 
+ */ +static pte_t *kvm_mips_walk_pgd(pgd_t *pgd, struct kvm_mmu_memory_cache *cache, + unsigned long addr) +{ + pud_t *pud; + pmd_t *pmd; + + pgd += pgd_index(addr); + if (pgd_none(*pgd)) { + /* Not used on MIPS yet */ + BUG(); + return NULL; + } + pud = pud_offset(pgd, addr); + if (pud_none(*pud)) { + pmd_t *new_pmd; + + if (!cache) + return NULL; + new_pmd = mmu_memory_cache_alloc(cache); + pmd_init((unsigned long)new_pmd, + (unsigned long)invalid_pte_table); + pud_populate(NULL, pud, new_pmd); + } + pmd = pmd_offset(pud, addr); + if (pmd_none(*pmd)) { + pte_t *new_pte; + + if (!cache) + return NULL; + new_pte = mmu_memory_cache_alloc(cache); + clear_page(new_pte); + pmd_populate_kernel(NULL, pmd, new_pte); + } + return pte_offset(pmd, addr); +} + static int kvm_mips_map_page(struct kvm *kvm, gfn_t gfn) { int srcu_idx, err = 0; @@ -81,6 +151,31 @@ unsigned long kvm_mips_translate_guest_kseg0_to_hpa(struct kvm_vcpu *vcpu, return (kvm->arch.guest_pmap[gfn] << PAGE_SHIFT) + offset; } +void kvm_trap_emul_invalidate_gva(struct kvm_vcpu *vcpu, unsigned long addr, + bool user) +{ + pgd_t *pgdp; + pte_t *ptep; + + addr &= PAGE_MASK << 1; + + pgdp = vcpu->arch.guest_kernel_mm.pgd; + ptep = kvm_mips_walk_pgd(pgdp, NULL, addr); + if (ptep) { + ptep[0] = pfn_pte(0, __pgprot(0)); + ptep[1] = pfn_pte(0, __pgprot(0)); + } + + if (user) { + pgdp = vcpu->arch.guest_user_mm.pgd; + ptep = kvm_mips_walk_pgd(pgdp, NULL, addr); + if (ptep) { + ptep[0] = pfn_pte(0, __pgprot(0)); + ptep[1] = pfn_pte(0, __pgprot(0)); + } + } +} + /* * kvm_mips_flush_gva_{pte,pmd,pud,pgd,pt}. * Flush a range of guest physical address space from the VM's GPA page tables. -- cgit v1.2.3 From fb99589391a9ed2e505dc7c3d02651a1a7b9f72b Mon Sep 17 00:00:00 2001 From: James Hogan Date: Thu, 5 Jan 2017 10:44:38 +0000 Subject: KVM: MIPS/MMU: Convert KSeg0 faults to page tables MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Now that we have GVA page tables and an optimised TLB refill handler in place, convert the handling of KSeg0 page faults from the guest to fill the GVA page tables and invalidate the TLB entry, rather than filling a TLB entry directly. Signed-off-by: James Hogan Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: Ralf Baechle Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org --- arch/mips/kvm/mmu.c | 79 +++++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 64 insertions(+), 15 deletions(-) diff --git a/arch/mips/kvm/mmu.c b/arch/mips/kvm/mmu.c index dbf2b55ee874..afb47f21d8bc 100644 --- a/arch/mips/kvm/mmu.c +++ b/arch/mips/kvm/mmu.c @@ -14,6 +14,33 @@ #include #include +/* + * KVM_MMU_CACHE_MIN_PAGES is the number of GPA page table translation levels + * for which pages need to be cached. 
+ */ +#if defined(__PAGETABLE_PMD_FOLDED) +#define KVM_MMU_CACHE_MIN_PAGES 1 +#else +#define KVM_MMU_CACHE_MIN_PAGES 2 +#endif + +static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache, + int min, int max) +{ + void *page; + + BUG_ON(max > KVM_NR_MEM_OBJS); + if (cache->nobjs >= min) + return 0; + while (cache->nobjs < max) { + page = (void *)__get_free_page(GFP_KERNEL); + if (!page) + return -ENOMEM; + cache->objects[cache->nobjs++] = page; + } + return 0; +} + static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc) { while (mc->nobjs) @@ -151,6 +178,27 @@ unsigned long kvm_mips_translate_guest_kseg0_to_hpa(struct kvm_vcpu *vcpu, return (kvm->arch.guest_pmap[gfn] << PAGE_SHIFT) + offset; } +static pte_t *kvm_trap_emul_pte_for_gva(struct kvm_vcpu *vcpu, + unsigned long addr) +{ + struct kvm_mmu_memory_cache *memcache = &vcpu->arch.mmu_page_cache; + pgd_t *pgdp; + int ret; + + /* We need a minimum of cached pages ready for page table creation */ + ret = mmu_topup_memory_cache(memcache, KVM_MMU_CACHE_MIN_PAGES, + KVM_NR_MEM_OBJS); + if (ret) + return NULL; + + if (KVM_GUEST_KERNEL_MODE(vcpu)) + pgdp = vcpu->arch.guest_kernel_mm.pgd; + else + pgdp = vcpu->arch.guest_user_mm.pgd; + + return kvm_mips_walk_pgd(pgdp, memcache, addr); +} + void kvm_trap_emul_invalidate_gva(struct kvm_vcpu *vcpu, unsigned long addr, bool user) { @@ -316,10 +364,8 @@ int kvm_mips_handle_kseg0_tlb_fault(unsigned long badvaddr, gfn_t gfn; kvm_pfn_t pfn0, pfn1; unsigned long vaddr = 0; - unsigned long entryhi = 0, entrylo0 = 0, entrylo1 = 0; struct kvm *kvm = vcpu->kvm; - const int flush_dcache_mask = 0; - int ret; + pte_t *ptep_gva; if (KVM_GUEST_KSEGX(badvaddr) != KVM_GUEST_KSEG0) { kvm_err("%s: Invalid BadVaddr: %#lx\n", __func__, badvaddr); @@ -327,6 +373,8 @@ int kvm_mips_handle_kseg0_tlb_fault(unsigned long badvaddr, return -1; } + /* Find host PFNs */ + gfn = (KVM_GUEST_CPHYSADDR(badvaddr) >> PAGE_SHIFT); if ((gfn | 1) >= kvm->arch.guest_pmap_npages) { kvm_err("%s: Invalid gfn: %#llx, BadVaddr: %#lx\n", __func__, @@ -345,20 +393,21 @@ int kvm_mips_handle_kseg0_tlb_fault(unsigned long badvaddr, pfn0 = kvm->arch.guest_pmap[gfn & ~0x1]; pfn1 = kvm->arch.guest_pmap[gfn | 0x1]; - entrylo0 = mips3_paddr_to_tlbpfn(pfn0 << PAGE_SHIFT) | - ((_page_cachable_default >> _CACHE_SHIFT) << ENTRYLO_C_SHIFT) | - ENTRYLO_D | ENTRYLO_V; - entrylo1 = mips3_paddr_to_tlbpfn(pfn1 << PAGE_SHIFT) | - ((_page_cachable_default >> _CACHE_SHIFT) << ENTRYLO_C_SHIFT) | - ENTRYLO_D | ENTRYLO_V; + /* Find GVA page table entry */ - preempt_disable(); - entryhi = (vaddr | kvm_mips_get_kernel_asid(vcpu)); - ret = kvm_mips_host_tlb_write(vcpu, entryhi, entrylo0, entrylo1, - flush_dcache_mask); - preempt_enable(); + ptep_gva = kvm_trap_emul_pte_for_gva(vcpu, vaddr); + if (!ptep_gva) { + kvm_err("No ptep for gva %lx\n", vaddr); + return -1; + } - return ret; + /* Write host PFNs into GVA page table */ + ptep_gva[0] = pte_mkyoung(pte_mkdirty(pfn_pte(pfn0, PAGE_SHARED))); + ptep_gva[1] = pte_mkyoung(pte_mkdirty(pfn_pte(pfn1, PAGE_SHARED))); + + /* Invalidate this entry in the TLB, guest kernel ASID only */ + kvm_mips_host_tlb_inv(vcpu, vaddr, false, true); + return 0; } int kvm_mips_handle_mapped_seg_tlb_fault(struct kvm_vcpu *vcpu, -- cgit v1.2.3 From 7e3d2a750bcb0a7fed84e14e562d752c39fdf542 Mon Sep 17 00:00:00 2001 From: James Hogan Date: Sat, 8 Oct 2016 01:15:19 +0100 Subject: KVM: MIPS/MMU: Convert TLB mapped faults to page tables MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Now 
that we have GVA page tables and an optimised TLB refill handler in place, convert the handling of page faults in TLB mapped segment from the guest to fill a single GVA page table entry and invalidate the TLB entry, rather than filling a TLB entry pair directly. Also remove the now unused kvm_mips_get_{kernel,user}_asid() functions in mmu.c and kvm_mips_host_tlb_write() in tlb.c. Signed-off-by: James Hogan Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: Ralf Baechle Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org --- arch/mips/include/asm/kvm_host.h | 7 +-- arch/mips/kvm/emulate.c | 6 ++- arch/mips/kvm/mmu.c | 93 +++++++++++++++------------------------- arch/mips/kvm/tlb.c | 64 --------------------------- 4 files changed, 40 insertions(+), 130 deletions(-) diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h index 40aab4f5007c..f7680999e28a 100644 --- a/arch/mips/include/asm/kvm_host.h +++ b/arch/mips/include/asm/kvm_host.h @@ -599,7 +599,8 @@ extern int kvm_mips_handle_commpage_tlb_fault(unsigned long badvaddr, struct kvm_vcpu *vcpu); extern int kvm_mips_handle_mapped_seg_tlb_fault(struct kvm_vcpu *vcpu, - struct kvm_mips_tlb *tlb); + struct kvm_mips_tlb *tlb, + unsigned long gva); extern enum emulation_result kvm_mips_handle_tlbmiss(u32 cause, u32 *opc, @@ -613,10 +614,6 @@ extern enum emulation_result kvm_mips_handle_tlbmod(u32 cause, extern void kvm_mips_dump_host_tlbs(void); extern void kvm_mips_dump_guest_tlbs(struct kvm_vcpu *vcpu); -extern int kvm_mips_host_tlb_write(struct kvm_vcpu *vcpu, unsigned long entryhi, - unsigned long entrylo0, - unsigned long entrylo1, - int flush_dcache_mask); extern void kvm_mips_flush_host_tlb(int skip_kseg0); extern int kvm_mips_host_tlb_inv(struct kvm_vcpu *vcpu, unsigned long entryhi, bool user, bool kernel); diff --git a/arch/mips/kvm/emulate.c b/arch/mips/kvm/emulate.c index 19eaeda6975c..3ced662e012e 100644 --- a/arch/mips/kvm/emulate.c +++ b/arch/mips/kvm/emulate.c @@ -1770,7 +1770,8 @@ enum emulation_result kvm_mips_emulate_cache(union mips_instruction inst, * We fault an entry from the guest tlb to the * shadow host TLB */ - if (kvm_mips_handle_mapped_seg_tlb_fault(vcpu, tlb)) { + if (kvm_mips_handle_mapped_seg_tlb_fault(vcpu, tlb, + va)) { kvm_err("%s: handling mapped seg tlb fault for %lx, index: %u, vcpu: %p, ASID: %#lx\n", __func__, va, index, vcpu, read_c0_entryhi()); @@ -2746,7 +2747,8 @@ enum emulation_result kvm_mips_handle_tlbmiss(u32 cause, * OK we have a Guest TLB entry, now inject it into the * shadow host TLB */ - if (kvm_mips_handle_mapped_seg_tlb_fault(vcpu, tlb)) { + if (kvm_mips_handle_mapped_seg_tlb_fault(vcpu, tlb, + va)) { kvm_err("%s: handling mapped seg tlb fault for %lx, index: %u, vcpu: %p, ASID: %#lx\n", __func__, va, index, vcpu, read_c0_entryhi()); diff --git a/arch/mips/kvm/mmu.c b/arch/mips/kvm/mmu.c index afb47f21d8bc..62122d297e52 100644 --- a/arch/mips/kvm/mmu.c +++ b/arch/mips/kvm/mmu.c @@ -61,22 +61,6 @@ void kvm_mmu_free_memory_caches(struct kvm_vcpu *vcpu) mmu_free_memory_cache(&vcpu->arch.mmu_page_cache); } -static u32 kvm_mips_get_kernel_asid(struct kvm_vcpu *vcpu) -{ - struct mm_struct *kern_mm = &vcpu->arch.guest_kernel_mm; - int cpu = smp_processor_id(); - - return cpu_asid(cpu, kern_mm); -} - -static u32 kvm_mips_get_user_asid(struct kvm_vcpu *vcpu) -{ - struct mm_struct *user_mm = &vcpu->arch.guest_user_mm; - int cpu = smp_processor_id(); - - return cpu_asid(cpu, user_mm); -} - /** * kvm_mips_walk_pgd() - Walk page table with optional allocation. 
* @pgd: Page directory pointer. @@ -411,67 +395,58 @@ int kvm_mips_handle_kseg0_tlb_fault(unsigned long badvaddr, } int kvm_mips_handle_mapped_seg_tlb_fault(struct kvm_vcpu *vcpu, - struct kvm_mips_tlb *tlb) + struct kvm_mips_tlb *tlb, + unsigned long gva) { - unsigned long entryhi = 0, entrylo0 = 0, entrylo1 = 0; struct kvm *kvm = vcpu->kvm; - kvm_pfn_t pfn0, pfn1; - gfn_t gfn0, gfn1; - long tlb_lo[2]; - int ret; - - tlb_lo[0] = tlb->tlb_lo[0]; - tlb_lo[1] = tlb->tlb_lo[1]; + kvm_pfn_t pfn; + gfn_t gfn; + long tlb_lo = 0; + pte_t *ptep_gva; + unsigned int idx; + bool kernel = KVM_GUEST_KERNEL_MODE(vcpu); /* * The commpage address must not be mapped to anything else if the guest * TLB contains entries nearby, or commpage accesses will break. */ - if (!((tlb->tlb_hi ^ KVM_GUEST_COMMPAGE_ADDR) & - VPN2_MASK & (PAGE_MASK << 1))) - tlb_lo[(KVM_GUEST_COMMPAGE_ADDR >> PAGE_SHIFT) & 1] = 0; - - gfn0 = mips3_tlbpfn_to_paddr(tlb_lo[0]) >> PAGE_SHIFT; - gfn1 = mips3_tlbpfn_to_paddr(tlb_lo[1]) >> PAGE_SHIFT; - if (gfn0 >= kvm->arch.guest_pmap_npages || - gfn1 >= kvm->arch.guest_pmap_npages) { - kvm_err("%s: Invalid gfn: [%#llx, %#llx], EHi: %#lx\n", - __func__, gfn0, gfn1, tlb->tlb_hi); + idx = TLB_LO_IDX(*tlb, gva); + if ((gva ^ KVM_GUEST_COMMPAGE_ADDR) & VPN2_MASK & PAGE_MASK) + tlb_lo = tlb->tlb_lo[idx]; + + /* Find host PFN */ + gfn = mips3_tlbpfn_to_paddr(tlb_lo) >> PAGE_SHIFT; + if (gfn >= kvm->arch.guest_pmap_npages) { + kvm_err("%s: Invalid gfn: %#llx, EHi: %#lx\n", + __func__, gfn, tlb->tlb_hi); kvm_mips_dump_guest_tlbs(vcpu); return -1; } - - if (kvm_mips_map_page(kvm, gfn0) < 0) + if (kvm_mips_map_page(kvm, gfn) < 0) return -1; + pfn = kvm->arch.guest_pmap[gfn]; - if (kvm_mips_map_page(kvm, gfn1) < 0) + /* Find GVA page table entry */ + ptep_gva = kvm_trap_emul_pte_for_gva(vcpu, gva); + if (!ptep_gva) { + kvm_err("No ptep for gva %lx\n", gva); return -1; + } - pfn0 = kvm->arch.guest_pmap[gfn0]; - pfn1 = kvm->arch.guest_pmap[gfn1]; + /* Write PFN into GVA page table, taking attributes from Guest TLB */ + *ptep_gva = pfn_pte(pfn, (!(tlb_lo & ENTRYLO_V)) ? __pgprot(0) : + (tlb_lo & ENTRYLO_D) ? PAGE_SHARED : + PAGE_READONLY); + if (pte_present(*ptep_gva)) + *ptep_gva = pte_mkyoung(pte_mkdirty(*ptep_gva)); - /* Get attributes from the Guest TLB */ - entrylo0 = mips3_paddr_to_tlbpfn(pfn0 << PAGE_SHIFT) | - ((_page_cachable_default >> _CACHE_SHIFT) << ENTRYLO_C_SHIFT) | - (tlb_lo[0] & ENTRYLO_D) | - (tlb_lo[0] & ENTRYLO_V); - entrylo1 = mips3_paddr_to_tlbpfn(pfn1 << PAGE_SHIFT) | - ((_page_cachable_default >> _CACHE_SHIFT) << ENTRYLO_C_SHIFT) | - (tlb_lo[1] & ENTRYLO_D) | - (tlb_lo[1] & ENTRYLO_V); + /* Invalidate this entry in the TLB, current guest mode ASID only */ + kvm_mips_host_tlb_inv(vcpu, gva, !kernel, kernel); kvm_debug("@ %#lx tlb_lo0: 0x%08lx tlb_lo1: 0x%08lx\n", vcpu->arch.pc, tlb->tlb_lo[0], tlb->tlb_lo[1]); - preempt_disable(); - entryhi = (tlb->tlb_hi & VPN2_MASK) | (KVM_GUEST_KERNEL_MODE(vcpu) ? 
- kvm_mips_get_kernel_asid(vcpu) : - kvm_mips_get_user_asid(vcpu)); - ret = kvm_mips_host_tlb_write(vcpu, entryhi, entrylo0, entrylo1, - tlb->tlb_mask); - preempt_enable(); - - return ret; + return 0; } void kvm_get_new_mmu_context(struct mm_struct *mm, unsigned long cpu, @@ -582,7 +557,7 @@ u32 kvm_get_inst(u32 *opc, struct kvm_vcpu *vcpu) return KVM_INVALID_INST; } if (kvm_mips_handle_mapped_seg_tlb_fault(vcpu, - &vcpu->arch.guest_tlb[index])) { + &vcpu->arch.guest_tlb[index], va)) { kvm_err("%s: handling mapped seg tlb fault failed for %p, index: %u, vcpu: %p, ASID: %#lx\n", __func__, opc, index, vcpu, read_c0_entryhi()); diff --git a/arch/mips/kvm/tlb.c b/arch/mips/kvm/tlb.c index 06ee9a1d78a5..2fb76869d017 100644 --- a/arch/mips/kvm/tlb.c +++ b/arch/mips/kvm/tlb.c @@ -104,70 +104,6 @@ void kvm_mips_dump_guest_tlbs(struct kvm_vcpu *vcpu) } EXPORT_SYMBOL_GPL(kvm_mips_dump_guest_tlbs); -/* XXXKYMA: Must be called with interrupts disabled */ -/* set flush_dcache_mask == 0 if no dcache flush required */ -int kvm_mips_host_tlb_write(struct kvm_vcpu *vcpu, unsigned long entryhi, - unsigned long entrylo0, unsigned long entrylo1, - int flush_dcache_mask) -{ - unsigned long flags; - unsigned long old_entryhi; - int idx; - - local_irq_save(flags); - - old_entryhi = read_c0_entryhi(); - write_c0_entryhi(entryhi); - mtc0_tlbw_hazard(); - - tlb_probe(); - tlb_probe_hazard(); - idx = read_c0_index(); - - if (idx > current_cpu_data.tlbsize) { - kvm_err("%s: Invalid Index: %d\n", __func__, idx); - kvm_mips_dump_host_tlbs(); - local_irq_restore(flags); - return -1; - } - - write_c0_entrylo0(entrylo0); - write_c0_entrylo1(entrylo1); - mtc0_tlbw_hazard(); - - if (idx < 0) - tlb_write_random(); - else - tlb_write_indexed(); - tlbw_use_hazard(); - - kvm_debug("@ %#lx idx: %2d [entryhi(R): %#lx] entrylo0(R): 0x%08lx, entrylo1(R): 0x%08lx\n", - vcpu->arch.pc, idx, read_c0_entryhi(), - read_c0_entrylo0(), read_c0_entrylo1()); - - /* Flush D-cache */ - if (flush_dcache_mask) { - if (entrylo0 & ENTRYLO_V) { - ++vcpu->stat.flush_dcache_exits; - flush_data_cache_page((entryhi & VPN2_MASK) & - ~flush_dcache_mask); - } - if (entrylo1 & ENTRYLO_V) { - ++vcpu->stat.flush_dcache_exits; - flush_data_cache_page(((entryhi & VPN2_MASK) & - ~flush_dcache_mask) | - (0x1 << PAGE_SHIFT)); - } - } - - /* Restore old ASID */ - write_c0_entryhi(old_entryhi); - mtc0_tlbw_hazard(); - local_irq_restore(flags); - return 0; -} -EXPORT_SYMBOL_GPL(kvm_mips_host_tlb_write); - int kvm_mips_handle_commpage_tlb_fault(unsigned long badvaddr, struct kvm_vcpu *vcpu) { -- cgit v1.2.3 From 4c86460cbc03f32c3649f7aaf9104e6e65c88a61 Mon Sep 17 00:00:00 2001 From: James Hogan Date: Sat, 8 Oct 2016 01:16:21 +0100 Subject: KVM: MIPS/MMU: Convert commpage fault handling to page tables MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Now that we have GVA page tables and an optimised TLB refill handler in place, convert the handling of commpage faults from the guest kernel to fill the GVA page table and invalidate the TLB entry, rather than filling the wired TLB entry directly. For simplicity we no longer use a wired entry for the commpage (refill should be much cheaper with the fast-path handler anyway). Since we don't need to manipulate the TLB directly any longer, move the function from tlb.c to mmu.c. This puts it closer to the similar functions handling KSeg0 and TLB mapped page faults from the guest. 
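In outline, the commpage path now follows the same three steps as the KSeg0 and TLB-mapped conversions earlier in the series (a condensed sketch of the hunk below, error reporting omitted):

/* 1) find (or allocate) the GVA page table entry for the faulting address */
ptep = kvm_trap_emul_pte_for_gva(vcpu, badvaddr);
if (!ptep)
        return -1;

/* 2) point it at the host page backing the commpage, pre-marked young and
 *    dirty so the fast refill handler never has to upgrade it
 */
pfn = PFN_DOWN(virt_to_phys(vcpu->arch.kseg0_commpage));
*ptep = pte_mkyoung(pte_mkdirty(pfn_pte(pfn, PAGE_SHARED)));

/* 3) drop any stale host TLB entry for the guest kernel ASID; the fast
 *    refill handler reloads the mapping from the GVA page table
 */
kvm_mips_host_tlb_inv(vcpu, badvaddr, false, true);
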
Signed-off-by: James Hogan Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: Ralf Baechle Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org --- arch/mips/include/asm/kvm_host.h | 3 --- arch/mips/kvm/mips.c | 34 +------------------------------ arch/mips/kvm/mmu.c | 21 +++++++++++++++++++ arch/mips/kvm/tlb.c | 44 ---------------------------------------- 4 files changed, 22 insertions(+), 80 deletions(-) diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h index f7680999e28a..e38e11184c1c 100644 --- a/arch/mips/include/asm/kvm_host.h +++ b/arch/mips/include/asm/kvm_host.h @@ -159,9 +159,6 @@ struct kvm_arch { /* Guest GVA->HPA page table */ unsigned long *guest_pmap; unsigned long guest_pmap_npages; - - /* Wired host TLB used for the commpage */ - int commpage_tlb; }; #define N_MIPS_COPROC_REGS 32 diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c index c369fdd19fbc..c4fc52e39f34 100644 --- a/arch/mips/kvm/mips.c +++ b/arch/mips/kvm/mips.c @@ -92,28 +92,10 @@ void kvm_arch_check_processor_compat(void *rtn) *(int *)rtn = 0; } -static void kvm_mips_init_tlbs(struct kvm *kvm) -{ - unsigned long wired; - - /* - * Add a wired entry to the TLB, it is used to map the commpage to - * the Guest kernel - */ - wired = read_c0_wired(); - write_c0_wired(wired + 1); - mtc0_tlbw_hazard(); - kvm->arch.commpage_tlb = wired; - - kvm_debug("[%d] commpage TLB: %d\n", smp_processor_id(), - kvm->arch.commpage_tlb); -} - static void kvm_mips_init_vm_percpu(void *arg) { struct kvm *kvm = (struct kvm *)arg; - kvm_mips_init_tlbs(kvm); kvm_mips_callbacks->vm_init(kvm); } @@ -165,25 +147,11 @@ void kvm_mips_free_vcpus(struct kvm *kvm) mutex_unlock(&kvm->lock); } -static void kvm_mips_uninit_tlbs(void *arg) -{ - /* Restore wired count */ - write_c0_wired(0); - mtc0_tlbw_hazard(); - /* Clear out all the TLBs */ - kvm_local_flush_tlb_all(); -} - void kvm_arch_destroy_vm(struct kvm *kvm) { kvm_mips_free_vcpus(kvm); - /* If this is the last instance, restore wired count */ - if (atomic_dec_return(&kvm_mips_instance) == 0) { - kvm_debug("%s: last KVM instance, restoring TLB parameters\n", - __func__); - on_each_cpu(kvm_mips_uninit_tlbs, NULL, 1); - } + atomic_dec(&kvm_mips_instance); } long kvm_arch_dev_ioctl(struct file *filp, unsigned int ioctl, diff --git a/arch/mips/kvm/mmu.c b/arch/mips/kvm/mmu.c index 62122d297e52..98f1a7715a68 100644 --- a/arch/mips/kvm/mmu.c +++ b/arch/mips/kvm/mmu.c @@ -449,6 +449,27 @@ int kvm_mips_handle_mapped_seg_tlb_fault(struct kvm_vcpu *vcpu, return 0; } +int kvm_mips_handle_commpage_tlb_fault(unsigned long badvaddr, + struct kvm_vcpu *vcpu) +{ + kvm_pfn_t pfn; + pte_t *ptep; + + ptep = kvm_trap_emul_pte_for_gva(vcpu, badvaddr); + if (!ptep) { + kvm_err("No ptep for commpage %lx\n", badvaddr); + return -1; + } + + pfn = PFN_DOWN(virt_to_phys(vcpu->arch.kseg0_commpage)); + /* Also set valid and dirty, so refill handler doesn't have to */ + *ptep = pte_mkyoung(pte_mkdirty(pfn_pte(pfn, PAGE_SHARED))); + + /* Invalidate this entry in the TLB, guest kernel ASID only */ + kvm_mips_host_tlb_inv(vcpu, badvaddr, false, true); + return 0; +} + void kvm_get_new_mmu_context(struct mm_struct *mm, unsigned long cpu, struct kvm_vcpu *vcpu) { diff --git a/arch/mips/kvm/tlb.c b/arch/mips/kvm/tlb.c index 2fb76869d017..919252662d5a 100644 --- a/arch/mips/kvm/tlb.c +++ b/arch/mips/kvm/tlb.c @@ -52,11 +52,6 @@ static u32 kvm_mips_get_user_asid(struct kvm_vcpu *vcpu) return cpu_asid(cpu, user_mm); } -inline u32 kvm_mips_get_commpage_asid(struct kvm_vcpu *vcpu) -{ - return 
vcpu->kvm->arch.commpage_tlb; -} - /* Structure defining an tlb entry data set. */ void kvm_mips_dump_host_tlbs(void) @@ -104,45 +99,6 @@ void kvm_mips_dump_guest_tlbs(struct kvm_vcpu *vcpu) } EXPORT_SYMBOL_GPL(kvm_mips_dump_guest_tlbs); -int kvm_mips_handle_commpage_tlb_fault(unsigned long badvaddr, - struct kvm_vcpu *vcpu) -{ - kvm_pfn_t pfn; - unsigned long flags, old_entryhi = 0, vaddr = 0; - unsigned long entrylo[2] = { 0, 0 }; - unsigned int pair_idx; - - pfn = PFN_DOWN(virt_to_phys(vcpu->arch.kseg0_commpage)); - pair_idx = (badvaddr >> PAGE_SHIFT) & 1; - entrylo[pair_idx] = mips3_paddr_to_tlbpfn(pfn << PAGE_SHIFT) | - ((_page_cachable_default >> _CACHE_SHIFT) << ENTRYLO_C_SHIFT) | - ENTRYLO_D | ENTRYLO_V; - - local_irq_save(flags); - - old_entryhi = read_c0_entryhi(); - vaddr = badvaddr & (PAGE_MASK << 1); - write_c0_entryhi(vaddr | kvm_mips_get_kernel_asid(vcpu)); - write_c0_entrylo0(entrylo[0]); - write_c0_entrylo1(entrylo[1]); - write_c0_index(kvm_mips_get_commpage_asid(vcpu)); - mtc0_tlbw_hazard(); - tlb_write_indexed(); - tlbw_use_hazard(); - - kvm_debug("@ %#lx idx: %2d [entryhi(R): %#lx] entrylo0 (R): 0x%08lx, entrylo1(R): 0x%08lx\n", - vcpu->arch.pc, read_c0_index(), read_c0_entryhi(), - read_c0_entrylo0(), read_c0_entrylo1()); - - /* Restore old ASID */ - write_c0_entryhi(old_entryhi); - mtc0_tlbw_hazard(); - local_irq_restore(flags); - - return 0; -} -EXPORT_SYMBOL_GPL(kvm_mips_handle_commpage_tlb_fault); - int kvm_mips_guest_tlb_lookup(struct kvm_vcpu *vcpu, unsigned long entryhi) { int i; -- cgit v1.2.3 From 7a156e9f822d2eb6c294226aea2a4c12c05caa10 Mon Sep 17 00:00:00 2001 From: James Hogan Date: Wed, 16 Nov 2016 17:36:47 +0000 Subject: KVM: MIPS: Drop vm_init() callback MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Now that the commpage doesn't use wired TLB entries, the per-CPU vm_init() callback is the only work done by kvm_mips_init_vm_percpu(). The trap & emulate implementation doesn't actually need to do anything from vm_init(), and the future VZ implementation would be better served by a kvm_arch_hardware_enable callback anyway. Therefore drop the vm_init() callback entirely, allowing the kvm_mips_init_vm_percpu() function to also be dropped, along with the kvm_mips_instance atomic counter. 
Signed-off-by: James Hogan Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: Ralf Baechle Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org --- arch/mips/include/asm/kvm_host.h | 3 --- arch/mips/kvm/mips.c | 16 ---------------- arch/mips/kvm/tlb.c | 3 --- arch/mips/kvm/trap_emul.c | 6 ------ 4 files changed, 28 deletions(-) diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h index e38e11184c1c..95c86dab9b1b 100644 --- a/arch/mips/include/asm/kvm_host.h +++ b/arch/mips/include/asm/kvm_host.h @@ -121,8 +121,6 @@ static inline bool kvm_is_error_hva(unsigned long addr) return IS_ERR_VALUE(addr); } -extern atomic_t kvm_mips_instance; - struct kvm_vm_stat { ulong remote_tlb_flush; }; @@ -528,7 +526,6 @@ struct kvm_mips_callbacks { int (*handle_msa_fpe)(struct kvm_vcpu *vcpu); int (*handle_fpe)(struct kvm_vcpu *vcpu); int (*handle_msa_disabled)(struct kvm_vcpu *vcpu); - int (*vm_init)(struct kvm *kvm); int (*vcpu_init)(struct kvm_vcpu *vcpu); void (*vcpu_uninit)(struct kvm_vcpu *vcpu); int (*vcpu_setup)(struct kvm_vcpu *vcpu); diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c index c4fc52e39f34..07ce10e3627a 100644 --- a/arch/mips/kvm/mips.c +++ b/arch/mips/kvm/mips.c @@ -92,22 +92,8 @@ void kvm_arch_check_processor_compat(void *rtn) *(int *)rtn = 0; } -static void kvm_mips_init_vm_percpu(void *arg) -{ - struct kvm *kvm = (struct kvm *)arg; - - kvm_mips_callbacks->vm_init(kvm); - -} - int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) { - if (atomic_inc_return(&kvm_mips_instance) == 1) { - kvm_debug("%s: 1st KVM instance, setup host TLB parameters\n", - __func__); - on_each_cpu(kvm_mips_init_vm_percpu, kvm, 1); - } - return 0; } @@ -150,8 +136,6 @@ void kvm_mips_free_vcpus(struct kvm *kvm) void kvm_arch_destroy_vm(struct kvm *kvm) { kvm_mips_free_vcpus(kvm); - - atomic_dec(&kvm_mips_instance); } long kvm_arch_dev_ioctl(struct file *filp, unsigned int ioctl, diff --git a/arch/mips/kvm/tlb.c b/arch/mips/kvm/tlb.c index 919252662d5a..8af5fd2cb107 100644 --- a/arch/mips/kvm/tlb.c +++ b/arch/mips/kvm/tlb.c @@ -33,9 +33,6 @@ #define KVM_GUEST_PC_TLB 0 #define KVM_GUEST_SP_TLB 1 -atomic_t kvm_mips_instance; -EXPORT_SYMBOL_GPL(kvm_mips_instance); - static u32 kvm_mips_get_kernel_asid(struct kvm_vcpu *vcpu) { struct mm_struct *kern_mm = &vcpu->arch.guest_kernel_mm; diff --git a/arch/mips/kvm/trap_emul.c b/arch/mips/kvm/trap_emul.c index 6a56e48f4bfa..8bb82eaa4c91 100644 --- a/arch/mips/kvm/trap_emul.c +++ b/arch/mips/kvm/trap_emul.c @@ -429,11 +429,6 @@ static int kvm_trap_emul_handle_msa_disabled(struct kvm_vcpu *vcpu) return ret; } -static int kvm_trap_emul_vm_init(struct kvm *kvm) -{ - return 0; -} - static int kvm_trap_emul_vcpu_init(struct kvm_vcpu *vcpu) { struct mm_struct *kern_mm = &vcpu->arch.guest_kernel_mm; @@ -847,7 +842,6 @@ static struct kvm_mips_callbacks kvm_trap_emul_callbacks = { .handle_fpe = kvm_trap_emul_handle_fpe, .handle_msa_disabled = kvm_trap_emul_handle_msa_disabled, - .vm_init = kvm_trap_emul_vm_init, .vcpu_init = kvm_trap_emul_vcpu_init, .vcpu_uninit = kvm_trap_emul_vcpu_uninit, .vcpu_setup = kvm_trap_emul_vcpu_setup, -- cgit v1.2.3 From dacc3ed1dd608ff9553dcede6cd05369030ed099 Mon Sep 17 00:00:00 2001 From: James Hogan Date: Fri, 19 Aug 2016 15:27:22 +0100 Subject: KVM: MIPS: Use uaccess to read/modify guest instructions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Now that we have GVA page tables, use standard user accesses with page faults disabled to read & modify guest instructions. 
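A minimal sketch of the resulting access pattern, assuming the usual uaccess helpers; the wrapper name is illustrative only and not part of the patch:

/* Illustrative only: fetch one guest instruction through the GVA page
 * tables with a standard user accessor.  The run loop wraps guest
 * execution in pagefault_disable()/pagefault_enable(), so an unmapped
 * guest address makes get_user() fail cleanly rather than invoking
 * Linux page faulting.
 */
static u32 fetch_guest_inst(u32 *opc)
{
        u32 inst;

        if (get_user(inst, opc))
                return KVM_INVALID_INST;
        return inst;
}
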
This should be more robust (than the rather dodgy method of accessing guest mapped segments by just directly addressing them) and will also work with Enhanced Virtual Addressing (EVA) host kernel configurations where dedicated instructions are needed for accessing user mode memory. For simplicity and speed we do this regardless of the guest segment the address resides in, rather than handling guest KSeg0 specially with kmap_atomic() as before. Signed-off-by: James Hogan Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: Ralf Baechle Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org --- arch/mips/include/asm/kvm_host.h | 2 -- arch/mips/kvm/dyntrans.c | 28 +++++---------- arch/mips/kvm/mmu.c | 77 +++------------------------------------- arch/mips/kvm/trap_emul.c | 9 +++++ 4 files changed, 22 insertions(+), 94 deletions(-) diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h index 95c86dab9b1b..a26504bee21c 100644 --- a/arch/mips/include/asm/kvm_host.h +++ b/arch/mips/include/asm/kvm_host.h @@ -639,8 +639,6 @@ void kvm_mips_flush_gva_pt(pgd_t *pgd, enum kvm_mips_flush flags); void kvm_mmu_free_memory_caches(struct kvm_vcpu *vcpu); void kvm_trap_emul_invalidate_gva(struct kvm_vcpu *vcpu, unsigned long addr, bool user); -extern unsigned long kvm_mips_translate_guest_kseg0_to_hpa(struct kvm_vcpu *vcpu, - unsigned long gva); extern void kvm_get_new_mmu_context(struct mm_struct *mm, unsigned long cpu, struct kvm_vcpu *vcpu); extern void kvm_local_flush_tlb_all(void); diff --git a/arch/mips/kvm/dyntrans.c b/arch/mips/kvm/dyntrans.c index 010cef240688..60ebf5862d2b 100644 --- a/arch/mips/kvm/dyntrans.c +++ b/arch/mips/kvm/dyntrans.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include @@ -29,28 +30,15 @@ static int kvm_mips_trans_replace(struct kvm_vcpu *vcpu, u32 *opc, union mips_instruction replace) { - unsigned long paddr, flags; - void *vaddr; - - if (KVM_GUEST_KSEGX((unsigned long)opc) == KVM_GUEST_KSEG0) { - paddr = kvm_mips_translate_guest_kseg0_to_hpa(vcpu, - (unsigned long)opc); - vaddr = kmap_atomic(pfn_to_page(PHYS_PFN(paddr))); - vaddr += paddr & ~PAGE_MASK; - memcpy(vaddr, (void *)&replace, sizeof(u32)); - local_flush_icache_range((unsigned long)vaddr, - (unsigned long)vaddr + 32); - kunmap_atomic(vaddr); - } else if (KVM_GUEST_KSEGX((unsigned long) opc) == KVM_GUEST_KSEG23) { - local_irq_save(flags); - memcpy((void *)opc, (void *)&replace, sizeof(u32)); - __local_flush_icache_user_range((unsigned long)opc, - (unsigned long)opc + 32); - local_irq_restore(flags); - } else { + unsigned long vaddr = (unsigned long)opc; + int err; + + err = put_user(replace.word, opc); + if (unlikely(err)) { kvm_err("%s: Invalid address: %p\n", __func__, opc); - return -EFAULT; + return err; } + __local_flush_icache_user_range(vaddr, vaddr + 4); return 0; } diff --git a/arch/mips/kvm/mmu.c b/arch/mips/kvm/mmu.c index 98f1a7715a68..c4e9c65065ea 100644 --- a/arch/mips/kvm/mmu.c +++ b/arch/mips/kvm/mmu.c @@ -11,6 +11,7 @@ #include #include +#include #include #include @@ -134,34 +135,6 @@ out: return err; } -/* Translate guest KSEG0 addresses to Host PA */ -unsigned long kvm_mips_translate_guest_kseg0_to_hpa(struct kvm_vcpu *vcpu, - unsigned long gva) -{ - gfn_t gfn; - unsigned long offset = gva & ~PAGE_MASK; - struct kvm *kvm = vcpu->kvm; - - if (KVM_GUEST_KSEGX(gva) != KVM_GUEST_KSEG0) { - kvm_err("%s/%p: Invalid gva: %#lx\n", __func__, - __builtin_return_address(0), gva); - return KVM_INVALID_PAGE; - } - - gfn = (KVM_GUEST_CPHYSADDR(gva) >> PAGE_SHIFT); - - 
if (gfn >= kvm->arch.guest_pmap_npages) { - kvm_err("%s: Invalid gfn: %#llx, GVA: %#lx\n", __func__, gfn, - gva); - return KVM_INVALID_PAGE; - } - - if (kvm_mips_map_page(vcpu->kvm, gfn) < 0) - return KVM_INVALID_ADDR; - - return (kvm->arch.guest_pmap[gfn] << PAGE_SHIFT) + offset; -} - static pte_t *kvm_trap_emul_pte_for_gva(struct kvm_vcpu *vcpu, unsigned long addr) { @@ -551,51 +524,11 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) u32 kvm_get_inst(u32 *opc, struct kvm_vcpu *vcpu) { - struct mips_coproc *cop0 = vcpu->arch.cop0; - unsigned long paddr, flags, vpn2, asid; - unsigned long va = (unsigned long)opc; - void *vaddr; u32 inst; - int index; - - if (KVM_GUEST_KSEGX(va) < KVM_GUEST_KSEG0 || - KVM_GUEST_KSEGX(va) == KVM_GUEST_KSEG23) { - local_irq_save(flags); - index = kvm_mips_host_tlb_lookup(vcpu, va); - if (index >= 0) { - inst = *(opc); - } else { - vpn2 = va & VPN2_MASK; - asid = kvm_read_c0_guest_entryhi(cop0) & - KVM_ENTRYHI_ASID; - index = kvm_mips_guest_tlb_lookup(vcpu, vpn2 | asid); - if (index < 0) { - kvm_err("%s: get_user_failed for %p, vcpu: %p, ASID: %#lx\n", - __func__, opc, vcpu, read_c0_entryhi()); - kvm_mips_dump_host_tlbs(); - kvm_mips_dump_guest_tlbs(vcpu); - local_irq_restore(flags); - return KVM_INVALID_INST; - } - if (kvm_mips_handle_mapped_seg_tlb_fault(vcpu, - &vcpu->arch.guest_tlb[index], va)) { - kvm_err("%s: handling mapped seg tlb fault failed for %p, index: %u, vcpu: %p, ASID: %#lx\n", - __func__, opc, index, vcpu, - read_c0_entryhi()); - kvm_mips_dump_guest_tlbs(vcpu); - local_irq_restore(flags); - return KVM_INVALID_INST; - } - inst = *(opc); - } - local_irq_restore(flags); - } else if (KVM_GUEST_KSEGX(va) == KVM_GUEST_KSEG0) { - paddr = kvm_mips_translate_guest_kseg0_to_hpa(vcpu, va); - vaddr = kmap_atomic(pfn_to_page(PHYS_PFN(paddr))); - vaddr += paddr & ~PAGE_MASK; - inst = *(u32 *)vaddr; - kunmap_atomic(vaddr); - } else { + int err; + + err = get_user(inst, opc); + if (unlikely(err)) { kvm_err("%s: illegal address: %p\n", __func__, opc); return KVM_INVALID_INST; } diff --git a/arch/mips/kvm/trap_emul.c b/arch/mips/kvm/trap_emul.c index 8bb82eaa4c91..ee8b5ad8c7c5 100644 --- a/arch/mips/kvm/trap_emul.c +++ b/arch/mips/kvm/trap_emul.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include @@ -798,6 +799,12 @@ static int kvm_trap_emul_vcpu_run(struct kvm_run *run, struct kvm_vcpu *vcpu) kvm_trap_emul_vcpu_reenter(run, vcpu); + /* + * We use user accessors to access guest memory, but we don't want to + * invoke Linux page faulting. + */ + pagefault_disable(); + /* Disable hardware page table walking while in guest */ htw_stop(); @@ -823,6 +830,8 @@ static int kvm_trap_emul_vcpu_run(struct kvm_run *run, struct kvm_vcpu *vcpu) htw_start(); + pagefault_enable(); + return r; } -- cgit v1.2.3 From 8af0e3c2e89e56dc5b064e5854b87a19e70e2710 Mon Sep 17 00:00:00 2001 From: James Hogan Date: Mon, 17 Oct 2016 16:37:45 +0100 Subject: KVM: MIPS/Emulate: Fix CACHE emulation for EVA hosts MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Use protected_writeback_dcache_line() instead of flush_dcache_line(), and protected_flush_icache_line() instead of flush_icache_line(), so that CACHEE (the EVA variant) is used on EVA host kernels. Without this, guest floating point branch delay slot emulation via a trampoline on the user stack fails on EVA host kernels due to failure of the icache sync, resulting in the break instruction getting skipped and execution from the stack. 
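In sketch form, the CACHE emulation path ends up using the EVA-aware primitives for both flavours of hit-invalidate (condensed from the hunk below; comments editorial):

if (op_inst == Hit_Writeback_Inv_D || op_inst == Hit_Invalidate_D) {
        protected_writeback_dcache_line(va);    /* CACHEE variant on EVA hosts */
} else if (op_inst == Hit_Invalidate_I) {
        protected_writeback_dcache_line(va);
        protected_flush_icache_line(va);        /* keeps the I-cache sync working */
}
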
Signed-off-by: James Hogan Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: Ralf Baechle Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org --- arch/mips/kvm/emulate.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/mips/kvm/emulate.c b/arch/mips/kvm/emulate.c index 3ced662e012e..9ac8e45017ce 100644 --- a/arch/mips/kvm/emulate.c +++ b/arch/mips/kvm/emulate.c @@ -1792,7 +1792,7 @@ enum emulation_result kvm_mips_emulate_cache(union mips_instruction inst, skip_fault: /* XXXKYMA: Only a subset of cache ops are supported, used by Linux */ if (op_inst == Hit_Writeback_Inv_D || op_inst == Hit_Invalidate_D) { - flush_dcache_line(va); + protected_writeback_dcache_line(va); #ifdef CONFIG_KVM_MIPS_DYN_TRANS /* @@ -1802,8 +1802,8 @@ skip_fault: kvm_mips_trans_cache_va(inst, opc, vcpu); #endif } else if (op_inst == Hit_Invalidate_I) { - flush_dcache_line(va); - flush_icache_line(va); + protected_writeback_dcache_line(va); + protected_flush_icache_line(va); #ifdef CONFIG_KVM_MIPS_DYN_TRANS /* Replace the CACHE instruction, with a SYNCI */ -- cgit v1.2.3 From 49ec508e3bd0b11aaf534af0d63e4a17e05594e4 Mon Sep 17 00:00:00 2001 From: James Hogan Date: Fri, 7 Oct 2016 22:32:13 +0100 Subject: KVM: MIPS/TLB: Drop kvm_local_flush_tlb_all() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Now that KVM no longer uses wired entries we can safely use local_flush_tlb_all() when we need to flush the entire TLB (on the start of a new ASID cycle). This doesn't flush wired entries, which allows other code to use them without KVM clobbering them all the time. It also is more up to date, knowing about the tlbinv architectural feature, flushing of micro TLB on cores where that is necessary (Loongson I believe), and knows to stop the HTW while doing so. 
Signed-off-by: James Hogan Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: Ralf Baechle Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org --- arch/mips/include/asm/kvm_host.h | 1 - arch/mips/include/asm/mmu_context.h | 5 ----- arch/mips/kvm/mmu.c | 2 +- arch/mips/kvm/tlb.c | 29 ----------------------------- 4 files changed, 1 insertion(+), 36 deletions(-) diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h index a26504bee21c..1a83b6f85de2 100644 --- a/arch/mips/include/asm/kvm_host.h +++ b/arch/mips/include/asm/kvm_host.h @@ -641,7 +641,6 @@ void kvm_trap_emul_invalidate_gva(struct kvm_vcpu *vcpu, unsigned long addr, bool user); extern void kvm_get_new_mmu_context(struct mm_struct *mm, unsigned long cpu, struct kvm_vcpu *vcpu); -extern void kvm_local_flush_tlb_all(void); extern void kvm_mips_alloc_new_mmu_context(struct kvm_vcpu *vcpu); extern void kvm_mips_vcpu_load(struct kvm_vcpu *vcpu, int cpu); extern void kvm_mips_vcpu_put(struct kvm_vcpu *vcpu); diff --git a/arch/mips/include/asm/mmu_context.h b/arch/mips/include/asm/mmu_context.h index 16eb8521398e..2abf94f72c0a 100644 --- a/arch/mips/include/asm/mmu_context.h +++ b/arch/mips/include/asm/mmu_context.h @@ -99,17 +99,12 @@ static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk) static inline void get_new_mmu_context(struct mm_struct *mm, unsigned long cpu) { - extern void kvm_local_flush_tlb_all(void); unsigned long asid = asid_cache(cpu); if (!((asid += cpu_asid_inc()) & cpu_asid_mask(&cpu_data[cpu]))) { if (cpu_has_vtag_icache) flush_icache_all(); -#ifdef CONFIG_KVM - kvm_local_flush_tlb_all(); /* start new asid cycle */ -#else local_flush_tlb_all(); /* start new asid cycle */ -#endif if (!asid) /* fix version if needed */ asid = asid_first_version(cpu); } diff --git a/arch/mips/kvm/mmu.c b/arch/mips/kvm/mmu.c index c4e9c65065ea..cf832ea963d8 100644 --- a/arch/mips/kvm/mmu.c +++ b/arch/mips/kvm/mmu.c @@ -453,7 +453,7 @@ void kvm_get_new_mmu_context(struct mm_struct *mm, unsigned long cpu, if (cpu_has_vtag_icache) flush_icache_all(); - kvm_local_flush_tlb_all(); /* start new asid cycle */ + local_flush_tlb_all(); /* start new asid cycle */ if (!asid) /* fix version if needed */ asid = asid_first_version(cpu); diff --git a/arch/mips/kvm/tlb.c b/arch/mips/kvm/tlb.c index 8af5fd2cb107..51f4aee717e7 100644 --- a/arch/mips/kvm/tlb.c +++ b/arch/mips/kvm/tlb.c @@ -263,35 +263,6 @@ void kvm_mips_flush_host_tlb(int skip_kseg0) } EXPORT_SYMBOL_GPL(kvm_mips_flush_host_tlb); -void kvm_local_flush_tlb_all(void) -{ - unsigned long flags; - unsigned long old_ctx; - int entry = 0; - - local_irq_save(flags); - /* Save old context and create impossible VPN2 value */ - old_ctx = read_c0_entryhi(); - write_c0_entrylo0(0); - write_c0_entrylo1(0); - - /* Blast 'em all away. */ - while (entry < current_cpu_data.tlbsize) { - /* Make sure all entries differ. */ - write_c0_entryhi(UNIQUE_ENTRYHI(entry)); - write_c0_index(entry); - mtc0_tlbw_hazard(); - tlb_write_indexed(); - tlbw_use_hazard(); - entry++; - } - write_c0_entryhi(old_ctx); - mtc0_tlbw_hazard(); - - local_irq_restore(flags); -} -EXPORT_SYMBOL_GPL(kvm_local_flush_tlb_all); - /** * kvm_mips_suspend_mm() - Suspend the active mm. * @cpu The CPU we're running on. 
-- cgit v1.2.3 From 7071a8859bb36d34b8f766275b239e98760f010c Mon Sep 17 00:00:00 2001 From: James Hogan Date: Tue, 13 Sep 2016 12:58:08 +0100 Subject: KVM: MIPS/Emulate: Drop redundant TLB flushes on exceptions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When exceptions are injected into the MIPS KVM guest, the whole host TLB is flushed (except any entries in the guest KSeg0 range). This is certainly not mandated by the architecture when exceptions are taken (userland can't directly change TLB mappings anyway), and is a pretty heavyweight operation: - There may be hundreds of TLB entries especially when a 512 entry FTLB is present. These are walked and read and conditionally invalidated, so the TLBINV feature can't be used either. - It'll indiscriminately wipe out entries belonging to other memory spaces. A simple ASID regeneration would be much faster to perform, although it'd wipe out the guest KSeg0 mappings too. My suspicion is that this was simply to plaster over the fact that kvm_mips_host_tlb_inv() incorrectly only invalidated TLB entries in the ASID for guest usermode, and not the ASID for guest kernelmode. Now that the recent commit "KVM: MIPS/TLB: Flush host TLB entry in kernel ASID" fixes kvm_mips_host_tlb_inv() to flush TLB entries in the kernelmode ASID when the guest TLB changes, lets drop these calls and the otherwise unused kvm_mips_flush_host_tlb(). Signed-off-by: James Hogan Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: Ralf Baechle Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org --- arch/mips/include/asm/kvm_host.h | 1 - arch/mips/kvm/emulate.c | 10 -------- arch/mips/kvm/tlb.c | 49 ---------------------------------------- 3 files changed, 60 deletions(-) diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h index 1a83b6f85de2..174857f146b1 100644 --- a/arch/mips/include/asm/kvm_host.h +++ b/arch/mips/include/asm/kvm_host.h @@ -608,7 +608,6 @@ extern enum emulation_result kvm_mips_handle_tlbmod(u32 cause, extern void kvm_mips_dump_host_tlbs(void); extern void kvm_mips_dump_guest_tlbs(struct kvm_vcpu *vcpu); -extern void kvm_mips_flush_host_tlb(int skip_kseg0); extern int kvm_mips_host_tlb_inv(struct kvm_vcpu *vcpu, unsigned long entryhi, bool user, bool kernel); diff --git a/arch/mips/kvm/emulate.c b/arch/mips/kvm/emulate.c index 9ac8e45017ce..cd11d787d9dc 100644 --- a/arch/mips/kvm/emulate.c +++ b/arch/mips/kvm/emulate.c @@ -1968,8 +1968,6 @@ enum emulation_result kvm_mips_emulate_tlbmiss_ld(u32 cause, kvm_write_c0_guest_badvaddr(cop0, vcpu->arch.host_cp0_badvaddr); /* XXXKYMA: is the context register used by linux??? */ kvm_write_c0_guest_entryhi(cop0, entryhi); - /* Blow away the shadow host TLBs */ - kvm_mips_flush_host_tlb(1); return EMULATE_DONE; } @@ -2014,8 +2012,6 @@ enum emulation_result kvm_mips_emulate_tlbinv_ld(u32 cause, kvm_write_c0_guest_badvaddr(cop0, vcpu->arch.host_cp0_badvaddr); /* XXXKYMA: is the context register used by linux??? */ kvm_write_c0_guest_entryhi(cop0, entryhi); - /* Blow away the shadow host TLBs */ - kvm_mips_flush_host_tlb(1); return EMULATE_DONE; } @@ -2058,8 +2054,6 @@ enum emulation_result kvm_mips_emulate_tlbmiss_st(u32 cause, kvm_write_c0_guest_badvaddr(cop0, vcpu->arch.host_cp0_badvaddr); /* XXXKYMA: is the context register used by linux??? 
*/ kvm_write_c0_guest_entryhi(cop0, entryhi); - /* Blow away the shadow host TLBs */ - kvm_mips_flush_host_tlb(1); return EMULATE_DONE; } @@ -2102,8 +2096,6 @@ enum emulation_result kvm_mips_emulate_tlbinv_st(u32 cause, kvm_write_c0_guest_badvaddr(cop0, vcpu->arch.host_cp0_badvaddr); /* XXXKYMA: is the context register used by linux??? */ kvm_write_c0_guest_entryhi(cop0, entryhi); - /* Blow away the shadow host TLBs */ - kvm_mips_flush_host_tlb(1); return EMULATE_DONE; } @@ -2176,8 +2168,6 @@ enum emulation_result kvm_mips_emulate_tlbmod(u32 cause, kvm_write_c0_guest_badvaddr(cop0, vcpu->arch.host_cp0_badvaddr); /* XXXKYMA: is the context register used by linux??? */ kvm_write_c0_guest_entryhi(cop0, entryhi); - /* Blow away the shadow host TLBs */ - kvm_mips_flush_host_tlb(1); return EMULATE_DONE; } diff --git a/arch/mips/kvm/tlb.c b/arch/mips/kvm/tlb.c index 51f4aee717e7..cee2e9feb942 100644 --- a/arch/mips/kvm/tlb.c +++ b/arch/mips/kvm/tlb.c @@ -214,55 +214,6 @@ int kvm_mips_host_tlb_inv(struct kvm_vcpu *vcpu, unsigned long va, } EXPORT_SYMBOL_GPL(kvm_mips_host_tlb_inv); -void kvm_mips_flush_host_tlb(int skip_kseg0) -{ - unsigned long flags; - unsigned long old_entryhi, entryhi; - unsigned long old_pagemask; - int entry = 0; - int maxentry = current_cpu_data.tlbsize; - - local_irq_save(flags); - - old_entryhi = read_c0_entryhi(); - old_pagemask = read_c0_pagemask(); - - /* Blast 'em all away. */ - for (entry = 0; entry < maxentry; entry++) { - write_c0_index(entry); - - if (skip_kseg0) { - mtc0_tlbr_hazard(); - tlb_read(); - tlb_read_hazard(); - - entryhi = read_c0_entryhi(); - - /* Don't blow away guest kernel entries */ - if (KVM_GUEST_KSEGX(entryhi) == KVM_GUEST_KSEG0) - continue; - - write_c0_pagemask(old_pagemask); - } - - /* Make sure all entries differ. */ - write_c0_entryhi(UNIQUE_ENTRYHI(entry)); - write_c0_entrylo0(0); - write_c0_entrylo1(0); - mtc0_tlbw_hazard(); - - tlb_write_indexed(); - tlbw_use_hazard(); - } - - write_c0_entryhi(old_entryhi); - write_c0_pagemask(old_pagemask); - mtc0_tlbw_hazard(); - - local_irq_restore(flags); -} -EXPORT_SYMBOL_GPL(kvm_mips_flush_host_tlb); - /** * kvm_mips_suspend_mm() - Suspend the active mm. * @cpu The CPU we're running on. -- cgit v1.2.3 From a98dd7410370634496aa500abf90ec6336b494ae Mon Sep 17 00:00:00 2001 From: James Hogan Date: Fri, 7 Oct 2016 22:39:41 +0100 Subject: KVM: MIPS/MMU: Drop kvm_get_new_mmu_context() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit MIPS KVM uses its own variation of get_new_mmu_context() which takes an extra vcpu pointer (unused) and does exactly the same thing. Switch to just using get_new_mmu_context() directly and drop KVM's version of it as it doesn't really serve any purpose. The nearby declarations of kvm_mips_alloc_new_mmu_context(), kvm_mips_vcpu_load() and kvm_mips_vcpu_put() are also removed from kvm_host.h, as no definitions or users exist. 
Signed-off-by: James Hogan Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: Ralf Baechle Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org --- arch/mips/include/asm/kvm_host.h | 5 ----- arch/mips/kvm/emulate.c | 3 +-- arch/mips/kvm/mmu.c | 19 ------------------- arch/mips/kvm/trap_emul.c | 6 +++--- 4 files changed, 4 insertions(+), 29 deletions(-) diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h index 174857f146b1..1337abb18e2b 100644 --- a/arch/mips/include/asm/kvm_host.h +++ b/arch/mips/include/asm/kvm_host.h @@ -638,11 +638,6 @@ void kvm_mips_flush_gva_pt(pgd_t *pgd, enum kvm_mips_flush flags); void kvm_mmu_free_memory_caches(struct kvm_vcpu *vcpu); void kvm_trap_emul_invalidate_gva(struct kvm_vcpu *vcpu, unsigned long addr, bool user); -extern void kvm_get_new_mmu_context(struct mm_struct *mm, unsigned long cpu, - struct kvm_vcpu *vcpu); -extern void kvm_mips_alloc_new_mmu_context(struct kvm_vcpu *vcpu); -extern void kvm_mips_vcpu_load(struct kvm_vcpu *vcpu, int cpu); -extern void kvm_mips_vcpu_put(struct kvm_vcpu *vcpu); /* Emulation */ u32 kvm_get_inst(u32 *opc, struct kvm_vcpu *vcpu); diff --git a/arch/mips/kvm/emulate.c b/arch/mips/kvm/emulate.c index cd11d787d9dc..67ea39973b96 100644 --- a/arch/mips/kvm/emulate.c +++ b/arch/mips/kvm/emulate.c @@ -1198,8 +1198,7 @@ enum emulation_result kvm_mips_emulate_CP0(union mips_instruction inst, */ preempt_disable(); cpu = smp_processor_id(); - kvm_get_new_mmu_context(kern_mm, - cpu, vcpu); + get_new_mmu_context(kern_mm, cpu); for_each_possible_cpu(i) if (i != cpu) cpu_context(i, kern_mm) = 0; diff --git a/arch/mips/kvm/mmu.c b/arch/mips/kvm/mmu.c index cf832ea963d8..aab604e75d3b 100644 --- a/arch/mips/kvm/mmu.c +++ b/arch/mips/kvm/mmu.c @@ -443,25 +443,6 @@ int kvm_mips_handle_commpage_tlb_fault(unsigned long badvaddr, return 0; } -void kvm_get_new_mmu_context(struct mm_struct *mm, unsigned long cpu, - struct kvm_vcpu *vcpu) -{ - unsigned long asid = asid_cache(cpu); - - asid += cpu_asid_inc(); - if (!(asid & cpu_asid_mask(&cpu_data[cpu]))) { - if (cpu_has_vtag_icache) - flush_icache_all(); - - local_flush_tlb_all(); /* start new asid cycle */ - - if (!asid) /* fix version if needed */ - asid = asid_first_version(cpu); - } - - cpu_context(cpu, mm) = asid_cache(cpu) = asid; -} - /** * kvm_mips_migrate_count() - Migrate timer. * @vcpu: Virtual CPU. 
diff --git a/arch/mips/kvm/trap_emul.c b/arch/mips/kvm/trap_emul.c index ee8b5ad8c7c5..653850c05b33 100644 --- a/arch/mips/kvm/trap_emul.c +++ b/arch/mips/kvm/trap_emul.c @@ -706,7 +706,7 @@ static int kvm_trap_emul_vcpu_load(struct kvm_vcpu *vcpu, int cpu) if ((cpu_context(cpu, kern_mm) ^ asid_cache(cpu)) & asid_version_mask(cpu)) { - kvm_get_new_mmu_context(kern_mm, cpu, vcpu); + get_new_mmu_context(kern_mm, cpu); kvm_debug("[%d]: cpu_context: %#lx\n", cpu, cpu_context(cpu, current->mm)); @@ -716,7 +716,7 @@ static int kvm_trap_emul_vcpu_load(struct kvm_vcpu *vcpu, int cpu) if ((cpu_context(cpu, user_mm) ^ asid_cache(cpu)) & asid_version_mask(cpu)) { - kvm_get_new_mmu_context(user_mm, cpu, vcpu); + get_new_mmu_context(user_mm, cpu); kvm_debug("[%d]: cpu_context: %#lx\n", cpu, cpu_context(cpu, current->mm)); @@ -779,7 +779,7 @@ static void kvm_trap_emul_vcpu_reenter(struct kvm_run *run, gasid = kvm_read_c0_guest_entryhi(cop0) & KVM_ENTRYHI_ASID; if (gasid != vcpu->arch.last_user_gasid) { kvm_mips_flush_gva_pt(user_mm->pgd, KMF_USER); - kvm_get_new_mmu_context(user_mm, cpu, vcpu); + get_new_mmu_context(user_mm, cpu); for_each_possible_cpu(i) if (i != cpu) cpu_context(i, user_mm) = 0; -- cgit v1.2.3 From a1ecc54d7ea629538116351a3ccc7d86bb9a3c69 Mon Sep 17 00:00:00 2001 From: James Hogan Date: Mon, 28 Nov 2016 18:39:24 +0000 Subject: KVM: MIPS/T&E: Don't treat code fetch faults as MMIO MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In order to make use of the CP0_BadInstr & CP0_BadInstrP registers we need to be a bit more careful not to treat code fetch faults as MMIO, lest we hit an UNPREDICTABLE register value when we try to emulate the MMIO load instruction but there was no valid instruction word available to the hardware. Add a kvm_is_ifetch_fault() helper to try to figure out whether a load fault was due to a code fetch, and prevent MMIO instruction emulation in that case. Signed-off-by: James Hogan Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: Ralf Baechle Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org --- arch/mips/include/asm/kvm_host.h | 27 +++++++++++++++++++++++++++ arch/mips/kvm/trap_emul.c | 12 ++++++++++++ 2 files changed, 39 insertions(+) diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h index 1337abb18e2b..6f68f7545b66 100644 --- a/arch/mips/include/asm/kvm_host.h +++ b/arch/mips/include/asm/kvm_host.h @@ -643,6 +643,33 @@ void kvm_trap_emul_invalidate_gva(struct kvm_vcpu *vcpu, unsigned long addr, u32 kvm_get_inst(u32 *opc, struct kvm_vcpu *vcpu); enum emulation_result update_pc(struct kvm_vcpu *vcpu, u32 cause); +/** + * kvm_is_ifetch_fault() - Find whether a TLBL exception is due to ifetch fault. + * @vcpu: Virtual CPU. + * + * Returns: Whether the TLBL exception was likely due to an instruction + * fetch fault rather than a data load fault. + */ +static inline bool kvm_is_ifetch_fault(struct kvm_vcpu_arch *vcpu) +{ + unsigned long badvaddr = vcpu->host_cp0_badvaddr; + unsigned long epc = msk_isa16_mode(vcpu->pc); + u32 cause = vcpu->host_cp0_cause; + + if (epc == badvaddr) + return true; + + /* + * Branches may be 32-bit or 16-bit instructions. + * This isn't exact, but we don't really support MIPS16 or microMIPS yet + * in KVM anyway. 
+ */ + if ((cause & CAUSEF_BD) && badvaddr - epc <= 4) + return true; + + return false; +} + extern enum emulation_result kvm_mips_emulate_inst(u32 cause, u32 *opc, struct kvm_run *run, diff --git a/arch/mips/kvm/trap_emul.c b/arch/mips/kvm/trap_emul.c index 653850c05b33..ccd56b3ce84b 100644 --- a/arch/mips/kvm/trap_emul.c +++ b/arch/mips/kvm/trap_emul.c @@ -178,6 +178,12 @@ static int kvm_trap_emul_handle_tlb_miss(struct kvm_vcpu *vcpu, bool store) } } else if (KVM_GUEST_KERNEL_MODE(vcpu) && (KSEGX(badvaddr) == CKSEG0 || KSEGX(badvaddr) == CKSEG1)) { + /* A code fetch fault doesn't count as an MMIO */ + if (!store && kvm_is_ifetch_fault(&vcpu->arch)) { + run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + return RESUME_HOST; + } + /* * With EVA we may get a TLB exception instead of an address * error when the guest performs MMIO to KSeg1 addresses. @@ -255,6 +261,12 @@ static int kvm_trap_emul_handle_addr_err_ld(struct kvm_vcpu *vcpu) int ret = RESUME_GUEST; if (KSEGX(badvaddr) == CKSEG0 || KSEGX(badvaddr) == CKSEG1) { + /* A code fetch fault doesn't count as an MMIO */ + if (kvm_is_ifetch_fault(&vcpu->arch)) { + run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + return RESUME_HOST; + } + kvm_debug("Emulate Load from MMIO space @ %#lx\n", badvaddr); er = kvm_mips_emulate_inst(cause, opc, run, vcpu); if (er == EMULATE_FAIL) { -- cgit v1.2.3 From 122e51d47418f74a69a93bf02f5535d11ff75bf5 Mon Sep 17 00:00:00 2001 From: James Hogan Date: Mon, 28 Nov 2016 17:23:14 +0000 Subject: KVM: MIPS: Improve kvm_get_inst() error return MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently kvm_get_inst() returns KVM_INVALID_INST in the event of a fault reading the guest instruction. This has the rather arbitrary magic value 0xdeadbeef. This API isn't very robust, and in fact 0xdeadbeef is a valid MIPS64 instruction encoding, namely "ld t1,-16657(s5)". Therefore change the kvm_get_inst() API to return 0 or -EFAULT, and to return the instruction via a u32 *out argument. We can then drop the KVM_INVALID_INST definition entirely. Signed-off-by: James Hogan Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: Ralf Baechle Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org --- arch/mips/include/asm/kvm_host.h | 3 +- arch/mips/kvm/emulate.c | 90 ++++++++++++++++++++-------------------- arch/mips/kvm/mips.c | 7 +++- arch/mips/kvm/mmu.c | 9 ++-- 4 files changed, 56 insertions(+), 53 deletions(-) diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h index 6f68f7545b66..f296ebeda9e3 100644 --- a/arch/mips/include/asm/kvm_host.h +++ b/arch/mips/include/asm/kvm_host.h @@ -104,7 +104,6 @@ #define KVM_GUEST_KSEG23ADDR(a) (KVM_GUEST_CPHYSADDR(a) | KVM_GUEST_KSEG23) #define KVM_INVALID_PAGE 0xdeadbeef -#define KVM_INVALID_INST 0xdeadbeef #define KVM_INVALID_ADDR 0xdeadbeef /* @@ -640,7 +639,7 @@ void kvm_trap_emul_invalidate_gva(struct kvm_vcpu *vcpu, unsigned long addr, bool user); /* Emulation */ -u32 kvm_get_inst(u32 *opc, struct kvm_vcpu *vcpu); +int kvm_get_inst(u32 *opc, struct kvm_vcpu *vcpu, u32 *out); enum emulation_result update_pc(struct kvm_vcpu *vcpu, u32 cause); /** diff --git a/arch/mips/kvm/emulate.c b/arch/mips/kvm/emulate.c index 67ea39973b96..b906fc0589f3 100644 --- a/arch/mips/kvm/emulate.c +++ b/arch/mips/kvm/emulate.c @@ -38,23 +38,25 @@ * Compute the return address and do emulate branch simulation, if required. * This function should be called only in branch delay slot active. 
*/ -unsigned long kvm_compute_return_epc(struct kvm_vcpu *vcpu, - unsigned long instpc) +static int kvm_compute_return_epc(struct kvm_vcpu *vcpu, unsigned long instpc, + unsigned long *out) { unsigned int dspcontrol; union mips_instruction insn; struct kvm_vcpu_arch *arch = &vcpu->arch; long epc = instpc; - long nextpc = KVM_INVALID_INST; + long nextpc; + int err; - if (epc & 3) - goto unaligned; + if (epc & 3) { + kvm_err("%s: unaligned epc\n", __func__); + return -EINVAL; + } /* Read the instruction */ - insn.word = kvm_get_inst((u32 *) epc, vcpu); - - if (insn.word == KVM_INVALID_INST) - return KVM_INVALID_INST; + err = kvm_get_inst((u32 *)epc, vcpu, &insn.word); + if (err) + return err; switch (insn.i_format.opcode) { /* jr and jalr are in r_format format. */ @@ -66,6 +68,8 @@ unsigned long kvm_compute_return_epc(struct kvm_vcpu *vcpu, case jr_op: nextpc = arch->gprs[insn.r_format.rs]; break; + default: + return -EINVAL; } break; @@ -114,8 +118,11 @@ unsigned long kvm_compute_return_epc(struct kvm_vcpu *vcpu, nextpc = epc; break; case bposge32_op: - if (!cpu_has_dsp) - goto sigill; + if (!cpu_has_dsp) { + kvm_err("%s: DSP branch but not DSP ASE\n", + __func__); + return -EINVAL; + } dspcontrol = rddsp(0x01); @@ -125,6 +132,8 @@ unsigned long kvm_compute_return_epc(struct kvm_vcpu *vcpu, epc += 8; nextpc = epc; break; + default: + return -EINVAL; } break; @@ -189,7 +198,7 @@ unsigned long kvm_compute_return_epc(struct kvm_vcpu *vcpu, /* And now the FPA/cp1 branch instructions. */ case cop1_op: kvm_err("%s: unsupported cop1_op\n", __func__); - break; + return -EINVAL; #ifdef CONFIG_CPU_MIPSR6 /* R6 added the following compact branches with forbidden slots */ @@ -198,19 +207,19 @@ unsigned long kvm_compute_return_epc(struct kvm_vcpu *vcpu, /* only rt == 0 isn't compact branch */ if (insn.i_format.rt != 0) goto compact_branch; - break; + return -EINVAL; case pop10_op: case pop30_op: /* only rs == rt == 0 is reserved, rest are compact branches */ if (insn.i_format.rs != 0 || insn.i_format.rt != 0) goto compact_branch; - break; + return -EINVAL; case pop66_op: case pop76_op: /* only rs == 0 isn't compact branch */ if (insn.i_format.rs != 0) goto compact_branch; - break; + return -EINVAL; compact_branch: /* * If we've hit an exception on the forbidden slot, then @@ -221,42 +230,32 @@ compact_branch: break; #else compact_branch: - /* Compact branches not supported before R6 */ - break; + /* Fall through - Compact branches not supported before R6 */ #endif + default: + return -EINVAL; } - return nextpc; - -unaligned: - kvm_err("%s: unaligned epc\n", __func__); - return nextpc; - -sigill: - kvm_err("%s: DSP branch but not DSP ASE\n", __func__); - return nextpc; + *out = nextpc; + return 0; } enum emulation_result update_pc(struct kvm_vcpu *vcpu, u32 cause) { - unsigned long branch_pc; - enum emulation_result er = EMULATE_DONE; + int err; if (cause & CAUSEF_BD) { - branch_pc = kvm_compute_return_epc(vcpu, vcpu->arch.pc); - if (branch_pc == KVM_INVALID_INST) { - er = EMULATE_FAIL; - } else { - vcpu->arch.pc = branch_pc; - kvm_debug("BD update_pc(): New PC: %#lx\n", - vcpu->arch.pc); - } - } else + err = kvm_compute_return_epc(vcpu, vcpu->arch.pc, + &vcpu->arch.pc); + if (err) + return EMULATE_FAIL; + } else { vcpu->arch.pc += 4; + } kvm_debug("update_pc(): New PC: %#lx\n", vcpu->arch.pc); - return er; + return EMULATE_DONE; } /** @@ -1835,12 +1834,14 @@ enum emulation_result kvm_mips_emulate_inst(u32 cause, u32 *opc, { union mips_instruction inst; enum emulation_result er = EMULATE_DONE; + int 
err; /* Fetch the instruction. */ if (cause & CAUSEF_BD) opc += 1; - - inst.word = kvm_get_inst(opc, vcpu); + err = kvm_get_inst(opc, vcpu, &inst.word); + if (err) + return EMULATE_FAIL; switch (inst.r_format.opcode) { case cop0_op: @@ -2419,6 +2420,7 @@ enum emulation_result kvm_mips_handle_ri(u32 cause, u32 *opc, enum emulation_result er = EMULATE_DONE; unsigned long curr_pc; union mips_instruction inst; + int err; /* * Update PC and hold onto current PC in case there is @@ -2432,11 +2434,9 @@ enum emulation_result kvm_mips_handle_ri(u32 cause, u32 *opc, /* Fetch the instruction. */ if (cause & CAUSEF_BD) opc += 1; - - inst.word = kvm_get_inst(opc, vcpu); - - if (inst.word == KVM_INVALID_INST) { - kvm_err("%s: Cannot get inst @ %p\n", __func__, opc); + err = kvm_get_inst(opc, vcpu, &inst.word); + if (err) { + kvm_err("%s: Cannot get inst @ %p (%d)\n", __func__, opc, err); return EMULATE_FAIL; } diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c index 07ce10e3627a..29afd96069ef 100644 --- a/arch/mips/kvm/mips.c +++ b/arch/mips/kvm/mips.c @@ -1343,6 +1343,7 @@ int kvm_mips_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu) u32 __user *opc = (u32 __user *) vcpu->arch.pc; unsigned long badvaddr = vcpu->arch.host_cp0_badvaddr; enum emulation_result er = EMULATE_DONE; + u32 inst; int ret = RESUME_GUEST; /* re-enable HTW before enabling interrupts */ @@ -1467,8 +1468,12 @@ int kvm_mips_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu) break; default: + if (cause & CAUSEF_BD) + opc += 1; + inst = 0; + kvm_get_inst(opc, vcpu, &inst); kvm_err("Exception Code: %d, not yet handled, @ PC: %p, inst: 0x%08x BadVaddr: %#lx Status: %#lx\n", - exccode, opc, kvm_get_inst(opc, vcpu), badvaddr, + exccode, opc, inst, badvaddr, kvm_read_c0_guest_status(vcpu->arch.cop0)); kvm_arch_vcpu_dump_regs(vcpu); run->exit_reason = KVM_EXIT_INTERNAL_ERROR; diff --git a/arch/mips/kvm/mmu.c b/arch/mips/kvm/mmu.c index aab604e75d3b..6379ac1bc7b9 100644 --- a/arch/mips/kvm/mmu.c +++ b/arch/mips/kvm/mmu.c @@ -503,16 +503,15 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) local_irq_restore(flags); } -u32 kvm_get_inst(u32 *opc, struct kvm_vcpu *vcpu) +int kvm_get_inst(u32 *opc, struct kvm_vcpu *vcpu, u32 *out) { - u32 inst; int err; - err = get_user(inst, opc); + err = get_user(*out, opc); if (unlikely(err)) { kvm_err("%s: illegal address: %p\n", __func__, opc); - return KVM_INVALID_INST; + return -EFAULT; } - return inst; + return 0; } -- cgit v1.2.3 From 6a97c775ff77fb7c54adc3f7944205ae66cb5475 Mon Sep 17 00:00:00 2001 From: James Hogan Date: Thu, 23 Apr 2015 16:54:35 +0100 Subject: KVM: MIPS: Use CP0_BadInstr[P] for emulation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When exiting from the guest, store the values of the CP0_BadInstr and CP0_BadInstrP registers if they exist, which contain the encodings of the instructions which caused the last synchronous exception. When the instruction is needed for emulation, kvm_get_badinstr() and kvm_get_badinstrp() are used instead of calling kvm_get_inst() directly, to decide whether to read the saved CP0_BadInstr/CP0_BadInstrP registers (if they exist), or read the instruction from memory (if not). The use of these registers should be more robust than using kvm_get_inst(), as it actually gives the instruction encoding seen by the hardware rather than relying on user accessors after the fact, which can be fooled by incoherent icache or a racing code modification. 
It will also work with VZ, where the guest virtual memory isn't directly accessible by the host with user accessors. Signed-off-by: James Hogan Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: Ralf Baechle Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org --- arch/mips/include/asm/kvm_host.h | 4 ++++ arch/mips/kvm/emulate.c | 48 +++++++++++++++++++++++++++++++++++++--- arch/mips/kvm/entry.c | 14 ++++++++++++ arch/mips/kvm/mips.c | 2 +- 4 files changed, 64 insertions(+), 4 deletions(-) diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h index f296ebeda9e3..17c5e53ef3fa 100644 --- a/arch/mips/include/asm/kvm_host.h +++ b/arch/mips/include/asm/kvm_host.h @@ -280,6 +280,8 @@ struct kvm_vcpu_arch { unsigned long host_cp0_badvaddr; unsigned long host_cp0_epc; u32 host_cp0_cause; + u32 host_cp0_badinstr; + u32 host_cp0_badinstrp; /* GPRS */ unsigned long gprs[32]; @@ -641,6 +643,8 @@ void kvm_trap_emul_invalidate_gva(struct kvm_vcpu *vcpu, unsigned long addr, /* Emulation */ int kvm_get_inst(u32 *opc, struct kvm_vcpu *vcpu, u32 *out); enum emulation_result update_pc(struct kvm_vcpu *vcpu, u32 cause); +int kvm_get_badinstr(u32 *opc, struct kvm_vcpu *vcpu, u32 *out); +int kvm_get_badinstrp(u32 *opc, struct kvm_vcpu *vcpu, u32 *out); /** * kvm_is_ifetch_fault() - Find whether a TLBL exception is due to ifetch fault. diff --git a/arch/mips/kvm/emulate.c b/arch/mips/kvm/emulate.c index b906fc0589f3..b295a4a1496f 100644 --- a/arch/mips/kvm/emulate.c +++ b/arch/mips/kvm/emulate.c @@ -54,7 +54,7 @@ static int kvm_compute_return_epc(struct kvm_vcpu *vcpu, unsigned long instpc, } /* Read the instruction */ - err = kvm_get_inst((u32 *)epc, vcpu, &insn.word); + err = kvm_get_badinstrp((u32 *)epc, vcpu, &insn.word); if (err) return err; @@ -258,6 +258,48 @@ enum emulation_result update_pc(struct kvm_vcpu *vcpu, u32 cause) return EMULATE_DONE; } +/** + * kvm_get_badinstr() - Get bad instruction encoding. + * @opc: Guest pointer to faulting instruction. + * @vcpu: KVM VCPU information. + * + * Gets the instruction encoding of the faulting instruction, using the saved + * BadInstr register value if it exists, otherwise falling back to reading guest + * memory at @opc. + * + * Returns: The instruction encoding of the faulting instruction. + */ +int kvm_get_badinstr(u32 *opc, struct kvm_vcpu *vcpu, u32 *out) +{ + if (cpu_has_badinstr) { + *out = vcpu->arch.host_cp0_badinstr; + return 0; + } else { + return kvm_get_inst(opc, vcpu, out); + } +} + +/** + * kvm_get_badinstrp() - Get bad prior instruction encoding. + * @opc: Guest pointer to prior faulting instruction. + * @vcpu: KVM VCPU information. + * + * Gets the instruction encoding of the prior faulting instruction (the branch + * containing the delay slot which faulted), using the saved BadInstrP register + * value if it exists, otherwise falling back to reading guest memory at @opc. + * + * Returns: The instruction encoding of the prior faulting instruction. + */ +int kvm_get_badinstrp(u32 *opc, struct kvm_vcpu *vcpu, u32 *out) +{ + if (cpu_has_badinstrp) { + *out = vcpu->arch.host_cp0_badinstrp; + return 0; + } else { + return kvm_get_inst(opc, vcpu, out); + } +} + /** * kvm_mips_count_disabled() - Find whether the CP0_Count timer is disabled. * @vcpu: Virtual CPU. @@ -1839,7 +1881,7 @@ enum emulation_result kvm_mips_emulate_inst(u32 cause, u32 *opc, /* Fetch the instruction. 
*/ if (cause & CAUSEF_BD) opc += 1; - err = kvm_get_inst(opc, vcpu, &inst.word); + err = kvm_get_badinstr(opc, vcpu, &inst.word); if (err) return EMULATE_FAIL; @@ -2434,7 +2476,7 @@ enum emulation_result kvm_mips_handle_ri(u32 cause, u32 *opc, /* Fetch the instruction. */ if (cause & CAUSEF_BD) opc += 1; - err = kvm_get_inst(opc, vcpu, &inst.word); + err = kvm_get_badinstr(opc, vcpu, &inst.word); if (err) { kvm_err("%s: Cannot get inst @ %p (%d)\n", __func__, opc, err); return EMULATE_FAIL; diff --git a/arch/mips/kvm/entry.c b/arch/mips/kvm/entry.c index 1ae33e0e675c..c5b254c4d0da 100644 --- a/arch/mips/kvm/entry.c +++ b/arch/mips/kvm/entry.c @@ -53,6 +53,8 @@ /* Some CP0 registers */ #define C0_HWRENA 7, 0 #define C0_BADVADDR 8, 0 +#define C0_BADINSTR 8, 1 +#define C0_BADINSTRP 8, 2 #define C0_ENTRYHI 10, 0 #define C0_STATUS 12, 0 #define C0_CAUSE 13, 0 @@ -579,6 +581,18 @@ void *kvm_mips_build_exit(void *addr) uasm_i_mfc0(&p, K0, C0_CAUSE); uasm_i_sw(&p, K0, offsetof(struct kvm_vcpu_arch, host_cp0_cause), K1); + if (cpu_has_badinstr) { + uasm_i_mfc0(&p, K0, C0_BADINSTR); + uasm_i_sw(&p, K0, offsetof(struct kvm_vcpu_arch, + host_cp0_badinstr), K1); + } + + if (cpu_has_badinstrp) { + uasm_i_mfc0(&p, K0, C0_BADINSTRP); + uasm_i_sw(&p, K0, offsetof(struct kvm_vcpu_arch, + host_cp0_badinstrp), K1); + } + /* Now restore the host state just enough to run the handlers */ /* Switch EBASE to the one used by Linux */ diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c index 29afd96069ef..b8f04070bf39 100644 --- a/arch/mips/kvm/mips.c +++ b/arch/mips/kvm/mips.c @@ -1471,7 +1471,7 @@ int kvm_mips_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu) if (cause & CAUSEF_BD) opc += 1; inst = 0; - kvm_get_inst(opc, vcpu, &inst); + kvm_get_badinstr(opc, vcpu, &inst); kvm_err("Exception Code: %d, not yet handled, @ PC: %p, inst: 0x%08x BadVaddr: %#lx Status: %#lx\n", exccode, opc, inst, badvaddr, kvm_read_c0_guest_status(vcpu->arch.cop0)); -- cgit v1.2.3 From 06c158c96ed8909a1d8696d4f953ca8a9ef55574 Mon Sep 17 00:00:00 2001 From: James Hogan Date: Fri, 1 May 2015 13:50:18 +0100 Subject: KVM: MIPS/MMU: Convert guest physical map to page table MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Current guest physical memory is mapped to host physical addresses using a single linear array (guest_pmap of length guest_pmap_npages). This was only really meant to be temporary, and isn't sparse, so its wasteful of memory. A small amount of RAM at GPA 0 and a small boot exception vector at GPA 0x1fc00000 cannot be represented without a full 128KiB guest_pmap allocation (MIPS32 with 16KiB pages), which is one reason why QEMU currently runs its boot code at the top of RAM instead of the usual boot exception vector address. Instead use the existing infrastructure for host virtual page table management to allocate a page table for guest physical memory too. This should be sufficient for now, assuming the size of physical memory doesn't exceed the size of virtual memory. It may need extending in future to handle XPA (eXtended Physical Addressing) in 32-bit guests, as supported by VZ guests on P5600. Some of this code is based loosely on Cavium's VZ KVM implementation. 
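To make the memory overhead concrete, a rough back-of-envelope calculation (figures derived from the numbers quoted above, not taken from the patch): with 16KiB pages on MIPS32, describing guest physical addresses up to the 0x1fc00000 boot exception vector needs on the order of 32k guest_pmap entries at 4 bytes each, i.e. roughly the 128KiB quoted, even if only a few megabytes of RAM plus one boot page are actually mapped:

#include <stdio.h>

int main(void)
{
	unsigned long page_size = 16 * 1024;			/* 16KiB pages */
	unsigned long top_gpa = 0x1fc00000UL + page_size;	/* cover the boot vector */
	unsigned long npages = top_gpa / page_size;		/* ~32k entries */
	unsigned long flat_bytes = npages * 4;	/* sizeof(unsigned long) on MIPS32 */

	printf("flat guest_pmap: %lu entries, %lu KiB\n",
	       npages, flat_bytes / 1024);
	return 0;
}

A sparse GPA page table, by contrast, only allocates directory and leaf pages for the ranges that are actually populated.
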
Signed-off-by: James Hogan Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: Ralf Baechle Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org --- arch/mips/include/asm/kvm_host.h | 7 +- arch/mips/kvm/mips.c | 48 ++----- arch/mips/kvm/mmu.c | 293 ++++++++++++++++++++++++++++++++++----- 3 files changed, 278 insertions(+), 70 deletions(-) diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h index 17c5e53ef3fa..a04657932e19 100644 --- a/arch/mips/include/asm/kvm_host.h +++ b/arch/mips/include/asm/kvm_host.h @@ -153,9 +153,8 @@ struct kvm_arch_memory_slot { }; struct kvm_arch { - /* Guest GVA->HPA page table */ - unsigned long *guest_pmap; - unsigned long guest_pmap_npages; + /* Guest physical mm */ + struct mm_struct gpa_mm; }; #define N_MIPS_COPROC_REGS 32 @@ -636,6 +635,8 @@ enum kvm_mips_flush { KMF_GPA = 0x2, }; void kvm_mips_flush_gva_pt(pgd_t *pgd, enum kvm_mips_flush flags); +bool kvm_mips_flush_gpa_pt(struct kvm *kvm, gfn_t start_gfn, gfn_t end_gfn); +pgd_t *kvm_pgd_alloc(void); void kvm_mmu_free_memory_caches(struct kvm_vcpu *vcpu); void kvm_trap_emul_invalidate_gva(struct kvm_vcpu *vcpu, unsigned long addr, bool user); diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c index b8f04070bf39..9b72e2c55a0c 100644 --- a/arch/mips/kvm/mips.c +++ b/arch/mips/kvm/mips.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include @@ -94,6 +95,11 @@ void kvm_arch_check_processor_compat(void *rtn) int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) { + /* Allocate page table to map GPA -> RPA */ + kvm->arch.gpa_mm.pgd = kvm_pgd_alloc(); + if (!kvm->arch.gpa_mm.pgd) + return -ENOMEM; + return 0; } @@ -112,13 +118,6 @@ void kvm_mips_free_vcpus(struct kvm *kvm) unsigned int i; struct kvm_vcpu *vcpu; - /* Put the pages we reserved for the guest pmap */ - for (i = 0; i < kvm->arch.guest_pmap_npages; i++) { - if (kvm->arch.guest_pmap[i] != KVM_INVALID_PAGE) - kvm_release_pfn_clean(kvm->arch.guest_pmap[i]); - } - kfree(kvm->arch.guest_pmap); - kvm_for_each_vcpu(i, vcpu, kvm) { kvm_arch_vcpu_free(vcpu); } @@ -133,9 +132,17 @@ void kvm_mips_free_vcpus(struct kvm *kvm) mutex_unlock(&kvm->lock); } +static void kvm_mips_free_gpa_pt(struct kvm *kvm) +{ + /* It should always be safe to remove after flushing the whole range */ + WARN_ON(!kvm_mips_flush_gpa_pt(kvm, 0, ~0)); + pgd_free(NULL, kvm->arch.gpa_mm.pgd); +} + void kvm_arch_destroy_vm(struct kvm *kvm) { kvm_mips_free_vcpus(kvm); + kvm_mips_free_gpa_pt(kvm); } long kvm_arch_dev_ioctl(struct file *filp, unsigned int ioctl, @@ -164,36 +171,9 @@ void kvm_arch_commit_memory_region(struct kvm *kvm, const struct kvm_memory_slot *new, enum kvm_mr_change change) { - unsigned long npages = 0; - int i; - kvm_debug("%s: kvm: %p slot: %d, GPA: %llx, size: %llx, QVA: %llx\n", __func__, kvm, mem->slot, mem->guest_phys_addr, mem->memory_size, mem->userspace_addr); - - /* Setup Guest PMAP table */ - if (!kvm->arch.guest_pmap) { - if (mem->slot == 0) - npages = mem->memory_size >> PAGE_SHIFT; - - if (npages) { - kvm->arch.guest_pmap_npages = npages; - kvm->arch.guest_pmap = - kzalloc(npages * sizeof(unsigned long), GFP_KERNEL); - - if (!kvm->arch.guest_pmap) { - kvm_err("Failed to allocate guest PMAP\n"); - return; - } - - kvm_debug("Allocated space for Guest PMAP Table (%ld pages) @ %p\n", - npages, kvm->arch.guest_pmap); - - /* Now setup the page table */ - for (i = 0; i < npages; i++) - kvm->arch.guest_pmap[i] = KVM_INVALID_PAGE; - } - } } static inline void dump_handler(const char *symbol, void *start, void *end) diff 
--git a/arch/mips/kvm/mmu.c b/arch/mips/kvm/mmu.c index 6379ac1bc7b9..09f5da706d9a 100644 --- a/arch/mips/kvm/mmu.c +++ b/arch/mips/kvm/mmu.c @@ -62,6 +62,63 @@ void kvm_mmu_free_memory_caches(struct kvm_vcpu *vcpu) mmu_free_memory_cache(&vcpu->arch.mmu_page_cache); } +/** + * kvm_pgd_init() - Initialise KVM GPA page directory. + * @page: Pointer to page directory (PGD) for KVM GPA. + * + * Initialise a KVM GPA page directory with pointers to the invalid table, i.e. + * representing no mappings. This is similar to pgd_init(), however it + * initialises all the page directory pointers, not just the ones corresponding + * to the userland address space (since it is for the guest physical address + * space rather than a virtual address space). + */ +static void kvm_pgd_init(void *page) +{ + unsigned long *p, *end; + unsigned long entry; + +#ifdef __PAGETABLE_PMD_FOLDED + entry = (unsigned long)invalid_pte_table; +#else + entry = (unsigned long)invalid_pmd_table; +#endif + + p = (unsigned long *)page; + end = p + PTRS_PER_PGD; + + do { + p[0] = entry; + p[1] = entry; + p[2] = entry; + p[3] = entry; + p[4] = entry; + p += 8; + p[-3] = entry; + p[-2] = entry; + p[-1] = entry; + } while (p != end); +} + +/** + * kvm_pgd_alloc() - Allocate and initialise a KVM GPA page directory. + * + * Allocate a blank KVM GPA page directory (PGD) for representing guest physical + * to host physical page mappings. + * + * Returns: Pointer to new KVM GPA page directory. + * NULL on allocation failure. + */ +pgd_t *kvm_pgd_alloc(void) +{ + pgd_t *ret; + + ret = (pgd_t *)__get_free_pages(GFP_KERNEL, PGD_ORDER); + if (ret) + kvm_pgd_init(ret); + + return ret; +} + /** * kvm_mips_walk_pgd() - Walk page table with optional allocation. * @pgd: Page directory pointer. @@ -112,15 +169,182 @@ static pte_t *kvm_mips_walk_pgd(pgd_t *pgd, struct kvm_mmu_memory_cache *cache, return pte_offset(pmd, addr); } -static int kvm_mips_map_page(struct kvm *kvm, gfn_t gfn) +/* Caller must hold kvm->mm_lock */ +static pte_t *kvm_mips_pte_for_gpa(struct kvm *kvm, + struct kvm_mmu_memory_cache *cache, + unsigned long addr) { - int srcu_idx, err = 0; - kvm_pfn_t pfn; + return kvm_mips_walk_pgd(kvm->arch.gpa_mm.pgd, cache, addr); +} - if (kvm->arch.guest_pmap[gfn] != KVM_INVALID_PAGE) - return 0; +/* + * kvm_mips_flush_gpa_{pte,pmd,pud,pgd,pt}. + * Flush a range of guest physical address space from the VM's GPA page tables. 
+ */ + +static bool kvm_mips_flush_gpa_pte(pte_t *pte, unsigned long start_gpa, + unsigned long end_gpa) +{ + int i_min = __pte_offset(start_gpa); + int i_max = __pte_offset(end_gpa); + bool safe_to_remove = (i_min == 0 && i_max == PTRS_PER_PTE - 1); + int i; + + for (i = i_min; i <= i_max; ++i) { + if (!pte_present(pte[i])) + continue; + + kvm_release_pfn_clean(pte_pfn(pte[i])); + set_pte(pte + i, __pte(0)); + } + return safe_to_remove; +} + +static bool kvm_mips_flush_gpa_pmd(pmd_t *pmd, unsigned long start_gpa, + unsigned long end_gpa) +{ + pte_t *pte; + unsigned long end = ~0ul; + int i_min = __pmd_offset(start_gpa); + int i_max = __pmd_offset(end_gpa); + bool safe_to_remove = (i_min == 0 && i_max == PTRS_PER_PMD - 1); + int i; + + for (i = i_min; i <= i_max; ++i, start_gpa = 0) { + if (!pmd_present(pmd[i])) + continue; + + pte = pte_offset(pmd + i, 0); + if (i == i_max) + end = end_gpa; + + if (kvm_mips_flush_gpa_pte(pte, start_gpa, end)) { + pmd_clear(pmd + i); + pte_free_kernel(NULL, pte); + } else { + safe_to_remove = false; + } + } + return safe_to_remove; +} + +static bool kvm_mips_flush_gpa_pud(pud_t *pud, unsigned long start_gpa, + unsigned long end_gpa) +{ + pmd_t *pmd; + unsigned long end = ~0ul; + int i_min = __pud_offset(start_gpa); + int i_max = __pud_offset(end_gpa); + bool safe_to_remove = (i_min == 0 && i_max == PTRS_PER_PUD - 1); + int i; + + for (i = i_min; i <= i_max; ++i, start_gpa = 0) { + if (!pud_present(pud[i])) + continue; + + pmd = pmd_offset(pud + i, 0); + if (i == i_max) + end = end_gpa; + + if (kvm_mips_flush_gpa_pmd(pmd, start_gpa, end)) { + pud_clear(pud + i); + pmd_free(NULL, pmd); + } else { + safe_to_remove = false; + } + } + return safe_to_remove; +} + +static bool kvm_mips_flush_gpa_pgd(pgd_t *pgd, unsigned long start_gpa, + unsigned long end_gpa) +{ + pud_t *pud; + unsigned long end = ~0ul; + int i_min = pgd_index(start_gpa); + int i_max = pgd_index(end_gpa); + bool safe_to_remove = (i_min == 0 && i_max == PTRS_PER_PGD - 1); + int i; + + for (i = i_min; i <= i_max; ++i, start_gpa = 0) { + if (!pgd_present(pgd[i])) + continue; + + pud = pud_offset(pgd + i, 0); + if (i == i_max) + end = end_gpa; + + if (kvm_mips_flush_gpa_pud(pud, start_gpa, end)) { + pgd_clear(pgd + i); + pud_free(NULL, pud); + } else { + safe_to_remove = false; + } + } + return safe_to_remove; +} + +/** + * kvm_mips_flush_gpa_pt() - Flush a range of guest physical addresses. + * @kvm: KVM pointer. + * @start_gfn: Guest frame number of first page in GPA range to flush. + * @end_gfn: Guest frame number of last page in GPA range to flush. + * + * Flushes a range of GPA mappings from the GPA page tables. + * + * The caller must hold the @kvm->mmu_lock spinlock. + * + * Returns: Whether its safe to remove the top level page directory because + * all lower levels have been removed. + */ +bool kvm_mips_flush_gpa_pt(struct kvm *kvm, gfn_t start_gfn, gfn_t end_gfn) +{ + return kvm_mips_flush_gpa_pgd(kvm->arch.gpa_mm.pgd, + start_gfn << PAGE_SHIFT, + end_gfn << PAGE_SHIFT); +} + +/** + * kvm_mips_map_page() - Map a guest physical page. + * @vcpu: VCPU pointer. + * @gpa: Guest physical address of fault. + * @out_entry: New PTE for @gpa (written on success unless NULL). + * @out_buddy: New PTE for @gpa's buddy (written on success unless + * NULL). + * + * Handle GPA faults by creating a new GPA mapping (or updating an existing + * one). + * + * This takes care of asking KVM for the corresponding PFN, and creating a + * mapping in the GPA page tables. 
Derived mappings (GVA page tables and TLBs) + * must be handled by the caller. + * + * Returns: 0 on success, in which case the caller may use the @out_entry + * and @out_buddy PTEs to update derived mappings and resume guest + * execution. + * -EFAULT if there is no memory region at @gpa or a write was + * attempted to a read-only memory region. This is usually handled + * as an MMIO access. + */ +static int kvm_mips_map_page(struct kvm_vcpu *vcpu, unsigned long gpa, + pte_t *out_entry, pte_t *out_buddy) +{ + struct kvm *kvm = vcpu->kvm; + struct kvm_mmu_memory_cache *memcache = &vcpu->arch.mmu_page_cache; + gfn_t gfn = gpa >> PAGE_SHIFT; + int srcu_idx, err; + kvm_pfn_t pfn; + pte_t *ptep, entry, old_pte; + unsigned long prot_bits; srcu_idx = srcu_read_lock(&kvm->srcu); + + /* We need a minimum of cached pages ready for page table creation */ + err = mmu_topup_memory_cache(memcache, KVM_MMU_CACHE_MIN_PAGES, + KVM_NR_MEM_OBJS); + if (err) + goto out; + pfn = gfn_to_pfn(kvm, gfn); if (is_error_noslot_pfn(pfn)) { @@ -129,7 +353,25 @@ static int kvm_mips_map_page(struct kvm *kvm, gfn_t gfn) goto out; } - kvm->arch.guest_pmap[gfn] = pfn; + spin_lock(&kvm->mmu_lock); + + ptep = kvm_mips_pte_for_gpa(kvm, memcache, gpa); + + prot_bits = __READABLE | _PAGE_PRESENT | __WRITEABLE; + entry = pfn_pte(pfn, __pgprot(prot_bits)); + + old_pte = *ptep; + set_pte(ptep, entry); + if (pte_present(old_pte)) + kvm_release_pfn_clean(pte_pfn(old_pte)); + + err = 0; + if (out_entry) + *out_entry = *ptep; + if (out_buddy) + *out_buddy = *ptep_buddy(ptep); + + spin_unlock(&kvm->mmu_lock); out: srcu_read_unlock(&kvm->srcu, srcu_idx); return err; @@ -318,11 +560,10 @@ void kvm_mips_flush_gva_pt(pgd_t *pgd, enum kvm_mips_flush flags) int kvm_mips_handle_kseg0_tlb_fault(unsigned long badvaddr, struct kvm_vcpu *vcpu) { - gfn_t gfn; + unsigned long gpa; kvm_pfn_t pfn0, pfn1; - unsigned long vaddr = 0; - struct kvm *kvm = vcpu->kvm; - pte_t *ptep_gva; + unsigned long vaddr; + pte_t pte_gpa[2], *ptep_gva; if (KVM_GUEST_KSEGX(badvaddr) != KVM_GUEST_KSEG0) { kvm_err("%s: Invalid BadVaddr: %#lx\n", __func__, badvaddr); @@ -332,23 +573,17 @@ int kvm_mips_handle_kseg0_tlb_fault(unsigned long badvaddr, /* Find host PFNs */ - gfn = (KVM_GUEST_CPHYSADDR(badvaddr) >> PAGE_SHIFT); - if ((gfn | 1) >= kvm->arch.guest_pmap_npages) { - kvm_err("%s: Invalid gfn: %#llx, BadVaddr: %#lx\n", __func__, - gfn, badvaddr); - kvm_mips_dump_host_tlbs(); - return -1; - } + gpa = KVM_GUEST_CPHYSADDR(badvaddr & (PAGE_MASK << 1)); vaddr = badvaddr & (PAGE_MASK << 1); - if (kvm_mips_map_page(vcpu->kvm, gfn) < 0) + if (kvm_mips_map_page(vcpu, gpa, &pte_gpa[0], NULL) < 0) return -1; - if (kvm_mips_map_page(vcpu->kvm, gfn ^ 0x1) < 0) + if (kvm_mips_map_page(vcpu, gpa | PAGE_SIZE, &pte_gpa[1], NULL) < 0) return -1; - pfn0 = kvm->arch.guest_pmap[gfn & ~0x1]; - pfn1 = kvm->arch.guest_pmap[gfn | 0x1]; + pfn0 = pte_pfn(pte_gpa[0]); + pfn1 = pte_pfn(pte_gpa[1]); /* Find GVA page table entry */ @@ -371,11 +606,9 @@ int kvm_mips_handle_mapped_seg_tlb_fault(struct kvm_vcpu *vcpu, struct kvm_mips_tlb *tlb, unsigned long gva) { - struct kvm *kvm = vcpu->kvm; kvm_pfn_t pfn; - gfn_t gfn; long tlb_lo = 0; - pte_t *ptep_gva; + pte_t pte_gpa, *ptep_gva; unsigned int idx; bool kernel = KVM_GUEST_KERNEL_MODE(vcpu); @@ -388,16 +621,10 @@ int kvm_mips_handle_mapped_seg_tlb_fault(struct kvm_vcpu *vcpu, tlb_lo = tlb->tlb_lo[idx]; /* Find host PFN */ - gfn = mips3_tlbpfn_to_paddr(tlb_lo) >> PAGE_SHIFT; - if (gfn >= kvm->arch.guest_pmap_npages) { - kvm_err("%s: Invalid gfn: 
%#llx, EHi: %#lx\n", - __func__, gfn, tlb->tlb_hi); - kvm_mips_dump_guest_tlbs(vcpu); - return -1; - } - if (kvm_mips_map_page(kvm, gfn) < 0) + if (kvm_mips_map_page(vcpu, mips3_tlbpfn_to_paddr(tlb_lo), &pte_gpa, + NULL) < 0) return -1; - pfn = kvm->arch.guest_pmap[gfn]; + pfn = pte_pfn(pte_gpa); /* Find GVA page table entry */ ptep_gva = kvm_trap_emul_pte_for_gva(vcpu, gva); -- cgit v1.2.3 From 4841e0dd4f53c127b11947bdbe4423b5e9014ebc Mon Sep 17 00:00:00 2001 From: James Hogan Date: Mon, 28 Nov 2016 22:45:04 +0000 Subject: KVM: MIPS: Update vcpu->mode and vcpu->cpu MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Keep the vcpu->mode and vcpu->cpu variables up to date so that kvm_make_all_cpus_request() has a chance of functioning correctly. This will soon need to be used for kvm_flush_remote_tlbs(). We can easily update vcpu->cpu when the VCPU context is loaded or saved, which will happen when accessing guest context and when the guest is scheduled in and out. We need to be a little careful with vcpu->mode though, as we will in future be checking for outstanding VCPU requests, and this must be done after the value of IN_GUEST_MODE in vcpu->mode is visible to other CPUs. Otherwise the other CPU could fail to trigger an IPI to wait for completion dispite the VCPU request not being seen. Signed-off-by: James Hogan Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: Ralf Baechle Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org --- arch/mips/kvm/mips.c | 18 ++++++++++++++++++ arch/mips/kvm/mmu.c | 2 ++ 2 files changed, 20 insertions(+) diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c index 9b72e2c55a0c..ff5e34293227 100644 --- a/arch/mips/kvm/mips.c +++ b/arch/mips/kvm/mips.c @@ -365,6 +365,14 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run) guest_enter_irqoff(); trace_kvm_enter(vcpu); + /* + * Make sure the read of VCPU requests in vcpu_run() callback is not + * reordered ahead of the write to vcpu->mode, or we could miss a TLB + * flush request while the requester sees the VCPU as outside of guest + * mode and not needing an IPI. + */ + smp_store_mb(vcpu->mode, IN_GUEST_MODE); + r = kvm_mips_callbacks->vcpu_run(run, vcpu); trace_kvm_out(vcpu); @@ -1326,6 +1334,8 @@ int kvm_mips_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu) u32 inst; int ret = RESUME_GUEST; + vcpu->mode = OUTSIDE_GUEST_MODE; + /* re-enable HTW before enabling interrupts */ htw_start(); @@ -1481,6 +1491,14 @@ skip_emul: if (ret == RESUME_GUEST) { trace_kvm_reenter(vcpu); + /* + * Make sure the read of VCPU requests in vcpu_reenter() + * callback is not reordered ahead of the write to vcpu->mode, + * or we could miss a TLB flush request while the requester sees + * the VCPU as outside of guest mode and not needing an IPI. 
+ */ + smp_store_mb(vcpu->mode, IN_GUEST_MODE); + kvm_mips_callbacks->vcpu_reenter(run, vcpu); /* diff --git a/arch/mips/kvm/mmu.c b/arch/mips/kvm/mmu.c index 09f5da706d9a..e41ee36dd626 100644 --- a/arch/mips/kvm/mmu.c +++ b/arch/mips/kvm/mmu.c @@ -696,6 +696,7 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) local_irq_save(flags); + vcpu->cpu = cpu; if (vcpu->arch.last_sched_cpu != cpu) { kvm_debug("[%d->%d]KVM VCPU[%d] switch\n", vcpu->arch.last_sched_cpu, cpu, vcpu->vcpu_id); @@ -723,6 +724,7 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) cpu = smp_processor_id(); vcpu->arch.last_sched_cpu = cpu; + vcpu->cpu = -1; /* save guest state in registers */ kvm_mips_callbacks->vcpu_put(vcpu, cpu); -- cgit v1.2.3 From b29e115ae451a67a8c044dffb3aa02b19d4570a0 Mon Sep 17 00:00:00 2001 From: James Hogan Date: Mon, 28 Nov 2016 23:19:32 +0000 Subject: KVM: MIPS/T&E: Handle TLB invalidation requests MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add handling of TLB invalidation requests before entering guest mode. This will allow asynchonous invalidation of the VCPU mappings when physical memory regions are altered. Should the CPU running the VCPU already be in guest mode an IPI will be sent to trigger a guest exit. The reload_asid path will be used in a future patch for when GVA is about to be directly accessed by KVM. In the process, the stale user ASID check in the re-entry path (for lazy user GVA flushing) is generalised to check the ASID for the current guest mode, in case a TLB invalidation request was handled. This has the side effect of making the ASID checks on vcpu_load too conservative, which will be addressed in a later patch. Signed-off-by: James Hogan Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: Ralf Baechle Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org --- arch/mips/kvm/trap_emul.c | 71 +++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 63 insertions(+), 8 deletions(-) diff --git a/arch/mips/kvm/trap_emul.c b/arch/mips/kvm/trap_emul.c index ccd56b3ce84b..2b20b7de493e 100644 --- a/arch/mips/kvm/trap_emul.c +++ b/arch/mips/kvm/trap_emul.c @@ -773,31 +773,86 @@ static int kvm_trap_emul_vcpu_put(struct kvm_vcpu *vcpu, int cpu) return 0; } +static void kvm_trap_emul_check_requests(struct kvm_vcpu *vcpu, int cpu, + bool reload_asid) +{ + struct mm_struct *kern_mm = &vcpu->arch.guest_kernel_mm; + struct mm_struct *user_mm = &vcpu->arch.guest_user_mm; + struct mm_struct *mm; + int i; + + if (likely(!vcpu->requests)) + return; + + if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu)) { + /* + * Both kernel & user GVA mappings must be invalidated. The + * caller is just about to check whether the ASID is stale + * anyway so no need to reload it here. + */ + kvm_mips_flush_gva_pt(kern_mm->pgd, KMF_GPA | KMF_KERN); + kvm_mips_flush_gva_pt(user_mm->pgd, KMF_GPA | KMF_USER); + for_each_possible_cpu(i) { + cpu_context(i, kern_mm) = 0; + cpu_context(i, user_mm) = 0; + } + + /* Generate new ASID for current mode */ + if (reload_asid) { + mm = KVM_GUEST_KERNEL_MODE(vcpu) ? 
kern_mm : user_mm; + get_new_mmu_context(mm, cpu); + htw_stop(); + write_c0_entryhi(cpu_asid(cpu, mm)); + TLBMISS_HANDLER_SETUP_PGD(mm->pgd); + htw_start(); + } + } +} + static void kvm_trap_emul_vcpu_reenter(struct kvm_run *run, struct kvm_vcpu *vcpu) { + struct mm_struct *kern_mm = &vcpu->arch.guest_kernel_mm; struct mm_struct *user_mm = &vcpu->arch.guest_user_mm; + struct mm_struct *mm; struct mips_coproc *cop0 = vcpu->arch.cop0; int i, cpu = smp_processor_id(); unsigned int gasid; /* - * Lazy host ASID regeneration / PT flush for guest user mode. - * If the guest ASID has changed since the last guest usermode - * execution, regenerate the host ASID so as to invalidate stale TLB - * entries and flush GVA PT entries too. + * No need to reload ASID, IRQs are disabled already so there's no rush, + * and we'll check if we need to regenerate below anyway before + * re-entering the guest. */ - if (!KVM_GUEST_KERNEL_MODE(vcpu)) { + kvm_trap_emul_check_requests(vcpu, cpu, false); + + if (KVM_GUEST_KERNEL_MODE(vcpu)) { + mm = kern_mm; + } else { + mm = user_mm; + + /* + * Lazy host ASID regeneration / PT flush for guest user mode. + * If the guest ASID has changed since the last guest usermode + * execution, invalidate the stale TLB entries and flush GVA PT + * entries too. + */ gasid = kvm_read_c0_guest_entryhi(cop0) & KVM_ENTRYHI_ASID; if (gasid != vcpu->arch.last_user_gasid) { kvm_mips_flush_gva_pt(user_mm->pgd, KMF_USER); - get_new_mmu_context(user_mm, cpu); for_each_possible_cpu(i) - if (i != cpu) - cpu_context(i, user_mm) = 0; + cpu_context(i, user_mm) = 0; vcpu->arch.last_user_gasid = gasid; } } + + /* + * Check if ASID is stale. This may happen due to a TLB flush request or + * a lazy user MM invalidation. + */ + if ((cpu_context(cpu, mm) ^ asid_cache(cpu)) & + asid_version_mask(cpu)) + get_new_mmu_context(mm, cpu); } static int kvm_trap_emul_vcpu_run(struct kvm_run *run, struct kvm_vcpu *vcpu) -- cgit v1.2.3 From 91737ea205856c41183c2530fdb6b407ceeb3836 Mon Sep 17 00:00:00 2001 From: James Hogan Date: Fri, 2 Dec 2016 23:40:52 +0000 Subject: KVM: MIPS/T&E: Reduce stale ASID checks MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The stale ASID checks taking place on VCPU load can be reduced: - Now that we check for a stale ASID on guest re-entry, there is no need to do so when loading the VCPU outside of guest context, since it will happen before entering the guest. Note that a lot of KVM VCPU ioctls will cause the VCPU to be loaded but guest context won't be entered. - There is no need to check for a stale kernel_mm ASID when the guest is in user mode and vice versa. In fact doing so can potentially be problematic since the user_mm ASID regeneration may trigger a new ASID cycle, which would cause the kern_mm ASID to become stale after it has been checked for staleness. Therefore only check the ASID for the mm corresponding to the current guest mode, and only if we're already in guest context. We drop some of the related kvm_debug() calls here too. 
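The ordering hazard described above is easy to miss, so here is a schematic trace of the old vcpu_load behaviour in comment form (reusing the simplified get_new_asid() sketch shown earlier in this dump; this is not code from the patch):

/*
 * Old vcpu_load path, both mms checked unconditionally:
 *
 *   check(kern_mm);   // version matches asid_cache -> declared fresh
 *   check(user_mm);   // stale -> regeneration wraps the ASID counter,
 *                     //          flushes the TLB and starts a new cycle
 *
 * After the second step, kern_mm's ASID belongs to the previous cycle even
 * though it was just declared fresh, so guest kernel mode could run with a
 * stale ASID. Checking only the mm for the current guest mode, and only
 * when already in guest context, closes that window.
 */
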
Signed-off-by: James Hogan Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: Ralf Baechle Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org --- arch/mips/kvm/trap_emul.c | 35 ++++++----------------------------- 1 file changed, 6 insertions(+), 29 deletions(-) diff --git a/arch/mips/kvm/trap_emul.c b/arch/mips/kvm/trap_emul.c index 2b20b7de493e..edda8f039026 100644 --- a/arch/mips/kvm/trap_emul.c +++ b/arch/mips/kvm/trap_emul.c @@ -714,35 +714,15 @@ static int kvm_trap_emul_vcpu_load(struct kvm_vcpu *vcpu, int cpu) struct mm_struct *user_mm = &vcpu->arch.guest_user_mm; struct mm_struct *mm; - /* Allocate new kernel and user ASIDs if needed */ - - if ((cpu_context(cpu, kern_mm) ^ asid_cache(cpu)) & - asid_version_mask(cpu)) { - get_new_mmu_context(kern_mm, cpu); - - kvm_debug("[%d]: cpu_context: %#lx\n", cpu, - cpu_context(cpu, current->mm)); - kvm_debug("[%d]: Allocated new ASID for Guest Kernel: %#lx\n", - cpu, cpu_context(cpu, kern_mm)); - } - - if ((cpu_context(cpu, user_mm) ^ asid_cache(cpu)) & - asid_version_mask(cpu)) { - get_new_mmu_context(user_mm, cpu); - - kvm_debug("[%d]: cpu_context: %#lx\n", cpu, - cpu_context(cpu, current->mm)); - kvm_debug("[%d]: Allocated new ASID for Guest User: %#lx\n", - cpu, cpu_context(cpu, user_mm)); - } - /* - * Were we in guest context? If so then the pre-empted ASID is - * no longer valid, we need to set it to what it should be based - * on the mode of the Guest (Kernel/User) + * Were we in guest context? If so, restore the appropriate ASID based + * on the mode of the Guest (Kernel/User). */ if (current->flags & PF_VCPU) { mm = KVM_GUEST_KERNEL_MODE(vcpu) ? kern_mm : user_mm; + if ((cpu_context(cpu, mm) ^ asid_cache(cpu)) & + asid_version_mask(cpu)) + get_new_mmu_context(mm, cpu); write_c0_entryhi(cpu_asid(cpu, mm)); TLBMISS_HANDLER_SETUP_PGD(mm->pgd); kvm_mips_suspend_mm(cpu); @@ -759,11 +739,8 @@ static int kvm_trap_emul_vcpu_put(struct kvm_vcpu *vcpu, int cpu) if (current->flags & PF_VCPU) { /* Restore normal Linux process memory map */ if (((cpu_context(cpu, current->mm) ^ asid_cache(cpu)) & - asid_version_mask(cpu))) { - kvm_debug("%s: Dropping MMU Context: %#lx\n", __func__, - cpu_context(cpu, current->mm)); + asid_version_mask(cpu))) get_new_mmu_context(current->mm, cpu); - } write_c0_entryhi(cpu_asid(cpu, current->mm)); TLBMISS_HANDLER_SETUP_PGD(current->mm->pgd); kvm_mips_resume_mm(cpu); -- cgit v1.2.3 From 1880afd6057f34586919715e8ffe9c5858f4a326 Mon Sep 17 00:00:00 2001 From: James Hogan Date: Mon, 28 Nov 2016 23:04:52 +0000 Subject: KVM: MIPS/T&E: Add lockless GVA access helpers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add helpers to allow for lockless direct access to the GVA space, by changing the VCPU mode to READING_SHADOW_PAGE_TABLES for the duration of the access. This allows asynchronous TLB flush requests in future patches to safely trigger either a TLB flush before the direct GVA space access, or a delay until the in-progress lockless direct access is complete. The kvm_trap_emul_gva_lockless_begin() and kvm_trap_emul_gva_lockless_end() helpers take care of guarding the direct GVA accesses, and kvm_trap_emul_gva_fault() tries to handle a uaccess fault resulting from a flush having taken place. 
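As a usage illustration only (the real call sites are introduced by later patches in this series and may differ in detail), a caller wrapping a direct GVA read with these helpers would look roughly like the sketch below, retrying via kvm_trap_emul_gva_fault() when the access faults because the mapping was flushed underneath it:

static int read_guest_word(struct kvm_vcpu *vcpu, u32 __user *gva, u32 *out)
{
	int err;

	kvm_trap_emul_gva_lockless_begin(vcpu);
	err = get_user(*out, gva);	/* read directly from the guest's GVA space */
	kvm_trap_emul_gva_lockless_end(vcpu);

	if (unlikely(err)) {
		/* The GVA mapping may have been flushed under us; try to
		 * re-establish it and let the caller retry the access. */
		if (kvm_trap_emul_gva_fault(vcpu, (unsigned long)gva, false) !=
		    KVM_MIPS_MAPPED)
			return -EFAULT;
		return -EAGAIN;		/* mapping restored, retry the read */
	}

	return 0;
}
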
Signed-off-by: James Hogan Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: Ralf Baechle Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org --- arch/mips/include/asm/kvm_host.h | 15 ++++++++++ arch/mips/kvm/mmu.c | 51 +++++++++++++++++++++++++++++++ arch/mips/kvm/trap_emul.c | 65 ++++++++++++++++++++++++++++++++++++++++ 3 files changed, 131 insertions(+) diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h index a04657932e19..c1e46abb5704 100644 --- a/arch/mips/include/asm/kvm_host.h +++ b/arch/mips/include/asm/kvm_host.h @@ -243,6 +243,7 @@ enum emulation_result { #define TLB_ASID(x) ((x).tlb_hi & KVM_ENTRYHI_ASID) #define TLB_LO_IDX(x, va) (((va) >> PAGE_SHIFT) & 1) #define TLB_IS_VALID(x, va) ((x).tlb_lo[TLB_LO_IDX(x, va)] & ENTRYLO_V) +#define TLB_IS_DIRTY(x, va) ((x).tlb_lo[TLB_LO_IDX(x, va)] & ENTRYLO_D) #define TLB_HI_VPN2_HIT(x, y) ((TLB_VPN2(x) & ~(x).tlb_mask) == \ ((y) & VPN2_MASK & ~(x).tlb_mask)) #define TLB_HI_ASID_HIT(x, y) (TLB_IS_GLOBAL(x) || \ @@ -640,6 +641,20 @@ pgd_t *kvm_pgd_alloc(void); void kvm_mmu_free_memory_caches(struct kvm_vcpu *vcpu); void kvm_trap_emul_invalidate_gva(struct kvm_vcpu *vcpu, unsigned long addr, bool user); +void kvm_trap_emul_gva_lockless_begin(struct kvm_vcpu *vcpu); +void kvm_trap_emul_gva_lockless_end(struct kvm_vcpu *vcpu); + +enum kvm_mips_fault_result { + KVM_MIPS_MAPPED = 0, + KVM_MIPS_GVA, + KVM_MIPS_GPA, + KVM_MIPS_TLB, + KVM_MIPS_TLBINV, + KVM_MIPS_TLBMOD, +}; +enum kvm_mips_fault_result kvm_trap_emul_gva_fault(struct kvm_vcpu *vcpu, + unsigned long gva, + bool write); /* Emulation */ int kvm_get_inst(u32 *opc, struct kvm_vcpu *vcpu, u32 *out); diff --git a/arch/mips/kvm/mmu.c b/arch/mips/kvm/mmu.c index e41ee36dd626..32c317de6c0a 100644 --- a/arch/mips/kvm/mmu.c +++ b/arch/mips/kvm/mmu.c @@ -732,6 +732,57 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) local_irq_restore(flags); } +/** + * kvm_trap_emul_gva_fault() - Safely attempt to handle a GVA access fault. + * @vcpu: Virtual CPU. + * @gva: Guest virtual address to be accessed. + * @write: True if write attempted (must be dirtied and made writable). + * + * Safely attempt to handle a GVA fault, mapping GVA pages if necessary, and + * dirtying the page if @write so that guest instructions can be modified. + * + * Returns: KVM_MIPS_MAPPED on success. + * KVM_MIPS_GVA if bad guest virtual address. + * KVM_MIPS_GPA if bad guest physical address. + * KVM_MIPS_TLB if guest TLB not present. + * KVM_MIPS_TLBINV if guest TLB present but not valid. + * KVM_MIPS_TLBMOD if guest TLB read only. 
+ */ +enum kvm_mips_fault_result kvm_trap_emul_gva_fault(struct kvm_vcpu *vcpu, + unsigned long gva, + bool write) +{ + struct mips_coproc *cop0 = vcpu->arch.cop0; + struct kvm_mips_tlb *tlb; + int index; + + if (KVM_GUEST_KSEGX(gva) == KVM_GUEST_KSEG0) { + if (kvm_mips_handle_kseg0_tlb_fault(gva, vcpu) < 0) + return KVM_MIPS_GPA; + } else if ((KVM_GUEST_KSEGX(gva) < KVM_GUEST_KSEG0) || + KVM_GUEST_KSEGX(gva) == KVM_GUEST_KSEG23) { + /* Address should be in the guest TLB */ + index = kvm_mips_guest_tlb_lookup(vcpu, (gva & VPN2_MASK) | + (kvm_read_c0_guest_entryhi(cop0) & KVM_ENTRYHI_ASID)); + if (index < 0) + return KVM_MIPS_TLB; + tlb = &vcpu->arch.guest_tlb[index]; + + /* Entry should be valid, and dirty for writes */ + if (!TLB_IS_VALID(*tlb, gva)) + return KVM_MIPS_TLBINV; + if (write && !TLB_IS_DIRTY(*tlb, gva)) + return KVM_MIPS_TLBMOD; + + if (kvm_mips_handle_mapped_seg_tlb_fault(vcpu, tlb, gva)) + return KVM_MIPS_GPA; + } else { + return KVM_MIPS_GVA; + } + + return KVM_MIPS_MAPPED; +} + int kvm_get_inst(u32 *opc, struct kvm_vcpu *vcpu, u32 *out) { int err; diff --git a/arch/mips/kvm/trap_emul.c b/arch/mips/kvm/trap_emul.c index edda8f039026..e20369d45f24 100644 --- a/arch/mips/kvm/trap_emul.c +++ b/arch/mips/kvm/trap_emul.c @@ -786,6 +786,71 @@ static void kvm_trap_emul_check_requests(struct kvm_vcpu *vcpu, int cpu, } } +/** + * kvm_trap_emul_gva_lockless_begin() - Begin lockless access to GVA space. + * @vcpu: VCPU pointer. + * + * Call before a GVA space access outside of guest mode, to ensure that + * asynchronous TLB flush requests are handled or delayed until completion of + * the GVA access (as indicated by a matching kvm_trap_emul_gva_lockless_end()). + * + * Should be called with IRQs already enabled. + */ +void kvm_trap_emul_gva_lockless_begin(struct kvm_vcpu *vcpu) +{ + /* We re-enable IRQs in kvm_trap_emul_gva_lockless_end() */ + WARN_ON_ONCE(irqs_disabled()); + + /* + * The caller is about to access the GVA space, so we set the mode to + * force TLB flush requests to send an IPI, and also disable IRQs to + * delay IPI handling until kvm_trap_emul_gva_lockless_end(). + */ + local_irq_disable(); + + /* + * Make sure the read of VCPU requests is not reordered ahead of the + * write to vcpu->mode, or we could miss a TLB flush request while + * the requester sees the VCPU as outside of guest mode and not needing + * an IPI. + */ + smp_store_mb(vcpu->mode, READING_SHADOW_PAGE_TABLES); + + /* + * If a TLB flush has been requested (potentially while + * OUTSIDE_GUEST_MODE and assumed immediately effective), perform it + * before accessing the GVA space, and be sure to reload the ASID if + * necessary as it'll be immediately used. + * + * TLB flush requests after this check will trigger an IPI due to the + * mode change above, which will be delayed due to IRQs disabled. + */ + kvm_trap_emul_check_requests(vcpu, smp_processor_id(), true); +} + +/** + * kvm_trap_emul_gva_lockless_end() - End lockless access to GVA space. + * @vcpu: VCPU pointer. + * + * Called after a GVA space access outside of guest mode. Should have a matching + * call to kvm_trap_emul_gva_lockless_begin(). + */ +void kvm_trap_emul_gva_lockless_end(struct kvm_vcpu *vcpu) +{ + /* + * Make sure the write to vcpu->mode is not reordered in front of GVA + * accesses, or a TLB flush requester may not think it necessary to send + * an IPI. 
+ */ + smp_store_release(&vcpu->mode, OUTSIDE_GUEST_MODE); + + /* + * Now that the access to GVA space is complete, its safe for pending + * TLB flush request IPIs to be handled (which indicates completion). + */ + local_irq_enable(); +} + static void kvm_trap_emul_vcpu_reenter(struct kvm_run *run, struct kvm_vcpu *vcpu) { -- cgit v1.2.3 From 4b21e8abf959ca66c27f0656bf294fe69d3f2254 Mon Sep 17 00:00:00 2001 From: James Hogan Date: Mon, 28 Nov 2016 23:13:38 +0000 Subject: KVM: MIPS/T&E: Use lockless GVA helpers for dyntrans MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Use the lockless GVA helpers to implement the dynamic translation of guest instructions. This will allow it to handle asynchronous TLB flushes when they are implemented. Signed-off-by: James Hogan Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: Ralf Baechle Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org --- arch/mips/kvm/dyntrans.c | 26 ++++++++++++++++++++++++-- 1 file changed, 24 insertions(+), 2 deletions(-) diff --git a/arch/mips/kvm/dyntrans.c b/arch/mips/kvm/dyntrans.c index 60ebf5862d2b..f8e772564d74 100644 --- a/arch/mips/kvm/dyntrans.c +++ b/arch/mips/kvm/dyntrans.c @@ -33,10 +33,32 @@ static int kvm_mips_trans_replace(struct kvm_vcpu *vcpu, u32 *opc, unsigned long vaddr = (unsigned long)opc; int err; +retry: + /* The GVA page table is still active so use the Linux TLB handlers */ + kvm_trap_emul_gva_lockless_begin(vcpu); err = put_user(replace.word, opc); + kvm_trap_emul_gva_lockless_end(vcpu); + if (unlikely(err)) { - kvm_err("%s: Invalid address: %p\n", __func__, opc); - return err; + /* + * We write protect clean pages in GVA page table so normal + * Linux TLB mod handler doesn't silently dirty the page. + * Its also possible we raced with a GVA invalidation. + * Try to force the page to become dirty. + */ + err = kvm_trap_emul_gva_fault(vcpu, vaddr, true); + if (unlikely(err)) { + kvm_info("%s: Address unwriteable: %p\n", + __func__, opc); + return -EFAULT; + } + + /* + * Try again. This will likely trigger a TLB refill, which will + * fetch the new dirty entry from the GVA page table, which + * should then succeed. + */ + goto retry; } __local_flush_icache_user_range(vaddr, vaddr + 4); -- cgit v1.2.3 From 5207ce144a25aef89dd12b8fc3ccaa53aba4f2bd Mon Sep 17 00:00:00 2001 From: James Hogan Date: Mon, 28 Nov 2016 23:15:53 +0000 Subject: KVM: MIPS/MMU: Use lockless GVA helpers for get_inst() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Use the lockless GVA helpers to implement the reading of guest instructions for emulation. This will allow it to handle asynchronous TLB flushes when they are implemented. Signed-off-by: James Hogan Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: Ralf Baechle Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org --- arch/mips/kvm/mmu.c | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/arch/mips/kvm/mmu.c b/arch/mips/kvm/mmu.c index 32c317de6c0a..b3da473e1569 100644 --- a/arch/mips/kvm/mmu.c +++ b/arch/mips/kvm/mmu.c @@ -787,11 +787,26 @@ int kvm_get_inst(u32 *opc, struct kvm_vcpu *vcpu, u32 *out) { int err; +retry: + kvm_trap_emul_gva_lockless_begin(vcpu); err = get_user(*out, opc); + kvm_trap_emul_gva_lockless_end(vcpu); + if (unlikely(err)) { - kvm_err("%s: illegal address: %p\n", __func__, opc); - return -EFAULT; - } + /* + * Try to handle the fault, maybe we just raced with a GVA + * invalidation. 
+ */ + err = kvm_trap_emul_gva_fault(vcpu, (unsigned long)opc, + false); + if (unlikely(err)) { + kvm_err("%s: illegal address: %p\n", + __func__, opc); + return -EFAULT; + } + /* Hopefully it'll work now */ + goto retry; + } return 0; } -- cgit v1.2.3 From 4cf74c9c83dda79143490d7cc774b7830e257fcd Mon Sep 17 00:00:00 2001 From: James Hogan Date: Sat, 26 Nov 2016 00:37:28 +0000 Subject: KVM: MIPS/Emulate: Use lockless GVA helpers for cache emulation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Use the lockless GVA helpers to implement the reading of guest instructions for emulation. This will allow it to handle asynchronous TLB flushes when they are implemented. This is a little more complicated than the other two cases (get_inst() and dynamic translation) due to the need to emulate the appropriate guest TLB exception when the address isn't present or isn't valid in the guest TLB. Since there are several protected cache ops that may need to be performed safely, this is abstracted by kvm_mips_guest_cache_op() which is passed a protected cache op function pointer and takes care of the lockless operation and fault handling / retry if the op should fail, taking advantage of the new errors which the protected cache ops can now return. This allows the existing advance fault handling which relied on host TLB lookups to be removed, along with the now unused kvm_mips_host_tlb_lookup(), Signed-off-by: James Hogan Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: Ralf Baechle Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org --- arch/mips/include/asm/kvm_host.h | 2 +- arch/mips/kvm/emulate.c | 148 +++++++++++++++++---------------------- arch/mips/kvm/tlb.c | 35 --------- 3 files changed, 66 insertions(+), 119 deletions(-) diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h index c1e46abb5704..33d3d8ac742e 100644 --- a/arch/mips/include/asm/kvm_host.h +++ b/arch/mips/include/asm/kvm_host.h @@ -226,6 +226,7 @@ enum emulation_result { EMULATE_FAIL, /* can't emulate this instruction */ EMULATE_WAIT, /* WAIT instruction */ EMULATE_PRIV_FAIL, + EMULATE_EXCEPT, /* A guest exception has been generated */ }; #define mips3_paddr_to_tlbpfn(x) \ @@ -614,7 +615,6 @@ extern int kvm_mips_host_tlb_inv(struct kvm_vcpu *vcpu, unsigned long entryhi, extern int kvm_mips_guest_tlb_lookup(struct kvm_vcpu *vcpu, unsigned long entryhi); -extern int kvm_mips_host_tlb_lookup(struct kvm_vcpu *vcpu, unsigned long vaddr); void kvm_mips_suspend_mm(int cpu); void kvm_mips_resume_mm(int cpu); diff --git a/arch/mips/kvm/emulate.c b/arch/mips/kvm/emulate.c index b295a4a1496f..40159cf5166b 100644 --- a/arch/mips/kvm/emulate.c +++ b/arch/mips/kvm/emulate.c @@ -1697,12 +1697,56 @@ enum emulation_result kvm_mips_emulate_load(union mips_instruction inst, return er; } +static enum emulation_result kvm_mips_guest_cache_op(int (*fn)(unsigned long), + unsigned long curr_pc, + unsigned long addr, + struct kvm_run *run, + struct kvm_vcpu *vcpu, + u32 cause) +{ + int err; + + for (;;) { + /* Carefully attempt the cache operation */ + kvm_trap_emul_gva_lockless_begin(vcpu); + err = fn(addr); + kvm_trap_emul_gva_lockless_end(vcpu); + + if (likely(!err)) + return EMULATE_DONE; + + /* + * Try to handle the fault and retry, maybe we just raced with a + * GVA invalidation. 
+ */ + switch (kvm_trap_emul_gva_fault(vcpu, addr, false)) { + case KVM_MIPS_GVA: + case KVM_MIPS_GPA: + /* bad virtual or physical address */ + return EMULATE_FAIL; + case KVM_MIPS_TLB: + /* no matching guest TLB */ + vcpu->arch.host_cp0_badvaddr = addr; + vcpu->arch.pc = curr_pc; + kvm_mips_emulate_tlbmiss_ld(cause, NULL, run, vcpu); + return EMULATE_EXCEPT; + case KVM_MIPS_TLBINV: + /* invalid matching guest TLB */ + vcpu->arch.host_cp0_badvaddr = addr; + vcpu->arch.pc = curr_pc; + kvm_mips_emulate_tlbinv_ld(cause, NULL, run, vcpu); + return EMULATE_EXCEPT; + default: + break; + }; + } +} + enum emulation_result kvm_mips_emulate_cache(union mips_instruction inst, u32 *opc, u32 cause, struct kvm_run *run, struct kvm_vcpu *vcpu) { - struct mips_coproc *cop0 = vcpu->arch.cop0; enum emulation_result er = EMULATE_DONE; u32 cache, op_inst, op, base; s16 offset; @@ -1759,81 +1803,16 @@ enum emulation_result kvm_mips_emulate_cache(union mips_instruction inst, goto done; } - preempt_disable(); - if (KVM_GUEST_KSEGX(va) == KVM_GUEST_KSEG0) { - if (kvm_mips_host_tlb_lookup(vcpu, va) < 0 && - kvm_mips_handle_kseg0_tlb_fault(va, vcpu)) { - kvm_err("%s: handling mapped kseg0 tlb fault for %lx, vcpu: %p, ASID: %#lx\n", - __func__, va, vcpu, read_c0_entryhi()); - er = EMULATE_FAIL; - preempt_enable(); - goto done; - } - } else if ((KVM_GUEST_KSEGX(va) < KVM_GUEST_KSEG0) || - KVM_GUEST_KSEGX(va) == KVM_GUEST_KSEG23) { - int index; - - /* If an entry already exists then skip */ - if (kvm_mips_host_tlb_lookup(vcpu, va) >= 0) - goto skip_fault; - - /* - * If address not in the guest TLB, then give the guest a fault, - * the resulting handler will do the right thing - */ - index = kvm_mips_guest_tlb_lookup(vcpu, (va & VPN2_MASK) | - (kvm_read_c0_guest_entryhi - (cop0) & KVM_ENTRYHI_ASID)); - - if (index < 0) { - vcpu->arch.host_cp0_badvaddr = va; - vcpu->arch.pc = curr_pc; - er = kvm_mips_emulate_tlbmiss_ld(cause, NULL, run, - vcpu); - preempt_enable(); - goto dont_update_pc; - } else { - struct kvm_mips_tlb *tlb = &vcpu->arch.guest_tlb[index]; - /* - * Check if the entry is valid, if not then setup a TLB - * invalid exception to the guest - */ - if (!TLB_IS_VALID(*tlb, va)) { - vcpu->arch.host_cp0_badvaddr = va; - vcpu->arch.pc = curr_pc; - er = kvm_mips_emulate_tlbinv_ld(cause, NULL, - run, vcpu); - preempt_enable(); - goto dont_update_pc; - } - /* - * We fault an entry from the guest tlb to the - * shadow host TLB - */ - if (kvm_mips_handle_mapped_seg_tlb_fault(vcpu, tlb, - va)) { - kvm_err("%s: handling mapped seg tlb fault for %lx, index: %u, vcpu: %p, ASID: %#lx\n", - __func__, va, index, vcpu, - read_c0_entryhi()); - er = EMULATE_FAIL; - preempt_enable(); - goto done; - } - } - } else { - kvm_err("INVALID CACHE INDEX/ADDRESS (cache: %#x, op: %#x, base[%d]: %#lx, offset: %#x\n", - cache, op, base, arch->gprs[base], offset); - er = EMULATE_FAIL; - preempt_enable(); - goto done; - - } - -skip_fault: /* XXXKYMA: Only a subset of cache ops are supported, used by Linux */ if (op_inst == Hit_Writeback_Inv_D || op_inst == Hit_Invalidate_D) { - protected_writeback_dcache_line(va); - + /* + * Perform the dcache part of icache synchronisation on the + * guest's behalf. 
+ */ + er = kvm_mips_guest_cache_op(protected_writeback_dcache_line, + curr_pc, va, run, vcpu, cause); + if (er != EMULATE_DONE) + goto done; #ifdef CONFIG_KVM_MIPS_DYN_TRANS /* * Replace the CACHE instruction, with a SYNCI, not the same, @@ -1842,8 +1821,15 @@ skip_fault: kvm_mips_trans_cache_va(inst, opc, vcpu); #endif } else if (op_inst == Hit_Invalidate_I) { - protected_writeback_dcache_line(va); - protected_flush_icache_line(va); + /* Perform the icache synchronisation on the guest's behalf */ + er = kvm_mips_guest_cache_op(protected_writeback_dcache_line, + curr_pc, va, run, vcpu, cause); + if (er != EMULATE_DONE) + goto done; + er = kvm_mips_guest_cache_op(protected_flush_icache_line, + curr_pc, va, run, vcpu, cause); + if (er != EMULATE_DONE) + goto done; #ifdef CONFIG_KVM_MIPS_DYN_TRANS /* Replace the CACHE instruction, with a SYNCI */ @@ -1855,17 +1841,13 @@ skip_fault: er = EMULATE_FAIL; } - preempt_enable(); done: /* Rollback PC only if emulation was unsuccessful */ if (er == EMULATE_FAIL) vcpu->arch.pc = curr_pc; - -dont_update_pc: - /* - * This is for exceptions whose emulation updates the PC, so do not - * overwrite the PC under any circumstances - */ + /* Guest exception needs guest to resume */ + if (er == EMULATE_EXCEPT) + er = EMULATE_DONE; return er; } diff --git a/arch/mips/kvm/tlb.c b/arch/mips/kvm/tlb.c index cee2e9feb942..2819eb793345 100644 --- a/arch/mips/kvm/tlb.c +++ b/arch/mips/kvm/tlb.c @@ -117,41 +117,6 @@ int kvm_mips_guest_tlb_lookup(struct kvm_vcpu *vcpu, unsigned long entryhi) } EXPORT_SYMBOL_GPL(kvm_mips_guest_tlb_lookup); -int kvm_mips_host_tlb_lookup(struct kvm_vcpu *vcpu, unsigned long vaddr) -{ - unsigned long old_entryhi, flags; - int idx; - - local_irq_save(flags); - - old_entryhi = read_c0_entryhi(); - - if (KVM_GUEST_KERNEL_MODE(vcpu)) - write_c0_entryhi((vaddr & VPN2_MASK) | - kvm_mips_get_kernel_asid(vcpu)); - else { - write_c0_entryhi((vaddr & VPN2_MASK) | - kvm_mips_get_user_asid(vcpu)); - } - - mtc0_tlbw_hazard(); - - tlb_probe(); - tlb_probe_hazard(); - idx = read_c0_index(); - - /* Restore old ASID */ - write_c0_entryhi(old_entryhi); - mtc0_tlbw_hazard(); - - local_irq_restore(flags); - - kvm_debug("Host TLB lookup, %#lx, idx: %2d\n", vaddr, idx); - - return idx; -} -EXPORT_SYMBOL_GPL(kvm_mips_host_tlb_lookup); - static int _kvm_mips_host_tlb_inv(unsigned long entryhi) { int idx; -- cgit v1.2.3 From b6209110863363b55dd60fe28a993e5367d4a215 Mon Sep 17 00:00:00 2001 From: James Hogan Date: Tue, 25 Oct 2016 00:01:37 +0100 Subject: KVM: MIPS: Implement kvm_arch_flush_shadow_all/memslot MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Implement the kvm_arch_flush_shadow_all() and kvm_arch_flush_shadow_memslot() KVM functions for MIPS to allow guest physical mappings to be safely changed. The general MIPS KVM code takes care of flushing of GPA page table entries. kvm_arch_flush_shadow_all() flushes the whole GPA page table, and is always called on the cleanup path so there is no need to acquire the kvm->mmu_lock. kvm_arch_flush_shadow_memslot() flushes only the range of mappings in the GPA page table corresponding to the slot being flushed, and happens when memory regions are moved or deleted. MIPS KVM implementation callbacks are added for handling the implementation specific flushing of mappings derived from the GPA page tables. 
These are implemented for trap_emul.c using kvm_flush_remote_tlbs() which should now be functional, and will flush the per-VCPU GVA page tables and ASIDS synchronously (before next entering guest mode or directly accessing GVA space). Signed-off-by: James Hogan Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: Ralf Baechle Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org --- arch/mips/include/asm/kvm_host.h | 11 ++++++++--- arch/mips/kvm/mips.c | 26 ++++++++++++++++++++++++++ arch/mips/kvm/trap_emul.c | 14 ++++++++++++++ 3 files changed, 48 insertions(+), 3 deletions(-) diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h index 33d3d8ac742e..ea1b495c042c 100644 --- a/arch/mips/include/asm/kvm_host.h +++ b/arch/mips/include/asm/kvm_host.h @@ -531,6 +531,14 @@ struct kvm_mips_callbacks { int (*vcpu_init)(struct kvm_vcpu *vcpu); void (*vcpu_uninit)(struct kvm_vcpu *vcpu); int (*vcpu_setup)(struct kvm_vcpu *vcpu); + void (*flush_shadow_all)(struct kvm *kvm); + /* + * Must take care of flushing any cached GPA PTEs (e.g. guest entries in + * VZ root TLB, or T&E GVA page tables and corresponding root TLB + * mappings). + */ + void (*flush_shadow_memslot)(struct kvm *kvm, + const struct kvm_memory_slot *slot); gpa_t (*gva_to_gpa)(gva_t gva); void (*queue_timer_int)(struct kvm_vcpu *vcpu); void (*dequeue_timer_int)(struct kvm_vcpu *vcpu); @@ -827,9 +835,6 @@ static inline void kvm_arch_sync_events(struct kvm *kvm) {} static inline void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free, struct kvm_memory_slot *dont) {} static inline void kvm_arch_memslots_updated(struct kvm *kvm, struct kvm_memslots *slots) {} -static inline void kvm_arch_flush_shadow_all(struct kvm *kvm) {} -static inline void kvm_arch_flush_shadow_memslot(struct kvm *kvm, - struct kvm_memory_slot *slot) {} static inline void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) {} static inline void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu) {} static inline void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu) {} diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c index ff5e34293227..01f3fa1b9f0e 100644 --- a/arch/mips/kvm/mips.c +++ b/arch/mips/kvm/mips.c @@ -157,6 +157,32 @@ int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot, return 0; } +void kvm_arch_flush_shadow_all(struct kvm *kvm) +{ + /* Flush whole GPA */ + kvm_mips_flush_gpa_pt(kvm, 0, ~0); + + /* Let implementation do the rest */ + kvm_mips_callbacks->flush_shadow_all(kvm); +} + +void kvm_arch_flush_shadow_memslot(struct kvm *kvm, + struct kvm_memory_slot *slot) +{ + /* + * The slot has been made invalid (ready for moving or deletion), so we + * need to ensure that it can no longer be accessed by any guest VCPUs. 
+ */ + + spin_lock(&kvm->mmu_lock); + /* Flush slot from GPA */ + kvm_mips_flush_gpa_pt(kvm, slot->base_gfn, + slot->base_gfn + slot->npages - 1); + /* Let implementation do the rest */ + kvm_mips_callbacks->flush_shadow_memslot(kvm, slot); + spin_unlock(&kvm->mmu_lock); +} + int kvm_arch_prepare_memory_region(struct kvm *kvm, struct kvm_memory_slot *memslot, const struct kvm_userspace_memory_region *mem, diff --git a/arch/mips/kvm/trap_emul.c b/arch/mips/kvm/trap_emul.c index e20369d45f24..1efe78d4bda8 100644 --- a/arch/mips/kvm/trap_emul.c +++ b/arch/mips/kvm/trap_emul.c @@ -586,6 +586,18 @@ static int kvm_trap_emul_vcpu_setup(struct kvm_vcpu *vcpu) return 0; } +static void kvm_trap_emul_flush_shadow_all(struct kvm *kvm) +{ + /* Flush GVA page tables and invalidate GVA ASIDs on all VCPUs */ + kvm_flush_remote_tlbs(kvm); +} + +static void kvm_trap_emul_flush_shadow_memslot(struct kvm *kvm, + const struct kvm_memory_slot *slot) +{ + kvm_trap_emul_flush_shadow_all(kvm); +} + static unsigned long kvm_trap_emul_num_regs(struct kvm_vcpu *vcpu) { return 0; @@ -963,6 +975,8 @@ static struct kvm_mips_callbacks kvm_trap_emul_callbacks = { .vcpu_init = kvm_trap_emul_vcpu_init, .vcpu_uninit = kvm_trap_emul_vcpu_uninit, .vcpu_setup = kvm_trap_emul_vcpu_setup, + .flush_shadow_all = kvm_trap_emul_flush_shadow_all, + .flush_shadow_memslot = kvm_trap_emul_flush_shadow_memslot, .gva_to_gpa = kvm_trap_emul_gva_to_gpa_cb, .queue_timer_int = kvm_mips_queue_timer_int_cb, .dequeue_timer_int = kvm_mips_dequeue_timer_int_cb, -- cgit v1.2.3 From 89d6ad8a6b26a51f6fdfd356a56681a11f309bba Mon Sep 17 00:00:00 2001 From: James Hogan Date: Wed, 14 Dec 2016 01:58:44 +0000 Subject: KVM: MIPS/T&E: Ignore user writes to CP0_Config7 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Ignore userland writes to CP0_Config7 rather than reporting an error, since we do allow reads of this register and it is claimed to exist in the ioctl API. This allows userland to blindly save and restore KVM registers without having to special case certain registers as not being writable, for example during live migration once dirty page logging is fixed. Signed-off-by: James Hogan Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: Ralf Baechle Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org --- arch/mips/kvm/trap_emul.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/mips/kvm/trap_emul.c b/arch/mips/kvm/trap_emul.c index 1efe78d4bda8..d0b6409cb4a3 100644 --- a/arch/mips/kvm/trap_emul.c +++ b/arch/mips/kvm/trap_emul.c @@ -705,6 +705,9 @@ static int kvm_trap_emul_set_one_reg(struct kvm_vcpu *vcpu, kvm_write_c0_guest_config5(cop0, v); } break; + case KVM_REG_MIPS_CP0_CONFIG7: + /* writes ignored */ + break; case KVM_REG_MIPS_COUNT_CTL: ret = kvm_mips_set_count_ctl(vcpu, v); break; -- cgit v1.2.3 From 577ed7f71e9c37a46c45a7bd9a392dd0372a409c Mon Sep 17 00:00:00 2001 From: James Hogan Date: Fri, 1 May 2015 14:56:31 +0100 Subject: KVM: MIPS: Pass type of fault down to kvm_mips_map_page() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit kvm_mips_map_page() will need to know whether the fault was due to a read or a write in order to support dirty page tracking, KVM_CAP_SYNC_MMU, and read only memory regions, so get that information passed down to it via new bool write_fault arguments to various functions. 
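At the interface level this is simply an extra parameter; for example the map function becomes:

    static int kvm_mips_map_page(struct kvm_vcpu *vcpu, unsigned long gpa,
                                 bool write_fault,
                                 pte_t *out_entry, pte_t *out_buddy);
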
Signed-off-by: James Hogan Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: Ralf Baechle Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org --- arch/mips/include/asm/kvm_host.h | 9 ++++++--- arch/mips/kvm/emulate.c | 7 ++++--- arch/mips/kvm/mmu.c | 21 +++++++++++++-------- arch/mips/kvm/trap_emul.c | 4 ++-- 4 files changed, 25 insertions(+), 16 deletions(-) diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h index ea1b495c042c..17db9d6c7c93 100644 --- a/arch/mips/include/asm/kvm_host.h +++ b/arch/mips/include/asm/kvm_host.h @@ -597,19 +597,22 @@ u32 kvm_get_user_asid(struct kvm_vcpu *vcpu); u32 kvm_get_commpage_asid (struct kvm_vcpu *vcpu); extern int kvm_mips_handle_kseg0_tlb_fault(unsigned long badbaddr, - struct kvm_vcpu *vcpu); + struct kvm_vcpu *vcpu, + bool write_fault); extern int kvm_mips_handle_commpage_tlb_fault(unsigned long badvaddr, struct kvm_vcpu *vcpu); extern int kvm_mips_handle_mapped_seg_tlb_fault(struct kvm_vcpu *vcpu, struct kvm_mips_tlb *tlb, - unsigned long gva); + unsigned long gva, + bool write_fault); extern enum emulation_result kvm_mips_handle_tlbmiss(u32 cause, u32 *opc, struct kvm_run *run, - struct kvm_vcpu *vcpu); + struct kvm_vcpu *vcpu, + bool write_fault); extern enum emulation_result kvm_mips_handle_tlbmod(u32 cause, u32 *opc, diff --git a/arch/mips/kvm/emulate.c b/arch/mips/kvm/emulate.c index 40159cf5166b..4875afca3f26 100644 --- a/arch/mips/kvm/emulate.c +++ b/arch/mips/kvm/emulate.c @@ -2704,7 +2704,8 @@ enum emulation_result kvm_mips_check_privilege(u32 cause, enum emulation_result kvm_mips_handle_tlbmiss(u32 cause, u32 *opc, struct kvm_run *run, - struct kvm_vcpu *vcpu) + struct kvm_vcpu *vcpu, + bool write_fault) { enum emulation_result er = EMULATE_DONE; u32 exccode = (cause >> CAUSEB_EXCCODE) & 0x1f; @@ -2760,8 +2761,8 @@ enum emulation_result kvm_mips_handle_tlbmiss(u32 cause, * OK we have a Guest TLB entry, now inject it into the * shadow host TLB */ - if (kvm_mips_handle_mapped_seg_tlb_fault(vcpu, tlb, - va)) { + if (kvm_mips_handle_mapped_seg_tlb_fault(vcpu, tlb, va, + write_fault)) { kvm_err("%s: handling mapped seg tlb fault for %lx, index: %u, vcpu: %p, ASID: %#lx\n", __func__, va, index, vcpu, read_c0_entryhi()); diff --git a/arch/mips/kvm/mmu.c b/arch/mips/kvm/mmu.c index b3da473e1569..1af65f2e6bb7 100644 --- a/arch/mips/kvm/mmu.c +++ b/arch/mips/kvm/mmu.c @@ -308,6 +308,7 @@ bool kvm_mips_flush_gpa_pt(struct kvm *kvm, gfn_t start_gfn, gfn_t end_gfn) * kvm_mips_map_page() - Map a guest physical page. * @vcpu: VCPU pointer. * @gpa: Guest physical address of fault. + * @write_fault: Whether the fault was due to a write. * @out_entry: New PTE for @gpa (written on success unless NULL). * @out_buddy: New PTE for @gpa's buddy (written on success unless * NULL). @@ -327,6 +328,7 @@ bool kvm_mips_flush_gpa_pt(struct kvm *kvm, gfn_t start_gfn, gfn_t end_gfn) * as an MMIO access. 
*/ static int kvm_mips_map_page(struct kvm_vcpu *vcpu, unsigned long gpa, + bool write_fault, pte_t *out_entry, pte_t *out_buddy) { struct kvm *kvm = vcpu->kvm; @@ -558,7 +560,8 @@ void kvm_mips_flush_gva_pt(pgd_t *pgd, enum kvm_mips_flush flags) /* XXXKYMA: Must be called with interrupts disabled */ int kvm_mips_handle_kseg0_tlb_fault(unsigned long badvaddr, - struct kvm_vcpu *vcpu) + struct kvm_vcpu *vcpu, + bool write_fault) { unsigned long gpa; kvm_pfn_t pfn0, pfn1; @@ -576,10 +579,11 @@ int kvm_mips_handle_kseg0_tlb_fault(unsigned long badvaddr, gpa = KVM_GUEST_CPHYSADDR(badvaddr & (PAGE_MASK << 1)); vaddr = badvaddr & (PAGE_MASK << 1); - if (kvm_mips_map_page(vcpu, gpa, &pte_gpa[0], NULL) < 0) + if (kvm_mips_map_page(vcpu, gpa, write_fault, &pte_gpa[0], NULL) < 0) return -1; - if (kvm_mips_map_page(vcpu, gpa | PAGE_SIZE, &pte_gpa[1], NULL) < 0) + if (kvm_mips_map_page(vcpu, gpa | PAGE_SIZE, write_fault, &pte_gpa[1], + NULL) < 0) return -1; pfn0 = pte_pfn(pte_gpa[0]); @@ -604,7 +608,8 @@ int kvm_mips_handle_kseg0_tlb_fault(unsigned long badvaddr, int kvm_mips_handle_mapped_seg_tlb_fault(struct kvm_vcpu *vcpu, struct kvm_mips_tlb *tlb, - unsigned long gva) + unsigned long gva, + bool write_fault) { kvm_pfn_t pfn; long tlb_lo = 0; @@ -621,8 +626,8 @@ int kvm_mips_handle_mapped_seg_tlb_fault(struct kvm_vcpu *vcpu, tlb_lo = tlb->tlb_lo[idx]; /* Find host PFN */ - if (kvm_mips_map_page(vcpu, mips3_tlbpfn_to_paddr(tlb_lo), &pte_gpa, - NULL) < 0) + if (kvm_mips_map_page(vcpu, mips3_tlbpfn_to_paddr(tlb_lo), write_fault, + &pte_gpa, NULL) < 0) return -1; pfn = pte_pfn(pte_gpa); @@ -757,7 +762,7 @@ enum kvm_mips_fault_result kvm_trap_emul_gva_fault(struct kvm_vcpu *vcpu, int index; if (KVM_GUEST_KSEGX(gva) == KVM_GUEST_KSEG0) { - if (kvm_mips_handle_kseg0_tlb_fault(gva, vcpu) < 0) + if (kvm_mips_handle_kseg0_tlb_fault(gva, vcpu, write) < 0) return KVM_MIPS_GPA; } else if ((KVM_GUEST_KSEGX(gva) < KVM_GUEST_KSEG0) || KVM_GUEST_KSEGX(gva) == KVM_GUEST_KSEG23) { @@ -774,7 +779,7 @@ enum kvm_mips_fault_result kvm_trap_emul_gva_fault(struct kvm_vcpu *vcpu, if (write && !TLB_IS_DIRTY(*tlb, gva)) return KVM_MIPS_TLBMOD; - if (kvm_mips_handle_mapped_seg_tlb_fault(vcpu, tlb, gva)) + if (kvm_mips_handle_mapped_seg_tlb_fault(vcpu, tlb, gva, write)) return KVM_MIPS_GPA; } else { return KVM_MIPS_GVA; diff --git a/arch/mips/kvm/trap_emul.c b/arch/mips/kvm/trap_emul.c index d0b6409cb4a3..070d1ddbc7ee 100644 --- a/arch/mips/kvm/trap_emul.c +++ b/arch/mips/kvm/trap_emul.c @@ -159,7 +159,7 @@ static int kvm_trap_emul_handle_tlb_miss(struct kvm_vcpu *vcpu, bool store) * into the shadow host TLB */ - er = kvm_mips_handle_tlbmiss(cause, opc, run, vcpu); + er = kvm_mips_handle_tlbmiss(cause, opc, run, vcpu, store); if (er == EMULATE_DONE) ret = RESUME_GUEST; else { @@ -172,7 +172,7 @@ static int kvm_trap_emul_handle_tlb_miss(struct kvm_vcpu *vcpu, bool store) * not expect to ever get them */ if (kvm_mips_handle_kseg0_tlb_fault - (vcpu->arch.host_cp0_badvaddr, vcpu) < 0) { + (vcpu->arch.host_cp0_badvaddr, vcpu, store) < 0) { run->exit_reason = KVM_EXIT_INTERNAL_ERROR; ret = RESUME_HOST; } -- cgit v1.2.3 From 420ea09b645b0fb05b326a539190c41ee900ef50 Mon Sep 17 00:00:00 2001 From: James Hogan Date: Tue, 6 Dec 2016 19:27:18 +0000 Subject: KVM: MIPS/T&E: Abstract bad access handling MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Abstract the handling of bad guest loads and stores which may need to trigger an MMIO, so that the same code can be used in a later patch for guest KSeg0 
addresses (TLB exception handling) as well as for host KSeg1 addresses (existing address error exception and TLB exception handling). We now use kvm_mips_emulate_store() and kvm_mips_emulate_load() directly rather than the more generic kvm_mips_emulate_inst(), as there is no need to expose emulation of any other instructions. Signed-off-by: James Hogan Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: Ralf Baechle Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org --- arch/mips/kvm/trap_emul.c | 119 ++++++++++++++++++++++++++++------------------ 1 file changed, 72 insertions(+), 47 deletions(-) diff --git a/arch/mips/kvm/trap_emul.c b/arch/mips/kvm/trap_emul.c index 070d1ddbc7ee..ae971ae30e30 100644 --- a/arch/mips/kvm/trap_emul.c +++ b/arch/mips/kvm/trap_emul.c @@ -85,6 +85,75 @@ static int kvm_trap_emul_handle_cop_unusable(struct kvm_vcpu *vcpu) return ret; } +static int kvm_mips_bad_load(u32 cause, u32 *opc, struct kvm_run *run, + struct kvm_vcpu *vcpu) +{ + enum emulation_result er; + union mips_instruction inst; + int err; + + /* A code fetch fault doesn't count as an MMIO */ + if (kvm_is_ifetch_fault(&vcpu->arch)) { + run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + return RESUME_HOST; + } + + /* Fetch the instruction. */ + if (cause & CAUSEF_BD) + opc += 1; + err = kvm_get_badinstr(opc, vcpu, &inst.word); + if (err) { + run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + return RESUME_HOST; + } + + /* Emulate the load */ + er = kvm_mips_emulate_load(inst, cause, run, vcpu); + if (er == EMULATE_FAIL) { + kvm_err("Emulate load from MMIO space failed\n"); + run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + } else { + run->exit_reason = KVM_EXIT_MMIO; + } + return RESUME_HOST; +} + +static int kvm_mips_bad_store(u32 cause, u32 *opc, struct kvm_run *run, + struct kvm_vcpu *vcpu) +{ + enum emulation_result er; + union mips_instruction inst; + int err; + + /* Fetch the instruction. */ + if (cause & CAUSEF_BD) + opc += 1; + err = kvm_get_badinstr(opc, vcpu, &inst.word); + if (err) { + run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + return RESUME_HOST; + } + + /* Emulate the store */ + er = kvm_mips_emulate_store(inst, cause, run, vcpu); + if (er == EMULATE_FAIL) { + kvm_err("Emulate store to MMIO space failed\n"); + run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + } else { + run->exit_reason = KVM_EXIT_MMIO; + } + return RESUME_HOST; +} + +static int kvm_mips_bad_access(u32 cause, u32 *opc, struct kvm_run *run, + struct kvm_vcpu *vcpu, bool store) +{ + if (store) + return kvm_mips_bad_store(cause, opc, run, vcpu); + else + return kvm_mips_bad_load(cause, opc, run, vcpu); +} + static int kvm_trap_emul_handle_tlb_mod(struct kvm_vcpu *vcpu) { struct kvm_run *run = vcpu->run; @@ -178,28 +247,11 @@ static int kvm_trap_emul_handle_tlb_miss(struct kvm_vcpu *vcpu, bool store) } } else if (KVM_GUEST_KERNEL_MODE(vcpu) && (KSEGX(badvaddr) == CKSEG0 || KSEGX(badvaddr) == CKSEG1)) { - /* A code fetch fault doesn't count as an MMIO */ - if (!store && kvm_is_ifetch_fault(&vcpu->arch)) { - run->exit_reason = KVM_EXIT_INTERNAL_ERROR; - return RESUME_HOST; - } - /* * With EVA we may get a TLB exception instead of an address * error when the guest performs MMIO to KSeg1 addresses. */ - kvm_debug("Emulate %s MMIO space\n", - store ? "Store to" : "Load from"); - er = kvm_mips_emulate_inst(cause, opc, run, vcpu); - if (er == EMULATE_FAIL) { - kvm_err("Emulate %s MMIO space failed\n", - store ? 
"Store to" : "Load from"); - run->exit_reason = KVM_EXIT_INTERNAL_ERROR; - ret = RESUME_HOST; - } else { - run->exit_reason = KVM_EXIT_MMIO; - ret = RESUME_HOST; - } + ret = kvm_mips_bad_access(cause, opc, run, vcpu, store); } else { kvm_err("Illegal TLB %s fault address , cause %#x, PC: %p, BadVaddr: %#lx\n", store ? "ST" : "LD", cause, opc, badvaddr); @@ -227,21 +279,11 @@ static int kvm_trap_emul_handle_addr_err_st(struct kvm_vcpu *vcpu) u32 __user *opc = (u32 __user *) vcpu->arch.pc; unsigned long badvaddr = vcpu->arch.host_cp0_badvaddr; u32 cause = vcpu->arch.host_cp0_cause; - enum emulation_result er = EMULATE_DONE; int ret = RESUME_GUEST; if (KVM_GUEST_KERNEL_MODE(vcpu) && (KSEGX(badvaddr) == CKSEG0 || KSEGX(badvaddr) == CKSEG1)) { - kvm_debug("Emulate Store to MMIO space\n"); - er = kvm_mips_emulate_inst(cause, opc, run, vcpu); - if (er == EMULATE_FAIL) { - kvm_err("Emulate Store to MMIO space failed\n"); - run->exit_reason = KVM_EXIT_INTERNAL_ERROR; - ret = RESUME_HOST; - } else { - run->exit_reason = KVM_EXIT_MMIO; - ret = RESUME_HOST; - } + ret = kvm_mips_bad_store(cause, opc, run, vcpu); } else { kvm_err("Address Error (STORE): cause %#x, PC: %p, BadVaddr: %#lx\n", cause, opc, badvaddr); @@ -257,32 +299,15 @@ static int kvm_trap_emul_handle_addr_err_ld(struct kvm_vcpu *vcpu) u32 __user *opc = (u32 __user *) vcpu->arch.pc; unsigned long badvaddr = vcpu->arch.host_cp0_badvaddr; u32 cause = vcpu->arch.host_cp0_cause; - enum emulation_result er = EMULATE_DONE; int ret = RESUME_GUEST; if (KSEGX(badvaddr) == CKSEG0 || KSEGX(badvaddr) == CKSEG1) { - /* A code fetch fault doesn't count as an MMIO */ - if (kvm_is_ifetch_fault(&vcpu->arch)) { - run->exit_reason = KVM_EXIT_INTERNAL_ERROR; - return RESUME_HOST; - } - - kvm_debug("Emulate Load from MMIO space @ %#lx\n", badvaddr); - er = kvm_mips_emulate_inst(cause, opc, run, vcpu); - if (er == EMULATE_FAIL) { - kvm_err("Emulate Load from MMIO space failed\n"); - run->exit_reason = KVM_EXIT_INTERNAL_ERROR; - ret = RESUME_HOST; - } else { - run->exit_reason = KVM_EXIT_MMIO; - ret = RESUME_HOST; - } + ret = kvm_mips_bad_load(cause, opc, run, vcpu); } else { kvm_err("Address Error (LOAD): cause %#x, PC: %p, BadVaddr: %#lx\n", cause, opc, badvaddr); run->exit_reason = KVM_EXIT_INTERNAL_ERROR; ret = RESUME_HOST; - er = EMULATE_FAIL; } return ret; } -- cgit v1.2.3 From b8f79ddb7db95bb675b3d6009e7a4274161e1e53 Mon Sep 17 00:00:00 2001 From: James Hogan Date: Mon, 11 May 2015 23:31:45 +0100 Subject: KVM: MIPS/T&E: Treat unhandled guest KSeg0 as MMIO MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Treat unhandled accesses to guest KSeg0 as MMIO, rather than only host KSeg0 addresses. This will allow read only memory regions (such as the Malta boot flash as emulated by QEMU) to have writes (before reads) treated as MMIO, and unallocated physical addresses to have all accesses treated as MMIO. The MMIO emulation uses the gva_to_gpa callback, so this is also updated for trap & emulate to handle guest KSeg0 addresses. 
Signed-off-by: James Hogan Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: Ralf Baechle Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org --- arch/mips/kvm/mmu.c | 1 - arch/mips/kvm/trap_emul.c | 10 +++++----- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/arch/mips/kvm/mmu.c b/arch/mips/kvm/mmu.c index 1af65f2e6bb7..934bcc3732da 100644 --- a/arch/mips/kvm/mmu.c +++ b/arch/mips/kvm/mmu.c @@ -350,7 +350,6 @@ static int kvm_mips_map_page(struct kvm_vcpu *vcpu, unsigned long gpa, pfn = gfn_to_pfn(kvm, gfn); if (is_error_noslot_pfn(pfn)) { - kvm_err("Couldn't get pfn for gfn %#llx!\n", gfn); err = -EFAULT; goto out; } diff --git a/arch/mips/kvm/trap_emul.c b/arch/mips/kvm/trap_emul.c index ae971ae30e30..77e059068a29 100644 --- a/arch/mips/kvm/trap_emul.c +++ b/arch/mips/kvm/trap_emul.c @@ -23,9 +23,12 @@ static gpa_t kvm_trap_emul_gva_to_gpa_cb(gva_t gva) { gpa_t gpa; gva_t kseg = KSEGX(gva); + gva_t gkseg = KVM_GUEST_KSEGX(gva); if ((kseg == CKSEG0) || (kseg == CKSEG1)) gpa = CPHYSADDR(gva); + else if (gkseg == KVM_GUEST_KSEG0) + gpa = KVM_GUEST_CPHYSADDR(gva); else { kvm_err("%s: cannot find GPA for GVA: %#lx\n", __func__, gva); kvm_mips_dump_host_tlbs(); @@ -240,11 +243,8 @@ static int kvm_trap_emul_handle_tlb_miss(struct kvm_vcpu *vcpu, bool store) * All KSEG0 faults are handled by KVM, as the guest kernel does * not expect to ever get them */ - if (kvm_mips_handle_kseg0_tlb_fault - (vcpu->arch.host_cp0_badvaddr, vcpu, store) < 0) { - run->exit_reason = KVM_EXIT_INTERNAL_ERROR; - ret = RESUME_HOST; - } + if (kvm_mips_handle_kseg0_tlb_fault(badvaddr, vcpu, store) < 0) + ret = kvm_mips_bad_access(cause, opc, run, vcpu, store); } else if (KVM_GUEST_KERNEL_MODE(vcpu) && (KSEGX(badvaddr) == CKSEG0 || KSEGX(badvaddr) == CKSEG1)) { /* -- cgit v1.2.3 From 64ebc9e24074403c4127b06c0203f3e7b3367e69 Mon Sep 17 00:00:00 2001 From: James Hogan Date: Tue, 13 Dec 2016 13:02:36 +0000 Subject: KVM: MIPS/T&E: Handle read only GPA in TLB mod MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Rewrite TLB modified exception handling to handle read only GPA memory regions, instead of unconditionally passing the exception to the guest. If the guest TLB is not the cause of the exception we call into the normal TLB fault handling depending on the memory segment, which will soon attempt to remap the physical page to be writable (handling dirty page tracking or copy on write in the process). Failing that we fall back to treating it as MMIO, due to a read only memory region. Once the capability is enabled, this will allow read only memory regions (such as the Malta boot flash as emulated by QEMU) to have writes treated as MMIO, while still allowing reads to run untrapped. 
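For guest TLB-mapped addresses the resulting handling reduces to the following decision flow, condensed from the handler below (the guest TLB entry for badvaddr is assumed to have already been looked up and found valid):

    if (!TLB_IS_DIRTY(*tlb, badvaddr)) {
        /* Guest's own entry is clean: relay TLBMOD to the guest */
        kvm_mips_emulate_tlbmod(cause, opc, run, vcpu);
        return RESUME_GUEST;
    }
    if (kvm_mips_handle_mapped_seg_tlb_fault(vcpu, tlb, badvaddr, true))
        /* GPA not writable (e.g. read only memory region): treat as MMIO */
        return kvm_mips_bad_store(cause, opc, run, vcpu);
    return RESUME_GUEST;
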
Signed-off-by: James Hogan Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: Ralf Baechle Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org --- arch/mips/include/asm/kvm_host.h | 5 --- arch/mips/kvm/emulate.c | 31 ------------------ arch/mips/kvm/trap_emul.c | 69 +++++++++++++++++++++++++--------------- 3 files changed, 43 insertions(+), 62 deletions(-) diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h index 17db9d6c7c93..13c9e128bb86 100644 --- a/arch/mips/include/asm/kvm_host.h +++ b/arch/mips/include/asm/kvm_host.h @@ -614,11 +614,6 @@ extern enum emulation_result kvm_mips_handle_tlbmiss(u32 cause, struct kvm_vcpu *vcpu, bool write_fault); -extern enum emulation_result kvm_mips_handle_tlbmod(u32 cause, - u32 *opc, - struct kvm_run *run, - struct kvm_vcpu *vcpu); - extern void kvm_mips_dump_host_tlbs(void); extern void kvm_mips_dump_guest_tlbs(struct kvm_vcpu *vcpu); extern int kvm_mips_host_tlb_inv(struct kvm_vcpu *vcpu, unsigned long entryhi, diff --git a/arch/mips/kvm/emulate.c b/arch/mips/kvm/emulate.c index 4875afca3f26..b6cafb0a9df4 100644 --- a/arch/mips/kvm/emulate.c +++ b/arch/mips/kvm/emulate.c @@ -2124,37 +2124,6 @@ enum emulation_result kvm_mips_emulate_tlbinv_st(u32 cause, return EMULATE_DONE; } -/* TLBMOD: store into address matching TLB with Dirty bit off */ -enum emulation_result kvm_mips_handle_tlbmod(u32 cause, u32 *opc, - struct kvm_run *run, - struct kvm_vcpu *vcpu) -{ - enum emulation_result er = EMULATE_DONE; -#ifdef DEBUG - struct mips_coproc *cop0 = vcpu->arch.cop0; - unsigned long entryhi = (vcpu->arch.host_cp0_badvaddr & VPN2_MASK) | - (kvm_read_c0_guest_entryhi(cop0) & KVM_ENTRYHI_ASID); - bool kernel = KVM_GUEST_KERNEL_MODE(vcpu); - int index; - - /* If address not in the guest TLB, then we are in trouble */ - index = kvm_mips_guest_tlb_lookup(vcpu, entryhi); - if (index < 0) { - /* XXXKYMA Invalidate and retry */ - kvm_mips_host_tlb_inv(vcpu, vcpu->arch.host_cp0_badvaddr, - !kernel, kernel); - kvm_err("%s: host got TLBMOD for %#lx but entry not present in Guest TLB\n", - __func__, entryhi); - kvm_mips_dump_guest_tlbs(vcpu); - kvm_mips_dump_host_tlbs(); - return EMULATE_FAIL; - } -#endif - - er = kvm_mips_emulate_tlbmod(cause, opc, run, vcpu); - return er; -} - enum emulation_result kvm_mips_emulate_tlbmod(u32 cause, u32 *opc, struct kvm_run *run, diff --git a/arch/mips/kvm/trap_emul.c b/arch/mips/kvm/trap_emul.c index 77e059068a29..001c5fb61049 100644 --- a/arch/mips/kvm/trap_emul.c +++ b/arch/mips/kvm/trap_emul.c @@ -159,46 +159,63 @@ static int kvm_mips_bad_access(u32 cause, u32 *opc, struct kvm_run *run, static int kvm_trap_emul_handle_tlb_mod(struct kvm_vcpu *vcpu) { + struct mips_coproc *cop0 = vcpu->arch.cop0; struct kvm_run *run = vcpu->run; u32 __user *opc = (u32 __user *) vcpu->arch.pc; unsigned long badvaddr = vcpu->arch.host_cp0_badvaddr; u32 cause = vcpu->arch.host_cp0_cause; - enum emulation_result er = EMULATE_DONE; - int ret = RESUME_GUEST; + struct kvm_mips_tlb *tlb; + unsigned long entryhi; + int index; if (KVM_GUEST_KSEGX(badvaddr) < KVM_GUEST_KSEG0 || KVM_GUEST_KSEGX(badvaddr) == KVM_GUEST_KSEG23) { - kvm_debug("USER/KSEG23 ADDR TLB MOD fault: cause %#x, PC: %p, BadVaddr: %#lx\n", - cause, opc, badvaddr); - er = kvm_mips_handle_tlbmod(cause, opc, run, vcpu); + /* + * First find the mapping in the guest TLB. If the failure to + * write was due to the guest TLB, it should be up to the guest + * to handle it. 
+ */ + entryhi = (badvaddr & VPN2_MASK) | + (kvm_read_c0_guest_entryhi(cop0) & KVM_ENTRYHI_ASID); + index = kvm_mips_guest_tlb_lookup(vcpu, entryhi); - if (er == EMULATE_DONE) - ret = RESUME_GUEST; - else { + /* + * These should never happen. + * They would indicate stale host TLB entries. + */ + if (unlikely(index < 0)) { run->exit_reason = KVM_EXIT_INTERNAL_ERROR; - ret = RESUME_HOST; + return RESUME_HOST; } - } else if (KVM_GUEST_KSEGX(badvaddr) == KVM_GUEST_KSEG0) { + tlb = vcpu->arch.guest_tlb + index; + if (unlikely(!TLB_IS_VALID(*tlb, badvaddr))) { + run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + return RESUME_HOST; + } + /* - * XXXKYMA: The guest kernel does not expect to get this fault - * when we are not using HIGHMEM. Need to address this in a - * HIGHMEM kernel + * Guest entry not dirty? That would explain the TLB modified + * exception. Relay that on to the guest so it can handle it. */ - kvm_err("TLB MOD fault not handled, cause %#x, PC: %p, BadVaddr: %#lx\n", - cause, opc, badvaddr); - kvm_mips_dump_host_tlbs(); - kvm_arch_vcpu_dump_regs(vcpu); - run->exit_reason = KVM_EXIT_INTERNAL_ERROR; - ret = RESUME_HOST; + if (!TLB_IS_DIRTY(*tlb, badvaddr)) { + kvm_mips_emulate_tlbmod(cause, opc, run, vcpu); + return RESUME_GUEST; + } + + if (kvm_mips_handle_mapped_seg_tlb_fault(vcpu, tlb, badvaddr, + true)) + /* Not writable, needs handling as MMIO */ + return kvm_mips_bad_store(cause, opc, run, vcpu); + return RESUME_GUEST; + } else if (KVM_GUEST_KSEGX(badvaddr) == KVM_GUEST_KSEG0) { + if (kvm_mips_handle_kseg0_tlb_fault(badvaddr, vcpu, true) < 0) + /* Not writable, needs handling as MMIO */ + return kvm_mips_bad_store(cause, opc, run, vcpu); + return RESUME_GUEST; } else { - kvm_err("Illegal TLB Mod fault address , cause %#x, PC: %p, BadVaddr: %#lx\n", - cause, opc, badvaddr); - kvm_mips_dump_host_tlbs(); - kvm_arch_vcpu_dump_regs(vcpu); - run->exit_reason = KVM_EXIT_INTERNAL_ERROR; - ret = RESUME_HOST; + /* host kernel addresses are all handled as MMIO */ + return kvm_mips_bad_store(cause, opc, run, vcpu); } - return ret; } static int kvm_trap_emul_handle_tlb_miss(struct kvm_vcpu *vcpu, bool store) -- cgit v1.2.3 From f0c0c330f7bb1a640968798b63c0dffc6a8af0ec Mon Sep 17 00:00:00 2001 From: James Hogan Date: Tue, 6 Dec 2016 14:47:47 +0000 Subject: KVM: MIPS/MMU: Add GPA PT mkclean helper MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a helper function to make a range of guest physical address (GPA) mappings in the GPA page table clean so that writes can be caught. This will be used in a few places to manage dirty page logging. Note that until the dirty bit is transferred from GPA page table entries to GVA page table entries in an upcoming patch this won't trigger a TLB modified exception on write. 
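A typical use, as seen later in this series when dirty logging is enabled on an existing memslot, is to write protect the slot's GPA range under the MMU lock and then flush the derived mappings (slot here stands for the memslot being cleaned):

    spin_lock(&kvm->mmu_lock);
    needs_flush = kvm_mips_mkclean_gpa_pt(kvm, slot->base_gfn,
                                          slot->base_gfn + slot->npages - 1);
    if (needs_flush)
        kvm_mips_callbacks->flush_shadow_memslot(kvm, slot);
    spin_unlock(&kvm->mmu_lock);
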
Signed-off-by: James Hogan Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: Ralf Baechle Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org --- arch/mips/include/asm/kvm_host.h | 1 + arch/mips/kvm/mmu.c | 124 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 125 insertions(+) diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h index 13c9e128bb86..a7394940119c 100644 --- a/arch/mips/include/asm/kvm_host.h +++ b/arch/mips/include/asm/kvm_host.h @@ -643,6 +643,7 @@ enum kvm_mips_flush { }; void kvm_mips_flush_gva_pt(pgd_t *pgd, enum kvm_mips_flush flags); bool kvm_mips_flush_gpa_pt(struct kvm *kvm, gfn_t start_gfn, gfn_t end_gfn); +int kvm_mips_mkclean_gpa_pt(struct kvm *kvm, gfn_t start_gfn, gfn_t end_gfn); pgd_t *kvm_pgd_alloc(void); void kvm_mmu_free_memory_caches(struct kvm_vcpu *vcpu); void kvm_trap_emul_invalidate_gva(struct kvm_vcpu *vcpu, unsigned long addr, diff --git a/arch/mips/kvm/mmu.c b/arch/mips/kvm/mmu.c index 934bcc3732da..892fd0ede718 100644 --- a/arch/mips/kvm/mmu.c +++ b/arch/mips/kvm/mmu.c @@ -304,6 +304,130 @@ bool kvm_mips_flush_gpa_pt(struct kvm *kvm, gfn_t start_gfn, gfn_t end_gfn) end_gfn << PAGE_SHIFT); } +#define BUILD_PTE_RANGE_OP(name, op) \ +static int kvm_mips_##name##_pte(pte_t *pte, unsigned long start, \ + unsigned long end) \ +{ \ + int ret = 0; \ + int i_min = __pte_offset(start); \ + int i_max = __pte_offset(end); \ + int i; \ + pte_t old, new; \ + \ + for (i = i_min; i <= i_max; ++i) { \ + if (!pte_present(pte[i])) \ + continue; \ + \ + old = pte[i]; \ + new = op(old); \ + if (pte_val(new) == pte_val(old)) \ + continue; \ + set_pte(pte + i, new); \ + ret = 1; \ + } \ + return ret; \ +} \ + \ +/* returns true if anything was done */ \ +static int kvm_mips_##name##_pmd(pmd_t *pmd, unsigned long start, \ + unsigned long end) \ +{ \ + int ret = 0; \ + pte_t *pte; \ + unsigned long cur_end = ~0ul; \ + int i_min = __pmd_offset(start); \ + int i_max = __pmd_offset(end); \ + int i; \ + \ + for (i = i_min; i <= i_max; ++i, start = 0) { \ + if (!pmd_present(pmd[i])) \ + continue; \ + \ + pte = pte_offset(pmd + i, 0); \ + if (i == i_max) \ + cur_end = end; \ + \ + ret |= kvm_mips_##name##_pte(pte, start, cur_end); \ + } \ + return ret; \ +} \ + \ +static int kvm_mips_##name##_pud(pud_t *pud, unsigned long start, \ + unsigned long end) \ +{ \ + int ret = 0; \ + pmd_t *pmd; \ + unsigned long cur_end = ~0ul; \ + int i_min = __pud_offset(start); \ + int i_max = __pud_offset(end); \ + int i; \ + \ + for (i = i_min; i <= i_max; ++i, start = 0) { \ + if (!pud_present(pud[i])) \ + continue; \ + \ + pmd = pmd_offset(pud + i, 0); \ + if (i == i_max) \ + cur_end = end; \ + \ + ret |= kvm_mips_##name##_pmd(pmd, start, cur_end); \ + } \ + return ret; \ +} \ + \ +static int kvm_mips_##name##_pgd(pgd_t *pgd, unsigned long start, \ + unsigned long end) \ +{ \ + int ret = 0; \ + pud_t *pud; \ + unsigned long cur_end = ~0ul; \ + int i_min = pgd_index(start); \ + int i_max = pgd_index(end); \ + int i; \ + \ + for (i = i_min; i <= i_max; ++i, start = 0) { \ + if (!pgd_present(pgd[i])) \ + continue; \ + \ + pud = pud_offset(pgd + i, 0); \ + if (i == i_max) \ + cur_end = end; \ + \ + ret |= kvm_mips_##name##_pud(pud, start, cur_end); \ + } \ + return ret; \ +} + +/* + * kvm_mips_mkclean_gpa_pt. + * Mark a range of guest physical address space clean (writes fault) in the VM's + * GPA page table to allow dirty page tracking. + */ + +BUILD_PTE_RANGE_OP(mkclean, pte_mkclean) + +/** + * kvm_mips_mkclean_gpa_pt() - Make a range of guest physical addresses clean. 
+ * @kvm: KVM pointer. + * @start_gfn: Guest frame number of first page in GPA range to flush. + * @end_gfn: Guest frame number of last page in GPA range to flush. + * + * Make a range of GPA mappings clean so that guest writes will fault and + * trigger dirty page logging. + * + * The caller must hold the @kvm->mmu_lock spinlock. + * + * Returns: Whether any GPA mappings were modified, which would require + * derived mappings (GVA page tables & TLB enties) to be + * invalidated. + */ +int kvm_mips_mkclean_gpa_pt(struct kvm *kvm, gfn_t start_gfn, gfn_t end_gfn) +{ + return kvm_mips_mkclean_pgd(kvm->arch.gpa_mm.pgd, + start_gfn << PAGE_SHIFT, + end_gfn << PAGE_SHIFT); +} + /** * kvm_mips_map_page() - Map a guest physical page. * @vcpu: VCPU pointer. -- cgit v1.2.3 From e88643ba1acb48fa30345ba75cc324d7181aa2bf Mon Sep 17 00:00:00 2001 From: James Hogan Date: Tue, 6 Dec 2016 14:50:52 +0000 Subject: KVM: MIPS/MMU: Use generic dirty log & protect helper MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit MIPS hasn't up to this point properly supported dirty page logging, as pages in slots with dirty logging enabled aren't made clean, and tlbmod exceptions from writes to clean pages have been assumed to be due to guest TLB protection and unconditionally passed to the guest. Use the generic dirty logging helper kvm_get_dirty_log_protect() to properly implement kvm_vm_ioctl_get_dirty_log(), similar to how ARM does. This uses xchg to clear the dirty bits when reading them, rather than wiping them out afterwards with a memset, which would potentially wipe recently set bits that weren't caught by kvm_get_dirty_log(). It also makes the pages clean again using the kvm_arch_mmu_enable_log_dirty_pt_masked() architecture callback so that further writes after the shadow memslot is flushed will trigger tlbmod exceptions and dirty handling. Signed-off-by: James Hogan Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: Ralf Baechle Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org --- arch/mips/kvm/Kconfig | 1 + arch/mips/kvm/mips.c | 42 +++++++++++++++++++++++------------------- arch/mips/kvm/mmu.c | 22 ++++++++++++++++++++++ 3 files changed, 46 insertions(+), 19 deletions(-) diff --git a/arch/mips/kvm/Kconfig b/arch/mips/kvm/Kconfig index 7c56d6b124d1..85c4593b634a 100644 --- a/arch/mips/kvm/Kconfig +++ b/arch/mips/kvm/Kconfig @@ -20,6 +20,7 @@ config KVM select EXPORT_UASM select PREEMPT_NOTIFIERS select ANON_INODES + select KVM_GENERIC_DIRTYLOG_READ_PROTECT select KVM_MMIO select SRCU ---help--- diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c index 01f3fa1b9f0e..0b84b336ee4d 100644 --- a/arch/mips/kvm/mips.c +++ b/arch/mips/kvm/mips.c @@ -1086,42 +1086,46 @@ long kvm_arch_vcpu_ioctl(struct file *filp, unsigned int ioctl, return r; } -/* Get (and clear) the dirty memory log for a memory slot. */ +/** + * kvm_vm_ioctl_get_dirty_log - get and clear the log of dirty pages in a slot + * @kvm: kvm instance + * @log: slot id and address to which we copy the log + * + * Steps 1-4 below provide general overview of dirty page logging. See + * kvm_get_dirty_log_protect() function description for additional details. + * + * We call kvm_get_dirty_log_protect() to handle steps 1-3, upon return we + * always flush the TLB (step 4) even if previous step failed and the dirty + * bitmap may be corrupt. Regardless of previous outcome the KVM logging API + * does not preclude user space subsequent dirty log read. Flushing TLB ensures + * writes will be marked dirty for next log read. 
+ * + * 1. Take a snapshot of the bit and clear it if needed. + * 2. Write protect the corresponding page. + * 3. Copy the snapshot to the userspace. + * 4. Flush TLB's if needed. + */ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log) { struct kvm_memslots *slots; struct kvm_memory_slot *memslot; - unsigned long ga, ga_end; - int is_dirty = 0; + bool is_dirty = false; int r; - unsigned long n; mutex_lock(&kvm->slots_lock); - r = kvm_get_dirty_log(kvm, log, &is_dirty); - if (r) - goto out; + r = kvm_get_dirty_log_protect(kvm, log, &is_dirty); - /* If nothing is dirty, don't bother messing with page tables. */ if (is_dirty) { slots = kvm_memslots(kvm); memslot = id_to_memslot(slots, log->slot); - ga = memslot->base_gfn << PAGE_SHIFT; - ga_end = ga + (memslot->npages << PAGE_SHIFT); - - kvm_info("%s: dirty, ga: %#lx, ga_end %#lx\n", __func__, ga, - ga_end); - - n = kvm_dirty_bitmap_bytes(memslot); - memset(memslot->dirty_bitmap, 0, n); + /* Let implementation handle TLB/GVA invalidation */ + kvm_mips_callbacks->flush_shadow_memslot(kvm, memslot); } - r = 0; -out: mutex_unlock(&kvm->slots_lock); return r; - } long kvm_arch_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) diff --git a/arch/mips/kvm/mmu.c b/arch/mips/kvm/mmu.c index 892fd0ede718..63a6d542ecb3 100644 --- a/arch/mips/kvm/mmu.c +++ b/arch/mips/kvm/mmu.c @@ -428,6 +428,28 @@ int kvm_mips_mkclean_gpa_pt(struct kvm *kvm, gfn_t start_gfn, gfn_t end_gfn) end_gfn << PAGE_SHIFT); } +/** + * kvm_arch_mmu_enable_log_dirty_pt_masked() - write protect dirty pages + * @kvm: The KVM pointer + * @slot: The memory slot associated with mask + * @gfn_offset: The gfn offset in memory slot + * @mask: The mask of dirty pages at offset 'gfn_offset' in this memory + * slot to be write protected + * + * Walks bits set in mask write protects the associated pte's. Caller must + * acquire @kvm->mmu_lock. + */ +void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm, + struct kvm_memory_slot *slot, + gfn_t gfn_offset, unsigned long mask) +{ + gfn_t base_gfn = slot->base_gfn + gfn_offset; + gfn_t start = base_gfn + __ffs(mask); + gfn_t end = base_gfn + __fls(mask); + + kvm_mips_mkclean_gpa_pt(kvm, start, end); +} + /** * kvm_mips_map_page() - Map a guest physical page. * @vcpu: VCPU pointer. -- cgit v1.2.3 From a1ac9e17b7c934666a780772866135b9fea17f4c Mon Sep 17 00:00:00 2001 From: James Hogan Date: Tue, 6 Dec 2016 14:56:20 +0000 Subject: KVM: MIPS: Clean & flush on dirty page logging enable MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When an existing memory region has dirty page logging enabled, make the entire slot clean (read only) so that writes will immediately start logging dirty pages (once the dirty bit is transferred from GPA to GVA page tables in an upcoming patch). 
Signed-off-by: James Hogan Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: Ralf Baechle Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org --- arch/mips/kvm/mips.c | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c index 0b84b336ee4d..475c4cc78bd6 100644 --- a/arch/mips/kvm/mips.c +++ b/arch/mips/kvm/mips.c @@ -197,9 +197,33 @@ void kvm_arch_commit_memory_region(struct kvm *kvm, const struct kvm_memory_slot *new, enum kvm_mr_change change) { + int needs_flush; + kvm_debug("%s: kvm: %p slot: %d, GPA: %llx, size: %llx, QVA: %llx\n", __func__, kvm, mem->slot, mem->guest_phys_addr, mem->memory_size, mem->userspace_addr); + + /* + * If dirty page logging is enabled, write protect all pages in the slot + * ready for dirty logging. + * + * There is no need to do this in any of the following cases: + * CREATE: No dirty mappings will already exist. + * MOVE/DELETE: The old mappings will already have been cleaned up by + * kvm_arch_flush_shadow_memslot() + */ + if (change == KVM_MR_FLAGS_ONLY && + (!(old->flags & KVM_MEM_LOG_DIRTY_PAGES) && + new->flags & KVM_MEM_LOG_DIRTY_PAGES)) { + spin_lock(&kvm->mmu_lock); + /* Write protect GPA page table entries */ + needs_flush = kvm_mips_mkclean_gpa_pt(kvm, new->base_gfn, + new->base_gfn + new->npages - 1); + /* Let implementation do the rest */ + if (needs_flush) + kvm_mips_callbacks->flush_shadow_memslot(kvm, new); + spin_unlock(&kvm->mmu_lock); + } } static inline void dump_handler(const char *symbol, void *start, void *end) -- cgit v1.2.3 From b5f1dd1ba4042bda191cd2e72726c920e6c2867f Mon Sep 17 00:00:00 2001 From: James Hogan Date: Tue, 6 Dec 2016 14:57:10 +0000 Subject: KVM: MIPS/MMU: Handle dirty logging on GPA faults MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Update kvm_mips_map_page() to handle logging of dirty guest physical pages. Upcoming patches will propagate the dirty bit to the GVA page tables. A fast path is added for handling protection bits that can be resolved without calling into KVM, currently just dirtying of clean pages being written to. The slow path marks the GPA page table entry writable only on writes, and at the same time marks the page dirty in the dirty page logging bitmask. Signed-off-by: James Hogan Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: Ralf Baechle Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org --- arch/mips/kvm/mmu.c | 74 ++++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 70 insertions(+), 4 deletions(-) diff --git a/arch/mips/kvm/mmu.c b/arch/mips/kvm/mmu.c index 63a6d542ecb3..7962eea4ebc3 100644 --- a/arch/mips/kvm/mmu.c +++ b/arch/mips/kvm/mmu.c @@ -450,6 +450,58 @@ void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm, kvm_mips_mkclean_gpa_pt(kvm, start, end); } +/** + * _kvm_mips_map_page_fast() - Fast path GPA fault handler. + * @vcpu: VCPU pointer. + * @gpa: Guest physical address of fault. + * @write_fault: Whether the fault was due to a write. + * @out_entry: New PTE for @gpa (written on success unless NULL). + * @out_buddy: New PTE for @gpa's buddy (written on success unless + * NULL). + * + * Perform fast path GPA fault handling, doing all that can be done without + * calling into KVM. This handles dirtying of clean pages (for dirty page + * logging). + * + * Returns: 0 on success, in which case we can update derived mappings and + * resume guest execution. 
+ * -EFAULT on failure due to absent GPA mapping or write to + * read-only page, in which case KVM must be consulted. + */ +static int _kvm_mips_map_page_fast(struct kvm_vcpu *vcpu, unsigned long gpa, + bool write_fault, + pte_t *out_entry, pte_t *out_buddy) +{ + struct kvm *kvm = vcpu->kvm; + gfn_t gfn = gpa >> PAGE_SHIFT; + pte_t *ptep; + int ret = 0; + + spin_lock(&kvm->mmu_lock); + + /* Fast path - just check GPA page table for an existing entry */ + ptep = kvm_mips_pte_for_gpa(kvm, NULL, gpa); + if (!ptep || !pte_present(*ptep)) { + ret = -EFAULT; + goto out; + } + + if (write_fault && !pte_dirty(*ptep)) { + /* Track dirtying of pages */ + set_pte(ptep, pte_mkdirty(*ptep)); + mark_page_dirty(kvm, gfn); + } + + if (out_entry) + *out_entry = *ptep; + if (out_buddy) + *out_buddy = *ptep_buddy(ptep); + +out: + spin_unlock(&kvm->mmu_lock); + return ret; +} + /** * kvm_mips_map_page() - Map a guest physical page. * @vcpu: VCPU pointer. @@ -462,9 +514,9 @@ void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm, * Handle GPA faults by creating a new GPA mapping (or updating an existing * one). * - * This takes care of asking KVM for the corresponding PFN, and creating a - * mapping in the GPA page tables. Derived mappings (GVA page tables and TLBs) - * must be handled by the caller. + * This takes care of marking pages dirty (dirty page tracking), asking KVM for + * the corresponding PFN, and creating a mapping in the GPA page tables. Derived + * mappings (GVA page tables and TLBs) must be handled by the caller. * * Returns: 0 on success, in which case the caller may use the @out_entry * and @out_buddy PTEs to update derived mappings and resume guest @@ -485,7 +537,12 @@ static int kvm_mips_map_page(struct kvm_vcpu *vcpu, unsigned long gpa, pte_t *ptep, entry, old_pte; unsigned long prot_bits; + /* Try the fast path to handle clean pages */ srcu_idx = srcu_read_lock(&kvm->srcu); + err = _kvm_mips_map_page_fast(vcpu, gpa, write_fault, out_entry, + out_buddy); + if (!err) + goto out; /* We need a minimum of cached pages ready for page table creation */ err = mmu_topup_memory_cache(memcache, KVM_MMU_CACHE_MIN_PAGES, @@ -493,6 +550,7 @@ static int kvm_mips_map_page(struct kvm_vcpu *vcpu, unsigned long gpa, if (err) goto out; + /* Slow path - ask KVM core whether we can access this GPA */ pfn = gfn_to_pfn(kvm, gfn); if (is_error_noslot_pfn(pfn)) { @@ -502,11 +560,19 @@ static int kvm_mips_map_page(struct kvm_vcpu *vcpu, unsigned long gpa, spin_lock(&kvm->mmu_lock); + /* Ensure page tables are allocated */ ptep = kvm_mips_pte_for_gpa(kvm, memcache, gpa); - prot_bits = __READABLE | _PAGE_PRESENT | __WRITEABLE; + /* Set up the PTE */ + prot_bits = __READABLE | _PAGE_PRESENT | _PAGE_WRITE | + _page_cachable_default; + if (write_fault) { + prot_bits |= __WRITEABLE; + mark_page_dirty(kvm, gfn); + } entry = pfn_pte(pfn, __pgprot(prot_bits)); + /* Write the PTE */ old_pte = *ptep; set_pte(ptep, entry); if (pte_present(old_pte)) -- cgit v1.2.3 From b584f460e6d6f1bf968acfcd23aceb663ba996fa Mon Sep 17 00:00:00 2001 From: James Hogan Date: Tue, 6 Dec 2016 14:59:28 +0000 Subject: KVM: MIPS/MMU: Pass GPA PTE bits to KSeg0 GVA PTEs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Propagate the GPA PTE protection bits on to the GVA PTEs on a KSeg0 fault (except _PAGE_WRITE), rather than always overriding the protection. This allows dirty page tracking to work in KSeg0 as a clear dirty bit in the GPA PTE will propagate to the GVA PTEs. 
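As a standalone illustration of the rule just described (the hunk below implements it as kvm_mips_gpa_pte_to_gva_unmapped(); this is a kernel-context sketch with a hypothetical name, not the patch itself):

#include <asm/pgtable.h>

/* Sketch only: copy a GPA PTE into the GVA view, never writable-but-clean */
static pte_t gpa_pte_to_gva_sketch(pte_t pte)
{
        if (!pte_dirty(pte))
                pte = pte_wrprotect(pte);       /* clean => first write faults to KVM */
        return pte;
}
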
This makes it simpler to use a single kvm_mips_map_page() to obtain both the main GPA PTE and its buddy (which may be invalid), which also allows memory regions to be fully accessible when they don't start and end on a 2*PAGE_SIZE boundary. Signed-off-by: James Hogan Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: Ralf Baechle Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org --- arch/mips/kvm/mmu.c | 49 ++++++++++++++++++++++++++----------------------- 1 file changed, 26 insertions(+), 23 deletions(-) diff --git a/arch/mips/kvm/mmu.c b/arch/mips/kvm/mmu.c index 7962eea4ebc3..9cc941864aa8 100644 --- a/arch/mips/kvm/mmu.c +++ b/arch/mips/kvm/mmu.c @@ -769,15 +769,27 @@ void kvm_mips_flush_gva_pt(pgd_t *pgd, enum kvm_mips_flush flags) } } +static pte_t kvm_mips_gpa_pte_to_gva_unmapped(pte_t pte) +{ + /* + * Don't leak writeable but clean entries from GPA page tables. We don't + * want the normal Linux tlbmod handler to handle dirtying when KVM + * accesses guest memory. + */ + if (!pte_dirty(pte)) + pte = pte_wrprotect(pte); + + return pte; +} + /* XXXKYMA: Must be called with interrupts disabled */ int kvm_mips_handle_kseg0_tlb_fault(unsigned long badvaddr, struct kvm_vcpu *vcpu, bool write_fault) { unsigned long gpa; - kvm_pfn_t pfn0, pfn1; - unsigned long vaddr; pte_t pte_gpa[2], *ptep_gva; + int idx; if (KVM_GUEST_KSEGX(badvaddr) != KVM_GUEST_KSEG0) { kvm_err("%s: Invalid BadVaddr: %#lx\n", __func__, badvaddr); @@ -785,35 +797,26 @@ int kvm_mips_handle_kseg0_tlb_fault(unsigned long badvaddr, return -1; } - /* Find host PFNs */ - - gpa = KVM_GUEST_CPHYSADDR(badvaddr & (PAGE_MASK << 1)); - vaddr = badvaddr & (PAGE_MASK << 1); - - if (kvm_mips_map_page(vcpu, gpa, write_fault, &pte_gpa[0], NULL) < 0) - return -1; - - if (kvm_mips_map_page(vcpu, gpa | PAGE_SIZE, write_fault, &pte_gpa[1], - NULL) < 0) + /* Get the GPA page table entry */ + gpa = KVM_GUEST_CPHYSADDR(badvaddr); + idx = (badvaddr >> PAGE_SHIFT) & 1; + if (kvm_mips_map_page(vcpu, gpa, write_fault, &pte_gpa[idx], + &pte_gpa[!idx]) < 0) return -1; - pfn0 = pte_pfn(pte_gpa[0]); - pfn1 = pte_pfn(pte_gpa[1]); - - /* Find GVA page table entry */ - - ptep_gva = kvm_trap_emul_pte_for_gva(vcpu, vaddr); + /* Get the GVA page table entry */ + ptep_gva = kvm_trap_emul_pte_for_gva(vcpu, badvaddr & ~PAGE_SIZE); if (!ptep_gva) { - kvm_err("No ptep for gva %lx\n", vaddr); + kvm_err("No ptep for gva %lx\n", badvaddr); return -1; } - /* Write host PFNs into GVA page table */ - ptep_gva[0] = pte_mkyoung(pte_mkdirty(pfn_pte(pfn0, PAGE_SHARED))); - ptep_gva[1] = pte_mkyoung(pte_mkdirty(pfn_pte(pfn1, PAGE_SHARED))); + /* Copy a pair of entries from GPA page table to GVA page table */ + ptep_gva[0] = kvm_mips_gpa_pte_to_gva_unmapped(pte_gpa[0]); + ptep_gva[1] = kvm_mips_gpa_pte_to_gva_unmapped(pte_gpa[1]); /* Invalidate this entry in the TLB, guest kernel ASID only */ - kvm_mips_host_tlb_inv(vcpu, vaddr, false, true); + kvm_mips_host_tlb_inv(vcpu, badvaddr, false, true); return 0; } -- cgit v1.2.3 From f9b11e51f89f6d2eca2ca8f41bb0ceb07c943e60 Mon Sep 17 00:00:00 2001 From: James Hogan Date: Tue, 6 Dec 2016 14:59:43 +0000 Subject: KVM: MIPS/MMU: Pass GPA PTE bits to mapped GVA PTEs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Propagate the GPA PTE protection bits on to the GVA PTEs on a mapped fault (except _PAGE_WRITE, and filtered by the guest TLB entry), rather than always overriding the protection. 
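On top of the propagation just described, the guest TLB's D bit acts as an extra filter: it may only remove write permission, never grant it. A rough kernel-context sketch (the real helper added below is kvm_mips_gpa_pte_to_gva_mapped(); the name here is hypothetical):

#include <asm/mipsregs.h>
#include <asm/pgtable.h>

/* Sketch only */
static pte_t gpa_pte_to_gva_mapped_sketch(pte_t pte, long guest_entrylo)
{
        /* Guest EntryLo.D clear => present the page as clean */
        if (!(guest_entrylo & ENTRYLO_D))
                pte = pte_mkclean(pte);
        /* Then the same clean => write-protect rule as for KSeg0 */
        if (!pte_dirty(pte))
                pte = pte_wrprotect(pte);
        return pte;
}
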
This allows dirty page tracking to work in mapped guest segments as a clear dirty bit in the GPA PTE will propagate to the GVA PTEs even when the guest TLB has the dirty bit set. Since the filtering of protection bits is now abstracted, if the buddy GVA PTE is also valid, we obtain the corresponding GPA PTE using a simple non-allocating walk and load that into the GVA PTE similarly (which may itself be invalid). Signed-off-by: James Hogan Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: Ralf Baechle Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org --- arch/mips/kvm/mmu.c | 56 +++++++++++++++++++++++++++++++++++------------------ 1 file changed, 37 insertions(+), 19 deletions(-) diff --git a/arch/mips/kvm/mmu.c b/arch/mips/kvm/mmu.c index 9cc941864aa8..8a01bbd276fc 100644 --- a/arch/mips/kvm/mmu.c +++ b/arch/mips/kvm/mmu.c @@ -782,6 +782,15 @@ static pte_t kvm_mips_gpa_pte_to_gva_unmapped(pte_t pte) return pte; } +static pte_t kvm_mips_gpa_pte_to_gva_mapped(pte_t pte, long entrylo) +{ + /* Guest EntryLo overrides host EntryLo */ + if (!(entrylo & ENTRYLO_D)) + pte = pte_mkclean(pte); + + return kvm_mips_gpa_pte_to_gva_unmapped(pte); +} + /* XXXKYMA: Must be called with interrupts disabled */ int kvm_mips_handle_kseg0_tlb_fault(unsigned long badvaddr, struct kvm_vcpu *vcpu, @@ -825,39 +834,48 @@ int kvm_mips_handle_mapped_seg_tlb_fault(struct kvm_vcpu *vcpu, unsigned long gva, bool write_fault) { - kvm_pfn_t pfn; - long tlb_lo = 0; - pte_t pte_gpa, *ptep_gva; - unsigned int idx; + struct kvm *kvm = vcpu->kvm; + long tlb_lo[2]; + pte_t pte_gpa[2], *ptep_buddy, *ptep_gva; + unsigned int idx = TLB_LO_IDX(*tlb, gva); bool kernel = KVM_GUEST_KERNEL_MODE(vcpu); + tlb_lo[0] = tlb->tlb_lo[0]; + tlb_lo[1] = tlb->tlb_lo[1]; + /* * The commpage address must not be mapped to anything else if the guest * TLB contains entries nearby, or commpage accesses will break. */ - idx = TLB_LO_IDX(*tlb, gva); - if ((gva ^ KVM_GUEST_COMMPAGE_ADDR) & VPN2_MASK & PAGE_MASK) - tlb_lo = tlb->tlb_lo[idx]; + if (!((gva ^ KVM_GUEST_COMMPAGE_ADDR) & VPN2_MASK & (PAGE_MASK << 1))) + tlb_lo[TLB_LO_IDX(*tlb, KVM_GUEST_COMMPAGE_ADDR)] = 0; - /* Find host PFN */ - if (kvm_mips_map_page(vcpu, mips3_tlbpfn_to_paddr(tlb_lo), write_fault, - &pte_gpa, NULL) < 0) + /* Get the GPA page table entry */ + if (kvm_mips_map_page(vcpu, mips3_tlbpfn_to_paddr(tlb_lo[idx]), + write_fault, &pte_gpa[idx], NULL) < 0) return -1; - pfn = pte_pfn(pte_gpa); - /* Find GVA page table entry */ - ptep_gva = kvm_trap_emul_pte_for_gva(vcpu, gva); + /* And its GVA buddy's GPA page table entry if it also exists */ + pte_gpa[!idx] = pfn_pte(0, __pgprot(0)); + if (tlb_lo[!idx] & ENTRYLO_V) { + spin_lock(&kvm->mmu_lock); + ptep_buddy = kvm_mips_pte_for_gpa(kvm, NULL, + mips3_tlbpfn_to_paddr(tlb_lo[!idx])); + if (ptep_buddy) + pte_gpa[!idx] = *ptep_buddy; + spin_unlock(&kvm->mmu_lock); + } + + /* Get the GVA page table entry pair */ + ptep_gva = kvm_trap_emul_pte_for_gva(vcpu, gva & ~PAGE_SIZE); if (!ptep_gva) { kvm_err("No ptep for gva %lx\n", gva); return -1; } - /* Write PFN into GVA page table, taking attributes from Guest TLB */ - *ptep_gva = pfn_pte(pfn, (!(tlb_lo & ENTRYLO_V)) ? __pgprot(0) : - (tlb_lo & ENTRYLO_D) ? 
PAGE_SHARED : - PAGE_READONLY); - if (pte_present(*ptep_gva)) - *ptep_gva = pte_mkyoung(pte_mkdirty(*ptep_gva)); + /* Copy a pair of entries from GPA page table to GVA page table */ + ptep_gva[0] = kvm_mips_gpa_pte_to_gva_mapped(pte_gpa[0], tlb_lo[0]); + ptep_gva[1] = kvm_mips_gpa_pte_to_gva_mapped(pte_gpa[1], tlb_lo[1]); /* Invalidate this entry in the TLB, current guest mode ASID only */ kvm_mips_host_tlb_inv(vcpu, gva, !kernel, kernel); -- cgit v1.2.3 From 411740f5422a960c30a4285343d821b62daec34b Mon Sep 17 00:00:00 2001 From: James Hogan Date: Tue, 13 Dec 2016 16:32:39 +0000 Subject: KVM: MIPS/MMU: Implement KVM_CAP_SYNC_MMU MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Implement the SYNC_MMU capability for KVM MIPS, allowing changes in the underlying user host virtual address (HVA) mappings to be promptly reflected in the corresponding guest physical address (GPA) mappings. This allows for several features to work with guest RAM which require mappings to be altered or protected, such as copy-on-write, KSM (Kernel Samepage Merging), idle page tracking, memory swapping, and guest memory ballooning. There are two main aspects of this change, described below. The KVM MMU notifier architecture callbacks are implemented so we can be notified of changes in the HVA mappings. These arrange for the guest physical address (GPA) page tables to be modified and possibly for derived mappings (GVA page tables and TLBs) to be flushed. - kvm_unmap_hva[_range]() - These deal with HVA mappings being removed, for example before a copy-on-write takes place, which requires the corresponding GPA page table mappings to be removed too. - kvm_set_spte_hva() - These update a GPA page table entry to match the new HVA entry, but must be careful to respect KVM specific configuration such as not dirtying a clean guest page which is dirty to the host, and write protecting writable pages in read only memslots (which will soon be supported). - kvm[_test]_age_hva() - These update GPA page table entries to be old (invalid) so that access can be tracked, making them young again. The GPA page fault handling (kvm_mips_map_page) is updated to use gfn_to_pfn_prot() (which may provide read-only pages), to handle asynchronous page table invalidation from MMU notifier callbacks, and to handle more cases in the fast path. - mmu_notifier_seq is used to detect asynchronous page table invalidations while we're holding a pfn from gfn_to_pfn_prot() outside of kvm->mmu_lock, retrying if invalidations have taken place, e.g. a COW or a KSM page merge. - The fast path (_kvm_mips_map_page_fast) now handles marking old pages as young / accessed, and disallowing dirtying of clean pages that aren't actually writable (e.g. shared pages that should COW, and read-only memory regions when they are enabled in a future patch). - Due to the use of MMU notifications we no longer need to keep the page references after we've updated the GPA page tables. 
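To illustrate what this makes safe for userspace (not part of this patch; kvm_fd is assumed to be the /dev/kvm fd, and guest_ram/len an anonymous mapping that backs a memslot):

#include <linux/kvm.h>
#include <stddef.h>
#include <sys/ioctl.h>
#include <sys/mman.h>

/* Illustration only: kvm_fd, guest_ram and len are assumptions */
static int balloon_out(int kvm_fd, void *guest_ram, size_t len)
{
        if (ioctl(kvm_fd, KVM_CHECK_EXTENSION, KVM_CAP_SYNC_MMU) <= 0)
                return -1;      /* without SYNC_MMU the GPA mappings would go stale */

        /*
         * Drop the host pages.  The MMU notifier invokes
         * kvm_unmap_hva_range(), which zaps the GPA range, so the guest
         * refaults fresh pages through kvm_mips_map_page() rather than
         * keeping references to freed memory.
         */
        return madvise(guest_ram, len, MADV_DONTNEED);
}

The same notifier path is what keeps copy-on-write, KSM merging and swapping coherent with the guest's view of memory.
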
Signed-off-by: James Hogan Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: Ralf Baechle Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org --- arch/mips/include/asm/kvm_host.h | 13 +++ arch/mips/kvm/Kconfig | 1 + arch/mips/kvm/mips.c | 1 + arch/mips/kvm/mmu.c | 235 ++++++++++++++++++++++++++++++++++++--- 4 files changed, 233 insertions(+), 17 deletions(-) diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h index a7394940119c..718dfffa17d5 100644 --- a/arch/mips/include/asm/kvm_host.h +++ b/arch/mips/include/asm/kvm_host.h @@ -663,6 +663,19 @@ enum kvm_mips_fault_result kvm_trap_emul_gva_fault(struct kvm_vcpu *vcpu, unsigned long gva, bool write); +#define KVM_ARCH_WANT_MMU_NOTIFIER +int kvm_unmap_hva(struct kvm *kvm, unsigned long hva); +int kvm_unmap_hva_range(struct kvm *kvm, + unsigned long start, unsigned long end); +void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte); +int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end); +int kvm_test_age_hva(struct kvm *kvm, unsigned long hva); + +static inline void kvm_arch_mmu_notifier_invalidate_page(struct kvm *kvm, + unsigned long address) +{ +} + /* Emulation */ int kvm_get_inst(u32 *opc, struct kvm_vcpu *vcpu, u32 *out); enum emulation_result update_pc(struct kvm_vcpu *vcpu, u32 cause); diff --git a/arch/mips/kvm/Kconfig b/arch/mips/kvm/Kconfig index 85c4593b634a..65067327db12 100644 --- a/arch/mips/kvm/Kconfig +++ b/arch/mips/kvm/Kconfig @@ -22,6 +22,7 @@ config KVM select ANON_INODES select KVM_GENERIC_DIRTYLOG_READ_PROTECT select KVM_MMIO + select MMU_NOTIFIER select SRCU ---help--- Support for hosting Guest kernels. diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c index 475c4cc78bd6..9932f53a1e5c 100644 --- a/arch/mips/kvm/mips.c +++ b/arch/mips/kvm/mips.c @@ -1217,6 +1217,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) switch (ext) { case KVM_CAP_ONE_REG: case KVM_CAP_ENABLE_CAP: + case KVM_CAP_SYNC_MMU: r = 1; break; case KVM_CAP_COALESCED_MMIO: diff --git a/arch/mips/kvm/mmu.c b/arch/mips/kvm/mmu.c index 8a01bbd276fc..cb0faade311e 100644 --- a/arch/mips/kvm/mmu.c +++ b/arch/mips/kvm/mmu.c @@ -194,7 +194,6 @@ static bool kvm_mips_flush_gpa_pte(pte_t *pte, unsigned long start_gpa, if (!pte_present(pte[i])) continue; - kvm_release_pfn_clean(pte_pfn(pte[i])); set_pte(pte + i, __pte(0)); } return safe_to_remove; @@ -450,6 +449,155 @@ void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm, kvm_mips_mkclean_gpa_pt(kvm, start, end); } +/* + * kvm_mips_mkold_gpa_pt. + * Mark a range of guest physical address space old (all accesses fault) in the + * VM's GPA page table to allow detection of commonly used pages. 
+ */ + +BUILD_PTE_RANGE_OP(mkold, pte_mkold) + +static int kvm_mips_mkold_gpa_pt(struct kvm *kvm, gfn_t start_gfn, + gfn_t end_gfn) +{ + return kvm_mips_mkold_pgd(kvm->arch.gpa_mm.pgd, + start_gfn << PAGE_SHIFT, + end_gfn << PAGE_SHIFT); +} + +static int handle_hva_to_gpa(struct kvm *kvm, + unsigned long start, + unsigned long end, + int (*handler)(struct kvm *kvm, gfn_t gfn, + gpa_t gfn_end, + struct kvm_memory_slot *memslot, + void *data), + void *data) +{ + struct kvm_memslots *slots; + struct kvm_memory_slot *memslot; + int ret = 0; + + slots = kvm_memslots(kvm); + + /* we only care about the pages that the guest sees */ + kvm_for_each_memslot(memslot, slots) { + unsigned long hva_start, hva_end; + gfn_t gfn, gfn_end; + + hva_start = max(start, memslot->userspace_addr); + hva_end = min(end, memslot->userspace_addr + + (memslot->npages << PAGE_SHIFT)); + if (hva_start >= hva_end) + continue; + + /* + * {gfn(page) | page intersects with [hva_start, hva_end)} = + * {gfn_start, gfn_start+1, ..., gfn_end-1}. + */ + gfn = hva_to_gfn_memslot(hva_start, memslot); + gfn_end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, memslot); + + ret |= handler(kvm, gfn, gfn_end, memslot, data); + } + + return ret; +} + + +static int kvm_unmap_hva_handler(struct kvm *kvm, gfn_t gfn, gfn_t gfn_end, + struct kvm_memory_slot *memslot, void *data) +{ + kvm_mips_flush_gpa_pt(kvm, gfn, gfn_end); + return 1; +} + +int kvm_unmap_hva(struct kvm *kvm, unsigned long hva) +{ + unsigned long end = hva + PAGE_SIZE; + + handle_hva_to_gpa(kvm, hva, end, &kvm_unmap_hva_handler, NULL); + + kvm_mips_callbacks->flush_shadow_all(kvm); + return 0; +} + +int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end) +{ + handle_hva_to_gpa(kvm, start, end, &kvm_unmap_hva_handler, NULL); + + kvm_mips_callbacks->flush_shadow_all(kvm); + return 0; +} + +static int kvm_set_spte_handler(struct kvm *kvm, gfn_t gfn, gfn_t gfn_end, + struct kvm_memory_slot *memslot, void *data) +{ + gpa_t gpa = gfn << PAGE_SHIFT; + pte_t hva_pte = *(pte_t *)data; + pte_t *gpa_pte = kvm_mips_pte_for_gpa(kvm, NULL, gpa); + pte_t old_pte; + + if (!gpa_pte) + return 0; + + /* Mapping may need adjusting depending on memslot flags */ + old_pte = *gpa_pte; + if (memslot->flags & KVM_MEM_LOG_DIRTY_PAGES && !pte_dirty(old_pte)) + hva_pte = pte_mkclean(hva_pte); + else if (memslot->flags & KVM_MEM_READONLY) + hva_pte = pte_wrprotect(hva_pte); + + set_pte(gpa_pte, hva_pte); + + /* Replacing an absent or old page doesn't need flushes */ + if (!pte_present(old_pte) || !pte_young(old_pte)) + return 0; + + /* Pages swapped, aged, moved, or cleaned require flushes */ + return !pte_present(hva_pte) || + !pte_young(hva_pte) || + pte_pfn(old_pte) != pte_pfn(hva_pte) || + (pte_dirty(old_pte) && !pte_dirty(hva_pte)); +} + +void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte) +{ + unsigned long end = hva + PAGE_SIZE; + int ret; + + ret = handle_hva_to_gpa(kvm, hva, end, &kvm_set_spte_handler, &pte); + if (ret) + kvm_mips_callbacks->flush_shadow_all(kvm); +} + +static int kvm_age_hva_handler(struct kvm *kvm, gfn_t gfn, gfn_t gfn_end, + struct kvm_memory_slot *memslot, void *data) +{ + return kvm_mips_mkold_gpa_pt(kvm, gfn, gfn_end); +} + +static int kvm_test_age_hva_handler(struct kvm *kvm, gfn_t gfn, gfn_t gfn_end, + struct kvm_memory_slot *memslot, void *data) +{ + gpa_t gpa = gfn << PAGE_SHIFT; + pte_t *gpa_pte = kvm_mips_pte_for_gpa(kvm, NULL, gpa); + + if (!gpa_pte) + return 0; + return pte_young(*gpa_pte); +} + +int kvm_age_hva(struct kvm 
*kvm, unsigned long start, unsigned long end) +{ + return handle_hva_to_gpa(kvm, start, end, kvm_age_hva_handler, NULL); +} + +int kvm_test_age_hva(struct kvm *kvm, unsigned long hva) +{ + return handle_hva_to_gpa(kvm, hva, hva, kvm_test_age_hva_handler, NULL); +} + /** * _kvm_mips_map_page_fast() - Fast path GPA fault handler. * @vcpu: VCPU pointer. @@ -460,8 +608,8 @@ void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm, * NULL). * * Perform fast path GPA fault handling, doing all that can be done without - * calling into KVM. This handles dirtying of clean pages (for dirty page - * logging). + * calling into KVM. This handles marking old pages young (for idle page + * tracking), and dirtying of clean pages (for dirty page logging). * * Returns: 0 on success, in which case we can update derived mappings and * resume guest execution. @@ -475,6 +623,8 @@ static int _kvm_mips_map_page_fast(struct kvm_vcpu *vcpu, unsigned long gpa, struct kvm *kvm = vcpu->kvm; gfn_t gfn = gpa >> PAGE_SHIFT; pte_t *ptep; + kvm_pfn_t pfn = 0; /* silence bogus GCC warning */ + bool pfn_valid = false; int ret = 0; spin_lock(&kvm->mmu_lock); @@ -486,10 +636,24 @@ static int _kvm_mips_map_page_fast(struct kvm_vcpu *vcpu, unsigned long gpa, goto out; } + /* Track access to pages marked old */ + if (!pte_young(*ptep)) { + set_pte(ptep, pte_mkyoung(*ptep)); + pfn = pte_pfn(*ptep); + pfn_valid = true; + /* call kvm_set_pfn_accessed() after unlock */ + } if (write_fault && !pte_dirty(*ptep)) { - /* Track dirtying of pages */ + if (!pte_write(*ptep)) { + ret = -EFAULT; + goto out; + } + + /* Track dirtying of writeable pages */ set_pte(ptep, pte_mkdirty(*ptep)); + pfn = pte_pfn(*ptep); mark_page_dirty(kvm, gfn); + kvm_set_pfn_dirty(pfn); } if (out_entry) @@ -499,6 +663,8 @@ static int _kvm_mips_map_page_fast(struct kvm_vcpu *vcpu, unsigned long gpa, out: spin_unlock(&kvm->mmu_lock); + if (pfn_valid) + kvm_set_pfn_accessed(pfn); return ret; } @@ -514,9 +680,10 @@ out: * Handle GPA faults by creating a new GPA mapping (or updating an existing * one). * - * This takes care of marking pages dirty (dirty page tracking), asking KVM for - * the corresponding PFN, and creating a mapping in the GPA page tables. Derived - * mappings (GVA page tables and TLBs) must be handled by the caller. + * This takes care of marking pages young or dirty (idle/dirty page tracking), + * asking KVM for the corresponding PFN, and creating a mapping in the GPA page + * tables. Derived mappings (GVA page tables and TLBs) must be handled by the + * caller. * * Returns: 0 on success, in which case the caller may use the @out_entry * and @out_buddy PTEs to update derived mappings and resume guest @@ -535,9 +702,11 @@ static int kvm_mips_map_page(struct kvm_vcpu *vcpu, unsigned long gpa, int srcu_idx, err; kvm_pfn_t pfn; pte_t *ptep, entry, old_pte; + bool writeable; unsigned long prot_bits; + unsigned long mmu_seq; - /* Try the fast path to handle clean pages */ + /* Try the fast path to handle old / clean pages */ srcu_idx = srcu_read_lock(&kvm->srcu); err = _kvm_mips_map_page_fast(vcpu, gpa, write_fault, out_entry, out_buddy); @@ -550,33 +719,63 @@ static int kvm_mips_map_page(struct kvm_vcpu *vcpu, unsigned long gpa, if (err) goto out; - /* Slow path - ask KVM core whether we can access this GPA */ - pfn = gfn_to_pfn(kvm, gfn); +retry: + /* + * Used to check for invalidations in progress, of the pfn that is + * returned by pfn_to_pfn_prot below. 
+ */ + mmu_seq = kvm->mmu_notifier_seq; + /* + * Ensure the read of mmu_notifier_seq isn't reordered with PTE reads in + * gfn_to_pfn_prot() (which calls get_user_pages()), so that we don't + * risk the page we get a reference to getting unmapped before we have a + * chance to grab the mmu_lock without mmu_notifier_retry() noticing. + * + * This smp_rmb() pairs with the effective smp_wmb() of the combination + * of the pte_unmap_unlock() after the PTE is zapped, and the + * spin_lock() in kvm_mmu_notifier_invalidate_() before + * mmu_notifier_seq is incremented. + */ + smp_rmb(); + /* Slow path - ask KVM core whether we can access this GPA */ + pfn = gfn_to_pfn_prot(kvm, gfn, write_fault, &writeable); if (is_error_noslot_pfn(pfn)) { err = -EFAULT; goto out; } spin_lock(&kvm->mmu_lock); + /* Check if an invalidation has taken place since we got pfn */ + if (mmu_notifier_retry(kvm, mmu_seq)) { + /* + * This can happen when mappings are changed asynchronously, but + * also synchronously if a COW is triggered by + * gfn_to_pfn_prot(). + */ + spin_unlock(&kvm->mmu_lock); + kvm_release_pfn_clean(pfn); + goto retry; + } /* Ensure page tables are allocated */ ptep = kvm_mips_pte_for_gpa(kvm, memcache, gpa); /* Set up the PTE */ - prot_bits = __READABLE | _PAGE_PRESENT | _PAGE_WRITE | - _page_cachable_default; - if (write_fault) { - prot_bits |= __WRITEABLE; - mark_page_dirty(kvm, gfn); + prot_bits = _PAGE_PRESENT | __READABLE | _page_cachable_default; + if (writeable) { + prot_bits |= _PAGE_WRITE; + if (write_fault) { + prot_bits |= __WRITEABLE; + mark_page_dirty(kvm, gfn); + kvm_set_pfn_dirty(pfn); + } } entry = pfn_pte(pfn, __pgprot(prot_bits)); /* Write the PTE */ old_pte = *ptep; set_pte(ptep, entry); - if (pte_present(old_pte)) - kvm_release_pfn_clean(pte_pfn(old_pte)); err = 0; if (out_entry) @@ -585,6 +784,8 @@ static int kvm_mips_map_page(struct kvm_vcpu *vcpu, unsigned long gpa, *out_buddy = *ptep_buddy(ptep); spin_unlock(&kvm->mmu_lock); + kvm_release_pfn_clean(pfn); + kvm_set_pfn_accessed(pfn); out: srcu_read_unlock(&kvm->srcu, srcu_idx); return err; -- cgit v1.2.3 From 230c57244c2c4d945dba7f9d15845bffe4135b58 Mon Sep 17 00:00:00 2001 From: James Hogan Date: Fri, 8 May 2015 17:11:49 +0100 Subject: KVM: MIPS: Claim KVM_CAP_READONLY_MEM support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Now that load/store faults due to read only memory regions are treated as MMIO accesses it is safe to claim support for read only memory regions (KVM_CAP_READONLY_MEM). Signed-off-by: James Hogan Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: Ralf Baechle Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org --- arch/mips/include/uapi/asm/kvm.h | 2 ++ arch/mips/kvm/mips.c | 1 + 2 files changed, 3 insertions(+) diff --git a/arch/mips/include/uapi/asm/kvm.h b/arch/mips/include/uapi/asm/kvm.h index 6985eb59b085..a8a0199bf760 100644 --- a/arch/mips/include/uapi/asm/kvm.h +++ b/arch/mips/include/uapi/asm/kvm.h @@ -19,6 +19,8 @@ * Some parts derived from the x86 version of this file. 
*/ +#define __KVM_HAVE_READONLY_MEM + /* * for KVM_GET_REGS and KVM_SET_REGS * diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c index 9932f53a1e5c..591426cda15e 100644 --- a/arch/mips/kvm/mips.c +++ b/arch/mips/kvm/mips.c @@ -1217,6 +1217,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) switch (ext) { case KVM_CAP_ONE_REG: case KVM_CAP_ENABLE_CAP: + case KVM_CAP_READONLY_MEM: case KVM_CAP_SYNC_MMU: r = 1; break; -- cgit v1.2.3 From 654229a02456a9af372defb13d1911345360074d Mon Sep 17 00:00:00 2001 From: James Hogan Date: Thu, 8 Dec 2016 22:46:41 +0000 Subject: KVM: MIPS/T&E: Move CP0 register access into T&E MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Access to various CP0 registers via the KVM register access API needs to be implementation specific to allow restrictions to be made on changes, for example when VZ guest registers aren't present, so move them all into trap_emul.c in preparation for VZ. Signed-off-by: James Hogan Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: Ralf Baechle Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org --- arch/mips/include/asm/kvm_host.h | 1 - arch/mips/kvm/emulate.c | 2 +- arch/mips/kvm/mips.c | 198 --------------------------------------- arch/mips/kvm/trap_emul.c | 181 ++++++++++++++++++++++++++++++++++- 4 files changed, 179 insertions(+), 203 deletions(-) diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h index 718dfffa17d5..bc56a312497d 100644 --- a/arch/mips/include/asm/kvm_host.h +++ b/arch/mips/include/asm/kvm_host.h @@ -345,7 +345,6 @@ struct kvm_vcpu_arch { u8 fpu_enabled; u8 msa_enabled; - u8 kscratch_enabled; }; diff --git a/arch/mips/kvm/emulate.c b/arch/mips/kvm/emulate.c index b6cafb0a9df4..f2b054b80bca 100644 --- a/arch/mips/kvm/emulate.c +++ b/arch/mips/kvm/emulate.c @@ -1066,7 +1066,7 @@ unsigned int kvm_mips_config4_wrmask(struct kvm_vcpu *vcpu) unsigned int mask = MIPS_CONF_M; /* KScrExist */ - mask |= (unsigned int)vcpu->arch.kscratch_enabled << 16; + mask |= 0xfc << MIPS_CONF4_KSCREXIST_SHIFT; return mask; } diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c index 591426cda15e..9338aec08790 100644 --- a/arch/mips/kvm/mips.c +++ b/arch/mips/kvm/mips.c @@ -520,33 +520,6 @@ static u64 kvm_mips_get_one_regs[] = { KVM_REG_MIPS_LO, #endif KVM_REG_MIPS_PC, - - KVM_REG_MIPS_CP0_INDEX, - KVM_REG_MIPS_CP0_CONTEXT, - KVM_REG_MIPS_CP0_USERLOCAL, - KVM_REG_MIPS_CP0_PAGEMASK, - KVM_REG_MIPS_CP0_WIRED, - KVM_REG_MIPS_CP0_HWRENA, - KVM_REG_MIPS_CP0_BADVADDR, - KVM_REG_MIPS_CP0_COUNT, - KVM_REG_MIPS_CP0_ENTRYHI, - KVM_REG_MIPS_CP0_COMPARE, - KVM_REG_MIPS_CP0_STATUS, - KVM_REG_MIPS_CP0_CAUSE, - KVM_REG_MIPS_CP0_EPC, - KVM_REG_MIPS_CP0_PRID, - KVM_REG_MIPS_CP0_CONFIG, - KVM_REG_MIPS_CP0_CONFIG1, - KVM_REG_MIPS_CP0_CONFIG2, - KVM_REG_MIPS_CP0_CONFIG3, - KVM_REG_MIPS_CP0_CONFIG4, - KVM_REG_MIPS_CP0_CONFIG5, - KVM_REG_MIPS_CP0_CONFIG7, - KVM_REG_MIPS_CP0_ERROREPC, - - KVM_REG_MIPS_COUNT_CTL, - KVM_REG_MIPS_COUNT_RESUME, - KVM_REG_MIPS_COUNT_HZ, }; static u64 kvm_mips_get_one_regs_fpu[] = { @@ -559,15 +532,6 @@ static u64 kvm_mips_get_one_regs_msa[] = { KVM_REG_MIPS_MSA_CSR, }; -static u64 kvm_mips_get_one_regs_kscratch[] = { - KVM_REG_MIPS_CP0_KSCRATCH1, - KVM_REG_MIPS_CP0_KSCRATCH2, - KVM_REG_MIPS_CP0_KSCRATCH3, - KVM_REG_MIPS_CP0_KSCRATCH4, - KVM_REG_MIPS_CP0_KSCRATCH5, - KVM_REG_MIPS_CP0_KSCRATCH6, -}; - static unsigned long kvm_mips_num_regs(struct kvm_vcpu *vcpu) { unsigned long ret; @@ -581,7 +545,6 @@ static unsigned long kvm_mips_num_regs(struct 
kvm_vcpu *vcpu) } if (kvm_mips_guest_can_have_msa(&vcpu->arch)) ret += ARRAY_SIZE(kvm_mips_get_one_regs_msa) + 32; - ret += __arch_hweight8(vcpu->arch.kscratch_enabled); ret += kvm_mips_callbacks->num_regs(vcpu); return ret; @@ -634,16 +597,6 @@ static int kvm_mips_copy_reg_indices(struct kvm_vcpu *vcpu, u64 __user *indices) } } - for (i = 0; i < 6; ++i) { - if (!(vcpu->arch.kscratch_enabled & BIT(i + 2))) - continue; - - if (copy_to_user(indices, &kvm_mips_get_one_regs_kscratch[i], - sizeof(kvm_mips_get_one_regs_kscratch[i]))) - return -EFAULT; - ++indices; - } - return kvm_mips_callbacks->copy_reg_indices(vcpu, indices); } @@ -734,95 +687,6 @@ static int kvm_mips_get_reg(struct kvm_vcpu *vcpu, v = fpu->msacsr; break; - /* Co-processor 0 registers */ - case KVM_REG_MIPS_CP0_INDEX: - v = (long)kvm_read_c0_guest_index(cop0); - break; - case KVM_REG_MIPS_CP0_CONTEXT: - v = (long)kvm_read_c0_guest_context(cop0); - break; - case KVM_REG_MIPS_CP0_USERLOCAL: - v = (long)kvm_read_c0_guest_userlocal(cop0); - break; - case KVM_REG_MIPS_CP0_PAGEMASK: - v = (long)kvm_read_c0_guest_pagemask(cop0); - break; - case KVM_REG_MIPS_CP0_WIRED: - v = (long)kvm_read_c0_guest_wired(cop0); - break; - case KVM_REG_MIPS_CP0_HWRENA: - v = (long)kvm_read_c0_guest_hwrena(cop0); - break; - case KVM_REG_MIPS_CP0_BADVADDR: - v = (long)kvm_read_c0_guest_badvaddr(cop0); - break; - case KVM_REG_MIPS_CP0_ENTRYHI: - v = (long)kvm_read_c0_guest_entryhi(cop0); - break; - case KVM_REG_MIPS_CP0_COMPARE: - v = (long)kvm_read_c0_guest_compare(cop0); - break; - case KVM_REG_MIPS_CP0_STATUS: - v = (long)kvm_read_c0_guest_status(cop0); - break; - case KVM_REG_MIPS_CP0_CAUSE: - v = (long)kvm_read_c0_guest_cause(cop0); - break; - case KVM_REG_MIPS_CP0_EPC: - v = (long)kvm_read_c0_guest_epc(cop0); - break; - case KVM_REG_MIPS_CP0_PRID: - v = (long)kvm_read_c0_guest_prid(cop0); - break; - case KVM_REG_MIPS_CP0_CONFIG: - v = (long)kvm_read_c0_guest_config(cop0); - break; - case KVM_REG_MIPS_CP0_CONFIG1: - v = (long)kvm_read_c0_guest_config1(cop0); - break; - case KVM_REG_MIPS_CP0_CONFIG2: - v = (long)kvm_read_c0_guest_config2(cop0); - break; - case KVM_REG_MIPS_CP0_CONFIG3: - v = (long)kvm_read_c0_guest_config3(cop0); - break; - case KVM_REG_MIPS_CP0_CONFIG4: - v = (long)kvm_read_c0_guest_config4(cop0); - break; - case KVM_REG_MIPS_CP0_CONFIG5: - v = (long)kvm_read_c0_guest_config5(cop0); - break; - case KVM_REG_MIPS_CP0_CONFIG7: - v = (long)kvm_read_c0_guest_config7(cop0); - break; - case KVM_REG_MIPS_CP0_ERROREPC: - v = (long)kvm_read_c0_guest_errorepc(cop0); - break; - case KVM_REG_MIPS_CP0_KSCRATCH1 ... 
KVM_REG_MIPS_CP0_KSCRATCH6: - idx = reg->id - KVM_REG_MIPS_CP0_KSCRATCH1 + 2; - if (!(vcpu->arch.kscratch_enabled & BIT(idx))) - return -EINVAL; - switch (idx) { - case 2: - v = (long)kvm_read_c0_guest_kscratch1(cop0); - break; - case 3: - v = (long)kvm_read_c0_guest_kscratch2(cop0); - break; - case 4: - v = (long)kvm_read_c0_guest_kscratch3(cop0); - break; - case 5: - v = (long)kvm_read_c0_guest_kscratch4(cop0); - break; - case 6: - v = (long)kvm_read_c0_guest_kscratch5(cop0); - break; - case 7: - v = (long)kvm_read_c0_guest_kscratch6(cop0); - break; - } - break; /* registers to be handled specially */ default: ret = kvm_mips_callbacks->get_one_reg(vcpu, reg, &v); @@ -954,68 +818,6 @@ static int kvm_mips_set_reg(struct kvm_vcpu *vcpu, fpu->msacsr = v; break; - /* Co-processor 0 registers */ - case KVM_REG_MIPS_CP0_INDEX: - kvm_write_c0_guest_index(cop0, v); - break; - case KVM_REG_MIPS_CP0_CONTEXT: - kvm_write_c0_guest_context(cop0, v); - break; - case KVM_REG_MIPS_CP0_USERLOCAL: - kvm_write_c0_guest_userlocal(cop0, v); - break; - case KVM_REG_MIPS_CP0_PAGEMASK: - kvm_write_c0_guest_pagemask(cop0, v); - break; - case KVM_REG_MIPS_CP0_WIRED: - kvm_write_c0_guest_wired(cop0, v); - break; - case KVM_REG_MIPS_CP0_HWRENA: - kvm_write_c0_guest_hwrena(cop0, v); - break; - case KVM_REG_MIPS_CP0_BADVADDR: - kvm_write_c0_guest_badvaddr(cop0, v); - break; - case KVM_REG_MIPS_CP0_ENTRYHI: - kvm_write_c0_guest_entryhi(cop0, v); - break; - case KVM_REG_MIPS_CP0_STATUS: - kvm_write_c0_guest_status(cop0, v); - break; - case KVM_REG_MIPS_CP0_EPC: - kvm_write_c0_guest_epc(cop0, v); - break; - case KVM_REG_MIPS_CP0_PRID: - kvm_write_c0_guest_prid(cop0, v); - break; - case KVM_REG_MIPS_CP0_ERROREPC: - kvm_write_c0_guest_errorepc(cop0, v); - break; - case KVM_REG_MIPS_CP0_KSCRATCH1 ... KVM_REG_MIPS_CP0_KSCRATCH6: - idx = reg->id - KVM_REG_MIPS_CP0_KSCRATCH1 + 2; - if (!(vcpu->arch.kscratch_enabled & BIT(idx))) - return -EINVAL; - switch (idx) { - case 2: - kvm_write_c0_guest_kscratch1(cop0, v); - break; - case 3: - kvm_write_c0_guest_kscratch2(cop0, v); - break; - case 4: - kvm_write_c0_guest_kscratch3(cop0, v); - break; - case 5: - kvm_write_c0_guest_kscratch4(cop0, v); - break; - case 6: - kvm_write_c0_guest_kscratch5(cop0, v); - break; - case 7: - kvm_write_c0_guest_kscratch6(cop0, v); - break; - } - break; /* registers to be handled specially */ default: return kvm_mips_callbacks->set_one_reg(vcpu, reg, v); diff --git a/arch/mips/kvm/trap_emul.c b/arch/mips/kvm/trap_emul.c index 001c5fb61049..08327de4323a 100644 --- a/arch/mips/kvm/trap_emul.c +++ b/arch/mips/kvm/trap_emul.c @@ -489,8 +489,6 @@ static int kvm_trap_emul_vcpu_init(struct kvm_vcpu *vcpu) struct mm_struct *kern_mm = &vcpu->arch.guest_kernel_mm; struct mm_struct *user_mm = &vcpu->arch.guest_user_mm; - vcpu->arch.kscratch_enabled = 0xfc; - /* * Allocate GVA -> HPA page tables. * MIPS doesn't use the mm_struct pointer argument. 
@@ -640,14 +638,54 @@ static void kvm_trap_emul_flush_shadow_memslot(struct kvm *kvm, kvm_trap_emul_flush_shadow_all(kvm); } +static u64 kvm_trap_emul_get_one_regs[] = { + KVM_REG_MIPS_CP0_INDEX, + KVM_REG_MIPS_CP0_CONTEXT, + KVM_REG_MIPS_CP0_USERLOCAL, + KVM_REG_MIPS_CP0_PAGEMASK, + KVM_REG_MIPS_CP0_WIRED, + KVM_REG_MIPS_CP0_HWRENA, + KVM_REG_MIPS_CP0_BADVADDR, + KVM_REG_MIPS_CP0_COUNT, + KVM_REG_MIPS_CP0_ENTRYHI, + KVM_REG_MIPS_CP0_COMPARE, + KVM_REG_MIPS_CP0_STATUS, + KVM_REG_MIPS_CP0_CAUSE, + KVM_REG_MIPS_CP0_EPC, + KVM_REG_MIPS_CP0_PRID, + KVM_REG_MIPS_CP0_CONFIG, + KVM_REG_MIPS_CP0_CONFIG1, + KVM_REG_MIPS_CP0_CONFIG2, + KVM_REG_MIPS_CP0_CONFIG3, + KVM_REG_MIPS_CP0_CONFIG4, + KVM_REG_MIPS_CP0_CONFIG5, + KVM_REG_MIPS_CP0_CONFIG7, + KVM_REG_MIPS_CP0_ERROREPC, + KVM_REG_MIPS_CP0_KSCRATCH1, + KVM_REG_MIPS_CP0_KSCRATCH2, + KVM_REG_MIPS_CP0_KSCRATCH3, + KVM_REG_MIPS_CP0_KSCRATCH4, + KVM_REG_MIPS_CP0_KSCRATCH5, + KVM_REG_MIPS_CP0_KSCRATCH6, + + KVM_REG_MIPS_COUNT_CTL, + KVM_REG_MIPS_COUNT_RESUME, + KVM_REG_MIPS_COUNT_HZ, +}; + static unsigned long kvm_trap_emul_num_regs(struct kvm_vcpu *vcpu) { - return 0; + return ARRAY_SIZE(kvm_trap_emul_get_one_regs); } static int kvm_trap_emul_copy_reg_indices(struct kvm_vcpu *vcpu, u64 __user *indices) { + if (copy_to_user(indices, kvm_trap_emul_get_one_regs, + sizeof(kvm_trap_emul_get_one_regs))) + return -EFAULT; + indices += ARRAY_SIZE(kvm_trap_emul_get_one_regs); + return 0; } @@ -655,7 +693,69 @@ static int kvm_trap_emul_get_one_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg, s64 *v) { + struct mips_coproc *cop0 = vcpu->arch.cop0; + switch (reg->id) { + case KVM_REG_MIPS_CP0_INDEX: + *v = (long)kvm_read_c0_guest_index(cop0); + break; + case KVM_REG_MIPS_CP0_CONTEXT: + *v = (long)kvm_read_c0_guest_context(cop0); + break; + case KVM_REG_MIPS_CP0_USERLOCAL: + *v = (long)kvm_read_c0_guest_userlocal(cop0); + break; + case KVM_REG_MIPS_CP0_PAGEMASK: + *v = (long)kvm_read_c0_guest_pagemask(cop0); + break; + case KVM_REG_MIPS_CP0_WIRED: + *v = (long)kvm_read_c0_guest_wired(cop0); + break; + case KVM_REG_MIPS_CP0_HWRENA: + *v = (long)kvm_read_c0_guest_hwrena(cop0); + break; + case KVM_REG_MIPS_CP0_BADVADDR: + *v = (long)kvm_read_c0_guest_badvaddr(cop0); + break; + case KVM_REG_MIPS_CP0_ENTRYHI: + *v = (long)kvm_read_c0_guest_entryhi(cop0); + break; + case KVM_REG_MIPS_CP0_COMPARE: + *v = (long)kvm_read_c0_guest_compare(cop0); + break; + case KVM_REG_MIPS_CP0_STATUS: + *v = (long)kvm_read_c0_guest_status(cop0); + break; + case KVM_REG_MIPS_CP0_CAUSE: + *v = (long)kvm_read_c0_guest_cause(cop0); + break; + case KVM_REG_MIPS_CP0_EPC: + *v = (long)kvm_read_c0_guest_epc(cop0); + break; + case KVM_REG_MIPS_CP0_PRID: + *v = (long)kvm_read_c0_guest_prid(cop0); + break; + case KVM_REG_MIPS_CP0_CONFIG: + *v = (long)kvm_read_c0_guest_config(cop0); + break; + case KVM_REG_MIPS_CP0_CONFIG1: + *v = (long)kvm_read_c0_guest_config1(cop0); + break; + case KVM_REG_MIPS_CP0_CONFIG2: + *v = (long)kvm_read_c0_guest_config2(cop0); + break; + case KVM_REG_MIPS_CP0_CONFIG3: + *v = (long)kvm_read_c0_guest_config3(cop0); + break; + case KVM_REG_MIPS_CP0_CONFIG4: + *v = (long)kvm_read_c0_guest_config4(cop0); + break; + case KVM_REG_MIPS_CP0_CONFIG5: + *v = (long)kvm_read_c0_guest_config5(cop0); + break; + case KVM_REG_MIPS_CP0_CONFIG7: + *v = (long)kvm_read_c0_guest_config7(cop0); + break; case KVM_REG_MIPS_CP0_COUNT: *v = kvm_mips_read_count(vcpu); break; @@ -668,6 +768,27 @@ static int kvm_trap_emul_get_one_reg(struct kvm_vcpu *vcpu, case KVM_REG_MIPS_COUNT_HZ: *v = 
vcpu->arch.count_hz; break; + case KVM_REG_MIPS_CP0_ERROREPC: + *v = (long)kvm_read_c0_guest_errorepc(cop0); + break; + case KVM_REG_MIPS_CP0_KSCRATCH1: + *v = (long)kvm_read_c0_guest_kscratch1(cop0); + break; + case KVM_REG_MIPS_CP0_KSCRATCH2: + *v = (long)kvm_read_c0_guest_kscratch2(cop0); + break; + case KVM_REG_MIPS_CP0_KSCRATCH3: + *v = (long)kvm_read_c0_guest_kscratch3(cop0); + break; + case KVM_REG_MIPS_CP0_KSCRATCH4: + *v = (long)kvm_read_c0_guest_kscratch4(cop0); + break; + case KVM_REG_MIPS_CP0_KSCRATCH5: + *v = (long)kvm_read_c0_guest_kscratch5(cop0); + break; + case KVM_REG_MIPS_CP0_KSCRATCH6: + *v = (long)kvm_read_c0_guest_kscratch6(cop0); + break; default: return -EINVAL; } @@ -683,6 +804,39 @@ static int kvm_trap_emul_set_one_reg(struct kvm_vcpu *vcpu, unsigned int cur, change; switch (reg->id) { + case KVM_REG_MIPS_CP0_INDEX: + kvm_write_c0_guest_index(cop0, v); + break; + case KVM_REG_MIPS_CP0_CONTEXT: + kvm_write_c0_guest_context(cop0, v); + break; + case KVM_REG_MIPS_CP0_USERLOCAL: + kvm_write_c0_guest_userlocal(cop0, v); + break; + case KVM_REG_MIPS_CP0_PAGEMASK: + kvm_write_c0_guest_pagemask(cop0, v); + break; + case KVM_REG_MIPS_CP0_WIRED: + kvm_write_c0_guest_wired(cop0, v); + break; + case KVM_REG_MIPS_CP0_HWRENA: + kvm_write_c0_guest_hwrena(cop0, v); + break; + case KVM_REG_MIPS_CP0_BADVADDR: + kvm_write_c0_guest_badvaddr(cop0, v); + break; + case KVM_REG_MIPS_CP0_ENTRYHI: + kvm_write_c0_guest_entryhi(cop0, v); + break; + case KVM_REG_MIPS_CP0_STATUS: + kvm_write_c0_guest_status(cop0, v); + break; + case KVM_REG_MIPS_CP0_EPC: + kvm_write_c0_guest_epc(cop0, v); + break; + case KVM_REG_MIPS_CP0_PRID: + kvm_write_c0_guest_prid(cop0, v); + break; case KVM_REG_MIPS_CP0_COUNT: kvm_mips_write_count(vcpu, v); break; @@ -759,6 +913,27 @@ static int kvm_trap_emul_set_one_reg(struct kvm_vcpu *vcpu, case KVM_REG_MIPS_COUNT_HZ: ret = kvm_mips_set_count_hz(vcpu, v); break; + case KVM_REG_MIPS_CP0_ERROREPC: + kvm_write_c0_guest_errorepc(cop0, v); + break; + case KVM_REG_MIPS_CP0_KSCRATCH1: + kvm_write_c0_guest_kscratch1(cop0, v); + break; + case KVM_REG_MIPS_CP0_KSCRATCH2: + kvm_write_c0_guest_kscratch2(cop0, v); + break; + case KVM_REG_MIPS_CP0_KSCRATCH3: + kvm_write_c0_guest_kscratch3(cop0, v); + break; + case KVM_REG_MIPS_CP0_KSCRATCH4: + kvm_write_c0_guest_kscratch4(cop0, v); + break; + case KVM_REG_MIPS_CP0_KSCRATCH5: + kvm_write_c0_guest_kscratch5(cop0, v); + break; + case KVM_REG_MIPS_CP0_KSCRATCH6: + kvm_write_c0_guest_kscratch6(cop0, v); + break; default: return -EINVAL; } -- cgit v1.2.3 From 7801bbe1bd907a8f8b136fc184583260508febb6 Mon Sep 17 00:00:00 2001 From: James Hogan Date: Mon, 14 Nov 2016 23:59:27 +0000 Subject: KVM: MIPS/T&E: Implement CP0_EBase register MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The CP0_EBase register is a standard feature of MIPS32r2, so we should always have been implementing it properly. However the register value was ignored and wasn't exposed to userland. Fix the emulation of exceptions and interrupts to use the value stored in guest CP0_EBase, and fix the masks so that the top 3 bits (rather than the standard 2) are fixed, so that it is always in the guest KSeg0 segment. Also add CP0_EBASE to the KVM one_reg interface so it can be accessed by userland, also allowing the CPU number field to be written (which isn't permitted by the guest). 
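From userspace the new register is reached with the usual one_reg ioctls. A minimal sketch (not part of this patch): vcpu_fd is assumed, and the register id is built by hand from the MIPS CP0 id pattern documented in api.txt (64-bit, CP0 register 15, select 1):

#include <linux/kvm.h>
#include <sys/ioctl.h>

/* 0x7030 0000 0001 00 <reg:5> <sel:3>, per Documentation/virtual/kvm/api.txt */
#define MIPS_CP0_64_ID(reg, sel) \
        (KVM_REG_MIPS | KVM_REG_SIZE_U64 | 0x10000 | ((reg) << 3) | (sel))
#define REG_MIPS_CP0_EBASE      MIPS_CP0_64_ID(15, 1)

/* Illustration only: vcpu_fd and the macro names above are assumptions */
static int set_guest_ebase(int vcpu_fd, __u64 new_ebase)
{
        struct kvm_one_reg reg = {
                .id   = REG_MIPS_CP0_EBASE,
                .addr = (__u64)(unsigned long)&new_ebase,
        };

        /* The base must stay in guest KSeg0; the CPU number field is writable */
        return ioctl(vcpu_fd, KVM_SET_ONE_REG, &reg);
}
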
Signed-off-by: James Hogan Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: Ralf Baechle Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org --- Documentation/virtual/kvm/api.txt | 1 + arch/mips/include/asm/kvm_host.h | 3 ++ arch/mips/kvm/emulate.c | 73 ++++++++++++++++++++++----------------- arch/mips/kvm/interrupt.c | 5 +-- arch/mips/kvm/trap_emul.c | 12 +++++++ 5 files changed, 61 insertions(+), 33 deletions(-) diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index 03145b7cafaa..8d52d0f990ae 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -2074,6 +2074,7 @@ registers, find a list below: MIPS | KVM_REG_MIPS_CP0_CAUSE | 32 MIPS | KVM_REG_MIPS_CP0_EPC | 64 MIPS | KVM_REG_MIPS_CP0_PRID | 32 + MIPS | KVM_REG_MIPS_CP0_EBASE | 64 MIPS | KVM_REG_MIPS_CP0_CONFIG | 32 MIPS | KVM_REG_MIPS_CP0_CONFIG1 | 32 MIPS | KVM_REG_MIPS_CP0_CONFIG2 | 32 diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h index bc56a312497d..420372fa5bbc 100644 --- a/arch/mips/include/asm/kvm_host.h +++ b/arch/mips/include/asm/kvm_host.h @@ -88,6 +88,7 @@ #define KVM_GUEST_KUSEG 0x00000000UL #define KVM_GUEST_KSEG0 0x40000000UL +#define KVM_GUEST_KSEG1 0x40000000UL #define KVM_GUEST_KSEG23 0x60000000UL #define KVM_GUEST_KSEGX(a) ((_ACAST32_(a)) & 0xe0000000) #define KVM_GUEST_CPHYSADDR(a) ((_ACAST32_(a)) & 0x1fffffff) @@ -713,6 +714,8 @@ extern enum emulation_result kvm_mips_emulate_inst(u32 cause, struct kvm_run *run, struct kvm_vcpu *vcpu); +long kvm_mips_guest_exception_base(struct kvm_vcpu *vcpu); + extern enum emulation_result kvm_mips_emulate_syscall(u32 cause, u32 *opc, struct kvm_run *run, diff --git a/arch/mips/kvm/emulate.c b/arch/mips/kvm/emulate.c index f2b054b80bca..d40cfaad4529 100644 --- a/arch/mips/kvm/emulate.c +++ b/arch/mips/kvm/emulate.c @@ -1200,14 +1200,13 @@ enum emulation_result kvm_mips_emulate_CP0(union mips_instruction inst, er = EMULATE_FAIL; break; } -#define C0_EBASE_CORE_MASK 0xff if ((rd == MIPS_CP0_PRID) && (sel == 1)) { - /* Preserve CORE number */ - kvm_change_c0_guest_ebase(cop0, - ~(C0_EBASE_CORE_MASK), + /* + * Preserve core number, and keep the exception + * base in guest KSeg0. + */ + kvm_change_c0_guest_ebase(cop0, 0x1ffff000, vcpu->arch.gprs[rt]); - kvm_err("MTCz, cop0->reg[EBASE]: %#lx\n", - kvm_read_c0_guest_ebase(cop0)); } else if (rd == MIPS_CP0_TLB_HI && sel == 0) { u32 nasid = vcpu->arch.gprs[rt] & KVM_ENTRYHI_ASID; @@ -1917,6 +1916,22 @@ unknown: return er; } +/** + * kvm_mips_guest_exception_base() - Find guest exception vector base address. + * + * Returns: The base address of the current guest exception vector, taking + * both Guest.CP0_Status.BEV and Guest.CP0_EBase into account. 
+ */ +long kvm_mips_guest_exception_base(struct kvm_vcpu *vcpu) +{ + struct mips_coproc *cop0 = vcpu->arch.cop0; + + if (kvm_read_c0_guest_status(cop0) & ST0_BEV) + return KVM_GUEST_CKSEG1ADDR(0x1fc00200); + else + return kvm_read_c0_guest_ebase(cop0) & MIPS_EBASE_BASE; +} + enum emulation_result kvm_mips_emulate_syscall(u32 cause, u32 *opc, struct kvm_run *run, @@ -1942,7 +1957,7 @@ enum emulation_result kvm_mips_emulate_syscall(u32 cause, (EXCCODE_SYS << CAUSEB_EXCCODE)); /* Set PC to the exception entry point */ - arch->pc = KVM_GUEST_KSEG0 + 0x180; + arch->pc = kvm_mips_guest_exception_base(vcpu) + 0x180; } else { kvm_err("Trying to deliver SYSCALL when EXL is already set\n"); @@ -1976,13 +1991,13 @@ enum emulation_result kvm_mips_emulate_tlbmiss_ld(u32 cause, arch->pc); /* set pc to the exception entry point */ - arch->pc = KVM_GUEST_KSEG0 + 0x0; + arch->pc = kvm_mips_guest_exception_base(vcpu) + 0x0; } else { kvm_debug("[EXL == 1] delivering TLB MISS @ pc %#lx\n", arch->pc); - arch->pc = KVM_GUEST_KSEG0 + 0x180; + arch->pc = kvm_mips_guest_exception_base(vcpu) + 0x180; } kvm_change_c0_guest_cause(cop0, (0xff), @@ -2019,16 +2034,14 @@ enum emulation_result kvm_mips_emulate_tlbinv_ld(u32 cause, kvm_debug("[EXL == 0] delivering TLB INV @ pc %#lx\n", arch->pc); - - /* set pc to the exception entry point */ - arch->pc = KVM_GUEST_KSEG0 + 0x180; - } else { kvm_debug("[EXL == 1] delivering TLB MISS @ pc %#lx\n", arch->pc); - arch->pc = KVM_GUEST_KSEG0 + 0x180; } + /* set pc to the exception entry point */ + arch->pc = kvm_mips_guest_exception_base(vcpu) + 0x180; + kvm_change_c0_guest_cause(cop0, (0xff), (EXCCODE_TLBL << CAUSEB_EXCCODE)); @@ -2064,11 +2077,11 @@ enum emulation_result kvm_mips_emulate_tlbmiss_st(u32 cause, arch->pc); /* Set PC to the exception entry point */ - arch->pc = KVM_GUEST_KSEG0 + 0x0; + arch->pc = kvm_mips_guest_exception_base(vcpu) + 0x0; } else { kvm_debug("[EXL == 1] Delivering TLB MISS @ pc %#lx\n", arch->pc); - arch->pc = KVM_GUEST_KSEG0 + 0x180; + arch->pc = kvm_mips_guest_exception_base(vcpu) + 0x180; } kvm_change_c0_guest_cause(cop0, (0xff), @@ -2104,15 +2117,14 @@ enum emulation_result kvm_mips_emulate_tlbinv_st(u32 cause, kvm_debug("[EXL == 0] Delivering TLB MISS @ pc %#lx\n", arch->pc); - - /* Set PC to the exception entry point */ - arch->pc = KVM_GUEST_KSEG0 + 0x180; } else { kvm_debug("[EXL == 1] Delivering TLB MISS @ pc %#lx\n", arch->pc); - arch->pc = KVM_GUEST_KSEG0 + 0x180; } + /* Set PC to the exception entry point */ + arch->pc = kvm_mips_guest_exception_base(vcpu) + 0x180; + kvm_change_c0_guest_cause(cop0, (0xff), (EXCCODE_TLBS << CAUSEB_EXCCODE)); @@ -2146,14 +2158,13 @@ enum emulation_result kvm_mips_emulate_tlbmod(u32 cause, kvm_debug("[EXL == 0] Delivering TLB MOD @ pc %#lx\n", arch->pc); - - arch->pc = KVM_GUEST_KSEG0 + 0x180; } else { kvm_debug("[EXL == 1] Delivering TLB MOD @ pc %#lx\n", arch->pc); - arch->pc = KVM_GUEST_KSEG0 + 0x180; } + arch->pc = kvm_mips_guest_exception_base(vcpu) + 0x180; + kvm_change_c0_guest_cause(cop0, (0xff), (EXCCODE_MOD << CAUSEB_EXCCODE)); @@ -2185,7 +2196,7 @@ enum emulation_result kvm_mips_emulate_fpu_exc(u32 cause, } - arch->pc = KVM_GUEST_KSEG0 + 0x180; + arch->pc = kvm_mips_guest_exception_base(vcpu) + 0x180; kvm_change_c0_guest_cause(cop0, (0xff), (EXCCODE_CPU << CAUSEB_EXCCODE)); @@ -2219,7 +2230,7 @@ enum emulation_result kvm_mips_emulate_ri_exc(u32 cause, (EXCCODE_RI << CAUSEB_EXCCODE)); /* Set PC to the exception entry point */ - arch->pc = KVM_GUEST_KSEG0 + 0x180; + arch->pc = 
kvm_mips_guest_exception_base(vcpu) + 0x180; } else { kvm_err("Trying to deliver RI when EXL is already set\n"); @@ -2254,7 +2265,7 @@ enum emulation_result kvm_mips_emulate_bp_exc(u32 cause, (EXCCODE_BP << CAUSEB_EXCCODE)); /* Set PC to the exception entry point */ - arch->pc = KVM_GUEST_KSEG0 + 0x180; + arch->pc = kvm_mips_guest_exception_base(vcpu) + 0x180; } else { kvm_err("Trying to deliver BP when EXL is already set\n"); @@ -2289,7 +2300,7 @@ enum emulation_result kvm_mips_emulate_trap_exc(u32 cause, (EXCCODE_TR << CAUSEB_EXCCODE)); /* Set PC to the exception entry point */ - arch->pc = KVM_GUEST_KSEG0 + 0x180; + arch->pc = kvm_mips_guest_exception_base(vcpu) + 0x180; } else { kvm_err("Trying to deliver TRAP when EXL is already set\n"); @@ -2324,7 +2335,7 @@ enum emulation_result kvm_mips_emulate_msafpe_exc(u32 cause, (EXCCODE_MSAFPE << CAUSEB_EXCCODE)); /* Set PC to the exception entry point */ - arch->pc = KVM_GUEST_KSEG0 + 0x180; + arch->pc = kvm_mips_guest_exception_base(vcpu) + 0x180; } else { kvm_err("Trying to deliver MSAFPE when EXL is already set\n"); @@ -2359,7 +2370,7 @@ enum emulation_result kvm_mips_emulate_fpe_exc(u32 cause, (EXCCODE_FPE << CAUSEB_EXCCODE)); /* Set PC to the exception entry point */ - arch->pc = KVM_GUEST_KSEG0 + 0x180; + arch->pc = kvm_mips_guest_exception_base(vcpu) + 0x180; } else { kvm_err("Trying to deliver FPE when EXL is already set\n"); @@ -2394,7 +2405,7 @@ enum emulation_result kvm_mips_emulate_msadis_exc(u32 cause, (EXCCODE_MSADIS << CAUSEB_EXCCODE)); /* Set PC to the exception entry point */ - arch->pc = KVM_GUEST_KSEG0 + 0x180; + arch->pc = kvm_mips_guest_exception_base(vcpu) + 0x180; } else { kvm_err("Trying to deliver MSADIS when EXL is already set\n"); @@ -2560,7 +2571,7 @@ static enum emulation_result kvm_mips_emulate_exc(u32 cause, (exccode << CAUSEB_EXCCODE)); /* Set PC to the exception entry point */ - arch->pc = KVM_GUEST_KSEG0 + 0x180; + arch->pc = kvm_mips_guest_exception_base(vcpu) + 0x180; kvm_write_c0_guest_badvaddr(cop0, vcpu->arch.host_cp0_badvaddr); kvm_debug("Delivering EXC %d @ pc %#lx, badVaddr: %#lx\n", diff --git a/arch/mips/kvm/interrupt.c b/arch/mips/kvm/interrupt.c index e88403b3dcdd..aa0a1a00faf6 100644 --- a/arch/mips/kvm/interrupt.c +++ b/arch/mips/kvm/interrupt.c @@ -183,10 +183,11 @@ int kvm_mips_irq_deliver_cb(struct kvm_vcpu *vcpu, unsigned int priority, (exccode << CAUSEB_EXCCODE)); /* XXXSL Set PC to the interrupt exception entry point */ + arch->pc = kvm_mips_guest_exception_base(vcpu); if (kvm_read_c0_guest_cause(cop0) & CAUSEF_IV) - arch->pc = KVM_GUEST_KSEG0 + 0x200; + arch->pc += 0x200; else - arch->pc = KVM_GUEST_KSEG0 + 0x180; + arch->pc += 0x180; clear_bit(priority, &vcpu->arch.pending_exceptions); } diff --git a/arch/mips/kvm/trap_emul.c b/arch/mips/kvm/trap_emul.c index 08327de4323a..80a681f42bf5 100644 --- a/arch/mips/kvm/trap_emul.c +++ b/arch/mips/kvm/trap_emul.c @@ -653,6 +653,7 @@ static u64 kvm_trap_emul_get_one_regs[] = { KVM_REG_MIPS_CP0_CAUSE, KVM_REG_MIPS_CP0_EPC, KVM_REG_MIPS_CP0_PRID, + KVM_REG_MIPS_CP0_EBASE, KVM_REG_MIPS_CP0_CONFIG, KVM_REG_MIPS_CP0_CONFIG1, KVM_REG_MIPS_CP0_CONFIG2, @@ -735,6 +736,9 @@ static int kvm_trap_emul_get_one_reg(struct kvm_vcpu *vcpu, case KVM_REG_MIPS_CP0_PRID: *v = (long)kvm_read_c0_guest_prid(cop0); break; + case KVM_REG_MIPS_CP0_EBASE: + *v = (long)kvm_read_c0_guest_ebase(cop0); + break; case KVM_REG_MIPS_CP0_CONFIG: *v = (long)kvm_read_c0_guest_config(cop0); break; @@ -837,6 +841,14 @@ static int kvm_trap_emul_set_one_reg(struct kvm_vcpu *vcpu, case 
KVM_REG_MIPS_CP0_PRID: kvm_write_c0_guest_prid(cop0, v); break; + case KVM_REG_MIPS_CP0_EBASE: + /* + * Allow core number to be written, but the exception base must + * remain in guest KSeg0. + */ + kvm_change_c0_guest_ebase(cop0, 0x1ffff000 | MIPS_EBASE_CPUNUM, + v); + break; case KVM_REG_MIPS_CP0_COUNT: kvm_mips_write_count(vcpu, v); break; -- cgit v1.2.3 From be67a0be94b65746dee63af5c184c78d00a707f6 Mon Sep 17 00:00:00 2001 From: James Hogan Date: Wed, 18 Jan 2017 16:20:31 +0000 Subject: KVM: MIPS/T&E: Default to reset vector MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Set the default VCPU state closer to the architectural reset state, with PC pointing at the reset vector (uncached PA 0x1fc00000, which for KVM T&E is VA 0x5fc00000), and with CP0_Status.BEV and CP0_Status.ERL to 1. Although QEMU at least will overwrite this state, it makes sense to do this now that CP0_EBase is properly implemented to check BEV, and now that we support a sparse GPA layout potentially with a boot ROM at GPA 0x1fc00000. Signed-off-by: James Hogan Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: Ralf Baechle Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org --- arch/mips/kvm/trap_emul.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/arch/mips/kvm/trap_emul.c b/arch/mips/kvm/trap_emul.c index 80a681f42bf5..ce44f91c653a 100644 --- a/arch/mips/kvm/trap_emul.c +++ b/arch/mips/kvm/trap_emul.c @@ -614,6 +614,9 @@ static int kvm_trap_emul_vcpu_setup(struct kvm_vcpu *vcpu) /* Set Wait IE/IXMT Ignore in Config7, IAR, AR */ kvm_write_c0_guest_config7(cop0, (MIPS_CONF7_WII) | (1 << 10)); + /* Status */ + kvm_write_c0_guest_status(cop0, ST0_BEV | ST0_ERL); + /* * Setup IntCtl defaults, compatibility mode for timer interrupts (HW5) */ @@ -623,6 +626,9 @@ static int kvm_trap_emul_vcpu_setup(struct kvm_vcpu *vcpu) kvm_write_c0_guest_ebase(cop0, KVM_GUEST_KSEG0 | (vcpu_id & MIPS_EBASE_CPUNUM)); + /* Put PC at guest reset vector */ + vcpu->arch.pc = KVM_GUEST_CKSEG1ADDR(0x1fc00000); + return 0; } -- cgit v1.2.3 From 013044cc65f8661c5fa2b59da5e134b3453d975d Mon Sep 17 00:00:00 2001 From: James Hogan Date: Wed, 7 Dec 2016 17:16:37 +0000 Subject: KVM: MIPS/T&E: Expose CP0_EntryLo0/1 registers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Expose the CP0_EntryLo0 and CP0_EntryLo1 registers through the KVM register access API. This is fairly straightforward for trap & emulate since we don't support the RI and XI bits. For the sake of future proofing (particularly for VZ) it is explicitly specified that the API always exposes the 64-bit version of these registers (i.e. with the RI and XI bits in bit positions 63 and 62 respectively), and they are implemented in trap_emul.c rather than mips.c to allow them to be implemented differently for VZ. 
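For reference, a worked expansion of the new default PC, using the KVM_GUEST_* constants added earlier in the series (the exact macro expansion below is a sketch, not quoted from the source):

/*
 * The T&E guest KSeg0/KSeg1 window sits at VA 0x40000000, so:
 *
 *      KVM_GUEST_CKSEG1ADDR(0x1fc00000)
 *              ~= KVM_GUEST_KSEG1 | (0x1fc00000 & 0x1fffffff)
 *              == 0x40000000      |  0x1fc00000
 *              == 0x5fc00000
 *
 * which is the VA quoted above.  Userspace that loads a kernel image
 * directly (QEMU, for instance) simply overwrites the PC and CP0_Status
 * state again through the register ioctls, as noted above.
 */
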
Signed-off-by: James Hogan Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: Ralf Baechle Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org --- Documentation/virtual/kvm/api.txt | 8 ++++++++ arch/mips/include/asm/kvm_host.h | 2 ++ arch/mips/kvm/trap_emul.c | 14 ++++++++++++++ 3 files changed, 24 insertions(+) diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index 8d52d0f990ae..df4a309ba56e 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -2061,6 +2061,8 @@ registers, find a list below: MIPS | KVM_REG_MIPS_LO | 64 MIPS | KVM_REG_MIPS_PC | 64 MIPS | KVM_REG_MIPS_CP0_INDEX | 32 + MIPS | KVM_REG_MIPS_CP0_ENTRYLO0 | 64 + MIPS | KVM_REG_MIPS_CP0_ENTRYLO1 | 64 MIPS | KVM_REG_MIPS_CP0_CONTEXT | 64 MIPS | KVM_REG_MIPS_CP0_USERLOCAL | 64 MIPS | KVM_REG_MIPS_CP0_PAGEMASK | 32 @@ -2149,6 +2151,12 @@ patterns depending on whether they're 32-bit or 64-bit registers: 0x7020 0000 0001 00 (32-bit) 0x7030 0000 0001 00 (64-bit) +Note: KVM_REG_MIPS_CP0_ENTRYLO0 and KVM_REG_MIPS_CP0_ENTRYLO1 are the MIPS64 +versions of the EntryLo registers regardless of the word size of the host +hardware, host kernel, guest, and whether XPA is present in the guest, i.e. +with the RI and XI bits (if they exist) in bits 63 and 62 respectively, and +the PFNX field starting at bit 30. + MIPS KVM control registers (see above) have the following id bit patterns: 0x7030 0000 0002 diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h index 420372fa5bbc..66459ca4af81 100644 --- a/arch/mips/include/asm/kvm_host.h +++ b/arch/mips/include/asm/kvm_host.h @@ -352,7 +352,9 @@ struct kvm_vcpu_arch { #define kvm_read_c0_guest_index(cop0) (cop0->reg[MIPS_CP0_TLB_INDEX][0]) #define kvm_write_c0_guest_index(cop0, val) (cop0->reg[MIPS_CP0_TLB_INDEX][0] = val) #define kvm_read_c0_guest_entrylo0(cop0) (cop0->reg[MIPS_CP0_TLB_LO0][0]) +#define kvm_write_c0_guest_entrylo0(cop0, val) (cop0->reg[MIPS_CP0_TLB_LO0][0] = (val)) #define kvm_read_c0_guest_entrylo1(cop0) (cop0->reg[MIPS_CP0_TLB_LO1][0]) +#define kvm_write_c0_guest_entrylo1(cop0, val) (cop0->reg[MIPS_CP0_TLB_LO1][0] = (val)) #define kvm_read_c0_guest_context(cop0) (cop0->reg[MIPS_CP0_TLB_CONTEXT][0]) #define kvm_write_c0_guest_context(cop0, val) (cop0->reg[MIPS_CP0_TLB_CONTEXT][0] = (val)) #define kvm_read_c0_guest_userlocal(cop0) (cop0->reg[MIPS_CP0_TLB_CONTEXT][2]) diff --git a/arch/mips/kvm/trap_emul.c b/arch/mips/kvm/trap_emul.c index ce44f91c653a..2f9e44b0f177 100644 --- a/arch/mips/kvm/trap_emul.c +++ b/arch/mips/kvm/trap_emul.c @@ -646,6 +646,8 @@ static void kvm_trap_emul_flush_shadow_memslot(struct kvm *kvm, static u64 kvm_trap_emul_get_one_regs[] = { KVM_REG_MIPS_CP0_INDEX, + KVM_REG_MIPS_CP0_ENTRYLO0, + KVM_REG_MIPS_CP0_ENTRYLO1, KVM_REG_MIPS_CP0_CONTEXT, KVM_REG_MIPS_CP0_USERLOCAL, KVM_REG_MIPS_CP0_PAGEMASK, @@ -706,6 +708,12 @@ static int kvm_trap_emul_get_one_reg(struct kvm_vcpu *vcpu, case KVM_REG_MIPS_CP0_INDEX: *v = (long)kvm_read_c0_guest_index(cop0); break; + case KVM_REG_MIPS_CP0_ENTRYLO0: + *v = kvm_read_c0_guest_entrylo0(cop0); + break; + case KVM_REG_MIPS_CP0_ENTRYLO1: + *v = kvm_read_c0_guest_entrylo1(cop0); + break; case KVM_REG_MIPS_CP0_CONTEXT: *v = (long)kvm_read_c0_guest_context(cop0); break; @@ -817,6 +825,12 @@ static int kvm_trap_emul_set_one_reg(struct kvm_vcpu *vcpu, case KVM_REG_MIPS_CP0_INDEX: kvm_write_c0_guest_index(cop0, v); break; + case KVM_REG_MIPS_CP0_ENTRYLO0: + kvm_write_c0_guest_entrylo0(cop0, v); + break; + case KVM_REG_MIPS_CP0_ENTRYLO1: + 
kvm_write_c0_guest_entrylo1(cop0, v); + break; case KVM_REG_MIPS_CP0_CONTEXT: kvm_write_c0_guest_context(cop0, v); break; -- cgit v1.2.3 From ad58d4d4a274e9290725188c557d16e7d0cd1b3d Mon Sep 17 00:00:00 2001 From: James Hogan Date: Mon, 2 Feb 2015 22:55:17 +0000 Subject: KVM: MIPS/T&E: Expose read-only CP0_IntCtl register MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Expose the CP0_IntCtl register through the KVM register access API, which is a required register since MIPS32r2. It is currently read-only since the VS field isn't implemented due to lack of Config3.VInt or Config3.VEIC. It is implemented in trap_emul.c so that a VZ implementation can allow writes. Signed-off-by: James Hogan Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: Ralf Baechle Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org --- Documentation/virtual/kvm/api.txt | 1 + arch/mips/include/asm/kvm_host.h | 1 + arch/mips/kvm/trap_emul.c | 7 +++++++ 3 files changed, 9 insertions(+) diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index df4a309ba56e..d34b03c99233 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -2073,6 +2073,7 @@ registers, find a list below: MIPS | KVM_REG_MIPS_CP0_ENTRYHI | 64 MIPS | KVM_REG_MIPS_CP0_COMPARE | 32 MIPS | KVM_REG_MIPS_CP0_STATUS | 32 + MIPS | KVM_REG_MIPS_CP0_INTCTL | 32 MIPS | KVM_REG_MIPS_CP0_CAUSE | 32 MIPS | KVM_REG_MIPS_CP0_EPC | 64 MIPS | KVM_REG_MIPS_CP0_PRID | 32 diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h index 66459ca4af81..ebcc55963941 100644 --- a/arch/mips/include/asm/kvm_host.h +++ b/arch/mips/include/asm/kvm_host.h @@ -43,6 +43,7 @@ #define KVM_REG_MIPS_CP0_ENTRYHI MIPS_CP0_64(10, 0) #define KVM_REG_MIPS_CP0_COMPARE MIPS_CP0_32(11, 0) #define KVM_REG_MIPS_CP0_STATUS MIPS_CP0_32(12, 0) +#define KVM_REG_MIPS_CP0_INTCTL MIPS_CP0_32(12, 1) #define KVM_REG_MIPS_CP0_CAUSE MIPS_CP0_32(13, 0) #define KVM_REG_MIPS_CP0_EPC MIPS_CP0_64(14, 0) #define KVM_REG_MIPS_CP0_PRID MIPS_CP0_32(15, 0) diff --git a/arch/mips/kvm/trap_emul.c b/arch/mips/kvm/trap_emul.c index 2f9e44b0f177..b1fa53b252ea 100644 --- a/arch/mips/kvm/trap_emul.c +++ b/arch/mips/kvm/trap_emul.c @@ -658,6 +658,7 @@ static u64 kvm_trap_emul_get_one_regs[] = { KVM_REG_MIPS_CP0_ENTRYHI, KVM_REG_MIPS_CP0_COMPARE, KVM_REG_MIPS_CP0_STATUS, + KVM_REG_MIPS_CP0_INTCTL, KVM_REG_MIPS_CP0_CAUSE, KVM_REG_MIPS_CP0_EPC, KVM_REG_MIPS_CP0_PRID, @@ -741,6 +742,9 @@ static int kvm_trap_emul_get_one_reg(struct kvm_vcpu *vcpu, case KVM_REG_MIPS_CP0_STATUS: *v = (long)kvm_read_c0_guest_status(cop0); break; + case KVM_REG_MIPS_CP0_INTCTL: + *v = (long)kvm_read_c0_guest_intctl(cop0); + break; case KVM_REG_MIPS_CP0_CAUSE: *v = (long)kvm_read_c0_guest_cause(cop0); break; @@ -855,6 +859,9 @@ static int kvm_trap_emul_set_one_reg(struct kvm_vcpu *vcpu, case KVM_REG_MIPS_CP0_STATUS: kvm_write_c0_guest_status(cop0, v); break; + case KVM_REG_MIPS_CP0_INTCTL: + /* No VInt, so no VS, read-only for now */ + break; case KVM_REG_MIPS_CP0_EPC: kvm_write_c0_guest_epc(cop0, v); break; -- cgit v1.2.3 From 12ed1faece3f141c2604b5b3a8377ba71d23ec9d Mon Sep 17 00:00:00 2001 From: James Hogan Date: Tue, 13 Dec 2016 22:39:39 +0000 Subject: KVM: MIPS: Allow multiple VCPUs to be created MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Increase the maximum number of MIPS KVM VCPUs to 8, and implement the KVM_CAP_NR_VCPUS and KVM_CAP_MAX_CPUS capabilities which expose the recommended 
and maximum number of VCPUs to userland. The previous maximum of 1 didn't allow for any form of SMP guests. We calculate the values similarly to ARM, recommending as many VCPUs as there are CPUs online in the system. This will allow userland to know how many VCPUs it is possible to create. Signed-off-by: James Hogan Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: Ralf Baechle Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org --- arch/mips/include/asm/kvm_host.h | 2 +- arch/mips/kvm/mips.c | 6 ++++++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h index ebcc55963941..05e785fc061d 100644 --- a/arch/mips/include/asm/kvm_host.h +++ b/arch/mips/include/asm/kvm_host.h @@ -65,7 +65,7 @@ #define KVM_REG_MIPS_CP0_KSCRATCH6 MIPS_CP0_64(31, 7) -#define KVM_MAX_VCPUS 1 +#define KVM_MAX_VCPUS 8 #define KVM_USER_MEM_SLOTS 8 /* memory slots that does not exposed to userspace */ #define KVM_PRIVATE_MEM_SLOTS 0 diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c index 9338aec08790..31ee5ee0010b 100644 --- a/arch/mips/kvm/mips.c +++ b/arch/mips/kvm/mips.c @@ -1026,6 +1026,12 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_COALESCED_MMIO: r = KVM_COALESCED_MMIO_PAGE_OFFSET; break; + case KVM_CAP_NR_VCPUS: + r = num_online_cpus(); + break; + case KVM_CAP_MAX_VCPUS: + r = KVM_MAX_VCPUS; + break; case KVM_CAP_MIPS_FPU: /* We don't handle systems with inconsistent cpu_has_fpu */ r = !!raw_cpu_has_fpu; -- cgit v1.2.3 From e1e8a9624f7ba8ead4f056ff558ed070e86fa747 Mon Sep 17 00:00:00 2001 From: Janosch Frank Date: Thu, 2 Feb 2017 16:39:31 +0100 Subject: KVM: s390: Disable dirty log retrieval for UCONTROL guests User controlled KVM guests do not support the dirty log, as they have no single gmap that we can check for changes. As they have no single gmap, kvm->arch.gmap is NULL and all further referencing to it for dirty checking will result in a NULL dereference. Let's return -EINVAL if a caller tries to sync dirty logs for a UCONTROL guest. Fixes: 15f36eb ("KVM: s390: Add proper dirty bitmap support to S390 kvm.") Cc: # 3.16+ Signed-off-by: Janosch Frank Reported-by: Martin Schwidefsky Reviewed-by: Cornelia Huck Signed-off-by: Christian Borntraeger --- arch/s390/kvm/kvm-s390.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index dabd3b15bf11..502de74ea984 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -442,6 +442,9 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot; int is_dirty = 0; + if (kvm_is_ucontrol(kvm)) + return -EINVAL; + mutex_lock(&kvm->slots_lock); r = -EINVAL; -- cgit v1.2.3 From fb7dc1d4ddce744c8d8e1aca19d4982102cf72e1 Mon Sep 17 00:00:00 2001 From: Christian Borntraeger Date: Thu, 26 Jan 2017 20:45:33 +0100 Subject: KVM: s390: detect some program check loops Sometimes (e.g. early boot) a guest is broken in such ways that it loops 100% delivering operation exceptions (illegal operation) but the pgm new PSW is not set properly. This will result in code being read from address zero, which usually contains another illegal op. Let's detect this case and return to userspace. Instead of only detecting this for address zero apply a heuristic that will work for any program check new psw. We do not want guest problem state to be able to trigger a guest panic, e.g. by faulting on an address that is the same as the program check new PSW, so we check for the problem state bit being off. 
With proper handling in userspace we a: get rid of CPU consumption of such broken guests b: keep the program old PSW. This allows to find out the original illegal operation - making debugging such early boot issues much easier than with single stepping Signed-off-by: Christian Borntraeger Reviewed-by: Cornelia Huck --- arch/s390/kvm/intercept.c | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/arch/s390/kvm/intercept.c b/arch/s390/kvm/intercept.c index 8b13f7098c61..59920f96ebc0 100644 --- a/arch/s390/kvm/intercept.c +++ b/arch/s390/kvm/intercept.c @@ -361,6 +361,9 @@ static int handle_partial_execution(struct kvm_vcpu *vcpu) static int handle_operexc(struct kvm_vcpu *vcpu) { + psw_t oldpsw, newpsw; + int rc; + vcpu->stat.exit_operation_exception++; trace_kvm_s390_handle_operexc(vcpu, vcpu->arch.sie_block->ipa, vcpu->arch.sie_block->ipb); @@ -371,6 +374,24 @@ static int handle_operexc(struct kvm_vcpu *vcpu) if (vcpu->arch.sie_block->ipa == 0 && vcpu->kvm->arch.user_instr0) return -EOPNOTSUPP; + rc = read_guest_lc(vcpu, __LC_PGM_NEW_PSW, &newpsw, sizeof(psw_t)); + if (rc) + return rc; + /* + * Avoid endless loops of operation exceptions, if the pgm new + * PSW will cause a new operation exception. + * The heuristic checks if the pgm new psw is within 6 bytes before + * the faulting psw address (with same DAT, AS settings) and the + * new psw is not a wait psw and the fault was not triggered by + * problem state. + */ + oldpsw = vcpu->arch.sie_block->gpsw; + if (oldpsw.addr - newpsw.addr <= 6 && + !(newpsw.mask & PSW_MASK_WAIT) && + !(oldpsw.mask & PSW_MASK_PSTATE) && + (newpsw.mask & PSW_MASK_ASC) == (oldpsw.mask & PSW_MASK_ASC) && + (newpsw.mask & PSW_MASK_DAT) == (oldpsw.mask & PSW_MASK_DAT)) + return -EOPNOTSUPP; return kvm_s390_inject_program_int(vcpu, PGM_OPERATION); } -- cgit v1.2.3 From 42cf014d38d8822cce63703a467e00f65d000952 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 25 Jan 2017 11:58:57 +0100 Subject: KVM: nVMX: kmap() can't fail kmap() can't fail, therefore it will always return a valid pointer. Let's just get rid of the unnecessary checks. Signed-off-by: David Hildenbrand Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 9 --------- 1 file changed, 9 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index d850d5d36182..693e4203b666 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -4973,10 +4973,6 @@ static int vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu) return 0; vapic_page = kmap(vmx->nested.virtual_apic_page); - if (!vapic_page) { - WARN_ON(1); - return -ENOMEM; - } __kvm_apic_update_irr(vmx->nested.pi_desc->pir, vapic_page); kunmap(vmx->nested.virtual_apic_page); @@ -9738,11 +9734,6 @@ static inline bool nested_vmx_merge_msr_bitmap(struct kvm_vcpu *vcpu, return false; } msr_bitmap_l1 = (unsigned long *)kmap(page); - if (!msr_bitmap_l1) { - nested_release_page_clean(page); - WARN_ON(1); - return false; - } memset(msr_bitmap_l0, 0xff, PAGE_SIZE); -- cgit v1.2.3 From 6342c50ad12e8ce0736e722184a7dbdea4a3477f Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 25 Jan 2017 11:58:58 +0100 Subject: KVM: nVMX: vmx_complete_nested_posted_interrupt() can't fail vmx_complete_nested_posted_interrupt() can't fail, let's turn it into a void function. 
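
A minimal schematic of the refactor this patch performs (illustrative names, not the kernel code itself): a helper whose only return value was 0 becomes void, and its caller returns 0 explicitly instead of forwarding the result.

    struct ctx {
            int pending;
    };

    static void complete_pending_work(struct ctx *c)    /* was: static int ... */
    {
            if (!c->pending)
                    return;                             /* was: return 0; */
            c->pending = 0;                             /* ... do the actual completion ... */
    }

    static int check_events(struct ctx *c)
    {
            complete_pending_work(c);                   /* was: return complete_pending_work(c); */
            return 0;
    }
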
Signed-off-by: David Hildenbrand Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 693e4203b666..7c3e42623090 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -4953,7 +4953,7 @@ static bool vmx_get_enable_apicv(void) return enable_apicv; } -static int vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu) +static void vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); int max_irr; @@ -4964,13 +4964,13 @@ static int vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu) vmx->nested.pi_pending) { vmx->nested.pi_pending = false; if (!pi_test_and_clear_on(vmx->nested.pi_desc)) - return 0; + return; max_irr = find_last_bit( (unsigned long *)vmx->nested.pi_desc->pir, 256); if (max_irr == 256) - return 0; + return; vapic_page = kmap(vmx->nested.virtual_apic_page); __kvm_apic_update_irr(vmx->nested.pi_desc->pir, vapic_page); @@ -4983,7 +4983,6 @@ static int vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu) vmcs_write16(GUEST_INTR_STATUS, status); } } - return 0; } static inline bool kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu) @@ -10695,7 +10694,8 @@ static int vmx_check_nested_events(struct kvm_vcpu *vcpu, bool external_intr) return 0; } - return vmx_complete_nested_posted_interrupt(vcpu); + vmx_complete_nested_posted_interrupt(vcpu); + return 0; } static u32 vmx_get_preemption_timer_value(struct kvm_vcpu *vcpu) -- cgit v1.2.3 From 55dd00a73a518281bc846dc5de1a718349431eb2 Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Tue, 24 Jan 2017 15:09:39 -0200 Subject: KVM: x86: add KVM_HC_CLOCK_PAIRING hypercall Add a hypercall to retrieve the host realtime clock and the TSC value used to calculate that clock read. Used to implement clock synchronization between host and guest. Signed-off-by: Marcelo Tosatti Signed-off-by: Paolo Bonzini --- Documentation/virtual/kvm/hypercalls.txt | 35 +++++++++++++++++ arch/x86/include/uapi/asm/kvm_para.h | 9 +++++ arch/x86/kvm/x86.c | 66 ++++++++++++++++++++++++++++++++ include/uapi/linux/kvm_para.h | 2 + 4 files changed, 112 insertions(+) diff --git a/Documentation/virtual/kvm/hypercalls.txt b/Documentation/virtual/kvm/hypercalls.txt index c8d040e27046..feaaa634f154 100644 --- a/Documentation/virtual/kvm/hypercalls.txt +++ b/Documentation/virtual/kvm/hypercalls.txt @@ -81,3 +81,38 @@ the vcpu to sleep until occurrence of an appropriate event. Another vcpu of the same guest can wakeup the sleeping vcpu by issuing KVM_HC_KICK_CPU hypercall, specifying APIC ID (a1) of the vcpu to be woken up. An additional argument (a0) is used in the hypercall for future use. + + +6. KVM_HC_CLOCK_PAIRING +------------------------ +Architecture: x86 +Status: active +Purpose: Hypercall used to synchronize host and guest clocks. +Usage: + +a0: guest physical address where host copies +"struct kvm_clock_offset" structure. + +a1: clock_type, ATM only KVM_CLOCK_PAIRING_WALLCLOCK (0) +is supported (corresponding to the host's CLOCK_REALTIME clock). + + struct kvm_clock_pairing { + __s64 sec; + __s64 nsec; + __u64 tsc; + __u32 flags; + __u32 pad[9]; + }; + + Where: + * sec: seconds from clock_type clock. + * nsec: nanoseconds from clock_type clock. + * tsc: guest TSC value used to calculate sec/nsec pair + * flags: flags, unused (0) at the moment. + +The hypercall lets a guest compute a precise timestamp across +host and guest. 
The guest can use the returned TSC value to +compute the CLOCK_REALTIME for its clock, at the same instant. + +Returns KVM_EOPNOTSUPP if the host does not use TSC clocksource, +or if clock type is different than KVM_CLOCK_PAIRING_WALLCLOCK. diff --git a/arch/x86/include/uapi/asm/kvm_para.h b/arch/x86/include/uapi/asm/kvm_para.h index 1421a6585126..cff0bb6556f8 100644 --- a/arch/x86/include/uapi/asm/kvm_para.h +++ b/arch/x86/include/uapi/asm/kvm_para.h @@ -50,6 +50,15 @@ struct kvm_steal_time { __u32 pad[11]; }; +#define KVM_CLOCK_PAIRING_WALLCLOCK 0 +struct kvm_clock_pairing { + __s64 sec; + __s64 nsec; + __u64 tsc; + __u32 flags; + __u32 pad[9]; +}; + #define KVM_STEAL_ALIGNMENT_BITS 5 #define KVM_STEAL_VALID_BITS ((-1ULL << (KVM_STEAL_ALIGNMENT_BITS + 1))) #define KVM_STEAL_RESERVED_MASK (((1 << KVM_STEAL_ALIGNMENT_BITS) - 1 ) << 1) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 4fd4d4f35caf..09e5d31dac98 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1142,6 +1142,7 @@ struct pvclock_gtod_data { u64 boot_ns; u64 nsec_base; + u64 wall_time_sec; }; static struct pvclock_gtod_data pvclock_gtod_data; @@ -1165,6 +1166,8 @@ static void update_pvclock_gtod(struct timekeeper *tk) vdata->boot_ns = boot_ns; vdata->nsec_base = tk->tkr_mono.xtime_nsec; + vdata->wall_time_sec = tk->xtime_sec; + write_seqcount_end(&vdata->seq); } #endif @@ -1626,6 +1629,28 @@ static int do_monotonic_boot(s64 *t, u64 *cycle_now) return mode; } +static int do_realtime(struct timespec *ts, u64 *cycle_now) +{ + struct pvclock_gtod_data *gtod = &pvclock_gtod_data; + unsigned long seq; + int mode; + u64 ns; + + do { + seq = read_seqcount_begin(>od->seq); + mode = gtod->clock.vclock_mode; + ts->tv_sec = gtod->wall_time_sec; + ns = gtod->nsec_base; + ns += vgettsc(cycle_now); + ns >>= gtod->clock.shift; + } while (unlikely(read_seqcount_retry(>od->seq, seq))); + + ts->tv_sec += __iter_div_u64_rem(ns, NSEC_PER_SEC, &ns); + ts->tv_nsec = ns; + + return mode; +} + /* returns true if host is using tsc clocksource */ static bool kvm_get_time_and_clockread(s64 *kernel_ns, u64 *cycle_now) { @@ -1635,6 +1660,17 @@ static bool kvm_get_time_and_clockread(s64 *kernel_ns, u64 *cycle_now) return do_monotonic_boot(kernel_ns, cycle_now) == VCLOCK_TSC; } + +/* returns true if host is using tsc clocksource */ +static bool kvm_get_walltime_and_clockread(struct timespec *ts, + u64 *cycle_now) +{ + /* checked again under seqlock below */ + if (pvclock_gtod_data.clock.vclock_mode != VCLOCK_TSC) + return false; + + return do_realtime(ts, cycle_now) == VCLOCK_TSC; +} #endif /* @@ -6112,6 +6148,33 @@ int kvm_emulate_halt(struct kvm_vcpu *vcpu) } EXPORT_SYMBOL_GPL(kvm_emulate_halt); +static int kvm_pv_clock_pairing(struct kvm_vcpu *vcpu, gpa_t paddr, + unsigned long clock_type) +{ + struct kvm_clock_pairing clock_pairing; + struct timespec ts; + cycle_t cycle; + int ret; + + if (clock_type != KVM_CLOCK_PAIRING_WALLCLOCK) + return -KVM_EOPNOTSUPP; + + if (kvm_get_walltime_and_clockread(&ts, &cycle) == false) + return -KVM_EOPNOTSUPP; + + clock_pairing.sec = ts.tv_sec; + clock_pairing.nsec = ts.tv_nsec; + clock_pairing.tsc = kvm_read_l1_tsc(vcpu, cycle); + clock_pairing.flags = 0; + + ret = 0; + if (kvm_write_guest(vcpu->kvm, paddr, &clock_pairing, + sizeof(struct kvm_clock_pairing))) + ret = -KVM_EFAULT; + + return ret; +} + /* * kvm_pv_kick_cpu_op: Kick a vcpu. 
* @@ -6176,6 +6239,9 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) kvm_pv_kick_cpu_op(vcpu->kvm, a0, a1); ret = 0; break; + case KVM_HC_CLOCK_PAIRING: + ret = kvm_pv_clock_pairing(vcpu, a0, a1); + break; default: ret = -KVM_ENOSYS; break; diff --git a/include/uapi/linux/kvm_para.h b/include/uapi/linux/kvm_para.h index bf6cd7d5cac2..fed506aeff62 100644 --- a/include/uapi/linux/kvm_para.h +++ b/include/uapi/linux/kvm_para.h @@ -14,6 +14,7 @@ #define KVM_EFAULT EFAULT #define KVM_E2BIG E2BIG #define KVM_EPERM EPERM +#define KVM_EOPNOTSUPP 95 #define KVM_HC_VAPIC_POLL_IRQ 1 #define KVM_HC_MMU_OP 2 @@ -23,6 +24,7 @@ #define KVM_HC_MIPS_GET_CLOCK_FREQ 6 #define KVM_HC_MIPS_EXIT_VM 7 #define KVM_HC_MIPS_CONSOLE_OUTPUT 8 +#define KVM_HC_CLOCK_PAIRING 9 /* * hypercalls use architecture specific -- cgit v1.2.3 From 80fbd89cbd07287a7013006c14ddec923b7a4ff6 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 8 Feb 2017 10:57:24 +0100 Subject: KVM: x86: fix compilation Fix rebase breakage from commit 55dd00a73a51 ("KVM: x86: add KVM_HC_CLOCK_PAIRING hypercall", 2017-01-24), courtesy of the "I could have sworn I had pushed the right branch" department. Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 09e5d31dac98..96dd7dd13ee6 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -6153,7 +6153,7 @@ static int kvm_pv_clock_pairing(struct kvm_vcpu *vcpu, gpa_t paddr, { struct kvm_clock_pairing clock_pairing; struct timespec ts; - cycle_t cycle; + u64 cycle; int ret; if (clock_type != KVM_CLOCK_PAIRING_WALLCLOCK) -- cgit v1.2.3 From f4066c2bc4d0de4e5dcbff21dae41e89fe8f38c0 Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Tue, 24 Jan 2017 15:09:41 -0200 Subject: kvmclock: export kvmclock clocksource and data pointers To be used by KVM PTP driver. Signed-off-by: Marcelo Tosatti Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvmclock.h | 6 ++++++ arch/x86/kernel/kvmclock.c | 5 ++++- 2 files changed, 10 insertions(+), 1 deletion(-) create mode 100644 arch/x86/include/asm/kvmclock.h diff --git a/arch/x86/include/asm/kvmclock.h b/arch/x86/include/asm/kvmclock.h new file mode 100644 index 000000000000..f260bef63591 --- /dev/null +++ b/arch/x86/include/asm/kvmclock.h @@ -0,0 +1,6 @@ +#ifndef _ASM_X86_KVM_CLOCK_H +#define _ASM_X86_KVM_CLOCK_H + +extern struct clocksource kvm_clock; + +#endif /* _ASM_X86_KVM_CLOCK_H */ diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index 2a5cafdf8808..995fa260a6da 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -28,6 +28,7 @@ #include #include +#include static int kvmclock __ro_after_init = 1; static int msr_kvm_system_time = MSR_KVM_SYSTEM_TIME; @@ -49,6 +50,7 @@ struct pvclock_vsyscall_time_info *pvclock_pvti_cpu0_va(void) { return hv_clock; } +EXPORT_SYMBOL_GPL(pvclock_pvti_cpu0_va); /* * The wallclock is the time of day when we booted. 
Since then, some time may @@ -174,13 +176,14 @@ bool kvm_check_and_clear_guest_paused(void) return ret; } -static struct clocksource kvm_clock = { +struct clocksource kvm_clock = { .name = "kvm-clock", .read = kvm_clock_get_cycles, .rating = 400, .mask = CLOCKSOURCE_MASK(64), .flags = CLOCK_SOURCE_IS_CONTINUOUS, }; +EXPORT_SYMBOL_GPL(kvm_clock); int kvm_register_clock(char *txt) { -- cgit v1.2.3 From a0e136d436ded817c0aade72efdefa56a00b4e5e Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Tue, 24 Jan 2017 15:09:42 -0200 Subject: PTP: add kvm PTP driver Add a driver with gettime method returning hosts realtime clock. This allows Chrony to synchronize host and guest clocks with high precision (see results below). chronyc> sources MS Name/IP address Stratum Poll Reach LastRx Last sample =============================================================================== To configure Chronyd to use PHC refclock, add the following line to its configuration file: refclock PHC /dev/ptpX poll 3 dpoll -2 offset 0 Where /dev/ptpX is the kvmclock PTP clock. Signed-off-by: Marcelo Tosatti Acked-by: Richard Cochran Signed-off-by: Paolo Bonzini --- drivers/ptp/Kconfig | 12 +++ drivers/ptp/Makefile | 1 + drivers/ptp/ptp_kvm.c | 200 ++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 213 insertions(+) create mode 100644 drivers/ptp/ptp_kvm.c diff --git a/drivers/ptp/Kconfig b/drivers/ptp/Kconfig index bdce33291161..384f661a6496 100644 --- a/drivers/ptp/Kconfig +++ b/drivers/ptp/Kconfig @@ -90,4 +90,16 @@ config PTP_1588_CLOCK_PCH To compile this driver as a module, choose M here: the module will be called ptp_pch. +config PTP_1588_CLOCK_KVM + tristate "KVM virtual PTP clock" + depends on PTP_1588_CLOCK + depends on KVM_GUEST && X86 + default y + help + This driver adds support for using kvm infrastructure as a PTP + clock. This clock is only useful if you are using KVM guests. + + To compile this driver as a module, choose M here: the module + will be called ptp_kvm. + endmenu diff --git a/drivers/ptp/Makefile b/drivers/ptp/Makefile index 8b58597298de..530736161a8b 100644 --- a/drivers/ptp/Makefile +++ b/drivers/ptp/Makefile @@ -6,3 +6,4 @@ ptp-y := ptp_clock.o ptp_chardev.o ptp_sysfs.o obj-$(CONFIG_PTP_1588_CLOCK) += ptp.o obj-$(CONFIG_PTP_1588_CLOCK_IXP46X) += ptp_ixp46x.o obj-$(CONFIG_PTP_1588_CLOCK_PCH) += ptp_pch.o +obj-$(CONFIG_PTP_1588_CLOCK_KVM) += ptp_kvm.o diff --git a/drivers/ptp/ptp_kvm.c b/drivers/ptp/ptp_kvm.c new file mode 100644 index 000000000000..0a54e8326a90 --- /dev/null +++ b/drivers/ptp/ptp_kvm.c @@ -0,0 +1,200 @@ +/* + * Virtual PTP 1588 clock for use with KVM guests + * + * Copyright (C) 2017 Red Hat Inc. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +struct kvm_ptp_clock { + struct ptp_clock *ptp_clock; + struct ptp_clock_info caps; +}; + +DEFINE_SPINLOCK(kvm_ptp_lock); + +static struct pvclock_vsyscall_time_info *hv_clock; + +static struct kvm_clock_pairing clock_pair; +static phys_addr_t clock_pair_gpa; + +static int ptp_kvm_get_time_fn(ktime_t *device_time, + struct system_counterval_t *system_counter, + void *ctx) +{ + unsigned long ret; + struct timespec64 tspec; + unsigned version; + int cpu; + struct pvclock_vcpu_time_info *src; + + spin_lock(&kvm_ptp_lock); + + preempt_disable_notrace(); + cpu = smp_processor_id(); + src = &hv_clock[cpu].pvti; + + do { + /* + * We are using a TSC value read in the hosts + * kvm_hc_clock_pairing handling. + * So any changes to tsc_to_system_mul + * and tsc_shift or any other pvclock + * data invalidate that measurement. + */ + version = pvclock_read_begin(src); + + ret = kvm_hypercall2(KVM_HC_CLOCK_PAIRING, + clock_pair_gpa, + KVM_CLOCK_PAIRING_WALLCLOCK); + if (ret != 0) { + pr_err_ratelimited("clock pairing hypercall ret %lu\n", ret); + spin_unlock(&kvm_ptp_lock); + preempt_enable_notrace(); + return -EOPNOTSUPP; + } + + tspec.tv_sec = clock_pair.sec; + tspec.tv_nsec = clock_pair.nsec; + ret = __pvclock_read_cycles(src, clock_pair.tsc); + } while (pvclock_read_retry(src, version)); + + preempt_enable_notrace(); + + system_counter->cycles = ret; + system_counter->cs = &kvm_clock; + + *device_time = timespec64_to_ktime(tspec); + + spin_unlock(&kvm_ptp_lock); + + return 0; +} + +static int ptp_kvm_getcrosststamp(struct ptp_clock_info *ptp, + struct system_device_crosststamp *xtstamp) +{ + return get_device_system_crosststamp(ptp_kvm_get_time_fn, NULL, + NULL, xtstamp); +} + +/* + * PTP clock operations + */ + +static int ptp_kvm_adjfreq(struct ptp_clock_info *ptp, s32 ppb) +{ + return -EOPNOTSUPP; +} + +static int ptp_kvm_adjtime(struct ptp_clock_info *ptp, s64 delta) +{ + return -EOPNOTSUPP; +} + +static int ptp_kvm_settime(struct ptp_clock_info *ptp, + const struct timespec64 *ts) +{ + return -EOPNOTSUPP; +} + +static int ptp_kvm_gettime(struct ptp_clock_info *ptp, struct timespec64 *ts) +{ + unsigned long ret; + struct timespec64 tspec; + + spin_lock(&kvm_ptp_lock); + + ret = kvm_hypercall2(KVM_HC_CLOCK_PAIRING, + clock_pair_gpa, + KVM_CLOCK_PAIRING_WALLCLOCK); + if (ret != 0) { + pr_err_ratelimited("clock offset hypercall ret %lu\n", ret); + spin_unlock(&kvm_ptp_lock); + return -EOPNOTSUPP; + } + + tspec.tv_sec = clock_pair.sec; + tspec.tv_nsec = clock_pair.nsec; + spin_unlock(&kvm_ptp_lock); + + memcpy(ts, &tspec, sizeof(struct timespec64)); + + return 0; +} + +static int ptp_kvm_enable(struct ptp_clock_info *ptp, + struct ptp_clock_request *rq, int on) +{ + return -EOPNOTSUPP; +} + +static struct ptp_clock_info ptp_kvm_caps = { + .owner = THIS_MODULE, + .name = "KVM virtual PTP", + .max_adj = 0, + .n_ext_ts = 0, + .n_pins = 0, + .pps = 0, + .adjfreq = ptp_kvm_adjfreq, + .adjtime = ptp_kvm_adjtime, + .gettime64 = ptp_kvm_gettime, + .settime64 = ptp_kvm_settime, + .enable = ptp_kvm_enable, + .getcrosststamp = ptp_kvm_getcrosststamp, +}; + +/* module operations */ + +static struct kvm_ptp_clock kvm_ptp_clock; + +static void __exit ptp_kvm_exit(void) +{ + ptp_clock_unregister(kvm_ptp_clock.ptp_clock); +} + +static int __init ptp_kvm_init(void) +{ + clock_pair_gpa = slow_virt_to_phys(&clock_pair); + hv_clock = pvclock_pvti_cpu0_va(); + + if (!hv_clock) + return -ENODEV; + + 
kvm_ptp_clock.caps = ptp_kvm_caps; + + kvm_ptp_clock.ptp_clock = ptp_clock_register(&kvm_ptp_clock.caps, NULL); + + if (IS_ERR(kvm_ptp_clock.ptp_clock)) + return PTR_ERR(kvm_ptp_clock.ptp_clock); + + return 0; +} + +module_init(ptp_kvm_init); +module_exit(ptp_kvm_exit); + +MODULE_AUTHOR("Marcelo Tosatti "); +MODULE_DESCRIPTION("PTP clock using KVMCLOCK"); +MODULE_LICENSE("GPL"); -- cgit v1.2.3
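
A hedged usage sketch for the guest side: once ptp_kvm has registered, the clock can be read through the dynamic POSIX clock interface. The /dev/ptp0 path is an assumption; the right node is the one whose /sys/class/ptp/ptp*/clock_name reads "KVM virtual PTP".

    #include <fcntl.h>
    #include <stdio.h>
    #include <time.h>
    #include <unistd.h>

    #define CLOCKFD 3
    #define FD_TO_CLOCKID(fd) ((~(clockid_t)(fd) << 3) | CLOCKFD)

    int main(void)
    {
            struct timespec ts;
            int fd = open("/dev/ptp0", O_RDONLY);       /* assumed device path */

            if (fd < 0) {
                    perror("open /dev/ptp0");
                    return 1;
            }
            if (clock_gettime(FD_TO_CLOCKID(fd), &ts)) {
                    perror("clock_gettime");
                    close(fd);
                    return 1;
            }
            printf("host realtime via ptp_kvm: %lld.%09ld\n",
                   (long long)ts.tv_sec, ts.tv_nsec);
            close(fd);
            return 0;
    }

This is the same device node the commit message points chronyd at with the "refclock PHC /dev/ptpX" line; the snippet above just reads it once instead of steering the guest clock.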