Merge remote-tracking branch 'kvm/linux-next'

author: Stephen Rothwell <sfr@canb.auug.org.au> 2017-02-09 14:22:45 +1100
committer: Stephen Rothwell <sfr@canb.auug.org.au> 2017-02-09 14:22:45 +1100
commit: f494e0835d2db55f49ba1c97ce7f85b1fccacb45 (patch)
tree: 16d13f85796efa859a3736b32966bc9173692e54
parent: e15246babb490d1285a70591b5544d57c427c4d5 (diff)
parent: a0e136d436ded817c0aade72efdefa56a00b4e5e (diff)
91 files changed, 6451 insertions, 2222 deletions
diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
index 03145b7cafaa..e4f2cdcf78eb 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -2061,6 +2061,8 @@ registers, find a list below:
   MIPS  | KVM_REG_MIPS_LO               | 64
   MIPS  | KVM_REG_MIPS_PC               | 64
   MIPS  | KVM_REG_MIPS_CP0_INDEX        | 32
+  MIPS  | KVM_REG_MIPS_CP0_ENTRYLO0     | 64
+  MIPS  | KVM_REG_MIPS_CP0_ENTRYLO1     | 64
   MIPS  | KVM_REG_MIPS_CP0_CONTEXT      | 64
   MIPS  | KVM_REG_MIPS_CP0_USERLOCAL    | 64
   MIPS  | KVM_REG_MIPS_CP0_PAGEMASK     | 32
@@ -2071,9 +2073,11 @@ registers, find a list below:
   MIPS  | KVM_REG_MIPS_CP0_ENTRYHI      | 64
   MIPS  | KVM_REG_MIPS_CP0_COMPARE      | 32
   MIPS  | KVM_REG_MIPS_CP0_STATUS       | 32
+  MIPS  | KVM_REG_MIPS_CP0_INTCTL       | 32
   MIPS  | KVM_REG_MIPS_CP0_CAUSE        | 32
   MIPS  | KVM_REG_MIPS_CP0_EPC          | 64
   MIPS  | KVM_REG_MIPS_CP0_PRID         | 32
+  MIPS  | KVM_REG_MIPS_CP0_EBASE        | 64
   MIPS  | KVM_REG_MIPS_CP0_CONFIG       | 32
   MIPS  | KVM_REG_MIPS_CP0_CONFIG1      | 32
   MIPS  | KVM_REG_MIPS_CP0_CONFIG2      | 32
@@ -2148,6 +2152,12 @@ patterns depending on whether they're 32-bit or 64-bit registers:
   0x7020 0000 0001 00 <reg:5> <sel:3>   (32-bit)
   0x7030 0000 0001 00 <reg:5> <sel:3>   (64-bit)
 
+Note: KVM_REG_MIPS_CP0_ENTRYLO0 and KVM_REG_MIPS_CP0_ENTRYLO1 are the MIPS64
+versions of the EntryLo registers regardless of the word size of the host
+hardware, host kernel, guest, and whether XPA is present in the guest, i.e.
+with the RI and XI bits (if they exist) in bits 63 and 62 respectively, and
+the PFNX field starting at bit 30.
+
 MIPS KVM control registers (see above) have the following id bit patterns:
   0x7030 0000 0002 <reg:16>
 
@@ -2443,18 +2453,20 @@ are, it will do nothing and return an EBUSY error.
 The parameter is a pointer to a 32-bit unsigned integer variable
 containing the order (log base 2) of the desired size of the hash
 table, which must be between 18 and 46.  On successful return from the
-ioctl, it will have been updated with the order of the hash table that
-was allocated.
+ioctl, the value will not be changed by the kernel.
 
 If no hash table has been allocated when any vcpu is asked to run
 (with the KVM_RUN ioctl), the host kernel will allocate a
 default-sized hash table (16 MB).
 
 If this ioctl is called when a hash table has already been allocated,
-the kernel will clear out the existing hash table (zero all HPTEs) and
-return the hash table order in the parameter.  (If the guest is using
-the virtualized real-mode area (VRMA) facility, the kernel will
-re-create the VMRA HPTEs on the next KVM_RUN of any vcpu.)
+with a different order from the existing hash table, the existing hash
+table will be freed and a new one allocated.  If this is ioctl is
+called when a hash table has already been allocated of the same order
+as specified, the kernel will clear out the existing hash table (zero
+all HPTEs).  In either case, if the guest is using the virtualized
+real-mode area (VRMA) facility, the kernel will re-create the VMRA
+HPTEs on the next KVM_RUN of any vcpu.
 
 4.77 KVM_S390_INTERRUPT
 
@@ -3177,7 +3189,7 @@ of IOMMU pages.
 
 The rest of functionality is identical to KVM_CREATE_SPAPR_TCE.
 
-4.98 KVM_REINJECT_CONTROL
+4.99 KVM_REINJECT_CONTROL
 
 Capability: KVM_CAP_REINJECT_CONTROL
 Architectures: x86
@@ -3201,6 +3213,166 @@ struct kvm_reinject_control {
 pit_reinject = 0 (!reinject mode) is recommended, unless running an old
 operating system that uses the PIT for timing (e.g. Linux 2.4.x).
 
+4.100 KVM_PPC_CONFIGURE_V3_MMU
+
+Capability: KVM_CAP_PPC_RADIX_MMU or KVM_CAP_PPC_HASH_MMU_V3
+Architectures: ppc
+Type: vm ioctl
+Parameters: struct kvm_ppc_mmuv3_cfg (in)
+Returns: 0 on success,
+         -EFAULT if struct kvm_ppc_mmuv3_cfg cannot be read,
+         -EINVAL if the configuration is invalid
+
+This ioctl controls whether the guest will use radix or HPT (hashed
+page table) translation, and sets the pointer to the process table for
+the guest.
+
+struct kvm_ppc_mmuv3_cfg {
+	__u64	flags;
+	__u64	process_table;
+};
+
+There are two bits that can be set in flags; KVM_PPC_MMUV3_RADIX and
+KVM_PPC_MMUV3_GTSE.  KVM_PPC_MMUV3_RADIX, if set, configures the guest
+to use radix tree translation, and if clear, to use HPT translation.
+KVM_PPC_MMUV3_GTSE, if set and if KVM permits it, configures the guest
+to be able to use the global TLB and SLB invalidation instructions;
+if clear, the guest may not use these instructions.
+
+The process_table field specifies the address and size of the guest
+process table, which is in the guest's space.  This field is formatted
+as the second doubleword of the partition table entry, as defined in
+the Power ISA V3.00, Book III section 5.7.6.1.
+
+4.101 KVM_PPC_GET_RMMU_INFO
+
+Capability: KVM_CAP_PPC_RADIX_MMU
+Architectures: ppc
+Type: vm ioctl
+Parameters: struct kvm_ppc_rmmu_info (out)
+Returns: 0 on success,
+	 -EFAULT if struct kvm_ppc_rmmu_info cannot be written,
+	 -EINVAL if no useful information can be returned
+
+This ioctl returns a structure containing two things: (a) a list
+containing supported radix tree geometries, and (b) a list that maps
+page sizes to put in the "AP" (actual page size) field for the tlbie
+(TLB invalidate entry) instruction.
+
+struct kvm_ppc_rmmu_info {
+	struct kvm_ppc_radix_geom {
+		__u8	page_shift;
+		__u8	level_bits[4];
+		__u8	pad[3];
+	}	geometries[8];
+	__u32	ap_encodings[8];
+};
+
+The geometries[] field gives up to 8 supported geometries for the
+radix page table, in terms of the log base 2 of the smallest page
+size, and the number of bits indexed at each level of the tree, from
+the PTE level up to the PGD level in that order.  Any unused entries
+will have 0 in the page_shift field.
+
+The ap_encodings gives the supported page sizes and their AP field
+encodings, encoded with the AP value in the top 3 bits and the log
+base 2 of the page size in the bottom 6 bits.
+
+4.102 KVM_PPC_RESIZE_HPT_PREPARE
+
+Capability: KVM_CAP_SPAPR_RESIZE_HPT
+Architectures: powerpc
+Type: vm ioctl
+Parameters: struct kvm_ppc_resize_hpt (in)
+Returns: 0 on successful completion,
+	 >0 if a new HPT is being prepared, the value is an estimated
+             number of milliseconds until preparation is complete
+         -EFAULT if struct kvm_reinject_control cannot be read,
+	 -EINVAL if the supplied shift or flags are invalid
+	 -ENOMEM if unable to allocate the new HPT
+	 -ENOSPC if there was a hash collision when moving existing
+                  HPT entries to the new HPT
+	 -EIO on other error conditions
+
+Used to implement the PAPR extension for runtime resizing of a guest's
+Hashed Page Table (HPT).  Specifically this starts, stops or monitors
+the preparation of a new potential HPT for the guest, essentially
+implementing the H_RESIZE_HPT_PREPARE hypercall.
+
+If called with shift > 0 when there is no pending HPT for the guest,
+this begins preparation of a new pending HPT of size 2^(shift) bytes.
+It then returns a positive integer with the estimated number of
+milliseconds until preparation is complete.
+
+If called when there is a pending HPT whose size does not match that
+requested in the parameters, discards the existing pending HPT and
+creates a new one as above.
+
+If called when there is a pending HPT of the size requested, will:
+  * If preparation of the pending HPT is already complete, return 0
+  * If preparation of the pending HPT has failed, return an error
+    code, then discard the pending HPT.
+  * If preparation of the pending HPT is still in progress, return an
+    estimated number of milliseconds until preparation is complete.
+
+If called with shift == 0, discards any currently pending HPT and
+returns 0 (i.e. cancels any in-progress preparation).
+
+flags is reserved for future expansion, currently setting any bits in
+flags will result in an -EINVAL.
+
+Normally this will be called repeatedly with the same parameters until
+it returns <= 0.  The first call will initiate preparation, subsequent
+ones will monitor preparation until it completes or fails.
+
+struct kvm_ppc_resize_hpt {
+	__u64 flags;
+	__u32 shift;
+	__u32 pad;
+};
+
+4.103 KVM_PPC_RESIZE_HPT_COMMIT
+
+Capability: KVM_CAP_SPAPR_RESIZE_HPT
+Architectures: powerpc
+Type: vm ioctl
+Parameters: struct kvm_ppc_resize_hpt (in)
+Returns: 0 on successful completion,
+         -EFAULT if struct kvm_reinject_control cannot be read,
+	 -EINVAL if the supplied shift or flags are invalid
+	 -ENXIO is there is no pending HPT, or the pending HPT doesn't
+                 have the requested size
+	 -EBUSY if the pending HPT is not fully prepared
+	 -ENOSPC if there was a hash collision when moving existing
+                  HPT entries to the new HPT
+	 -EIO on other error conditions
+
+Used to implement the PAPR extension for runtime resizing of a guest's
+Hashed Page Table (HPT).  Specifically this requests that the guest be
+transferred to working with the new HPT, essentially implementing the
+H_RESIZE_HPT_COMMIT hypercall.
+
+This should only be called after KVM_PPC_RESIZE_HPT_PREPARE has
+returned 0 with the same parameters.  In other cases
+KVM_PPC_RESIZE_HPT_COMMIT will return an error (usually -ENXIO or
+-EBUSY, though others may be possible if the preparation was started,
+but failed).
+
+This will have undefined effects on the guest if it has not already
+placed itself in a quiescent state where no vcpu will make MMU enabled
+memory accesses.
+
+On succsful completion, the pending HPT will become the guest's active
+HPT and the previous HPT will be discarded.
+
+On failure, the guest will still be operating on its previous HPT.
+
+struct kvm_ppc_resize_hpt {
+	__u64 flags;
+	__u32 shift;
+	__u32 pad;
+};
+
 5. The kvm_run structure
 ------------------------
 
@@ -3942,3 +4114,21 @@ In order to use SynIC, it has to be activated by setting this
 capability via KVM_ENABLE_CAP ioctl on the vcpu fd. Note that this
 will disable the use of APIC hardware virtualization even if supported
 by the CPU, as it's incompatible with SynIC auto-EOI behavior.
+
+8.3 KVM_CAP_PPC_RADIX_MMU
+
+Architectures: ppc
+
+This capability, if KVM_CHECK_EXTENSION indicates that it is
+available, means that that the kernel can support guests using the
+radix MMU defined in Power ISA V3.00 (as implemented in the POWER9
+processor).
+
+8.4 KVM_CAP_PPC_HASH_MMU_V3
+
+Architectures: ppc
+
+This capability, if KVM_CHECK_EXTENSION indicates that it is
+available, means that that the kernel can support guests using the
+hashed page table MMU defined in Power ISA V3.00 (as implemented in
+the POWER9 processor), including in-memory segment tables.
diff --git a/Documentation/virtual/kvm/hypercalls.txt b/Documentation/virtual/kvm/hypercalls.txt
index c8d040e27046..feaaa634f154 100644
--- a/Documentation/virtual/kvm/hypercalls.txt
+++ b/Documentation/virtual/kvm/hypercalls.txt
@@ -81,3 +81,38 @@ the vcpu to sleep until occurrence of an appropriate event. Another vcpu of the
 same guest can wakeup the sleeping vcpu by issuing KVM_HC_KICK_CPU hypercall,
 specifying APIC ID (a1) of the vcpu to be woken up. An additional argument (a0)
 is used in the hypercall for future use.
+
+
+6. KVM_HC_CLOCK_PAIRING
+------------------------
+Architecture: x86
+Status: active
+Purpose: Hypercall used to synchronize host and guest clocks.
+Usage:
+
+a0: guest physical address where host copies
+"struct kvm_clock_offset" structure.
+
+a1: clock_type, ATM only KVM_CLOCK_PAIRING_WALLCLOCK (0)
+is supported (corresponding to the host's CLOCK_REALTIME clock).
+
+		struct kvm_clock_pairing {
+			__s64 sec;
+			__s64 nsec;
+			__u64 tsc;
+			__u32 flags;
+			__u32 pad[9];
+		};
+
+       Where:
+               * sec: seconds from clock_type clock.
+               * nsec: nanoseconds from clock_type clock.
+               * tsc: guest TSC value used to calculate sec/nsec pair
+               * flags: flags, unused (0) at the moment.
+
+The hypercall lets a guest compute a precise timestamp across
+host and guest.  The guest can use the returned TSC value to
+compute the CLOCK_REALTIME for its clock, at the same instant.
+
+Returns KVM_EOPNOTSUPP if the host does not use TSC clocksource,
+or if clock type is different than KVM_CLOCK_PAIRING_WALLCLOCK.
diff --git a/Documentation/virtual/kvm/locking.txt b/Documentation/virtual/kvm/locking.txt
index fd013bf4115b..1bb8bcaf8497 100644
--- a/Documentation/virtual/kvm/locking.txt
+++ b/Documentation/virtual/kvm/locking.txt
@@ -26,9 +26,16 @@ sections.
 Fast page fault:
 
 Fast page fault is the fast path which fixes the guest page fault out of
-the mmu-lock on x86. Currently, the page fault can be fast only if the
-shadow page table is present and it is caused by write-protect, that means
-we just need change the W bit of the spte.
+the mmu-lock on x86. Currently, the page fault can be fast in one of the
+following two cases:
+
+1. Access Tracking: The SPTE is not present, but it is marked for access
+tracking i.e. the SPTE_SPECIAL_MASK is set. That means we need to
+restore the saved R/X bits. This is described in more detail later below.
+
+2. Write-Protection: The SPTE is present and the fault is
+caused by write-protect. That means we just need to change the W bit of the 
+spte.
 
 What we use to avoid all the race is the SPTE_HOST_WRITEABLE bit and
 SPTE_MMU_WRITEABLE bit on the spte:
@@ -38,7 +45,8 @@ SPTE_MMU_WRITEABLE bit on the spte:
   page write-protection.
 
 On fast page fault path, we will use cmpxchg to atomically set the spte W
-bit if spte.SPTE_HOST_WRITEABLE = 1 and spte.SPTE_WRITE_PROTECT = 1, this
+bit if spte.SPTE_HOST_WRITEABLE = 1 and spte.SPTE_WRITE_PROTECT = 1, or 
+restore the saved R/X bits if VMX_EPT_TRACK_ACCESS mask is set, or both. This
 is safe because whenever changing these bits can be detected by cmpxchg.
 
 But we need carefully check these cases:
@@ -142,6 +150,21 @@ Since the spte is "volatile" if it can be updated out of mmu-lock, we always
 atomically update the spte, the race caused by fast page fault can be avoided,
 See the comments in spte_has_volatile_bits() and mmu_spte_update().
 
+Lockless Access Tracking:
+
+This is used for Intel CPUs that are using EPT but do not support the EPT A/D
+bits. In this case, when the KVM MMU notifier is called to track accesses to a
+page (via kvm_mmu_notifier_clear_flush_young), it marks the PTE as not-present
+by clearing the RWX bits in the PTE and storing the original R & X bits in
+some unused/ignored bits. In addition, the SPTE_SPECIAL_MASK is also set on the
+PTE (using the ignored bit 62). When the VM tries to access the page later on,
+a fault is generated and the fast page fault mechanism described above is used
+to atomically restore the PTE to a Present state. The W bit is not saved when
+the PTE is marked for access tracking and during restoration to the Present
+state, the W bit is set depending on whether or not it was a write access. If
+it wasn't, then the W bit will remain clear until a write access happens, at 
+which time it will be set using the Dirty tracking mechanism described above.
+
 3. Reference
 ------------
 
diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h
index bebec370324f..05e785fc061d 100644
--- a/arch/mips/include/asm/kvm_host.h
+++ b/arch/mips/include/asm/kvm_host.h
@@ -43,6 +43,7 @@
 #define KVM_REG_MIPS_CP0_ENTRYHI	MIPS_CP0_64(10, 0)
 #define KVM_REG_MIPS_CP0_COMPARE	MIPS_CP0_32(11, 0)
 #define KVM_REG_MIPS_CP0_STATUS		MIPS_CP0_32(12, 0)
+#define KVM_REG_MIPS_CP0_INTCTL		MIPS_CP0_32(12, 1)
 #define KVM_REG_MIPS_CP0_CAUSE		MIPS_CP0_32(13, 0)
 #define KVM_REG_MIPS_CP0_EPC		MIPS_CP0_64(14, 0)
 #define KVM_REG_MIPS_CP0_PRID		MIPS_CP0_32(15, 0)
@@ -64,7 +65,7 @@
 #define KVM_REG_MIPS_CP0_KSCRATCH6	MIPS_CP0_64(31, 7)
 
 
-#define KVM_MAX_VCPUS		1
+#define KVM_MAX_VCPUS		8
 #define KVM_USER_MEM_SLOTS	8
 /* memory slots that does not exposed to userspace */
 #define KVM_PRIVATE_MEM_SLOTS	0
@@ -88,6 +89,7 @@
 
 #define KVM_GUEST_KUSEG			0x00000000UL
 #define KVM_GUEST_KSEG0			0x40000000UL
+#define KVM_GUEST_KSEG1			0x40000000UL
 #define KVM_GUEST_KSEG23		0x60000000UL
 #define KVM_GUEST_KSEGX(a)		((_ACAST32_(a)) & 0xe0000000)
 #define KVM_GUEST_CPHYSADDR(a)		((_ACAST32_(a)) & 0x1fffffff)
@@ -104,7 +106,6 @@
 #define KVM_GUEST_KSEG23ADDR(a)		(KVM_GUEST_CPHYSADDR(a) | KVM_GUEST_KSEG23)
 
 #define KVM_INVALID_PAGE		0xdeadbeef
-#define KVM_INVALID_INST		0xdeadbeef
 #define KVM_INVALID_ADDR		0xdeadbeef
 
 /*
@@ -121,8 +122,6 @@ static inline bool kvm_is_error_hva(unsigned long addr)
 	return IS_ERR_VALUE(addr);
 }
 
-extern atomic_t kvm_mips_instance;
-
 struct kvm_vm_stat {
 	ulong remote_tlb_flush;
 };
@@ -156,12 +155,8 @@ struct kvm_arch_memory_slot {
 };
 
 struct kvm_arch {
-	/* Guest GVA->HPA page table */
-	unsigned long *guest_pmap;
-	unsigned long guest_pmap_npages;
-
-	/* Wired host TLB used for the commpage */
-	int commpage_tlb;
+	/* Guest physical mm */
+	struct mm_struct gpa_mm;
 };
 
 #define N_MIPS_COPROC_REGS	32
@@ -233,6 +228,7 @@ enum emulation_result {
 	EMULATE_FAIL,		/* can't emulate this instruction */
 	EMULATE_WAIT,		/* WAIT instruction */
 	EMULATE_PRIV_FAIL,
+	EMULATE_EXCEPT,		/* A guest exception has been generated */
 };
 
 #define mips3_paddr_to_tlbpfn(x) \
@@ -250,6 +246,7 @@ enum emulation_result {
 #define TLB_ASID(x)		((x).tlb_hi & KVM_ENTRYHI_ASID)
 #define TLB_LO_IDX(x, va)	(((va) >> PAGE_SHIFT) & 1)
 #define TLB_IS_VALID(x, va)	((x).tlb_lo[TLB_LO_IDX(x, va)] & ENTRYLO_V)
+#define TLB_IS_DIRTY(x, va)	((x).tlb_lo[TLB_LO_IDX(x, va)] & ENTRYLO_D)
 #define TLB_HI_VPN2_HIT(x, y)	((TLB_VPN2(x) & ~(x).tlb_mask) ==	\
 				 ((y) & VPN2_MASK & ~(x).tlb_mask))
 #define TLB_HI_ASID_HIT(x, y)	(TLB_IS_GLOBAL(x) ||			\
@@ -261,6 +258,17 @@ struct kvm_mips_tlb {
 	long tlb_lo[2];
 };
 
+#define KVM_NR_MEM_OBJS     4
+
+/*
+ * We don't want allocation failures within the mmu code, so we preallocate
+ * enough memory for a single page fault in a cache.
+ */
+struct kvm_mmu_memory_cache {
+	int nobjs;
+	void *objects[KVM_NR_MEM_OBJS];
+};
+
 #define KVM_MIPS_AUX_FPU	0x1
 #define KVM_MIPS_AUX_MSA	0x2
 
@@ -275,6 +283,8 @@ struct kvm_vcpu_arch {
 	unsigned long host_cp0_badvaddr;
 	unsigned long host_cp0_epc;
 	u32 host_cp0_cause;
+	u32 host_cp0_badinstr;
+	u32 host_cp0_badinstrp;
 
 	/* GPRS */
 	unsigned long gprs[32];
@@ -318,20 +328,18 @@ struct kvm_vcpu_arch {
 	/* Bitmask of pending exceptions to be cleared */
 	unsigned long pending_exceptions_clr;
 
-	/* Save/Restore the entryhi register when are are preempted/scheduled back in */
-	unsigned long preempt_entryhi;
-
 	/* S/W Based TLB for guest */
 	struct kvm_mips_tlb guest_tlb[KVM_MIPS_GUEST_TLB_SIZE];
 
-	/* Cached guest kernel/user ASIDs */
-	u32 guest_user_asid[NR_CPUS];
-	u32 guest_kernel_asid[NR_CPUS];
+	/* Guest kernel/user [partial] mm */
 	struct mm_struct guest_kernel_mm, guest_user_mm;
 
 	/* Guest ASID of last user mode execution */
 	unsigned int last_user_gasid;
 
+	/* Cache some mmu pages needed inside spinlock regions */
+	struct kvm_mmu_memory_cache mmu_page_cache;
+
 	int last_sched_cpu;
 
 	/* WAIT executed */
@@ -339,14 +347,15 @@ struct kvm_vcpu_arch {
 
 	u8 fpu_enabled;
 	u8 msa_enabled;
-	u8 kscratch_enabled;
 };
 
 
 #define kvm_read_c0_guest_index(cop0)		(cop0->reg[MIPS_CP0_TLB_INDEX][0])
 #define kvm_write_c0_guest_index(cop0, val)	(cop0->reg[MIPS_CP0_TLB_INDEX][0] = val)
 #define kvm_read_c0_guest_entrylo0(cop0)	(cop0->reg[MIPS_CP0_TLB_LO0][0])
+#define kvm_write_c0_guest_entrylo0(cop0, val)	(cop0->reg[MIPS_CP0_TLB_LO0][0] = (val))
 #define kvm_read_c0_guest_entrylo1(cop0)	(cop0->reg[MIPS_CP0_TLB_LO1][0])
+#define kvm_write_c0_guest_entrylo1(cop0, val)	(cop0->reg[MIPS_CP0_TLB_LO1][0] = (val))
 #define kvm_read_c0_guest_context(cop0)		(cop0->reg[MIPS_CP0_TLB_CONTEXT][0])
 #define kvm_write_c0_guest_context(cop0, val)	(cop0->reg[MIPS_CP0_TLB_CONTEXT][0] = (val))
 #define kvm_read_c0_guest_userlocal(cop0)	(cop0->reg[MIPS_CP0_TLB_CONTEXT][2])
@@ -522,9 +531,17 @@ struct kvm_mips_callbacks {
 	int (*handle_msa_fpe)(struct kvm_vcpu *vcpu);
 	int (*handle_fpe)(struct kvm_vcpu *vcpu);
 	int (*handle_msa_disabled)(struct kvm_vcpu *vcpu);
-	int (*vm_init)(struct kvm *kvm);
 	int (*vcpu_init)(struct kvm_vcpu *vcpu);
+	void (*vcpu_uninit)(struct kvm_vcpu *vcpu);
 	int (*vcpu_setup)(struct kvm_vcpu *vcpu);
+	void (*flush_shadow_all)(struct kvm *kvm);
+	/*
+	 * Must take care of flushing any cached GPA PTEs (e.g. guest entries in
+	 * VZ root TLB, or T&E GVA page tables and corresponding root TLB
+	 * mappings).
+	 */
+	void (*flush_shadow_memslot)(struct kvm *kvm,
+				     const struct kvm_memory_slot *slot);
 	gpa_t (*gva_to_gpa)(gva_t gva);
 	void (*queue_timer_int)(struct kvm_vcpu *vcpu);
 	void (*dequeue_timer_int)(struct kvm_vcpu *vcpu);
@@ -542,8 +559,10 @@ struct kvm_mips_callbacks {
 			   const struct kvm_one_reg *reg, s64 *v);
 	int (*set_one_reg)(struct kvm_vcpu *vcpu,
 			   const struct kvm_one_reg *reg, s64 v);
-	int (*vcpu_get_regs)(struct kvm_vcpu *vcpu);
-	int (*vcpu_set_regs)(struct kvm_vcpu *vcpu);
+	int (*vcpu_load)(struct kvm_vcpu *vcpu, int cpu);
+	int (*vcpu_put)(struct kvm_vcpu *vcpu, int cpu);
+	int (*vcpu_run)(struct kvm_run *run, struct kvm_vcpu *vcpu);
+	void (*vcpu_reenter)(struct kvm_run *run, struct kvm_vcpu *vcpu);
 };
 extern struct kvm_mips_callbacks *kvm_mips_callbacks;
 int kvm_mips_emulation_init(struct kvm_mips_callbacks **install_callbacks);
@@ -556,6 +575,7 @@ extern int kvm_mips_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu);
 /* Building of entry/exception code */
 int kvm_mips_entry_setup(void);
 void *kvm_mips_build_vcpu_run(void *addr);
+void *kvm_mips_build_tlb_refill_exception(void *addr, void *handler);
 void *kvm_mips_build_exception(void *addr, void *handler);
 void *kvm_mips_build_exit(void *addr);
 
@@ -580,54 +600,125 @@ u32 kvm_get_user_asid(struct kvm_vcpu *vcpu);
 u32 kvm_get_commpage_asid (struct kvm_vcpu *vcpu);
 
 extern int kvm_mips_handle_kseg0_tlb_fault(unsigned long badbaddr,
-					   struct kvm_vcpu *vcpu);
+					   struct kvm_vcpu *vcpu,
+					   bool write_fault);
 
 extern int kvm_mips_handle_commpage_tlb_fault(unsigned long badvaddr,
 					      struct kvm_vcpu *vcpu);
 
 extern int kvm_mips_handle_mapped_seg_tlb_fault(struct kvm_vcpu *vcpu,
-						struct kvm_mips_tlb *tlb);
+						struct kvm_mips_tlb *tlb,
+						unsigned long gva,
+						bool write_fault);
 
 extern enum emulation_result kvm_mips_handle_tlbmiss(u32 cause,
 						     u32 *opc,
 						     struct kvm_run *run,
-						     struct kvm_vcpu *vcpu);
-
-extern enum emulation_result kvm_mips_handle_tlbmod(u32 cause,
-						    u32 *opc,
-						    struct kvm_run *run,
-						    struct kvm_vcpu *vcpu);
+						     struct kvm_vcpu *vcpu,
+						     bool write_fault);
 
 extern void kvm_mips_dump_host_tlbs(void);
 extern void kvm_mips_dump_guest_tlbs(struct kvm_vcpu *vcpu);
-extern int kvm_mips_host_tlb_write(struct kvm_vcpu *vcpu, unsigned long entryhi,
-				   unsigned long entrylo0,
-				   unsigned long entrylo1,
-				   int flush_dcache_mask);
-extern void kvm_mips_flush_host_tlb(int skip_kseg0);
-extern int kvm_mips_host_tlb_inv(struct kvm_vcpu *vcpu, unsigned long entryhi);
+extern int kvm_mips_host_tlb_inv(struct kvm_vcpu *vcpu, unsigned long entryhi,
+				 bool user, bool kernel);
 
 extern int kvm_mips_guest_tlb_lookup(struct kvm_vcpu *vcpu,
 				     unsigned long entryhi);
-extern int kvm_mips_host_tlb_lookup(struct kvm_vcpu *vcpu, unsigned long vaddr);
-extern unsigned long kvm_mips_translate_guest_kseg0_to_hpa(struct kvm_vcpu *vcpu,
-						   unsigned long gva);
-extern void kvm_get_new_mmu_context(struct mm_struct *mm, unsigned long cpu,
-				    struct kvm_vcpu *vcpu);
-extern void kvm_local_flush_tlb_all(void);
-extern void kvm_mips_alloc_new_mmu_context(struct kvm_vcpu *vcpu);
-extern void kvm_mips_vcpu_load(struct kvm_vcpu *vcpu, int cpu);
-extern void kvm_mips_vcpu_put(struct kvm_vcpu *vcpu);
+
+void kvm_mips_suspend_mm(int cpu);
+void kvm_mips_resume_mm(int cpu);
+
+/* MMU handling */
+
+/**
+ * enum kvm_mips_flush - Types of MMU flushes.
+ * @KMF_USER:	Flush guest user virtual memory mappings.
+ *		Guest USeg only.
+ * @KMF_KERN:	Flush guest kernel virtual memory mappings.
+ *		Guest USeg and KSeg2/3.
+ * @KMF_GPA:	Flush guest physical memory mappings.
+ *		Also includes KSeg0 if KMF_KERN is set.
+ */
+enum kvm_mips_flush {
+	KMF_USER	= 0x0,
+	KMF_KERN	= 0x1,
+	KMF_GPA		= 0x2,
+};
+void kvm_mips_flush_gva_pt(pgd_t *pgd, enum kvm_mips_flush flags);
+bool kvm_mips_flush_gpa_pt(struct kvm *kvm, gfn_t start_gfn, gfn_t end_gfn);
+int kvm_mips_mkclean_gpa_pt(struct kvm *kvm, gfn_t start_gfn, gfn_t end_gfn);
+pgd_t *kvm_pgd_alloc(void);
+void kvm_mmu_free_memory_caches(struct kvm_vcpu *vcpu);
+void kvm_trap_emul_invalidate_gva(struct kvm_vcpu *vcpu, unsigned long addr,
+				  bool user);
+void kvm_trap_emul_gva_lockless_begin(struct kvm_vcpu *vcpu);
+void kvm_trap_emul_gva_lockless_end(struct kvm_vcpu *vcpu);
+
+enum kvm_mips_fault_result {
+	KVM_MIPS_MAPPED = 0,
+	KVM_MIPS_GVA,
+	KVM_MIPS_GPA,
+	KVM_MIPS_TLB,
+	KVM_MIPS_TLBINV,
+	KVM_MIPS_TLBMOD,
+};
+enum kvm_mips_fault_result kvm_trap_emul_gva_fault(struct kvm_vcpu *vcpu,
+						   unsigned long gva,
+						   bool write);
+
+#define KVM_ARCH_WANT_MMU_NOTIFIER
+int kvm_unmap_hva(struct kvm *kvm, unsigned long hva);
+int kvm_unmap_hva_range(struct kvm *kvm,
+			unsigned long start, unsigned long end);
+void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte);
+int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end);
+int kvm_test_age_hva(struct kvm *kvm, unsigned long hva);
+
+static inline void kvm_arch_mmu_notifier_invalidate_page(struct kvm *kvm,
+							 unsigned long address)
+{
+}
 
 /* Emulation */
-u32 kvm_get_inst(u32 *opc, struct kvm_vcpu *vcpu);
+int kvm_get_inst(u32 *opc, struct kvm_vcpu *vcpu, u32 *out);
 enum emulation_result update_pc(struct kvm_vcpu *vcpu, u32 cause);
+int kvm_get_badinstr(u32 *opc, struct kvm_vcpu *vcpu, u32 *out);
+int kvm_get_badinstrp(u32 *opc, struct kvm_vcpu *vcpu, u32 *out);
+
+/**
+ * kvm_is_ifetch_fault() - Find whether a TLBL exception is due to ifetch fault.
+ * @vcpu:	Virtual CPU.
+ *
+ * Returns:	Whether the TLBL exception was likely due to an instruction
+ *		fetch fault rather than a data load fault.
+ */
+static inline bool kvm_is_ifetch_fault(struct kvm_vcpu_arch *vcpu)
+{
+	unsigned long badvaddr = vcpu->host_cp0_badvaddr;
+	unsigned long epc = msk_isa16_mode(vcpu->pc);
+	u32 cause = vcpu->host_cp0_cause;
+
+	if (epc == badvaddr)
+		return true;
+
+	/*
+	 * Branches may be 32-bit or 16-bit instructions.
+	 * This isn't exact, but we don't really support MIPS16 or microMIPS yet
+	 * in KVM anyway.
+	 */
+	if ((cause & CAUSEF_BD) && badvaddr - epc <= 4)
+		return true;
+
+	return false;
+}
 
 extern enum emulation_result kvm_mips_emulate_inst(u32 cause,
 						   u32 *opc,
 						   struct kvm_run *run,
 						   struct kvm_vcpu *vcpu);
 
+long kvm_mips_guest_exception_base(struct kvm_vcpu *vcpu);
+
 extern enum emulation_result kvm_mips_emulate_syscall(u32 cause,
 						      u32 *opc,
 						      struct kvm_run *run,
@@ -761,10 +852,6 @@ static inline void kvm_arch_sync_events(struct kvm *kvm) {}
 static inline void kvm_arch_free_memslot(struct kvm *kvm,
 		struct kvm_memory_slot *free, struct kvm_memory_slot *dont) {}
 static inline void kvm_arch_memslots_updated(struct kvm *kvm, struct kvm_memslots *slots) {}
-static inline void kvm_arch_flush_shadow_all(struct kvm *kvm) {}
-static inline void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
-		struct kvm_memory_slot *slot) {}
-static inline void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) {}
 static inline void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) {}
 static inline void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu) {}
 static inline void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu) {}
diff --git a/arch/mips/include/asm/mmu_context.h b/arch/mips/include/asm/mmu_context.h
index ddd57ade1aa8..2abf94f72c0a 100644
--- a/arch/mips/include/asm/mmu_context.h
+++ b/arch/mips/include/asm/mmu_context.h
@@ -29,9 +29,11 @@ do {									\
 	}								\
 } while (0)
 
+extern void tlbmiss_handler_setup_pgd(unsigned long);
+
+/* Note: This is also implemented with uasm in arch/mips/kvm/entry.c */
 #define TLBMISS_HANDLER_SETUP_PGD(pgd)					\
 do {									\
-	extern void tlbmiss_handler_setup_pgd(unsigned long);		\
 	tlbmiss_handler_setup_pgd((unsigned long)(pgd));		\
 	htw_set_pwbase((unsigned long)pgd);				\
 } while (0)
@@ -97,17 +99,12 @@ static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
 static inline void
 get_new_mmu_context(struct mm_struct *mm, unsigned long cpu)
 {
-	extern void kvm_local_flush_tlb_all(void);
 	unsigned long asid = asid_cache(cpu);
 
 	if (!((asid += cpu_asid_inc()) & cpu_asid_mask(&cpu_data[cpu]))) {
 		if (cpu_has_vtag_icache)
 			flush_icache_all();
-#ifdef CONFIG_KVM
-		kvm_local_flush_tlb_all();      /* start new asid cycle */
-#else
 		local_flush_tlb_all();	/* start new asid cycle */
-#endif
 		if (!asid)		/* fix version if needed */
 			asid = asid_first_version(cpu);
 	}
diff --git a/arch/mips/include/asm/pgalloc.h b/arch/mips/include/asm/pgalloc.h
index a03e86969f78..a8705f6c8180 100644
--- a/arch/mips/include/asm/pgalloc.h
+++ b/arch/mips/include/asm/pgalloc.h
@@ -43,21 +43,7 @@ static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd)
  * Initialize a new pgd / pmd table with invalid pointers.
  */
 extern void pgd_init(unsigned long page);
-
-static inline pgd_t *pgd_alloc(struct mm_struct *mm)
-{
-	pgd_t *ret, *init;
-
-	ret = (pgd_t *) __get_free_pages(GFP_KERNEL, PGD_ORDER);
-	if (ret) {
-		init = pgd_offset(&init_mm, 0UL);
-		pgd_init((unsigned long)ret);
-		memcpy(ret + USER_PTRS_PER_PGD, init + USER_PTRS_PER_PGD,
-		       (PTRS_PER_PGD - USER_PTRS_PER_PGD) * sizeof(pgd_t));
-	}
-
-	return ret;
-}
+extern pgd_t *pgd_alloc(struct mm_struct *mm);
 
 static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd)
 {
diff --git a/arch/mips/include/asm/r4kcache.h b/arch/mips/include/asm/r4kcache.h
index b42b513007a2..7227c158cbf8 100644
--- a/arch/mips/include/asm/r4kcache.h
+++ b/arch/mips/include/asm/r4kcache.h
@@ -147,49 +147,64 @@ static inline void flush_scache_line(unsigned long addr)
 }
 
 #define protected_cache_op(op,addr)				\
+({								\
+	int __err = 0;						\
 	__asm__ __volatile__(					\
 	"	.set	push			\n"		\
 	"	.set	noreorder		\n"		\
 	"	.set "MIPS_ISA_ARCH_LEVEL"	\n"		\
-	"1:	cache	%0, (%1)		\n"		\
+	"1:	cache	%1, (%2)		\n"		\
 	"2:	.set	pop			\n"		\
+	"	.section .fixup,\"ax\"		\n"		\
+	"3:	li	%0, %3			\n"		\
+	"	j	2b			\n"		\
+	"	.previous			\n"		\
 	"	.section __ex_table,\"a\"	\n"		\
-	"	"STR(PTR)" 1b, 2b		\n"		\
+	"	"STR(PTR)" 1b, 3b		\n"		\
 	"	.previous"					\
-	:							\
-	: "i" (op), "r" (addr))
+	: "+r" (__err)						\
+	: "i" (op), "r" (addr), "i" (-EFAULT));			\
+	__err;							\
+})
+
 
 #define protected_cachee_op(op,addr)				\
+({								\
+	int __err = 0;						\
 	__asm__ __volatile__(					\
 	"	.set	push			\n"		\
 	"	.set	noreorder		\n"		\
 	"	.set	mips0			\n"		\
 	"	.set	eva			\n"		\
-	"1:	cachee	%0, (%1)		\n"		\
+	"1:	cachee	%1, (%2)		\n"		\
 	"2:	.set	pop			\n"		\
+	"	.section .fixup,\"ax\"		\n"		\
+	"3:	li	%0, %3			\n"		\
+	"	j	2b			\n"		\
+	"	.previous			\n"		\
 	"	.section __ex_table,\"a\"	\n"		\
-	"	"STR(PTR)" 1b, 2b		\n"		\
+	"	"STR(PTR)" 1b, 3b		\n"		\
 	"	.previous"					\
-	:							\
-	: "i" (op), "r" (addr))
+	: "+r" (__err)						\
+	: "i" (op), "r" (addr), "i" (-EFAULT));			\
+	__err;							\
+})
 
 /*
  * The next two are for badland addresses like signal trampolines.
  */
-static inline void protected_flush_icache_line(unsigned long addr)
+static inline int protected_flush_icache_line(unsigned long addr)
 {
 	switch (boot_cpu_type()) {
 	case CPU_LOONGSON2:
-		protected_cache_op(Hit_Invalidate_I_Loongson2, addr);
-		break;
+		return protected_cache_op(Hit_Invalidate_I_Loongson2, addr);
 
 	default:
 #ifdef CONFIG_EVA
-		protected_cachee_op(Hit_Invalidate_I, addr);
+		return protected_cachee_op(Hit_Invalidate_I, addr);
 #else
-		protected_cache_op(Hit_Invalidate_I, addr);
+		return protected_cache_op(Hit_Invalidate_I, addr);
 #endif
-		break;
 	}
 }
 
@@ -199,21 +214,21 @@ static inline void protected_flush_icache_line(unsigned long addr)
  * caches.  We're talking about one cacheline unnecessarily getting invalidated
  * here so the penalty isn't overly hard.
  */
-static inline void protected_writeback_dcache_line(unsigned long addr)
+static inline int protected_writeback_dcache_line(unsigned long addr)
 {
 #ifdef CONFIG_EVA
-	protected_cachee_op(Hit_Writeback_Inv_D, addr);
+	return protected_cachee_op(Hit_Writeback_Inv_D, addr);
 #else
-	protected_cache_op(Hit_Writeback_Inv_D, addr);
+	return protected_cache_op(Hit_Writeback_Inv_D, addr);
 #endif
 }
 
-static inline void protected_writeback_scache_line(unsigned long addr)
+static inline int protected_writeback_scache_line(unsigned long addr)
 {
 #ifdef CONFIG_EVA
-	protected_cachee_op(Hit_Writeback_Inv_SD, addr);
+	return protected_cachee_op(Hit_Writeback_Inv_SD, addr);
 #else
-	protected_cache_op(Hit_Writeback_Inv_SD, addr);
+	return protected_cache_op(Hit_Writeback_Inv_SD, addr);
 #endif
 }
 
diff --git a/arch/mips/include/asm/tlbex.h b/arch/mips/include/asm/tlbex.h
new file mode 100644
index 000000000000..53050e9dd2c9
--- /dev/null
+++ b/arch/mips/include/asm/tlbex.h
@@ -0,0 +1,26 @@
+#ifndef __ASM_TLBEX_H
+#define __ASM_TLBEX_H
+
+#include <asm/uasm.h>
+
+/*
+ * Write random or indexed TLB entry, and care about the hazards from
+ * the preceding mtc0 and for the following eret.
+ */
+enum tlb_write_entry {
+	tlb_random,
+	tlb_indexed
+};
+
+extern int pgd_reg;
+
+void build_get_pmde64(u32 **p, struct uasm_label **l, struct uasm_reloc **r,
+		      unsigned int tmp, unsigned int ptr);
+void build_get_pgde32(u32 **p, unsigned int tmp, unsigned int ptr);
+void build_get_ptep(u32 **p, unsigned int tmp, unsigned int ptr);
+void build_update_entries(u32 **p, unsigned int tmp, unsigned int ptep);
+void build_tlb_write_entry(u32 **p, struct uasm_label **l,
+			   struct uasm_reloc **r,
+			   enum tlb_write_entry wmode);
+
+#endif /* __ASM_TLBEX_H */
diff --git a/arch/mips/include/asm/uasm.h b/arch/mips/include/asm/uasm.h
index f7929f65f7ca..e9a9e2ade1d2 100644
--- a/arch/mips/include/asm/uasm.h
+++ b/arch/mips/include/asm/uasm.h
@@ -9,6 +9,9 @@
  * Copyright (C) 2012, 2013  MIPS Technologies, Inc.  All rights reserved.
  */
 
+#ifndef __ASM_UASM_H
+#define __ASM_UASM_H
+
 #include <linux/types.h>
 
 #ifdef CONFIG_EXPORT_UASM
@@ -309,3 +312,5 @@ void uasm_il_bltz(u32 **p, struct uasm_reloc **r, unsigned int reg, int lid);
 void uasm_il_bne(u32 **p, struct uasm_reloc **r, unsigned int reg1,
 		 unsigned int reg2, int lid);
 void uasm_il_bnez(u32 **p, struct uasm_reloc **r, unsigned int reg, int lid);
+
+#endif /* __ASM_UASM_H */
diff --git a/arch/mips/include/uapi/asm/kvm.h b/arch/mips/include/uapi/asm/kvm.h
index 6985eb59b085..a8a0199bf760 100644
--- a/arch/mips/include/uapi/asm/kvm.h
+++ b/arch/mips/include/uapi/asm/kvm.h
@@ -19,6 +19,8 @@
  * Some parts derived from the x86 version of this file.
  */
 
+#define __KVM_HAVE_READONLY_MEM
+
 /*
  * for KVM_GET_REGS and KVM_SET_REGS
  *
diff --git a/arch/mips/kvm/Kconfig b/arch/mips/kvm/Kconfig
index 7c56d6b124d1..65067327db12 100644
--- a/arch/mips/kvm/Kconfig
+++ b/arch/mips/kvm/Kconfig
@@ -20,7 +20,9 @@ config KVM
 	select EXPORT_UASM
 	select PREEMPT_NOTIFIERS
 	select ANON_INODES
+	select KVM_GENERIC_DIRTYLOG_READ_PROTECT
 	select KVM_MMIO
+	select MMU_NOTIFIER
 	select SRCU
 	---help---
 	  Support for hosting Guest kernels.
diff --git a/arch/mips/kvm/dyntrans.c b/arch/mips/kvm/dyntrans.c
index 010cef240688..f8e772564d74 100644
--- a/arch/mips/kvm/dyntrans.c
+++ b/arch/mips/kvm/dyntrans.c
@@ -13,6 +13,7 @@
 #include <linux/err.h>
 #include <linux/highmem.h>
 #include <linux/kvm_host.h>
+#include <linux/uaccess.h>
 #include <linux/vmalloc.h>
 #include <linux/fs.h>
 #include <linux/bootmem.h>
@@ -29,28 +30,37 @@
 static int kvm_mips_trans_replace(struct kvm_vcpu *vcpu, u32 *opc,
 				  union mips_instruction replace)
 {
-	unsigned long paddr, flags;
-	void *vaddr;
-
-	if (KVM_GUEST_KSEGX((unsigned long)opc) == KVM_GUEST_KSEG0) {
-		paddr = kvm_mips_translate_guest_kseg0_to_hpa(vcpu,
-							    (unsigned long)opc);
-		vaddr = kmap_atomic(pfn_to_page(PHYS_PFN(paddr)));
-		vaddr += paddr & ~PAGE_MASK;
-		memcpy(vaddr, (void *)&replace, sizeof(u32));
-		local_flush_icache_range((unsigned long)vaddr,
-					 (unsigned long)vaddr + 32);
-		kunmap_atomic(vaddr);
-	} else if (KVM_GUEST_KSEGX((unsigned long) opc) == KVM_GUEST_KSEG23) {
-		local_irq_save(flags);
-		memcpy((void *)opc, (void *)&replace, sizeof(u32));
-		__local_flush_icache_user_range((unsigned long)opc,
-						(unsigned long)opc + 32);
-		local_irq_restore(flags);
-	} else {
-		kvm_err("%s: Invalid address: %p\n", __func__, opc);
-		return -EFAULT;
+	unsigned long vaddr = (unsigned long)opc;
+	int err;
+
+retry:
+	/* The GVA page table is still active so use the Linux TLB handlers */
+	kvm_trap_emul_gva_lockless_begin(vcpu);
+	err = put_user(replace.word, opc);
+	kvm_trap_emul_gva_lockless_end(vcpu);
+
+	if (unlikely(err)) {
+		/*
+		 * We write protect clean pages in GVA page table so normal
+		 * Linux TLB mod handler doesn't silently dirty the page.
+		 * Its also possible we raced with a GVA invalidation.
+		 * Try to force the page to become dirty.
+		 */
+		err = kvm_trap_emul_gva_fault(vcpu, vaddr, true);
+		if (unlikely(err)) {
+			kvm_info("%s: Address unwriteable: %p\n",
+				 __func__, opc);
+			return -EFAULT;
+		}
+
+		/*
+		 * Try again. This will likely trigger a TLB refill, which will
+		 * fetch the new dirty entry from the GVA page table, which
+		 * should then succeed.
+		 */
+		goto retry;
 	}
+	__local_flush_icache_user_range(vaddr, vaddr + 4);
 
 	return 0;
 }
diff --git a/arch/mips/kvm/emulate.c b/arch/mips/kvm/emulate.c
index aa0937423e28..d40cfaad4529 100644
--- a/arch/mips/kvm/emulate.c
+++ b/arch/mips/kvm/emulate.c
@@ -38,23 +38,25 @@
  * Compute the return address and do emulate branch simulation, if required.
  * This function should be called only in branch delay slot active.
  */
-unsigned long kvm_compute_return_epc(struct kvm_vcpu *vcpu,
-	unsigned long instpc)
+static int kvm_compute_return_epc(struct kvm_vcpu *vcpu, unsigned long instpc,
+				  unsigned long *out)
 {
 	unsigned int dspcontrol;
 	union mips_instruction insn;
 	struct kvm_vcpu_arch *arch = &vcpu->arch;
 	long epc = instpc;
-	long nextpc = KVM_INVALID_INST;
+	long nextpc;
+	int err;
 
-	if (epc & 3)
-		goto unaligned;
+	if (epc & 3) {
+		kvm_err("%s: unaligned epc\n", __func__);
+		return -EINVAL;
+	}
 
 	/* Read the instruction */
-	insn.word = kvm_get_inst((u32 *) epc, vcpu);
-
-	if (insn.word == KVM_INVALID_INST)
-		return KVM_INVALID_INST;
+	err = kvm_get_badinstrp((u32 *)epc, vcpu, &insn.word);
+	if (err)
+		return err;
 
 	switch (insn.i_format.opcode) {
 		/* jr and jalr are in r_format format. */
@@ -66,6 +68,8 @@ unsigned long kvm_compute_return_epc(struct kvm_vcpu *vcpu,
 		case jr_op:
 			nextpc = arch->gprs[insn.r_format.rs];
 			break;
+		default:
+			return -EINVAL;
 		}
 		break;
 
@@ -114,8 +118,11 @@ unsigned long kvm_compute_return_epc(struct kvm_vcpu *vcpu,
 			nextpc = epc;
 			break;
 		case bposge32_op:
-			if (!cpu_has_dsp)
-				goto sigill;
+			if (!cpu_has_dsp) {
+				kvm_err("%s: DSP branch but not DSP ASE\n",
+					__func__);
+				return -EINVAL;
+			}
 
 			dspcontrol = rddsp(0x01);
 
@@ -125,6 +132,8 @@ unsigned long kvm_compute_return_epc(struct kvm_vcpu *vcpu,
 				epc += 8;
 			nextpc = epc;
 			break;
+		default:
+			return -EINVAL;
 		}
 		break;
 
@@ -189,7 +198,7 @@ unsigned long kvm_compute_return_epc(struct kvm_vcpu *vcpu,
 		/* And now the FPA/cp1 branch instructions. */
 	case cop1_op:
 		kvm_err("%s: unsupported cop1_op\n", __func__);
-		break;
+		return -EINVAL;
 
 #ifdef CONFIG_CPU_MIPSR6
 	/* R6 added the following compact branches with forbidden slots */
@@ -198,19 +207,19 @@ unsigned long kvm_compute_return_epc(struct kvm_vcpu *vcpu,
 		/* only rt == 0 isn't compact branch */
 		if (insn.i_format.rt != 0)
 			goto compact_branch;
-		break;
+		return -EINVAL;
 	case pop10_op:
 	case pop30_op:
 		/* only rs == rt == 0 is reserved, rest are compact branches */
 		if (insn.i_format.rs != 0 || insn.i_format.rt != 0)
 			goto compact_branch;
-		break;
+		return -EINVAL;
 	case pop66_op:
 	case pop76_op:
 		/* only rs == 0 isn't compact branch */
 		if (insn.i_format.rs != 0)
 			goto compact_branch;
-		break;
+		return -EINVAL;
 compact_branch:
 		/*
 		 * If we've hit an exception on the forbidden slot, then
@@ -221,42 +230,74 @@ compact_branch:
 		break;
 #else
 compact_branch:
-		/* Compact branches not supported before R6 */
-		break;
+		/* Fall through - Compact branches not supported before R6 */
 #endif
+	default:
+		return -EINVAL;
 	}
 
-	return nextpc;
-
-unaligned:
-	kvm_err("%s: unaligned epc\n", __func__);
-	return nextpc;
-
-sigill:
-	kvm_err("%s: DSP branch but not DSP ASE\n", __func__);
-	return nextpc;
+	*out = nextpc;
+	return 0;
 }
 
 enum emulation_result update_pc(struct kvm_vcpu *vcpu, u32 cause)
 {
-	unsigned long branch_pc;
-	enum emulation_result er = EMULATE_DONE;
+	int err;
 
 	if (cause & CAUSEF_BD) {
-		branch_pc = kvm_compute_return_epc(vcpu, vcpu->arch.pc);
-		if (branch_pc == KVM_INVALID_INST) {
-			er = EMULATE_FAIL;
-		} else {
-			vcpu->arch.pc = branch_pc;
-			kvm_debug("BD update_pc(): New PC: %#lx\n",
-				  vcpu->arch.pc);
-		}
-	} else
+		err = kvm_compute_return_epc(vcpu, vcpu->arch.pc,
+					     &vcpu->arch.pc);
+		if (err)
+			return EMULATE_FAIL;
+	} else {
 		vcpu->arch.pc += 4;
+	}
 
 	kvm_debug("update_pc(): New PC: %#lx\n", vcpu->arch.pc);
 
-	return er;
+	return EMULATE_DONE;
+}
+
+/**
+ * kvm_get_badinstr() - Get bad instruction encoding.
+ * @opc:	Guest pointer to faulting instruction.
+ * @vcpu:	KVM VCPU information.
+ *
+ * Gets the instruction encoding of the faulting instruction, using the saved
+ * BadInstr register value if it exists, otherwise falling back to reading guest
+ * memory at @opc.
+ *
+ * Returns:	The instruction encoding of the faulting instruction.
+ */
+int kvm_get_badinstr(u32 *opc, struct kvm_vcpu *vcpu, u32 *out)
+{
+	if (cpu_has_badinstr) {
+		*out = vcpu->arch.host_cp0_badinstr;
+		return 0;
+	} else {
+		return kvm_get_inst(opc, vcpu, out);
+	}
+}
+
+/**
+ * kvm_get_badinstrp() - Get bad prior instruction encoding.
+ * @opc:	Guest pointer to prior faulting instruction.
+ * @vcpu:	KVM VCPU information.
+ *
+ * Gets the instruction encoding of the prior faulting instruction (the branch
+ * containing the delay slot which faulted), using the saved BadInstrP register
+ * value if it exists, otherwise falling back to reading guest memory at @opc.
+ *
+ * Returns:	The instruction encoding of the prior faulting instruction.
+ */
+int kvm_get_badinstrp(u32 *opc, struct kvm_vcpu *vcpu, u32 *out)
+{
+	if (cpu_has_badinstrp) {
+		*out = vcpu->arch.host_cp0_badinstrp;
+		return 0;
+	} else {
+		return kvm_get_inst(opc, vcpu, out);
+	}
 }
 
 /**
@@ -856,22 +897,30 @@ enum emulation_result kvm_mips_emul_tlbr(struct kvm_vcpu *vcpu)
 static void kvm_mips_invalidate_guest_tlb(struct kvm_vcpu *vcpu,
 					  struct kvm_mips_tlb *tlb)
 {
+	struct mm_struct *kern_mm = &vcpu->arch.guest_kernel_mm;
+	struct mm_struct *user_mm = &vcpu->arch.guest_user_mm;
 	int cpu, i;
 	bool user;
 
 	/* No need to flush for entries which are already invalid */
 	if (!((tlb->tlb_lo[0] | tlb->tlb_lo[1]) & ENTRYLO_V))
 		return;
+	/* Don't touch host kernel page tables or TLB mappings */
+	if ((unsigned long)tlb->tlb_hi > 0x7fffffff)
+		return;
 	/* User address space doesn't need flushing for KSeg2/3 changes */
 	user = tlb->tlb_hi < KVM_GUEST_KSEG0;
 
 	preempt_disable();
 
+	/* Invalidate page table entries */
+	kvm_trap_emul_invalidate_gva(vcpu, tlb->tlb_hi & VPN2_MASK, user);
+
 	/*
 	 * Probe the shadow host TLB for the entry being overwritten, if one
 	 * matches, invalidate it
 	 */
-	kvm_mips_host_tlb_inv(vcpu, tlb->tlb_hi);
+	kvm_mips_host_tlb_inv(vcpu, tlb->tlb_hi, user, true);
 
 	/* Invalidate the whole ASID on other CPUs */
 	cpu = smp_processor_id();
@@ -879,8 +928,8 @@ static void kvm_mips_invalidate_guest_tlb(struct kvm_vcpu *vcpu,
 		if (i == cpu)
 			continue;
 		if (user)
-			vcpu->arch.guest_user_asid[i] = 0;
-		vcpu->arch.guest_kernel_asid[i] = 0;
+			cpu_context(i, user_mm) = 0;
+		cpu_context(i, kern_mm) = 0;
 	}
 
 	preempt_enable();
@@ -1017,7 +1066,7 @@ unsigned int kvm_mips_config4_wrmask(struct kvm_vcpu *vcpu)
 	unsigned int mask = MIPS_CONF_M;
 
 	/* KScrExist */
-	mask |= (unsigned int)vcpu->arch.kscratch_enabled << 16;
+	mask |= 0xfc << MIPS_CONF4_KSCREXIST_SHIFT;
 
 	return mask;
 }
@@ -1056,6 +1105,7 @@ enum emulation_result kvm_mips_emulate_CP0(union mips_instruction inst,
 					   struct kvm_vcpu *vcpu)
 {
 	struct mips_coproc *cop0 = vcpu->arch.cop0;
+	struct mm_struct *kern_mm = &vcpu->arch.guest_kernel_mm;
 	enum emulation_result er = EMULATE_DONE;
 	u32 rt, rd, sel;
 	unsigned long curr_pc;
@@ -1150,14 +1200,13 @@ enum emulation_result kvm_mips_emulate_CP0(union mips_instruction inst,
 				er = EMULATE_FAIL;
 				break;
 			}
-#define C0_EBASE_CORE_MASK 0xff
 			if ((rd == MIPS_CP0_PRID) && (sel == 1)) {
-				/* Preserve CORE number */
-				kvm_change_c0_guest_ebase(cop0,
-							  ~(C0_EBASE_CORE_MASK),
+				/*
+				 * Preserve core number, and keep the exception
+				 * base in guest KSeg0.
+				 */
+				kvm_change_c0_guest_ebase(cop0, 0x1ffff000,
 							  vcpu->arch.gprs[rt]);
-				kvm_err("MTCz, cop0->reg[EBASE]: %#lx\n",
-					kvm_read_c0_guest_ebase(cop0));
 			} else if (rd == MIPS_CP0_TLB_HI && sel == 0) {
 				u32 nasid =
 					vcpu->arch.gprs[rt] & KVM_ENTRYHI_ASID;
@@ -1169,6 +1218,17 @@ enum emulation_result kvm_mips_emulate_CP0(union mips_instruction inst,
 						nasid);
 
 					/*
+					 * Flush entries from the GVA page
+					 * tables.
+					 * Guest user page table will get
+					 * flushed lazily on re-entry to guest
+					 * user if the guest ASID actually
+					 * changes.
+					 */
+					kvm_mips_flush_gva_pt(kern_mm->pgd,
+							      KMF_KERN);
+
+					/*
 					 * Regenerate/invalidate kernel MMU
 					 * context.
 					 * The user MMU context will be
@@ -1178,13 +1238,10 @@ enum emulation_result kvm_mips_emulate_CP0(union mips_instruction inst,
 					 */
 					preempt_disable();
 					cpu = smp_processor_id();
-					kvm_get_new_mmu_context(&vcpu->arch.guest_kernel_mm,
-								cpu, vcpu);
-					vcpu->arch.guest_kernel_asid[cpu] =
-						vcpu->arch.guest_kernel_mm.context.asid[cpu];
+					get_new_mmu_context(kern_mm, cpu);
 					for_each_possible_cpu(i)
 						if (i != cpu)
-							vcpu->arch.guest_kernel_asid[i] = 0;
+							cpu_context(i, kern_mm) = 0;
 					preempt_enable();
 				}
 				kvm_write_c0_guest_entryhi(cop0,
@@ -1639,12 +1696,56 @@ enum emulation_result kvm_mips_emulate_load(union mips_instruction inst,
 	return er;
 }
 
+static enum emulation_result kvm_mips_guest_cache_op(int (*fn)(unsigned long),
+						     unsigned long curr_pc,
+						     unsigned long addr,
+						     struct kvm_run *run,
+						     struct kvm_vcpu *vcpu,
+						     u32 cause)
+{
+	int err;
+
+	for (;;) {
+		/* Carefully attempt the cache operation */
+		kvm_trap_emul_gva_lockless_begin(vcpu);
+		err = fn(addr);
+		kvm_trap_emul_gva_lockless_end(vcpu);
+
+		if (likely(!err))
+			return EMULATE_DONE;
+
+		/*
+		 * Try to handle the fault and retry, maybe we just raced with a
+		 * GVA invalidation.
+		 */
+		switch (kvm_trap_emul_gva_fault(vcpu, addr, false)) {
+		case KVM_MIPS_GVA:
+		case KVM_MIPS_GPA:
+			/* bad virtual or physical address */
+			return EMULATE_FAIL;
+		case KVM_MIPS_TLB:
+			/* no matching guest TLB */
+			vcpu->arch.host_cp0_badvaddr = addr;
+			vcpu->arch.pc = curr_pc;
+			kvm_mips_emulate_tlbmiss_ld(cause, NULL, run, vcpu);
+			return EMULATE_EXCEPT;
+		case KVM_MIPS_TLBINV:
+			/* invalid matching guest TLB */
+			vcpu->arch.host_cp0_badvaddr = addr;
+			vcpu->arch.pc = curr_pc;
+			kvm_mips_emulate_tlbinv_ld(cause, NULL, run, vcpu);
+			return EMULATE_EXCEPT;
+		default:
+			break;
+		};
+	}
+}
+
 enum emulation_result kvm_mips_emulate_cache(union mips_instruction inst,
 					     u32 *opc, u32 cause,
 					     struct kvm_run *run,
 					     struct kvm_vcpu *vcpu)
 {
-	struct mips_coproc *cop0 = vcpu->arch.cop0;
 	enum emulation_result er = EMULATE_DONE;
 	u32 cache, op_inst, op, base;
 	s16 offset;
@@ -1701,80 +1802,16 @@ enum emulation_result kvm_mips_emulate_cache(union mips_instruction inst,
 		goto done;
 	}
 
-	preempt_disable();
-	if (KVM_GUEST_KSEGX(va) == KVM_GUEST_KSEG0) {
-		if (kvm_mips_host_tlb_lookup(vcpu, va) < 0 &&
-		    kvm_mips_handle_kseg0_tlb_fault(va, vcpu)) {
-			kvm_err("%s: handling mapped kseg0 tlb fault for %lx, vcpu: %p, ASID: %#lx\n",
-				__func__, va, vcpu, read_c0_entryhi());
-			er = EMULATE_FAIL;
-			preempt_enable();
-			goto done;
-		}
-	} else if ((KVM_GUEST_KSEGX(va) < KVM_GUEST_KSEG0) ||
-		   KVM_GUEST_KSEGX(va) == KVM_GUEST_KSEG23) {
-		int index;
-
-		/* If an entry already exists then skip */
-		if (kvm_mips_host_tlb_lookup(vcpu, va) >= 0)
-			goto skip_fault;
-
-		/*
-		 * If address not in the guest TLB, then give the guest a fault,
-		 * the resulting handler will do the right thing
-		 */
-		index = kvm_mips_guest_tlb_lookup(vcpu, (va & VPN2_MASK) |
-						  (kvm_read_c0_guest_entryhi
-						   (cop0) & KVM_ENTRYHI_ASID));
-
-		if (index < 0) {
-			vcpu->arch.host_cp0_badvaddr = va;
-			vcpu->arch.pc = curr_pc;
-			er = kvm_mips_emulate_tlbmiss_ld(cause, NULL, run,
-							 vcpu);
-			preempt_enable();
-			goto dont_update_pc;
-		} else {
-			struct kvm_mips_tlb *tlb = &vcpu->arch.guest_tlb[index];
-			/*
-			 * Check if the entry is valid, if not then setup a TLB
-			 * invalid exception to the guest
-			 */
-			if (!TLB_IS_VALID(*tlb, va)) {
-				vcpu->arch.host_cp0_badvaddr = va;
-				vcpu->arch.pc = curr_pc;
-				er = kvm_mips_emulate_tlbinv_ld(cause, NULL,
-								run, vcpu);
-				preempt_enable();
-				goto dont_update_pc;
-			}
-			/*
-			 * We fault an entry from the guest tlb to the
-			 * shadow host TLB
-			 */
-			if (kvm_mips_handle_mapped_seg_tlb_fault(vcpu, tlb)) {
-				kvm_err("%s: handling mapped seg tlb fault for %lx, index: %u, vcpu: %p, ASID: %#lx\n",
-					__func__, va, index, vcpu,
-					read_c0_entryhi());
-				er = EMULATE_FAIL;
-				preempt_enable();
-				goto done;
-			}
-		}
-	} else {
-		kvm_err("INVALID CACHE INDEX/ADDRESS (cache: %#x, op: %#x, base[%d]: %#lx, offset: %#x\n",
-			cache, op, base, arch->gprs[base], offset);
-		er = EMULATE_FAIL;
-		preempt_enable();
-		goto done;
-
-	}
-
-skip_fault:
 	/* XXXKYMA: Only a subset of cache ops are supported, used by Linux */
 	if (op_inst == Hit_Writeback_Inv_D || op_inst == Hit_Invalidate_D) {
-		flush_dcache_line(va);
-
+		/*
+		 * Perform the dcache part of icache synchronisation on the
+		 * guest's behalf.
+		 */
+		er = kvm_mips_guest_cache_op(protected_writeback_dcache_line,
+					     curr_pc, va, run, vcpu, cause);
+		if (er != EMULATE_DONE)
+			goto done;
 #ifdef CONFIG_KVM_MIPS_DYN_TRANS
 		/*
 		 * Replace the CACHE instruction, with a SYNCI, not the same,
@@ -1783,8 +1820,15 @@ skip_fault:
 		kvm_mips_trans_cache_va(inst, opc, vcpu);
 #endif
 	} else if (op_inst == Hit_Invalidate_I) {
-		flush_dcache_line(va);
-		flush_icache_line(va);
+		/* Perform the icache synchronisation on the guest's behalf */
+		er = kvm_mips_guest_cache_op(protected_writeback_dcache_line,
+					     curr_pc, va, run, vcpu, cause);
+		if (er != EMULATE_DONE)
+			goto done;
+		er = kvm_mips_guest_cache_op(protected_flush_icache_line,
+					     curr_pc, va, run, vcpu, cause);
+		if (er != EMULATE_DONE)
+			goto done;
 
 #ifdef CONFIG_KVM_MIPS_DYN_TRANS
 		/* Replace the CACHE instruction, with a SYNCI */
@@ -1796,17 +1840,13 @@ skip_fault:
 		er = EMULATE_FAIL;
 	}
 
-	preempt_enable();
 done:
 	/* Rollback PC only if emulation was unsuccessful */
 	if (er == EMULATE_FAIL)
 		vcpu->arch.pc = curr_pc;
-
-dont_update_pc:
-	/*
-	 * This is for exceptions whose emulation updates the PC, so do not
-	 * overwrite the PC under any circumstances
-	 */
+	/* Guest exception needs guest to resume */
+	if (er == EMULATE_EXCEPT)
+		er = EMULATE_DONE;
 
 	return er;
 }
@@ -1817,12 +1857,14 @@ enum emulation_result kvm_mips_emulate_inst(u32 cause, u32 *opc,
 {
 	union mips_instruction inst;
 	enum emulation_result er = EMULATE_DONE;
+	int err;
 
 	/* Fetch the instruction. */
 	if (cause & CAUSEF_BD)
 		opc += 1;
-
-	inst.word = kvm_get_inst(opc, vcpu);
+	err = kvm_get_badinstr(opc, vcpu, &inst.word);
+	if (err)
+		return EMULATE_FAIL;
 
 	switch (inst.r_format.opcode) {
 	case cop0_op:
@@ -1874,6 +1916,22 @@ unknown:
 	return er;
 }
 
+/**
+ * kvm_mips_guest_exception_base() - Find guest exception vector base address.
+ *
+ * Returns:	The base address of the current guest exception vector, taking
+ *		both Guest.CP0_Status.BEV and Guest.CP0_EBase into account.
+ */
+long kvm_mips_guest_exception_base(struct kvm_vcpu *vcpu)
+{
+	struct mips_coproc *cop0 = vcpu->arch.cop0;
+
+	if (kvm_read_c0_guest_status(cop0) & ST0_BEV)
+		return KVM_GUEST_CKSEG1ADDR(0x1fc00200);
+	else
+		return kvm_read_c0_guest_ebase(cop0) & MIPS_EBASE_BASE;
+}
+
 enum emulation_result kvm_mips_emulate_syscall(u32 cause,
 					       u32 *opc,
 					       struct kvm_run *run,
@@ -1899,7 +1957,7 @@ enum emulation_result kvm_mips_emulate_syscall(u32 cause,
 					  (EXCCODE_SYS << CAUSEB_EXCCODE));
 
 		/* Set PC to the exception entry point */
-		arch->pc = KVM_GUEST_KSEG0 + 0x180;
+		arch->pc = kvm_mips_guest_exception_base(vcpu) + 0x180;
 
 	} else {
 		kvm_err("Trying to deliver SYSCALL when EXL is already set\n");
@@ -1933,13 +1991,13 @@ enum emulation_result kvm_mips_emulate_tlbmiss_ld(u32 cause,
 			  arch->pc);
 
 		/* set pc to the exception entry point */
-		arch->pc = KVM_GUEST_KSEG0 + 0x0;
+		arch->pc = kvm_mips_guest_exception_base(vcpu) + 0x0;
 
 	} else {
 		kvm_debug("[EXL == 1] delivering TLB MISS @ pc %#lx\n",
 			  arch->pc);
 
-		arch->pc = KVM_GUEST_KSEG0 + 0x180;
+		arch->pc = kvm_mips_guest_exception_base(vcpu) + 0x180;
 	}
 
 	kvm_change_c0_guest_cause(cop0, (0xff),
@@ -1949,8 +2007,6 @@ enum emulation_result kvm_mips_emulate_tlbmiss_ld(u32 cause,
 	kvm_write_c0_guest_badvaddr(cop0, vcpu->arch.host_cp0_badvaddr);
 	/* XXXKYMA: is the context register used by linux??? */
 	kvm_write_c0_guest_entryhi(cop0, entryhi);
-	/* Blow away the shadow host TLBs */
-	kvm_mips_flush_host_tlb(1);
 
 	return EMULATE_DONE;
 }
@@ -1978,16 +2034,14 @@ enum emulation_result kvm_mips_emulate_tlbinv_ld(u32 cause,
 
 		kvm_debug("[EXL == 0] delivering TLB INV @ pc %#lx\n",
 			  arch->pc);
-
-		/* set pc to the exception entry point */
-		arch->pc = KVM_GUEST_KSEG0 + 0x180;
-
 	} else {
 		kvm_debug("[EXL == 1] delivering TLB MISS @ pc %#lx\n",
 			  arch->pc);
-		arch->pc = KVM_GUEST_KSEG0 + 0x180;
 	}
 
+	/* set pc to the exception entry point */
+	arch->pc = kvm_mips_guest_exception_base(vcpu) + 0x180;
+
 	kvm_change_c0_guest_cause(cop0, (0xff),
 				  (EXCCODE_TLBL << CAUSEB_EXCCODE));
 
@@ -1995,8 +2049,6 @@ enum emulation_result kvm_mips_emulate_tlbinv_ld(u32 cause,
 	kvm_write_c0_guest_badvaddr(cop0, vcpu->arch.host_cp0_badvaddr);
 	/* XXXKYMA: is the context register used by linux??? */
 	kvm_write_c0_guest_entryhi(cop0, entryhi);
-	/* Blow away the shadow host TLBs */
-	kvm_mips_flush_host_tlb(1);
 
 	return EMULATE_DONE;
 }
@@ -2025,11 +2077,11 @@ enum emulation_result kvm_mips_emulate_tlbmiss_st(u32 cause,
 			  arch->pc);
 
 		/* Set PC to the exception entry point */
-		arch->pc = KVM_GUEST_KSEG0 + 0x0;
+		arch->pc = kvm_mips_guest_exception_base(vcpu) + 0x0;
 	} else {
 		kvm_debug("[EXL == 1] Delivering TLB MISS @ pc %#lx\n",
 			  arch->pc);
-		arch->pc = KVM_GUEST_KSEG0 + 0x180;
+		arch->pc = kvm_mips_guest_exception_base(vcpu) + 0x180;
 	}
 
 	kvm_change_c0_guest_cause(cop0, (0xff),
@@ -2039,8 +2091,6 @@ enum emulation_result kvm_mips_emulate_tlbmiss_st(u32 cause,
 	kvm_write_c0_guest_badvaddr(cop0, vcpu->arch.host_cp0_badvaddr);
 	/* XXXKYMA: is the context register used by linux??? */
 	kvm_write_c0_guest_entryhi(cop0, entryhi);
-	/* Blow away the shadow host TLBs */
-	kvm_mips_flush_host_tlb(1);
 
 	return EMULATE_DONE;
 }
@@ -2067,15 +2117,14 @@ enum emulation_result kvm_mips_emulate_tlbinv_st(u32 cause,
 
 		kvm_debug("[EXL == 0] Delivering TLB MISS @ pc %#lx\n",
 			  arch->pc);
-
-		/* Set PC to the exception entry point */
-		arch->pc = KVM_GUEST_KSEG0 + 0x180;
 	} else {
 		kvm_debug("[EXL == 1] Delivering TLB MISS @ pc %#lx\n",
 			  arch->pc);
-		arch->pc = KVM_GUEST_KSEG0 + 0x180;
 	}
 
+	/* Set PC to the exception entry point */
+	arch->pc = kvm_mips_guest_exception_base(vcpu) + 0x180;
+
 	kvm_change_c0_guest_cause(cop0, (0xff),
 				  (EXCCODE_TLBS << CAUSEB_EXCCODE));
 
@@ -2083,41 +2132,10 @@ enum emulation_result kvm_mips_emulate_tlbinv_st(u32 cause,
 	kvm_write_c0_guest_badvaddr(cop0, vcpu->arch.host_cp0_badvaddr);
 	/* XXXKYMA: is the context register used by linux??? */
 	kvm_write_c0_guest_entryhi(cop0, entryhi);
-	/* Blow away the shadow host TLBs */
-	kvm_mips_flush_host_tlb(1);
 
 	return EMULATE_DONE;
 }
 
-/* TLBMOD: store into address matching TLB with Dirty bit off */
-enum emulation_result kvm_mips_handle_tlbmod(u32 cause, u32 *opc,
-					     struct kvm_run *run,
-					     struct kvm_vcpu *vcpu)
-{
-	enum emulation_result er = EMULATE_DONE;
-#ifdef DEBUG
-	struct mips_coproc *cop0 = vcpu->arch.cop0;
-	unsigned long entryhi = (vcpu->arch.host_cp0_badvaddr & VPN2_MASK) |
-			(kvm_read_c0_guest_entryhi(cop0) & KVM_ENTRYHI_ASID);
-	int index;
-
-	/* If address not in the guest TLB, then we are in trouble */
-	index = kvm_mips_guest_tlb_lookup(vcpu, entryhi);
-	if (index < 0) {
-		/* XXXKYMA Invalidate and retry */
-		kvm_mips_host_tlb_inv(vcpu, vcpu->arch.host_cp0_badvaddr);
-		kvm_err("%s: host got TLBMOD for %#lx but entry not present in Guest TLB\n",
-		     __func__, entryhi);
-		kvm_mips_dump_guest_tlbs(vcpu);
-		kvm_mips_dump_host_tlbs();
-		return EMULATE_FAIL;
-	}
-#endif
-
-	er = kvm_mips_emulate_tlbmod(cause, opc, run, vcpu);
-	return er;
-}
-
 enum emulation_result kvm_mips_emulate_tlbmod(u32 cause,
 					      u32 *opc,
 					      struct kvm_run *run,
@@ -2140,14 +2158,13 @@ enum emulation_result kvm_mips_emulate_tlbmod(u32 cause,
 
 		kvm_debug("[EXL == 0] Delivering TLB MOD @ pc %#lx\n",
 			  arch->pc);
-
-		arch->pc = KVM_GUEST_KSEG0 + 0x180;
 	} else {
 		kvm_debug("[EXL == 1] Delivering TLB MOD @ pc %#lx\n",
 			  arch->pc);
-		arch->pc = KVM_GUEST_KSEG0 + 0x180;
 	}
 
+	arch->pc = kvm_mips_guest_exception_base(vcpu) + 0x180;
+
 	kvm_change_c0_guest_cause(cop0, (0xff),
 				  (EXCCODE_MOD << CAUSEB_EXCCODE));
 
@@ -2155,8 +2172,6 @@ enum emulation_result kvm_mips_emulate_tlbmod(u32 cause,
 	kvm_write_c0_guest_badvaddr(cop0, vcpu->arch.host_cp0_badvaddr);
 	/* XXXKYMA: is the context register used by linux??? */
 	kvm_write_c0_guest_entryhi(cop0, entryhi);
-	/* Blow away the shadow host TLBs */
-	kvm_mips_flush_host_tlb(1);
 
 	return EMULATE_DONE;
 }
@@ -2181,7 +2196,7 @@ enum emulation_result kvm_mips_emulate_fpu_exc(u32 cause,
 
 	}
 
-	arch->pc = KVM_GUEST_KSEG0 + 0x180;
+	arch->pc = kvm_mips_guest_exception_base(vcpu) + 0x180;
 
 	kvm_change_c0_guest_cause(cop0, (0xff),
 				  (EXCCODE_CPU << CAUSEB_EXCCODE));
@@ -2215,7 +2230,7 @@ enum emulation_result kvm_mips_emulate_ri_exc(u32 cause,
 					  (EXCCODE_RI << CAUSEB_EXCCODE));
 
 		/* Set PC to the exception entry point */
-		arch->pc = KVM_GUEST_KSEG0 + 0x180;
+		arch->pc = kvm_mips_guest_exception_base(vcpu) + 0x180;
 
 	} else {
 		kvm_err("Trying to deliver RI when EXL is already set\n");
@@ -2250,7 +2265,7 @@ enum emulation_result kvm_mips_emulate_bp_exc(u32 cause,
 					  (EXCCODE_BP << CAUSEB_EXCCODE));
 
 		/* Set PC to the exception entry point */
-		arch->pc = KVM_GUEST_KSEG0 + 0x180;
+		arch->pc = kvm_mips_guest_exception_base(vcpu) + 0x180;
 
 	} else {
 		kvm_err("Trying to deliver BP when EXL is already set\n");
@@ -2285,7 +2300,7 @@ enum emulation_result kvm_mips_emulate_trap_exc(u32 cause,
 					  (EXCCODE_TR << CAUSEB_EXCCODE));
 
 		/* Set PC to the exception entry point */
-		arch->pc = KVM_GUEST_KSEG0 + 0x180;
+		arch->pc = kvm_mips_guest_exception_base(vcpu) + 0x180;
 
 	} else {
 		kvm_err("Trying to deliver TRAP when EXL is already set\n");
@@ -2320,7 +2335,7 @@ enum emulation_result kvm_mips_emulate_msafpe_exc(u32 cause,
 					  (EXCCODE_MSAFPE << CAUSEB_EXCCODE));
 
 		/* Set PC to the exception entry point */
-		arch->pc = KVM_GUEST_KSEG0 + 0x180;
+		arch->pc = kvm_mips_guest_exception_base(vcpu) + 0x180;
 
 	} else {
 		kvm_err("Trying to deliver MSAFPE when EXL is already set\n");
@@ -2355,7 +2370,7 @@ enum emulation_result kvm_mips_emulate_fpe_exc(u32 cause,
 					  (EXCCODE_FPE << CAUSEB_EXCCODE));
 
 		/* Set PC to the exception entry point */
-		arch->pc = KVM_GUEST_KSEG0 + 0x180;
+		arch->pc = kvm_mips_guest_exception_base(vcpu) + 0x180;
 
 	} else {
 		kvm_err("Trying to deliver FPE when EXL is already set\n");
@@ -2390,7 +2405,7 @@ enum emulation_result kvm_mips_emulate_msadis_exc(u32 cause,
 					  (EXCCODE_MSADIS << CAUSEB_EXCCODE));
 
 		/* Set PC to the exception entry point */
-		arch->pc = KVM_GUEST_KSEG0 + 0x180;
+		arch->pc = kvm_mips_guest_exception_base(vcpu) + 0x180;
 
 	} else {
 		kvm_err("Trying to deliver MSADIS when EXL is already set\n");
@@ -2409,6 +2424,7 @@ enum emulation_result kvm_mips_handle_ri(u32 cause, u32 *opc,
 	enum emulation_result er = EMULATE_DONE;
 	unsigned long curr_pc;
 	union mips_instruction inst;
+	int err;
 
 	/*
 	 * Update PC and hold onto current PC in case there is
@@ -2422,11 +2438,9 @@ enum emulation_result kvm_mips_handle_ri(u32 cause, u32 *opc,
 	/* Fetch the instruction. */
 	if (cause & CAUSEF_BD)
 		opc += 1;
-
-	inst.word = kvm_get_inst(opc, vcpu);
-
-	if (inst.word == KVM_INVALID_INST) {
-		kvm_err("%s: Cannot get inst @ %p\n", __func__, opc);
+	err = kvm_get_badinstr(opc, vcpu, &inst.word);
+	if (err) {
+		kvm_err("%s: Cannot get inst @ %p (%d)\n", __func__, opc, err);
 		return EMULATE_FAIL;
 	}
 
@@ -2557,7 +2571,7 @@ static enum emulation_result kvm_mips_emulate_exc(u32 cause,
 					  (exccode << CAUSEB_EXCCODE));
 
 		/* Set PC to the exception entry point */
-		arch->pc = KVM_GUEST_KSEG0 + 0x180;
+		arch->pc = kvm_mips_guest_exception_base(vcpu) + 0x180;
 		kvm_write_c0_guest_badvaddr(cop0, vcpu->arch.host_cp0_badvaddr);
 
 		kvm_debug("Delivering EXC %d @ pc %#lx, badVaddr: %#lx\n",
@@ -2670,7 +2684,8 @@ enum emulation_result kvm_mips_check_privilege(u32 cause,
 enum emulation_result kvm_mips_handle_tlbmiss(u32 cause,
 					      u32 *opc,
 					      struct kvm_run *run,
-					      struct kvm_vcpu *vcpu)
+					      struct kvm_vcpu *vcpu,
+					      bool write_fault)
 {
 	enum emulation_result er = EMULATE_DONE;
 	u32 exccode = (cause >> CAUSEB_EXCCODE) & 0x1f;
@@ -2726,7 +2741,8 @@ enum emulation_result kvm_mips_handle_tlbmiss(u32 cause,
 			 * OK we have a Guest TLB entry, now inject it into the
 			 * shadow host TLB
 			 */
-			if (kvm_mips_handle_mapped_seg_tlb_fault(vcpu, tlb)) {
+			if (kvm_mips_handle_mapped_seg_tlb_fault(vcpu, tlb, va,
+								 write_fault)) {
 				kvm_err("%s: handling mapped seg tlb fault for %lx, index: %u, vcpu: %p, ASID: %#lx\n",
 					__func__, va, index, vcpu,
 					read_c0_entryhi());
diff --git a/arch/mips/kvm/entry.c b/arch/mips/kvm/entry.c
index e92fb190e2d6..c5b254c4d0da 100644
--- a/arch/mips/kvm/entry.c
+++ b/arch/mips/kvm/entry.c
@@ -12,8 +12,11 @@
  */
 
 #include <linux/kvm_host.h>
+#include <linux/log2.h>
+#include <asm/mmu_context.h>
 #include <asm/msa.h>
 #include <asm/setup.h>
+#include <asm/tlbex.h>
 #include <asm/uasm.h>
 
 /* Register names */
@@ -50,6 +53,8 @@
 /* Some CP0 registers */
 #define C0_HWRENA	7, 0
 #define C0_BADVADDR	8, 0
+#define C0_BADINSTR	8, 1
+#define C0_BADINSTRP	8, 2
 #define C0_ENTRYHI	10, 0
 #define C0_STATUS	12, 0
 #define C0_CAUSE	13, 0
@@ -89,6 +94,21 @@ static void *kvm_mips_build_ret_from_exit(void *addr);
 static void *kvm_mips_build_ret_to_guest(void *addr);
 static void *kvm_mips_build_ret_to_host(void *addr);
 
+/*
+ * The version of this function in tlbex.c uses current_cpu_type(), but for KVM
+ * we assume symmetry.
+ */
+static int c0_kscratch(void)
+{
+	switch (boot_cpu_type()) {
+	case CPU_XLP:
+	case CPU_XLR:
+		return 22;
+	default:
+		return 31;
+	}
+}
+
 /**
  * kvm_mips_entry_setup() - Perform global setup for entry code.
  *
@@ -103,18 +123,21 @@ int kvm_mips_entry_setup(void)
 	 * We prefer to use KScratchN registers if they are available over the
 	 * defaults above, which may not work on all cores.
 	 */
-	unsigned int kscratch_mask = cpu_data[0].kscratch_mask & 0xfc;
+	unsigned int kscratch_mask = cpu_data[0].kscratch_mask;
+
+	if (pgd_reg != -1)
+		kscratch_mask &= ~BIT(pgd_reg);
 
 	/* Pick a scratch register for storing VCPU */
 	if (kscratch_mask) {
-		scratch_vcpu[0] = 31;
+		scratch_vcpu[0] = c0_kscratch();
 		scratch_vcpu[1] = ffs(kscratch_mask) - 1;
 		kscratch_mask &= ~BIT(scratch_vcpu[1]);
 	}
 
 	/* Pick a scratch register to use as a temp for saving state */
 	if (kscratch_mask) {
-		scratch_tmp[0] = 31;
+		scratch_tmp[0] = c0_kscratch();
 		scratch_tmp[1] = ffs(kscratch_mask) - 1;
 		kscratch_mask &= ~BIT(scratch_tmp[1]);
 	}
@@ -130,7 +153,7 @@ static void kvm_mips_build_save_scratch(u32 **p, unsigned int tmp,
 	UASM_i_SW(p, tmp, offsetof(struct pt_regs, cp0_epc), frame);
 
 	/* Save the temp scratch register value in cp0_cause of stack frame */
-	if (scratch_tmp[0] == 31) {
+	if (scratch_tmp[0] == c0_kscratch()) {
 		UASM_i_MFC0(p, tmp, scratch_tmp[0], scratch_tmp[1]);
 		UASM_i_SW(p, tmp, offsetof(struct pt_regs, cp0_cause), frame);
 	}
@@ -146,7 +169,7 @@ static void kvm_mips_build_restore_scratch(u32 **p, unsigned int tmp,
 	UASM_i_LW(p, tmp, offsetof(struct pt_regs, cp0_epc), frame);
 	UASM_i_MTC0(p, tmp, scratch_vcpu[0], scratch_vcpu[1]);
 
-	if (scratch_tmp[0] == 31) {
+	if (scratch_tmp[0] == c0_kscratch()) {
 		UASM_i_LW(p, tmp, offsetof(struct pt_regs, cp0_cause), frame);
 		UASM_i_MTC0(p, tmp, scratch_tmp[0], scratch_tmp[1]);
 	}
@@ -286,23 +309,26 @@ static void *kvm_mips_build_enter_guest(void *addr)
 	uasm_i_andi(&p, T0, T0, KSU_USER | ST0_ERL | ST0_EXL);
 	uasm_i_xori(&p, T0, T0, KSU_USER);
 	uasm_il_bnez(&p, &r, T0, label_kernel_asid);
-	 UASM_i_ADDIU(&p, T1, K1,
-		      offsetof(struct kvm_vcpu_arch, guest_kernel_asid));
+	 UASM_i_ADDIU(&p, T1, K1, offsetof(struct kvm_vcpu_arch,
+					   guest_kernel_mm.context.asid));
 	/* else user */
-	UASM_i_ADDIU(&p, T1, K1,
-		     offsetof(struct kvm_vcpu_arch, guest_user_asid));
+	UASM_i_ADDIU(&p, T1, K1, offsetof(struct kvm_vcpu_arch,
+					  guest_user_mm.context.asid));
 	uasm_l_kernel_asid(&l, p);
 
 	/* t1: contains the base of the ASID array, need to get the cpu id  */
 	/* smp_processor_id */
 	uasm_i_lw(&p, T2, offsetof(struct thread_info, cpu), GP);
-	/* x4 */
-	uasm_i_sll(&p, T2, T2, 2);
+	/* index the ASID array */
+	uasm_i_sll(&p, T2, T2, ilog2(sizeof(long)));
 	UASM_i_ADDU(&p, T3, T1, T2);
-	uasm_i_lw(&p, K0, 0, T3);
+	UASM_i_LW(&p, K0, 0, T3);
 #ifdef CONFIG_MIPS_ASID_BITS_VARIABLE
-	/* x sizeof(struct cpuinfo_mips)/4 */
-	uasm_i_addiu(&p, T3, ZERO, sizeof(struct cpuinfo_mips)/4);
+	/*
+	 * reuse ASID array offset
+	 * cpuinfo_mips is a multiple of sizeof(long)
+	 */
+	uasm_i_addiu(&p, T3, ZERO, sizeof(struct cpuinfo_mips)/sizeof(long));
 	uasm_i_mul(&p, T2, T2, T3);
 
 	UASM_i_LA_mostly(&p, AT, (long)&cpu_data[0].asid_mask);
@@ -312,7 +338,20 @@ static void *kvm_mips_build_enter_guest(void *addr)
 #else
 	uasm_i_andi(&p, K0, K0, MIPS_ENTRYHI_ASID);
 #endif
-	uasm_i_mtc0(&p, K0, C0_ENTRYHI);
+
+	/*
+	 * Set up KVM T&E GVA pgd.
+	 * This does roughly the same as TLBMISS_HANDLER_SETUP_PGD():
+	 * - call tlbmiss_handler_setup_pgd(mm->pgd)
+	 * - but skips write into CP0_PWBase for now
+	 */
+	UASM_i_LW(&p, A0, (int)offsetof(struct mm_struct, pgd) -
+			  (int)offsetof(struct mm_struct, context.asid), T1);
+
+	UASM_i_LA(&p, T9, (unsigned long)tlbmiss_handler_setup_pgd);
+	uasm_i_jalr(&p, RA, T9);
+	 uasm_i_mtc0(&p, K0, C0_ENTRYHI);
+
 	uasm_i_ehb(&p);
 
 	/* Disable RDHWR access */
@@ -348,6 +387,80 @@ static void *kvm_mips_build_enter_guest(void *addr)
 }
 
 /**
+ * kvm_mips_build_tlb_refill_exception() - Assemble TLB refill handler.
+ * @addr:	Address to start writing code.
+ * @handler:	Address of common handler (within range of @addr).
+ *
+ * Assemble TLB refill exception fast path handler for guest execution.
+ *
+ * Returns:	Next address after end of written function.
+ */
+void *kvm_mips_build_tlb_refill_exception(void *addr, void *handler)
+{
+	u32 *p = addr;
+	struct uasm_label labels[2];
+	struct uasm_reloc relocs[2];
+	struct uasm_label *l = labels;
+	struct uasm_reloc *r = relocs;
+
+	memset(labels, 0, sizeof(labels));
+	memset(relocs, 0, sizeof(relocs));
+
+	/* Save guest k1 into scratch register */
+	UASM_i_MTC0(&p, K1, scratch_tmp[0], scratch_tmp[1]);
+
+	/* Get the VCPU pointer from the VCPU scratch register */
+	UASM_i_MFC0(&p, K1, scratch_vcpu[0], scratch_vcpu[1]);
+
+	/* Save guest k0 into VCPU structure */
+	UASM_i_SW(&p, K0, offsetof(struct kvm_vcpu, arch.gprs[K0]), K1);
+
+	/*
+	 * Some of the common tlbex code uses current_cpu_type(). For KVM we
+	 * assume symmetry and just disable preemption to silence the warning.
+	 */
+	preempt_disable();
+
+	/*
+	 * Now for the actual refill bit. A lot of this can be common with the
+	 * Linux TLB refill handler, however we don't need to handle so many
+	 * cases. We only need to handle user mode refills, and user mode runs
+	 * with 32-bit addressing.
+	 *
+	 * Therefore the branch to label_vmalloc generated by build_get_pmde64()
+	 * that isn't resolved should never actually get taken and is harmless
+	 * to leave in place for now.
+	 */
+
+#ifdef CONFIG_64BIT
+	build_get_pmde64(&p, &l, &r, K0, K1); /* get pmd in K1 */
+#else
+	build_get_pgde32(&p, K0, K1); /* get pgd in K1 */
+#endif
+
+	/* we don't support huge pages yet */
+
+	build_get_ptep(&p, K0, K1);
+	build_update_entries(&p, K0, K1);
+	build_tlb_write_entry(&p, &l, &r, tlb_random);
+
+	preempt_enable();
+
+	/* Get the VCPU pointer from the VCPU scratch register again */
+	UASM_i_MFC0(&p, K1, scratch_vcpu[0], scratch_vcpu[1]);
+
+	/* Restore the guest's k0/k1 registers */
+	UASM_i_LW(&p, K0, offsetof(struct kvm_vcpu, arch.gprs[K0]), K1);
+	uasm_i_ehb(&p);
+	UASM_i_MFC0(&p, K1, scratch_tmp[0], scratch_tmp[1]);
+
+	/* Jump to guest */
+	uasm_i_eret(&p);
+
+	return p;
+}
+
+/**
  * kvm_mips_build_exception() - Assemble first level guest exception handler.
  * @addr:	Address to start writing code.
  * @handler:	Address of common handler (within range of @addr).
@@ -468,6 +581,18 @@ void *kvm_mips_build_exit(void *addr)
 	uasm_i_mfc0(&p, K0, C0_CAUSE);
 	uasm_i_sw(&p, K0, offsetof(struct kvm_vcpu_arch, host_cp0_cause), K1);
 
+	if (cpu_has_badinstr) {
+		uasm_i_mfc0(&p, K0, C0_BADINSTR);
+		uasm_i_sw(&p, K0, offsetof(struct kvm_vcpu_arch,
+					   host_cp0_badinstr), K1);
+	}
+
+	if (cpu_has_badinstrp) {
+		uasm_i_mfc0(&p, K0, C0_BADINSTRP);
+		uasm_i_sw(&p, K0, offsetof(struct kvm_vcpu_arch,
+					   host_cp0_badinstrp), K1);
+	}
+
 	/* Now restore the host state just enough to run the handlers */
 
 	/* Switch EBASE to the one used by Linux */
diff --git a/arch/mips/kvm/interrupt.c b/arch/mips/kvm/interrupt.c
index e88403b3dcdd..aa0a1a00faf6 100644
--- a/arch/mips/kvm/interrupt.c
+++ b/arch/mips/kvm/interrupt.c
@@ -183,10 +183,11 @@ int kvm_mips_irq_deliver_cb(struct kvm_vcpu *vcpu, unsigned int priority,
 					  (exccode << CAUSEB_EXCCODE));
 
 		/* XXXSL Set PC to the interrupt exception entry point */
+		arch->pc = kvm_mips_guest_exception_base(vcpu);
 		if (kvm_read_c0_guest_cause(cop0) & CAUSEF_IV)
-			arch->pc = KVM_GUEST_KSEG0 + 0x200;
+			arch->pc += 0x200;
 		else
-			arch->pc = KVM_GUEST_KSEG0 + 0x180;
+			arch->pc += 0x180;
 
 		clear_bit(priority, &vcpu->arch.pending_exceptions);
 	}
diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c
index 29ec9ab3fd55..31ee5ee0010b 100644
--- a/arch/mips/kvm/mips.c
+++ b/arch/mips/kvm/mips.c
@@ -22,6 +22,7 @@
 #include <asm/page.h>
 #include <asm/cacheflush.h>
 #include <asm/mmu_context.h>
+#include <asm/pgalloc.h>
 #include <asm/pgtable.h>
 
 #include <linux/kvm_host.h>
@@ -63,18 +64,6 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
 	{NULL}
 };
 
-static int kvm_mips_reset_vcpu(struct kvm_vcpu *vcpu)
-{
-	int i;
-
-	for_each_possible_cpu(i) {
-		vcpu->arch.guest_kernel_asid[i] = 0;
-		vcpu->arch.guest_user_asid[i] = 0;
-	}
-
-	return 0;
-}
-
 /*
  * XXXKYMA: We are simulatoring a processor that has the WII bit set in
  * Config7, so we are "runnable" if interrupts are pending
@@ -104,39 +93,12 @@ void kvm_arch_check_processor_compat(void *rtn)
 	*(int *)rtn = 0;
 }
 
-static void kvm_mips_init_tlbs(struct kvm *kvm)
-{
-	unsigned long wired;
-
-	/*
-	 * Add a wired entry to the TLB, it is used to map the commpage to
-	 * the Guest kernel
-	 */
-	wired = read_c0_wired();
-	write_c0_wired(wired + 1);
-	mtc0_tlbw_hazard();
-	kvm->arch.commpage_tlb = wired;
-
-	kvm_debug("[%d] commpage TLB: %d\n", smp_processor_id(),
-		  kvm->arch.commpage_tlb);
-}
-
-static void kvm_mips_init_vm_percpu(void *arg)
-{
-	struct kvm *kvm = (struct kvm *)arg;
-
-	kvm_mips_init_tlbs(kvm);
-	kvm_mips_callbacks->vm_init(kvm);
-
-}
-
 int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
 {
-	if (atomic_inc_return(&kvm_mips_instance) == 1) {
-		kvm_debug("%s: 1st KVM instance, setup host TLB parameters\n",
-			  __func__);
-		on_each_cpu(kvm_mips_init_vm_percpu, kvm, 1);
-	}
+	/* Allocate page table to map GPA -> RPA */
+	kvm->arch.gpa_mm.pgd = kvm_pgd_alloc();
+	if (!kvm->arch.gpa_mm.pgd)
+		return -ENOMEM;
 
 	return 0;
 }
@@ -156,13 +118,6 @@ void kvm_mips_free_vcpus(struct kvm *kvm)
 	unsigned int i;
 	struct kvm_vcpu *vcpu;
 
-	/* Put the pages we reserved for the guest pmap */
-	for (i = 0; i < kvm->arch.guest_pmap_npages; i++) {
-		if (kvm->arch.guest_pmap[i] != KVM_INVALID_PAGE)
-			kvm_release_pfn_clean(kvm->arch.guest_pmap[i]);
-	}
-	kfree(kvm->arch.guest_pmap);
-
 	kvm_for_each_vcpu(i, vcpu, kvm) {
 		kvm_arch_vcpu_free(vcpu);
 	}
@@ -177,25 +132,17 @@ void kvm_mips_free_vcpus(struct kvm *kvm)
 	mutex_unlock(&kvm->lock);
 }
 
-static void kvm_mips_uninit_tlbs(void *arg)
+static void kvm_mips_free_gpa_pt(struct kvm *kvm)
 {
-	/* Restore wired count */
-	write_c0_wired(0);
-	mtc0_tlbw_hazard();
-	/* Clear out all the TLBs */
-	kvm_local_flush_tlb_all();
+	/* It should always be safe to remove after flushing the whole range */
+	WARN_ON(!kvm_mips_flush_gpa_pt(kvm, 0, ~0));
+	pgd_free(NULL, kvm->arch.gpa_mm.pgd);
 }
 
 void kvm_arch_destroy_vm(struct kvm *kvm)
 {
 	kvm_mips_free_vcpus(kvm);
-
-	/* If this is the last instance, restore wired count */
-	if (atomic_dec_return(&kvm_mips_instance) == 0) {
-		kvm_debug("%s: last KVM instance, restoring TLB parameters\n",
-			  __func__);
-		on_each_cpu(kvm_mips_uninit_tlbs, NULL, 1);
-	}
+	kvm_mips_free_gpa_pt(kvm);
 }
 
 long kvm_arch_dev_ioctl(struct file *filp, unsigned int ioctl,
@@ -210,6 +157,32 @@ int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot,
 	return 0;
 }
 
+void kvm_arch_flush_shadow_all(struct kvm *kvm)
+{
+	/* Flush whole GPA */
+	kvm_mips_flush_gpa_pt(kvm, 0, ~0);
+
+	/* Let implementation do the rest */
+	kvm_mips_callbacks->flush_shadow_all(kvm);
+}
+
+void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
+				   struct kvm_memory_slot *slot)
+{
+	/*
+	 * The slot has been made invalid (ready for moving or deletion), so we
+	 * need to ensure that it can no longer be accessed by any guest VCPUs.
+	 */
+
+	spin_lock(&kvm->mmu_lock);
+	/* Flush slot from GPA */
+	kvm_mips_flush_gpa_pt(kvm, slot->base_gfn,
+			      slot->base_gfn + slot->npages - 1);
+	/* Let implementation do the rest */
+	kvm_mips_callbacks->flush_shadow_memslot(kvm, slot);
+	spin_unlock(&kvm->mmu_lock);
+}
+
 int kvm_arch_prepare_memory_region(struct kvm *kvm,
 				   struct kvm_memory_slot *memslot,
 				   const struct kvm_userspace_memory_region *mem,
@@ -224,35 +197,32 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
 				   const struct kvm_memory_slot *new,
 				   enum kvm_mr_change change)
 {
-	unsigned long npages = 0;
-	int i;
+	int needs_flush;
 
 	kvm_debug("%s: kvm: %p slot: %d, GPA: %llx, size: %llx, QVA: %llx\n",
 		  __func__, kvm, mem->slot, mem->guest_phys_addr,
 		  mem->memory_size, mem->userspace_addr);
 
-	/* Setup Guest PMAP table */
-	if (!kvm->arch.guest_pmap) {
-		if (mem->slot == 0)
-			npages = mem->memory_size >> PAGE_SHIFT;
-
-		if (npages) {
-			kvm->arch.guest_pmap_npages = npages;
-			kvm->arch.guest_pmap =
-			    kzalloc(npages * sizeof(unsigned long), GFP_KERNEL);
-
-			if (!kvm->arch.guest_pmap) {
-				kvm_err("Failed to allocate guest PMAP\n");
-				return;
-			}
-
-			kvm_debug("Allocated space for Guest PMAP Table (%ld pages) @ %p\n",
-				  npages, kvm->arch.guest_pmap);
-
-			/* Now setup the page table */
-			for (i = 0; i < npages; i++)
-				kvm->arch.guest_pmap[i] = KVM_INVALID_PAGE;
-		}
+	/*
+	 * If dirty page logging is enabled, write protect all pages in the slot
+	 * ready for dirty logging.
+	 *
+	 * There is no need to do this in any of the following cases:
+	 * CREATE:	No dirty mappings will already exist.
+	 * MOVE/DELETE:	The old mappings will already have been cleaned up by
+	 *		kvm_arch_flush_shadow_memslot()
+	 */
+	if (change == KVM_MR_FLAGS_ONLY &&
+	    (!(old->flags & KVM_MEM_LOG_DIRTY_PAGES) &&
+	     new->flags & KVM_MEM_LOG_DIRTY_PAGES)) {
+		spin_lock(&kvm->mmu_lock);
+		/* Write protect GPA page table entries */
+		needs_flush = kvm_mips_mkclean_gpa_pt(kvm, new->base_gfn,
+					new->base_gfn + new->npages - 1);
+		/* Let implementation do the rest */
+		if (needs_flush)
+			kvm_mips_callbacks->flush_shadow_memslot(kvm, new);
+		spin_unlock(&kvm->mmu_lock);
 	}
 }
 
@@ -276,7 +246,7 @@ static inline void dump_handler(const char *symbol, void *start, void *end)
 struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id)
 {
 	int err, size;
-	void *gebase, *p, *handler;
+	void *gebase, *p, *handler, *refill_start, *refill_end;
 	int i;
 
 	struct kvm_vcpu *vcpu = kzalloc(sizeof(struct kvm_vcpu), GFP_KERNEL);
@@ -329,8 +299,9 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id)
 	/* Build guest exception vectors dynamically in unmapped memory */
 	handler = gebase + 0x2000;
 
-	/* TLB Refill, EXL = 0 */
-	kvm_mips_build_exception(gebase, handler);
+	/* TLB refill */
+	refill_start = gebase;
+	refill_end = kvm_mips_build_tlb_refill_exception(refill_start, handler);
 
 	/* General Exception Entry point */
 	kvm_mips_build_exception(gebase + 0x180, handler);
@@ -356,6 +327,7 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id)
 	pr_debug("#include <asm/regdef.h>\n");
 	pr_debug("\n");
 	dump_handler("kvm_vcpu_run", vcpu->arch.vcpu_run, p);
+	dump_handler("kvm_tlb_refill", refill_start, refill_end);
 	dump_handler("kvm_gen_exc", gebase + 0x180, gebase + 0x200);
 	dump_handler("kvm_exit", gebase + 0x2000, vcpu->arch.vcpu_run);
 
@@ -406,6 +378,7 @@ void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu)
 
 	kvm_mips_dump_stats(vcpu);
 
+	kvm_mmu_free_memory_caches(vcpu);
 	kfree(vcpu->arch.guest_ebase);
 	kfree(vcpu->arch.kseg0_commpage);
 	kfree(vcpu);
@@ -422,34 +395,6 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
 	return -ENOIOCTLCMD;
 }
 
-/* Must be called with preemption disabled, just before entering guest */
-static void kvm_mips_check_asids(struct kvm_vcpu *vcpu)
-{
-	struct mips_coproc *cop0 = vcpu->arch.cop0;
-	int i, cpu = smp_processor_id();
-	unsigned int gasid;
-
-	/*
-	 * Lazy host ASID regeneration for guest user mode.
-	 * If the guest ASID has changed since the last guest usermode
-	 * execution, regenerate the host ASID so as to invalidate stale TLB
-	 * entries.
-	 */
-	if (!KVM_GUEST_KERNEL_MODE(vcpu)) {
-		gasid = kvm_read_c0_guest_entryhi(cop0) & KVM_ENTRYHI_ASID;
-		if (gasid != vcpu->arch.last_user_gasid) {
-			kvm_get_new_mmu_context(&vcpu->arch.guest_user_mm, cpu,
-						vcpu);
-			vcpu->arch.guest_user_asid[cpu] =
-				vcpu->arch.guest_user_mm.context.asid[cpu];
-			for_each_possible_cpu(i)
-				if (i != cpu)
-					vcpu->arch.guest_user_asid[cpu] = 0;
-			vcpu->arch.last_user_gasid = gasid;
-		}
-	}
-}
-
 int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
 {
 	int r = 0;
@@ -467,25 +412,20 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
 	lose_fpu(1);
 
 	local_irq_disable();
-	/* Check if we have any exceptions/interrupts pending */
-	kvm_mips_deliver_interrupts(vcpu,
-				    kvm_read_c0_guest_cause(vcpu->arch.cop0));
-
 	guest_enter_irqoff();
-
-	/* Disable hardware page table walking while in guest */
-	htw_stop();
-
 	trace_kvm_enter(vcpu);
 
-	kvm_mips_check_asids(vcpu);
+	/*
+	 * Make sure the read of VCPU requests in vcpu_run() callback is not
+	 * reordered ahead of the write to vcpu->mode, or we could miss a TLB
+	 * flush request while the requester sees the VCPU as outside of guest
+	 * mode and not needing an IPI.
+	 */
+	smp_store_mb(vcpu->mode, IN_GUEST_MODE);
 
-	r = vcpu->arch.vcpu_run(run, vcpu);
-	trace_kvm_out(vcpu);
-
-	/* Re-enable HTW before enabling interrupts */
-	htw_start();
+	r = kvm_mips_callbacks->vcpu_run(run, vcpu);
 
+	trace_kvm_out(vcpu);
 	guest_exit_irqoff();
 	local_irq_enable();
 
@@ -580,33 +520,6 @@ static u64 kvm_mips_get_one_regs[] = {
 	KVM_REG_MIPS_LO,
 #endif
 	KVM_REG_MIPS_PC,
-
-	KVM_REG_MIPS_CP0_INDEX,
-	KVM_REG_MIPS_CP0_CONTEXT,
-	KVM_REG_MIPS_CP0_USERLOCAL,
-	KVM_REG_MIPS_CP0_PAGEMASK,
-	KVM_REG_MIPS_CP0_WIRED,
-	KVM_REG_MIPS_CP0_HWRENA,
-	KVM_REG_MIPS_CP0_BADVADDR,
-	KVM_REG_MIPS_CP0_COUNT,
-	KVM_REG_MIPS_CP0_ENTRYHI,
-	KVM_REG_MIPS_CP0_COMPARE,
-	KVM_REG_MIPS_CP0_STATUS,
-	KVM_REG_MIPS_CP0_CAUSE,
-	KVM_REG_MIPS_CP0_EPC,
-	KVM_REG_MIPS_CP0_PRID,
-	KVM_REG_MIPS_CP0_CONFIG,
-	KVM_REG_MIPS_CP0_CONFIG1,
-	KVM_REG_MIPS_CP0_CONFIG2,
-	KVM_REG_MIPS_CP0_CONFIG3,
-	KVM_REG_MIPS_CP0_CONFIG4,
-	KVM_REG_MIPS_CP0_CONFIG5,
-	KVM_REG_MIPS_CP0_CONFIG7,
-	KVM_REG_MIPS_CP0_ERROREPC,
-
-	KVM_REG_MIPS_COUNT_CTL,
-	KVM_REG_MIPS_COUNT_RESUME,
-	KVM_REG_MIPS_COUNT_HZ,
 };
 
 static u64 kvm_mips_get_one_regs_fpu[] = {
@@ -619,15 +532,6 @@ static u64 kvm_mips_get_one_regs_msa[] = {
 	KVM_REG_MIPS_MSA_CSR,
 };
 
-static u64 kvm_mips_get_one_regs_kscratch[] = {
-	KVM_REG_MIPS_CP0_KSCRATCH1,
-	KVM_REG_MIPS_CP0_KSCRATCH2,
-	KVM_REG_MIPS_CP0_KSCRATCH3,
-	KVM_REG_MIPS_CP0_KSCRATCH4,
-	KVM_REG_MIPS_CP0_KSCRATCH5,
-	KVM_REG_MIPS_CP0_KSCRATCH6,
-};
-
 static unsigned long kvm_mips_num_regs(struct kvm_vcpu *vcpu)
 {
 	unsigned long ret;
@@ -641,7 +545,6 @@ static unsigned long kvm_mips_num_regs(struct kvm_vcpu *vcpu)
 	}
 	if (kvm_mips_guest_can_have_msa(&vcpu->arch))
 		ret += ARRAY_SIZE(kvm_mips_get_one_regs_msa) + 32;
-	ret += __arch_hweight8(vcpu->arch.kscratch_enabled);
 	ret += kvm_mips_callbacks->num_regs(vcpu);
 
 	return ret;
@@ -694,16 +597,6 @@ static int kvm_mips_copy_reg_indices(struct kvm_vcpu *vcpu, u64 __user *indices)
 		}
 	}
 
-	for (i = 0; i < 6; ++i) {
-		if (!(vcpu->arch.kscratch_enabled & BIT(i + 2)))
-			continue;
-
-		if (copy_to_user(indices, &kvm_mips_get_one_regs_kscratch[i],
-				 sizeof(kvm_mips_get_one_regs_kscratch[i])))
-			return -EFAULT;
-		++indices;
-	}
-
 	return kvm_mips_callbacks->copy_reg_indices(vcpu, indices);
 }
 
@@ -794,95 +687,6 @@ static int kvm_mips_get_reg(struct kvm_vcpu *vcpu,
 		v = fpu->msacsr;
 		break;
 
-	/* Co-processor 0 registers */
-	case KVM_REG_MIPS_CP0_INDEX:
-		v = (long)kvm_read_c0_guest_index(cop0);
-		break;
-	case KVM_REG_MIPS_CP0_CONTEXT:
-		v = (long)kvm_read_c0_guest_context(cop0);
-		break;
-	case KVM_REG_MIPS_CP0_USERLOCAL:
-		v = (long)kvm_read_c0_guest_userlocal(cop0);
-		break;
-	case KVM_REG_MIPS_CP0_PAGEMASK:
-		v = (long)kvm_read_c0_guest_pagemask(cop0);
-		break;
-	case KVM_REG_MIPS_CP0_WIRED:
-		v = (long)kvm_read_c0_guest_wired(cop0);
-		break;
-	case KVM_REG_MIPS_CP0_HWRENA:
-		v = (long)kvm_read_c0_guest_hwrena(cop0);
-		break;
-	case KVM_REG_MIPS_CP0_BADVADDR:
-		v = (long)kvm_read_c0_guest_badvaddr(cop0);
-		break;
-	case KVM_REG_MIPS_CP0_ENTRYHI:
-		v = (long)kvm_read_c0_guest_entryhi(cop0);
-		break;
-	case KVM_REG_MIPS_CP0_COMPARE:
-		v = (long)kvm_read_c0_guest_compare(cop0);
-		break;
-	case KVM_REG_MIPS_CP0_STATUS:
-		v = (long)kvm_read_c0_guest_status(cop0);
-		break;
-	case KVM_REG_MIPS_CP0_CAUSE:
-		v = (long)kvm_read_c0_guest_cause(cop0);
-		break;
-	case KVM_REG_MIPS_CP0_EPC:
-		v = (long)kvm_read_c0_guest_epc(cop0);
-		break;
-	case KVM_REG_MIPS_CP0_PRID:
-		v = (long)kvm_read_c0_guest_prid(cop0);
-		break;
-	case KVM_REG_MIPS_CP0_CONFIG:
-		v = (long)kvm_read_c0_guest_config(cop0);
-		break;
-	case KVM_REG_MIPS_CP0_CONFIG1:
-		v = (long)kvm_read_c0_guest_config1(cop0);
-		break;
-	case KVM_REG_MIPS_CP0_CONFIG2:
-		v = (long)kvm_read_c0_guest_config2(cop0);
-		break;
-	case KVM_REG_MIPS_CP0_CONFIG3:
-		v = (long)kvm_read_c0_guest_config3(cop0);
-		break;
-	case KVM_REG_MIPS_CP0_CONFIG4:
-		v = (long)kvm_read_c0_guest_config4(cop0);
-		break;
-	case KVM_REG_MIPS_CP0_CONFIG5:
-		v = (long)kvm_read_c0_guest_config5(cop0);
-		break;
-	case KVM_REG_MIPS_CP0_CONFIG7:
-		v = (long)kvm_read_c0_guest_config7(cop0);
-		break;
-	case KVM_REG_MIPS_CP0_ERROREPC:
-		v = (long)kvm_read_c0_guest_errorepc(cop0);
-		break;
-	case KVM_REG_MIPS_CP0_KSCRATCH1 ... KVM_REG_MIPS_CP0_KSCRATCH6:
-		idx = reg->id - KVM_REG_MIPS_CP0_KSCRATCH1 + 2;
-		if (!(vcpu->arch.kscratch_enabled & BIT(idx)))
-			return -EINVAL;
-		switch (idx) {
-		case 2:
-			v = (long)kvm_read_c0_guest_kscratch1(cop0);
-			break;
-		case 3:
-			v = (long)kvm_read_c0_guest_kscratch2(cop0);
-			break;
-		case 4:
-			v = (long)kvm_read_c0_guest_kscratch3(cop0);
-			break;
-		case 5:
-			v = (long)kvm_read_c0_guest_kscratch4(cop0);
-			break;
-		case 6:
-			v = (long)kvm_read_c0_guest_kscratch5(cop0);
-			break;
-		case 7:
-			v = (long)kvm_read_c0_guest_kscratch6(cop0);
-			break;
-		}
-		break;
 	/* registers to be handled specially */
 	default:
 		ret = kvm_mips_callbacks->get_one_reg(vcpu, reg, &v);
@@ -1014,68 +818,6 @@ static int kvm_mips_set_reg(struct kvm_vcpu *vcpu,
 		fpu->msacsr = v;
 		break;
 
-	/* Co-processor 0 registers */
-	case KVM_REG_MIPS_CP0_INDEX:
-		kvm_write_c0_guest_index(cop0, v);
-		break;
-	case KVM_REG_MIPS_CP0_CONTEXT:
-		kvm_write_c0_guest_context(cop0, v);
-		break;
-	case KVM_REG_MIPS_CP0_USERLOCAL:
-		kvm_write_c0_guest_userlocal(cop0, v);
-		break;
-	case KVM_REG_MIPS_CP0_PAGEMASK:
-		kvm_write_c0_guest_pagemask(cop0, v);
-		break;
-	case KVM_REG_MIPS_CP0_WIRED:
-		kvm_write_c0_guest_wired(cop0, v);
-		break;
-	case KVM_REG_MIPS_CP0_HWRENA:
-		kvm_write_c0_guest_hwrena(cop0, v);
-		break;
-	case KVM_REG_MIPS_CP0_BADVADDR:
-		kvm_write_c0_guest_badvaddr(cop0, v);
-		break;
-	case KVM_REG_MIPS_CP0_ENTRYHI:
-		kvm_write_c0_guest_entryhi(cop0, v);
-		break;
-	case KVM_REG_MIPS_CP0_STATUS:
-		kvm_write_c0_guest_status(cop0, v);
-		break;
-	case KVM_REG_MIPS_CP0_EPC:
-		kvm_write_c0_guest_epc(cop0, v);
-		break;
-	case KVM_REG_MIPS_CP0_PRID:
-		kvm_write_c0_guest_prid(cop0, v);
-		break;
-	case KVM_REG_MIPS_CP0_ERROREPC:
-		kvm_write_c0_guest_errorepc(cop0, v);
-		break;
-	case KVM_REG_MIPS_CP0_KSCRATCH1 ... KVM_REG_MIPS_CP0_KSCRATCH6:
-		idx = reg->id - KVM_REG_MIPS_CP0_KSCRATCH1 + 2;
-		if (!(vcpu->arch.kscratch_enabled & BIT(idx)))
-			return -EINVAL;
-		switch (idx) {
-		case 2:
-			kvm_write_c0_guest_kscratch1(cop0, v);
-			break;
-		case 3:
-			kvm_write_c0_guest_kscratch2(cop0, v);
-			break;
-		case 4:
-			kvm_write_c0_guest_kscratch3(cop0, v);
-			break;
-		case 5:
-			kvm_write_c0_guest_kscratch4(cop0, v);
-			break;
-		case 6:
-			kvm_write_c0_guest_kscratch5(cop0, v);
-			break;
-		case 7:
-			kvm_write_c0_guest_kscratch6(cop0, v);
-			break;
-		}
-		break;
 	/* registers to be handled specially */
 	default:
 		return kvm_mips_callbacks->set_one_reg(vcpu, reg, v);
@@ -1144,18 +886,12 @@ long kvm_arch_vcpu_ioctl(struct file *filp, unsigned int ioctl,
 			return -E2BIG;
 		return kvm_mips_copy_reg_indices(vcpu, user_list->reg);
 	}
-	case KVM_NMI:
-		/* Treat the NMI as a CPU reset */
-		r = kvm_mips_reset_vcpu(vcpu);
-		break;
 	case KVM_INTERRUPT:
 		{
 			struct kvm_mips_interrupt irq;
 
-			r = -EFAULT;
 			if (copy_from_user(&irq, argp, sizeof(irq)))
-				goto out;
-
+				return -EFAULT;
 			kvm_debug("[%d] %s: irq: %d\n", vcpu->vcpu_id, __func__,
 				  irq.irq);
 
@@ -1165,56 +901,57 @@ long kvm_arch_vcpu_ioctl(struct file *filp, unsigned int ioctl,
 	case KVM_ENABLE_CAP: {
 		struct kvm_enable_cap cap;
 
-		r = -EFAULT;
 		if (copy_from_user(&cap, argp, sizeof(cap)))
-			goto out;
+			return -EFAULT;
 		r = kvm_vcpu_ioctl_enable_cap(vcpu, &cap);
 		break;
 	}
 	default:
 		r = -ENOIOCTLCMD;
 	}
-
-out:
 	return r;
 }
 
-/* Get (and clear) the dirty memory log for a memory slot. */
+/**
+ * kvm_vm_ioctl_get_dirty_log - get and clear the log of dirty pages in a slot
+ * @kvm: kvm instance
+ * @log: slot id and address to which we copy the log
+ *
+ * Steps 1-4 below provide general overview of dirty page logging. See
+ * kvm_get_dirty_log_protect() function description for additional details.
+ *
+ * We call kvm_get_dirty_log_protect() to handle steps 1-3, upon return we
+ * always flush the TLB (step 4) even if previous step failed  and the dirty
+ * bitmap may be corrupt. Regardless of previous outcome the KVM logging API
+ * does not preclude user space subsequent dirty log read. Flushing TLB ensures
+ * writes will be marked dirty for next log read.
+ *
+ *   1. Take a snapshot of the bit and clear it if needed.
+ *   2. Write protect the corresponding page.
+ *   3. Copy the snapshot to the userspace.
+ *   4. Flush TLB's if needed.
+ */
 int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log)
 {
 	struct kvm_memslots *slots;
 	struct kvm_memory_slot *memslot;
-	unsigned long ga, ga_end;
-	int is_dirty = 0;
+	bool is_dirty = false;
 	int r;
-	unsigned long n;
 
 	mutex_lock(&kvm->slots_lock);
 
-	r = kvm_get_dirty_log(kvm, log, &is_dirty);
-	if (r)
-		goto out;
+	r = kvm_get_dirty_log_protect(kvm, log, &is_dirty);
 
-	/* If nothing is dirty, don't bother messing with page tables. */
 	if (is_dirty) {
 		slots = kvm_memslots(kvm);
 		memslot = id_to_memslot(slots, log->slot);
 
-		ga = memslot->base_gfn << PAGE_SHIFT;
-		ga_end = ga + (memslot->npages << PAGE_SHIFT);
-
-		kvm_info("%s: dirty, ga: %#lx, ga_end %#lx\n", __func__, ga,
-			 ga_end);
-
-		n = kvm_dirty_bitmap_bytes(memslot);
-		memset(memslot->dirty_bitmap, 0, n);
+		/* Let implementation handle TLB/GVA invalidation */
+		kvm_mips_callbacks->flush_shadow_memslot(kvm, memslot);
 	}
 
-	r = 0;
-out:
 	mutex_unlock(&kvm->slots_lock);
 	return r;
-
 }
 
 long kvm_arch_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg)
@@ -1282,11 +1019,19 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
 	switch (ext) {
 	case KVM_CAP_ONE_REG:
 	case KVM_CAP_ENABLE_CAP:
+	case KVM_CAP_READONLY_MEM:
+	case KVM_CAP_SYNC_MMU:
 		r = 1;
 		break;
 	case KVM_CAP_COALESCED_MMIO:
 		r = KVM_COALESCED_MMIO_PAGE_OFFSET;
 		break;
+	case KVM_CAP_NR_VCPUS:
+		r = num_online_cpus();
+		break;
+	case KVM_CAP_MAX_VCPUS:
+		r = KVM_MAX_VCPUS;
+		break;
 	case KVM_CAP_MIPS_FPU:
 		/* We don't handle systems with inconsistent cpu_has_fpu */
 		r = !!raw_cpu_has_fpu;
@@ -1400,13 +1145,23 @@ static enum hrtimer_restart kvm_mips_comparecount_wakeup(struct hrtimer *timer)
 
 int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
 {
-	kvm_mips_callbacks->vcpu_init(vcpu);
+	int err;
+
+	err = kvm_mips_callbacks->vcpu_init(vcpu);
+	if (err)
+		return err;
+
 	hrtimer_init(&vcpu->arch.comparecount_timer, CLOCK_MONOTONIC,
 		     HRTIMER_MODE_REL);
 	vcpu->arch.comparecount_timer.function = kvm_mips_comparecount_wakeup;
 	return 0;
 }
 
+void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
+{
+	kvm_mips_callbacks->vcpu_uninit(vcpu);
+}
+
 int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
 				  struct kvm_translation *tr)
 {
@@ -1440,8 +1195,11 @@ int kvm_mips_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu)
 	u32 __user *opc = (u32 __user *) vcpu->arch.pc;
 	unsigned long badvaddr = vcpu->arch.host_cp0_badvaddr;
 	enum emulation_result er = EMULATE_DONE;
+	u32 inst;
 	int ret = RESUME_GUEST;
 
+	vcpu->mode = OUTSIDE_GUEST_MODE;
+
 	/* re-enable HTW before enabling interrupts */
 	htw_start();
 
@@ -1564,8 +1322,12 @@ int kvm_mips_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu)
 		break;
 
 	default:
+		if (cause & CAUSEF_BD)
+			opc += 1;
+		inst = 0;
+		kvm_get_badinstr(opc, vcpu, &inst);
 		kvm_err("Exception Code: %d, not yet handled, @ PC: %p, inst: 0x%08x  BadVaddr: %#lx Status: %#lx\n",
-			exccode, opc, kvm_get_inst(opc, vcpu), badvaddr,
+			exccode, opc, inst, badvaddr,
 			kvm_read_c0_guest_status(vcpu->arch.cop0));
 		kvm_arch_vcpu_dump_regs(vcpu);
 		run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
@@ -1593,7 +1355,15 @@ skip_emul:
 	if (ret == RESUME_GUEST) {
 		trace_kvm_reenter(vcpu);
 
-		kvm_mips_check_asids(vcpu);
+		/*
+		 * Make sure the read of VCPU requests in vcpu_reenter()
+		 * callback is not reordered ahead of the write to vcpu->mode,
+		 * or we could miss a TLB flush request while the requester sees
+		 * the VCPU as outside of guest mode and not needing an IPI.
+		 */
+		smp_store_mb(vcpu->mode, IN_GUEST_MODE);
+
+		kvm_mips_callbacks->vcpu_reenter(run, vcpu);
 
 		/*
 		 * If FPU / MSA are enabled (i.e. the guest's FPU / MSA context
diff --git a/arch/mips/kvm/mmu.c b/arch/mips/kvm/mmu.c
index 3b677c851be0..cb0faade311e 100644
--- a/arch/mips/kvm/mmu.c
+++ b/arch/mips/kvm/mmu.c
@@ -11,86 +11,995 @@
 
 #include <linux/highmem.h>
 #include <linux/kvm_host.h>
+#include <linux/uaccess.h>
 #include <asm/mmu_context.h>
+#include <asm/pgalloc.h>
 
-static u32 kvm_mips_get_kernel_asid(struct kvm_vcpu *vcpu)
+/*
+ * KVM_MMU_CACHE_MIN_PAGES is the number of GPA page table translation levels
+ * for which pages need to be cached.
+ */
+#if defined(__PAGETABLE_PMD_FOLDED)
+#define KVM_MMU_CACHE_MIN_PAGES 1
+#else
+#define KVM_MMU_CACHE_MIN_PAGES 2
+#endif
+
+static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
+				  int min, int max)
 {
-	int cpu = smp_processor_id();
+	void *page;
+
+	BUG_ON(max > KVM_NR_MEM_OBJS);
+	if (cache->nobjs >= min)
+		return 0;
+	while (cache->nobjs < max) {
+		page = (void *)__get_free_page(GFP_KERNEL);
+		if (!page)
+			return -ENOMEM;
+		cache->objects[cache->nobjs++] = page;
+	}
+	return 0;
+}
 
-	return vcpu->arch.guest_kernel_asid[cpu] &
-			cpu_asid_mask(&cpu_data[cpu]);
+static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc)
+{
+	while (mc->nobjs)
+		free_page((unsigned long)mc->objects[--mc->nobjs]);
 }
 
-static u32 kvm_mips_get_user_asid(struct kvm_vcpu *vcpu)
+static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc)
 {
-	int cpu = smp_processor_id();
+	void *p;
 
-	return vcpu->arch.guest_user_asid[cpu] &
-			cpu_asid_mask(&cpu_data[cpu]);
+	BUG_ON(!mc || !mc->nobjs);
+	p = mc->objects[--mc->nobjs];
+	return p;
 }
 
-static int kvm_mips_map_page(struct kvm *kvm, gfn_t gfn)
+void kvm_mmu_free_memory_caches(struct kvm_vcpu *vcpu)
 {
-	int srcu_idx, err = 0;
-	kvm_pfn_t pfn;
+	mmu_free_memory_cache(&vcpu->arch.mmu_page_cache);
+}
+
+/**
+ * kvm_pgd_init() - Initialise KVM GPA page directory.
+ * @page:	Pointer to page directory (PGD) for KVM GPA.
+ *
+ * Initialise a KVM GPA page directory with pointers to the invalid table, i.e.
+ * representing no mappings. This is similar to pgd_init(), however it
+ * initialises all the page directory pointers, not just the ones corresponding
+ * to the userland address space (since it is for the guest physical address
+ * space rather than a virtual address space).
+ */
+static void kvm_pgd_init(void *page)
+{
+	unsigned long *p, *end;
+	unsigned long entry;
+
+#ifdef __PAGETABLE_PMD_FOLDED
+	entry = (unsigned long)invalid_pte_table;
+#else
+	entry = (unsigned long)invalid_pmd_table;
+#endif
+
+	p = (unsigned long *)page;
+	end = p + PTRS_PER_PGD;
+
+	do {
+		p[0] = entry;
+		p[1] = entry;
+		p[2] = entry;
+		p[3] = entry;
+		p[4] = entry;
+		p += 8;
+		p[-3] = entry;
+		p[-2] = entry;
+		p[-1] = entry;
+	} while (p != end);
+}
+
+/**
+ * kvm_pgd_alloc() - Allocate and initialise a KVM GPA page directory.
+ *
+ * Allocate a blank KVM GPA page directory (PGD) for representing guest physical
+ * to host physical page mappings.
+ *
+ * Returns:	Pointer to new KVM GPA page directory.
+ *		NULL on allocation failure.
+ */
+pgd_t *kvm_pgd_alloc(void)
+{
+	pgd_t *ret;
+
+	ret = (pgd_t *)__get_free_pages(GFP_KERNEL, PGD_ORDER);
+	if (ret)
+		kvm_pgd_init(ret);
+
+	return ret;
+}
+
+/**
+ * kvm_mips_walk_pgd() - Walk page table with optional allocation.
+ * @pgd:	Page directory pointer.
+ * @addr:	Address to index page table using.
+ * @cache:	MMU page cache to allocate new page tables from, or NULL.
+ *
+ * Walk the page tables pointed to by @pgd to find the PTE corresponding to the
+ * address @addr. If page tables don't exist for @addr, they will be created
+ * from the MMU cache if @cache is not NULL.
+ *
+ * Returns:	Pointer to pte_t corresponding to @addr.
+ *		NULL if a page table doesn't exist for @addr and !@cache.
+ *		NULL if a page table allocation failed.
+ */
+static pte_t *kvm_mips_walk_pgd(pgd_t *pgd, struct kvm_mmu_memory_cache *cache,
+				unsigned long addr)
+{
+	pud_t *pud;
+	pmd_t *pmd;
+
+	pgd += pgd_index(addr);
+	if (pgd_none(*pgd)) {
+		/* Not used on MIPS yet */
+		BUG();
+		return NULL;
+	}
+	pud = pud_offset(pgd, addr);
+	if (pud_none(*pud)) {
+		pmd_t *new_pmd;
+
+		if (!cache)
+			return NULL;
+		new_pmd = mmu_memory_cache_alloc(cache);
+		pmd_init((unsigned long)new_pmd,
+			 (unsigned long)invalid_pte_table);
+		pud_populate(NULL, pud, new_pmd);
+	}
+	pmd = pmd_offset(pud, addr);
+	if (pmd_none(*pmd)) {
+		pte_t *new_pte;
+
+		if (!cache)
+			return NULL;
+		new_pte = mmu_memory_cache_alloc(cache);
+		clear_page(new_pte);
+		pmd_populate_kernel(NULL, pmd, new_pte);
+	}
+	return pte_offset(pmd, addr);
+}
+
+/* Caller must hold kvm->mm_lock */
+static pte_t *kvm_mips_pte_for_gpa(struct kvm *kvm,
+				   struct kvm_mmu_memory_cache *cache,
+				   unsigned long addr)
+{
+	return kvm_mips_walk_pgd(kvm->arch.gpa_mm.pgd, cache, addr);
+}
+
+/*
+ * kvm_mips_flush_gpa_{pte,pmd,pud,pgd,pt}.
+ * Flush a range of guest physical address space from the VM's GPA page tables.
+ */
+
+static bool kvm_mips_flush_gpa_pte(pte_t *pte, unsigned long start_gpa,
+				   unsigned long end_gpa)
+{
+	int i_min = __pte_offset(start_gpa);
+	int i_max = __pte_offset(end_gpa);
+	bool safe_to_remove = (i_min == 0 && i_max == PTRS_PER_PTE - 1);
+	int i;
+
+	for (i = i_min; i <= i_max; ++i) {
+		if (!pte_present(pte[i]))
+			continue;
+
+		set_pte(pte + i, __pte(0));
+	}
+	return safe_to_remove;
+}
+
+static bool kvm_mips_flush_gpa_pmd(pmd_t *pmd, unsigned long start_gpa,
+				   unsigned long end_gpa)
+{
+	pte_t *pte;
+	unsigned long end = ~0ul;
+	int i_min = __pmd_offset(start_gpa);
+	int i_max = __pmd_offset(end_gpa);
+	bool safe_to_remove = (i_min == 0 && i_max == PTRS_PER_PMD - 1);
+	int i;
+
+	for (i = i_min; i <= i_max; ++i, start_gpa = 0) {
+		if (!pmd_present(pmd[i]))
+			continue;
+
+		pte = pte_offset(pmd + i, 0);
+		if (i == i_max)
+			end = end_gpa;
+
+		if (kvm_mips_flush_gpa_pte(pte, start_gpa, end)) {
+			pmd_clear(pmd + i);
+			pte_free_kernel(NULL, pte);
+		} else {
+			safe_to_remove = false;
+		}
+	}
+	return safe_to_remove;
+}
+
+static bool kvm_mips_flush_gpa_pud(pud_t *pud, unsigned long start_gpa,
+				   unsigned long end_gpa)
+{
+	pmd_t *pmd;
+	unsigned long end = ~0ul;
+	int i_min = __pud_offset(start_gpa);
+	int i_max = __pud_offset(end_gpa);
+	bool safe_to_remove = (i_min == 0 && i_max == PTRS_PER_PUD - 1);
+	int i;
+
+	for (i = i_min; i <= i_max; ++i, start_gpa = 0) {
+		if (!pud_present(pud[i]))
+			continue;
+
+		pmd = pmd_offset(pud + i, 0);
+		if (i == i_max)
+			end = end_gpa;
+
+		if (kvm_mips_flush_gpa_pmd(pmd, start_gpa, end)) {
+			pud_clear(pud + i);
+			pmd_free(NULL, pmd);
+		} else {
+			safe_to_remove = false;
+		}
+	}
+	return safe_to_remove;
+}
+
+static bool kvm_mips_flush_gpa_pgd(pgd_t *pgd, unsigned long start_gpa,
+				   unsigned long end_gpa)
+{
+	pud_t *pud;
+	unsigned long end = ~0ul;
+	int i_min = pgd_index(start_gpa);
+	int i_max = pgd_index(end_gpa);
+	bool safe_to_remove = (i_min == 0 && i_max == PTRS_PER_PGD - 1);
+	int i;
+
+	for (i = i_min; i <= i_max; ++i, start_gpa = 0) {
+		if (!pgd_present(pgd[i]))
+			continue;
+
+		pud = pud_offset(pgd + i, 0);
+		if (i == i_max)
+			end = end_gpa;
+
+		if (kvm_mips_flush_gpa_pud(pud, start_gpa, end)) {
+			pgd_clear(pgd + i);
+			pud_free(NULL, pud);
+		} else {
+			safe_to_remove = false;
+		}
+	}
+	return safe_to_remove;
+}
+
+/**
+ * kvm_mips_flush_gpa_pt() - Flush a range of guest physical addresses.
+ * @kvm:	KVM pointer.
+ * @start_gfn:	Guest frame number of first page in GPA range to flush.
+ * @end_gfn:	Guest frame number of last page in GPA range to flush.
+ *
+ * Flushes a range of GPA mappings from the GPA page tables.
+ *
+ * The caller must hold the @kvm->mmu_lock spinlock.
+ *
+ * Returns:	Whether its safe to remove the top level page directory because
+ *		all lower levels have been removed.
+ */
+bool kvm_mips_flush_gpa_pt(struct kvm *kvm, gfn_t start_gfn, gfn_t end_gfn)
+{
+	return kvm_mips_flush_gpa_pgd(kvm->arch.gpa_mm.pgd,
+				      start_gfn << PAGE_SHIFT,
+				      end_gfn << PAGE_SHIFT);
+}
+
+#define BUILD_PTE_RANGE_OP(name, op)					\
+static int kvm_mips_##name##_pte(pte_t *pte, unsigned long start,	\
+				 unsigned long end)			\
+{									\
+	int ret = 0;							\
+	int i_min = __pte_offset(start);				\
+	int i_max = __pte_offset(end);					\
+	int i;								\
+	pte_t old, new;							\
+									\
+	for (i = i_min; i <= i_max; ++i) {				\
+		if (!pte_present(pte[i]))				\
+			continue;					\
+									\
+		old = pte[i];						\
+		new = op(old);						\
+		if (pte_val(new) == pte_val(old))			\
+			continue;					\
+		set_pte(pte + i, new);					\
+		ret = 1;						\
+	}								\
+	return ret;							\
+}									\
+									\
+/* returns true if anything was done */					\
+static int kvm_mips_##name##_pmd(pmd_t *pmd, unsigned long start,	\
+				 unsigned long end)			\
+{									\
+	int ret = 0;							\
+	pte_t *pte;							\
+	unsigned long cur_end = ~0ul;					\
+	int i_min = __pmd_offset(start);				\
+	int i_max = __pmd_offset(end);					\
+	int i;								\
+									\
+	for (i = i_min; i <= i_max; ++i, start = 0) {			\
+		if (!pmd_present(pmd[i]))				\
+			continue;					\
+									\
+		pte = pte_offset(pmd + i, 0);				\
+		if (i == i_max)						\
+			cur_end = end;					\
+									\
+		ret |= kvm_mips_##name##_pte(pte, start, cur_end);	\
+	}								\
+	return ret;							\
+}									\
+									\
+static int kvm_mips_##name##_pud(pud_t *pud, unsigned long start,	\
+				 unsigned long end)			\
+{									\
+	int ret = 0;							\
+	pmd_t *pmd;							\
+	unsigned long cur_end = ~0ul;					\
+	int i_min = __pud_offset(start);				\
+	int i_max = __pud_offset(end);					\
+	int i;								\
+									\
+	for (i = i_min; i <= i_max; ++i, start = 0) {			\
+		if (!pud_present(pud[i]))				\
+			continue;					\
+									\
+		pmd = pmd_offset(pud + i, 0);				\
+		if (i == i_max)						\
+			cur_end = end;					\
+									\
+		ret |= kvm_mips_##name##_pmd(pmd, start, cur_end);	\
+	}								\
+	return ret;							\
+}									\
+									\
+static int kvm_mips_##name##_pgd(pgd_t *pgd, unsigned long start,	\
+				 unsigned long end)			\
+{									\
+	int ret = 0;							\
+	pud_t *pud;							\
+	unsigned long cur_end = ~0ul;					\
+	int i_min = pgd_index(start);					\
+	int i_max = pgd_index(end);					\
+	int i;								\
+									\
+	for (i = i_min; i <= i_max; ++i, start = 0) {			\
+		if (!pgd_present(pgd[i]))				\
+			continue;					\
+									\
+		pud = pud_offset(pgd + i, 0);				\
+		if (i == i_max)						\
+			cur_end = end;					\
+									\
+		ret |= kvm_mips_##name##_pud(pud, start, cur_end);	\
+	}								\
+	return ret;							\
+}
+
+/*
+ * kvm_mips_mkclean_gpa_pt.
+ * Mark a range of guest physical address space clean (writes fault) in the VM's
+ * GPA page table to allow dirty page tracking.
+ */
 
-	if (kvm->arch.guest_pmap[gfn] != KVM_INVALID_PAGE)
+BUILD_PTE_RANGE_OP(mkclean, pte_mkclean)
+
+/**
+ * kvm_mips_mkclean_gpa_pt() - Make a range of guest physical addresses clean.
+ * @kvm:	KVM pointer.
+ * @start_gfn:	Guest frame number of first page in GPA range to flush.
+ * @end_gfn:	Guest frame number of last page in GPA range to flush.
+ *
+ * Make a range of GPA mappings clean so that guest writes will fault and
+ * trigger dirty page logging.
+ *
+ * The caller must hold the @kvm->mmu_lock spinlock.
+ *
+ * Returns:	Whether any GPA mappings were modified, which would require
+ *		derived mappings (GVA page tables & TLB enties) to be
+ *		invalidated.
+ */
+int kvm_mips_mkclean_gpa_pt(struct kvm *kvm, gfn_t start_gfn, gfn_t end_gfn)
+{
+	return kvm_mips_mkclean_pgd(kvm->arch.gpa_mm.pgd,
+				    start_gfn << PAGE_SHIFT,
+				    end_gfn << PAGE_SHIFT);
+}
+
+/**
+ * kvm_arch_mmu_enable_log_dirty_pt_masked() - write protect dirty pages
+ * @kvm:	The KVM pointer
+ * @slot:	The memory slot associated with mask
+ * @gfn_offset:	The gfn offset in memory slot
+ * @mask:	The mask of dirty pages at offset 'gfn_offset' in this memory
+ *		slot to be write protected
+ *
+ * Walks bits set in mask write protects the associated pte's. Caller must
+ * acquire @kvm->mmu_lock.
+ */
+void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
+		struct kvm_memory_slot *slot,
+		gfn_t gfn_offset, unsigned long mask)
+{
+	gfn_t base_gfn = slot->base_gfn + gfn_offset;
+	gfn_t start = base_gfn +  __ffs(mask);
+	gfn_t end = base_gfn + __fls(mask);
+
+	kvm_mips_mkclean_gpa_pt(kvm, start, end);
+}
+
+/*
+ * kvm_mips_mkold_gpa_pt.
+ * Mark a range of guest physical address space old (all accesses fault) in the
+ * VM's GPA page table to allow detection of commonly used pages.
+ */
+
+BUILD_PTE_RANGE_OP(mkold, pte_mkold)
+
+static int kvm_mips_mkold_gpa_pt(struct kvm *kvm, gfn_t start_gfn,
+				 gfn_t end_gfn)
+{
+	return kvm_mips_mkold_pgd(kvm->arch.gpa_mm.pgd,
+				  start_gfn << PAGE_SHIFT,
+				  end_gfn << PAGE_SHIFT);
+}
+
+static int handle_hva_to_gpa(struct kvm *kvm,
+			     unsigned long start,
+			     unsigned long end,
+			     int (*handler)(struct kvm *kvm, gfn_t gfn,
+					    gpa_t gfn_end,
+					    struct kvm_memory_slot *memslot,
+					    void *data),
+			     void *data)
+{
+	struct kvm_memslots *slots;
+	struct kvm_memory_slot *memslot;
+	int ret = 0;
+
+	slots = kvm_memslots(kvm);
+
+	/* we only care about the pages that the guest sees */
+	kvm_for_each_memslot(memslot, slots) {
+		unsigned long hva_start, hva_end;
+		gfn_t gfn, gfn_end;
+
+		hva_start = max(start, memslot->userspace_addr);
+		hva_end = min(end, memslot->userspace_addr +
+					(memslot->npages << PAGE_SHIFT));
+		if (hva_start >= hva_end)
+			continue;
+
+		/*
+		 * {gfn(page) | page intersects with [hva_start, hva_end)} =
+		 * {gfn_start, gfn_start+1, ..., gfn_end-1}.
+		 */
+		gfn = hva_to_gfn_memslot(hva_start, memslot);
+		gfn_end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, memslot);
+
+		ret |= handler(kvm, gfn, gfn_end, memslot, data);
+	}
+
+	return ret;
+}
+
+
+static int kvm_unmap_hva_handler(struct kvm *kvm, gfn_t gfn, gfn_t gfn_end,
+				 struct kvm_memory_slot *memslot, void *data)
+{
+	kvm_mips_flush_gpa_pt(kvm, gfn, gfn_end);
+	return 1;
+}
+
+int kvm_unmap_hva(struct kvm *kvm, unsigned long hva)
+{
+	unsigned long end = hva + PAGE_SIZE;
+
+	handle_hva_to_gpa(kvm, hva, end, &kvm_unmap_hva_handler, NULL);
+
+	kvm_mips_callbacks->flush_shadow_all(kvm);
+	return 0;
+}
+
+int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end)
+{
+	handle_hva_to_gpa(kvm, start, end, &kvm_unmap_hva_handler, NULL);
+
+	kvm_mips_callbacks->flush_shadow_all(kvm);
+	return 0;
+}
+
+static int kvm_set_spte_handler(struct kvm *kvm, gfn_t gfn, gfn_t gfn_end,
+				struct kvm_memory_slot *memslot, void *data)
+{
+	gpa_t gpa = gfn << PAGE_SHIFT;
+	pte_t hva_pte = *(pte_t *)data;
+	pte_t *gpa_pte = kvm_mips_pte_for_gpa(kvm, NULL, gpa);
+	pte_t old_pte;
+
+	if (!gpa_pte)
+		return 0;
+
+	/* Mapping may need adjusting depending on memslot flags */
+	old_pte = *gpa_pte;
+	if (memslot->flags & KVM_MEM_LOG_DIRTY_PAGES && !pte_dirty(old_pte))
+		hva_pte = pte_mkclean(hva_pte);
+	else if (memslot->flags & KVM_MEM_READONLY)
+		hva_pte = pte_wrprotect(hva_pte);
+
+	set_pte(gpa_pte, hva_pte);
+
+	/* Replacing an absent or old page doesn't need flushes */
+	if (!pte_present(old_pte) || !pte_young(old_pte))
 		return 0;
 
+	/* Pages swapped, aged, moved, or cleaned require flushes */
+	return !pte_present(hva_pte) ||
+	       !pte_young(hva_pte) ||
+	       pte_pfn(old_pte) != pte_pfn(hva_pte) ||
+	       (pte_dirty(old_pte) && !pte_dirty(hva_pte));
+}
+
+void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte)
+{
+	unsigned long end = hva + PAGE_SIZE;
+	int ret;
+
+	ret = handle_hva_to_gpa(kvm, hva, end, &kvm_set_spte_handler, &pte);
+	if (ret)
+		kvm_mips_callbacks->flush_shadow_all(kvm);
+}
+
+static int kvm_age_hva_handler(struct kvm *kvm, gfn_t gfn, gfn_t gfn_end,
+			       struct kvm_memory_slot *memslot, void *data)
+{
+	return kvm_mips_mkold_gpa_pt(kvm, gfn, gfn_end);
+}
+
+static int kvm_test_age_hva_handler(struct kvm *kvm, gfn_t gfn, gfn_t gfn_end,
+				    struct kvm_memory_slot *memslot, void *data)
+{
+	gpa_t gpa = gfn << PAGE_SHIFT;
+	pte_t *gpa_pte = kvm_mips_pte_for_gpa(kvm, NULL, gpa);
+
+	if (!gpa_pte)
+		return 0;
+	return pte_young(*gpa_pte);
+}
+
+int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end)
+{
+	return handle_hva_to_gpa(kvm, start, end, kvm_age_hva_handler, NULL);
+}
+
+int kvm_test_age_hva(struct kvm *kvm, unsigned long hva)
+{
+	return handle_hva_to_gpa(kvm, hva, hva, kvm_test_age_hva_handler, NULL);
+}
+
+/**
+ * _kvm_mips_map_page_fast() - Fast path GPA fault handler.
+ * @vcpu:		VCPU pointer.
+ * @gpa:		Guest physical address of fault.
+ * @write_fault:	Whether the fault was due to a write.
+ * @out_entry:		New PTE for @gpa (written on success unless NULL).
+ * @out_buddy:		New PTE for @gpa's buddy (written on success unless
+ *			NULL).
+ *
+ * Perform fast path GPA fault handling, doing all that can be done without
+ * calling into KVM. This handles marking old pages young (for idle page
+ * tracking), and dirtying of clean pages (for dirty page logging).
+ *
+ * Returns:	0 on success, in which case we can update derived mappings and
+ *		resume guest execution.
+ *		-EFAULT on failure due to absent GPA mapping or write to
+ *		read-only page, in which case KVM must be consulted.
+ */
+static int _kvm_mips_map_page_fast(struct kvm_vcpu *vcpu, unsigned long gpa,
+				   bool write_fault,
+				   pte_t *out_entry, pte_t *out_buddy)
+{
+	struct kvm *kvm = vcpu->kvm;
+	gfn_t gfn = gpa >> PAGE_SHIFT;
+	pte_t *ptep;
+	kvm_pfn_t pfn = 0;	/* silence bogus GCC warning */
+	bool pfn_valid = false;
+	int ret = 0;
+
+	spin_lock(&kvm->mmu_lock);
+
+	/* Fast path - just check GPA page table for an existing entry */
+	ptep = kvm_mips_pte_for_gpa(kvm, NULL, gpa);
+	if (!ptep || !pte_present(*ptep)) {
+		ret = -EFAULT;
+		goto out;
+	}
+
+	/* Track access to pages marked old */
+	if (!pte_young(*ptep)) {
+		set_pte(ptep, pte_mkyoung(*ptep));
+		pfn = pte_pfn(*ptep);
+		pfn_valid = true;
+		/* call kvm_set_pfn_accessed() after unlock */
+	}
+	if (write_fault && !pte_dirty(*ptep)) {
+		if (!pte_write(*ptep)) {
+			ret = -EFAULT;
+			goto out;
+		}
+
+		/* Track dirtying of writeable pages */
+		set_pte(ptep, pte_mkdirty(*ptep));
+		pfn = pte_pfn(*ptep);
+		mark_page_dirty(kvm, gfn);
+		kvm_set_pfn_dirty(pfn);
+	}
+
+	if (out_entry)
+		*out_entry = *ptep;
+	if (out_buddy)
+		*out_buddy = *ptep_buddy(ptep);
+
+out:
+	spin_unlock(&kvm->mmu_lock);
+	if (pfn_valid)
+		kvm_set_pfn_accessed(pfn);
+	return ret;
+}
+
+/**
+ * kvm_mips_map_page() - Map a guest physical page.
+ * @vcpu:		VCPU pointer.
+ * @gpa:		Guest physical address of fault.
+ * @write_fault:	Whether the fault was due to a write.
+ * @out_entry:		New PTE for @gpa (written on success unless NULL).
+ * @out_buddy:		New PTE for @gpa's buddy (written on success unless
+ *			NULL).
+ *
+ * Handle GPA faults by creating a new GPA mapping (or updating an existing
+ * one).
+ *
+ * This takes care of marking pages young or dirty (idle/dirty page tracking),
+ * asking KVM for the corresponding PFN, and creating a mapping in the GPA page
+ * tables. Derived mappings (GVA page tables and TLBs) must be handled by the
+ * caller.
+ *
+ * Returns:	0 on success, in which case the caller may use the @out_entry
+ *		and @out_buddy PTEs to update derived mappings and resume guest
+ *		execution.
+ *		-EFAULT if there is no memory region at @gpa or a write was
+ *		attempted to a read-only memory region. This is usually handled
+ *		as an MMIO access.
+ */
+static int kvm_mips_map_page(struct kvm_vcpu *vcpu, unsigned long gpa,
+			     bool write_fault,
+			     pte_t *out_entry, pte_t *out_buddy)
+{
+	struct kvm *kvm = vcpu->kvm;
+	struct kvm_mmu_memory_cache *memcache = &vcpu->arch.mmu_page_cache;
+	gfn_t gfn = gpa >> PAGE_SHIFT;
+	int srcu_idx, err;
+	kvm_pfn_t pfn;
+	pte_t *ptep, entry, old_pte;
+	bool writeable;
+	unsigned long prot_bits;
+	unsigned long mmu_seq;
+
+	/* Try the fast path to handle old / clean pages */
 	srcu_idx = srcu_read_lock(&kvm->srcu);
-	pfn = gfn_to_pfn(kvm, gfn);
+	err = _kvm_mips_map_page_fast(vcpu, gpa, write_fault, out_entry,
+				      out_buddy);
+	if (!err)
+		goto out;
 
+	/* We need a minimum of cached pages ready for page table creation */
+	err = mmu_topup_memory_cache(memcache, KVM_MMU_CACHE_MIN_PAGES,
+				     KVM_NR_MEM_OBJS);
+	if (err)
+		goto out;
+
+retry:
+	/*
+	 * Used to check for invalidations in progress, of the pfn that is
+	 * returned by pfn_to_pfn_prot below.
+	 */
+	mmu_seq = kvm->mmu_notifier_seq;
+	/*
+	 * Ensure the read of mmu_notifier_seq isn't reordered with PTE reads in
+	 * gfn_to_pfn_prot() (which calls get_user_pages()), so that we don't
+	 * risk the page we get a reference to getting unmapped before we have a
+	 * chance to grab the mmu_lock without mmu_notifier_retry() noticing.
+	 *
+	 * This smp_rmb() pairs with the effective smp_wmb() of the combination
+	 * of the pte_unmap_unlock() after the PTE is zapped, and the
+	 * spin_lock() in kvm_mmu_notifier_invalidate_<page|range_end>() before
+	 * mmu_notifier_seq is incremented.
+	 */
+	smp_rmb();
+
+	/* Slow path - ask KVM core whether we can access this GPA */
+	pfn = gfn_to_pfn_prot(kvm, gfn, write_fault, &writeable);
 	if (is_error_noslot_pfn(pfn)) {
-		kvm_err("Couldn't get pfn for gfn %#llx!\n", gfn);
 		err = -EFAULT;
 		goto out;
 	}
 
-	kvm->arch.guest_pmap[gfn] = pfn;
+	spin_lock(&kvm->mmu_lock);
+	/* Check if an invalidation has taken place since we got pfn */
+	if (mmu_notifier_retry(kvm, mmu_seq)) {
+		/*
+		 * This can happen when mappings are changed asynchronously, but
+		 * also synchronously if a COW is triggered by
+		 * gfn_to_pfn_prot().
+		 */
+		spin_unlock(&kvm->mmu_lock);
+		kvm_release_pfn_clean(pfn);
+		goto retry;
+	}
+
+	/* Ensure page tables are allocated */
+	ptep = kvm_mips_pte_for_gpa(kvm, memcache, gpa);
+
+	/* Set up the PTE */
+	prot_bits = _PAGE_PRESENT | __READABLE | _page_cachable_default;
+	if (writeable) {
+		prot_bits |= _PAGE_WRITE;
+		if (write_fault) {
+			prot_bits |= __WRITEABLE;
+			mark_page_dirty(kvm, gfn);
+			kvm_set_pfn_dirty(pfn);
+		}
+	}
+	entry = pfn_pte(pfn, __pgprot(prot_bits));
+
+	/* Write the PTE */
+	old_pte = *ptep;
+	set_pte(ptep, entry);
+
+	err = 0;
+	if (out_entry)
+		*out_entry = *ptep;
+	if (out_buddy)
+		*out_buddy = *ptep_buddy(ptep);
+
+	spin_unlock(&kvm->mmu_lock);
+	kvm_release_pfn_clean(pfn);
+	kvm_set_pfn_accessed(pfn);
 out:
 	srcu_read_unlock(&kvm->srcu, srcu_idx);
 	return err;
 }
 
-/* Translate guest KSEG0 addresses to Host PA */
-unsigned long kvm_mips_translate_guest_kseg0_to_hpa(struct kvm_vcpu *vcpu,
-						    unsigned long gva)
+static pte_t *kvm_trap_emul_pte_for_gva(struct kvm_vcpu *vcpu,
+					unsigned long addr)
 {
-	gfn_t gfn;
-	unsigned long offset = gva & ~PAGE_MASK;
-	struct kvm *kvm = vcpu->kvm;
+	struct kvm_mmu_memory_cache *memcache = &vcpu->arch.mmu_page_cache;
+	pgd_t *pgdp;
+	int ret;
+
+	/* We need a minimum of cached pages ready for page table creation */
+	ret = mmu_topup_memory_cache(memcache, KVM_MMU_CACHE_MIN_PAGES,
+				     KVM_NR_MEM_OBJS);
+	if (ret)
+		return NULL;
+
+	if (KVM_GUEST_KERNEL_MODE(vcpu))
+		pgdp = vcpu->arch.guest_kernel_mm.pgd;
+	else
+		pgdp = vcpu->arch.guest_user_mm.pgd;
+
+	return kvm_mips_walk_pgd(pgdp, memcache, addr);
+}
 
-	if (KVM_GUEST_KSEGX(gva) != KVM_GUEST_KSEG0) {
-		kvm_err("%s/%p: Invalid gva: %#lx\n", __func__,
-			__builtin_return_address(0), gva);
-		return KVM_INVALID_PAGE;
+void kvm_trap_emul_invalidate_gva(struct kvm_vcpu *vcpu, unsigned long addr,
+				  bool user)
+{
+	pgd_t *pgdp;
+	pte_t *ptep;
+
+	addr &= PAGE_MASK << 1;
+
+	pgdp = vcpu->arch.guest_kernel_mm.pgd;
+	ptep = kvm_mips_walk_pgd(pgdp, NULL, addr);
+	if (ptep) {
+		ptep[0] = pfn_pte(0, __pgprot(0));
+		ptep[1] = pfn_pte(0, __pgprot(0));
+	}
+
+	if (user) {
+		pgdp = vcpu->arch.guest_user_mm.pgd;
+		ptep = kvm_mips_walk_pgd(pgdp, NULL, addr);
+		if (ptep) {
+			ptep[0] = pfn_pte(0, __pgprot(0));
+			ptep[1] = pfn_pte(0, __pgprot(0));
+		}
 	}
+}
 
-	gfn = (KVM_GUEST_CPHYSADDR(gva) >> PAGE_SHIFT);
+/*
+ * kvm_mips_flush_gva_{pte,pmd,pud,pgd,pt}.
+ * Flush a range of guest physical address space from the VM's GPA page tables.
+ */
 
-	if (gfn >= kvm->arch.guest_pmap_npages) {
-		kvm_err("%s: Invalid gfn: %#llx, GVA: %#lx\n", __func__, gfn,
-			gva);
-		return KVM_INVALID_PAGE;
+static bool kvm_mips_flush_gva_pte(pte_t *pte, unsigned long start_gva,
+				   unsigned long end_gva)
+{
+	int i_min = __pte_offset(start_gva);
+	int i_max = __pte_offset(end_gva);
+	bool safe_to_remove = (i_min == 0 && i_max == PTRS_PER_PTE - 1);
+	int i;
+
+	/*
+	 * There's no freeing to do, so there's no point clearing individual
+	 * entries unless only part of the last level page table needs flushing.
+	 */
+	if (safe_to_remove)
+		return true;
+
+	for (i = i_min; i <= i_max; ++i) {
+		if (!pte_present(pte[i]))
+			continue;
+
+		set_pte(pte + i, __pte(0));
 	}
+	return false;
+}
 
-	if (kvm_mips_map_page(vcpu->kvm, gfn) < 0)
-		return KVM_INVALID_ADDR;
+static bool kvm_mips_flush_gva_pmd(pmd_t *pmd, unsigned long start_gva,
+				   unsigned long end_gva)
+{
+	pte_t *pte;
+	unsigned long end = ~0ul;
+	int i_min = __pmd_offset(start_gva);
+	int i_max = __pmd_offset(end_gva);
+	bool safe_to_remove = (i_min == 0 && i_max == PTRS_PER_PMD - 1);
+	int i;
+
+	for (i = i_min; i <= i_max; ++i, start_gva = 0) {
+		if (!pmd_present(pmd[i]))
+			continue;
+
+		pte = pte_offset(pmd + i, 0);
+		if (i == i_max)
+			end = end_gva;
+
+		if (kvm_mips_flush_gva_pte(pte, start_gva, end)) {
+			pmd_clear(pmd + i);
+			pte_free_kernel(NULL, pte);
+		} else {
+			safe_to_remove = false;
+		}
+	}
+	return safe_to_remove;
+}
 
-	return (kvm->arch.guest_pmap[gfn] << PAGE_SHIFT) + offset;
+static bool kvm_mips_flush_gva_pud(pud_t *pud, unsigned long start_gva,
+				   unsigned long end_gva)
+{
+	pmd_t *pmd;
+	unsigned long end = ~0ul;
+	int i_min = __pud_offset(start_gva);
+	int i_max = __pud_offset(end_gva);
+	bool safe_to_remove = (i_min == 0 && i_max == PTRS_PER_PUD - 1);
+	int i;
+
+	for (i = i_min; i <= i_max; ++i, start_gva = 0) {
+		if (!pud_present(pud[i]))
+			continue;
+
+		pmd = pmd_offset(pud + i, 0);
+		if (i == i_max)
+			end = end_gva;
+
+		if (kvm_mips_flush_gva_pmd(pmd, start_gva, end)) {
+			pud_clear(pud + i);
+			pmd_free(NULL, pmd);
+		} else {
+			safe_to_remove = false;
+		}
+	}
+	return safe_to_remove;
+}
+
+static bool kvm_mips_flush_gva_pgd(pgd_t *pgd, unsigned long start_gva,
+				   unsigned long end_gva)
+{
+	pud_t *pud;
+	unsigned long end = ~0ul;
+	int i_min = pgd_index(start_gva);
+	int i_max = pgd_index(end_gva);
+	bool safe_to_remove = (i_min == 0 && i_max == PTRS_PER_PGD - 1);
+	int i;
+
+	for (i = i_min; i <= i_max; ++i, start_gva = 0) {
+		if (!pgd_present(pgd[i]))
+			continue;
+
+		pud = pud_offset(pgd + i, 0);
+		if (i == i_max)
+			end = end_gva;
+
+		if (kvm_mips_flush_gva_pud(pud, start_gva, end)) {
+			pgd_clear(pgd + i);
+			pud_free(NULL, pud);
+		} else {
+			safe_to_remove = false;
+		}
+	}
+	return safe_to_remove;
+}
+
+void kvm_mips_flush_gva_pt(pgd_t *pgd, enum kvm_mips_flush flags)
+{
+	if (flags & KMF_GPA) {
+		/* all of guest virtual address space could be affected */
+		if (flags & KMF_KERN)
+			/* useg, kseg0, seg2/3 */
+			kvm_mips_flush_gva_pgd(pgd, 0, 0x7fffffff);
+		else
+			/* useg */
+			kvm_mips_flush_gva_pgd(pgd, 0, 0x3fffffff);
+	} else {
+		/* useg */
+		kvm_mips_flush_gva_pgd(pgd, 0, 0x3fffffff);
+
+		/* kseg2/3 */
+		if (flags & KMF_KERN)
+			kvm_mips_flush_gva_pgd(pgd, 0x60000000, 0x7fffffff);
+	}
+}
+
+static pte_t kvm_mips_gpa_pte_to_gva_unmapped(pte_t pte)
+{
+	/*
+	 * Don't leak writeable but clean entries from GPA page tables. We don't
+	 * want the normal Linux tlbmod handler to handle dirtying when KVM
+	 * accesses guest memory.
+	 */
+	if (!pte_dirty(pte))
+		pte = pte_wrprotect(pte);
+
+	return pte;
+}
+
+static pte_t kvm_mips_gpa_pte_to_gva_mapped(pte_t pte, long entrylo)
+{
+	/* Guest EntryLo overrides host EntryLo */
+	if (!(entrylo & ENTRYLO_D))
+		pte = pte_mkclean(pte);
+
+	return kvm_mips_gpa_pte_to_gva_unmapped(pte);
 }
 
 /* XXXKYMA: Must be called with interrupts disabled */
 int kvm_mips_handle_kseg0_tlb_fault(unsigned long badvaddr,
-				    struct kvm_vcpu *vcpu)
+				    struct kvm_vcpu *vcpu,
+				    bool write_fault)
 {
-	gfn_t gfn;
-	kvm_pfn_t pfn0, pfn1;
-	unsigned long vaddr = 0;
-	unsigned long entryhi = 0, entrylo0 = 0, entrylo1 = 0;
-	struct kvm *kvm = vcpu->kvm;
-	const int flush_dcache_mask = 0;
-	int ret;
+	unsigned long gpa;
+	pte_t pte_gpa[2], *ptep_gva;
+	int idx;
 
 	if (KVM_GUEST_KSEGX(badvaddr) != KVM_GUEST_KSEG0) {
 		kvm_err("%s: Invalid BadVaddr: %#lx\n", __func__, badvaddr);
@@ -98,49 +1007,39 @@ int kvm_mips_handle_kseg0_tlb_fault(unsigned long badvaddr,
 		return -1;
 	}
 
-	gfn = (KVM_GUEST_CPHYSADDR(badvaddr) >> PAGE_SHIFT);
-	if ((gfn | 1) >= kvm->arch.guest_pmap_npages) {
-		kvm_err("%s: Invalid gfn: %#llx, BadVaddr: %#lx\n", __func__,
-			gfn, badvaddr);
-		kvm_mips_dump_host_tlbs();
+	/* Get the GPA page table entry */
+	gpa = KVM_GUEST_CPHYSADDR(badvaddr);
+	idx = (badvaddr >> PAGE_SHIFT) & 1;
+	if (kvm_mips_map_page(vcpu, gpa, write_fault, &pte_gpa[idx],
+			      &pte_gpa[!idx]) < 0)
 		return -1;
-	}
-	vaddr = badvaddr & (PAGE_MASK << 1);
 
-	if (kvm_mips_map_page(vcpu->kvm, gfn) < 0)
+	/* Get the GVA page table entry */
+	ptep_gva = kvm_trap_emul_pte_for_gva(vcpu, badvaddr & ~PAGE_SIZE);
+	if (!ptep_gva) {
+		kvm_err("No ptep for gva %lx\n", badvaddr);
 		return -1;
+	}
 
-	if (kvm_mips_map_page(vcpu->kvm, gfn ^ 0x1) < 0)
-		return -1;
-
-	pfn0 = kvm->arch.guest_pmap[gfn & ~0x1];
-	pfn1 = kvm->arch.guest_pmap[gfn | 0x1];
-
-	entrylo0 = mips3_paddr_to_tlbpfn(pfn0 << PAGE_SHIFT) |
-		((_page_cachable_default >> _CACHE_SHIFT) << ENTRYLO_C_SHIFT) |
-		ENTRYLO_D | ENTRYLO_V;
-	entrylo1 = mips3_paddr_to_tlbpfn(pfn1 << PAGE_SHIFT) |
-		((_page_cachable_default >> _CACHE_SHIFT) << ENTRYLO_C_SHIFT) |
-		ENTRYLO_D | ENTRYLO_V;
-
-	preempt_disable();
-	entryhi = (vaddr | kvm_mips_get_kernel_asid(vcpu));
-	ret = kvm_mips_host_tlb_write(vcpu, entryhi, entrylo0, entrylo1,
-				      flush_dcache_mask);
-	preempt_enable();
+	/* Copy a pair of entries from GPA page table to GVA page table */
+	ptep_gva[0] = kvm_mips_gpa_pte_to_gva_unmapped(pte_gpa[0]);
+	ptep_gva[1] = kvm_mips_gpa_pte_to_gva_unmapped(pte_gpa[1]);
 
-	return ret;
+	/* Invalidate this entry in the TLB, guest kernel ASID only */
+	kvm_mips_host_tlb_inv(vcpu, badvaddr, false, true);
+	return 0;
 }
 
 int kvm_mips_handle_mapped_seg_tlb_fault(struct kvm_vcpu *vcpu,
-					 struct kvm_mips_tlb *tlb)
+					 struct kvm_mips_tlb *tlb,
+					 unsigned long gva,
+					 bool write_fault)
 {
-	unsigned long entryhi = 0, entrylo0 = 0, entrylo1 = 0;
 	struct kvm *kvm = vcpu->kvm;
-	kvm_pfn_t pfn0, pfn1;
-	gfn_t gfn0, gfn1;
 	long tlb_lo[2];
-	int ret;
+	pte_t pte_gpa[2], *ptep_buddy, *ptep_gva;
+	unsigned int idx = TLB_LO_IDX(*tlb, gva);
+	bool kernel = KVM_GUEST_KERNEL_MODE(vcpu);
 
 	tlb_lo[0] = tlb->tlb_lo[0];
 	tlb_lo[1] = tlb->tlb_lo[1];
@@ -149,70 +1048,64 @@ int kvm_mips_handle_mapped_seg_tlb_fault(struct kvm_vcpu *vcpu,
 	 * The commpage address must not be mapped to anything else if the guest
 	 * TLB contains entries nearby, or commpage accesses will break.
 	 */
-	if (!((tlb->tlb_hi ^ KVM_GUEST_COMMPAGE_ADDR) &
-			VPN2_MASK & (PAGE_MASK << 1)))
-		tlb_lo[(KVM_GUEST_COMMPAGE_ADDR >> PAGE_SHIFT) & 1] = 0;
-
-	gfn0 = mips3_tlbpfn_to_paddr(tlb_lo[0]) >> PAGE_SHIFT;
-	gfn1 = mips3_tlbpfn_to_paddr(tlb_lo[1]) >> PAGE_SHIFT;
-	if (gfn0 >= kvm->arch.guest_pmap_npages ||
-	    gfn1 >= kvm->arch.guest_pmap_npages) {
-		kvm_err("%s: Invalid gfn: [%#llx, %#llx], EHi: %#lx\n",
-			__func__, gfn0, gfn1, tlb->tlb_hi);
-		kvm_mips_dump_guest_tlbs(vcpu);
-		return -1;
-	}
+	if (!((gva ^ KVM_GUEST_COMMPAGE_ADDR) & VPN2_MASK & (PAGE_MASK << 1)))
+		tlb_lo[TLB_LO_IDX(*tlb, KVM_GUEST_COMMPAGE_ADDR)] = 0;
 
-	if (kvm_mips_map_page(kvm, gfn0) < 0)
+	/* Get the GPA page table entry */
+	if (kvm_mips_map_page(vcpu, mips3_tlbpfn_to_paddr(tlb_lo[idx]),
+			      write_fault, &pte_gpa[idx], NULL) < 0)
 		return -1;
 
-	if (kvm_mips_map_page(kvm, gfn1) < 0)
+	/* And its GVA buddy's GPA page table entry if it also exists */
+	pte_gpa[!idx] = pfn_pte(0, __pgprot(0));
+	if (tlb_lo[!idx] & ENTRYLO_V) {
+		spin_lock(&kvm->mmu_lock);
+		ptep_buddy = kvm_mips_pte_for_gpa(kvm, NULL,
+					mips3_tlbpfn_to_paddr(tlb_lo[!idx]));
+		if (ptep_buddy)
+			pte_gpa[!idx] = *ptep_buddy;
+		spin_unlock(&kvm->mmu_lock);
+	}
+
+	/* Get the GVA page table entry pair */
+	ptep_gva = kvm_trap_emul_pte_for_gva(vcpu, gva & ~PAGE_SIZE);
+	if (!ptep_gva) {
+		kvm_err("No ptep for gva %lx\n", gva);
 		return -1;
+	}
 
-	pfn0 = kvm->arch.guest_pmap[gfn0];
-	pfn1 = kvm->arch.guest_pmap[gfn1];
+	/* Copy a pair of entries from GPA page table to GVA page table */
+	ptep_gva[0] = kvm_mips_gpa_pte_to_gva_mapped(pte_gpa[0], tlb_lo[0]);
+	ptep_gva[1] = kvm_mips_gpa_pte_to_gva_mapped(pte_gpa[1], tlb_lo[1]);
 
-	/* Get attributes from the Guest TLB */
-	entrylo0 = mips3_paddr_to_tlbpfn(pfn0 << PAGE_SHIFT) |
-		((_page_cachable_default >> _CACHE_SHIFT) << ENTRYLO_C_SHIFT) |
-		(tlb_lo[0] & ENTRYLO_D) |
-		(tlb_lo[0] & ENTRYLO_V);
-	entrylo1 = mips3_paddr_to_tlbpfn(pfn1 << PAGE_SHIFT) |
-		((_page_cachable_default >> _CACHE_SHIFT) << ENTRYLO_C_SHIFT) |
-		(tlb_lo[1] & ENTRYLO_D) |
-		(tlb_lo[1] & ENTRYLO_V);
+	/* Invalidate this entry in the TLB, current guest mode ASID only */
+	kvm_mips_host_tlb_inv(vcpu, gva, !kernel, kernel);
 
 	kvm_debug("@ %#lx tlb_lo0: 0x%08lx tlb_lo1: 0x%08lx\n", vcpu->arch.pc,
 		  tlb->tlb_lo[0], tlb->tlb_lo[1]);
 
-	preempt_disable();
-	entryhi = (tlb->tlb_hi & VPN2_MASK) | (KVM_GUEST_KERNEL_MODE(vcpu) ?
-					       kvm_mips_get_kernel_asid(vcpu) :
-					       kvm_mips_get_user_asid(vcpu));
-	ret = kvm_mips_host_tlb_write(vcpu, entryhi, entrylo0, entrylo1,
-				      tlb->tlb_mask);
-	preempt_enable();
-
-	return ret;
+	return 0;
 }
 
-void kvm_get_new_mmu_context(struct mm_struct *mm, unsigned long cpu,
-			     struct kvm_vcpu *vcpu)
+int kvm_mips_handle_commpage_tlb_fault(unsigned long badvaddr,
+				       struct kvm_vcpu *vcpu)
 {
-	unsigned long asid = asid_cache(cpu);
-
-	asid += cpu_asid_inc();
-	if (!(asid & cpu_asid_mask(&cpu_data[cpu]))) {
-		if (cpu_has_vtag_icache)
-			flush_icache_all();
-
-		kvm_local_flush_tlb_all();      /* start new asid cycle */
+	kvm_pfn_t pfn;
+	pte_t *ptep;
 
-		if (!asid)      /* fix version if needed */
-			asid = asid_first_version(cpu);
+	ptep = kvm_trap_emul_pte_for_gva(vcpu, badvaddr);
+	if (!ptep) {
+		kvm_err("No ptep for commpage %lx\n", badvaddr);
+		return -1;
 	}
 
-	cpu_context(cpu, mm) = asid_cache(cpu) = asid;
+	pfn = PFN_DOWN(virt_to_phys(vcpu->arch.kseg0_commpage));
+	/* Also set valid and dirty, so refill handler doesn't have to */
+	*ptep = pte_mkyoung(pte_mkdirty(pfn_pte(pfn, PAGE_SHARED)));
+
+	/* Invalidate this entry in the TLB, guest kernel ASID only */
+	kvm_mips_host_tlb_inv(vcpu, badvaddr, false, true);
+	return 0;
 }
 
 /**
@@ -235,42 +1128,13 @@ static void kvm_mips_migrate_count(struct kvm_vcpu *vcpu)
 /* Restore ASID once we are scheduled back after preemption */
 void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 {
-	unsigned long asid_mask = cpu_asid_mask(&cpu_data[cpu]);
 	unsigned long flags;
-	int newasid = 0;
 
 	kvm_debug("%s: vcpu %p, cpu: %d\n", __func__, vcpu, cpu);
 
-	/* Allocate new kernel and user ASIDs if needed */
-
 	local_irq_save(flags);
 
-	if ((vcpu->arch.guest_kernel_asid[cpu] ^ asid_cache(cpu)) &
-						asid_version_mask(cpu)) {
-		kvm_get_new_mmu_context(&vcpu->arch.guest_kernel_mm, cpu, vcpu);
-		vcpu->arch.guest_kernel_asid[cpu] =
-		    vcpu->arch.guest_kernel_mm.context.asid[cpu];
-		newasid++;
-
-		kvm_debug("[%d]: cpu_context: %#lx\n", cpu,
-			  cpu_context(cpu, current->mm));
-		kvm_debug("[%d]: Allocated new ASID for Guest Kernel: %#x\n",
-			  cpu, vcpu->arch.guest_kernel_asid[cpu]);
-	}
-
-	if ((vcpu->arch.guest_user_asid[cpu] ^ asid_cache(cpu)) &
-						asid_version_mask(cpu)) {
-		kvm_get_new_mmu_context(&vcpu->arch.guest_user_mm, cpu, vcpu);
-		vcpu->arch.guest_user_asid[cpu] =
-		    vcpu->arch.guest_user_mm.context.asid[cpu];
-		newasid++;
-
-		kvm_debug("[%d]: cpu_context: %#lx\n", cpu,
-			  cpu_context(cpu, current->mm));
-		kvm_debug("[%d]: Allocated new ASID for Guest User: %#x\n", cpu,
-			  vcpu->arch.guest_user_asid[cpu]);
-	}
-
+	vcpu->cpu = cpu;
 	if (vcpu->arch.last_sched_cpu != cpu) {
 		kvm_debug("[%d->%d]KVM VCPU[%d] switch\n",
 			  vcpu->arch.last_sched_cpu, cpu, vcpu->vcpu_id);
@@ -282,42 +1146,10 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 		kvm_mips_migrate_count(vcpu);
 	}
 
-	if (!newasid) {
-		/*
-		 * If we preempted while the guest was executing, then reload
-		 * the pre-empted ASID
-		 */
-		if (current->flags & PF_VCPU) {
-			write_c0_entryhi(vcpu->arch.
-					 preempt_entryhi & asid_mask);
-			ehb();
-		}
-	} else {
-		/* New ASIDs were allocated for the VM */
-
-		/*
-		 * Were we in guest context? If so then the pre-empted ASID is
-		 * no longer valid, we need to set it to what it should be based
-		 * on the mode of the Guest (Kernel/User)
-		 */
-		if (current->flags & PF_VCPU) {
-			if (KVM_GUEST_KERNEL_MODE(vcpu))
-				write_c0_entryhi(vcpu->arch.
-						 guest_kernel_asid[cpu] &
-						 asid_mask);
-			else
-				write_c0_entryhi(vcpu->arch.
-						 guest_user_asid[cpu] &
-						 asid_mask);
-			ehb();
-		}
-	}
-
 	/* restore guest state to registers */
-	kvm_mips_callbacks->vcpu_set_regs(vcpu);
+	kvm_mips_callbacks->vcpu_load(vcpu, cpu);
 
 	local_irq_restore(flags);
-
 }
 
 /* ASID can change if another task is scheduled during preemption */
@@ -329,75 +1161,90 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
 	local_irq_save(flags);
 
 	cpu = smp_processor_id();
-
-	vcpu->arch.preempt_entryhi = read_c0_entryhi();
 	vcpu->arch.last_sched_cpu = cpu;
+	vcpu->cpu = -1;
 
 	/* save guest state in registers */
-	kvm_mips_callbacks->vcpu_get_regs(vcpu);
-
-	if (((cpu_context(cpu, current->mm) ^ asid_cache(cpu)) &
-	     asid_version_mask(cpu))) {
-		kvm_debug("%s: Dropping MMU Context:  %#lx\n", __func__,
-			  cpu_context(cpu, current->mm));
-		drop_mmu_context(current->mm, cpu);
-	}
-	write_c0_entryhi(cpu_asid(cpu, current->mm));
-	ehb();
+	kvm_mips_callbacks->vcpu_put(vcpu, cpu);
 
 	local_irq_restore(flags);
 }
 
-u32 kvm_get_inst(u32 *opc, struct kvm_vcpu *vcpu)
+/**
+ * kvm_trap_emul_gva_fault() - Safely attempt to handle a GVA access fault.
+ * @vcpu:	Virtual CPU.
+ * @gva:	Guest virtual address to be accessed.
+ * @write:	True if write attempted (must be dirtied and made writable).
+ *
+ * Safely attempt to handle a GVA fault, mapping GVA pages if necessary, and
+ * dirtying the page if @write so that guest instructions can be modified.
+ *
+ * Returns:	KVM_MIPS_MAPPED on success.
+ *		KVM_MIPS_GVA if bad guest virtual address.
+ *		KVM_MIPS_GPA if bad guest physical address.
+ *		KVM_MIPS_TLB if guest TLB not present.
+ *		KVM_MIPS_TLBINV if guest TLB present but not valid.
+ *		KVM_MIPS_TLBMOD if guest TLB read only.
+ */
+enum kvm_mips_fault_result kvm_trap_emul_gva_fault(struct kvm_vcpu *vcpu,
+						   unsigned long gva,
+						   bool write)
 {
 	struct mips_coproc *cop0 = vcpu->arch.cop0;
-	unsigned long paddr, flags, vpn2, asid;
-	unsigned long va = (unsigned long)opc;
-	void *vaddr;
-	u32 inst;
+	struct kvm_mips_tlb *tlb;
 	int index;
 
-	if (KVM_GUEST_KSEGX(va) < KVM_GUEST_KSEG0 ||
-	    KVM_GUEST_KSEGX(va) == KVM_GUEST_KSEG23) {
-		local_irq_save(flags);
-		index = kvm_mips_host_tlb_lookup(vcpu, va);
-		if (index >= 0) {
-			inst = *(opc);
-		} else {
-			vpn2 = va & VPN2_MASK;
-			asid = kvm_read_c0_guest_entryhi(cop0) &
-						KVM_ENTRYHI_ASID;
-			index = kvm_mips_guest_tlb_lookup(vcpu, vpn2 | asid);
-			if (index < 0) {
-				kvm_err("%s: get_user_failed for %p, vcpu: %p, ASID: %#lx\n",
-					__func__, opc, vcpu, read_c0_entryhi());
-				kvm_mips_dump_host_tlbs();
-				kvm_mips_dump_guest_tlbs(vcpu);
-				local_irq_restore(flags);
-				return KVM_INVALID_INST;
-			}
-			if (kvm_mips_handle_mapped_seg_tlb_fault(vcpu,
-						&vcpu->arch.guest_tlb[index])) {
-				kvm_err("%s: handling mapped seg tlb fault failed for %p, index: %u, vcpu: %p, ASID: %#lx\n",
-					__func__, opc, index, vcpu,
-					read_c0_entryhi());
-				kvm_mips_dump_guest_tlbs(vcpu);
-				local_irq_restore(flags);
-				return KVM_INVALID_INST;
-			}
-			inst = *(opc);
-		}
-		local_irq_restore(flags);
-	} else if (KVM_GUEST_KSEGX(va) == KVM_GUEST_KSEG0) {
-		paddr = kvm_mips_translate_guest_kseg0_to_hpa(vcpu, va);
-		vaddr = kmap_atomic(pfn_to_page(PHYS_PFN(paddr)));
-		vaddr += paddr & ~PAGE_MASK;
-		inst = *(u32 *)vaddr;
-		kunmap_atomic(vaddr);
+	if (KVM_GUEST_KSEGX(gva) == KVM_GUEST_KSEG0) {
+		if (kvm_mips_handle_kseg0_tlb_fault(gva, vcpu, write) < 0)
+			return KVM_MIPS_GPA;
+	} else if ((KVM_GUEST_KSEGX(gva) < KVM_GUEST_KSEG0) ||
+		   KVM_GUEST_KSEGX(gva) == KVM_GUEST_KSEG23) {
+		/* Address should be in the guest TLB */
+		index = kvm_mips_guest_tlb_lookup(vcpu, (gva & VPN2_MASK) |
+			  (kvm_read_c0_guest_entryhi(cop0) & KVM_ENTRYHI_ASID));
+		if (index < 0)
+			return KVM_MIPS_TLB;
+		tlb = &vcpu->arch.guest_tlb[index];
+
+		/* Entry should be valid, and dirty for writes */
+		if (!TLB_IS_VALID(*tlb, gva))
+			return KVM_MIPS_TLBINV;
+		if (write && !TLB_IS_DIRTY(*tlb, gva))
+			return KVM_MIPS_TLBMOD;
+
+		if (kvm_mips_handle_mapped_seg_tlb_fault(vcpu, tlb, gva, write))
+			return KVM_MIPS_GPA;
 	} else {
-		kvm_err("%s: illegal address: %p\n", __func__, opc);
-		return KVM_INVALID_INST;
+		return KVM_MIPS_GVA;
 	}
 
-	return inst;
+	return KVM_MIPS_MAPPED;
+}
+
+int kvm_get_inst(u32 *opc, struct kvm_vcpu *vcpu, u32 *out)
+{
+	int err;
+
+retry:
+	kvm_trap_emul_gva_lockless_begin(vcpu);
+	err = get_user(*out, opc);
+	kvm_trap_emul_gva_lockless_end(vcpu);
+
+	if (unlikely(err)) {
+		/*
+		 * Try to handle the fault, maybe we just raced with a GVA
+		 * invalidation.
+		 */
+		err = kvm_trap_emul_gva_fault(vcpu, (unsigned long)opc,
+					      false);
+		if (unlikely(err)) {
+			kvm_err("%s: illegal address: %p\n",
+				__func__, opc);
+			return -EFAULT;
+		}
+
+		/* Hopefully it'll work now */
+		goto retry;
+	}
+	return 0;
 }
diff --git a/arch/mips/kvm/tlb.c b/arch/mips/kvm/tlb.c
index 254377d8e0b9..2819eb793345 100644
--- a/arch/mips/kvm/tlb.c
+++ b/arch/mips/kvm/tlb.c
@@ -33,28 +33,20 @@
 #define KVM_GUEST_PC_TLB    0
 #define KVM_GUEST_SP_TLB    1
 
-atomic_t kvm_mips_instance;
-EXPORT_SYMBOL_GPL(kvm_mips_instance);
-
 static u32 kvm_mips_get_kernel_asid(struct kvm_vcpu *vcpu)
 {
+	struct mm_struct *kern_mm = &vcpu->arch.guest_kernel_mm;
 	int cpu = smp_processor_id();
 
-	return vcpu->arch.guest_kernel_asid[cpu] &
-			cpu_asid_mask(&cpu_data[cpu]);
+	return cpu_asid(cpu, kern_mm);
 }
 
 static u32 kvm_mips_get_user_asid(struct kvm_vcpu *vcpu)
 {
+	struct mm_struct *user_mm = &vcpu->arch.guest_user_mm;
 	int cpu = smp_processor_id();
 
-	return vcpu->arch.guest_user_asid[cpu] &
-			cpu_asid_mask(&cpu_data[cpu]);
-}
-
-inline u32 kvm_mips_get_commpage_asid(struct kvm_vcpu *vcpu)
-{
-	return vcpu->kvm->arch.commpage_tlb;
+	return cpu_asid(cpu, user_mm);
 }
 
 /* Structure defining an tlb entry data set. */
@@ -104,109 +96,6 @@ void kvm_mips_dump_guest_tlbs(struct kvm_vcpu *vcpu)
 }
 EXPORT_SYMBOL_GPL(kvm_mips_dump_guest_tlbs);
 
-/* XXXKYMA: Must be called with interrupts disabled */
-/* set flush_dcache_mask == 0 if no dcache flush required */
-int kvm_mips_host_tlb_write(struct kvm_vcpu *vcpu, unsigned long entryhi,
-			    unsigned long entrylo0, unsigned long entrylo1,
-			    int flush_dcache_mask)
-{
-	unsigned long flags;
-	unsigned long old_entryhi;
-	int idx;
-
-	local_irq_save(flags);
-
-	old_entryhi = read_c0_entryhi();
-	write_c0_entryhi(entryhi);
-	mtc0_tlbw_hazard();
-
-	tlb_probe();
-	tlb_probe_hazard();
-	idx = read_c0_index();
-
-	if (idx > current_cpu_data.tlbsize) {
-		kvm_err("%s: Invalid Index: %d\n", __func__, idx);
-		kvm_mips_dump_host_tlbs();
-		local_irq_restore(flags);
-		return -1;
-	}
-
-	write_c0_entrylo0(entrylo0);
-	write_c0_entrylo1(entrylo1);
-	mtc0_tlbw_hazard();
-
-	if (idx < 0)
-		tlb_write_random();
-	else
-		tlb_write_indexed();
-	tlbw_use_hazard();
-
-	kvm_debug("@ %#lx idx: %2d [entryhi(R): %#lx] entrylo0(R): 0x%08lx, entrylo1(R): 0x%08lx\n",
-		  vcpu->arch.pc, idx, read_c0_entryhi(),
-		  read_c0_entrylo0(), read_c0_entrylo1());
-
-	/* Flush D-cache */
-	if (flush_dcache_mask) {
-		if (entrylo0 & ENTRYLO_V) {
-			++vcpu->stat.flush_dcache_exits;
-			flush_data_cache_page((entryhi & VPN2_MASK) &
-					      ~flush_dcache_mask);
-		}
-		if (entrylo1 & ENTRYLO_V) {
-			++vcpu->stat.flush_dcache_exits;
-			flush_data_cache_page(((entryhi & VPN2_MASK) &
-					       ~flush_dcache_mask) |
-					      (0x1 << PAGE_SHIFT));
-		}
-	}
-
-	/* Restore old ASID */
-	write_c0_entryhi(old_entryhi);
-	mtc0_tlbw_hazard();
-	local_irq_restore(flags);
-	return 0;
-}
-EXPORT_SYMBOL_GPL(kvm_mips_host_tlb_write);
-
-int kvm_mips_handle_commpage_tlb_fault(unsigned long badvaddr,
-	struct kvm_vcpu *vcpu)
-{
-	kvm_pfn_t pfn;
-	unsigned long flags, old_entryhi = 0, vaddr = 0;
-	unsigned long entrylo[2] = { 0, 0 };
-	unsigned int pair_idx;
-
-	pfn = PFN_DOWN(virt_to_phys(vcpu->arch.kseg0_commpage));
-	pair_idx = (badvaddr >> PAGE_SHIFT) & 1;
-	entrylo[pair_idx] = mips3_paddr_to_tlbpfn(pfn << PAGE_SHIFT) |
-		((_page_cachable_default >> _CACHE_SHIFT) << ENTRYLO_C_SHIFT) |
-		ENTRYLO_D | ENTRYLO_V;
-
-	local_irq_save(flags);
-
-	old_entryhi = read_c0_entryhi();
-	vaddr = badvaddr & (PAGE_MASK << 1);
-	write_c0_entryhi(vaddr | kvm_mips_get_kernel_asid(vcpu));
-	write_c0_entrylo0(entrylo[0]);
-	write_c0_entrylo1(entrylo[1]);
-	write_c0_index(kvm_mips_get_commpage_asid(vcpu));
-	mtc0_tlbw_hazard();
-	tlb_write_indexed();
-	tlbw_use_hazard();
-
-	kvm_debug("@ %#lx idx: %2d [entryhi(R): %#lx] entrylo0 (R): 0x%08lx, entrylo1(R): 0x%08lx\n",
-		  vcpu->arch.pc, read_c0_index(), read_c0_entryhi(),
-		  read_c0_entrylo0(), read_c0_entrylo1());
-
-	/* Restore old ASID */
-	write_c0_entryhi(old_entryhi);
-	mtc0_tlbw_hazard();
-	local_irq_restore(flags);
-
-	return 0;
-}
-EXPORT_SYMBOL_GPL(kvm_mips_handle_commpage_tlb_fault);
-
 int kvm_mips_guest_tlb_lookup(struct kvm_vcpu *vcpu, unsigned long entryhi)
 {
 	int i;
@@ -228,51 +117,11 @@ int kvm_mips_guest_tlb_lookup(struct kvm_vcpu *vcpu, unsigned long entryhi)
 }
 EXPORT_SYMBOL_GPL(kvm_mips_guest_tlb_lookup);
 
-int kvm_mips_host_tlb_lookup(struct kvm_vcpu *vcpu, unsigned long vaddr)
-{
-	unsigned long old_entryhi, flags;
-	int idx;
-
-	local_irq_save(flags);
-
-	old_entryhi = read_c0_entryhi();
-
-	if (KVM_GUEST_KERNEL_MODE(vcpu))
-		write_c0_entryhi((vaddr & VPN2_MASK) |
-				 kvm_mips_get_kernel_asid(vcpu));
-	else {
-		write_c0_entryhi((vaddr & VPN2_MASK) |
-				 kvm_mips_get_user_asid(vcpu));
-	}
-
-	mtc0_tlbw_hazard();
-
-	tlb_probe();
-	tlb_probe_hazard();
-	idx = read_c0_index();
-
-	/* Restore old ASID */
-	write_c0_entryhi(old_entryhi);
-	mtc0_tlbw_hazard();
-
-	local_irq_restore(flags);
-
-	kvm_debug("Host TLB lookup, %#lx, idx: %2d\n", vaddr, idx);
-
-	return idx;
-}
-EXPORT_SYMBOL_GPL(kvm_mips_host_tlb_lookup);
-
-int kvm_mips_host_tlb_inv(struct kvm_vcpu *vcpu, unsigned long va)
+static int _kvm_mips_host_tlb_inv(unsigned long entryhi)
 {
 	int idx;
-	unsigned long flags, old_entryhi;
-
-	local_irq_save(flags);
-
-	old_entryhi = read_c0_entryhi();
 
-	write_c0_entryhi((va & VPN2_MASK) | kvm_mips_get_user_asid(vcpu));
+	write_c0_entryhi(entryhi);
 	mtc0_tlbw_hazard();
 
 	tlb_probe();
@@ -282,7 +131,7 @@ int kvm_mips_host_tlb_inv(struct kvm_vcpu *vcpu, unsigned long va)
 	if (idx >= current_cpu_data.tlbsize)
 		BUG();
 
-	if (idx > 0) {
+	if (idx >= 0) {
 		write_c0_entryhi(UNIQUE_ENTRYHI(idx));
 		write_c0_entrylo0(0);
 		write_c0_entrylo1(0);
@@ -292,93 +141,75 @@ int kvm_mips_host_tlb_inv(struct kvm_vcpu *vcpu, unsigned long va)
 		tlbw_use_hazard();
 	}
 
-	write_c0_entryhi(old_entryhi);
-	mtc0_tlbw_hazard();
-
-	local_irq_restore(flags);
-
-	if (idx > 0)
-		kvm_debug("%s: Invalidated entryhi %#lx @ idx %d\n", __func__,
-			  (va & VPN2_MASK) | kvm_mips_get_user_asid(vcpu), idx);
-
-	return 0;
+	return idx;
 }
-EXPORT_SYMBOL_GPL(kvm_mips_host_tlb_inv);
 
-void kvm_mips_flush_host_tlb(int skip_kseg0)
+int kvm_mips_host_tlb_inv(struct kvm_vcpu *vcpu, unsigned long va,
+			  bool user, bool kernel)
 {
-	unsigned long flags;
-	unsigned long old_entryhi, entryhi;
-	unsigned long old_pagemask;
-	int entry = 0;
-	int maxentry = current_cpu_data.tlbsize;
+	int idx_user, idx_kernel;
+	unsigned long flags, old_entryhi;
 
 	local_irq_save(flags);
 
 	old_entryhi = read_c0_entryhi();
-	old_pagemask = read_c0_pagemask();
-
-	/* Blast 'em all away. */
-	for (entry = 0; entry < maxentry; entry++) {
-		write_c0_index(entry);
-
-		if (skip_kseg0) {
-			mtc0_tlbr_hazard();
-			tlb_read();
-			tlb_read_hazard();
-
-			entryhi = read_c0_entryhi();
 
-			/* Don't blow away guest kernel entries */
-			if (KVM_GUEST_KSEGX(entryhi) == KVM_GUEST_KSEG0)
-				continue;
-
-			write_c0_pagemask(old_pagemask);
-		}
-
-		/* Make sure all entries differ. */
-		write_c0_entryhi(UNIQUE_ENTRYHI(entry));
-		write_c0_entrylo0(0);
-		write_c0_entrylo1(0);
-		mtc0_tlbw_hazard();
-
-		tlb_write_indexed();
-		tlbw_use_hazard();
-	}
+	if (user)
+		idx_user = _kvm_mips_host_tlb_inv((va & VPN2_MASK) |
+						  kvm_mips_get_user_asid(vcpu));
+	if (kernel)
+		idx_kernel = _kvm_mips_host_tlb_inv((va & VPN2_MASK) |
+						kvm_mips_get_kernel_asid(vcpu));
 
 	write_c0_entryhi(old_entryhi);
-	write_c0_pagemask(old_pagemask);
 	mtc0_tlbw_hazard();
 
 	local_irq_restore(flags);
+
+	if (user && idx_user >= 0)
+		kvm_debug("%s: Invalidated guest user entryhi %#lx @ idx %d\n",
+			  __func__, (va & VPN2_MASK) |
+				    kvm_mips_get_user_asid(vcpu), idx_user);
+	if (kernel && idx_kernel >= 0)
+		kvm_debug("%s: Invalidated guest kernel entryhi %#lx @ idx %d\n",
+			  __func__, (va & VPN2_MASK) |
+				    kvm_mips_get_kernel_asid(vcpu), idx_kernel);
+
+	return 0;
 }
-EXPORT_SYMBOL_GPL(kvm_mips_flush_host_tlb);
+EXPORT_SYMBOL_GPL(kvm_mips_host_tlb_inv);
 
-void kvm_local_flush_tlb_all(void)
+/**
+ * kvm_mips_suspend_mm() - Suspend the active mm.
+ * @cpu		The CPU we're running on.
+ *
+ * Suspend the active_mm, ready for a switch to a KVM guest virtual address
+ * space. This is left active for the duration of guest context, including time
+ * with interrupts enabled, so we need to be careful not to confuse e.g. cache
+ * management IPIs.
+ *
+ * kvm_mips_resume_mm() should be called before context switching to a different
+ * process so we don't need to worry about reference counting.
+ *
+ * This needs to be in static kernel code to avoid exporting init_mm.
+ */
+void kvm_mips_suspend_mm(int cpu)
 {
-	unsigned long flags;
-	unsigned long old_ctx;
-	int entry = 0;
-
-	local_irq_save(flags);
-	/* Save old context and create impossible VPN2 value */
-	old_ctx = read_c0_entryhi();
-	write_c0_entrylo0(0);
-	write_c0_entrylo1(0);
-
-	/* Blast 'em all away. */
-	while (entry < current_cpu_data.tlbsize) {
-		/* Make sure all entries differ. */
-		write_c0_entryhi(UNIQUE_ENTRYHI(entry));
-		write_c0_index(entry);
-		mtc0_tlbw_hazard();
-		tlb_write_indexed();
-		tlbw_use_hazard();
-		entry++;
-	}
-	write_c0_entryhi(old_ctx);
-	mtc0_tlbw_hazard();
+	cpumask_clear_cpu(cpu, mm_cpumask(current->active_mm));
+	current->active_mm = &init_mm;
+}
+EXPORT_SYMBOL_GPL(kvm_mips_suspend_mm);
 
-	local_irq_restore(flags);
+/**
+ * kvm_mips_resume_mm() - Resume the current process mm.
+ * @cpu		The CPU we're running on.
+ *
+ * Resume the mm of the current process, after a switch back from a KVM guest
+ * virtual address space (see kvm_mips_suspend_mm()).
+ */
+void kvm_mips_resume_mm(int cpu)
+{
+	cpumask_set_cpu(cpu, mm_cpumask(current->mm));
+	current->active_mm = current->mm;
 }
-EXPORT_SYMBOL_GPL(kvm_local_flush_tlb_all);
+EXPORT_SYMBOL_GPL(kvm_mips_resume_mm);
diff --git a/arch/mips/kvm/trap_emul.c b/arch/mips/kvm/trap_emul.c
index 3b20441f2beb..b1fa53b252ea 100644
--- a/arch/mips/kvm/trap_emul.c
+++ b/arch/mips/kvm/trap_emul.c
@@ -11,9 +11,11 @@
 
 #include <linux/errno.h>
 #include <linux/err.h>
-#include <linux/vmalloc.h>
-
 #include <linux/kvm_host.h>
+#include <linux/uaccess.h>
+#include <linux/vmalloc.h>
+#include <asm/mmu_context.h>
+#include <asm/pgalloc.h>
 
 #include "interrupt.h"
 
@@ -21,9 +23,12 @@ static gpa_t kvm_trap_emul_gva_to_gpa_cb(gva_t gva)
 {
 	gpa_t gpa;
 	gva_t kseg = KSEGX(gva);
+	gva_t gkseg = KVM_GUEST_KSEGX(gva);
 
 	if ((kseg == CKSEG0) || (kseg == CKSEG1))
 		gpa = CPHYSADDR(gva);
+	else if (gkseg == KVM_GUEST_KSEG0)
+		gpa = KVM_GUEST_CPHYSADDR(gva);
 	else {
 		kvm_err("%s: cannot find GPA for GVA: %#lx\n", __func__, gva);
 		kvm_mips_dump_host_tlbs();
@@ -83,48 +88,134 @@ static int kvm_trap_emul_handle_cop_unusable(struct kvm_vcpu *vcpu)
 	return ret;
 }
 
+static int kvm_mips_bad_load(u32 cause, u32 *opc, struct kvm_run *run,
+			     struct kvm_vcpu *vcpu)
+{
+	enum emulation_result er;
+	union mips_instruction inst;
+	int err;
+
+	/* A code fetch fault doesn't count as an MMIO */
+	if (kvm_is_ifetch_fault(&vcpu->arch)) {
+		run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+		return RESUME_HOST;
+	}
+
+	/* Fetch the instruction. */
+	if (cause & CAUSEF_BD)
+		opc += 1;
+	err = kvm_get_badinstr(opc, vcpu, &inst.word);
+	if (err) {
+		run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+		return RESUME_HOST;
+	}
+
+	/* Emulate the load */
+	er = kvm_mips_emulate_load(inst, cause, run, vcpu);
+	if (er == EMULATE_FAIL) {
+		kvm_err("Emulate load from MMIO space failed\n");
+		run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+	} else {
+		run->exit_reason = KVM_EXIT_MMIO;
+	}
+	return RESUME_HOST;
+}
+
+static int kvm_mips_bad_store(u32 cause, u32 *opc, struct kvm_run *run,
+			      struct kvm_vcpu *vcpu)
+{
+	enum emulation_result er;
+	union mips_instruction inst;
+	int err;
+
+	/* Fetch the instruction. */
+	if (cause & CAUSEF_BD)
+		opc += 1;
+	err = kvm_get_badinstr(opc, vcpu, &inst.word);
+	if (err) {
+		run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+		return RESUME_HOST;
+	}
+
+	/* Emulate the store */
+	er = kvm_mips_emulate_store(inst, cause, run, vcpu);
+	if (er == EMULATE_FAIL) {
+		kvm_err("Emulate store to MMIO space failed\n");
+		run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+	} else {
+		run->exit_reason = KVM_EXIT_MMIO;
+	}
+	return RESUME_HOST;
+}
+
+static int kvm_mips_bad_access(u32 cause, u32 *opc, struct kvm_run *run,
+			       struct kvm_vcpu *vcpu, bool store)
+{
+	if (store)
+		return kvm_mips_bad_store(cause, opc, run, vcpu);
+	else
+		return kvm_mips_bad_load(cause, opc, run, vcpu);
+}
+
 static int kvm_trap_emul_handle_tlb_mod(struct kvm_vcpu *vcpu)
 {
+	struct mips_coproc *cop0 = vcpu->arch.cop0;
 	struct kvm_run *run = vcpu->run;
 	u32 __user *opc = (u32 __user *) vcpu->arch.pc;
 	unsigned long badvaddr = vcpu->arch.host_cp0_badvaddr;
 	u32 cause = vcpu->arch.host_cp0_cause;
-	enum emulation_result er = EMULATE_DONE;
-	int ret = RESUME_GUEST;
+	struct kvm_mips_tlb *tlb;
+	unsigned long entryhi;
+	int index;
 
 	if (KVM_GUEST_KSEGX(badvaddr) < KVM_GUEST_KSEG0
 	    || KVM_GUEST_KSEGX(badvaddr) == KVM_GUEST_KSEG23) {
-		kvm_debug("USER/KSEG23 ADDR TLB MOD fault: cause %#x, PC: %p, BadVaddr: %#lx\n",
-			  cause, opc, badvaddr);
-		er = kvm_mips_handle_tlbmod(cause, opc, run, vcpu);
+		/*
+		 * First find the mapping in the guest TLB. If the failure to
+		 * write was due to the guest TLB, it should be up to the guest
+		 * to handle it.
+		 */
+		entryhi = (badvaddr & VPN2_MASK) |
+			  (kvm_read_c0_guest_entryhi(cop0) & KVM_ENTRYHI_ASID);
+		index = kvm_mips_guest_tlb_lookup(vcpu, entryhi);
 
-		if (er == EMULATE_DONE)
-			ret = RESUME_GUEST;
-		else {
+		/*
+		 * These should never happen.
+		 * They would indicate stale host TLB entries.
+		 */
+		if (unlikely(index < 0)) {
 			run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
-			ret = RESUME_HOST;
+			return RESUME_HOST;
 		}
-	} else if (KVM_GUEST_KSEGX(badvaddr) == KVM_GUEST_KSEG0) {
+		tlb = vcpu->arch.guest_tlb + index;
+		if (unlikely(!TLB_IS_VALID(*tlb, badvaddr))) {
+			run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+			return RESUME_HOST;
+		}
+
 		/*
-		 * XXXKYMA: The guest kernel does not expect to get this fault
-		 * when we are not using HIGHMEM. Need to address this in a
-		 * HIGHMEM kernel
+		 * Guest entry not dirty? That would explain the TLB modified
+		 * exception. Relay that on to the guest so it can handle it.
 		 */
-		kvm_err("TLB MOD fault not handled, cause %#x, PC: %p, BadVaddr: %#lx\n",
-			cause, opc, badvaddr);
-		kvm_mips_dump_host_tlbs();
-		kvm_arch_vcpu_dump_regs(vcpu);
-		run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
-		ret = RESUME_HOST;
+		if (!TLB_IS_DIRTY(*tlb, badvaddr)) {
+			kvm_mips_emulate_tlbmod(cause, opc, run, vcpu);
+			return RESUME_GUEST;
+		}
+
+		if (kvm_mips_handle_mapped_seg_tlb_fault(vcpu, tlb, badvaddr,
+							 true))
+			/* Not writable, needs handling as MMIO */
+			return kvm_mips_bad_store(cause, opc, run, vcpu);
+		return RESUME_GUEST;
+	} else if (KVM_GUEST_KSEGX(badvaddr) == KVM_GUEST_KSEG0) {
+		if (kvm_mips_handle_kseg0_tlb_fault(badvaddr, vcpu, true) < 0)
+			/* Not writable, needs handling as MMIO */
+			return kvm_mips_bad_store(cause, opc, run, vcpu);
+		return RESUME_GUEST;
 	} else {
-		kvm_err("Illegal TLB Mod fault address , cause %#x, PC: %p, BadVaddr: %#lx\n",
-			cause, opc, badvaddr);
-		kvm_mips_dump_host_tlbs();
-		kvm_arch_vcpu_dump_regs(vcpu);
-		run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
-		ret = RESUME_HOST;
+		/* host kernel addresses are all handled as MMIO */
+		return kvm_mips_bad_store(cause, opc, run, vcpu);
 	}
-	return ret;
 }
 
 static int kvm_trap_emul_handle_tlb_miss(struct kvm_vcpu *vcpu, bool store)
@@ -157,7 +248,7 @@ static int kvm_trap_emul_handle_tlb_miss(struct kvm_vcpu *vcpu, bool store)
 		 *     into the shadow host TLB
 		 */
 
-		er = kvm_mips_handle_tlbmiss(cause, opc, run, vcpu);
+		er = kvm_mips_handle_tlbmiss(cause, opc, run, vcpu, store);
 		if (er == EMULATE_DONE)
 			ret = RESUME_GUEST;
 		else {
@@ -169,29 +260,15 @@ static int kvm_trap_emul_handle_tlb_miss(struct kvm_vcpu *vcpu, bool store)
 		 * All KSEG0 faults are handled by KVM, as the guest kernel does
 		 * not expect to ever get them
 		 */
-		if (kvm_mips_handle_kseg0_tlb_fault
-		    (vcpu->arch.host_cp0_badvaddr, vcpu) < 0) {
-			run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
-			ret = RESUME_HOST;
-		}
+		if (kvm_mips_handle_kseg0_tlb_fault(badvaddr, vcpu, store) < 0)
+			ret = kvm_mips_bad_access(cause, opc, run, vcpu, store);
 	} else if (KVM_GUEST_KERNEL_MODE(vcpu)
 		   && (KSEGX(badvaddr) == CKSEG0 || KSEGX(badvaddr) == CKSEG1)) {
 		/*
 		 * With EVA we may get a TLB exception instead of an address
 		 * error when the guest performs MMIO to KSeg1 addresses.
 		 */
-		kvm_debug("Emulate %s MMIO space\n",
-			  store ? "Store to" : "Load from");
-		er = kvm_mips_emulate_inst(cause, opc, run, vcpu);
-		if (er == EMULATE_FAIL) {
-			kvm_err("Emulate %s MMIO space failed\n",
-				store ? "Store to" : "Load from");
-			run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
-			ret = RESUME_HOST;
-		} else {
-			run->exit_reason = KVM_EXIT_MMIO;
-			ret = RESUME_HOST;
-		}
+		ret = kvm_mips_bad_access(cause, opc, run, vcpu, store);
 	} else {
 		kvm_err("Illegal TLB %s fault address , cause %#x, PC: %p, BadVaddr: %#lx\n",
 			store ? "ST" : "LD", cause, opc, badvaddr);
@@ -219,21 +296,11 @@ static int kvm_trap_emul_handle_addr_err_st(struct kvm_vcpu *vcpu)
 	u32 __user *opc = (u32 __user *) vcpu->arch.pc;
 	unsigned long badvaddr = vcpu->arch.host_cp0_badvaddr;
 	u32 cause = vcpu->arch.host_cp0_cause;
-	enum emulation_result er = EMULATE_DONE;
 	int ret = RESUME_GUEST;
 
 	if (KVM_GUEST_KERNEL_MODE(vcpu)
 	    && (KSEGX(badvaddr) == CKSEG0 || KSEGX(badvaddr) == CKSEG1)) {
-		kvm_debug("Emulate Store to MMIO space\n");
-		er = kvm_mips_emulate_inst(cause, opc, run, vcpu);
-		if (er == EMULATE_FAIL) {
-			kvm_err("Emulate Store to MMIO space failed\n");
-			run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
-			ret = RESUME_HOST;
-		} else {
-			run->exit_reason = KVM_EXIT_MMIO;
-			ret = RESUME_HOST;
-		}
+		ret = kvm_mips_bad_store(cause, opc, run, vcpu);
 	} else {
 		kvm_err("Address Error (STORE): cause %#x, PC: %p, BadVaddr: %#lx\n",
 			cause, opc, badvaddr);
@@ -249,26 +316,15 @@ static int kvm_trap_emul_handle_addr_err_ld(struct kvm_vcpu *vcpu)
 	u32 __user *opc = (u32 __user *) vcpu->arch.pc;
 	unsigned long badvaddr = vcpu->arch.host_cp0_badvaddr;
 	u32 cause = vcpu->arch.host_cp0_cause;
-	enum emulation_result er = EMULATE_DONE;
 	int ret = RESUME_GUEST;
 
 	if (KSEGX(badvaddr) == CKSEG0 || KSEGX(badvaddr) == CKSEG1) {
-		kvm_debug("Emulate Load from MMIO space @ %#lx\n", badvaddr);
-		er = kvm_mips_emulate_inst(cause, opc, run, vcpu);
-		if (er == EMULATE_FAIL) {
-			kvm_err("Emulate Load from MMIO space failed\n");
-			run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
-			ret = RESUME_HOST;
-		} else {
-			run->exit_reason = KVM_EXIT_MMIO;
-			ret = RESUME_HOST;
-		}
+		ret = kvm_mips_bad_load(cause, opc, run, vcpu);
 	} else {
 		kvm_err("Address Error (LOAD): cause %#x, PC: %p, BadVaddr: %#lx\n",
 			cause, opc, badvaddr);
 		run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
 		ret = RESUME_HOST;
-		er = EMULATE_FAIL;
 	}
 	return ret;
 }
@@ -428,16 +484,75 @@ static int kvm_trap_emul_handle_msa_disabled(struct kvm_vcpu *vcpu)
 	return ret;
 }
 
-static int kvm_trap_emul_vm_init(struct kvm *kvm)
+static int kvm_trap_emul_vcpu_init(struct kvm_vcpu *vcpu)
 {
+	struct mm_struct *kern_mm = &vcpu->arch.guest_kernel_mm;
+	struct mm_struct *user_mm = &vcpu->arch.guest_user_mm;
+
+	/*
+	 * Allocate GVA -> HPA page tables.
+	 * MIPS doesn't use the mm_struct pointer argument.
+	 */
+	kern_mm->pgd = pgd_alloc(kern_mm);
+	if (!kern_mm->pgd)
+		return -ENOMEM;
+
+	user_mm->pgd = pgd_alloc(user_mm);
+	if (!user_mm->pgd) {
+		pgd_free(kern_mm, kern_mm->pgd);
+		return -ENOMEM;
+	}
+
 	return 0;
 }
 
-static int kvm_trap_emul_vcpu_init(struct kvm_vcpu *vcpu)
+static void kvm_mips_emul_free_gva_pt(pgd_t *pgd)
 {
-	vcpu->arch.kscratch_enabled = 0xfc;
+	/* Don't free host kernel page tables copied from init_mm.pgd */
+	const unsigned long end = 0x80000000;
+	unsigned long pgd_va, pud_va, pmd_va;
+	pud_t *pud;
+	pmd_t *pmd;
+	pte_t *pte;
+	int i, j, k;
+
+	for (i = 0; i < USER_PTRS_PER_PGD; i++) {
+		if (pgd_none(pgd[i]))
+			continue;
+
+		pgd_va = (unsigned long)i << PGDIR_SHIFT;
+		if (pgd_va >= end)
+			break;
+		pud = pud_offset(pgd + i, 0);
+		for (j = 0; j < PTRS_PER_PUD; j++) {
+			if (pud_none(pud[j]))
+				continue;
+
+			pud_va = pgd_va | ((unsigned long)j << PUD_SHIFT);
+			if (pud_va >= end)
+				break;
+			pmd = pmd_offset(pud + j, 0);
+			for (k = 0; k < PTRS_PER_PMD; k++) {
+				if (pmd_none(pmd[k]))
+					continue;
+
+				pmd_va = pud_va | (k << PMD_SHIFT);
+				if (pmd_va >= end)
+					break;
+				pte = pte_offset(pmd + k, 0);
+				pte_free_kernel(NULL, pte);
+			}
+			pmd_free(NULL, pmd);
+		}
+		pud_free(NULL, pud);
+	}
+	pgd_free(NULL, pgd);
+}
 
-	return 0;
+static void kvm_trap_emul_vcpu_uninit(struct kvm_vcpu *vcpu)
+{
+	kvm_mips_emul_free_gva_pt(vcpu->arch.guest_kernel_mm.pgd);
+	kvm_mips_emul_free_gva_pt(vcpu->arch.guest_user_mm.pgd);
 }
 
 static int kvm_trap_emul_vcpu_setup(struct kvm_vcpu *vcpu)
@@ -499,6 +614,9 @@ static int kvm_trap_emul_vcpu_setup(struct kvm_vcpu *vcpu)
 	/* Set Wait IE/IXMT Ignore in Config7, IAR, AR */
 	kvm_write_c0_guest_config7(cop0, (MIPS_CONF7_WII) | (1 << 10));
 
+	/* Status */
+	kvm_write_c0_guest_status(cop0, ST0_BEV | ST0_ERL);
+
 	/*
 	 * Setup IntCtl defaults, compatibility mode for timer interrupts (HW5)
 	 */
@@ -508,17 +626,76 @@ static int kvm_trap_emul_vcpu_setup(struct kvm_vcpu *vcpu)
 	kvm_write_c0_guest_ebase(cop0, KVM_GUEST_KSEG0 |
 				       (vcpu_id & MIPS_EBASE_CPUNUM));
 
+	/* Put PC at guest reset vector */
+	vcpu->arch.pc = KVM_GUEST_CKSEG1ADDR(0x1fc00000);
+
 	return 0;
 }
 
+static void kvm_trap_emul_flush_shadow_all(struct kvm *kvm)
+{
+	/* Flush GVA page tables and invalidate GVA ASIDs on all VCPUs */
+	kvm_flush_remote_tlbs(kvm);
+}
+
+static void kvm_trap_emul_flush_shadow_memslot(struct kvm *kvm,
+					const struct kvm_memory_slot *slot)
+{
+	kvm_trap_emul_flush_shadow_all(kvm);
+}
+
+static u64 kvm_trap_emul_get_one_regs[] = {
+	KVM_REG_MIPS_CP0_INDEX,
+	KVM_REG_MIPS_CP0_ENTRYLO0,
+	KVM_REG_MIPS_CP0_ENTRYLO1,
+	KVM_REG_MIPS_CP0_CONTEXT,
+	KVM_REG_MIPS_CP0_USERLOCAL,
+	KVM_REG_MIPS_CP0_PAGEMASK,
+	KVM_REG_MIPS_CP0_WIRED,
+	KVM_REG_MIPS_CP0_HWRENA,
+	KVM_REG_MIPS_CP0_BADVADDR,
+	KVM_REG_MIPS_CP0_COUNT,
+	KVM_REG_MIPS_CP0_ENTRYHI,
+	KVM_REG_MIPS_CP0_COMPARE,
+	KVM_REG_MIPS_CP0_STATUS,
+	KVM_REG_MIPS_CP0_INTCTL,
+	KVM_REG_MIPS_CP0_CAUSE,
+	KVM_REG_MIPS_CP0_EPC,
+	KVM_REG_MIPS_CP0_PRID,
+	KVM_REG_MIPS_CP0_EBASE,
+	KVM_REG_MIPS_CP0_CONFIG,
+	KVM_REG_MIPS_CP0_CONFIG1,
+	KVM_REG_MIPS_CP0_CONFIG2,
+	KVM_REG_MIPS_CP0_CONFIG3,
+	KVM_REG_MIPS_CP0_CONFIG4,
+	KVM_REG_MIPS_CP0_CONFIG5,
+	KVM_REG_MIPS_CP0_CONFIG7,
+	KVM_REG_MIPS_CP0_ERROREPC,
+	KVM_REG_MIPS_CP0_KSCRATCH1,
+	KVM_REG_MIPS_CP0_KSCRATCH2,
+	KVM_REG_MIPS_CP0_KSCRATCH3,
+	KVM_REG_MIPS_CP0_KSCRATCH4,
+	KVM_REG_MIPS_CP0_KSCRATCH5,
+	KVM_REG_MIPS_CP0_KSCRATCH6,
+
+	KVM_REG_MIPS_COUNT_CTL,
+	KVM_REG_MIPS_COUNT_RESUME,
+	KVM_REG_MIPS_COUNT_HZ,
+};
+
 static unsigned long kvm_trap_emul_num_regs(struct kvm_vcpu *vcpu)
 {
-	return 0;
+	return ARRAY_SIZE(kvm_trap_emul_get_one_regs);
 }
 
 static int kvm_trap_emul_copy_reg_indices(struct kvm_vcpu *vcpu,
 					  u64 __user *indices)
 {
+	if (copy_to_user(indices, kvm_trap_emul_get_one_regs,
+			 sizeof(kvm_trap_emul_get_one_regs)))
+		return -EFAULT;
+	indices += ARRAY_SIZE(kvm_trap_emul_get_one_regs);
+
 	return 0;
 }
 
@@ -526,7 +703,81 @@ static int kvm_trap_emul_get_one_reg(struct kvm_vcpu *vcpu,
 				     const struct kvm_one_reg *reg,
 				     s64 *v)
 {
+	struct mips_coproc *cop0 = vcpu->arch.cop0;
+
 	switch (reg->id) {
+	case KVM_REG_MIPS_CP0_INDEX:
+		*v = (long)kvm_read_c0_guest_index(cop0);
+		break;
+	case KVM_REG_MIPS_CP0_ENTRYLO0:
+		*v = kvm_read_c0_guest_entrylo0(cop0);
+		break;
+	case KVM_REG_MIPS_CP0_ENTRYLO1:
+		*v = kvm_read_c0_guest_entrylo1(cop0);
+		break;
+	case KVM_REG_MIPS_CP0_CONTEXT:
+		*v = (long)kvm_read_c0_guest_context(cop0);
+		break;
+	case KVM_REG_MIPS_CP0_USERLOCAL:
+		*v = (long)kvm_read_c0_guest_userlocal(cop0);
+		break;
+	case KVM_REG_MIPS_CP0_PAGEMASK:
+		*v = (long)kvm_read_c0_guest_pagemask(cop0);
+		break;
+	case KVM_REG_MIPS_CP0_WIRED:
+		*v = (long)kvm_read_c0_guest_wired(cop0);
+		break;
+	case KVM_REG_MIPS_CP0_HWRENA:
+		*v = (long)kvm_read_c0_guest_hwrena(cop0);
+		break;
+	case KVM_REG_MIPS_CP0_BADVADDR:
+		*v = (long)kvm_read_c0_guest_badvaddr(cop0);
+		break;
+	case KVM_REG_MIPS_CP0_ENTRYHI:
+		*v = (long)kvm_read_c0_guest_entryhi(cop0);
+		break;
+	case KVM_REG_MIPS_CP0_COMPARE:
+		*v = (long)kvm_read_c0_guest_compare(cop0);
+		break;
+	case KVM_REG_MIPS_CP0_STATUS:
+		*v = (long)kvm_read_c0_guest_status(cop0);
+		break;
+	case KVM_REG_MIPS_CP0_INTCTL:
+		*v = (long)kvm_read_c0_guest_intctl(cop0);
+		break;
+	case KVM_REG_MIPS_CP0_CAUSE:
+		*v = (long)kvm_read_c0_guest_cause(cop0);
+		break;
+	case KVM_REG_MIPS_CP0_EPC:
+		*v = (long)kvm_read_c0_guest_epc(cop0);
+		break;
+	case KVM_REG_MIPS_CP0_PRID:
+		*v = (long)kvm_read_c0_guest_prid(cop0);
+		break;
+	case KVM_REG_MIPS_CP0_EBASE:
+		*v = (long)kvm_read_c0_guest_ebase(cop0);
+		break;
+	case KVM_REG_MIPS_CP0_CONFIG:
+		*v = (long)kvm_read_c0_guest_config(cop0);
+		break;
+	case KVM_REG_MIPS_CP0_CONFIG1:
+		*v = (long)kvm_read_c0_guest_config1(cop0);
+		break;
+	case KVM_REG_MIPS_CP0_CONFIG2:
+		*v = (long)kvm_read_c0_guest_config2(cop0);
+		break;
+	case KVM_REG_MIPS_CP0_CONFIG3:
+		*v = (long)kvm_read_c0_guest_config3(cop0);
+		break;
+	case KVM_REG_MIPS_CP0_CONFIG4:
+		*v = (long)kvm_read_c0_guest_config4(cop0);
+		break;
+	case KVM_REG_MIPS_CP0_CONFIG5:
+		*v = (long)kvm_read_c0_guest_config5(cop0);
+		break;
+	case KVM_REG_MIPS_CP0_CONFIG7:
+		*v = (long)kvm_read_c0_guest_config7(cop0);
+		break;
 	case KVM_REG_MIPS_CP0_COUNT:
 		*v = kvm_mips_read_count(vcpu);
 		break;
@@ -539,6 +790,27 @@ static int kvm_trap_emul_get_one_reg(struct kvm_vcpu *vcpu,
 	case KVM_REG_MIPS_COUNT_HZ:
 		*v = vcpu->arch.count_hz;
 		break;
+	case KVM_REG_MIPS_CP0_ERROREPC:
+		*v = (long)kvm_read_c0_guest_errorepc(cop0);
+		break;
+	case KVM_REG_MIPS_CP0_KSCRATCH1:
+		*v = (long)kvm_read_c0_guest_kscratch1(cop0);
+		break;
+	case KVM_REG_MIPS_CP0_KSCRATCH2:
+		*v = (long)kvm_read_c0_guest_kscratch2(cop0);
+		break;
+	case KVM_REG_MIPS_CP0_KSCRATCH3:
+		*v = (long)kvm_read_c0_guest_kscratch3(cop0);
+		break;
+	case KVM_REG_MIPS_CP0_KSCRATCH4:
+		*v = (long)kvm_read_c0_guest_kscratch4(cop0);
+		break;
+	case KVM_REG_MIPS_CP0_KSCRATCH5:
+		*v = (long)kvm_read_c0_guest_kscratch5(cop0);
+		break;
+	case KVM_REG_MIPS_CP0_KSCRATCH6:
+		*v = (long)kvm_read_c0_guest_kscratch6(cop0);
+		break;
 	default:
 		return -EINVAL;
 	}
@@ -554,6 +826,56 @@ static int kvm_trap_emul_set_one_reg(struct kvm_vcpu *vcpu,
 	unsigned int cur, change;
 
 	switch (reg->id) {
+	case KVM_REG_MIPS_CP0_INDEX:
+		kvm_write_c0_guest_index(cop0, v);
+		break;
+	case KVM_REG_MIPS_CP0_ENTRYLO0:
+		kvm_write_c0_guest_entrylo0(cop0, v);
+		break;
+	case KVM_REG_MIPS_CP0_ENTRYLO1:
+		kvm_write_c0_guest_entrylo1(cop0, v);
+		break;
+	case KVM_REG_MIPS_CP0_CONTEXT:
+		kvm_write_c0_guest_context(cop0, v);
+		break;
+	case KVM_REG_MIPS_CP0_USERLOCAL:
+		kvm_write_c0_guest_userlocal(cop0, v);
+		break;
+	case KVM_REG_MIPS_CP0_PAGEMASK:
+		kvm_write_c0_guest_pagemask(cop0, v);
+		break;
+	case KVM_REG_MIPS_CP0_WIRED:
+		kvm_write_c0_guest_wired(cop0, v);
+		break;
+	case KVM_REG_MIPS_CP0_HWRENA:
+		kvm_write_c0_guest_hwrena(cop0, v);
+		break;
+	case KVM_REG_MIPS_CP0_BADVADDR:
+		kvm_write_c0_guest_badvaddr(cop0, v);
+		break;
+	case KVM_REG_MIPS_CP0_ENTRYHI:
+		kvm_write_c0_guest_entryhi(cop0, v);
+		break;
+	case KVM_REG_MIPS_CP0_STATUS:
+		kvm_write_c0_guest_status(cop0, v);
+		break;
+	case KVM_REG_MIPS_CP0_INTCTL:
+		/* No VInt, so no VS, read-only for now */
+		break;
+	case KVM_REG_MIPS_CP0_EPC:
+		kvm_write_c0_guest_epc(cop0, v);
+		break;
+	case KVM_REG_MIPS_CP0_PRID:
+		kvm_write_c0_guest_prid(cop0, v);
+		break;
+	case KVM_REG_MIPS_CP0_EBASE:
+		/*
+		 * Allow core number to be written, but the exception base must
+		 * remain in guest KSeg0.
+		 */
+		kvm_change_c0_guest_ebase(cop0, 0x1ffff000 | MIPS_EBASE_CPUNUM,
+					  v);
+		break;
 	case KVM_REG_MIPS_CP0_COUNT:
 		kvm_mips_write_count(vcpu, v);
 		break;
@@ -618,6 +940,9 @@ static int kvm_trap_emul_set_one_reg(struct kvm_vcpu *vcpu,
 			kvm_write_c0_guest_config5(cop0, v);
 		}
 		break;
+	case KVM_REG_MIPS_CP0_CONFIG7:
+		/* writes ignored */
+		break;
 	case KVM_REG_MIPS_COUNT_CTL:
 		ret = kvm_mips_set_count_ctl(vcpu, v);
 		break;
@@ -627,24 +952,269 @@ static int kvm_trap_emul_set_one_reg(struct kvm_vcpu *vcpu,
 	case KVM_REG_MIPS_COUNT_HZ:
 		ret = kvm_mips_set_count_hz(vcpu, v);
 		break;
+	case KVM_REG_MIPS_CP0_ERROREPC:
+		kvm_write_c0_guest_errorepc(cop0, v);
+		break;
+	case KVM_REG_MIPS_CP0_KSCRATCH1:
+		kvm_write_c0_guest_kscratch1(cop0, v);
+		break;
+	case KVM_REG_MIPS_CP0_KSCRATCH2:
+		kvm_write_c0_guest_kscratch2(cop0, v);
+		break;
+	case KVM_REG_MIPS_CP0_KSCRATCH3:
+		kvm_write_c0_guest_kscratch3(cop0, v);
+		break;
+	case KVM_REG_MIPS_CP0_KSCRATCH4:
+		kvm_write_c0_guest_kscratch4(cop0, v);
+		break;
+	case KVM_REG_MIPS_CP0_KSCRATCH5:
+		kvm_write_c0_guest_kscratch5(cop0, v);
+		break;
+	case KVM_REG_MIPS_CP0_KSCRATCH6:
+		kvm_write_c0_guest_kscratch6(cop0, v);
+		break;
 	default:
 		return -EINVAL;
 	}
 	return ret;
 }
 
-static int kvm_trap_emul_vcpu_get_regs(struct kvm_vcpu *vcpu)
+static int kvm_trap_emul_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 {
-	kvm_lose_fpu(vcpu);
+	struct mm_struct *kern_mm = &vcpu->arch.guest_kernel_mm;
+	struct mm_struct *user_mm = &vcpu->arch.guest_user_mm;
+	struct mm_struct *mm;
+
+	/*
+	 * Were we in guest context? If so, restore the appropriate ASID based
+	 * on the mode of the Guest (Kernel/User).
+	 */
+	if (current->flags & PF_VCPU) {
+		mm = KVM_GUEST_KERNEL_MODE(vcpu) ? kern_mm : user_mm;
+		if ((cpu_context(cpu, mm) ^ asid_cache(cpu)) &
+		    asid_version_mask(cpu))
+			get_new_mmu_context(mm, cpu);
+		write_c0_entryhi(cpu_asid(cpu, mm));
+		TLBMISS_HANDLER_SETUP_PGD(mm->pgd);
+		kvm_mips_suspend_mm(cpu);
+		ehb();
+	}
 
 	return 0;
 }
 
-static int kvm_trap_emul_vcpu_set_regs(struct kvm_vcpu *vcpu)
+static int kvm_trap_emul_vcpu_put(struct kvm_vcpu *vcpu, int cpu)
 {
+	kvm_lose_fpu(vcpu);
+
+	if (current->flags & PF_VCPU) {
+		/* Restore normal Linux process memory map */
+		if (((cpu_context(cpu, current->mm) ^ asid_cache(cpu)) &
+		     asid_version_mask(cpu)))
+			get_new_mmu_context(current->mm, cpu);
+		write_c0_entryhi(cpu_asid(cpu, current->mm));
+		TLBMISS_HANDLER_SETUP_PGD(current->mm->pgd);
+		kvm_mips_resume_mm(cpu);
+		ehb();
+	}
+
 	return 0;
 }
 
+static void kvm_trap_emul_check_requests(struct kvm_vcpu *vcpu, int cpu,
+					 bool reload_asid)
+{
+	struct mm_struct *kern_mm = &vcpu->arch.guest_kernel_mm;
+	struct mm_struct *user_mm = &vcpu->arch.guest_user_mm;
+	struct mm_struct *mm;
+	int i;
+
+	if (likely(!vcpu->requests))
+		return;
+
+	if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu)) {
+		/*
+		 * Both kernel & user GVA mappings must be invalidated. The
+		 * caller is just about to check whether the ASID is stale
+		 * anyway so no need to reload it here.
+		 */
+		kvm_mips_flush_gva_pt(kern_mm->pgd, KMF_GPA | KMF_KERN);
+		kvm_mips_flush_gva_pt(user_mm->pgd, KMF_GPA | KMF_USER);
+		for_each_possible_cpu(i) {
+			cpu_context(i, kern_mm) = 0;
+			cpu_context(i, user_mm) = 0;
+		}
+
+		/* Generate new ASID for current mode */
+		if (reload_asid) {
+			mm = KVM_GUEST_KERNEL_MODE(vcpu) ? kern_mm : user_mm;
+			get_new_mmu_context(mm, cpu);
+			htw_stop();
+			write_c0_entryhi(cpu_asid(cpu, mm));
+			TLBMISS_HANDLER_SETUP_PGD(mm->pgd);
+			htw_start();
+		}
+	}
+}
+
+/**
+ * kvm_trap_emul_gva_lockless_begin() - Begin lockless access to GVA space.
+ * @vcpu:	VCPU pointer.
+ *
+ * Call before a GVA space access outside of guest mode, to ensure that
+ * asynchronous TLB flush requests are handled or delayed until completion of
+ * the GVA access (as indicated by a matching kvm_trap_emul_gva_lockless_end()).
+ *
+ * Should be called with IRQs already enabled.
+ */
+void kvm_trap_emul_gva_lockless_begin(struct kvm_vcpu *vcpu)
+{
+	/* We re-enable IRQs in kvm_trap_emul_gva_lockless_end() */
+	WARN_ON_ONCE(irqs_disabled());
+
+	/*
+	 * The caller is about to access the GVA space, so we set the mode to
+	 * force TLB flush requests to send an IPI, and also disable IRQs to
+	 * delay IPI handling until kvm_trap_emul_gva_lockless_end().
+	 */
+	local_irq_disable();
+
+	/*
+	 * Make sure the read of VCPU requests is not reordered ahead of the
+	 * write to vcpu->mode, or we could miss a TLB flush request while
+	 * the requester sees the VCPU as outside of guest mode and not needing
+	 * an IPI.
+	 */
+	smp_store_mb(vcpu->mode, READING_SHADOW_PAGE_TABLES);
+
+	/*
+	 * If a TLB flush has been requested (potentially while
+	 * OUTSIDE_GUEST_MODE and assumed immediately effective), perform it
+	 * before accessing the GVA space, and be sure to reload the ASID if
+	 * necessary as it'll be immediately used.
+	 *
+	 * TLB flush requests after this check will trigger an IPI due to the
+	 * mode change above, which will be delayed due to IRQs disabled.
+	 */
+	kvm_trap_emul_check_requests(vcpu, smp_processor_id(), true);
+}
+
+/**
+ * kvm_trap_emul_gva_lockless_end() - End lockless access to GVA space.
+ * @vcpu:	VCPU pointer.
+ *
+ * Called after a GVA space access outside of guest mode. Should have a matching
+ * call to kvm_trap_emul_gva_lockless_begin().
+ */
+void kvm_trap_emul_gva_lockless_end(struct kvm_vcpu *vcpu)
+{
+	/*
+	 * Make sure the write to vcpu->mode is not reordered in front of GVA
+	 * accesses, or a TLB flush requester may not think it necessary to send
+	 * an IPI.
+	 */
+	smp_store_release(&vcpu->mode, OUTSIDE_GUEST_MODE);
+
+	/*
+	 * Now that the access to GVA space is complete, its safe for pending
+	 * TLB flush request IPIs to be handled (which indicates completion).
+	 */
+	local_irq_enable();
+}
+
+static void kvm_trap_emul_vcpu_reenter(struct kvm_run *run,
+				       struct kvm_vcpu *vcpu)
+{
+	struct mm_struct *kern_mm = &vcpu->arch.guest_kernel_mm;
+	struct mm_struct *user_mm = &vcpu->arch.guest_user_mm;
+	struct mm_struct *mm;
+	struct mips_coproc *cop0 = vcpu->arch.cop0;
+	int i, cpu = smp_processor_id();
+	unsigned int gasid;
+
+	/*
+	 * No need to reload ASID, IRQs are disabled already so there's no rush,
+	 * and we'll check if we need to regenerate below anyway before
+	 * re-entering the guest.
+	 */
+	kvm_trap_emul_check_requests(vcpu, cpu, false);
+
+	if (KVM_GUEST_KERNEL_MODE(vcpu)) {
+		mm = kern_mm;
+	} else {
+		mm = user_mm;
+
+		/*
+		 * Lazy host ASID regeneration / PT flush for guest user mode.
+		 * If the guest ASID has changed since the last guest usermode
+		 * execution, invalidate the stale TLB entries and flush GVA PT
+		 * entries too.
+		 */
+		gasid = kvm_read_c0_guest_entryhi(cop0) & KVM_ENTRYHI_ASID;
+		if (gasid != vcpu->arch.last_user_gasid) {
+			kvm_mips_flush_gva_pt(user_mm->pgd, KMF_USER);
+			for_each_possible_cpu(i)
+				cpu_context(i, user_mm) = 0;
+			vcpu->arch.last_user_gasid = gasid;
+		}
+	}
+
+	/*
+	 * Check if ASID is stale. This may happen due to a TLB flush request or
+	 * a lazy user MM invalidation.
+	 */
+	if ((cpu_context(cpu, mm) ^ asid_cache(cpu)) &
+	    asid_version_mask(cpu))
+		get_new_mmu_context(mm, cpu);
+}
+
+static int kvm_trap_emul_vcpu_run(struct kvm_run *run, struct kvm_vcpu *vcpu)
+{
+	int cpu = smp_processor_id();
+	int r;
+
+	/* Check if we have any exceptions/interrupts pending */
+	kvm_mips_deliver_interrupts(vcpu,
+				    kvm_read_c0_guest_cause(vcpu->arch.cop0));
+
+	kvm_trap_emul_vcpu_reenter(run, vcpu);
+
+	/*
+	 * We use user accessors to access guest memory, but we don't want to
+	 * invoke Linux page faulting.
+	 */
+	pagefault_disable();
+
+	/* Disable hardware page table walking while in guest */
+	htw_stop();
+
+	/*
+	 * While in guest context we're in the guest's address space, not the
+	 * host process address space, so we need to be careful not to confuse
+	 * e.g. cache management IPIs.
+	 */
+	kvm_mips_suspend_mm(cpu);
+
+	r = vcpu->arch.vcpu_run(run, vcpu);
+
+	/* We may have migrated while handling guest exits */
+	cpu = smp_processor_id();
+
+	/* Restore normal Linux process memory map */
+	if (((cpu_context(cpu, current->mm) ^ asid_cache(cpu)) &
+	     asid_version_mask(cpu)))
+		get_new_mmu_context(current->mm, cpu);
+	write_c0_entryhi(cpu_asid(cpu, current->mm));
+	TLBMISS_HANDLER_SETUP_PGD(current->mm->pgd);
+	kvm_mips_resume_mm(cpu);
+
+	htw_start();
+
+	pagefault_enable();
+
+	return r;
+}
+
 static struct kvm_mips_callbacks kvm_trap_emul_callbacks = {
 	/* exit handlers */
 	.handle_cop_unusable = kvm_trap_emul_handle_cop_unusable,
@@ -661,9 +1231,11 @@ static struct kvm_mips_callbacks kvm_trap_emul_callbacks = {
 	.handle_fpe = kvm_trap_emul_handle_fpe,
 	.handle_msa_disabled = kvm_trap_emul_handle_msa_disabled,
 
-	.vm_init = kvm_trap_emul_vm_init,
 	.vcpu_init = kvm_trap_emul_vcpu_init,
+	.vcpu_uninit = kvm_trap_emul_vcpu_uninit,
 	.vcpu_setup = kvm_trap_emul_vcpu_setup,
+	.flush_shadow_all = kvm_trap_emul_flush_shadow_all,
+	.flush_shadow_memslot = kvm_trap_emul_flush_shadow_memslot,
 	.gva_to_gpa = kvm_trap_emul_gva_to_gpa_cb,
 	.queue_timer_int = kvm_mips_queue_timer_int_cb,
 	.dequeue_timer_int = kvm_mips_dequeue_timer_int_cb,
@@ -675,8 +1247,10 @@ static struct kvm_mips_callbacks kvm_trap_emul_callbacks = {
 	.copy_reg_indices = kvm_trap_emul_copy_reg_indices,
 	.get_one_reg = kvm_trap_emul_get_one_reg,
 	.set_one_reg = kvm_trap_emul_set_one_reg,
-	.vcpu_get_regs = kvm_trap_emul_vcpu_get_regs,
-	.vcpu_set_regs = kvm_trap_emul_vcpu_set_regs,
+	.vcpu_load = kvm_trap_emul_vcpu_load,
+	.vcpu_put = kvm_trap_emul_vcpu_put,
+	.vcpu_run = kvm_trap_emul_vcpu_run,
+	.vcpu_reenter = kvm_trap_emul_vcpu_reenter,
 };
 
 int kvm_mips_emulation_init(struct kvm_mips_callbacks **install_callbacks)
diff --git a/arch/mips/mm/Makefile b/arch/mips/mm/Makefile
index b4c64bd3f723..b4cc8811a664 100644
--- a/arch/mips/mm/Makefile
+++ b/arch/mips/mm/Makefile
@@ -4,7 +4,7 @@
 
 obj-y				+= cache.o dma-default.o extable.o fault.o \
 				   gup.o init.o mmap.o page.o page-funcs.o \
-				   tlbex.o tlbex-fault.o tlb-funcs.o
+				   pgtable.o tlbex.o tlbex-fault.o tlb-funcs.o
 
 ifdef CONFIG_CPU_MICROMIPS
 obj-y				+= uasm-micromips.o
diff --git a/arch/mips/mm/init.c b/arch/mips/mm/init.c
index 9d1d54b8e1e8..aa75849c36bc 100644
--- a/arch/mips/mm/init.c
+++ b/arch/mips/mm/init.c
@@ -539,6 +539,7 @@ unsigned long pgd_current[NR_CPUS];
 pgd_t swapper_pg_dir[_PTRS_PER_PGD] __section(.bss..swapper_pg_dir);
 #ifndef __PAGETABLE_PMD_FOLDED
 pmd_t invalid_pmd_table[PTRS_PER_PMD] __page_aligned_bss;
+EXPORT_SYMBOL_GPL(invalid_pmd_table);
 #endif
 pte_t invalid_pte_table[PTRS_PER_PTE] __page_aligned_bss;
 EXPORT_SYMBOL(invalid_pte_table);
diff --git a/arch/mips/mm/pgtable-64.c b/arch/mips/mm/pgtable-64.c
index ce4473e7c0d2..0ae7b28b4db5 100644
--- a/arch/mips/mm/pgtable-64.c
+++ b/arch/mips/mm/pgtable-64.c
@@ -6,6 +6,7 @@
  * Copyright (C) 1999, 2000 by Silicon Graphics
  * Copyright (C) 2003 by Ralf Baechle
  */
+#include <linux/export.h>
 #include <linux/init.h>
 #include <linux/mm.h>
 #include <asm/fixmap.h>
@@ -60,6 +61,7 @@ void pmd_init(unsigned long addr, unsigned long pagetable)
 		p[-1] = pagetable;
 	} while (p != end);
 }
+EXPORT_SYMBOL_GPL(pmd_init);
 #endif
 
 pmd_t mk_pmd(struct page *page, pgprot_t prot)
diff --git a/arch/mips/mm/pgtable.c b/arch/mips/mm/pgtable.c
new file mode 100644
index 000000000000..05560b042d82
--- /dev/null
+++ b/arch/mips/mm/pgtable.c
@@ -0,0 +1,25 @@
+/*
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License.  See the file "COPYING" in the main directory of this archive
+ * for more details.
+ */
+#include <linux/export.h>
+#include <linux/mm.h>
+#include <linux/string.h>
+#include <asm/pgalloc.h>
+
+pgd_t *pgd_alloc(struct mm_struct *mm)
+{
+	pgd_t *ret, *init;
+
+	ret = (pgd_t *) __get_free_pages(GFP_KERNEL, PGD_ORDER);
+	if (ret) {
+		init = pgd_offset(&init_mm, 0UL);
+		pgd_init((unsigned long)ret);
+		memcpy(ret + USER_PTRS_PER_PGD, init + USER_PTRS_PER_PGD,
+		       (PTRS_PER_PGD - USER_PTRS_PER_PGD) * sizeof(pgd_t));
+	}
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(pgd_alloc);
diff --git a/arch/mips/mm/tlbex.c b/arch/mips/mm/tlbex.c
index 87eed65660f5..9bfee8988eaf 100644
--- a/arch/mips/mm/tlbex.c
+++ b/arch/mips/mm/tlbex.c
@@ -22,6 +22,7 @@
  */
 
 #include <linux/bug.h>
+#include <linux/export.h>
 #include <linux/kernel.h>
 #include <linux/types.h>
 #include <linux/smp.h>
@@ -34,6 +35,7 @@
 #include <asm/war.h>
 #include <asm/uasm.h>
 #include <asm/setup.h>
+#include <asm/tlbex.h>
 
 static int mips_xpa_disabled;
 
@@ -344,7 +346,8 @@ static int allocate_kscratch(void)
 }
 
 static int scratch_reg;
-static int pgd_reg;
+int pgd_reg;
+EXPORT_SYMBOL_GPL(pgd_reg);
 enum vmalloc64_mode {not_refill, refill_scratch, refill_noscratch};
 
 static struct work_registers build_get_work_registers(u32 **p)
@@ -496,15 +499,9 @@ static void __maybe_unused build_tlb_probe_entry(u32 **p)
 	}
 }
 
-/*
- * Write random or indexed TLB entry, and care about the hazards from
- * the preceding mtc0 and for the following eret.
- */
-enum tlb_write_entry { tlb_random, tlb_indexed };
-
-static void build_tlb_write_entry(u32 **p, struct uasm_label **l,
-				  struct uasm_reloc **r,
-				  enum tlb_write_entry wmode)
+void build_tlb_write_entry(u32 **p, struct uasm_label **l,
+			   struct uasm_reloc **r,
+			   enum tlb_write_entry wmode)
 {
 	void(*tlbw)(u32 **) = NULL;
 
@@ -627,6 +624,7 @@ static void build_tlb_write_entry(u32 **p, struct uasm_label **l,
 		break;
 	}
 }
+EXPORT_SYMBOL_GPL(build_tlb_write_entry);
 
 static __maybe_unused void build_convert_pte_to_entrylo(u32 **p,
 							unsigned int reg)
@@ -781,9 +779,8 @@ static void build_huge_handler_tail(u32 **p, struct uasm_reloc **r,
  * TMP and PTR are scratch.
  * TMP will be clobbered, PTR will hold the pmd entry.
  */
-static void
-build_get_pmde64(u32 **p, struct uasm_label **l, struct uasm_reloc **r,
-		 unsigned int tmp, unsigned int ptr)
+void build_get_pmde64(u32 **p, struct uasm_label **l, struct uasm_reloc **r,
+		      unsigned int tmp, unsigned int ptr)
 {
 #ifndef CONFIG_MIPS_PGD_C0_CONTEXT
 	long pgdc = (long)pgd_current;
@@ -859,6 +856,7 @@ build_get_pmde64(u32 **p, struct uasm_label **l, struct uasm_reloc **r,
 	uasm_i_daddu(p, ptr, ptr, tmp); /* add in pmd offset */
 #endif
 }
+EXPORT_SYMBOL_GPL(build_get_pmde64);
 
 /*
  * BVADDR is the faulting address, PTR is scratch.
@@ -934,8 +932,7 @@ build_get_pgd_vmalloc64(u32 **p, struct uasm_label **l, struct uasm_reloc **r,
  * TMP and PTR are scratch.
  * TMP will be clobbered, PTR will hold the pgd entry.
  */
-static void __maybe_unused
-build_get_pgde32(u32 **p, unsigned int tmp, unsigned int ptr)
+void build_get_pgde32(u32 **p, unsigned int tmp, unsigned int ptr)
 {
 	if (pgd_reg != -1) {
 		/* pgd is in pgd_reg */
@@ -960,6 +957,7 @@ build_get_pgde32(u32 **p, unsigned int tmp, unsigned int ptr)
 	uasm_i_sll(p, tmp, tmp, PGD_T_LOG2);
 	uasm_i_addu(p, ptr, ptr, tmp); /* add in pgd offset */
 }
+EXPORT_SYMBOL_GPL(build_get_pgde32);
 
 #endif /* !CONFIG_64BIT */
 
@@ -989,7 +987,7 @@ static void build_adjust_context(u32 **p, unsigned int ctx)
 	uasm_i_andi(p, ctx, ctx, mask);
 }
 
-static void build_get_ptep(u32 **p, unsigned int tmp, unsigned int ptr)
+void build_get_ptep(u32 **p, unsigned int tmp, unsigned int ptr)
 {
 	/*
 	 * Bug workaround for the Nevada. It seems as if under certain
@@ -1013,8 +1011,9 @@ static void build_get_ptep(u32 **p, unsigned int tmp, unsigned int ptr)
 	build_adjust_context(p, tmp);
 	UASM_i_ADDU(p, ptr, ptr, tmp); /* add in offset */
 }
+EXPORT_SYMBOL_GPL(build_get_ptep);
 
-static void build_update_entries(u32 **p, unsigned int tmp, unsigned int ptep)
+void build_update_entries(u32 **p, unsigned int tmp, unsigned int ptep)
 {
 	int pte_off_even = 0;
 	int pte_off_odd = sizeof(pte_t);
@@ -1063,6 +1062,7 @@ static void build_update_entries(u32 **p, unsigned int tmp, unsigned int ptep)
 		UASM_i_MTC0(p, 0, C0_ENTRYLO1);
 	UASM_i_MTC0(p, ptep, C0_ENTRYLO1); /* load it */
 }
+EXPORT_SYMBOL_GPL(build_update_entries);
 
 struct mips_huge_tlb_info {
 	int huge_pte;
@@ -1536,7 +1536,9 @@ static void build_loongson3_tlb_refill_handler(void)
 extern u32 handle_tlbl[], handle_tlbl_end[];
 extern u32 handle_tlbs[], handle_tlbs_end[];
 extern u32 handle_tlbm[], handle_tlbm_end[];
-extern u32 tlbmiss_handler_setup_pgd_start[], tlbmiss_handler_setup_pgd[];
+extern u32 tlbmiss_handler_setup_pgd_start[];
+extern u32 tlbmiss_handler_setup_pgd[];
+EXPORT_SYMBOL_GPL(tlbmiss_handler_setup_pgd);
 extern u32 tlbmiss_handler_setup_pgd_end[];
 
 static void build_setup_pgd(void)
diff --git a/arch/powerpc/include/asm/book3s/64/mmu.h b/arch/powerpc/include/asm/book3s/64/mmu.h
index 8afb0e00f7d9..d73e9dfa5237 100644
--- a/arch/powerpc/include/asm/book3s/64/mmu.h
+++ b/arch/powerpc/include/asm/book3s/64/mmu.h
@@ -44,10 +44,20 @@ struct patb_entry {
 };
 extern struct patb_entry *partition_tb;
 
+/* Bits in patb0 field */
 #define PATB_HR		(1UL << 63)
-#define PATB_GR		(1UL << 63)
 #define RPDB_MASK	0x0ffffffffffff00fUL
 #define RPDB_SHIFT	(1UL << 8)
+#define RTS1_SHIFT	61		/* top 2 bits of radix tree size */
+#define RTS1_MASK	(3UL << RTS1_SHIFT)
+#define RTS2_SHIFT	5		/* bottom 3 bits of radix tree size */
+#define RTS2_MASK	(7UL << RTS2_SHIFT)
+#define RPDS_MASK	0x1f		/* root page dir. size field */
+
+/* Bits in patb1 field */
+#define PATB_GR		(1UL << 63)	/* guest uses radix; must match HR */
+#define PRTS_MASK	0x1f		/* process table size field */
+
 /*
  * Limit process table to PAGE_SIZE table. This
  * also limit the max pid we can support.
@@ -138,5 +148,11 @@ static inline void setup_initial_memory_limit(phys_addr_t first_memblock_base,
 extern int (*register_process_table)(unsigned long base, unsigned long page_size,
 				     unsigned long tbl_size);
 
+#ifdef CONFIG_PPC_PSERIES
+extern void radix_init_pseries(void);
+#else
+static inline void radix_init_pseries(void) { };
+#endif
+
 #endif /* __ASSEMBLY__ */
 #endif /* _ASM_POWERPC_BOOK3S_64_MMU_H_ */
diff --git a/arch/powerpc/include/asm/exception-64s.h b/arch/powerpc/include/asm/exception-64s.h
index 9a3eee661297..8fa09fa500f0 100644
--- a/arch/powerpc/include/asm/exception-64s.h
+++ b/arch/powerpc/include/asm/exception-64s.h
@@ -97,6 +97,15 @@
 	ld	reg,PACAKBASE(r13);					\
 	ori	reg,reg,(ABS_ADDR(label))@l;
 
+/*
+ * Branches from unrelocated code (e.g., interrupts) to labels outside
+ * head-y require >64K offsets.
+ */
+#define __LOAD_FAR_HANDLER(reg, label)					\
+	ld	reg,PACAKBASE(r13);					\
+	ori	reg,reg,(ABS_ADDR(label))@l;				\
+	addis	reg,reg,(ABS_ADDR(label))@h;
+
 /* Exception register prefixes */
 #define EXC_HV	H
 #define EXC_STD
@@ -227,13 +236,41 @@ END_FTR_SECTION_NESTED(ftr,ftr,943)
 	mtctr	reg;							\
 	bctr
 
+/*
+ * KVM requires __LOAD_FAR_HANDLER.
+ *
+ * __BRANCH_TO_KVM_EXIT branches are also a special case because they
+ * explicitly use r9 then reload it from PACA before branching. Hence
+ * the double-underscore.
+ */
+#define __BRANCH_TO_KVM_EXIT(area, label)				\
+	mfctr	r9;							\
+	std	r9,HSTATE_SCRATCH1(r13);				\
+	__LOAD_FAR_HANDLER(r9, label);					\
+	mtctr	r9;							\
+	ld	r9,area+EX_R9(r13);					\
+	bctr
+
+#define BRANCH_TO_KVM(reg, label)					\
+	__LOAD_FAR_HANDLER(reg, label);					\
+	mtctr	reg;							\
+	bctr
+
 #else
 #define BRANCH_TO_COMMON(reg, label)					\
 	b	label
 
+#define BRANCH_TO_KVM(reg, label)					\
+	b	label
+
+#define __BRANCH_TO_KVM_EXIT(area, label)				\
+	ld	r9,area+EX_R9(r13);					\
+	b	label
+
 #endif
 
-#define __KVM_HANDLER_PROLOG(area, n)					\
+
+#define __KVM_HANDLER(area, h, n)					\
 	BEGIN_FTR_SECTION_NESTED(947)					\
 	ld	r10,area+EX_CFAR(r13);					\
 	std	r10,HSTATE_CFAR(r13);					\
@@ -243,30 +280,28 @@ END_FTR_SECTION_NESTED(ftr,ftr,943)
 	std	r10,HSTATE_PPR(r13);					\
 	END_FTR_SECTION_NESTED(CPU_FTR_HAS_PPR,CPU_FTR_HAS_PPR,948);	\
 	ld	r10,area+EX_R10(r13);					\
-	stw	r9,HSTATE_SCRATCH1(r13);				\
-	ld	r9,area+EX_R9(r13);					\
 	std	r12,HSTATE_SCRATCH0(r13);				\
-
-#define __KVM_HANDLER(area, h, n)					\
-	__KVM_HANDLER_PROLOG(area, n)					\
-	li	r12,n;							\
-	b	kvmppc_interrupt
+	sldi	r12,r9,32;						\
+	ori	r12,r12,(n);						\
+	/* This reloads r9 before branching to kvmppc_interrupt */	\
+	__BRANCH_TO_KVM_EXIT(area, kvmppc_interrupt)
 
 #define __KVM_HANDLER_SKIP(area, h, n)					\
 	cmpwi	r10,KVM_GUEST_MODE_SKIP;				\
-	ld	r10,area+EX_R10(r13);					\
 	beq	89f;							\
-	stw	r9,HSTATE_SCRATCH1(r13);				\
 	BEGIN_FTR_SECTION_NESTED(948)					\
-	ld	r9,area+EX_PPR(r13);					\
-	std	r9,HSTATE_PPR(r13);					\
+	ld	r10,area+EX_PPR(r13);					\
+	std	r10,HSTATE_PPR(r13);					\
 	END_FTR_SECTION_NESTED(CPU_FTR_HAS_PPR,CPU_FTR_HAS_PPR,948);	\
-	ld	r9,area+EX_R9(r13);					\
+	ld	r10,area+EX_R10(r13);					\
 	std	r12,HSTATE_SCRATCH0(r13);				\
-	li	r12,n;							\
-	b	kvmppc_interrupt;					\
+	sldi	r12,r9,32;						\
+	ori	r12,r12,(n);						\
+	/* This reloads r9 before branching to kvmppc_interrupt */	\
+	__BRANCH_TO_KVM_EXIT(area, kvmppc_interrupt);			\
 89:	mtocrf	0x80,r9;						\
 	ld	r9,area+EX_R9(r13);					\
+	ld	r10,area+EX_R10(r13);					\
 	b	kvmppc_skip_##h##interrupt
 
 #ifdef CONFIG_KVM_BOOK3S_64_HANDLER
@@ -393,12 +428,12 @@ END_FTR_SECTION_NESTED(ftr,ftr,943)
 	EXCEPTION_RELON_PROLOG_PSERIES_1(label, EXC_STD)
 
 #define STD_RELON_EXCEPTION_HV(loc, vec, label)		\
-	/* No guest interrupts come through here */	\
 	SET_SCRATCH0(r13);	/* save r13 */		\
-	EXCEPTION_RELON_PROLOG_PSERIES(PACA_EXGEN, label, EXC_HV, NOTEST, vec);
+	EXCEPTION_RELON_PROLOG_PSERIES(PACA_EXGEN, label,	\
+				       EXC_HV, KVMTEST_HV, vec);
 
 #define STD_RELON_EXCEPTION_HV_OOL(vec, label)			\
-	EXCEPTION_PROLOG_1(PACA_EXGEN, NOTEST, vec);		\
+	EXCEPTION_PROLOG_1(PACA_EXGEN, KVMTEST_HV, vec);	\
 	EXCEPTION_RELON_PROLOG_PSERIES_1(label, EXC_HV)
 
 /* This associate vector numbers with bits in paca->irq_happened */
@@ -475,10 +510,10 @@ END_FTR_SECTION_NESTED(ftr,ftr,943)
 
 #define MASKABLE_RELON_EXCEPTION_HV(loc, vec, label)			\
 	_MASKABLE_RELON_EXCEPTION_PSERIES(vec, label,			\
-					  EXC_HV, SOFTEN_NOTEST_HV)
+					  EXC_HV, SOFTEN_TEST_HV)
 
 #define MASKABLE_RELON_EXCEPTION_HV_OOL(vec, label)			\
-	EXCEPTION_PROLOG_1(PACA_EXGEN, SOFTEN_NOTEST_HV, vec);		\
+	EXCEPTION_PROLOG_1(PACA_EXGEN, SOFTEN_TEST_HV, vec);		\
 	EXCEPTION_PROLOG_PSERIES_1(label, EXC_HV)
 
 /*
diff --git a/arch/powerpc/include/asm/head-64.h b/arch/powerpc/include/asm/head-64.h
index fca7033839a9..9bd81619d090 100644
--- a/arch/powerpc/include/asm/head-64.h
+++ b/arch/powerpc/include/asm/head-64.h
@@ -218,7 +218,7 @@ name:
 
 #ifdef CONFIG_KVM_BOOK3S_64_HANDLER
 #define TRAMP_KVM_BEGIN(name)						\
-	TRAMP_REAL_BEGIN(name)
+	TRAMP_VIRT_BEGIN(name)
 #else
 #define TRAMP_KVM_BEGIN(name)
 #endif
diff --git a/arch/powerpc/include/asm/hvcall.h b/arch/powerpc/include/asm/hvcall.h
index 77ff1ba99d1f..54d11b3a6bf7 100644
--- a/arch/powerpc/include/asm/hvcall.h
+++ b/arch/powerpc/include/asm/hvcall.h
@@ -276,6 +276,7 @@
 #define H_GET_MPP_X		0x314
 #define H_SET_MODE		0x31C
 #define H_CLEAR_HPT		0x358
+#define H_REGISTER_PROC_TBL	0x37C
 #define H_SIGNAL_SYS_RESET	0x380
 #define MAX_HCALL_OPCODE	H_SIGNAL_SYS_RESET
 
@@ -313,6 +314,16 @@
 #define H_SIGNAL_SYS_RESET_ALL_OTHERS		-2
 /* >= 0 values are CPU number */
 
+/* Flag values used in H_REGISTER_PROC_TBL hcall */
+#define PROC_TABLE_OP_MASK	0x18
+#define PROC_TABLE_DEREG	0x10
+#define PROC_TABLE_NEW		0x18
+#define PROC_TABLE_TYPE_MASK	0x06
+#define PROC_TABLE_HPT_SLB	0x00
+#define PROC_TABLE_HPT_PT	0x02
+#define PROC_TABLE_RADIX	0x04
+#define PROC_TABLE_GTSE		0x01
+
 #ifndef __ASSEMBLY__
 
 /**
diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h
index 5cf306ae0ac3..2bf35017ffc0 100644
--- a/arch/powerpc/include/asm/kvm_book3s.h
+++ b/arch/powerpc/include/asm/kvm_book3s.h
@@ -170,6 +170,8 @@ extern int kvmppc_book3s_hv_page_fault(struct kvm_run *run,
 			unsigned long status);
 extern long kvmppc_hv_find_lock_hpte(struct kvm *kvm, gva_t eaddr,
 			unsigned long slb_v, unsigned long valid);
+extern int kvmppc_hv_emulate_mmio(struct kvm_run *run, struct kvm_vcpu *vcpu,
+			unsigned long gpa, gva_t ea, int is_store);
 
 extern void kvmppc_mmu_hpte_cache_map(struct kvm_vcpu *vcpu, struct hpte_cache *pte);
 extern struct hpte_cache *kvmppc_mmu_hpte_cache_next(struct kvm_vcpu *vcpu);
@@ -182,6 +184,25 @@ extern void kvmppc_mmu_hpte_sysexit(void);
 extern int kvmppc_mmu_hv_init(void);
 extern int kvmppc_book3s_hcall_implemented(struct kvm *kvm, unsigned long hc);
 
+extern int kvmppc_book3s_radix_page_fault(struct kvm_run *run,
+			struct kvm_vcpu *vcpu,
+			unsigned long ea, unsigned long dsisr);
+extern int kvmppc_mmu_radix_xlate(struct kvm_vcpu *vcpu, gva_t eaddr,
+			struct kvmppc_pte *gpte, bool data, bool iswrite);
+extern int kvmppc_init_vm_radix(struct kvm *kvm);
+extern void kvmppc_free_radix(struct kvm *kvm);
+extern int kvmppc_radix_init(void);
+extern void kvmppc_radix_exit(void);
+extern int kvm_unmap_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
+			unsigned long gfn);
+extern int kvm_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
+			unsigned long gfn);
+extern int kvm_test_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
+			unsigned long gfn);
+extern long kvmppc_hv_get_dirty_log_radix(struct kvm *kvm,
+			struct kvm_memory_slot *memslot, unsigned long *map);
+extern int kvmhv_get_rmmu_info(struct kvm *kvm, struct kvm_ppc_rmmu_info *info);
+
 /* XXX remove this export when load_last_inst() is generic */
 extern int kvmppc_ld(struct kvm_vcpu *vcpu, ulong *eaddr, int size, void *ptr, bool data);
 extern void kvmppc_book3s_queue_irqprio(struct kvm_vcpu *vcpu, unsigned int vec);
@@ -211,8 +232,11 @@ extern long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags,
 extern long kvmppc_do_h_remove(struct kvm *kvm, unsigned long flags,
 			unsigned long pte_index, unsigned long avpn,
 			unsigned long *hpret);
-extern long kvmppc_hv_get_dirty_log(struct kvm *kvm,
+extern long kvmppc_hv_get_dirty_log_hpt(struct kvm *kvm,
 			struct kvm_memory_slot *memslot, unsigned long *map);
+extern void kvmppc_harvest_vpa_dirty(struct kvmppc_vpa *vpa,
+			struct kvm_memory_slot *memslot,
+			unsigned long *map);
 extern void kvmppc_update_lpcr(struct kvm *kvm, unsigned long lpcr,
 			unsigned long mask);
 extern void kvmppc_set_fscr(struct kvm_vcpu *vcpu, u64 fscr);
diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h b/arch/powerpc/include/asm/kvm_book3s_64.h
index 848292176908..d9b48f5bb606 100644
--- a/arch/powerpc/include/asm/kvm_book3s_64.h
+++ b/arch/powerpc/include/asm/kvm_book3s_64.h
@@ -22,6 +22,10 @@
 
 #include <asm/book3s/64/mmu-hash.h>
 
+/* Power architecture requires HPT is at least 256kiB, at most 64TiB */
+#define PPC_MIN_HPT_ORDER	18
+#define PPC_MAX_HPT_ORDER	46
+
 #ifdef CONFIG_KVM_BOOK3S_PR_POSSIBLE
 static inline struct kvmppc_book3s_shadow_vcpu *svcpu_get(struct kvm_vcpu *vcpu)
 {
@@ -36,6 +40,12 @@ static inline void svcpu_put(struct kvmppc_book3s_shadow_vcpu *svcpu)
 #endif
 
 #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
+
+static inline bool kvm_is_radix(struct kvm *kvm)
+{
+	return kvm->arch.radix;
+}
+
 #define KVM_DEFAULT_HPT_ORDER	24	/* 16MB HPT by default */
 #endif
 
@@ -350,6 +360,18 @@ extern void kvmppc_mmu_debugfs_init(struct kvm *kvm);
 
 extern void kvmhv_rm_send_ipi(int cpu);
 
+static inline unsigned long kvmppc_hpt_npte(struct kvm_hpt_info *hpt)
+{
+	/* HPTEs are 2**4 bytes long */
+	return 1UL << (hpt->order - 4);
+}
+
+static inline unsigned long kvmppc_hpt_mask(struct kvm_hpt_info *hpt)
+{
+	/* 128 (2**7) bytes in each HPTEG */
+	return (1UL << (hpt->order - 7)) - 1;
+}
+
 #endif /* CONFIG_KVM_BOOK3S_HV_POSSIBLE */
 
 #endif /* __ASM_KVM_BOOK3S_64_H__ */
diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index e59b172666cd..7bba8f415627 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -241,12 +241,24 @@ struct kvm_arch_memory_slot {
 #endif /* CONFIG_KVM_BOOK3S_HV_POSSIBLE */
 };
 
+struct kvm_hpt_info {
+	/* Host virtual (linear mapping) address of guest HPT */
+	unsigned long virt;
+	/* Array of reverse mapping entries for each guest HPTE */
+	struct revmap_entry *rev;
+	/* Guest HPT size is 2**(order) bytes */
+	u32 order;
+	/* 1 if HPT allocated with CMA, 0 otherwise */
+	int cma;
+};
+
+struct kvm_resize_hpt;
+
 struct kvm_arch {
 	unsigned int lpid;
 #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
 	unsigned int tlb_sets;
-	unsigned long hpt_virt;
-	struct revmap_entry *revmap;
+	struct kvm_hpt_info hpt;
 	atomic64_t mmio_update;
 	unsigned int host_lpid;
 	unsigned long host_lpcr;
@@ -256,16 +268,17 @@ struct kvm_arch {
 	unsigned long lpcr;
 	unsigned long vrma_slb_v;
 	int hpte_setup_done;
-	u32 hpt_order;
 	atomic_t vcpus_running;
 	u32 online_vcores;
-	unsigned long hpt_npte;
-	unsigned long hpt_mask;
 	atomic_t hpte_mod_interest;
 	cpumask_t need_tlb_flush;
-	int hpt_cma_alloc;
+	cpumask_t cpu_in_guest;
+	u8 radix;
+	pgd_t *pgtable;
+	u64 process_table;
 	struct dentry *debugfs_dir;
 	struct dentry *htab_dentry;
+	struct kvm_resize_hpt *resize_hpt; /* protected by kvm->lock */
 #endif /* CONFIG_KVM_BOOK3S_HV_POSSIBLE */
 #ifdef CONFIG_KVM_BOOK3S_PR_POSSIBLE
 	struct mutex hpt_mutex;
@@ -603,6 +616,7 @@ struct kvm_vcpu_arch {
 	ulong fault_dar;
 	u32 fault_dsisr;
 	unsigned long intr_msr;
+	ulong fault_gpa;	/* guest real address of page fault (POWER9) */
 #endif
 
 #ifdef CONFIG_BOOKE
@@ -657,6 +671,7 @@ struct kvm_vcpu_arch {
 	int state;
 	int ptid;
 	int thread_cpu;
+	int prev_cpu;
 	bool timer_running;
 	wait_queue_head_t cpu_run;
 
diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h
index 2da67bf1f2ec..dd11c4c8c56a 100644
--- a/arch/powerpc/include/asm/kvm_ppc.h
+++ b/arch/powerpc/include/asm/kvm_ppc.h
@@ -155,9 +155,10 @@ extern void kvmppc_core_destroy_mmu(struct kvm_vcpu *vcpu);
 extern int kvmppc_kvm_pv(struct kvm_vcpu *vcpu);
 extern void kvmppc_map_magic(struct kvm_vcpu *vcpu);
 
-extern long kvmppc_alloc_hpt(struct kvm *kvm, u32 *htab_orderp);
-extern long kvmppc_alloc_reset_hpt(struct kvm *kvm, u32 *htab_orderp);
-extern void kvmppc_free_hpt(struct kvm *kvm);
+extern int kvmppc_allocate_hpt(struct kvm_hpt_info *info, u32 order);
+extern void kvmppc_set_hpt(struct kvm *kvm, struct kvm_hpt_info *info);
+extern long kvmppc_alloc_reset_hpt(struct kvm *kvm, int order);
+extern void kvmppc_free_hpt(struct kvm_hpt_info *info);
 extern long kvmppc_prepare_vrma(struct kvm *kvm,
 				struct kvm_userspace_memory_region *mem);
 extern void kvmppc_map_vrma(struct kvm_vcpu *vcpu,
@@ -186,8 +187,8 @@ extern long kvmppc_h_stuff_tce(struct kvm_vcpu *vcpu,
 		unsigned long tce_value, unsigned long npages);
 extern long kvmppc_h_get_tce(struct kvm_vcpu *vcpu, unsigned long liobn,
 			     unsigned long ioba);
-extern struct page *kvm_alloc_hpt(unsigned long nr_pages);
-extern void kvm_release_hpt(struct page *page, unsigned long nr_pages);
+extern struct page *kvm_alloc_hpt_cma(unsigned long nr_pages);
+extern void kvm_free_hpt_cma(struct page *page, unsigned long nr_pages);
 extern int kvmppc_core_init_vm(struct kvm *kvm);
 extern void kvmppc_core_destroy_vm(struct kvm *kvm);
 extern void kvmppc_core_free_memslot(struct kvm *kvm,
@@ -214,6 +215,10 @@ extern void kvmppc_bookehv_exit(void);
 extern int kvmppc_prepare_to_enter(struct kvm_vcpu *vcpu);
 
 extern int kvm_vm_ioctl_get_htab_fd(struct kvm *kvm, struct kvm_get_htab_fd *);
+extern long kvm_vm_ioctl_resize_hpt_prepare(struct kvm *kvm,
+					    struct kvm_ppc_resize_hpt *rhpt);
+extern long kvm_vm_ioctl_resize_hpt_commit(struct kvm *kvm,
+					   struct kvm_ppc_resize_hpt *rhpt);
 
 int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, struct kvm_interrupt *irq);
 
@@ -291,6 +296,8 @@ struct kvmppc_ops {
 				       struct irq_bypass_producer *);
 	void (*irq_bypass_del_producer)(struct irq_bypass_consumer *,
 					struct irq_bypass_producer *);
+	int (*configure_mmu)(struct kvm *kvm, struct kvm_ppc_mmuv3_cfg *cfg);
+	int (*get_rmmu_info)(struct kvm *kvm, struct kvm_ppc_rmmu_info *info);
 };
 
 extern struct kvmppc_ops *kvmppc_hv_ops;
diff --git a/arch/powerpc/include/asm/prom.h b/arch/powerpc/include/asm/prom.h
index 5e57705b4759..8af2546ea593 100644
--- a/arch/powerpc/include/asm/prom.h
+++ b/arch/powerpc/include/asm/prom.h
@@ -121,6 +121,8 @@ struct of_drconf_cell {
 #define OV1_PPC_2_06		0x02	/* set if we support PowerPC 2.06 */
 #define OV1_PPC_2_07		0x01	/* set if we support PowerPC 2.07 */
 
+#define OV1_PPC_3_00		0x80	/* set if we support PowerPC 3.00 */
+
 /* Option vector 2: Open Firmware options supported */
 #define OV2_REAL_MODE		0x20	/* set if we want OF in real mode */
 
@@ -151,10 +153,17 @@ struct of_drconf_cell {
 #define OV5_XCMO		0x0440	/* Page Coalescing */
 #define OV5_TYPE1_AFFINITY	0x0580	/* Type 1 NUMA affinity */
 #define OV5_PRRN		0x0540	/* Platform Resource Reassignment */
-#define OV5_PFO_HW_RNG		0x0E80	/* PFO Random Number Generator */
-#define OV5_PFO_HW_842		0x0E40	/* PFO Compression Accelerator */
-#define OV5_PFO_HW_ENCR		0x0E20	/* PFO Encryption Accelerator */
-#define OV5_SUB_PROCESSORS	0x0F01	/* 1,2,or 4 Sub-Processors supported */
+#define OV5_PFO_HW_RNG		0x1180	/* PFO Random Number Generator */
+#define OV5_PFO_HW_842		0x1140	/* PFO Compression Accelerator */
+#define OV5_PFO_HW_ENCR		0x1120	/* PFO Encryption Accelerator */
+#define OV5_SUB_PROCESSORS	0x1501	/* 1,2,or 4 Sub-Processors supported */
+#define OV5_XIVE_EXPLOIT	0x1701	/* XIVE exploitation supported */
+#define OV5_MMU_RADIX_300	0x1880	/* ISA v3.00 radix MMU supported */
+#define OV5_MMU_HASH_300	0x1840	/* ISA v3.00 hash MMU supported */
+#define OV5_MMU_SEGM_RADIX	0x1820	/* radix mode (no segmentation) */
+#define OV5_MMU_PROC_TBL	0x1810	/* hcall selects SLB or proc table */
+#define OV5_MMU_SLB		0x1800	/* always use SLB */
+#define OV5_MMU_GTSE		0x1808	/* Guest translation shootdown */
 
 /* Option Vector 6: IBM PAPR hints */
 #define OV6_LINUX		0x02	/* Linux is our OS */
diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h
index d821cb10c1ff..9c4689401298 100644
--- a/arch/powerpc/include/asm/reg.h
+++ b/arch/powerpc/include/asm/reg.h
@@ -274,10 +274,14 @@
 #define SPRN_DSISR	0x012	/* Data Storage Interrupt Status Register */
 #define   DSISR_NOHPTE		0x40000000	/* no translation found */
 #define   DSISR_PROTFAULT	0x08000000	/* protection fault */
+#define   DSISR_BADACCESS	0x04000000	/* bad access to CI or G */
 #define   DSISR_ISSTORE		0x02000000	/* access was a store */
 #define   DSISR_DABRMATCH	0x00400000	/* hit data breakpoint */
 #define   DSISR_NOSEGMENT	0x00200000	/* SLB miss */
 #define   DSISR_KEYFAULT	0x00200000	/* Key fault */
+#define   DSISR_UNSUPP_MMU	0x00080000	/* Unsupported MMU config */
+#define   DSISR_SET_RC		0x00040000	/* Failed setting of R/C bits */
+#define   DSISR_PGDIRFAULT      0x00020000      /* Fault on page directory */
 #define SPRN_TBRL	0x10C	/* Time Base Read Lower Register (user, R/O) */
 #define SPRN_TBRU	0x10D	/* Time Base Read Upper Register (user, R/O) */
 #define SPRN_CIR	0x11B	/* Chip Information Register (hyper, R/0) */
diff --git a/arch/powerpc/include/uapi/asm/kvm.h b/arch/powerpc/include/uapi/asm/kvm.h
index 3603b6f51b11..4edbe4bb0e8b 100644
--- a/arch/powerpc/include/uapi/asm/kvm.h
+++ b/arch/powerpc/include/uapi/asm/kvm.h
@@ -413,6 +413,26 @@ struct kvm_get_htab_header {
 	__u16	n_invalid;
 };
 
+/* For KVM_PPC_CONFIGURE_V3_MMU */
+struct kvm_ppc_mmuv3_cfg {
+	__u64	flags;
+	__u64	process_table;	/* second doubleword of partition table entry */
+};
+
+/* Flag values for KVM_PPC_CONFIGURE_V3_MMU */
+#define KVM_PPC_MMUV3_RADIX	1	/* 1 = radix mode, 0 = HPT */
+#define KVM_PPC_MMUV3_GTSE	2	/* global translation shootdown enb. */
+
+/* For KVM_PPC_GET_RMMU_INFO */
+struct kvm_ppc_rmmu_info {
+	struct kvm_ppc_radix_geom {
+		__u8	page_shift;
+		__u8	level_bits[4];
+		__u8	pad[3];
+	}	geometries[8];
+	__u32	ap_encodings[8];
+};
+
 /* Per-vcpu XICS interrupt controller state */
 #define KVM_REG_PPC_ICP_STATE	(KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x8c)
 
@@ -613,5 +633,7 @@ struct kvm_get_htab_header {
 #define  KVM_XICS_LEVEL_SENSITIVE	(1ULL << 40)
 #define  KVM_XICS_MASKED		(1ULL << 41)
 #define  KVM_XICS_PENDING		(1ULL << 42)
+#define  KVM_XICS_PRESENTED		(1ULL << 43)
+#define  KVM_XICS_QUEUED		(1ULL << 44)
 
 #endif /* __LINUX_KVM_POWERPC_H */
diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c
index 1e8e556996e7..f25239b3a06f 100644
--- a/arch/powerpc/kernel/asm-offsets.c
+++ b/arch/powerpc/kernel/asm-offsets.c
@@ -495,6 +495,7 @@ int main(void)
 	DEFINE(KVM_NEED_FLUSH, offsetof(struct kvm, arch.need_tlb_flush.bits));
 	DEFINE(KVM_ENABLED_HCALLS, offsetof(struct kvm, arch.enabled_hcalls));
 	DEFINE(KVM_VRMA_SLB_V, offsetof(struct kvm, arch.vrma_slb_v));
+	DEFINE(KVM_RADIX, offsetof(struct kvm, arch.radix));
 	DEFINE(VCPU_DSISR, offsetof(struct kvm_vcpu, arch.shregs.dsisr));
 	DEFINE(VCPU_DAR, offsetof(struct kvm_vcpu, arch.shregs.dar));
 	DEFINE(VCPU_VPA, offsetof(struct kvm_vcpu, arch.vpa.pinned_addr));
@@ -534,6 +535,7 @@ int main(void)
 	DEFINE(VCPU_SLB_NR, offsetof(struct kvm_vcpu, arch.slb_nr));
 	DEFINE(VCPU_FAULT_DSISR, offsetof(struct kvm_vcpu, arch.fault_dsisr));
 	DEFINE(VCPU_FAULT_DAR, offsetof(struct kvm_vcpu, arch.fault_dar));
+	DEFINE(VCPU_FAULT_GPA, offsetof(struct kvm_vcpu, arch.fault_gpa));
 	DEFINE(VCPU_INTR_MSR, offsetof(struct kvm_vcpu, arch.intr_msr));
 	DEFINE(VCPU_LAST_INST, offsetof(struct kvm_vcpu, arch.last_inst));
 	DEFINE(VCPU_TRAP, offsetof(struct kvm_vcpu, arch.trap));
diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S
index 069aac8af909..f86eb236324e 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -142,7 +142,7 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300)
 	lbz	r0,HSTATE_HWTHREAD_REQ(r13)
 	cmpwi	r0,0
 	beq	1f
-	b	kvm_start_guest
+	BRANCH_TO_KVM(r10, kvm_start_guest)
 1:
 #endif
 
@@ -717,13 +717,9 @@ hardware_interrupt_hv:
 	BEGIN_FTR_SECTION
 		_MASKABLE_EXCEPTION_PSERIES(0x500, hardware_interrupt_common,
 					    EXC_HV, SOFTEN_TEST_HV)
-do_kvm_H0x500:
-		KVM_HANDLER(PACA_EXGEN, EXC_HV, 0x502)
 	FTR_SECTION_ELSE
 		_MASKABLE_EXCEPTION_PSERIES(0x500, hardware_interrupt_common,
 					    EXC_STD, SOFTEN_TEST_PR)
-do_kvm_0x500:
-		KVM_HANDLER(PACA_EXGEN, EXC_STD, 0x500)
 	ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206)
 EXC_REAL_END(hardware_interrupt, 0x500, 0x600)
 
@@ -737,6 +733,8 @@ hardware_interrupt_relon_hv:
 	ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE)
 EXC_VIRT_END(hardware_interrupt, 0x4500, 0x4600)
 
+TRAMP_KVM(PACA_EXGEN, 0x500)
+TRAMP_KVM_HV(PACA_EXGEN, 0x500)
 EXC_COMMON_ASYNC(hardware_interrupt_common, 0x500, do_IRQ)
 
 
@@ -832,6 +830,31 @@ EXC_VIRT(trap_0b, 0x4b00, 0x4c00, 0xb00)
 TRAMP_KVM(PACA_EXGEN, 0xb00)
 EXC_COMMON(trap_0b_common, 0xb00, unknown_exception)
 
+#ifdef CONFIG_KVM_BOOK3S_64_HANDLER
+	 /*
+	  * If CONFIG_KVM_BOOK3S_64_HANDLER is set, save the PPR (on systems
+	  * that support it) before changing to HMT_MEDIUM. That allows the KVM
+	  * code to save that value into the guest state (it is the guest's PPR
+	  * value). Otherwise just change to HMT_MEDIUM as userspace has
+	  * already saved the PPR.
+	  */
+#define SYSCALL_KVMTEST							\
+	SET_SCRATCH0(r13);						\
+	GET_PACA(r13);							\
+	std	r9,PACA_EXGEN+EX_R9(r13);				\
+	OPT_GET_SPR(r9, SPRN_PPR, CPU_FTR_HAS_PPR);			\
+	HMT_MEDIUM;							\
+	std	r10,PACA_EXGEN+EX_R10(r13);				\
+	OPT_SAVE_REG_TO_PACA(PACA_EXGEN+EX_PPR, r9, CPU_FTR_HAS_PPR);	\
+	mfcr	r9;							\
+	KVMTEST_PR(0xc00);						\
+	GET_SCRATCH0(r13)
+
+#else
+#define SYSCALL_KVMTEST							\
+	HMT_MEDIUM
+#endif
+	
 #define LOAD_SYSCALL_HANDLER(reg)					\
 	__LOAD_HANDLER(reg, system_call_common)
 
@@ -885,34 +908,14 @@ END_FTR_SECTION_IFSET(CPU_FTR_REAL_LE)				\
 #endif
 
 EXC_REAL_BEGIN(system_call, 0xc00, 0xd00)
-	 /*
-	  * If CONFIG_KVM_BOOK3S_64_HANDLER is set, save the PPR (on systems
-	  * that support it) before changing to HMT_MEDIUM. That allows the KVM
-	  * code to save that value into the guest state (it is the guest's PPR
-	  * value). Otherwise just change to HMT_MEDIUM as userspace has
-	  * already saved the PPR.
-	  */
-#ifdef CONFIG_KVM_BOOK3S_64_HANDLER
-	SET_SCRATCH0(r13)
-	GET_PACA(r13)
-	std	r9,PACA_EXGEN+EX_R9(r13)
-	OPT_GET_SPR(r9, SPRN_PPR, CPU_FTR_HAS_PPR);
-	HMT_MEDIUM;
-	std	r10,PACA_EXGEN+EX_R10(r13)
-	OPT_SAVE_REG_TO_PACA(PACA_EXGEN+EX_PPR, r9, CPU_FTR_HAS_PPR);
-	mfcr	r9
-	KVMTEST_PR(0xc00)
-	GET_SCRATCH0(r13)
-#else
-	HMT_MEDIUM;
-#endif
+	SYSCALL_KVMTEST
 	SYSCALL_PSERIES_1
 	SYSCALL_PSERIES_2_RFID
 	SYSCALL_PSERIES_3
 EXC_REAL_END(system_call, 0xc00, 0xd00)
 
 EXC_VIRT_BEGIN(system_call, 0x4c00, 0x4d00)
-	HMT_MEDIUM
+	SYSCALL_KVMTEST
 	SYSCALL_PSERIES_1
 	SYSCALL_PSERIES_2_DIRECT
 	SYSCALL_PSERIES_3
@@ -927,7 +930,7 @@ TRAMP_KVM(PACA_EXGEN, 0xd00)
 EXC_COMMON(single_step_common, 0xd00, single_step_exception)
 
 EXC_REAL_OOL_HV(h_data_storage, 0xe00, 0xe20)
-EXC_VIRT_NONE(0x4e00, 0x4e20)
+EXC_VIRT_OOL_HV(h_data_storage, 0x4e00, 0x4e20, 0xe00)
 TRAMP_KVM_HV_SKIP(PACA_EXGEN, 0xe00)
 EXC_COMMON_BEGIN(h_data_storage_common)
 	mfspr   r10,SPRN_HDAR
@@ -943,7 +946,7 @@ EXC_COMMON_BEGIN(h_data_storage_common)
 
 
 EXC_REAL_OOL_HV(h_instr_storage, 0xe20, 0xe40)
-EXC_VIRT_NONE(0x4e20, 0x4e40)
+EXC_VIRT_OOL_HV(h_instr_storage, 0x4e20, 0x4e40, 0xe20)
 TRAMP_KVM_HV(PACA_EXGEN, 0xe20)
 EXC_COMMON(h_instr_storage_common, 0xe20, unknown_exception)
 
diff --git a/arch/powerpc/kernel/prom_init.c b/arch/powerpc/kernel/prom_init.c
index ac83eb04a8b8..d3db1bcc9c1e 100644
--- a/arch/powerpc/kernel/prom_init.c
+++ b/arch/powerpc/kernel/prom_init.c
@@ -649,6 +649,7 @@ static void __init early_cmdline_parse(void)
 struct option_vector1 {
 	u8 byte1;
 	u8 arch_versions;
+	u8 arch_versions3;
 } __packed;
 
 struct option_vector2 {
@@ -691,6 +692,9 @@ struct option_vector5 {
 	u8 reserved2;
 	__be16 reserved3;
 	u8 subprocessors;
+	u8 byte22;
+	u8 intarch;
+	u8 mmu;
 } __packed;
 
 struct option_vector6 {
@@ -700,7 +704,7 @@ struct option_vector6 {
 } __packed;
 
 struct ibm_arch_vec {
-	struct { u32 mask, val; } pvrs[10];
+	struct { u32 mask, val; } pvrs[12];
 
 	u8 num_vectors;
 
@@ -750,6 +754,14 @@ struct ibm_arch_vec __cacheline_aligned ibm_architecture_vec = {
 			.val  = cpu_to_be32(0x004d0000),
 		},
 		{
+			.mask = cpu_to_be32(0xffff0000), /* POWER9 */
+			.val  = cpu_to_be32(0x004e0000),
+		},
+		{
+			.mask = cpu_to_be32(0xffffffff), /* all 3.00-compliant */
+			.val  = cpu_to_be32(0x0f000005),
+		},
+		{
 			.mask = cpu_to_be32(0xffffffff), /* all 2.07-compliant */
 			.val  = cpu_to_be32(0x0f000004),
 		},
@@ -774,6 +786,7 @@ struct ibm_arch_vec __cacheline_aligned ibm_architecture_vec = {
 		.byte1 = 0,
 		.arch_versions = OV1_PPC_2_00 | OV1_PPC_2_01 | OV1_PPC_2_02 | OV1_PPC_2_03 |
 				 OV1_PPC_2_04 | OV1_PPC_2_05 | OV1_PPC_2_06 | OV1_PPC_2_07,
+		.arch_versions3 = OV1_PPC_3_00,
 	},
 
 	.vec2_len = VECTOR_LENGTH(sizeof(struct option_vector2)),
@@ -836,6 +849,9 @@ struct ibm_arch_vec __cacheline_aligned ibm_architecture_vec = {
 		.reserved2 = 0,
 		.reserved3 = 0,
 		.subprocessors = 1,
+		.intarch = 0,
+		.mmu = OV5_FEAT(OV5_MMU_RADIX_300) | OV5_FEAT(OV5_MMU_HASH_300) |
+			OV5_FEAT(OV5_MMU_PROC_TBL) | OV5_FEAT(OV5_MMU_GTSE),
 	},
 
 	/* option vector 6: IBM PAPR hints */
diff --git a/arch/powerpc/kvm/Makefile b/arch/powerpc/kvm/Makefile
index 7dd89b79d038..b87ccde2137a 100644
--- a/arch/powerpc/kvm/Makefile
+++ b/arch/powerpc/kvm/Makefile
@@ -70,7 +70,8 @@ endif
 kvm-hv-y += \
 	book3s_hv.o \
 	book3s_hv_interrupts.o \
-	book3s_64_mmu_hv.o
+	book3s_64_mmu_hv.o \
+	book3s_64_mmu_radix.o
 
 kvm-book3s_64-builtin-xics-objs-$(CONFIG_KVM_XICS) := \
 	book3s_hv_rm_xics.o
diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c
index 019f008775b9..b6b5c185bd92 100644
--- a/arch/powerpc/kvm/book3s.c
+++ b/arch/powerpc/kvm/book3s.c
@@ -239,6 +239,7 @@ void kvmppc_core_queue_data_storage(struct kvm_vcpu *vcpu, ulong dar,
 	kvmppc_set_dsisr(vcpu, flags);
 	kvmppc_book3s_queue_irqprio(vcpu, BOOK3S_INTERRUPT_DATA_STORAGE);
 }
+EXPORT_SYMBOL_GPL(kvmppc_core_queue_data_storage);	/* used by kvm_hv */
 
 void kvmppc_core_queue_inst_storage(struct kvm_vcpu *vcpu, ulong flags)
 {
diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c
index b795dd1ac2ef..013552f05182 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_hv.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c
@@ -40,84 +40,104 @@
 
 #include "trace_hv.h"
 
-/* Power architecture requires HPT is at least 256kB */
-#define PPC_MIN_HPT_ORDER	18
+//#define DEBUG_RESIZE_HPT	1
+
+#ifdef DEBUG_RESIZE_HPT
+#define resize_hpt_debug(resize, ...)				\
+	do {							\
+		printk(KERN_DEBUG "RESIZE HPT %p: ", resize);	\
+		printk(__VA_ARGS__);				\
+	} while (0)
+#else
+#define resize_hpt_debug(resize, ...)				\
+	do { } while (0)
+#endif
 
 static long kvmppc_virtmode_do_h_enter(struct kvm *kvm, unsigned long flags,
 				long pte_index, unsigned long pteh,
 				unsigned long ptel, unsigned long *pte_idx_ret);
+
+struct kvm_resize_hpt {
+	/* These fields read-only after init */
+	struct kvm *kvm;
+	struct work_struct work;
+	u32 order;
+
+	/* These fields protected by kvm->lock */
+	int error;
+	bool prepare_done;
+
+	/* Private to the work thread, until prepare_done is true,
+	 * then protected by kvm->resize_hpt_sem */
+	struct kvm_hpt_info hpt;
+};
+
 static void kvmppc_rmap_reset(struct kvm *kvm);
 
-long kvmppc_alloc_hpt(struct kvm *kvm, u32 *htab_orderp)
+int kvmppc_allocate_hpt(struct kvm_hpt_info *info, u32 order)
 {
 	unsigned long hpt = 0;
-	struct revmap_entry *rev;
+	int cma = 0;
 	struct page *page = NULL;
-	long order = KVM_DEFAULT_HPT_ORDER;
+	struct revmap_entry *rev;
+	unsigned long npte;
 
-	if (htab_orderp) {
-		order = *htab_orderp;
-		if (order < PPC_MIN_HPT_ORDER)
-			order = PPC_MIN_HPT_ORDER;
-	}
+	if ((order < PPC_MIN_HPT_ORDER) || (order > PPC_MAX_HPT_ORDER))
+		return -EINVAL;
 
-	kvm->arch.hpt_cma_alloc = 0;
-	page = kvm_alloc_hpt(1ul << (order - PAGE_SHIFT));
+	page = kvm_alloc_hpt_cma(1ul << (order - PAGE_SHIFT));
 	if (page) {
 		hpt = (unsigned long)pfn_to_kaddr(page_to_pfn(page));
 		memset((void *)hpt, 0, (1ul << order));
-		kvm->arch.hpt_cma_alloc = 1;
+		cma = 1;
 	}
 
-	/* Lastly try successively smaller sizes from the page allocator */
-	/* Only do this if userspace didn't specify a size via ioctl */
-	while (!hpt && order > PPC_MIN_HPT_ORDER && !htab_orderp) {
-		hpt = __get_free_pages(GFP_KERNEL|__GFP_ZERO|__GFP_REPEAT|
-				       __GFP_NOWARN, order - PAGE_SHIFT);
-		if (!hpt)
-			--order;
-	}
+	if (!hpt)
+		hpt = __get_free_pages(GFP_KERNEL|__GFP_ZERO|__GFP_REPEAT
+				       |__GFP_NOWARN, order - PAGE_SHIFT);
 
 	if (!hpt)
 		return -ENOMEM;
 
-	kvm->arch.hpt_virt = hpt;
-	kvm->arch.hpt_order = order;
 	/* HPTEs are 2**4 bytes long */
-	kvm->arch.hpt_npte = 1ul << (order - 4);
-	/* 128 (2**7) bytes in each HPTEG */
-	kvm->arch.hpt_mask = (1ul << (order - 7)) - 1;
-
-	atomic64_set(&kvm->arch.mmio_update, 0);
+	npte = 1ul << (order - 4);
 
 	/* Allocate reverse map array */
-	rev = vmalloc(sizeof(struct revmap_entry) * kvm->arch.hpt_npte);
+	rev = vmalloc(sizeof(struct revmap_entry) * npte);
 	if (!rev) {
-		pr_err("kvmppc_alloc_hpt: Couldn't alloc reverse map array\n");
-		goto out_freehpt;
+		pr_err("kvmppc_allocate_hpt: Couldn't alloc reverse map array\n");
+		if (cma)
+			kvm_free_hpt_cma(page, 1 << (order - PAGE_SHIFT));
+		else
+			free_pages(hpt, order - PAGE_SHIFT);
+		return -ENOMEM;
 	}
-	kvm->arch.revmap = rev;
-	kvm->arch.sdr1 = __pa(hpt) | (order - 18);
 
-	pr_info("KVM guest htab at %lx (order %ld), LPID %x\n",
-		hpt, order, kvm->arch.lpid);
+	info->order = order;
+	info->virt = hpt;
+	info->cma = cma;
+	info->rev = rev;
 
-	if (htab_orderp)
-		*htab_orderp = order;
 	return 0;
+}
 
- out_freehpt:
-	if (kvm->arch.hpt_cma_alloc)
-		kvm_release_hpt(page, 1 << (order - PAGE_SHIFT));
-	else
-		free_pages(hpt, order - PAGE_SHIFT);
-	return -ENOMEM;
+void kvmppc_set_hpt(struct kvm *kvm, struct kvm_hpt_info *info)
+{
+	atomic64_set(&kvm->arch.mmio_update, 0);
+	kvm->arch.hpt = *info;
+	kvm->arch.sdr1 = __pa(info->virt) | (info->order - 18);
+
+	pr_info("KVM guest htab at %lx (order %ld), LPID %x\n",
+		info->virt, (long)info->order, kvm->arch.lpid);
 }
 
-long kvmppc_alloc_reset_hpt(struct kvm *kvm, u32 *htab_orderp)
+long kvmppc_alloc_reset_hpt(struct kvm *kvm, int order)
 {
 	long err = -EBUSY;
-	long order;
+	struct kvm_hpt_info info;
+
+	if (kvm_is_radix(kvm))
+		return -EINVAL;
 
 	mutex_lock(&kvm->lock);
 	if (kvm->arch.hpte_setup_done) {
@@ -129,37 +149,44 @@ long kvmppc_alloc_reset_hpt(struct kvm *kvm, u32 *htab_orderp)
 			goto out;
 		}
 	}
-	if (kvm->arch.hpt_virt) {
-		order = kvm->arch.hpt_order;
+	if (kvm->arch.hpt.order == order) {
+		/* We already have a suitable HPT */
+
 		/* Set the entire HPT to 0, i.e. invalid HPTEs */
-		memset((void *)kvm->arch.hpt_virt, 0, 1ul << order);
+		memset((void *)kvm->arch.hpt.virt, 0, 1ul << order);
 		/*
 		 * Reset all the reverse-mapping chains for all memslots
 		 */
 		kvmppc_rmap_reset(kvm);
 		/* Ensure that each vcpu will flush its TLB on next entry. */
 		cpumask_setall(&kvm->arch.need_tlb_flush);
-		*htab_orderp = order;
 		err = 0;
-	} else {
-		err = kvmppc_alloc_hpt(kvm, htab_orderp);
-		order = *htab_orderp;
+		goto out;
 	}
- out:
+
+	if (kvm->arch.hpt.virt)
+		kvmppc_free_hpt(&kvm->arch.hpt);
+
+	err = kvmppc_allocate_hpt(&info, order);
+	if (err < 0)
+		goto out;
+	kvmppc_set_hpt(kvm, &info);
+
+out:
 	mutex_unlock(&kvm->lock);
 	return err;
 }
 
-void kvmppc_free_hpt(struct kvm *kvm)
+void kvmppc_free_hpt(struct kvm_hpt_info *info)
 {
-	kvmppc_free_lpid(kvm->arch.lpid);
-	vfree(kvm->arch.revmap);
-	if (kvm->arch.hpt_cma_alloc)
-		kvm_release_hpt(virt_to_page(kvm->arch.hpt_virt),
-				1 << (kvm->arch.hpt_order - PAGE_SHIFT));
-	else
-		free_pages(kvm->arch.hpt_virt,
-			   kvm->arch.hpt_order - PAGE_SHIFT);
+	vfree(info->rev);
+	if (info->cma)
+		kvm_free_hpt_cma(virt_to_page(info->virt),
+				 1 << (info->order - PAGE_SHIFT));
+	else if (info->virt)
+		free_pages(info->virt, info->order - PAGE_SHIFT);
+	info->virt = 0;
+	info->order = 0;
 }
 
 /* Bits in first HPTE dword for pagesize 4k, 64k or 16M */
@@ -194,8 +221,8 @@ void kvmppc_map_vrma(struct kvm_vcpu *vcpu, struct kvm_memory_slot *memslot,
 	if (npages > 1ul << (40 - porder))
 		npages = 1ul << (40 - porder);
 	/* Can't use more than 1 HPTE per HPTEG */
-	if (npages > kvm->arch.hpt_mask + 1)
-		npages = kvm->arch.hpt_mask + 1;
+	if (npages > kvmppc_hpt_mask(&kvm->arch.hpt) + 1)
+		npages = kvmppc_hpt_mask(&kvm->arch.hpt) + 1;
 
 	hp0 = HPTE_V_1TB_SEG | (VRMA_VSID << (40 - 16)) |
 		HPTE_V_BOLTED | hpte0_pgsize_encoding(psize);
@@ -205,7 +232,8 @@ void kvmppc_map_vrma(struct kvm_vcpu *vcpu, struct kvm_memory_slot *memslot,
 	for (i = 0; i < npages; ++i) {
 		addr = i << porder;
 		/* can't use hpt_hash since va > 64 bits */
-		hash = (i ^ (VRMA_VSID ^ (VRMA_VSID << 25))) & kvm->arch.hpt_mask;
+		hash = (i ^ (VRMA_VSID ^ (VRMA_VSID << 25)))
+			& kvmppc_hpt_mask(&kvm->arch.hpt);
 		/*
 		 * We assume that the hash table is empty and no
 		 * vcpus are using it at this stage.  Since we create
@@ -338,11 +366,11 @@ static int kvmppc_mmu_book3s_64_hv_xlate(struct kvm_vcpu *vcpu, gva_t eaddr,
 		preempt_enable();
 		return -ENOENT;
 	}
-	hptep = (__be64 *)(kvm->arch.hpt_virt + (index << 4));
+	hptep = (__be64 *)(kvm->arch.hpt.virt + (index << 4));
 	v = orig_v = be64_to_cpu(hptep[0]) & ~HPTE_V_HVLOCK;
 	if (cpu_has_feature(CPU_FTR_ARCH_300))
 		v = hpte_new_to_old_v(v, be64_to_cpu(hptep[1]));
-	gr = kvm->arch.revmap[index].guest_rpte;
+	gr = kvm->arch.hpt.rev[index].guest_rpte;
 
 	unlock_hpte(hptep, orig_v);
 	preempt_enable();
@@ -392,8 +420,8 @@ static int instruction_is_store(unsigned int instr)
 	return (instr & mask) != 0;
 }
 
-static int kvmppc_hv_emulate_mmio(struct kvm_run *run, struct kvm_vcpu *vcpu,
-				  unsigned long gpa, gva_t ea, int is_store)
+int kvmppc_hv_emulate_mmio(struct kvm_run *run, struct kvm_vcpu *vcpu,
+			   unsigned long gpa, gva_t ea, int is_store)
 {
 	u32 last_inst;
 
@@ -458,6 +486,9 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
 	unsigned long rcbits;
 	long mmio_update;
 
+	if (kvm_is_radix(kvm))
+		return kvmppc_book3s_radix_page_fault(run, vcpu, ea, dsisr);
+
 	/*
 	 * Real-mode code has already searched the HPT and found the
 	 * entry we're interested in.  Lock the entry and check that
@@ -480,8 +511,8 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
 		}
 	}
 	index = vcpu->arch.pgfault_index;
-	hptep = (__be64 *)(kvm->arch.hpt_virt + (index << 4));
-	rev = &kvm->arch.revmap[index];
+	hptep = (__be64 *)(kvm->arch.hpt.virt + (index << 4));
+	rev = &kvm->arch.hpt.rev[index];
 	preempt_disable();
 	while (!try_lock_hpte(hptep, HPTE_V_HVLOCK))
 		cpu_relax();
@@ -695,12 +726,13 @@ static void kvmppc_rmap_reset(struct kvm *kvm)
 	srcu_read_unlock(&kvm->srcu, srcu_idx);
 }
 
+typedef int (*hva_handler_fn)(struct kvm *kvm, struct kvm_memory_slot *memslot,
+			      unsigned long gfn);
+
 static int kvm_handle_hva_range(struct kvm *kvm,
 				unsigned long start,
 				unsigned long end,
-				int (*handler)(struct kvm *kvm,
-					       unsigned long *rmapp,
-					       unsigned long gfn))
+				hva_handler_fn handler)
 {
 	int ret;
 	int retval = 0;
@@ -725,9 +757,7 @@ static int kvm_handle_hva_range(struct kvm *kvm,
 		gfn_end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, memslot);
 
 		for (; gfn < gfn_end; ++gfn) {
-			gfn_t gfn_offset = gfn - memslot->base_gfn;
-
-			ret = handler(kvm, &memslot->arch.rmap[gfn_offset], gfn);
+			ret = handler(kvm, memslot, gfn);
 			retval |= ret;
 		}
 	}
@@ -736,20 +766,61 @@ static int kvm_handle_hva_range(struct kvm *kvm,
 }
 
 static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
-			  int (*handler)(struct kvm *kvm, unsigned long *rmapp,
-					 unsigned long gfn))
+			  hva_handler_fn handler)
 {
 	return kvm_handle_hva_range(kvm, hva, hva + 1, handler);
 }
 
-static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp,
+/* Must be called with both HPTE and rmap locked */
+static void kvmppc_unmap_hpte(struct kvm *kvm, unsigned long i,
+			      unsigned long *rmapp, unsigned long gfn)
+{
+	__be64 *hptep = (__be64 *) (kvm->arch.hpt.virt + (i << 4));
+	struct revmap_entry *rev = kvm->arch.hpt.rev;
+	unsigned long j, h;
+	unsigned long ptel, psize, rcbits;
+
+	j = rev[i].forw;
+	if (j == i) {
+		/* chain is now empty */
+		*rmapp &= ~(KVMPPC_RMAP_PRESENT | KVMPPC_RMAP_INDEX);
+	} else {
+		/* remove i from chain */
+		h = rev[i].back;
+		rev[h].forw = j;
+		rev[j].back = h;
+		rev[i].forw = rev[i].back = i;
+		*rmapp = (*rmapp & ~KVMPPC_RMAP_INDEX) | j;
+	}
+
+	/* Now check and modify the HPTE */
+	ptel = rev[i].guest_rpte;
+	psize = hpte_page_size(be64_to_cpu(hptep[0]), ptel);
+	if ((be64_to_cpu(hptep[0]) & HPTE_V_VALID) &&
+	    hpte_rpn(ptel, psize) == gfn) {
+		hptep[0] |= cpu_to_be64(HPTE_V_ABSENT);
+		kvmppc_invalidate_hpte(kvm, hptep, i);
+		hptep[1] &= ~cpu_to_be64(HPTE_R_KEY_HI | HPTE_R_KEY_LO);
+		/* Harvest R and C */
+		rcbits = be64_to_cpu(hptep[1]) & (HPTE_R_R | HPTE_R_C);
+		*rmapp |= rcbits << KVMPPC_RMAP_RC_SHIFT;
+		if (rcbits & HPTE_R_C)
+			kvmppc_update_rmap_change(rmapp, psize);
+		if (rcbits & ~rev[i].guest_rpte) {
+			rev[i].guest_rpte = ptel | rcbits;
+			note_hpte_modification(kvm, &rev[i]);
+		}
+	}
+}
+
+static int kvm_unmap_rmapp(struct kvm *kvm, struct kvm_memory_slot *memslot,
 			   unsigned long gfn)
 {
-	struct revmap_entry *rev = kvm->arch.revmap;
-	unsigned long h, i, j;
+	unsigned long i;
 	__be64 *hptep;
-	unsigned long ptel, psize, rcbits;
+	unsigned long *rmapp;
 
+	rmapp = &memslot->arch.rmap[gfn - memslot->base_gfn];
 	for (;;) {
 		lock_rmap(rmapp);
 		if (!(*rmapp & KVMPPC_RMAP_PRESENT)) {
@@ -763,7 +834,7 @@ static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp,
 		 * rmap chain lock.
 		 */
 		i = *rmapp & KVMPPC_RMAP_INDEX;
-		hptep = (__be64 *) (kvm->arch.hpt_virt + (i << 4));
+		hptep = (__be64 *) (kvm->arch.hpt.virt + (i << 4));
 		if (!try_lock_hpte(hptep, HPTE_V_HVLOCK)) {
 			/* unlock rmap before spinning on the HPTE lock */
 			unlock_rmap(rmapp);
@@ -771,37 +842,8 @@ static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp,
 				cpu_relax();
 			continue;
 		}
-		j = rev[i].forw;
-		if (j == i) {
-			/* chain is now empty */
-			*rmapp &= ~(KVMPPC_RMAP_PRESENT | KVMPPC_RMAP_INDEX);
-		} else {
-			/* remove i from chain */
-			h = rev[i].back;
-			rev[h].forw = j;
-			rev[j].back = h;
-			rev[i].forw = rev[i].back = i;
-			*rmapp = (*rmapp & ~KVMPPC_RMAP_INDEX) | j;
-		}
 
-		/* Now check and modify the HPTE */
-		ptel = rev[i].guest_rpte;
-		psize = hpte_page_size(be64_to_cpu(hptep[0]), ptel);
-		if ((be64_to_cpu(hptep[0]) & HPTE_V_VALID) &&
-		    hpte_rpn(ptel, psize) == gfn) {
-			hptep[0] |= cpu_to_be64(HPTE_V_ABSENT);
-			kvmppc_invalidate_hpte(kvm, hptep, i);
-			hptep[1] &= ~cpu_to_be64(HPTE_R_KEY_HI | HPTE_R_KEY_LO);
-			/* Harvest R and C */
-			rcbits = be64_to_cpu(hptep[1]) & (HPTE_R_R | HPTE_R_C);
-			*rmapp |= rcbits << KVMPPC_RMAP_RC_SHIFT;
-			if (rcbits & HPTE_R_C)
-				kvmppc_update_rmap_change(rmapp, psize);
-			if (rcbits & ~rev[i].guest_rpte) {
-				rev[i].guest_rpte = ptel | rcbits;
-				note_hpte_modification(kvm, &rev[i]);
-			}
-		}
+		kvmppc_unmap_hpte(kvm, i, rmapp, gfn);
 		unlock_rmap(rmapp);
 		__unlock_hpte(hptep, be64_to_cpu(hptep[0]));
 	}
@@ -810,26 +852,36 @@ static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp,
 
 int kvm_unmap_hva_hv(struct kvm *kvm, unsigned long hva)
 {
-	kvm_handle_hva(kvm, hva, kvm_unmap_rmapp);
+	hva_handler_fn handler;
+
+	handler = kvm_is_radix(kvm) ? kvm_unmap_radix : kvm_unmap_rmapp;
+	kvm_handle_hva(kvm, hva, handler);
 	return 0;
 }
 
 int kvm_unmap_hva_range_hv(struct kvm *kvm, unsigned long start, unsigned long end)
 {
-	kvm_handle_hva_range(kvm, start, end, kvm_unmap_rmapp);
+	hva_handler_fn handler;
+
+	handler = kvm_is_radix(kvm) ? kvm_unmap_radix : kvm_unmap_rmapp;
+	kvm_handle_hva_range(kvm, start, end, handler);
 	return 0;
 }
 
 void kvmppc_core_flush_memslot_hv(struct kvm *kvm,
 				  struct kvm_memory_slot *memslot)
 {
-	unsigned long *rmapp;
 	unsigned long gfn;
 	unsigned long n;
+	unsigned long *rmapp;
 
-	rmapp = memslot->arch.rmap;
 	gfn = memslot->base_gfn;
-	for (n = memslot->npages; n; --n) {
+	rmapp = memslot->arch.rmap;
+	for (n = memslot->npages; n; --n, ++gfn) {
+		if (kvm_is_radix(kvm)) {
+			kvm_unmap_radix(kvm, memslot, gfn);
+			continue;
+		}
 		/*
 		 * Testing the present bit without locking is OK because
 		 * the memslot has been marked invalid already, and hence
@@ -837,20 +889,21 @@ void kvmppc_core_flush_memslot_hv(struct kvm *kvm,
 		 * thus the present bit can't go from 0 to 1.
 		 */
 		if (*rmapp & KVMPPC_RMAP_PRESENT)
-			kvm_unmap_rmapp(kvm, rmapp, gfn);
+			kvm_unmap_rmapp(kvm, memslot, gfn);
 		++rmapp;
-		++gfn;
 	}
 }
 
-static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
+static int kvm_age_rmapp(struct kvm *kvm, struct kvm_memory_slot *memslot,
 			 unsigned long gfn)
 {
-	struct revmap_entry *rev = kvm->arch.revmap;
+	struct revmap_entry *rev = kvm->arch.hpt.rev;
 	unsigned long head, i, j;
 	__be64 *hptep;
 	int ret = 0;
+	unsigned long *rmapp;
 
+	rmapp = &memslot->arch.rmap[gfn - memslot->base_gfn];
  retry:
 	lock_rmap(rmapp);
 	if (*rmapp & KVMPPC_RMAP_REFERENCED) {
@@ -864,7 +917,7 @@ static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
 
 	i = head = *rmapp & KVMPPC_RMAP_INDEX;
 	do {
-		hptep = (__be64 *) (kvm->arch.hpt_virt + (i << 4));
+		hptep = (__be64 *) (kvm->arch.hpt.virt + (i << 4));
 		j = rev[i].forw;
 
 		/* If this HPTE isn't referenced, ignore it */
@@ -898,17 +951,22 @@ static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
 
 int kvm_age_hva_hv(struct kvm *kvm, unsigned long start, unsigned long end)
 {
-	return kvm_handle_hva_range(kvm, start, end, kvm_age_rmapp);
+	hva_handler_fn handler;
+
+	handler = kvm_is_radix(kvm) ? kvm_age_radix : kvm_age_rmapp;
+	return kvm_handle_hva_range(kvm, start, end, handler);
 }
 
-static int kvm_test_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
+static int kvm_test_age_rmapp(struct kvm *kvm, struct kvm_memory_slot *memslot,
 			      unsigned long gfn)
 {
-	struct revmap_entry *rev = kvm->arch.revmap;
+	struct revmap_entry *rev = kvm->arch.hpt.rev;
 	unsigned long head, i, j;
 	unsigned long *hp;
 	int ret = 1;
+	unsigned long *rmapp;
 
+	rmapp = &memslot->arch.rmap[gfn - memslot->base_gfn];
 	if (*rmapp & KVMPPC_RMAP_REFERENCED)
 		return 1;
 
@@ -919,7 +977,7 @@ static int kvm_test_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
 	if (*rmapp & KVMPPC_RMAP_PRESENT) {
 		i = head = *rmapp & KVMPPC_RMAP_INDEX;
 		do {
-			hp = (unsigned long *)(kvm->arch.hpt_virt + (i << 4));
+			hp = (unsigned long *)(kvm->arch.hpt.virt + (i << 4));
 			j = rev[i].forw;
 			if (be64_to_cpu(hp[1]) & HPTE_R_R)
 				goto out;
@@ -934,12 +992,18 @@ static int kvm_test_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
 
 int kvm_test_age_hva_hv(struct kvm *kvm, unsigned long hva)
 {
-	return kvm_handle_hva(kvm, hva, kvm_test_age_rmapp);
+	hva_handler_fn handler;
+
+	handler = kvm_is_radix(kvm) ? kvm_test_age_radix : kvm_test_age_rmapp;
+	return kvm_handle_hva(kvm, hva, handler);
 }
 
 void kvm_set_spte_hva_hv(struct kvm *kvm, unsigned long hva, pte_t pte)
 {
-	kvm_handle_hva(kvm, hva, kvm_unmap_rmapp);
+	hva_handler_fn handler;
+
+	handler = kvm_is_radix(kvm) ? kvm_unmap_radix : kvm_unmap_rmapp;
+	kvm_handle_hva(kvm, hva, handler);
 }
 
 static int vcpus_running(struct kvm *kvm)
@@ -953,7 +1017,7 @@ static int vcpus_running(struct kvm *kvm)
  */
 static int kvm_test_clear_dirty_npages(struct kvm *kvm, unsigned long *rmapp)
 {
-	struct revmap_entry *rev = kvm->arch.revmap;
+	struct revmap_entry *rev = kvm->arch.hpt.rev;
 	unsigned long head, i, j;
 	unsigned long n;
 	unsigned long v, r;
@@ -978,7 +1042,7 @@ static int kvm_test_clear_dirty_npages(struct kvm *kvm, unsigned long *rmapp)
 	i = head = *rmapp & KVMPPC_RMAP_INDEX;
 	do {
 		unsigned long hptep1;
-		hptep = (__be64 *) (kvm->arch.hpt_virt + (i << 4));
+		hptep = (__be64 *) (kvm->arch.hpt.virt + (i << 4));
 		j = rev[i].forw;
 
 		/*
@@ -1040,7 +1104,7 @@ static int kvm_test_clear_dirty_npages(struct kvm *kvm, unsigned long *rmapp)
 	return npages_dirty;
 }
 
-static void harvest_vpa_dirty(struct kvmppc_vpa *vpa,
+void kvmppc_harvest_vpa_dirty(struct kvmppc_vpa *vpa,
 			      struct kvm_memory_slot *memslot,
 			      unsigned long *map)
 {
@@ -1058,12 +1122,11 @@ static void harvest_vpa_dirty(struct kvmppc_vpa *vpa,
 		__set_bit_le(gfn - memslot->base_gfn, map);
 }
 
-long kvmppc_hv_get_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot,
-			     unsigned long *map)
+long kvmppc_hv_get_dirty_log_hpt(struct kvm *kvm,
+			struct kvm_memory_slot *memslot, unsigned long *map)
 {
 	unsigned long i, j;
 	unsigned long *rmapp;
-	struct kvm_vcpu *vcpu;
 
 	preempt_disable();
 	rmapp = memslot->arch.rmap;
@@ -1079,15 +1142,6 @@ long kvmppc_hv_get_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot,
 				__set_bit_le(j, map);
 		++rmapp;
 	}
-
-	/* Harvest dirty bits from VPA and DTL updates */
-	/* Note: we never modify the SLB shadow buffer areas */
-	kvm_for_each_vcpu(i, vcpu, kvm) {
-		spin_lock(&vcpu->arch.vpa_update_lock);
-		harvest_vpa_dirty(&vcpu->arch.vpa, memslot, map);
-		harvest_vpa_dirty(&vcpu->arch.dtl, memslot, map);
-		spin_unlock(&vcpu->arch.vpa_update_lock);
-	}
 	preempt_enable();
 	return 0;
 }
@@ -1142,15 +1196,367 @@ void kvmppc_unpin_guest_page(struct kvm *kvm, void *va, unsigned long gpa,
 	srcu_idx = srcu_read_lock(&kvm->srcu);
 	memslot = gfn_to_memslot(kvm, gfn);
 	if (memslot) {
-		rmap = &memslot->arch.rmap[gfn - memslot->base_gfn];
-		lock_rmap(rmap);
-		*rmap |= KVMPPC_RMAP_CHANGED;
-		unlock_rmap(rmap);
+		if (!kvm_is_radix(kvm)) {
+			rmap = &memslot->arch.rmap[gfn - memslot->base_gfn];
+			lock_rmap(rmap);
+			*rmap |= KVMPPC_RMAP_CHANGED;
+			unlock_rmap(rmap);
+		} else if (memslot->dirty_bitmap) {
+			mark_page_dirty(kvm, gfn);
+		}
 	}
 	srcu_read_unlock(&kvm->srcu, srcu_idx);
 }
 
 /*
+ * HPT resizing
+ */
+static int resize_hpt_allocate(struct kvm_resize_hpt *resize)
+{
+	int rc;
+
+	rc = kvmppc_allocate_hpt(&resize->hpt, resize->order);
+	if (rc < 0)
+		return rc;
+
+	resize_hpt_debug(resize, "resize_hpt_allocate(): HPT @ 0x%lx\n",
+			 resize->hpt.virt);
+
+	return 0;
+}
+
+static unsigned long resize_hpt_rehash_hpte(struct kvm_resize_hpt *resize,
+					    unsigned long idx)
+{
+	struct kvm *kvm = resize->kvm;
+	struct kvm_hpt_info *old = &kvm->arch.hpt;
+	struct kvm_hpt_info *new = &resize->hpt;
+	unsigned long old_hash_mask = (1ULL << (old->order - 7)) - 1;
+	unsigned long new_hash_mask = (1ULL << (new->order - 7)) - 1;
+	__be64 *hptep, *new_hptep;
+	unsigned long vpte, rpte, guest_rpte;
+	int ret;
+	struct revmap_entry *rev;
+	unsigned long apsize, psize, avpn, pteg, hash;
+	unsigned long new_idx, new_pteg, replace_vpte;
+
+	hptep = (__be64 *)(old->virt + (idx << 4));
+
+	/* Guest is stopped, so new HPTEs can't be added or faulted
+	 * in, only unmapped or altered by host actions.  So, it's
+	 * safe to check this before we take the HPTE lock */
+	vpte = be64_to_cpu(hptep[0]);
+	if (!(vpte & HPTE_V_VALID) && !(vpte & HPTE_V_ABSENT))
+		return 0; /* nothing to do */
+
+	while (!try_lock_hpte(hptep, HPTE_V_HVLOCK))
+		cpu_relax();
+
+	vpte = be64_to_cpu(hptep[0]);
+
+	ret = 0;
+	if (!(vpte & HPTE_V_VALID) && !(vpte & HPTE_V_ABSENT))
+		/* Nothing to do */
+		goto out;
+
+	/* Unmap */
+	rev = &old->rev[idx];
+	guest_rpte = rev->guest_rpte;
+
+	ret = -EIO;
+	apsize = hpte_page_size(vpte, guest_rpte);
+	if (!apsize)
+		goto out;
+
+	if (vpte & HPTE_V_VALID) {
+		unsigned long gfn = hpte_rpn(guest_rpte, apsize);
+		int srcu_idx = srcu_read_lock(&kvm->srcu);
+		struct kvm_memory_slot *memslot =
+			__gfn_to_memslot(kvm_memslots(kvm), gfn);
+
+		if (memslot) {
+			unsigned long *rmapp;
+			rmapp = &memslot->arch.rmap[gfn - memslot->base_gfn];
+
+			lock_rmap(rmapp);
+			kvmppc_unmap_hpte(kvm, idx, rmapp, gfn);
+			unlock_rmap(rmapp);
+		}
+
+		srcu_read_unlock(&kvm->srcu, srcu_idx);
+	}
+
+	/* Reload PTE after unmap */
+	vpte = be64_to_cpu(hptep[0]);
+
+	BUG_ON(vpte & HPTE_V_VALID);
+	BUG_ON(!(vpte & HPTE_V_ABSENT));
+
+	ret = 0;
+	if (!(vpte & HPTE_V_BOLTED))
+		goto out;
+
+	rpte = be64_to_cpu(hptep[1]);
+	psize = hpte_base_page_size(vpte, rpte);
+	avpn = HPTE_V_AVPN_VAL(vpte) & ~((psize - 1) >> 23);
+	pteg = idx / HPTES_PER_GROUP;
+	if (vpte & HPTE_V_SECONDARY)
+		pteg = ~pteg;
+
+	if (!(vpte & HPTE_V_1TB_SEG)) {
+		unsigned long offset, vsid;
+
+		/* We only have 28 - 23 bits of offset in avpn */
+		offset = (avpn & 0x1f) << 23;
+		vsid = avpn >> 5;
+		/* We can find more bits from the pteg value */
+		if (psize < (1ULL << 23))
+			offset |= ((vsid ^ pteg) & old_hash_mask) * psize;
+
+		hash = vsid ^ (offset / psize);
+	} else {
+		unsigned long offset, vsid;
+
+		/* We only have 40 - 23 bits of seg_off in avpn */
+		offset = (avpn & 0x1ffff) << 23;
+		vsid = avpn >> 17;
+		if (psize < (1ULL << 23))
+			offset |= ((vsid ^ (vsid << 25) ^ pteg) & old_hash_mask) * psize;
+
+		hash = vsid ^ (vsid << 25) ^ (offset / psize);
+	}
+
+	new_pteg = hash & new_hash_mask;
+	if (vpte & HPTE_V_SECONDARY) {
+		BUG_ON(~pteg != (hash & old_hash_mask));
+		new_pteg = ~new_pteg;
+	} else {
+		BUG_ON(pteg != (hash & old_hash_mask));
+	}
+
+	new_idx = new_pteg * HPTES_PER_GROUP + (idx % HPTES_PER_GROUP);
+	new_hptep = (__be64 *)(new->virt + (new_idx << 4));
+
+	replace_vpte = be64_to_cpu(new_hptep[0]);
+
+	if (replace_vpte & (HPTE_V_VALID | HPTE_V_ABSENT)) {
+		BUG_ON(new->order >= old->order);
+
+		if (replace_vpte & HPTE_V_BOLTED) {
+			if (vpte & HPTE_V_BOLTED)
+				/* Bolted collision, nothing we can do */
+				ret = -ENOSPC;
+			/* Discard the new HPTE */
+			goto out;
+		}
+
+		/* Discard the previous HPTE */
+	}
+
+	new_hptep[1] = cpu_to_be64(rpte);
+	new->rev[new_idx].guest_rpte = guest_rpte;
+	/* No need for a barrier, since new HPT isn't active */
+	new_hptep[0] = cpu_to_be64(vpte);
+	unlock_hpte(new_hptep, vpte);
+
+out:
+	unlock_hpte(hptep, vpte);
+	return ret;
+}
+
+static int resize_hpt_rehash(struct kvm_resize_hpt *resize)
+{
+	struct kvm *kvm = resize->kvm;
+	unsigned  long i;
+	int rc;
+
+	for (i = 0; i < kvmppc_hpt_npte(&kvm->arch.hpt); i++) {
+		rc = resize_hpt_rehash_hpte(resize, i);
+		if (rc != 0)
+			return rc;
+	}
+
+	return 0;
+}
+
+static void resize_hpt_pivot(struct kvm_resize_hpt *resize)
+{
+	struct kvm *kvm = resize->kvm;
+	struct kvm_hpt_info hpt_tmp;
+
+	/* Exchange the pending tables in the resize structure with
+	 * the active tables */
+
+	resize_hpt_debug(resize, "resize_hpt_pivot()\n");
+
+	spin_lock(&kvm->mmu_lock);
+	asm volatile("ptesync" : : : "memory");
+
+	hpt_tmp = kvm->arch.hpt;
+	kvmppc_set_hpt(kvm, &resize->hpt);
+	resize->hpt = hpt_tmp;
+
+	spin_unlock(&kvm->mmu_lock);
+
+	synchronize_srcu_expedited(&kvm->srcu);
+
+	resize_hpt_debug(resize, "resize_hpt_pivot() done\n");
+}
+
+static void resize_hpt_release(struct kvm *kvm, struct kvm_resize_hpt *resize)
+{
+	BUG_ON(kvm->arch.resize_hpt != resize);
+
+	if (resize->hpt.virt)
+		kvmppc_free_hpt(&resize->hpt);
+
+	kvm->arch.resize_hpt = NULL;
+	kfree(resize);
+}
+
+static void resize_hpt_prepare_work(struct work_struct *work)
+{
+	struct kvm_resize_hpt *resize = container_of(work,
+						     struct kvm_resize_hpt,
+						     work);
+	struct kvm *kvm = resize->kvm;
+	int err;
+
+	resize_hpt_debug(resize, "resize_hpt_prepare_work(): order = %d\n",
+			 resize->order);
+
+	err = resize_hpt_allocate(resize);
+
+	mutex_lock(&kvm->lock);
+
+	resize->error = err;
+	resize->prepare_done = true;
+
+	mutex_unlock(&kvm->lock);
+}
+
+long kvm_vm_ioctl_resize_hpt_prepare(struct kvm *kvm,
+				     struct kvm_ppc_resize_hpt *rhpt)
+{
+	unsigned long flags = rhpt->flags;
+	unsigned long shift = rhpt->shift;
+	struct kvm_resize_hpt *resize;
+	int ret;
+
+	if (flags != 0)
+		return -EINVAL;
+
+	if (shift && ((shift < 18) || (shift > 46)))
+		return -EINVAL;
+
+	mutex_lock(&kvm->lock);
+
+	resize = kvm->arch.resize_hpt;
+
+	if (resize) {
+		if (resize->order == shift) {
+			/* Suitable resize in progress */
+			if (resize->prepare_done) {
+				ret = resize->error;
+				if (ret != 0)
+					resize_hpt_release(kvm, resize);
+			} else {
+				ret = 100; /* estimated time in ms */
+			}
+
+			goto out;
+		}
+
+		/* not suitable, cancel it */
+		resize_hpt_release(kvm, resize);
+	}
+
+	ret = 0;
+	if (!shift)
+		goto out; /* nothing to do */
+
+	/* start new resize */
+
+	resize = kzalloc(sizeof(*resize), GFP_KERNEL);
+	resize->order = shift;
+	resize->kvm = kvm;
+	INIT_WORK(&resize->work, resize_hpt_prepare_work);
+	kvm->arch.resize_hpt = resize;
+
+	schedule_work(&resize->work);
+
+	ret = 100; /* estimated time in ms */
+
+out:
+	mutex_unlock(&kvm->lock);
+	return ret;
+}
+
+static void resize_hpt_boot_vcpu(void *opaque)
+{
+	/* Nothing to do, just force a KVM exit */
+}
+
+long kvm_vm_ioctl_resize_hpt_commit(struct kvm *kvm,
+				    struct kvm_ppc_resize_hpt *rhpt)
+{
+	unsigned long flags = rhpt->flags;
+	unsigned long shift = rhpt->shift;
+	struct kvm_resize_hpt *resize;
+	long ret;
+
+	if (flags != 0)
+		return -EINVAL;
+
+	if (shift && ((shift < 18) || (shift > 46)))
+		return -EINVAL;
+
+	mutex_lock(&kvm->lock);
+
+	resize = kvm->arch.resize_hpt;
+
+	/* This shouldn't be possible */
+	ret = -EIO;
+	if (WARN_ON(!kvm->arch.hpte_setup_done))
+		goto out_no_hpt;
+
+	/* Stop VCPUs from running while we mess with the HPT */
+	kvm->arch.hpte_setup_done = 0;
+	smp_mb();
+
+	/* Boot all CPUs out of the guest so they re-read
+	 * hpte_setup_done */
+	on_each_cpu(resize_hpt_boot_vcpu, NULL, 1);
+
+	ret = -ENXIO;
+	if (!resize || (resize->order != shift))
+		goto out;
+
+	ret = -EBUSY;
+	if (!resize->prepare_done)
+		goto out;
+
+	ret = resize->error;
+	if (ret != 0)
+		goto out;
+
+	ret = resize_hpt_rehash(resize);
+	if (ret != 0)
+		goto out;
+
+	resize_hpt_pivot(resize);
+
+out:
+	/* Let VCPUs run again */
+	kvm->arch.hpte_setup_done = 1;
+	smp_mb();
+out_no_hpt:
+	resize_hpt_release(kvm, resize);
+	mutex_unlock(&kvm->lock);
+	return ret;
+}
+
+/*
  * Functions for reading and writing the hash table via reads and
  * writes on a file descriptor.
  *
@@ -1290,8 +1696,8 @@ static ssize_t kvm_htab_read(struct file *file, char __user *buf,
 	flags = ctx->flags;
 
 	i = ctx->index;
-	hptp = (__be64 *)(kvm->arch.hpt_virt + (i * HPTE_SIZE));
-	revp = kvm->arch.revmap + i;
+	hptp = (__be64 *)(kvm->arch.hpt.virt + (i * HPTE_SIZE));
+	revp = kvm->arch.hpt.rev + i;
 	lbuf = (unsigned long __user *)buf;
 
 	nb = 0;
@@ -1306,7 +1712,7 @@ static ssize_t kvm_htab_read(struct file *file, char __user *buf,
 
 		/* Skip uninteresting entries, i.e. clean on not-first pass */
 		if (!first_pass) {
-			while (i < kvm->arch.hpt_npte &&
+			while (i < kvmppc_hpt_npte(&kvm->arch.hpt) &&
 			       !hpte_dirty(revp, hptp)) {
 				++i;
 				hptp += 2;
@@ -1316,7 +1722,7 @@ static ssize_t kvm_htab_read(struct file *file, char __user *buf,
 		hdr.index = i;
 
 		/* Grab a series of valid entries */
-		while (i < kvm->arch.hpt_npte &&
+		while (i < kvmppc_hpt_npte(&kvm->arch.hpt) &&
 		       hdr.n_valid < 0xffff &&
 		       nb + HPTE_SIZE < count &&
 		       record_hpte(flags, hptp, hpte, revp, 1, first_pass)) {
@@ -1332,7 +1738,7 @@ static ssize_t kvm_htab_read(struct file *file, char __user *buf,
 			++revp;
 		}
 		/* Now skip invalid entries while we can */
-		while (i < kvm->arch.hpt_npte &&
+		while (i < kvmppc_hpt_npte(&kvm->arch.hpt) &&
 		       hdr.n_invalid < 0xffff &&
 		       record_hpte(flags, hptp, hpte, revp, 0, first_pass)) {
 			/* found an invalid entry */
@@ -1353,7 +1759,7 @@ static ssize_t kvm_htab_read(struct file *file, char __user *buf,
 		}
 
 		/* Check if we've wrapped around the hash table */
-		if (i >= kvm->arch.hpt_npte) {
+		if (i >= kvmppc_hpt_npte(&kvm->arch.hpt)) {
 			i = 0;
 			ctx->first_pass = 0;
 			break;
@@ -1412,11 +1818,11 @@ static ssize_t kvm_htab_write(struct file *file, const char __user *buf,
 
 		err = -EINVAL;
 		i = hdr.index;
-		if (i >= kvm->arch.hpt_npte ||
-		    i + hdr.n_valid + hdr.n_invalid > kvm->arch.hpt_npte)
+		if (i >= kvmppc_hpt_npte(&kvm->arch.hpt) ||
+		    i + hdr.n_valid + hdr.n_invalid > kvmppc_hpt_npte(&kvm->arch.hpt))
 			break;
 
-		hptp = (__be64 *)(kvm->arch.hpt_virt + (i * HPTE_SIZE));
+		hptp = (__be64 *)(kvm->arch.hpt.virt + (i * HPTE_SIZE));
 		lbuf = (unsigned long __user *)buf;
 		for (j = 0; j < hdr.n_valid; ++j) {
 			__be64 hpte_v;
@@ -1603,8 +2009,9 @@ static ssize_t debugfs_htab_read(struct file *file, char __user *buf,
 
 	kvm = p->kvm;
 	i = p->hpt_index;
-	hptp = (__be64 *)(kvm->arch.hpt_virt + (i * HPTE_SIZE));
-	for (; len != 0 && i < kvm->arch.hpt_npte; ++i, hptp += 2) {
+	hptp = (__be64 *)(kvm->arch.hpt.virt + (i * HPTE_SIZE));
+	for (; len != 0 && i < kvmppc_hpt_npte(&kvm->arch.hpt);
+	     ++i, hptp += 2) {
 		if (!(be64_to_cpu(hptp[0]) & (HPTE_V_VALID | HPTE_V_ABSENT)))
 			continue;
 
@@ -1614,7 +2021,7 @@ static ssize_t debugfs_htab_read(struct file *file, char __user *buf,
 			cpu_relax();
 		v = be64_to_cpu(hptp[0]) & ~HPTE_V_HVLOCK;
 		hr = be64_to_cpu(hptp[1]);
-		gr = kvm->arch.revmap[i].guest_rpte;
+		gr = kvm->arch.hpt.rev[i].guest_rpte;
 		unlock_hpte(hptp, v);
 		preempt_enable();
 
@@ -1675,7 +2082,10 @@ void kvmppc_mmu_book3s_hv_init(struct kvm_vcpu *vcpu)
 
 	vcpu->arch.slb_nr = 32;		/* POWER7/POWER8 */
 
-	mmu->xlate = kvmppc_mmu_book3s_64_hv_xlate;
+	if (kvm_is_radix(vcpu->kvm))
+		mmu->xlate = kvmppc_mmu_radix_xlate;
+	else
+		mmu->xlate = kvmppc_mmu_book3s_64_hv_xlate;
 	mmu->reset_msr = kvmppc_mmu_book3s_64_hv_reset_msr;
 
 	vcpu->arch.hflags |= BOOK3S_HFLAG_SLB;
diff --git a/arch/powerpc/kvm/book3s_64_mmu_radix.c b/arch/powerpc/kvm/book3s_64_mmu_radix.c
new file mode 100644
index 000000000000..4344651f408c
--- /dev/null
+++ b/arch/powerpc/kvm/book3s_64_mmu_radix.c
@@ -0,0 +1,716 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ *
+ * Copyright 2016 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
+ */
+
+#include <linux/types.h>
+#include <linux/string.h>
+#include <linux/kvm.h>
+#include <linux/kvm_host.h>
+
+#include <asm/kvm_ppc.h>
+#include <asm/kvm_book3s.h>
+#include <asm/page.h>
+#include <asm/mmu.h>
+#include <asm/pgtable.h>
+#include <asm/pgalloc.h>
+
+/*
+ * Supported radix tree geometry.
+ * Like p9, we support either 5 or 9 bits at the first (lowest) level,
+ * for a page size of 64k or 4k.
+ */
+static int p9_supported_radix_bits[4] = { 5, 9, 9, 13 };
+
+int kvmppc_mmu_radix_xlate(struct kvm_vcpu *vcpu, gva_t eaddr,
+			   struct kvmppc_pte *gpte, bool data, bool iswrite)
+{
+	struct kvm *kvm = vcpu->kvm;
+	u32 pid;
+	int ret, level, ps;
+	__be64 prte, rpte;
+	unsigned long root, pte, index;
+	unsigned long rts, bits, offset;
+	unsigned long gpa;
+	unsigned long proc_tbl_size;
+
+	/* Work out effective PID */
+	switch (eaddr >> 62) {
+	case 0:
+		pid = vcpu->arch.pid;
+		break;
+	case 3:
+		pid = 0;
+		break;
+	default:
+		return -EINVAL;
+	}
+	proc_tbl_size = 1 << ((kvm->arch.process_table & PRTS_MASK) + 12);
+	if (pid * 16 >= proc_tbl_size)
+		return -EINVAL;
+
+	/* Read partition table to find root of tree for effective PID */
+	ret = kvm_read_guest(kvm, kvm->arch.process_table + pid * 16,
+			     &prte, sizeof(prte));
+	if (ret)
+		return ret;
+
+	root = be64_to_cpu(prte);
+	rts = ((root & RTS1_MASK) >> (RTS1_SHIFT - 3)) |
+		((root & RTS2_MASK) >> RTS2_SHIFT);
+	bits = root & RPDS_MASK;
+	root = root & RPDB_MASK;
+
+	/* P9 DD1 interprets RTS (radix tree size) differently */
+	offset = rts + 31;
+	if (cpu_has_feature(CPU_FTR_POWER9_DD1))
+		offset -= 3;
+
+	/* current implementations only support 52-bit space */
+	if (offset != 52)
+		return -EINVAL;
+
+	for (level = 3; level >= 0; --level) {
+		if (level && bits != p9_supported_radix_bits[level])
+			return -EINVAL;
+		if (level == 0 && !(bits == 5 || bits == 9))
+			return -EINVAL;
+		offset -= bits;
+		index = (eaddr >> offset) & ((1UL << bits) - 1);
+		/* check that low bits of page table base are zero */
+		if (root & ((1UL << (bits + 3)) - 1))
+			return -EINVAL;
+		ret = kvm_read_guest(kvm, root + index * 8,
+				     &rpte, sizeof(rpte));
+		if (ret)
+			return ret;
+		pte = __be64_to_cpu(rpte);
+		if (!(pte & _PAGE_PRESENT))
+			return -ENOENT;
+		if (pte & _PAGE_PTE)
+			break;
+		bits = pte & 0x1f;
+		root = pte & 0x0fffffffffffff00ul;
+	}
+	/* need a leaf at lowest level; 512GB pages not supported */
+	if (level < 0 || level == 3)
+		return -EINVAL;
+
+	/* offset is now log base 2 of the page size */
+	gpa = pte & 0x01fffffffffff000ul;
+	if (gpa & ((1ul << offset) - 1))
+		return -EINVAL;
+	gpa += eaddr & ((1ul << offset) - 1);
+	for (ps = MMU_PAGE_4K; ps < MMU_PAGE_COUNT; ++ps)
+		if (offset == mmu_psize_defs[ps].shift)
+			break;
+	gpte->page_size = ps;
+
+	gpte->eaddr = eaddr;
+	gpte->raddr = gpa;
+
+	/* Work out permissions */
+	gpte->may_read = !!(pte & _PAGE_READ);
+	gpte->may_write = !!(pte & _PAGE_WRITE);
+	gpte->may_execute = !!(pte & _PAGE_EXEC);
+	if (kvmppc_get_msr(vcpu) & MSR_PR) {
+		if (pte & _PAGE_PRIVILEGED) {
+			gpte->may_read = 0;
+			gpte->may_write = 0;
+			gpte->may_execute = 0;
+		}
+	} else {
+		if (!(pte & _PAGE_PRIVILEGED)) {
+			/* Check AMR/IAMR to see if strict mode is in force */
+			if (vcpu->arch.amr & (1ul << 62))
+				gpte->may_read = 0;
+			if (vcpu->arch.amr & (1ul << 63))
+				gpte->may_write = 0;
+			if (vcpu->arch.iamr & (1ul << 62))
+				gpte->may_execute = 0;
+		}
+	}
+
+	return 0;
+}
+
+#ifdef CONFIG_PPC_64K_PAGES
+#define MMU_BASE_PSIZE	MMU_PAGE_64K
+#else
+#define MMU_BASE_PSIZE	MMU_PAGE_4K
+#endif
+
+static void kvmppc_radix_tlbie_page(struct kvm *kvm, unsigned long addr,
+				    unsigned int pshift)
+{
+	int psize = MMU_BASE_PSIZE;
+
+	if (pshift >= PMD_SHIFT)
+		psize = MMU_PAGE_2M;
+	addr &= ~0xfffUL;
+	addr |= mmu_psize_defs[psize].ap << 5;
+	asm volatile("ptesync": : :"memory");
+	asm volatile(PPC_TLBIE_5(%0, %1, 0, 0, 1)
+		     : : "r" (addr), "r" (kvm->arch.lpid) : "memory");
+	asm volatile("ptesync": : :"memory");
+}
+
+unsigned long kvmppc_radix_update_pte(struct kvm *kvm, pte_t *ptep,
+				      unsigned long clr, unsigned long set,
+				      unsigned long addr, unsigned int shift)
+{
+	unsigned long old = 0;
+
+	if (!(clr & _PAGE_PRESENT) && cpu_has_feature(CPU_FTR_POWER9_DD1) &&
+	    pte_present(*ptep)) {
+		/* have to invalidate it first */
+		old = __radix_pte_update(ptep, _PAGE_PRESENT, 0);
+		kvmppc_radix_tlbie_page(kvm, addr, shift);
+		set |= _PAGE_PRESENT;
+		old &= _PAGE_PRESENT;
+	}
+	return __radix_pte_update(ptep, clr, set) | old;
+}
+
+void kvmppc_radix_set_pte_at(struct kvm *kvm, unsigned long addr,
+			     pte_t *ptep, pte_t pte)
+{
+	radix__set_pte_at(kvm->mm, addr, ptep, pte, 0);
+}
+
+static struct kmem_cache *kvm_pte_cache;
+
+static pte_t *kvmppc_pte_alloc(void)
+{
+	return kmem_cache_alloc(kvm_pte_cache, GFP_KERNEL);
+}
+
+static void kvmppc_pte_free(pte_t *ptep)
+{
+	kmem_cache_free(kvm_pte_cache, ptep);
+}
+
+static int kvmppc_create_pte(struct kvm *kvm, pte_t pte, unsigned long gpa,
+			     unsigned int level, unsigned long mmu_seq)
+{
+	pgd_t *pgd;
+	pud_t *pud, *new_pud = NULL;
+	pmd_t *pmd, *new_pmd = NULL;
+	pte_t *ptep, *new_ptep = NULL;
+	unsigned long old;
+	int ret;
+
+	/* Traverse the guest's 2nd-level tree, allocate new levels needed */
+	pgd = kvm->arch.pgtable + pgd_index(gpa);
+	pud = NULL;
+	if (pgd_present(*pgd))
+		pud = pud_offset(pgd, gpa);
+	else
+		new_pud = pud_alloc_one(kvm->mm, gpa);
+
+	pmd = NULL;
+	if (pud && pud_present(*pud))
+		pmd = pmd_offset(pud, gpa);
+	else
+		new_pmd = pmd_alloc_one(kvm->mm, gpa);
+
+	if (level == 0 && !(pmd && pmd_present(*pmd)))
+		new_ptep = kvmppc_pte_alloc();
+
+	/* Check if we might have been invalidated; let the guest retry if so */
+	spin_lock(&kvm->mmu_lock);
+	ret = -EAGAIN;
+	if (mmu_notifier_retry(kvm, mmu_seq))
+		goto out_unlock;
+
+	/* Now traverse again under the lock and change the tree */
+	ret = -ENOMEM;
+	if (pgd_none(*pgd)) {
+		if (!new_pud)
+			goto out_unlock;
+		pgd_populate(kvm->mm, pgd, new_pud);
+		new_pud = NULL;
+	}
+	pud = pud_offset(pgd, gpa);
+	if (pud_none(*pud)) {
+		if (!new_pmd)
+			goto out_unlock;
+		pud_populate(kvm->mm, pud, new_pmd);
+		new_pmd = NULL;
+	}
+	pmd = pmd_offset(pud, gpa);
+	if (pmd_large(*pmd)) {
+		/* Someone else has instantiated a large page here; retry */
+		ret = -EAGAIN;
+		goto out_unlock;
+	}
+	if (level == 1 && !pmd_none(*pmd)) {
+		/*
+		 * There's a page table page here, but we wanted
+		 * to install a large page.  Tell the caller and let
+		 * it try installing a normal page if it wants.
+		 */
+		ret = -EBUSY;
+		goto out_unlock;
+	}
+	if (level == 0) {
+		if (pmd_none(*pmd)) {
+			if (!new_ptep)
+				goto out_unlock;
+			pmd_populate(kvm->mm, pmd, new_ptep);
+			new_ptep = NULL;
+		}
+		ptep = pte_offset_kernel(pmd, gpa);
+		if (pte_present(*ptep)) {
+			/* PTE was previously valid, so invalidate it */
+			old = kvmppc_radix_update_pte(kvm, ptep, _PAGE_PRESENT,
+						      0, gpa, 0);
+			kvmppc_radix_tlbie_page(kvm, gpa, 0);
+			if (old & _PAGE_DIRTY)
+				mark_page_dirty(kvm, gpa >> PAGE_SHIFT);
+		}
+		kvmppc_radix_set_pte_at(kvm, gpa, ptep, pte);
+	} else {
+		kvmppc_radix_set_pte_at(kvm, gpa, pmdp_ptep(pmd), pte);
+	}
+	ret = 0;
+
+ out_unlock:
+	spin_unlock(&kvm->mmu_lock);
+	if (new_pud)
+		pud_free(kvm->mm, new_pud);
+	if (new_pmd)
+		pmd_free(kvm->mm, new_pmd);
+	if (new_ptep)
+		kvmppc_pte_free(new_ptep);
+	return ret;
+}
+
+int kvmppc_book3s_radix_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
+				   unsigned long ea, unsigned long dsisr)
+{
+	struct kvm *kvm = vcpu->kvm;
+	unsigned long mmu_seq, pte_size;
+	unsigned long gpa, gfn, hva, pfn;
+	struct kvm_memory_slot *memslot;
+	struct page *page = NULL, *pages[1];
+	long ret, npages, ok;
+	unsigned int writing;
+	struct vm_area_struct *vma;
+	unsigned long flags;
+	pte_t pte, *ptep;
+	unsigned long pgflags;
+	unsigned int shift, level;
+
+	/* Check for unusual errors */
+	if (dsisr & DSISR_UNSUPP_MMU) {
+		pr_err("KVM: Got unsupported MMU fault\n");
+		return -EFAULT;
+	}
+	if (dsisr & DSISR_BADACCESS) {
+		/* Reflect to the guest as DSI */
+		pr_err("KVM: Got radix HV page fault with DSISR=%lx\n", dsisr);
+		kvmppc_core_queue_data_storage(vcpu, ea, dsisr);
+		return RESUME_GUEST;
+	}
+
+	/* Translate the logical address and get the page */
+	gpa = vcpu->arch.fault_gpa & ~0xfffUL;
+	gpa &= ~0xF000000000000000ul;
+	gfn = gpa >> PAGE_SHIFT;
+	if (!(dsisr & DSISR_PGDIRFAULT))
+		gpa |= ea & 0xfff;
+	memslot = gfn_to_memslot(kvm, gfn);
+
+	/* No memslot means it's an emulated MMIO region */
+	if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID)) {
+		if (dsisr & (DSISR_PGDIRFAULT | DSISR_BADACCESS |
+			     DSISR_SET_RC)) {
+			/*
+			 * Bad address in guest page table tree, or other
+			 * unusual error - reflect it to the guest as DSI.
+			 */
+			kvmppc_core_queue_data_storage(vcpu, ea, dsisr);
+			return RESUME_GUEST;
+		}
+		return kvmppc_hv_emulate_mmio(run, vcpu, gpa, ea,
+					      dsisr & DSISR_ISSTORE);
+	}
+
+	/* used to check for invalidations in progress */
+	mmu_seq = kvm->mmu_notifier_seq;
+	smp_rmb();
+
+	writing = (dsisr & DSISR_ISSTORE) != 0;
+	hva = gfn_to_hva_memslot(memslot, gfn);
+	if (dsisr & DSISR_SET_RC) {
+		/*
+		 * Need to set an R or C bit in the 2nd-level tables;
+		 * if the relevant bits aren't already set in the linux
+		 * page tables, fall through to do the gup_fast to
+		 * set them in the linux page tables too.
+		 */
+		ok = 0;
+		pgflags = _PAGE_ACCESSED;
+		if (writing)
+			pgflags |= _PAGE_DIRTY;
+		local_irq_save(flags);
+		ptep = __find_linux_pte_or_hugepte(current->mm->pgd, hva,
+						   NULL, NULL);
+		if (ptep) {
+			pte = READ_ONCE(*ptep);
+			if (pte_present(pte) &&
+			    (pte_val(pte) & pgflags) == pgflags)
+				ok = 1;
+		}
+		local_irq_restore(flags);
+		if (ok) {
+			spin_lock(&kvm->mmu_lock);
+			if (mmu_notifier_retry(vcpu->kvm, mmu_seq)) {
+				spin_unlock(&kvm->mmu_lock);
+				return RESUME_GUEST;
+			}
+			ptep = __find_linux_pte_or_hugepte(kvm->arch.pgtable,
+							gpa, NULL, &shift);
+			if (ptep && pte_present(*ptep)) {
+				kvmppc_radix_update_pte(kvm, ptep, 0, pgflags,
+							gpa, shift);
+				spin_unlock(&kvm->mmu_lock);
+				return RESUME_GUEST;
+			}
+			spin_unlock(&kvm->mmu_lock);
+		}
+	}
+
+	ret = -EFAULT;
+	pfn = 0;
+	pte_size = PAGE_SIZE;
+	pgflags = _PAGE_READ | _PAGE_EXEC;
+	level = 0;
+	npages = get_user_pages_fast(hva, 1, writing, pages);
+	if (npages < 1) {
+		/* Check if it's an I/O mapping */
+		down_read(&current->mm->mmap_sem);
+		vma = find_vma(current->mm, hva);
+		if (vma && vma->vm_start <= hva && hva < vma->vm_end &&
+		    (vma->vm_flags & VM_PFNMAP)) {
+			pfn = vma->vm_pgoff +
+				((hva - vma->vm_start) >> PAGE_SHIFT);
+			pgflags = pgprot_val(vma->vm_page_prot);
+		}
+		up_read(&current->mm->mmap_sem);
+		if (!pfn)
+			return -EFAULT;
+	} else {
+		page = pages[0];
+		pfn = page_to_pfn(page);
+		if (PageHuge(page)) {
+			page = compound_head(page);
+			pte_size <<= compound_order(page);
+			/* See if we can insert a 2MB large-page PTE here */
+			if (pte_size >= PMD_SIZE &&
+			    (gpa & PMD_MASK & PAGE_MASK) ==
+			    (hva & PMD_MASK & PAGE_MASK)) {
+				level = 1;
+				pfn &= ~((PMD_SIZE >> PAGE_SHIFT) - 1);
+			}
+		}
+		/* See if we can provide write access */
+		if (writing) {
+			/*
+			 * We assume gup_fast has set dirty on the host PTE.
+			 */
+			pgflags |= _PAGE_WRITE;
+		} else {
+			local_irq_save(flags);
+			ptep = __find_linux_pte_or_hugepte(current->mm->pgd,
+							hva, NULL, NULL);
+			if (ptep && pte_write(*ptep) && pte_dirty(*ptep))
+				pgflags |= _PAGE_WRITE;
+			local_irq_restore(flags);
+		}
+	}
+
+	/*
+	 * Compute the PTE value that we need to insert.
+	 */
+	pgflags |= _PAGE_PRESENT | _PAGE_PTE | _PAGE_ACCESSED;
+	if (pgflags & _PAGE_WRITE)
+		pgflags |= _PAGE_DIRTY;
+	pte = pfn_pte(pfn, __pgprot(pgflags));
+
+	/* Allocate space in the tree and write the PTE */
+	ret = kvmppc_create_pte(kvm, pte, gpa, level, mmu_seq);
+	if (ret == -EBUSY) {
+		/*
+		 * There's already a PMD where wanted to install a large page;
+		 * for now, fall back to installing a small page.
+		 */
+		level = 0;
+		pfn |= gfn & ((PMD_SIZE >> PAGE_SHIFT) - 1);
+		pte = pfn_pte(pfn, __pgprot(pgflags));
+		ret = kvmppc_create_pte(kvm, pte, gpa, level, mmu_seq);
+	}
+	if (ret == 0 || ret == -EAGAIN)
+		ret = RESUME_GUEST;
+
+	if (page) {
+		/*
+		 * We drop pages[0] here, not page because page might
+		 * have been set to the head page of a compound, but
+		 * we have to drop the reference on the correct tail
+		 * page to match the get inside gup()
+		 */
+		put_page(pages[0]);
+	}
+	return ret;
+}
+
+static void mark_pages_dirty(struct kvm *kvm, struct kvm_memory_slot *memslot,
+			     unsigned long gfn, unsigned int order)
+{
+	unsigned long i, limit;
+	unsigned long *dp;
+
+	if (!memslot->dirty_bitmap)
+		return;
+	limit = 1ul << order;
+	if (limit < BITS_PER_LONG) {
+		for (i = 0; i < limit; ++i)
+			mark_page_dirty(kvm, gfn + i);
+		return;
+	}
+	dp = memslot->dirty_bitmap + (gfn - memslot->base_gfn);
+	limit /= BITS_PER_LONG;
+	for (i = 0; i < limit; ++i)
+		*dp++ = ~0ul;
+}
+
+/* Called with kvm->lock held */
+int kvm_unmap_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
+		    unsigned long gfn)
+{
+	pte_t *ptep;
+	unsigned long gpa = gfn << PAGE_SHIFT;
+	unsigned int shift;
+	unsigned long old;
+
+	ptep = __find_linux_pte_or_hugepte(kvm->arch.pgtable, gpa,
+					   NULL, &shift);
+	if (ptep && pte_present(*ptep)) {
+		old = kvmppc_radix_update_pte(kvm, ptep, _PAGE_PRESENT, 0,
+					      gpa, shift);
+		kvmppc_radix_tlbie_page(kvm, gpa, shift);
+		if (old & _PAGE_DIRTY) {
+			if (!shift)
+				mark_page_dirty(kvm, gfn);
+			else
+				mark_pages_dirty(kvm, memslot,
+						 gfn, shift - PAGE_SHIFT);
+		}
+	}
+	return 0;				
+}
+
+/* Called with kvm->lock held */
+int kvm_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
+		  unsigned long gfn)
+{
+	pte_t *ptep;
+	unsigned long gpa = gfn << PAGE_SHIFT;
+	unsigned int shift;
+	int ref = 0;
+
+	ptep = __find_linux_pte_or_hugepte(kvm->arch.pgtable, gpa,
+					   NULL, &shift);
+	if (ptep && pte_present(*ptep) && pte_young(*ptep)) {
+		kvmppc_radix_update_pte(kvm, ptep, _PAGE_ACCESSED, 0,
+					gpa, shift);
+		/* XXX need to flush tlb here? */
+		ref = 1;
+	}
+	return ref;
+}
+
+/* Called with kvm->lock held */
+int kvm_test_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
+		       unsigned long gfn)
+{
+	pte_t *ptep;
+	unsigned long gpa = gfn << PAGE_SHIFT;
+	unsigned int shift;
+	int ref = 0;
+
+	ptep = __find_linux_pte_or_hugepte(kvm->arch.pgtable, gpa,
+					   NULL, &shift);
+	if (ptep && pte_present(*ptep) && pte_young(*ptep))
+		ref = 1;
+	return ref;
+}
+
+/* Returns the number of PAGE_SIZE pages that are dirty */
+static int kvm_radix_test_clear_dirty(struct kvm *kvm,
+				struct kvm_memory_slot *memslot, int pagenum)
+{
+	unsigned long gfn = memslot->base_gfn + pagenum;
+	unsigned long gpa = gfn << PAGE_SHIFT;
+	pte_t *ptep;
+	unsigned int shift;
+	int ret = 0;
+
+	ptep = __find_linux_pte_or_hugepte(kvm->arch.pgtable, gpa,
+					   NULL, &shift);
+	if (ptep && pte_present(*ptep) && pte_dirty(*ptep)) {
+		ret = 1;
+		if (shift)
+			ret = 1 << (shift - PAGE_SHIFT);
+		kvmppc_radix_update_pte(kvm, ptep, _PAGE_DIRTY, 0,
+					gpa, shift);
+		kvmppc_radix_tlbie_page(kvm, gpa, shift);
+	}
+	return ret;
+}
+
+long kvmppc_hv_get_dirty_log_radix(struct kvm *kvm,
+			struct kvm_memory_slot *memslot, unsigned long *map)
+{
+	unsigned long i, j;
+	unsigned long n, *p;
+	int npages;
+
+	/*
+	 * Radix accumulates dirty bits in the first half of the
+	 * memslot's dirty_bitmap area, for when pages are paged
+	 * out or modified by the host directly.  Pick up these
+	 * bits and add them to the map.
+	 */
+	n = kvm_dirty_bitmap_bytes(memslot) / sizeof(long);
+	p = memslot->dirty_bitmap;
+	for (i = 0; i < n; ++i)
+		map[i] |= xchg(&p[i], 0);
+
+	for (i = 0; i < memslot->npages; i = j) {
+		npages = kvm_radix_test_clear_dirty(kvm, memslot, i);
+
+		/*
+		 * Note that if npages > 0 then i must be a multiple of npages,
+		 * since huge pages are only used to back the guest at guest
+		 * real addresses that are a multiple of their size.
+		 * Since we have at most one PTE covering any given guest
+		 * real address, if npages > 1 we can skip to i + npages.
+		 */
+		j = i + 1;
+		if (npages)
+			for (j = i; npages; ++j, --npages)
+				__set_bit_le(j, map);
+	}
+	return 0;
+}
+
+static void add_rmmu_ap_encoding(struct kvm_ppc_rmmu_info *info,
+				 int psize, int *indexp)
+{
+	if (!mmu_psize_defs[psize].shift)
+		return;
+	info->ap_encodings[*indexp] = mmu_psize_defs[psize].shift |
+		(mmu_psize_defs[psize].ap << 29);
+	++(*indexp);
+}
+
+int kvmhv_get_rmmu_info(struct kvm *kvm, struct kvm_ppc_rmmu_info *info)
+{
+	int i;
+
+	if (!radix_enabled())
+		return -EINVAL;
+	memset(info, 0, sizeof(*info));
+
+	/* 4k page size */
+	info->geometries[0].page_shift = 12;
+	info->geometries[0].level_bits[0] = 9;
+	for (i = 1; i < 4; ++i)
+		info->geometries[0].level_bits[i] = p9_supported_radix_bits[i];
+	/* 64k page size */
+	info->geometries[1].page_shift = 16;
+	for (i = 0; i < 4; ++i)
+		info->geometries[1].level_bits[i] = p9_supported_radix_bits[i];
+
+	i = 0;
+	add_rmmu_ap_encoding(info, MMU_PAGE_4K, &i);
+	add_rmmu_ap_encoding(info, MMU_PAGE_64K, &i);
+	add_rmmu_ap_encoding(info, MMU_PAGE_2M, &i);
+	add_rmmu_ap_encoding(info, MMU_PAGE_1G, &i);
+
+	return 0;
+}
+
+int kvmppc_init_vm_radix(struct kvm *kvm)
+{
+	kvm->arch.pgtable = pgd_alloc(kvm->mm);
+	if (!kvm->arch.pgtable)
+		return -ENOMEM;
+	return 0;
+}
+
+void kvmppc_free_radix(struct kvm *kvm)
+{
+	unsigned long ig, iu, im;
+	pte_t *pte;
+	pmd_t *pmd;
+	pud_t *pud;
+	pgd_t *pgd;
+
+	if (!kvm->arch.pgtable)
+		return;
+	pgd = kvm->arch.pgtable;
+	for (ig = 0; ig < PTRS_PER_PGD; ++ig, ++pgd) {
+		if (!pgd_present(*pgd))
+			continue;
+		pud = pud_offset(pgd, 0);
+		for (iu = 0; iu < PTRS_PER_PUD; ++iu, ++pud) {
+			if (!pud_present(*pud))
+				continue;
+			pmd = pmd_offset(pud, 0);
+			for (im = 0; im < PTRS_PER_PMD; ++im, ++pmd) {
+				if (pmd_huge(*pmd)) {
+					pmd_clear(pmd);
+					continue;
+				}
+				if (!pmd_present(*pmd))
+					continue;
+				pte = pte_offset_map(pmd, 0);
+				memset(pte, 0, sizeof(long) << PTE_INDEX_SIZE);
+				kvmppc_pte_free(pte);
+				pmd_clear(pmd);
+			}
+			pmd_free(kvm->mm, pmd_offset(pud, 0));
+			pud_clear(pud);
+		}
+		pud_free(kvm->mm, pud_offset(pgd, 0));
+		pgd_clear(pgd);
+	}
+	pgd_free(kvm->mm, kvm->arch.pgtable);
+}
+
+static void pte_ctor(void *addr)
+{
+	memset(addr, 0, PTE_TABLE_SIZE);
+}
+
+int kvmppc_radix_init(void)
+{
+	unsigned long size = sizeof(void *) << PTE_INDEX_SIZE;
+
+	kvm_pte_cache = kmem_cache_create("kvm-pte", size, size, 0, pte_ctor);
+	if (!kvm_pte_cache)
+		return -ENOMEM;
+	return 0;
+}
+
+void kvmppc_radix_exit(void)
+{
+	kmem_cache_destroy(kvm_pte_cache);
+}
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index ec34e39471a7..1e107ece4e37 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -182,7 +182,8 @@ static void kvmppc_fast_vcpu_kick_hv(struct kvm_vcpu *vcpu)
 		++vcpu->stat.halt_wakeup;
 	}
 
-	if (kvmppc_ipi_thread(vcpu->arch.thread_cpu))
+	cpu = READ_ONCE(vcpu->arch.thread_cpu);
+	if (cpu >= 0 && kvmppc_ipi_thread(cpu))
 		return;
 
 	/* CPU points to the first thread of the core */
@@ -773,12 +774,8 @@ int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu)
 		}
 		tvcpu->arch.prodded = 1;
 		smp_mb();
-		if (vcpu->arch.ceded) {
-			if (swait_active(&vcpu->wq)) {
-				swake_up(&vcpu->wq);
-				vcpu->stat.halt_wakeup++;
-			}
-		}
+		if (tvcpu->arch.ceded)
+			kvmppc_fast_vcpu_kick_hv(tvcpu);
 		break;
 	case H_CONFER:
 		target = kvmppc_get_gpr(vcpu, 4);
@@ -1135,7 +1132,7 @@ static void kvmppc_set_lpcr(struct kvm_vcpu *vcpu, u64 new_lpcr,
 	/*
 	 * Userspace can only modify DPFD (default prefetch depth),
 	 * ILE (interrupt little-endian) and TC (translation control).
-	 * On POWER8 userspace can also modify AIL (alt. interrupt loc.)
+	 * On POWER8 and POWER9 userspace can also modify AIL (alt. interrupt loc.).
 	 */
 	mask = LPCR_DPFD | LPCR_ILE | LPCR_TC;
 	if (cpu_has_feature(CPU_FTR_ARCH_207S))
@@ -1821,6 +1818,7 @@ static struct kvm_vcpu *kvmppc_core_vcpu_create_hv(struct kvm *kvm,
 	vcpu->arch.vcore = vcore;
 	vcpu->arch.ptid = vcpu->vcpu_id - vcore->first_vcpuid;
 	vcpu->arch.thread_cpu = -1;
+	vcpu->arch.prev_cpu = -1;
 
 	vcpu->arch.cpu_type = KVM_CPU_3S_64;
 	kvmppc_sanity_check(vcpu);
@@ -1950,11 +1948,33 @@ static void kvmppc_release_hwthread(int cpu)
 	tpaca->kvm_hstate.kvm_split_mode = NULL;
 }
 
+static void do_nothing(void *x)
+{
+}
+
+static void radix_flush_cpu(struct kvm *kvm, int cpu, struct kvm_vcpu *vcpu)
+{
+	int i;
+
+	cpu = cpu_first_thread_sibling(cpu);
+	cpumask_set_cpu(cpu, &kvm->arch.need_tlb_flush);
+	/*
+	 * Make sure setting of bit in need_tlb_flush precedes
+	 * testing of cpu_in_guest bits.  The matching barrier on
+	 * the other side is the first smp_mb() in kvmppc_run_core().
+	 */
+	smp_mb();
+	for (i = 0; i < threads_per_core; ++i)
+		if (cpumask_test_cpu(cpu + i, &kvm->arch.cpu_in_guest))
+			smp_call_function_single(cpu + i, do_nothing, NULL, 1);
+}
+
 static void kvmppc_start_thread(struct kvm_vcpu *vcpu, struct kvmppc_vcore *vc)
 {
 	int cpu;
 	struct paca_struct *tpaca;
 	struct kvmppc_vcore *mvc = vc->master_vcore;
+	struct kvm *kvm = vc->kvm;
 
 	cpu = vc->pcpu;
 	if (vcpu) {
@@ -1965,6 +1985,27 @@ static void kvmppc_start_thread(struct kvm_vcpu *vcpu, struct kvmppc_vcore *vc)
 		cpu += vcpu->arch.ptid;
 		vcpu->cpu = mvc->pcpu;
 		vcpu->arch.thread_cpu = cpu;
+
+		/*
+		 * With radix, the guest can do TLB invalidations itself,
+		 * and it could choose to use the local form (tlbiel) if
+		 * it is invalidating a translation that has only ever been
+		 * used on one vcpu.  However, that doesn't mean it has
+		 * only ever been used on one physical cpu, since vcpus
+		 * can move around between pcpus.  To cope with this, when
+		 * a vcpu moves from one pcpu to another, we need to tell
+		 * any vcpus running on the same core as this vcpu previously
+		 * ran to flush the TLB.  The TLB is shared between threads,
+		 * so we use a single bit in .need_tlb_flush for all 4 threads.
+		 */
+		if (kvm_is_radix(kvm) && vcpu->arch.prev_cpu != cpu) {
+			if (vcpu->arch.prev_cpu >= 0 &&
+			    cpu_first_thread_sibling(vcpu->arch.prev_cpu) !=
+			    cpu_first_thread_sibling(cpu))
+				radix_flush_cpu(kvm, vcpu->arch.prev_cpu, vcpu);
+			vcpu->arch.prev_cpu = cpu;
+		}
+		cpumask_set_cpu(cpu, &kvm->arch.cpu_in_guest);
 	}
 	tpaca = &paca[cpu];
 	tpaca->kvm_hstate.kvm_vcpu = vcpu;
@@ -2552,6 +2593,7 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
 		kvmppc_release_hwthread(pcpu + i);
 		if (sip && sip->napped[i])
 			kvmppc_ipi_thread(pcpu + i);
+		cpumask_clear_cpu(pcpu + i, &vc->kvm->arch.cpu_in_guest);
 	}
 
 	kvmppc_set_host_core(pcpu);
@@ -2620,7 +2662,8 @@ static int kvmppc_vcore_check_block(struct kvmppc_vcore *vc)
 	int i;
 
 	for_each_runnable_thread(i, vcpu, vc) {
-		if (vcpu->arch.pending_exceptions || !vcpu->arch.ceded)
+		if (vcpu->arch.pending_exceptions || !vcpu->arch.ceded ||
+		    vcpu->arch.prodded)
 			return 1;
 	}
 
@@ -2806,7 +2849,7 @@ static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
 			break;
 		n_ceded = 0;
 		for_each_runnable_thread(i, v, vc) {
-			if (!v->arch.pending_exceptions)
+			if (!v->arch.pending_exceptions && !v->arch.prodded)
 				n_ceded += v->arch.ceded;
 			else
 				v->arch.ceded = 0;
@@ -2877,7 +2920,7 @@ static int kvmppc_vcpu_run_hv(struct kvm_run *run, struct kvm_vcpu *vcpu)
 	smp_mb();
 
 	/* On the first time here, set up HTAB and VRMA */
-	if (!vcpu->kvm->arch.hpte_setup_done) {
+	if (!kvm_is_radix(vcpu->kvm) && !vcpu->kvm->arch.hpte_setup_done) {
 		r = kvmppc_hv_setup_htab_rma(vcpu);
 		if (r)
 			goto out;
@@ -2939,6 +2982,13 @@ static int kvm_vm_ioctl_get_smmu_info_hv(struct kvm *kvm,
 {
 	struct kvm_ppc_one_seg_page_size *sps;
 
+	/*
+	 * Since we don't yet support HPT guests on a radix host,
+	 * return an error if the host uses radix.
+	 */
+	if (radix_enabled())
+		return -EINVAL;
+
 	info->flags = KVM_PPC_PAGE_SIZES_REAL;
 	if (mmu_has_feature(MMU_FTR_1T_SEGMENT))
 		info->flags |= KVM_PPC_1T_SEGMENTS;
@@ -2961,8 +3011,10 @@ static int kvm_vm_ioctl_get_dirty_log_hv(struct kvm *kvm,
 {
 	struct kvm_memslots *slots;
 	struct kvm_memory_slot *memslot;
-	int r;
+	int i, r;
 	unsigned long n;
+	unsigned long *buf;
+	struct kvm_vcpu *vcpu;
 
 	mutex_lock(&kvm->slots_lock);
 
@@ -2976,15 +3028,32 @@ static int kvm_vm_ioctl_get_dirty_log_hv(struct kvm *kvm,
 	if (!memslot->dirty_bitmap)
 		goto out;
 
+	/*
+	 * Use second half of bitmap area because radix accumulates
+	 * bits in the first half.
+	 */
 	n = kvm_dirty_bitmap_bytes(memslot);
-	memset(memslot->dirty_bitmap, 0, n);
+	buf = memslot->dirty_bitmap + n / sizeof(long);
+	memset(buf, 0, n);
 
-	r = kvmppc_hv_get_dirty_log(kvm, memslot, memslot->dirty_bitmap);
+	if (kvm_is_radix(kvm))
+		r = kvmppc_hv_get_dirty_log_radix(kvm, memslot, buf);
+	else
+		r = kvmppc_hv_get_dirty_log_hpt(kvm, memslot, buf);
 	if (r)
 		goto out;
 
+	/* Harvest dirty bits from VPA and DTL updates */
+	/* Note: we never modify the SLB shadow buffer areas */
+	kvm_for_each_vcpu(i, vcpu, kvm) {
+		spin_lock(&vcpu->arch.vpa_update_lock);
+		kvmppc_harvest_vpa_dirty(&vcpu->arch.vpa, memslot, buf);
+		kvmppc_harvest_vpa_dirty(&vcpu->arch.dtl, memslot, buf);
+		spin_unlock(&vcpu->arch.vpa_update_lock);
+	}
+
 	r = -EFAULT;
-	if (copy_to_user(log->dirty_bitmap, memslot->dirty_bitmap, n))
+	if (copy_to_user(log->dirty_bitmap, buf, n))
 		goto out;
 
 	r = 0;
@@ -3005,6 +3074,15 @@ static void kvmppc_core_free_memslot_hv(struct kvm_memory_slot *free,
 static int kvmppc_core_create_memslot_hv(struct kvm_memory_slot *slot,
 					 unsigned long npages)
 {
+	/*
+	 * For now, if radix_enabled() then we only support radix guests,
+	 * and in that case we don't need the rmap array.
+	 */
+	if (radix_enabled()) {
+		slot->arch.rmap = NULL;
+		return 0;
+	}
+
 	slot->arch.rmap = vzalloc(npages * sizeof(*slot->arch.rmap));
 	if (!slot->arch.rmap)
 		return -ENOMEM;
@@ -3037,7 +3115,7 @@ static void kvmppc_core_commit_memory_region_hv(struct kvm *kvm,
 	if (npages)
 		atomic64_inc(&kvm->arch.mmio_update);
 
-	if (npages && old->npages) {
+	if (npages && old->npages && !kvm_is_radix(kvm)) {
 		/*
 		 * If modifying a memslot, reset all the rmap dirty bits.
 		 * If this is a new memslot, we don't need to do anything
@@ -3046,7 +3124,7 @@ static void kvmppc_core_commit_memory_region_hv(struct kvm *kvm,
 		 */
 		slots = kvm_memslots(kvm);
 		memslot = id_to_memslot(slots, mem->slot);
-		kvmppc_hv_get_dirty_log(kvm, memslot, NULL);
+		kvmppc_hv_get_dirty_log_hpt(kvm, memslot, NULL);
 	}
 }
 
@@ -3085,14 +3163,20 @@ static void kvmppc_setup_partition_table(struct kvm *kvm)
 {
 	unsigned long dw0, dw1;
 
-	/* PS field - page size for VRMA */
-	dw0 = ((kvm->arch.vrma_slb_v & SLB_VSID_L) >> 1) |
-		((kvm->arch.vrma_slb_v & SLB_VSID_LP) << 1);
-	/* HTABSIZE and HTABORG fields */
-	dw0 |= kvm->arch.sdr1;
+	if (!kvm_is_radix(kvm)) {
+		/* PS field - page size for VRMA */
+		dw0 = ((kvm->arch.vrma_slb_v & SLB_VSID_L) >> 1) |
+			((kvm->arch.vrma_slb_v & SLB_VSID_LP) << 1);
+		/* HTABSIZE and HTABORG fields */
+		dw0 |= kvm->arch.sdr1;
 
-	/* Second dword has GR=0; other fields are unused since UPRT=0 */
-	dw1 = 0;
+		/* Second dword as set by userspace */
+		dw1 = kvm->arch.process_table;
+	} else {
+		dw0 = PATB_HR | radix__get_tree_size() |
+			__pa(kvm->arch.pgtable) | RADIX_PGD_INDEX_SIZE;
+		dw1 = PATB_GR | kvm->arch.process_table;
+	}
 
 	mmu_partition_table_set_entry(kvm->arch.lpid, dw0, dw1);
 }
@@ -3113,12 +3197,23 @@ static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu)
 		goto out;	/* another vcpu beat us to it */
 
 	/* Allocate hashed page table (if not done already) and reset it */
-	if (!kvm->arch.hpt_virt) {
-		err = kvmppc_alloc_hpt(kvm, NULL);
-		if (err) {
+	if (!kvm->arch.hpt.virt) {
+		int order = KVM_DEFAULT_HPT_ORDER;
+		struct kvm_hpt_info info;
+
+		err = kvmppc_allocate_hpt(&info, order);
+		/* If we get here, it means userspace didn't specify a
+		 * size explicitly.  So, try successively smaller
+		 * sizes if the default failed. */
+		while ((err == -ENOMEM) && --order >= PPC_MIN_HPT_ORDER)
+			err  = kvmppc_allocate_hpt(&info, order);
+
+		if (err < 0) {
 			pr_err("KVM: Couldn't alloc HPT\n");
 			goto out;
 		}
+
+		kvmppc_set_hpt(kvm, &info);
 	}
 
 	/* Look up the memslot for guest physical address 0 */
@@ -3262,6 +3357,7 @@ static int kvmppc_core_init_vm_hv(struct kvm *kvm)
 {
 	unsigned long lpcr, lpid;
 	char buf[32];
+	int ret;
 
 	/* Allocate the guest's logical partition ID */
 
@@ -3309,13 +3405,33 @@ static int kvmppc_core_init_vm_hv(struct kvm *kvm)
 		lpcr |= LPCR_HVICE;
 	}
 
+	/*
+	 * For now, if the host uses radix, the guest must be radix.
+	 */
+	if (radix_enabled()) {
+		kvm->arch.radix = 1;
+		lpcr &= ~LPCR_VPM1;
+		lpcr |= LPCR_UPRT | LPCR_GTSE | LPCR_HR;
+		ret = kvmppc_init_vm_radix(kvm);
+		if (ret) {
+			kvmppc_free_lpid(kvm->arch.lpid);
+			return ret;
+		}
+		kvmppc_setup_partition_table(kvm);
+	}
+
 	kvm->arch.lpcr = lpcr;
 
+	/* Initialization for future HPT resizes */
+	kvm->arch.resize_hpt = NULL;
+
 	/*
 	 * Work out how many sets the TLB has, for the use of
 	 * the TLB invalidation loop in book3s_hv_rmhandlers.S.
 	 */
-	if (cpu_has_feature(CPU_FTR_ARCH_300))
+	if (kvm_is_radix(kvm))
+		kvm->arch.tlb_sets = POWER9_TLB_SETS_RADIX;	/* 128 */
+	else if (cpu_has_feature(CPU_FTR_ARCH_300))
 		kvm->arch.tlb_sets = POWER9_TLB_SETS_HASH;	/* 256 */
 	else if (cpu_has_feature(CPU_FTR_ARCH_207S))
 		kvm->arch.tlb_sets = POWER8_TLB_SETS;		/* 512 */
@@ -3325,8 +3441,11 @@ static int kvmppc_core_init_vm_hv(struct kvm *kvm)
 	/*
 	 * Track that we now have a HV mode VM active. This blocks secondary
 	 * CPU threads from coming online.
+	 * On POWER9, we only need to do this for HPT guests on a radix
+	 * host, which is not yet supported.
 	 */
-	kvm_hv_vm_activated();
+	if (!cpu_has_feature(CPU_FTR_ARCH_300))
+		kvm_hv_vm_activated();
 
 	/*
 	 * Create a debugfs directory for the VM
@@ -3352,11 +3471,17 @@ static void kvmppc_core_destroy_vm_hv(struct kvm *kvm)
 {
 	debugfs_remove_recursive(kvm->arch.debugfs_dir);
 
-	kvm_hv_vm_deactivated();
+	if (!cpu_has_feature(CPU_FTR_ARCH_300))
+		kvm_hv_vm_deactivated();
 
 	kvmppc_free_vcores(kvm);
 
-	kvmppc_free_hpt(kvm);
+	kvmppc_free_lpid(kvm->arch.lpid);
+
+	if (kvm_is_radix(kvm))
+		kvmppc_free_radix(kvm);
+	else
+		kvmppc_free_hpt(&kvm->arch.hpt);
 
 	kvmppc_free_pimap(kvm);
 }
@@ -3385,11 +3510,6 @@ static int kvmppc_core_check_processor_compat_hv(void)
 	if (!cpu_has_feature(CPU_FTR_HVMODE) ||
 	    !cpu_has_feature(CPU_FTR_ARCH_206))
 		return -EIO;
-	/*
-	 * Disable KVM for Power9 in radix mode.
-	 */
-	if (cpu_has_feature(CPU_FTR_ARCH_300) && radix_enabled())
-		return -EIO;
 
 	return 0;
 }
@@ -3587,12 +3707,9 @@ static long kvm_arch_vm_ioctl_hv(struct file *filp,
 		r = -EFAULT;
 		if (get_user(htab_order, (u32 __user *)argp))
 			break;
-		r = kvmppc_alloc_reset_hpt(kvm, &htab_order);
+		r = kvmppc_alloc_reset_hpt(kvm, htab_order);
 		if (r)
 			break;
-		r = -EFAULT;
-		if (put_user(htab_order, (u32 __user *)argp))
-			break;
 		r = 0;
 		break;
 	}
@@ -3607,6 +3724,28 @@ static long kvm_arch_vm_ioctl_hv(struct file *filp,
 		break;
 	}
 
+	case KVM_PPC_RESIZE_HPT_PREPARE: {
+		struct kvm_ppc_resize_hpt rhpt;
+
+		r = -EFAULT;
+		if (copy_from_user(&rhpt, argp, sizeof(rhpt)))
+			break;
+
+		r = kvm_vm_ioctl_resize_hpt_prepare(kvm, &rhpt);
+		break;
+	}
+
+	case KVM_PPC_RESIZE_HPT_COMMIT: {
+		struct kvm_ppc_resize_hpt rhpt;
+
+		r = -EFAULT;
+		if (copy_from_user(&rhpt, argp, sizeof(rhpt)))
+			break;
+
+		r = kvm_vm_ioctl_resize_hpt_commit(kvm, &rhpt);
+		break;
+	}
+
 	default:
 		r = -ENOTTY;
 	}
@@ -3657,6 +3796,41 @@ static void init_default_hcalls(void)
 	}
 }
 
+static int kvmhv_configure_mmu(struct kvm *kvm, struct kvm_ppc_mmuv3_cfg *cfg)
+{
+	unsigned long lpcr;
+	int radix;
+
+	/* If not on a POWER9, reject it */
+	if (!cpu_has_feature(CPU_FTR_ARCH_300))
+		return -ENODEV;
+
+	/* If any unknown flags set, reject it */
+	if (cfg->flags & ~(KVM_PPC_MMUV3_RADIX | KVM_PPC_MMUV3_GTSE))
+		return -EINVAL;
+
+	/* We can't change a guest to/from radix yet */
+	radix = !!(cfg->flags & KVM_PPC_MMUV3_RADIX);
+	if (radix != kvm_is_radix(kvm))
+		return -EINVAL;
+
+	/* GR (guest radix) bit in process_table field must match */
+	if (!!(cfg->process_table & PATB_GR) != radix)
+		return -EINVAL;
+
+	/* Process table size field must be reasonable, i.e. <= 24 */
+	if ((cfg->process_table & PRTS_MASK) > 24)
+		return -EINVAL;
+
+	kvm->arch.process_table = cfg->process_table;
+	kvmppc_setup_partition_table(kvm);
+
+	lpcr = (cfg->flags & KVM_PPC_MMUV3_GTSE) ? LPCR_GTSE : 0;
+	kvmppc_update_lpcr(kvm, lpcr, LPCR_GTSE);
+
+	return 0;
+}
+
 static struct kvmppc_ops kvm_ops_hv = {
 	.get_sregs = kvm_arch_vcpu_ioctl_get_sregs_hv,
 	.set_sregs = kvm_arch_vcpu_ioctl_set_sregs_hv,
@@ -3694,6 +3868,8 @@ static struct kvmppc_ops kvm_ops_hv = {
 	.irq_bypass_add_producer = kvmppc_irq_bypass_add_producer_hv,
 	.irq_bypass_del_producer = kvmppc_irq_bypass_del_producer_hv,
 #endif
+	.configure_mmu = kvmhv_configure_mmu,
+	.get_rmmu_info = kvmhv_get_rmmu_info,
 };
 
 static int kvm_init_subcore_bitmap(void)
@@ -3728,6 +3904,11 @@ static int kvm_init_subcore_bitmap(void)
 	return 0;
 }
 
+static int kvmppc_radix_possible(void)
+{
+	return cpu_has_feature(CPU_FTR_ARCH_300) && radix_enabled();
+}
+
 static int kvmppc_book3s_init_hv(void)
 {
 	int r;
@@ -3767,12 +3948,19 @@ static int kvmppc_book3s_init_hv(void)
 	init_vcore_lists();
 
 	r = kvmppc_mmu_hv_init();
+	if (r)
+		return r;
+
+	if (kvmppc_radix_possible())
+		r = kvmppc_radix_init();
 	return r;
 }
 
 static void kvmppc_book3s_exit_hv(void)
 {
 	kvmppc_free_host_rm_ops();
+	if (kvmppc_radix_possible())
+		kvmppc_radix_exit();
 	kvmppc_hv_ops = NULL;
 }
 
diff --git a/arch/powerpc/kvm/book3s_hv_builtin.c b/arch/powerpc/kvm/book3s_hv_builtin.c
index 5bb24be0b346..96e7e609f621 100644
--- a/arch/powerpc/kvm/book3s_hv_builtin.c
+++ b/arch/powerpc/kvm/book3s_hv_builtin.c
@@ -29,6 +29,11 @@
 #include <asm/opal.h>
 #include <asm/smp.h>
 
+static bool in_realmode(void)
+{
+	return !(mfmsr() & MSR_IR);
+}
+
 #define KVM_CMA_CHUNK_ORDER	18
 
 /*
@@ -52,19 +57,19 @@ static int __init early_parse_kvm_cma_resv(char *p)
 }
 early_param("kvm_cma_resv_ratio", early_parse_kvm_cma_resv);
 
-struct page *kvm_alloc_hpt(unsigned long nr_pages)
+struct page *kvm_alloc_hpt_cma(unsigned long nr_pages)
 {
 	VM_BUG_ON(order_base_2(nr_pages) < KVM_CMA_CHUNK_ORDER - PAGE_SHIFT);
 
 	return cma_alloc(kvm_cma, nr_pages, order_base_2(HPT_ALIGN_PAGES));
 }
-EXPORT_SYMBOL_GPL(kvm_alloc_hpt);
+EXPORT_SYMBOL_GPL(kvm_alloc_hpt_cma);
 
-void kvm_release_hpt(struct page *page, unsigned long nr_pages)
+void kvm_free_hpt_cma(struct page *page, unsigned long nr_pages)
 {
 	cma_release(kvm_cma, page, nr_pages);
 }
-EXPORT_SYMBOL_GPL(kvm_release_hpt);
+EXPORT_SYMBOL_GPL(kvm_free_hpt_cma);
 
 /**
  * kvm_cma_reserve() - reserve area for kvm hash pagetable
@@ -200,7 +205,6 @@ static inline void rm_writeb(unsigned long paddr, u8 val)
 
 /*
  * Send an interrupt or message to another CPU.
- * This can only be called in real mode.
  * The caller needs to include any barrier needed to order writes
  * to memory vs. the IPI/message.
  */
@@ -226,7 +230,9 @@ void kvmhv_rm_send_ipi(int cpu)
 
 	/* Else poke the target with an IPI */
 	xics_phys = paca[cpu].kvm_hstate.xics_phys;
-	if (xics_phys)
+	if (!in_realmode())
+		opal_int_set_mfrr(get_hard_smp_processor_id(cpu), IPI_PRIORITY);
+	else if (xics_phys)
 		rm_writeb(xics_phys + XICS_MFRR, IPI_PRIORITY);
 	else
 		opal_rm_int_set_mfrr(get_hard_smp_processor_id(cpu),
@@ -412,14 +418,15 @@ static long kvmppc_read_one_intr(bool *again)
 
 	/* Now read the interrupt from the ICP */
 	xics_phys = local_paca->kvm_hstate.xics_phys;
-	if (!xics_phys) {
-		/* Use OPAL to read the XIRR */
+	rc = 0;
+	if (!in_realmode())
+		rc = opal_int_get_xirr(&xirr, false);
+	else if (!xics_phys)
 		rc = opal_rm_int_get_xirr(&xirr, false);
-		if (rc < 0)
-			return 1;
-	} else {
+	else
 		xirr = _lwzcix(xics_phys + XICS_XIRR);
-	}
+	if (rc < 0)
+		return 1;
 
 	/*
 	 * Save XIRR for later. Since we get control in reverse endian
@@ -445,15 +452,19 @@ static long kvmppc_read_one_intr(bool *again)
 	 * If it is an IPI, clear the MFRR and EOI it.
 	 */
 	if (xisr == XICS_IPI) {
-		if (xics_phys) {
+		rc = 0;
+		if (!in_realmode()) {
+			opal_int_set_mfrr(hard_smp_processor_id(), 0xff);
+			rc = opal_int_eoi(h_xirr);
+		} else if (xics_phys) {
 			_stbcix(xics_phys + XICS_MFRR, 0xff);
 			_stwcix(xics_phys + XICS_XIRR, xirr);
 		} else {
 			opal_rm_int_set_mfrr(hard_smp_processor_id(), 0xff);
 			rc = opal_rm_int_eoi(h_xirr);
-			/* If rc > 0, there is another interrupt pending */
-			*again = rc > 0;
 		}
+		/* If rc > 0, there is another interrupt pending */
+		*again = rc > 0;
 
 		/*
 		 * Need to ensure side effects of above stores
@@ -471,7 +482,10 @@ static long kvmppc_read_one_intr(bool *again)
 			/* We raced with the host,
 			 * we need to resend that IPI, bummer
 			 */
-			if (xics_phys)
+			if (!in_realmode())
+				opal_int_set_mfrr(hard_smp_processor_id(),
+						  IPI_PRIORITY);
+			else if (xics_phys)
 				_stbcix(xics_phys + XICS_MFRR, IPI_PRIORITY);
 			else
 				opal_rm_int_set_mfrr(hard_smp_processor_id(),
diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
index 9ef3c4be952f..6fca970373ee 100644
--- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c
+++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
@@ -43,6 +43,7 @@ static void *real_vmalloc_addr(void *x)
 static int global_invalidates(struct kvm *kvm, unsigned long flags)
 {
 	int global;
+	int cpu;
 
 	/*
 	 * If there is only one vcore, and it's currently running,
@@ -60,8 +61,14 @@ static int global_invalidates(struct kvm *kvm, unsigned long flags)
 		/* any other core might now have stale TLB entries... */
 		smp_wmb();
 		cpumask_setall(&kvm->arch.need_tlb_flush);
-		cpumask_clear_cpu(local_paca->kvm_hstate.kvm_vcore->pcpu,
-				  &kvm->arch.need_tlb_flush);
+		cpu = local_paca->kvm_hstate.kvm_vcore->pcpu;
+		/*
+		 * On POWER9, threads are independent but the TLB is shared,
+		 * so use the bit for the first thread to represent the core.
+		 */
+		if (cpu_has_feature(CPU_FTR_ARCH_300))
+			cpu = cpu_first_thread_sibling(cpu);
+		cpumask_clear_cpu(cpu, &kvm->arch.need_tlb_flush);
 	}
 
 	return global;
@@ -79,10 +86,10 @@ void kvmppc_add_revmap_chain(struct kvm *kvm, struct revmap_entry *rev,
 
 	if (*rmap & KVMPPC_RMAP_PRESENT) {
 		i = *rmap & KVMPPC_RMAP_INDEX;
-		head = &kvm->arch.revmap[i];
+		head = &kvm->arch.hpt.rev[i];
 		if (realmode)
 			head = real_vmalloc_addr(head);
-		tail = &kvm->arch.revmap[head->back];
+		tail = &kvm->arch.hpt.rev[head->back];
 		if (realmode)
 			tail = real_vmalloc_addr(tail);
 		rev->forw = i;
@@ -147,8 +154,8 @@ static void remove_revmap_chain(struct kvm *kvm, long pte_index,
 	lock_rmap(rmap);
 
 	head = *rmap & KVMPPC_RMAP_INDEX;
-	next = real_vmalloc_addr(&kvm->arch.revmap[rev->forw]);
-	prev = real_vmalloc_addr(&kvm->arch.revmap[rev->back]);
+	next = real_vmalloc_addr(&kvm->arch.hpt.rev[rev->forw]);
+	prev = real_vmalloc_addr(&kvm->arch.hpt.rev[rev->back]);
 	next->back = rev->back;
 	prev->forw = rev->forw;
 	if (head == pte_index) {
@@ -182,6 +189,8 @@ long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags,
 	unsigned long mmu_seq;
 	unsigned long rcbits, irq_flags = 0;
 
+	if (kvm_is_radix(kvm))
+		return H_FUNCTION;
 	psize = hpte_page_size(pteh, ptel);
 	if (!psize)
 		return H_PARAMETER;
@@ -283,11 +292,11 @@ long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags,
 
 	/* Find and lock the HPTEG slot to use */
  do_insert:
-	if (pte_index >= kvm->arch.hpt_npte)
+	if (pte_index >= kvmppc_hpt_npte(&kvm->arch.hpt))
 		return H_PARAMETER;
 	if (likely((flags & H_EXACT) == 0)) {
 		pte_index &= ~7UL;
-		hpte = (__be64 *)(kvm->arch.hpt_virt + (pte_index << 4));
+		hpte = (__be64 *)(kvm->arch.hpt.virt + (pte_index << 4));
 		for (i = 0; i < 8; ++i) {
 			if ((be64_to_cpu(*hpte) & HPTE_V_VALID) == 0 &&
 			    try_lock_hpte(hpte, HPTE_V_HVLOCK | HPTE_V_VALID |
@@ -318,7 +327,7 @@ long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags,
 		}
 		pte_index += i;
 	} else {
-		hpte = (__be64 *)(kvm->arch.hpt_virt + (pte_index << 4));
+		hpte = (__be64 *)(kvm->arch.hpt.virt + (pte_index << 4));
 		if (!try_lock_hpte(hpte, HPTE_V_HVLOCK | HPTE_V_VALID |
 				   HPTE_V_ABSENT)) {
 			/* Lock the slot and check again */
@@ -335,7 +344,7 @@ long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags,
 	}
 
 	/* Save away the guest's idea of the second HPTE dword */
-	rev = &kvm->arch.revmap[pte_index];
+	rev = &kvm->arch.hpt.rev[pte_index];
 	if (realmode)
 		rev = real_vmalloc_addr(rev);
 	if (rev) {
@@ -458,9 +467,11 @@ long kvmppc_do_h_remove(struct kvm *kvm, unsigned long flags,
 	struct revmap_entry *rev;
 	u64 pte, orig_pte, pte_r;
 
-	if (pte_index >= kvm->arch.hpt_npte)
+	if (kvm_is_radix(kvm))
+		return H_FUNCTION;
+	if (pte_index >= kvmppc_hpt_npte(&kvm->arch.hpt))
 		return H_PARAMETER;
-	hpte = (__be64 *)(kvm->arch.hpt_virt + (pte_index << 4));
+	hpte = (__be64 *)(kvm->arch.hpt.virt + (pte_index << 4));
 	while (!try_lock_hpte(hpte, HPTE_V_HVLOCK))
 		cpu_relax();
 	pte = orig_pte = be64_to_cpu(hpte[0]);
@@ -476,7 +487,7 @@ long kvmppc_do_h_remove(struct kvm *kvm, unsigned long flags,
 		return H_NOT_FOUND;
 	}
 
-	rev = real_vmalloc_addr(&kvm->arch.revmap[pte_index]);
+	rev = real_vmalloc_addr(&kvm->arch.hpt.rev[pte_index]);
 	v = pte & ~HPTE_V_HVLOCK;
 	if (v & HPTE_V_VALID) {
 		hpte[0] &= ~cpu_to_be64(HPTE_V_VALID);
@@ -529,6 +540,8 @@ long kvmppc_h_bulk_remove(struct kvm_vcpu *vcpu)
 	struct revmap_entry *rev, *revs[4];
 	u64 hp0, hp1;
 
+	if (kvm_is_radix(kvm))
+		return H_FUNCTION;
 	global = global_invalidates(kvm, 0);
 	for (i = 0; i < 4 && ret == H_SUCCESS; ) {
 		n = 0;
@@ -544,13 +557,13 @@ long kvmppc_h_bulk_remove(struct kvm_vcpu *vcpu)
 				break;
 			}
 			if (req != 1 || flags == 3 ||
-			    pte_index >= kvm->arch.hpt_npte) {
+			    pte_index >= kvmppc_hpt_npte(&kvm->arch.hpt)) {
 				/* parameter error */
 				args[j] = ((0xa0 | flags) << 56) + pte_index;
 				ret = H_PARAMETER;
 				break;
 			}
-			hp = (__be64 *) (kvm->arch.hpt_virt + (pte_index << 4));
+			hp = (__be64 *) (kvm->arch.hpt.virt + (pte_index << 4));
 			/* to avoid deadlock, don't spin except for first */
 			if (!try_lock_hpte(hp, HPTE_V_HVLOCK)) {
 				if (n)
@@ -587,7 +600,7 @@ long kvmppc_h_bulk_remove(struct kvm_vcpu *vcpu)
 			}
 
 			args[j] = ((0x80 | flags) << 56) + pte_index;
-			rev = real_vmalloc_addr(&kvm->arch.revmap[pte_index]);
+			rev = real_vmalloc_addr(&kvm->arch.hpt.rev[pte_index]);
 			note_hpte_modification(kvm, rev);
 
 			if (!(hp0 & HPTE_V_VALID)) {
@@ -642,10 +655,12 @@ long kvmppc_h_protect(struct kvm_vcpu *vcpu, unsigned long flags,
 	unsigned long v, r, rb, mask, bits;
 	u64 pte_v, pte_r;
 
-	if (pte_index >= kvm->arch.hpt_npte)
+	if (kvm_is_radix(kvm))
+		return H_FUNCTION;
+	if (pte_index >= kvmppc_hpt_npte(&kvm->arch.hpt))
 		return H_PARAMETER;
 
-	hpte = (__be64 *)(kvm->arch.hpt_virt + (pte_index << 4));
+	hpte = (__be64 *)(kvm->arch.hpt.virt + (pte_index << 4));
 	while (!try_lock_hpte(hpte, HPTE_V_HVLOCK))
 		cpu_relax();
 	v = pte_v = be64_to_cpu(hpte[0]);
@@ -665,7 +680,7 @@ long kvmppc_h_protect(struct kvm_vcpu *vcpu, unsigned long flags,
 	/* Update guest view of 2nd HPTE dword */
 	mask = HPTE_R_PP0 | HPTE_R_PP | HPTE_R_N |
 		HPTE_R_KEY_HI | HPTE_R_KEY_LO;
-	rev = real_vmalloc_addr(&kvm->arch.revmap[pte_index]);
+	rev = real_vmalloc_addr(&kvm->arch.hpt.rev[pte_index]);
 	if (rev) {
 		r = (rev->guest_rpte & ~mask) | bits;
 		rev->guest_rpte = r;
@@ -711,15 +726,17 @@ long kvmppc_h_read(struct kvm_vcpu *vcpu, unsigned long flags,
 	int i, n = 1;
 	struct revmap_entry *rev = NULL;
 
-	if (pte_index >= kvm->arch.hpt_npte)
+	if (kvm_is_radix(kvm))
+		return H_FUNCTION;
+	if (pte_index >= kvmppc_hpt_npte(&kvm->arch.hpt))
 		return H_PARAMETER;
 	if (flags & H_READ_4) {
 		pte_index &= ~3;
 		n = 4;
 	}
-	rev = real_vmalloc_addr(&kvm->arch.revmap[pte_index]);
+	rev = real_vmalloc_addr(&kvm->arch.hpt.rev[pte_index]);
 	for (i = 0; i < n; ++i, ++pte_index) {
-		hpte = (__be64 *)(kvm->arch.hpt_virt + (pte_index << 4));
+		hpte = (__be64 *)(kvm->arch.hpt.virt + (pte_index << 4));
 		v = be64_to_cpu(hpte[0]) & ~HPTE_V_HVLOCK;
 		r = be64_to_cpu(hpte[1]);
 		if (cpu_has_feature(CPU_FTR_ARCH_300)) {
@@ -750,11 +767,13 @@ long kvmppc_h_clear_ref(struct kvm_vcpu *vcpu, unsigned long flags,
 	unsigned long *rmap;
 	long ret = H_NOT_FOUND;
 
-	if (pte_index >= kvm->arch.hpt_npte)
+	if (kvm_is_radix(kvm))
+		return H_FUNCTION;
+	if (pte_index >= kvmppc_hpt_npte(&kvm->arch.hpt))
 		return H_PARAMETER;
 
-	rev = real_vmalloc_addr(&kvm->arch.revmap[pte_index]);
-	hpte = (__be64 *)(kvm->arch.hpt_virt + (pte_index << 4));
+	rev = real_vmalloc_addr(&kvm->arch.hpt.rev[pte_index]);
+	hpte = (__be64 *)(kvm->arch.hpt.virt + (pte_index << 4));
 	while (!try_lock_hpte(hpte, HPTE_V_HVLOCK))
 		cpu_relax();
 	v = be64_to_cpu(hpte[0]);
@@ -796,11 +815,13 @@ long kvmppc_h_clear_mod(struct kvm_vcpu *vcpu, unsigned long flags,
 	unsigned long *rmap;
 	long ret = H_NOT_FOUND;
 
-	if (pte_index >= kvm->arch.hpt_npte)
+	if (kvm_is_radix(kvm))
+		return H_FUNCTION;
+	if (pte_index >= kvmppc_hpt_npte(&kvm->arch.hpt))
 		return H_PARAMETER;
 
-	rev = real_vmalloc_addr(&kvm->arch.revmap[pte_index]);
-	hpte = (__be64 *)(kvm->arch.hpt_virt + (pte_index << 4));
+	rev = real_vmalloc_addr(&kvm->arch.hpt.rev[pte_index]);
+	hpte = (__be64 *)(kvm->arch.hpt.virt + (pte_index << 4));
 	while (!try_lock_hpte(hpte, HPTE_V_HVLOCK))
 		cpu_relax();
 	v = be64_to_cpu(hpte[0]);
@@ -949,7 +970,7 @@ long kvmppc_hv_find_lock_hpte(struct kvm *kvm, gva_t eaddr, unsigned long slb_v,
 		somask = (1UL << 28) - 1;
 		vsid = (slb_v & ~SLB_VSID_B) >> SLB_VSID_SHIFT;
 	}
-	hash = (vsid ^ ((eaddr & somask) >> pshift)) & kvm->arch.hpt_mask;
+	hash = (vsid ^ ((eaddr & somask) >> pshift)) & kvmppc_hpt_mask(&kvm->arch.hpt);
 	avpn = slb_v & ~(somask >> 16);	/* also includes B */
 	avpn |= (eaddr & somask) >> 16;
 
@@ -960,7 +981,7 @@ long kvmppc_hv_find_lock_hpte(struct kvm *kvm, gva_t eaddr, unsigned long slb_v,
 	val |= avpn;
 
 	for (;;) {
-		hpte = (__be64 *)(kvm->arch.hpt_virt + (hash << 7));
+		hpte = (__be64 *)(kvm->arch.hpt.virt + (hash << 7));
 
 		for (i = 0; i < 16; i += 2) {
 			/* Read the PTE racily */
@@ -996,7 +1017,7 @@ long kvmppc_hv_find_lock_hpte(struct kvm *kvm, gva_t eaddr, unsigned long slb_v,
 		if (val & HPTE_V_SECONDARY)
 			break;
 		val |= HPTE_V_SECONDARY;
-		hash = hash ^ kvm->arch.hpt_mask;
+		hash = hash ^ kvmppc_hpt_mask(&kvm->arch.hpt);
 	}
 	return -1;
 }
@@ -1045,14 +1066,14 @@ long kvmppc_hpte_hv_fault(struct kvm_vcpu *vcpu, unsigned long addr,
 				return status;	/* there really was no HPTE */
 			return 0;	/* for prot fault, HPTE disappeared */
 		}
-		hpte = (__be64 *)(kvm->arch.hpt_virt + (index << 4));
+		hpte = (__be64 *)(kvm->arch.hpt.virt + (index << 4));
 		v = orig_v = be64_to_cpu(hpte[0]) & ~HPTE_V_HVLOCK;
 		r = be64_to_cpu(hpte[1]);
 		if (cpu_has_feature(CPU_FTR_ARCH_300)) {
 			v = hpte_new_to_old_v(v, r);
 			r = hpte_new_to_old_r(r);
 		}
-		rev = real_vmalloc_addr(&kvm->arch.revmap[index]);
+		rev = real_vmalloc_addr(&kvm->arch.hpt.rev[index]);
 		gr = rev->guest_rpte;
 
 		unlock_hpte(hpte, orig_v);
diff --git a/arch/powerpc/kvm/book3s_hv_rm_xics.c b/arch/powerpc/kvm/book3s_hv_rm_xics.c
index 06edc4366639..0b2e388f4cdf 100644
--- a/arch/powerpc/kvm/book3s_hv_rm_xics.c
+++ b/arch/powerpc/kvm/book3s_hv_rm_xics.c
@@ -35,7 +35,7 @@ int kvm_irq_bypass = 1;
 EXPORT_SYMBOL(kvm_irq_bypass);
 
 static void icp_rm_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp,
-			    u32 new_irq);
+			    u32 new_irq, bool check_resend);
 static int xics_opal_rm_set_server(unsigned int hw_irq, int server_cpu);
 
 /* -- ICS routines -- */
@@ -44,20 +44,12 @@ static void ics_rm_check_resend(struct kvmppc_xics *xics,
 {
 	int i;
 
-	arch_spin_lock(&ics->lock);
-
 	for (i = 0; i < KVMPPC_XICS_IRQ_PER_ICS; i++) {
 		struct ics_irq_state *state = &ics->irq_state[i];
-
-		if (!state->resend)
-			continue;
-
-		arch_spin_unlock(&ics->lock);
-		icp_rm_deliver_irq(xics, icp, state->number);
-		arch_spin_lock(&ics->lock);
+		if (state->resend)
+			icp_rm_deliver_irq(xics, icp, state->number, true);
 	}
 
-	arch_spin_unlock(&ics->lock);
 }
 
 /* -- ICP routines -- */
@@ -70,11 +62,9 @@ static inline void icp_send_hcore_msg(int hcore, struct kvm_vcpu *vcpu)
 	hcpu = hcore << threads_shift;
 	kvmppc_host_rm_ops_hv->rm_core[hcore].rm_data = vcpu;
 	smp_muxed_ipi_set_message(hcpu, PPC_MSG_RM_HOST_ACTION);
-	if (paca[hcpu].kvm_hstate.xics_phys)
-		icp_native_cause_ipi_rm(hcpu);
-	else
-		opal_rm_int_set_mfrr(get_hard_smp_processor_id(hcpu),
-				     IPI_PRIORITY);
+	kvmppc_set_host_ipi(hcpu, 1);
+	smp_mb();
+	kvmhv_rm_send_ipi(hcpu);
 }
 #else
 static inline void icp_send_hcore_msg(int hcore, struct kvm_vcpu *vcpu) { }
@@ -290,7 +280,7 @@ static bool icp_rm_try_to_deliver(struct kvmppc_icp *icp, u32 irq, u8 priority,
 }
 
 static void icp_rm_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp,
-			    u32 new_irq)
+			    u32 new_irq, bool check_resend)
 {
 	struct ics_irq_state *state;
 	struct kvmppc_ics *ics;
@@ -335,6 +325,10 @@ static void icp_rm_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp,
 		}
 	}
 
+	if (check_resend)
+		if (!state->resend)
+			goto out;
+
 	/* Clear the resend bit of that interrupt */
 	state->resend = 0;
 
@@ -380,7 +374,9 @@ static void icp_rm_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp,
 		 */
 		if (reject && reject != XICS_IPI) {
 			arch_spin_unlock(&ics->lock);
+			icp->n_reject++;
 			new_irq = reject;
+			check_resend = 0;
 			goto again;
 		}
 	} else {
@@ -388,10 +384,16 @@ static void icp_rm_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp,
 		 * We failed to deliver the interrupt we need to set the
 		 * resend map bit and mark the ICS state as needing a resend
 		 */
-		set_bit(ics->icsid, icp->resend_map);
 		state->resend = 1;
 
 		/*
+		 * Make sure when checking resend, we don't miss the resend
+		 * if resend_map bit is seen and cleared.
+		 */
+		smp_wmb();
+		set_bit(ics->icsid, icp->resend_map);
+
+		/*
 		 * If the need_resend flag got cleared in the ICP some time
 		 * between icp_rm_try_to_deliver() atomic update and now, then
 		 * we know it might have missed the resend_map bit. So we
@@ -399,7 +401,9 @@ static void icp_rm_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp,
 		 */
 		smp_mb();
 		if (!icp->state.need_resend) {
+			state->resend = 0;
 			arch_spin_unlock(&ics->lock);
+			check_resend = 0;
 			goto again;
 		}
 	}
@@ -594,7 +598,7 @@ int kvmppc_rm_h_ipi(struct kvm_vcpu *vcpu, unsigned long server,
 	/* Handle reject in real mode */
 	if (reject && reject != XICS_IPI) {
 		this_icp->n_reject++;
-		icp_rm_deliver_irq(xics, icp, reject);
+		icp_rm_deliver_irq(xics, icp, reject, false);
 	}
 
 	/* Handle resends in real mode */
@@ -662,59 +666,45 @@ int kvmppc_rm_h_cppr(struct kvm_vcpu *vcpu, unsigned long cppr)
 	 */
 	if (reject && reject != XICS_IPI) {
 		icp->n_reject++;
-		icp_rm_deliver_irq(xics, icp, reject);
+		icp_rm_deliver_irq(xics, icp, reject, false);
 	}
  bail:
 	return check_too_hard(xics, icp);
 }
 
-int kvmppc_rm_h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr)
+static int ics_rm_eoi(struct kvm_vcpu *vcpu, u32 irq)
 {
 	struct kvmppc_xics *xics = vcpu->kvm->arch.xics;
 	struct kvmppc_icp *icp = vcpu->arch.icp;
 	struct kvmppc_ics *ics;
 	struct ics_irq_state *state;
-	u32 irq = xirr & 0x00ffffff;
 	u16 src;
-
-	if (!xics || !xics->real_mode)
-		return H_TOO_HARD;
+	u32 pq_old, pq_new;
 
 	/*
-	 * ICP State: EOI
-	 *
-	 * Note: If EOI is incorrectly used by SW to lower the CPPR
-	 * value (ie more favored), we do not check for rejection of
-	 * a pending interrupt, this is a SW error and PAPR sepcifies
-	 * that we don't have to deal with it.
+	 * ICS EOI handling: For LSI, if P bit is still set, we need to
+	 * resend it.
 	 *
-	 * The sending of an EOI to the ICS is handled after the
-	 * CPPR update
-	 *
-	 * ICP State: Down_CPPR which we handle
-	 * in a separate function as it's shared with H_CPPR.
+	 * For MSI, we move Q bit into P (and clear Q). If it is set,
+	 * resend it.
 	 */
-	icp_rm_down_cppr(xics, icp, xirr >> 24);
 
-	/* IPIs have no EOI */
-	if (irq == XICS_IPI)
-		goto bail;
-	/*
-	 * EOI handling: If the interrupt is still asserted, we need to
-	 * resend it. We can take a lockless "peek" at the ICS state here.
-	 *
-	 * "Message" interrupts will never have "asserted" set
-	 */
 	ics = kvmppc_xics_find_ics(xics, irq, &src);
 	if (!ics)
 		goto bail;
+
 	state = &ics->irq_state[src];
 
-	/* Still asserted, resend it */
-	if (state->asserted) {
-		icp->n_reject++;
-		icp_rm_deliver_irq(xics, icp, irq);
-	}
+	if (state->lsi)
+		pq_new = state->pq_state;
+	else
+		do {
+			pq_old = state->pq_state;
+			pq_new = pq_old >> 1;
+		} while (cmpxchg(&state->pq_state, pq_old, pq_new) != pq_old);
+
+	if (pq_new & PQ_PRESENTED)
+		icp_rm_deliver_irq(xics, NULL, irq, false);
 
 	if (!hlist_empty(&vcpu->kvm->irq_ack_notifier_list)) {
 		icp->rm_action |= XICS_RM_NOTIFY_EOI;
@@ -735,10 +725,43 @@ int kvmppc_rm_h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr)
 			state->intr_cpu = -1;
 		}
 	}
+
  bail:
 	return check_too_hard(xics, icp);
 }
 
+int kvmppc_rm_h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr)
+{
+	struct kvmppc_xics *xics = vcpu->kvm->arch.xics;
+	struct kvmppc_icp *icp = vcpu->arch.icp;
+	u32 irq = xirr & 0x00ffffff;
+
+	if (!xics || !xics->real_mode)
+		return H_TOO_HARD;
+
+	/*
+	 * ICP State: EOI
+	 *
+	 * Note: If EOI is incorrectly used by SW to lower the CPPR
+	 * value (ie more favored), we do not check for rejection of
+	 * a pending interrupt, this is a SW error and PAPR specifies
+	 * that we don't have to deal with it.
+	 *
+	 * The sending of an EOI to the ICS is handled after the
+	 * CPPR update
+	 *
+	 * ICP State: Down_CPPR which we handle
+	 * in a separate function as it's shared with H_CPPR.
+	 */
+	icp_rm_down_cppr(xics, icp, xirr >> 24);
+
+	/* IPIs have no EOI */
+	if (irq == XICS_IPI)
+		return check_too_hard(xics, icp);
+
+	return ics_rm_eoi(vcpu, irq);
+}
+
 unsigned long eoi_rc;
 
 static void icp_eoi(struct irq_chip *c, u32 hwirq, __be32 xirr, bool *again)
@@ -825,14 +848,33 @@ long kvmppc_deliver_irq_passthru(struct kvm_vcpu *vcpu,
 {
 	struct kvmppc_xics *xics;
 	struct kvmppc_icp *icp;
+	struct kvmppc_ics *ics;
+	struct ics_irq_state *state;
 	u32 irq;
+	u16 src;
+	u32 pq_old, pq_new;
 
 	irq = irq_map->v_hwirq;
 	xics = vcpu->kvm->arch.xics;
 	icp = vcpu->arch.icp;
 
 	kvmppc_rm_handle_irq_desc(irq_map->desc);
-	icp_rm_deliver_irq(xics, icp, irq);
+
+	ics = kvmppc_xics_find_ics(xics, irq, &src);
+	if (!ics)
+		return 2;
+
+	state = &ics->irq_state[src];
+
+	/* only MSIs register bypass producers, so it must be MSI here */
+	do {
+		pq_old = state->pq_state;
+		pq_new = ((pq_old << 1) & 3) | PQ_PRESENTED;
+	} while (cmpxchg(&state->pq_state, pq_old, pq_new) != pq_old);
+
+	/* Test P=1, Q=0, this is the only case where we present */
+	if (pq_new == PQ_PRESENTED)
+		icp_rm_deliver_irq(xics, icp, irq, false);
 
 	/* EOI the interrupt */
 	icp_eoi(irq_desc_get_chip(irq_map->desc), irq_map->r_hwirq, xirr,
diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
index 9338a818e05c..47414a6fe2dd 100644
--- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
+++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
@@ -148,6 +148,15 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
 	addi	r1, r1, 112
 	ld	r7, HSTATE_HOST_MSR(r13)
 
+	/*
+	 * If we came back from the guest via a relocation-on interrupt,
+	 * we will be in virtual mode at this point, which makes it a
+	 * little easier to get back to the caller.
+	 */
+	mfmsr	r0
+	andi.	r0, r0, MSR_IR		/* in real mode? */
+	bne	.Lvirt_return
+
 	cmpwi	cr1, r12, BOOK3S_INTERRUPT_MACHINE_CHECK
 	cmpwi	r12, BOOK3S_INTERRUPT_EXTERNAL
 	beq	11f
@@ -181,6 +190,26 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
 	mtspr SPRN_HSRR1, r7
 	ba    0xe80
 
+	/* Virtual-mode return - can't get here for HMI or machine check */
+.Lvirt_return:
+	cmpwi	r12, BOOK3S_INTERRUPT_EXTERNAL
+	beq	16f
+	cmpwi	r12, BOOK3S_INTERRUPT_H_DOORBELL
+	beq	17f
+	andi.	r0, r7, MSR_EE		/* were interrupts hard-enabled? */
+	beq	18f
+	mtmsrd	r7, 1			/* if so then re-enable them */
+18:	mtlr	r8
+	blr
+
+16:	mtspr	SPRN_HSRR0, r8		/* jump to reloc-on external vector */
+	mtspr	SPRN_HSRR1, r7
+	b	exc_virt_0x4500_hardware_interrupt
+
+17:	mtspr	SPRN_HSRR0, r8
+	mtspr	SPRN_HSRR1, r7
+	b	exc_virt_0x4e80_h_doorbell
+
 kvmppc_primary_no_guest:
 	/* We handle this much like a ceded vcpu */
 	/* put the HDEC into the DEC, since HDEC interrupts don't wake us */
@@ -518,6 +547,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
 /* Stack frame offsets */
 #define STACK_SLOT_TID		(112-16)
 #define STACK_SLOT_PSSCR	(112-24)
+#define STACK_SLOT_PID		(112-32)
 
 .global kvmppc_hv_entry
 kvmppc_hv_entry:
@@ -530,6 +560,7 @@ kvmppc_hv_entry:
 	 * R1 = host R1
 	 * R2 = TOC
 	 * all other volatile GPRS = free
+	 * Does not preserve non-volatile GPRs or CR fields
 	 */
 	mflr	r0
 	std	r0, PPC_LR_STKOFF(r1)
@@ -549,32 +580,38 @@ kvmppc_hv_entry:
 	bl	kvmhv_start_timing
 1:
 #endif
-	/* Clear out SLB */
+
+	/* Use cr7 as an indication of radix mode */
+	ld	r5, HSTATE_KVM_VCORE(r13)
+	ld	r9, VCORE_KVM(r5)	/* pointer to struct kvm */
+	lbz	r0, KVM_RADIX(r9)
+	cmpwi	cr7, r0, 0
+
+	/* Clear out SLB if hash */
+	bne	cr7, 2f
 	li	r6,0
 	slbmte	r6,r6
 	slbia
 	ptesync
-
+2:
 	/*
 	 * POWER7/POWER8 host -> guest partition switch code.
 	 * We don't have to lock against concurrent tlbies,
 	 * but we do have to coordinate across hardware threads.
 	 */
 	/* Set bit in entry map iff exit map is zero. */
-	ld	r5, HSTATE_KVM_VCORE(r13)
 	li	r7, 1
 	lbz	r6, HSTATE_PTID(r13)
 	sld	r7, r7, r6
-	addi	r9, r5, VCORE_ENTRY_EXIT
-21:	lwarx	r3, 0, r9
+	addi	r8, r5, VCORE_ENTRY_EXIT
+21:	lwarx	r3, 0, r8
 	cmpwi	r3, 0x100		/* any threads starting to exit? */
 	bge	secondary_too_late	/* if so we're too late to the party */
 	or	r3, r3, r7
-	stwcx.	r3, 0, r9
+	stwcx.	r3, 0, r8
 	bne	21b
 
 	/* Primary thread switches to guest partition. */
-	ld	r9,VCORE_KVM(r5)	/* pointer to struct kvm */
 	cmpwi	r6,0
 	bne	10f
 	lwz	r7,KVM_LPID(r9)
@@ -590,30 +627,44 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300)
 
 	/* See if we need to flush the TLB */
 	lhz	r6,PACAPACAINDEX(r13)	/* test_bit(cpu, need_tlb_flush) */
+BEGIN_FTR_SECTION
+	/*
+	 * On POWER9, individual threads can come in here, but the
+	 * TLB is shared between the 4 threads in a core, hence
+	 * invalidating on one thread invalidates for all.
+	 * Thus we make all 4 threads use the same bit here.
+	 */
+	clrrdi	r6,r6,2
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
 	clrldi	r7,r6,64-6		/* extract bit number (6 bits) */
 	srdi	r6,r6,6			/* doubleword number */
 	sldi	r6,r6,3			/* address offset */
 	add	r6,r6,r9
 	addi	r6,r6,KVM_NEED_FLUSH	/* dword in kvm->arch.need_tlb_flush */
-	li	r0,1
-	sld	r0,r0,r7
+	li	r8,1
+	sld	r8,r8,r7
 	ld	r7,0(r6)
-	and.	r7,r7,r0
+	and.	r7,r7,r8
 	beq	22f
-23:	ldarx	r7,0,r6			/* if set, clear the bit */
-	andc	r7,r7,r0
-	stdcx.	r7,0,r6
-	bne	23b
 	/* Flush the TLB of any entries for this LPID */
-	lwz	r6,KVM_TLB_SETS(r9)
-	li	r0,0			/* RS for P9 version of tlbiel */
-	mtctr	r6
+	lwz	r0,KVM_TLB_SETS(r9)
+	mtctr	r0
 	li	r7,0x800		/* IS field = 0b10 */
 	ptesync
-28:	tlbiel	r7
+	li	r0,0			/* RS for P9 version of tlbiel */
+	bne	cr7, 29f
+28:	tlbiel	r7			/* On P9, rs=0, RIC=0, PRS=0, R=0 */
 	addi	r7,r7,0x1000
 	bdnz	28b
-	ptesync
+	b	30f
+29:	PPC_TLBIEL(7,0,2,1,1)		/* for radix, RIC=2, PRS=1, R=1 */
+	addi	r7,r7,0x1000
+	bdnz	29b
+30:	ptesync
+23:	ldarx	r7,0,r6			/* clear the bit after TLB flushed */
+	andc	r7,r7,r8
+	stdcx.	r7,0,r6
+	bne	23b
 
 	/* Add timebase offset onto timebase */
 22:	ld	r8,VCORE_TB_OFFSET(r5)
@@ -658,7 +709,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
 	beq	kvmppc_primary_no_guest
 kvmppc_got_guest:
 
-	/* Load up guest SLB entries */
+	/* Load up guest SLB entries (N.B. slb_max will be 0 for radix) */
 	lwz	r5,VCPU_SLB_MAX(r4)
 	cmpwi	r5,0
 	beq	9f
@@ -696,8 +747,10 @@ kvmppc_got_guest:
 BEGIN_FTR_SECTION
 	mfspr	r5, SPRN_TIDR
 	mfspr	r6, SPRN_PSSCR
+	mfspr	r7, SPRN_PID
 	std	r5, STACK_SLOT_TID(r1)
 	std	r6, STACK_SLOT_PSSCR(r1)
+	std	r7, STACK_SLOT_PID(r1)
 END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
 
 BEGIN_FTR_SECTION
@@ -824,6 +877,9 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S)
 	mtspr	SPRN_PID, r7
 	mtspr	SPRN_WORT, r8
 BEGIN_FTR_SECTION
+	PPC_INVALIDATE_ERAT
+END_FTR_SECTION_IFSET(CPU_FTR_POWER9_DD1)
+BEGIN_FTR_SECTION
 	/* POWER8-only registers */
 	ld	r5, VCPU_TCSCR(r4)
 	ld	r6, VCPU_ACOP(r4)
@@ -1057,13 +1113,13 @@ hdec_soon:
 kvmppc_interrupt_hv:
 	/*
 	 * Register contents:
-	 * R12		= interrupt vector
+	 * R12		= (guest CR << 32) | interrupt vector
 	 * R13		= PACA
-	 * guest CR, R12 saved in shadow VCPU SCRATCH1/0
+	 * guest R12 saved in shadow VCPU SCRATCH0
+	 * guest CTR saved in shadow VCPU SCRATCH1 if RELOCATABLE
 	 * guest R13 saved in SPRN_SCRATCH0
 	 */
 	std	r9, HSTATE_SCRATCH2(r13)
-
 	lbz	r9, HSTATE_IN_GUEST(r13)
 	cmpwi	r9, KVM_GUEST_MODE_HOST_HV
 	beq	kvmppc_bad_host_intr
@@ -1094,8 +1150,9 @@ kvmppc_interrupt_hv:
 	std	r10, VCPU_GPR(R10)(r9)
 	std	r11, VCPU_GPR(R11)(r9)
 	ld	r3, HSTATE_SCRATCH0(r13)
-	lwz	r4, HSTATE_SCRATCH1(r13)
 	std	r3, VCPU_GPR(R12)(r9)
+	/* CR is in the high half of r12 */
+	srdi	r4, r12, 32
 	stw	r4, VCPU_CR(r9)
 BEGIN_FTR_SECTION
 	ld	r3, HSTATE_CFAR(r13)
@@ -1114,6 +1171,8 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR)
 	mfspr	r11, SPRN_SRR1
 	std	r10, VCPU_SRR0(r9)
 	std	r11, VCPU_SRR1(r9)
+	/* trap is in the low half of r12, clear CR from the high half */
+	clrldi	r12, r12, 32
 	andi.	r0, r12, 2		/* need to read HSRR0/1? */
 	beq	1f
 	mfspr	r10, SPRN_HSRR0
@@ -1149,7 +1208,12 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR)
 11:	stw	r3,VCPU_HEIR(r9)
 
 	/* these are volatile across C function calls */
+#ifdef CONFIG_RELOCATABLE
+	ld	r3, HSTATE_SCRATCH1(r13)
+	mtctr	r3
+#else
 	mfctr	r3
+#endif
 	mfxer	r4
 	std	r3, VCPU_CTR(r9)
 	std	r4, VCPU_XER(r9)
@@ -1285,11 +1349,15 @@ mc_cont:
 	mtspr	SPRN_CTRLT,r6
 4:
 	/* Read the guest SLB and save it away */
+	ld	r5, VCPU_KVM(r9)
+	lbz	r0, KVM_RADIX(r5)
+	cmpwi	r0, 0
+	li	r5, 0
+	bne	3f			/* for radix, save 0 entries */
 	lwz	r0,VCPU_SLB_NR(r9)	/* number of entries in SLB */
 	mtctr	r0
 	li	r6,0
 	addi	r7,r9,VCPU_SLB
-	li	r5,0
 1:	slbmfee	r8,r6
 	andis.	r0,r8,SLB_ESID_V@h
 	beq	2f
@@ -1301,7 +1369,7 @@ mc_cont:
 	addi	r5,r5,1
 2:	addi	r6,r6,1
 	bdnz	1b
-	stw	r5,VCPU_SLB_MAX(r9)
+3:	stw	r5,VCPU_SLB_MAX(r9)
 
 	/*
 	 * Save the guest PURR/SPURR
@@ -1550,9 +1618,14 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
 BEGIN_FTR_SECTION
 	ld	r5, STACK_SLOT_TID(r1)
 	ld	r6, STACK_SLOT_PSSCR(r1)
+	ld	r7, STACK_SLOT_PID(r1)
 	mtspr	SPRN_TIDR, r5
 	mtspr	SPRN_PSSCR, r6
+	mtspr	SPRN_PID, r7
 END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
+BEGIN_FTR_SECTION
+	PPC_INVALIDATE_ERAT
+END_FTR_SECTION_IFSET(CPU_FTR_POWER9_DD1)
 
 	/*
 	 * POWER7/POWER8 guest -> host partition switch code.
@@ -1663,6 +1736,9 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
 	isync
 
 	/* load host SLB entries */
+BEGIN_MMU_FTR_SECTION
+	b	0f
+END_MMU_FTR_SECTION_IFSET(MMU_FTR_TYPE_RADIX)
 	ld	r8,PACA_SLBSHADOWPTR(r13)
 
 	.rept	SLB_NUM_BOLTED
@@ -1675,7 +1751,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
 	slbmte	r6,r5
 1:	addi	r8,r8,16
 	.endr
-
+0:
 #ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING
 	/* Finish timing, if we have a vcpu */
 	ld	r4, HSTATE_KVM_VCPU(r13)
@@ -1702,11 +1778,19 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
  * reflect the HDSI to the guest as a DSI.
  */
 kvmppc_hdsi:
+	ld	r3, VCPU_KVM(r9)
+	lbz	r0, KVM_RADIX(r3)
+	cmpwi	r0, 0
 	mfspr	r4, SPRN_HDAR
 	mfspr	r6, SPRN_HDSISR
+	bne	.Lradix_hdsi		/* on radix, just save DAR/DSISR/ASDR */
 	/* HPTE not found fault or protection fault? */
 	andis.	r0, r6, (DSISR_NOHPTE | DSISR_PROTFAULT)@h
 	beq	1f			/* if not, send it to the guest */
+BEGIN_FTR_SECTION
+	mfspr	r5, SPRN_ASDR		/* on POWER9, use ASDR to get VSID */
+	b	4f
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
 	andi.	r0, r11, MSR_DR		/* data relocation enabled? */
 	beq	3f
 	clrrdi	r0, r4, 28
@@ -1776,13 +1860,29 @@ fast_interrupt_c_return:
 	stb	r0, HSTATE_IN_GUEST(r13)
 	b	guest_exit_cont
 
+.Lradix_hdsi:
+	std	r4, VCPU_FAULT_DAR(r9)
+	stw	r6, VCPU_FAULT_DSISR(r9)
+.Lradix_hisi:
+	mfspr	r5, SPRN_ASDR
+	std	r5, VCPU_FAULT_GPA(r9)
+	b	guest_exit_cont
+
 /*
  * Similarly for an HISI, reflect it to the guest as an ISI unless
  * it is an HPTE not found fault for a page that we have paged out.
  */
 kvmppc_hisi:
+	ld	r3, VCPU_KVM(r9)
+	lbz	r0, KVM_RADIX(r3)
+	cmpwi	r0, 0
+	bne	.Lradix_hisi		/* for radix, just save ASDR */
 	andis.	r0, r11, SRR1_ISI_NOPT@h
 	beq	1f
+BEGIN_FTR_SECTION
+	mfspr	r5, SPRN_ASDR		/* on POWER9, use ASDR to get VSID */
+	b	4f
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
 	andi.	r0, r11, MSR_IR		/* instruction relocation enabled? */
 	beq	3f
 	clrrdi	r0, r10, 28
diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c
index 1482961ceb4d..d4dfc0ca2a44 100644
--- a/arch/powerpc/kvm/book3s_pr.c
+++ b/arch/powerpc/kvm/book3s_pr.c
@@ -902,6 +902,69 @@ static void kvmppc_clear_debug(struct kvm_vcpu *vcpu)
 	}
 }
 
+static int kvmppc_exit_pr_progint(struct kvm_run *run, struct kvm_vcpu *vcpu,
+				  unsigned int exit_nr)
+{
+	enum emulation_result er;
+	ulong flags;
+	u32 last_inst;
+	int emul, r;
+
+	/*
+	 * shadow_srr1 only contains valid flags if we came here via a program
+	 * exception. The other exceptions (emulation assist, FP unavailable,
+	 * etc.) do not provide flags in SRR1, so use an illegal-instruction
+	 * exception when injecting a program interrupt into the guest.
+	 */
+	if (exit_nr == BOOK3S_INTERRUPT_PROGRAM)
+		flags = vcpu->arch.shadow_srr1 & 0x1f0000ull;
+	else
+		flags = SRR1_PROGILL;
+
+	emul = kvmppc_get_last_inst(vcpu, INST_GENERIC, &last_inst);
+	if (emul != EMULATE_DONE)
+		return RESUME_GUEST;
+
+	if (kvmppc_get_msr(vcpu) & MSR_PR) {
+#ifdef EXIT_DEBUG
+		pr_info("Userspace triggered 0x700 exception at\n 0x%lx (0x%x)\n",
+			kvmppc_get_pc(vcpu), last_inst);
+#endif
+		if ((last_inst & 0xff0007ff) != (INS_DCBZ & 0xfffffff7)) {
+			kvmppc_core_queue_program(vcpu, flags);
+			return RESUME_GUEST;
+		}
+	}
+
+	vcpu->stat.emulated_inst_exits++;
+	er = kvmppc_emulate_instruction(run, vcpu);
+	switch (er) {
+	case EMULATE_DONE:
+		r = RESUME_GUEST_NV;
+		break;
+	case EMULATE_AGAIN:
+		r = RESUME_GUEST;
+		break;
+	case EMULATE_FAIL:
+		pr_crit("%s: emulation at %lx failed (%08x)\n",
+			__func__, kvmppc_get_pc(vcpu), last_inst);
+		kvmppc_core_queue_program(vcpu, flags);
+		r = RESUME_GUEST;
+		break;
+	case EMULATE_DO_MMIO:
+		run->exit_reason = KVM_EXIT_MMIO;
+		r = RESUME_HOST_NV;
+		break;
+	case EMULATE_EXIT_USER:
+		r = RESUME_HOST_NV;
+		break;
+	default:
+		BUG();
+	}
+
+	return r;
+}
+
 int kvmppc_handle_exit_pr(struct kvm_run *run, struct kvm_vcpu *vcpu,
 			  unsigned int exit_nr)
 {
@@ -1044,71 +1107,8 @@ int kvmppc_handle_exit_pr(struct kvm_run *run, struct kvm_vcpu *vcpu,
 		break;
 	case BOOK3S_INTERRUPT_PROGRAM:
 	case BOOK3S_INTERRUPT_H_EMUL_ASSIST:
-	{
-		enum emulation_result er;
-		ulong flags;
-		u32 last_inst;
-		int emul;
-
-program_interrupt:
-		/*
-		 * shadow_srr1 only contains valid flags if we came here via
-		 * a program exception. The other exceptions (emulation assist,
-		 * FP unavailable, etc.) do not provide flags in SRR1, so use
-		 * an illegal-instruction exception when injecting a program
-		 * interrupt into the guest.
-		 */
-		if (exit_nr == BOOK3S_INTERRUPT_PROGRAM)
-			flags = vcpu->arch.shadow_srr1 & 0x1f0000ull;
-		else
-			flags = SRR1_PROGILL;
-
-		emul = kvmppc_get_last_inst(vcpu, INST_GENERIC, &last_inst);
-		if (emul != EMULATE_DONE) {
-			r = RESUME_GUEST;
-			break;
-		}
-
-		if (kvmppc_get_msr(vcpu) & MSR_PR) {
-#ifdef EXIT_DEBUG
-			pr_info("Userspace triggered 0x700 exception at\n 0x%lx (0x%x)\n",
-				kvmppc_get_pc(vcpu), last_inst);
-#endif
-			if ((last_inst & 0xff0007ff) !=
-			    (INS_DCBZ & 0xfffffff7)) {
-				kvmppc_core_queue_program(vcpu, flags);
-				r = RESUME_GUEST;
-				break;
-			}
-		}
-
-		vcpu->stat.emulated_inst_exits++;
-		er = kvmppc_emulate_instruction(run, vcpu);
-		switch (er) {
-		case EMULATE_DONE:
-			r = RESUME_GUEST_NV;
-			break;
-		case EMULATE_AGAIN:
-			r = RESUME_GUEST;
-			break;
-		case EMULATE_FAIL:
-			printk(KERN_CRIT "%s: emulation at %lx failed (%08x)\n",
-			       __func__, kvmppc_get_pc(vcpu), last_inst);
-			kvmppc_core_queue_program(vcpu, flags);
-			r = RESUME_GUEST;
-			break;
-		case EMULATE_DO_MMIO:
-			run->exit_reason = KVM_EXIT_MMIO;
-			r = RESUME_HOST_NV;
-			break;
-		case EMULATE_EXIT_USER:
-			r = RESUME_HOST_NV;
-			break;
-		default:
-			BUG();
-		}
+		r = kvmppc_exit_pr_progint(run, vcpu, exit_nr);
 		break;
-	}
 	case BOOK3S_INTERRUPT_SYSCALL:
 	{
 		u32 last_sc;
@@ -1185,7 +1185,7 @@ program_interrupt:
 			emul = kvmppc_get_last_inst(vcpu, INST_GENERIC,
 						    &last_inst);
 			if (emul == EMULATE_DONE)
-				goto program_interrupt;
+				r = kvmppc_exit_pr_progint(run, vcpu, exit_nr);
 			else
 				r = RESUME_GUEST;
 
diff --git a/arch/powerpc/kvm/book3s_segment.S b/arch/powerpc/kvm/book3s_segment.S
index ca8f174289bb..2a2b96d53999 100644
--- a/arch/powerpc/kvm/book3s_segment.S
+++ b/arch/powerpc/kvm/book3s_segment.S
@@ -167,20 +167,38 @@ kvmppc_handler_trampoline_enter_end:
  *                                                                            *
  *****************************************************************************/
 
-.global kvmppc_handler_trampoline_exit
-kvmppc_handler_trampoline_exit:
-
 .global kvmppc_interrupt_pr
 kvmppc_interrupt_pr:
+	/* 64-bit entry. Register usage at this point:
+	 *
+	 * SPRG_SCRATCH0   = guest R13
+	 * R12             = (guest CR << 32) | exit handler id
+	 * R13             = PACA
+	 * HSTATE.SCRATCH0 = guest R12
+	 * HSTATE.SCRATCH1 = guest CTR if RELOCATABLE
+	 */
+#ifdef CONFIG_PPC64
+	/* Match 32-bit entry */
+#ifdef CONFIG_RELOCATABLE
+	std	r9, HSTATE_SCRATCH2(r13)
+	ld	r9, HSTATE_SCRATCH1(r13)
+	mtctr	r9
+	ld	r9, HSTATE_SCRATCH2(r13)
+#endif
+	rotldi	r12, r12, 32		  /* Flip R12 halves for stw */
+	stw	r12, HSTATE_SCRATCH1(r13) /* CR is now in the low half */
+	srdi	r12, r12, 32		  /* shift trap into low half */
+#endif
 
+.global kvmppc_handler_trampoline_exit
+kvmppc_handler_trampoline_exit:
 	/* Register usage at this point:
 	 *
-	 * SPRG_SCRATCH0  = guest R13
-	 * R12            = exit handler id
-	 * R13            = shadow vcpu (32-bit) or PACA (64-bit)
+	 * SPRG_SCRATCH0   = guest R13
+	 * R12             = exit handler id
+	 * R13             = shadow vcpu (32-bit) or PACA (64-bit)
 	 * HSTATE.SCRATCH0 = guest R12
 	 * HSTATE.SCRATCH1 = guest CR
-	 *
 	 */
 
 	/* Save registers */
diff --git a/arch/powerpc/kvm/book3s_xics.c b/arch/powerpc/kvm/book3s_xics.c
index 20dff102a06f..e48803e2918d 100644
--- a/arch/powerpc/kvm/book3s_xics.c
+++ b/arch/powerpc/kvm/book3s_xics.c
@@ -63,7 +63,7 @@
 /* -- ICS routines -- */
 
 static void icp_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp,
-			    u32 new_irq);
+			    u32 new_irq, bool check_resend);
 
 /*
  * Return value ideally indicates how the interrupt was handled, but no
@@ -75,6 +75,7 @@ static int ics_deliver_irq(struct kvmppc_xics *xics, u32 irq, u32 level)
 	struct ics_irq_state *state;
 	struct kvmppc_ics *ics;
 	u16 src;
+	u32 pq_old, pq_new;
 
 	XICS_DBG("ics deliver %#x (level: %d)\n", irq, level);
 
@@ -87,25 +88,41 @@ static int ics_deliver_irq(struct kvmppc_xics *xics, u32 irq, u32 level)
 	if (!state->exists)
 		return -EINVAL;
 
+	if (level == KVM_INTERRUPT_SET_LEVEL || level == KVM_INTERRUPT_SET)
+		level = 1;
+	else if (level == KVM_INTERRUPT_UNSET)
+		level = 0;
 	/*
-	 * We set state->asserted locklessly. This should be fine as
-	 * we are the only setter, thus concurrent access is undefined
-	 * to begin with.
+	 * Take other values the same as 1, consistent with original code.
+	 * maybe WARN here?
 	 */
-	if ((level == 1 && state->lsi) || level == KVM_INTERRUPT_SET_LEVEL)
-		state->asserted = 1;
-	else if (level == 0 || level == KVM_INTERRUPT_UNSET) {
-		state->asserted = 0;
+
+	if (!state->lsi && level == 0) /* noop for MSI */
 		return 0;
-	}
+
+	do {
+		pq_old = state->pq_state;
+		if (state->lsi) {
+			if (level) {
+				if (pq_old & PQ_PRESENTED)
+					/* Setting already set LSI ... */
+					return 0;
+
+				pq_new = PQ_PRESENTED;
+			} else
+				pq_new = 0;
+		} else
+			pq_new = ((pq_old << 1) & 3) | PQ_PRESENTED;
+	} while (cmpxchg(&state->pq_state, pq_old, pq_new) != pq_old);
+
+	/* Test P=1, Q=0, this is the only case where we present */
+	if (pq_new == PQ_PRESENTED)
+		icp_deliver_irq(xics, NULL, irq, false);
 
 	/* Record which CPU this arrived on for passed-through interrupts */
 	if (state->host_irq)
 		state->intr_cpu = raw_smp_processor_id();
 
-	/* Attempt delivery */
-	icp_deliver_irq(xics, NULL, irq);
-
 	return 0;
 }
 
@@ -114,29 +131,14 @@ static void ics_check_resend(struct kvmppc_xics *xics, struct kvmppc_ics *ics,
 {
 	int i;
 
-	unsigned long flags;
-
-	local_irq_save(flags);
-	arch_spin_lock(&ics->lock);
-
 	for (i = 0; i < KVMPPC_XICS_IRQ_PER_ICS; i++) {
 		struct ics_irq_state *state = &ics->irq_state[i];
-
-		if (!state->resend)
-			continue;
-
-		XICS_DBG("resend %#x prio %#x\n", state->number,
-			      state->priority);
-
-		arch_spin_unlock(&ics->lock);
-		local_irq_restore(flags);
-		icp_deliver_irq(xics, icp, state->number);
-		local_irq_save(flags);
-		arch_spin_lock(&ics->lock);
+		if (state->resend) {
+			XICS_DBG("resend %#x prio %#x\n", state->number,
+				      state->priority);
+			icp_deliver_irq(xics, icp, state->number, true);
+		}
 	}
-
-	arch_spin_unlock(&ics->lock);
-	local_irq_restore(flags);
 }
 
 static bool write_xive(struct kvmppc_xics *xics, struct kvmppc_ics *ics,
@@ -155,6 +157,7 @@ static bool write_xive(struct kvmppc_xics *xics, struct kvmppc_ics *ics,
 	deliver = false;
 	if ((state->masked_pending || state->resend) && priority != MASKED) {
 		state->masked_pending = 0;
+		state->resend = 0;
 		deliver = true;
 	}
 
@@ -189,7 +192,7 @@ int kvmppc_xics_set_xive(struct kvm *kvm, u32 irq, u32 server, u32 priority)
 		 state->masked_pending, state->resend);
 
 	if (write_xive(xics, ics, state, server, priority, priority))
-		icp_deliver_irq(xics, icp, irq);
+		icp_deliver_irq(xics, icp, irq, false);
 
 	return 0;
 }
@@ -242,7 +245,7 @@ int kvmppc_xics_int_on(struct kvm *kvm, u32 irq)
 
 	if (write_xive(xics, ics, state, state->server, state->saved_priority,
 		       state->saved_priority))
-		icp_deliver_irq(xics, icp, irq);
+		icp_deliver_irq(xics, icp, irq, false);
 
 	return 0;
 }
@@ -376,7 +379,7 @@ static bool icp_try_to_deliver(struct kvmppc_icp *icp, u32 irq, u8 priority,
 }
 
 static void icp_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp,
-			    u32 new_irq)
+			    u32 new_irq, bool check_resend)
 {
 	struct ics_irq_state *state;
 	struct kvmppc_ics *ics;
@@ -422,6 +425,10 @@ static void icp_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp,
 		}
 	}
 
+	if (check_resend)
+		if (!state->resend)
+			goto out;
+
 	/* Clear the resend bit of that interrupt */
 	state->resend = 0;
 
@@ -470,6 +477,7 @@ static void icp_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp,
 			arch_spin_unlock(&ics->lock);
 			local_irq_restore(flags);
 			new_irq = reject;
+			check_resend = 0;
 			goto again;
 		}
 	} else {
@@ -477,10 +485,16 @@ static void icp_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp,
 		 * We failed to deliver the interrupt we need to set the
 		 * resend map bit and mark the ICS state as needing a resend
 		 */
-		set_bit(ics->icsid, icp->resend_map);
 		state->resend = 1;
 
 		/*
+		 * Make sure when checking resend, we don't miss the resend
+		 * if resend_map bit is seen and cleared.
+		 */
+		smp_wmb();
+		set_bit(ics->icsid, icp->resend_map);
+
+		/*
 		 * If the need_resend flag got cleared in the ICP some time
 		 * between icp_try_to_deliver() atomic update and now, then
 		 * we know it might have missed the resend_map bit. So we
@@ -488,8 +502,10 @@ static void icp_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp,
 		 */
 		smp_mb();
 		if (!icp->state.need_resend) {
+			state->resend = 0;
 			arch_spin_unlock(&ics->lock);
 			local_irq_restore(flags);
+			check_resend = 0;
 			goto again;
 		}
 	}
@@ -681,7 +697,7 @@ static noinline int kvmppc_h_ipi(struct kvm_vcpu *vcpu, unsigned long server,
 
 	/* Handle reject */
 	if (reject && reject != XICS_IPI)
-		icp_deliver_irq(xics, icp, reject);
+		icp_deliver_irq(xics, icp, reject, false);
 
 	/* Handle resend */
 	if (resend)
@@ -761,17 +777,54 @@ static noinline void kvmppc_h_cppr(struct kvm_vcpu *vcpu, unsigned long cppr)
 	 * attempt (see comments in icp_deliver_irq).
 	 */
 	if (reject && reject != XICS_IPI)
-		icp_deliver_irq(xics, icp, reject);
+		icp_deliver_irq(xics, icp, reject, false);
 }
 
-static noinline int kvmppc_h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr)
+static int ics_eoi(struct kvm_vcpu *vcpu, u32 irq)
 {
 	struct kvmppc_xics *xics = vcpu->kvm->arch.xics;
 	struct kvmppc_icp *icp = vcpu->arch.icp;
 	struct kvmppc_ics *ics;
 	struct ics_irq_state *state;
-	u32 irq = xirr & 0x00ffffff;
 	u16 src;
+	u32 pq_old, pq_new;
+
+	/*
+	 * ICS EOI handling: For LSI, if P bit is still set, we need to
+	 * resend it.
+	 *
+	 * For MSI, we move Q bit into P (and clear Q). If it is set,
+	 * resend it.
+	 */
+
+	ics = kvmppc_xics_find_ics(xics, irq, &src);
+	if (!ics) {
+		XICS_DBG("ios_eoi: IRQ 0x%06x not found !\n", irq);
+		return H_PARAMETER;
+	}
+	state = &ics->irq_state[src];
+
+	if (state->lsi)
+		pq_new = state->pq_state;
+	else
+		do {
+			pq_old = state->pq_state;
+			pq_new = pq_old >> 1;
+		} while (cmpxchg(&state->pq_state, pq_old, pq_new) != pq_old);
+
+	if (pq_new & PQ_PRESENTED)
+		icp_deliver_irq(xics, icp, irq, false);
+
+	kvm_notify_acked_irq(vcpu->kvm, 0, irq);
+
+	return H_SUCCESS;
+}
+
+static noinline int kvmppc_h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr)
+{
+	struct kvmppc_xics *xics = vcpu->kvm->arch.xics;
+	struct kvmppc_icp *icp = vcpu->arch.icp;
+	u32 irq = xirr & 0x00ffffff;
 
 	XICS_DBG("h_eoi vcpu %d eoi %#lx\n", vcpu->vcpu_id, xirr);
 
@@ -794,26 +847,8 @@ static noinline int kvmppc_h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr)
 	/* IPIs have no EOI */
 	if (irq == XICS_IPI)
 		return H_SUCCESS;
-	/*
-	 * EOI handling: If the interrupt is still asserted, we need to
-	 * resend it. We can take a lockless "peek" at the ICS state here.
-	 *
-	 * "Message" interrupts will never have "asserted" set
-	 */
-	ics = kvmppc_xics_find_ics(xics, irq, &src);
-	if (!ics) {
-		XICS_DBG("h_eoi: IRQ 0x%06x not found !\n", irq);
-		return H_PARAMETER;
-	}
-	state = &ics->irq_state[src];
 
-	/* Still asserted, resend it */
-	if (state->asserted)
-		icp_deliver_irq(xics, icp, irq);
-
-	kvm_notify_acked_irq(vcpu->kvm, 0, irq);
-
-	return H_SUCCESS;
+	return ics_eoi(vcpu, irq);
 }
 
 int kvmppc_xics_rm_complete(struct kvm_vcpu *vcpu, u32 hcall)
@@ -832,10 +867,6 @@ int kvmppc_xics_rm_complete(struct kvm_vcpu *vcpu, u32 hcall)
 		icp->n_rm_check_resend++;
 		icp_check_resend(xics, icp->rm_resend_icp);
 	}
-	if (icp->rm_action & XICS_RM_REJECT) {
-		icp->n_rm_reject++;
-		icp_deliver_irq(xics, icp, icp->rm_reject);
-	}
 	if (icp->rm_action & XICS_RM_NOTIFY_EOI) {
 		icp->n_rm_notify_eoi++;
 		kvm_notify_acked_irq(vcpu->kvm, 0, icp->rm_eoied_irq);
@@ -920,7 +951,7 @@ static int xics_debug_show(struct seq_file *m, void *private)
 	int icsid, i;
 	unsigned long flags;
 	unsigned long t_rm_kick_vcpu, t_rm_check_resend;
-	unsigned long t_rm_reject, t_rm_notify_eoi;
+	unsigned long t_rm_notify_eoi;
 	unsigned long t_reject, t_check_resend;
 
 	if (!kvm)
@@ -929,7 +960,6 @@ static int xics_debug_show(struct seq_file *m, void *private)
 	t_rm_kick_vcpu = 0;
 	t_rm_notify_eoi = 0;
 	t_rm_check_resend = 0;
-	t_rm_reject = 0;
 	t_check_resend = 0;
 	t_reject = 0;
 
@@ -952,14 +982,13 @@ static int xics_debug_show(struct seq_file *m, void *private)
 		t_rm_kick_vcpu += icp->n_rm_kick_vcpu;
 		t_rm_notify_eoi += icp->n_rm_notify_eoi;
 		t_rm_check_resend += icp->n_rm_check_resend;
-		t_rm_reject += icp->n_rm_reject;
 		t_check_resend += icp->n_check_resend;
 		t_reject += icp->n_reject;
 	}
 
-	seq_printf(m, "ICP Guest->Host totals: kick_vcpu=%lu check_resend=%lu reject=%lu notify_eoi=%lu\n",
+	seq_printf(m, "ICP Guest->Host totals: kick_vcpu=%lu check_resend=%lu notify_eoi=%lu\n",
 			t_rm_kick_vcpu, t_rm_check_resend,
-			t_rm_reject, t_rm_notify_eoi);
+			t_rm_notify_eoi);
 	seq_printf(m, "ICP Real Mode totals: check_resend=%lu resend=%lu\n",
 			t_check_resend, t_reject);
 	for (icsid = 0; icsid <= KVMPPC_XICS_MAX_ICS_ID; icsid++) {
@@ -977,9 +1006,9 @@ static int xics_debug_show(struct seq_file *m, void *private)
 		for (i = 0; i < KVMPPC_XICS_IRQ_PER_ICS; i++) {
 			struct ics_irq_state *irq = &ics->irq_state[i];
 
-			seq_printf(m, "irq 0x%06x: server %#x prio %#x save prio %#x asserted %d resend %d masked pending %d\n",
+			seq_printf(m, "irq 0x%06x: server %#x prio %#x save prio %#x pq_state %d resend %d masked pending %d\n",
 				   irq->number, irq->server, irq->priority,
-				   irq->saved_priority, irq->asserted,
+				   irq->saved_priority, irq->pq_state,
 				   irq->resend, irq->masked_pending);
 
 		}
@@ -1198,10 +1227,17 @@ static int xics_get_source(struct kvmppc_xics *xics, long irq, u64 addr)
 		val |= prio << KVM_XICS_PRIORITY_SHIFT;
 		if (irqp->lsi) {
 			val |= KVM_XICS_LEVEL_SENSITIVE;
-			if (irqp->asserted)
+			if (irqp->pq_state & PQ_PRESENTED)
 				val |= KVM_XICS_PENDING;
 		} else if (irqp->masked_pending || irqp->resend)
 			val |= KVM_XICS_PENDING;
+
+		if (irqp->pq_state & PQ_PRESENTED)
+			val |= KVM_XICS_PRESENTED;
+
+		if (irqp->pq_state & PQ_QUEUED)
+			val |= KVM_XICS_QUEUED;
+
 		ret = 0;
 	}
 	arch_spin_unlock(&ics->lock);
@@ -1253,18 +1289,20 @@ static int xics_set_source(struct kvmppc_xics *xics, long irq, u64 addr)
 	irqp->resend = 0;
 	irqp->masked_pending = 0;
 	irqp->lsi = 0;
-	irqp->asserted = 0;
-	if (val & KVM_XICS_LEVEL_SENSITIVE) {
+	irqp->pq_state = 0;
+	if (val & KVM_XICS_LEVEL_SENSITIVE)
 		irqp->lsi = 1;
-		if (val & KVM_XICS_PENDING)
-			irqp->asserted = 1;
-	}
+	/* If PENDING, set P in case P is not saved because of old code */
+	if (val & KVM_XICS_PRESENTED || val & KVM_XICS_PENDING)
+		irqp->pq_state |= PQ_PRESENTED;
+	if (val & KVM_XICS_QUEUED)
+		irqp->pq_state |= PQ_QUEUED;
 	irqp->exists = 1;
 	arch_spin_unlock(&ics->lock);
 	local_irq_restore(flags);
 
 	if (val & KVM_XICS_PENDING)
-		icp_deliver_irq(xics, NULL, irqp->number);
+		icp_deliver_irq(xics, NULL, irqp->number, false);
 
 	return 0;
 }
diff --git a/arch/powerpc/kvm/book3s_xics.h b/arch/powerpc/kvm/book3s_xics.h
index 2a50320b55ca..ec5474cf70c6 100644
--- a/arch/powerpc/kvm/book3s_xics.h
+++ b/arch/powerpc/kvm/book3s_xics.h
@@ -31,16 +31,19 @@
 /* Priority value to use for disabling an interrupt */
 #define MASKED	0xff
 
+#define PQ_PRESENTED	1
+#define PQ_QUEUED	2
+
 /* State for one irq source */
 struct ics_irq_state {
 	u32 number;
 	u32 server;
+	u32 pq_state;
 	u8  priority;
 	u8  saved_priority;
 	u8  resend;
 	u8  masked_pending;
 	u8  lsi;		/* level-sensitive interrupt */
-	u8  asserted; /* Only for LSI */
 	u8  exists;
 	int intr_cpu;
 	u32 host_irq;
@@ -73,7 +76,6 @@ struct kvmppc_icp {
 	 */
 #define XICS_RM_KICK_VCPU	0x1
 #define XICS_RM_CHECK_RESEND	0x2
-#define XICS_RM_REJECT		0x4
 #define XICS_RM_NOTIFY_EOI	0x8
 	u32 rm_action;
 	struct kvm_vcpu *rm_kick_target;
@@ -84,7 +86,6 @@ struct kvmppc_icp {
 	/* Counters for each reason we exited real mode */
 	unsigned long n_rm_kick_vcpu;
 	unsigned long n_rm_check_resend;
-	unsigned long n_rm_reject;
 	unsigned long n_rm_notify_eoi;
 	/* Counters for handling ICP processing in real mode */
 	unsigned long n_check_resend;
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index cd892dec7cb6..2b3e4e620078 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -565,6 +565,13 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
 	case KVM_CAP_PPC_HWRNG:
 		r = kvmppc_hwrng_present();
 		break;
+	case KVM_CAP_PPC_MMU_RADIX:
+		r = !!(hv_enabled && radix_enabled());
+		break;
+	case KVM_CAP_PPC_MMU_HASH_V3:
+		r = !!(hv_enabled && !radix_enabled() &&
+		       cpu_has_feature(CPU_FTR_ARCH_300));
+		break;
 #endif
 	case KVM_CAP_SYNC_MMU:
 #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
@@ -605,6 +612,9 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
 	case KVM_CAP_SPAPR_MULTITCE:
 		r = 1;
 		break;
+	case KVM_CAP_SPAPR_RESIZE_HPT:
+		r = !!hv_enabled;
+		break;
 #endif
 	case KVM_CAP_PPC_HTM:
 		r = cpu_has_feature(CPU_FTR_TM_COMP) &&
@@ -1468,6 +1478,31 @@ long kvm_arch_vm_ioctl(struct file *filp,
 		r = kvm_vm_ioctl_rtas_define_token(kvm, argp);
 		break;
 	}
+	case KVM_PPC_CONFIGURE_V3_MMU: {
+		struct kvm *kvm = filp->private_data;
+		struct kvm_ppc_mmuv3_cfg cfg;
+
+		r = -EINVAL;
+		if (!kvm->arch.kvm_ops->configure_mmu)
+			goto out;
+		r = -EFAULT;
+		if (copy_from_user(&cfg, argp, sizeof(cfg)))
+			goto out;
+		r = kvm->arch.kvm_ops->configure_mmu(kvm, &cfg);
+		break;
+	}
+	case KVM_PPC_GET_RMMU_INFO: {
+		struct kvm *kvm = filp->private_data;
+		struct kvm_ppc_rmmu_info info;
+
+		r = -EINVAL;
+		if (!kvm->arch.kvm_ops->get_rmmu_info)
+			goto out;
+		r = kvm->arch.kvm_ops->get_rmmu_info(kvm, &info);
+		if (r >= 0 && copy_to_user(argp, &info, sizeof(info)))
+			r = -EFAULT;
+		break;
+	}
 	default: {
 		struct kvm *kvm = filp->private_data;
 		r = kvm->arch.kvm_ops->arch_vm_ioctl(filp, ioctl, arg);
diff --git a/arch/powerpc/mm/init-common.c b/arch/powerpc/mm/init-common.c
index f2108c40e697..eb8c6c8c4851 100644
--- a/arch/powerpc/mm/init-common.c
+++ b/arch/powerpc/mm/init-common.c
@@ -41,6 +41,7 @@ static void pmd_ctor(void *addr)
 }
 
 struct kmem_cache *pgtable_cache[MAX_PGTABLE_INDEX_SIZE];
+EXPORT_SYMBOL_GPL(pgtable_cache);	/* used by kvm_hv module */
 
 /*
  * Create a kmem_cache() for pagetables.  This is not used for PTE
@@ -86,7 +87,7 @@ void pgtable_cache_add(unsigned shift, void (*ctor)(void *))
 
 	pr_debug("Allocated pgtable cache for order %d\n", shift);
 }
-
+EXPORT_SYMBOL_GPL(pgtable_cache_add);	/* used by kvm_hv module */
 
 void pgtable_cache_init(void)
 {
diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c
index 93abf8a9813d..10c9a545a646 100644
--- a/arch/powerpc/mm/init_64.c
+++ b/arch/powerpc/mm/init_64.c
@@ -42,6 +42,8 @@
 #include <linux/memblock.h>
 #include <linux/hugetlb.h>
 #include <linux/slab.h>
+#include <linux/of_fdt.h>
+#include <linux/libfdt.h>
 
 #include <asm/pgalloc.h>
 #include <asm/page.h>
@@ -344,12 +346,45 @@ static int __init parse_disable_radix(char *p)
 }
 early_param("disable_radix", parse_disable_radix);
 
+/*
+ * If we're running under a hypervisor, we need to check the contents of
+ * /chosen/ibm,architecture-vec-5 to see if the hypervisor is willing to do
+ * radix.  If not, we clear the radix feature bit so we fall back to hash.
+ */
+static void early_check_vec5(void)
+{
+	unsigned long root, chosen;
+	int size;
+	const u8 *vec5;
+
+	root = of_get_flat_dt_root();
+	chosen = of_get_flat_dt_subnode_by_name(root, "chosen");
+	if (chosen == -FDT_ERR_NOTFOUND)
+		return;
+	vec5 = of_get_flat_dt_prop(chosen, "ibm,architecture-vec-5", &size);
+	if (!vec5)
+		return;
+	if (size <= OV5_INDX(OV5_MMU_RADIX_300) ||
+	    !(vec5[OV5_INDX(OV5_MMU_RADIX_300)] & OV5_FEAT(OV5_MMU_RADIX_300)))
+		/* Hypervisor doesn't support radix */
+		cur_cpu_spec->mmu_features &= ~MMU_FTR_TYPE_RADIX;
+}
+
 void __init mmu_early_init_devtree(void)
 {
 	/* Disable radix mode based on kernel command line. */
 	if (disable_radix)
 		cur_cpu_spec->mmu_features &= ~MMU_FTR_TYPE_RADIX;
 
+	/*
+	 * Check /chosen/ibm,architecture-vec-5 if running as a guest.
+	 * When running bare-metal, we can use radix if we like
+	 * even though the ibm,architecture-vec-5 property created by
+	 * skiboot doesn't have the necessary bits set.
+	 */
+	if (early_radix_enabled() && !(mfmsr() & MSR_HV))
+		early_check_vec5();
+
 	if (early_radix_enabled())
 		radix__early_init_devtree();
 	else
diff --git a/arch/powerpc/mm/pgtable-radix.c b/arch/powerpc/mm/pgtable-radix.c
index 08f15ec61d07..feeda90cd06d 100644
--- a/arch/powerpc/mm/pgtable-radix.c
+++ b/arch/powerpc/mm/pgtable-radix.c
@@ -414,6 +414,8 @@ void __init radix__early_init_mmu(void)
 		mtspr(SPRN_LPCR, lpcr | LPCR_UPRT | LPCR_HR);
 		radix_init_partition_table();
 		radix_init_amor();
+	} else {
+		radix_init_pseries();
 	}
 
 	memblock_set_current_limit(MEMBLOCK_ALLOC_ANYWHERE);
diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c
index 4ee9c9d18760..db93cf747a03 100644
--- a/arch/powerpc/mm/pgtable_64.c
+++ b/arch/powerpc/mm/pgtable_64.c
@@ -458,13 +458,23 @@ void __init mmu_partition_table_init(void)
 void mmu_partition_table_set_entry(unsigned int lpid, unsigned long dw0,
 				   unsigned long dw1)
 {
+	unsigned long old = be64_to_cpu(partition_tb[lpid].patb0);
+
 	partition_tb[lpid].patb0 = cpu_to_be64(dw0);
 	partition_tb[lpid].patb1 = cpu_to_be64(dw1);
 
-	/* Global flush of TLBs and partition table caches for this lpid */
+	/*
+	 * Global flush of TLBs and partition table caches for this lpid.
+	 * The type of flush (hash or radix) depends on what the previous
+	 * use of this partition ID was, not the new use.
+	 */
 	asm volatile("ptesync" : : : "memory");
-	asm volatile(PPC_TLBIE_5(%0,%1,2,0,0) : :
-		     "r" (TLBIEL_INVAL_SET_LPID), "r" (lpid));
+	if (old & PATB_HR)
+		asm volatile(PPC_TLBIE_5(%0,%1,2,0,1) : :
+			     "r" (TLBIEL_INVAL_SET_LPID), "r" (lpid));
+	else
+		asm volatile(PPC_TLBIE_5(%0,%1,2,0,0) : :
+			     "r" (TLBIEL_INVAL_SET_LPID), "r" (lpid));
 	asm volatile("eieio; tlbsync; ptesync" : : : "memory");
 }
 EXPORT_SYMBOL_GPL(mmu_partition_table_set_entry);
diff --git a/arch/powerpc/platforms/pseries/firmware.c b/arch/powerpc/platforms/pseries/firmware.c
index ea7f09bd73b1..7d67623203b8 100644
--- a/arch/powerpc/platforms/pseries/firmware.c
+++ b/arch/powerpc/platforms/pseries/firmware.c
@@ -126,7 +126,7 @@ static void __init fw_vec5_feature_init(const char *vec5, unsigned long len)
 		index = OV5_INDX(vec5_fw_features_table[i].feature);
 		feat = OV5_FEAT(vec5_fw_features_table[i].feature);
 
-		if (vec5[index] & feat)
+		if (index < len && (vec5[index] & feat))
 			powerpc_firmware_features |=
 				vec5_fw_features_table[i].val;
 	}
diff --git a/arch/powerpc/platforms/pseries/lpar.c b/arch/powerpc/platforms/pseries/lpar.c
index 5dc1c3c6e716..0587655aea69 100644
--- a/arch/powerpc/platforms/pseries/lpar.c
+++ b/arch/powerpc/platforms/pseries/lpar.c
@@ -609,6 +609,29 @@ static int __init disable_bulk_remove(char *str)
 
 __setup("bulk_remove=", disable_bulk_remove);
 
+/* Actually only used for radix, so far */
+static int pseries_lpar_register_process_table(unsigned long base,
+			unsigned long page_size, unsigned long table_size)
+{
+	long rc;
+	unsigned long flags = PROC_TABLE_NEW;
+
+	if (radix_enabled())
+		flags |= PROC_TABLE_RADIX | PROC_TABLE_GTSE;
+	for (;;) {
+		rc = plpar_hcall_norets(H_REGISTER_PROC_TBL, flags, base,
+					page_size, table_size);
+		if (!H_IS_LONG_BUSY(rc))
+			break;
+		mdelay(get_longbusy_msecs(rc));
+	}
+	if (rc != H_SUCCESS) {
+		pr_err("Failed to register process table (rc=%ld)\n", rc);
+		BUG();
+	}
+	return rc;
+}
+
 void __init hpte_init_pseries(void)
 {
 	mmu_hash_ops.hpte_invalidate	 = pSeries_lpar_hpte_invalidate;
@@ -622,6 +645,12 @@ void __init hpte_init_pseries(void)
 	mmu_hash_ops.hugepage_invalidate = pSeries_lpar_hugepage_invalidate;
 }
 
+void radix_init_pseries(void)
+{
+	pr_info("Using radix MMU under hypervisor\n");
+	register_process_table = pseries_lpar_register_process_table;
+}
+
 #ifdef CONFIG_PPC_SMLPAR
 #define CMO_FREE_HINT_DEFAULT 1
 static int cmo_free_hint_flag = CMO_FREE_HINT_DEFAULT;
diff --git a/arch/s390/kvm/gaccess.c b/arch/s390/kvm/gaccess.c
index 4aa8a7e2a1da..4492c9363178 100644
--- a/arch/s390/kvm/gaccess.c
+++ b/arch/s390/kvm/gaccess.c
@@ -373,7 +373,7 @@ void ipte_unlock(struct kvm_vcpu *vcpu)
 		ipte_unlock_simple(vcpu);
 }
 
-static int ar_translation(struct kvm_vcpu *vcpu, union asce *asce, ar_t ar,
+static int ar_translation(struct kvm_vcpu *vcpu, union asce *asce, u8 ar,
 			  enum gacc_mode mode)
 {
 	union alet alet;
@@ -465,7 +465,9 @@ static int ar_translation(struct kvm_vcpu *vcpu, union asce *asce, ar_t ar,
 struct trans_exc_code_bits {
 	unsigned long addr : 52; /* Translation-exception Address */
 	unsigned long fsi  : 2;  /* Access Exception Fetch/Store Indication */
-	unsigned long	   : 6;
+	unsigned long	   : 2;
+	unsigned long b56  : 1;
+	unsigned long	   : 3;
 	unsigned long b60  : 1;
 	unsigned long b61  : 1;
 	unsigned long as   : 2;  /* ASCE Identifier */
@@ -485,7 +487,7 @@ enum prot_type {
 };
 
 static int trans_exc(struct kvm_vcpu *vcpu, int code, unsigned long gva,
-		     ar_t ar, enum gacc_mode mode, enum prot_type prot)
+		     u8 ar, enum gacc_mode mode, enum prot_type prot)
 {
 	struct kvm_s390_pgm_info *pgm = &vcpu->arch.pgm;
 	struct trans_exc_code_bits *tec;
@@ -497,14 +499,18 @@ static int trans_exc(struct kvm_vcpu *vcpu, int code, unsigned long gva,
 	switch (code) {
 	case PGM_PROTECTION:
 		switch (prot) {
+		case PROT_TYPE_LA:
+			tec->b56 = 1;
+			break;
+		case PROT_TYPE_KEYC:
+			tec->b60 = 1;
+			break;
 		case PROT_TYPE_ALC:
 			tec->b60 = 1;
 			/* FALL THROUGH */
 		case PROT_TYPE_DAT:
 			tec->b61 = 1;
 			break;
-		default: /* LA and KEYC set b61 to 0, other params undefined */
-			return code;
 		}
 		/* FALL THROUGH */
 	case PGM_ASCE_TYPE:
@@ -539,7 +545,7 @@ static int trans_exc(struct kvm_vcpu *vcpu, int code, unsigned long gva,
 }
 
 static int get_vcpu_asce(struct kvm_vcpu *vcpu, union asce *asce,
-			 unsigned long ga, ar_t ar, enum gacc_mode mode)
+			 unsigned long ga, u8 ar, enum gacc_mode mode)
 {
 	int rc;
 	struct psw_bits psw = psw_bits(vcpu->arch.sie_block->gpsw);
@@ -771,7 +777,7 @@ static int low_address_protection_enabled(struct kvm_vcpu *vcpu,
 	return 1;
 }
 
-static int guest_page_range(struct kvm_vcpu *vcpu, unsigned long ga, ar_t ar,
+static int guest_page_range(struct kvm_vcpu *vcpu, unsigned long ga, u8 ar,
 			    unsigned long *pages, unsigned long nr_pages,
 			    const union asce asce, enum gacc_mode mode)
 {
@@ -803,7 +809,7 @@ static int guest_page_range(struct kvm_vcpu *vcpu, unsigned long ga, ar_t ar,
 	return 0;
 }
 
-int access_guest(struct kvm_vcpu *vcpu, unsigned long ga, ar_t ar, void *data,
+int access_guest(struct kvm_vcpu *vcpu, unsigned long ga, u8 ar, void *data,
 		 unsigned long len, enum gacc_mode mode)
 {
 	psw_t *psw = &vcpu->arch.sie_block->gpsw;
@@ -877,7 +883,7 @@ int access_guest_real(struct kvm_vcpu *vcpu, unsigned long gra,
  * Note: The IPTE lock is not taken during this function, so the caller
  * has to take care of this.
  */
-int guest_translate_address(struct kvm_vcpu *vcpu, unsigned long gva, ar_t ar,
+int guest_translate_address(struct kvm_vcpu *vcpu, unsigned long gva, u8 ar,
 			    unsigned long *gpa, enum gacc_mode mode)
 {
 	psw_t *psw = &vcpu->arch.sie_block->gpsw;
@@ -910,7 +916,7 @@ int guest_translate_address(struct kvm_vcpu *vcpu, unsigned long gva, ar_t ar,
 /**
  * check_gva_range - test a range of guest virtual addresses for accessibility
  */
-int check_gva_range(struct kvm_vcpu *vcpu, unsigned long gva, ar_t ar,
+int check_gva_range(struct kvm_vcpu *vcpu, unsigned long gva, u8 ar,
 		    unsigned long length, enum gacc_mode mode)
 {
 	unsigned long gpa;
diff --git a/arch/s390/kvm/gaccess.h b/arch/s390/kvm/gaccess.h
index 8756569ad938..7ce47fd36f28 100644
--- a/arch/s390/kvm/gaccess.h
+++ b/arch/s390/kvm/gaccess.h
@@ -162,11 +162,11 @@ enum gacc_mode {
 };
 
 int guest_translate_address(struct kvm_vcpu *vcpu, unsigned long gva,
-			    ar_t ar, unsigned long *gpa, enum gacc_mode mode);
-int check_gva_range(struct kvm_vcpu *vcpu, unsigned long gva, ar_t ar,
+			    u8 ar, unsigned long *gpa, enum gacc_mode mode);
+int check_gva_range(struct kvm_vcpu *vcpu, unsigned long gva, u8 ar,
 		    unsigned long length, enum gacc_mode mode);
 
-int access_guest(struct kvm_vcpu *vcpu, unsigned long ga, ar_t ar, void *data,
+int access_guest(struct kvm_vcpu *vcpu, unsigned long ga, u8 ar, void *data,
 		 unsigned long len, enum gacc_mode mode);
 
 int access_guest_real(struct kvm_vcpu *vcpu, unsigned long gra,
@@ -218,7 +218,7 @@ int access_guest_real(struct kvm_vcpu *vcpu, unsigned long gra,
  *	 if data has been changed in guest space in case of an exception.
  */
 static inline __must_check
-int write_guest(struct kvm_vcpu *vcpu, unsigned long ga, ar_t ar, void *data,
+int write_guest(struct kvm_vcpu *vcpu, unsigned long ga, u8 ar, void *data,
 		unsigned long len)
 {
 	return access_guest(vcpu, ga, ar, data, len, GACC_STORE);
@@ -238,7 +238,7 @@ int write_guest(struct kvm_vcpu *vcpu, unsigned long ga, ar_t ar, void *data,
  * data will be copied from guest space to kernel space.
  */
 static inline __must_check
-int read_guest(struct kvm_vcpu *vcpu, unsigned long ga, ar_t ar, void *data,
+int read_guest(struct kvm_vcpu *vcpu, unsigned long ga, u8 ar, void *data,
 	       unsigned long len)
 {
 	return access_guest(vcpu, ga, ar, data, len, GACC_FETCH);
@@ -247,10 +247,11 @@ int read_guest(struct kvm_vcpu *vcpu, unsigned long ga, ar_t ar, void *data,
 /**
  * read_guest_instr - copy instruction data from guest space to kernel space
  * @vcpu: virtual cpu
+ * @ga: guest address
  * @data: destination address in kernel space
  * @len: number of bytes to copy
  *
- * Copy @len bytes from the current psw address (guest space) to @data (kernel
+ * Copy @len bytes from the given address (guest space) to @data (kernel
  * space).
  *
  * The behaviour of read_guest_instr is identical to read_guest, except that
@@ -258,10 +259,10 @@ int read_guest(struct kvm_vcpu *vcpu, unsigned long ga, ar_t ar, void *data,
  * address-space mode.
  */
 static inline __must_check
-int read_guest_instr(struct kvm_vcpu *vcpu, void *data, unsigned long len)
+int read_guest_instr(struct kvm_vcpu *vcpu, unsigned long ga, void *data,
+		     unsigned long len)
 {
-	return access_guest(vcpu, vcpu->arch.sie_block->gpsw.addr, 0, data, len,
-			    GACC_IFETCH);
+	return access_guest(vcpu, ga, 0, data, len, GACC_IFETCH);
 }
 
 /**
diff --git a/arch/s390/kvm/guestdbg.c b/arch/s390/kvm/guestdbg.c
index d7c6a7f53ced..23d9a4e12da1 100644
--- a/arch/s390/kvm/guestdbg.c
+++ b/arch/s390/kvm/guestdbg.c
@@ -388,14 +388,13 @@ void kvm_s390_prepare_debug_exit(struct kvm_vcpu *vcpu)
 #define per_write_wp_event(code) \
 			(code & (PER_CODE_STORE | PER_CODE_STORE_REAL))
 
-static int debug_exit_required(struct kvm_vcpu *vcpu)
+static int debug_exit_required(struct kvm_vcpu *vcpu, u8 perc,
+			       unsigned long peraddr)
 {
-	u8 perc = vcpu->arch.sie_block->perc;
 	struct kvm_debug_exit_arch *debug_exit = &vcpu->run->debug.arch;
 	struct kvm_hw_wp_info_arch *wp_info = NULL;
 	struct kvm_hw_bp_info_arch *bp_info = NULL;
 	unsigned long addr = vcpu->arch.sie_block->gpsw.addr;
-	unsigned long peraddr = vcpu->arch.sie_block->peraddr;
 
 	if (guestdbg_hw_bp_enabled(vcpu)) {
 		if (per_write_wp_event(perc) &&
@@ -437,36 +436,118 @@ exit_required:
 	return 1;
 }
 
+static int per_fetched_addr(struct kvm_vcpu *vcpu, unsigned long *addr)
+{
+	u8 exec_ilen = 0;
+	u16 opcode[3];
+	int rc;
+
+	if (vcpu->arch.sie_block->icptcode == ICPT_PROGI) {
+		/* PER address references the fetched or the execute instr */
+		*addr = vcpu->arch.sie_block->peraddr;
+		/*
+		 * Manually detect if we have an EXECUTE instruction. As
+		 * instructions are always 2 byte aligned we can read the
+		 * first two bytes unconditionally
+		 */
+		rc = read_guest_instr(vcpu, *addr, &opcode, 2);
+		if (rc)
+			return rc;
+		if (opcode[0] >> 8 == 0x44)
+			exec_ilen = 4;
+		if ((opcode[0] & 0xff0f) == 0xc600)
+			exec_ilen = 6;
+	} else {
+		/* instr was suppressed, calculate the responsible instr */
+		*addr = __rewind_psw(vcpu->arch.sie_block->gpsw,
+				     kvm_s390_get_ilen(vcpu));
+		if (vcpu->arch.sie_block->icptstatus & 0x01) {
+			exec_ilen = (vcpu->arch.sie_block->icptstatus & 0x60) >> 4;
+			if (!exec_ilen)
+				exec_ilen = 4;
+		}
+	}
+
+	if (exec_ilen) {
+		/* read the complete EXECUTE instr to detect the fetched addr */
+		rc = read_guest_instr(vcpu, *addr, &opcode, exec_ilen);
+		if (rc)
+			return rc;
+		if (exec_ilen == 6) {
+			/* EXECUTE RELATIVE LONG - RIL-b format */
+			s32 rl = *((s32 *) (opcode + 1));
+
+			/* rl is a _signed_ 32 bit value specifying halfwords */
+			*addr += (u64)(s64) rl * 2;
+		} else {
+			/* EXECUTE - RX-a format */
+			u32 base = (opcode[1] & 0xf000) >> 12;
+			u32 disp = opcode[1] & 0x0fff;
+			u32 index = opcode[0] & 0x000f;
+
+			*addr = base ? vcpu->run->s.regs.gprs[base] : 0;
+			*addr += index ? vcpu->run->s.regs.gprs[index] : 0;
+			*addr += disp;
+		}
+		*addr = kvm_s390_logical_to_effective(vcpu, *addr);
+	}
+	return 0;
+}
+
 #define guest_per_enabled(vcpu) \
 			     (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PER)
 
 int kvm_s390_handle_per_ifetch_icpt(struct kvm_vcpu *vcpu)
 {
+	const u64 cr10 = vcpu->arch.sie_block->gcr[10];
+	const u64 cr11 = vcpu->arch.sie_block->gcr[11];
 	const u8 ilen = kvm_s390_get_ilen(vcpu);
 	struct kvm_s390_pgm_info pgm_info = {
 		.code = PGM_PER,
 		.per_code = PER_CODE_IFETCH,
 		.per_address = __rewind_psw(vcpu->arch.sie_block->gpsw, ilen),
 	};
+	unsigned long fetched_addr;
+	int rc;
 
 	/*
 	 * The PSW points to the next instruction, therefore the intercepted
 	 * instruction generated a PER i-fetch event. PER address therefore
 	 * points at the previous PSW address (could be an EXECUTE function).
 	 */
-	return kvm_s390_inject_prog_irq(vcpu, &pgm_info);
+	if (!guestdbg_enabled(vcpu))
+		return kvm_s390_inject_prog_irq(vcpu, &pgm_info);
+
+	if (debug_exit_required(vcpu, pgm_info.per_code, pgm_info.per_address))
+		vcpu->guest_debug |= KVM_GUESTDBG_EXIT_PENDING;
+
+	if (!guest_per_enabled(vcpu) ||
+	    !(vcpu->arch.sie_block->gcr[9] & PER_EVENT_IFETCH))
+		return 0;
+
+	rc = per_fetched_addr(vcpu, &fetched_addr);
+	if (rc < 0)
+		return rc;
+	if (rc)
+		/* instruction-fetching exceptions */
+		return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
+
+	if (in_addr_range(fetched_addr, cr10, cr11))
+		return kvm_s390_inject_prog_irq(vcpu, &pgm_info);
+	return 0;
 }
 
-static void filter_guest_per_event(struct kvm_vcpu *vcpu)
+static int filter_guest_per_event(struct kvm_vcpu *vcpu)
 {
 	const u8 perc = vcpu->arch.sie_block->perc;
-	u64 peraddr = vcpu->arch.sie_block->peraddr;
 	u64 addr = vcpu->arch.sie_block->gpsw.addr;
 	u64 cr9 = vcpu->arch.sie_block->gcr[9];
 	u64 cr10 = vcpu->arch.sie_block->gcr[10];
 	u64 cr11 = vcpu->arch.sie_block->gcr[11];
 	/* filter all events, demanded by the guest */
 	u8 guest_perc = perc & (cr9 >> 24) & PER_CODE_MASK;
+	unsigned long fetched_addr;
+	int rc;
 
 	if (!guest_per_enabled(vcpu))
 		guest_perc = 0;
@@ -478,9 +559,17 @@ static void filter_guest_per_event(struct kvm_vcpu *vcpu)
 		guest_perc &= ~PER_CODE_BRANCH;
 
 	/* filter "instruction-fetching" events */
-	if (guest_perc & PER_CODE_IFETCH &&
-	    !in_addr_range(peraddr, cr10, cr11))
-		guest_perc &= ~PER_CODE_IFETCH;
+	if (guest_perc & PER_CODE_IFETCH) {
+		rc = per_fetched_addr(vcpu, &fetched_addr);
+		if (rc < 0)
+			return rc;
+		/*
+		 * Don't inject an irq on exceptions. This would make handling
+		 * on icpt code 8 very complex (as PSW was already rewound).
+		 */
+		if (rc || !in_addr_range(fetched_addr, cr10, cr11))
+			guest_perc &= ~PER_CODE_IFETCH;
+	}
 
 	/* All other PER events will be given to the guest */
 	/* TODO: Check altered address/address space */
@@ -489,6 +578,7 @@ static void filter_guest_per_event(struct kvm_vcpu *vcpu)
 
 	if (!guest_perc)
 		vcpu->arch.sie_block->iprcc &= ~PGM_PER;
+	return 0;
 }
 
 #define pssec(vcpu) (vcpu->arch.sie_block->gcr[1] & _ASCE_SPACE_SWITCH)
@@ -496,14 +586,17 @@ static void filter_guest_per_event(struct kvm_vcpu *vcpu)
 #define old_ssec(vcpu) ((vcpu->arch.sie_block->tecmc >> 31) & 0x1)
 #define old_as_is_home(vcpu) !(vcpu->arch.sie_block->tecmc & 0xffff)
 
-void kvm_s390_handle_per_event(struct kvm_vcpu *vcpu)
+int kvm_s390_handle_per_event(struct kvm_vcpu *vcpu)
 {
-	int new_as;
+	int rc, new_as;
 
-	if (debug_exit_required(vcpu))
+	if (debug_exit_required(vcpu, vcpu->arch.sie_block->perc,
+				vcpu->arch.sie_block->peraddr))
 		vcpu->guest_debug |= KVM_GUESTDBG_EXIT_PENDING;
 
-	filter_guest_per_event(vcpu);
+	rc = filter_guest_per_event(vcpu);
+	if (rc)
+		return rc;
 
 	/*
 	 * Only RP, SAC, SACF, PT, PTI, PR, PC instructions can trigger
@@ -532,4 +625,5 @@ void kvm_s390_handle_per_event(struct kvm_vcpu *vcpu)
 		    (pssec(vcpu) || old_ssec(vcpu)))
 			vcpu->arch.sie_block->iprcc = PGM_SPACE_SWITCH;
 	}
+	return 0;
 }
diff --git a/arch/s390/kvm/intercept.c b/arch/s390/kvm/intercept.c
index 7a27eebab28a..59920f96ebc0 100644
--- a/arch/s390/kvm/intercept.c
+++ b/arch/s390/kvm/intercept.c
@@ -238,7 +238,9 @@ static int handle_prog(struct kvm_vcpu *vcpu)
 	vcpu->stat.exit_program_interruption++;
 
 	if (guestdbg_enabled(vcpu) && per_event(vcpu)) {
-		kvm_s390_handle_per_event(vcpu);
+		rc = kvm_s390_handle_per_event(vcpu);
+		if (rc)
+			return rc;
 		/* the interrupt might have been filtered out completely */
 		if (vcpu->arch.sie_block->iprcc == 0)
 			return 0;
@@ -359,6 +361,9 @@ static int handle_partial_execution(struct kvm_vcpu *vcpu)
 
 static int handle_operexc(struct kvm_vcpu *vcpu)
 {
+	psw_t oldpsw, newpsw;
+	int rc;
+
 	vcpu->stat.exit_operation_exception++;
 	trace_kvm_s390_handle_operexc(vcpu, vcpu->arch.sie_block->ipa,
 				      vcpu->arch.sie_block->ipb);
@@ -369,6 +374,24 @@ static int handle_operexc(struct kvm_vcpu *vcpu)
 
 	if (vcpu->arch.sie_block->ipa == 0 && vcpu->kvm->arch.user_instr0)
 		return -EOPNOTSUPP;
+	rc = read_guest_lc(vcpu, __LC_PGM_NEW_PSW, &newpsw, sizeof(psw_t));
+	if (rc)
+		return rc;
+	/*
+	 * Avoid endless loops of operation exceptions, if the pgm new
+	 * PSW will cause a new operation exception.
+	 * The heuristic checks if the pgm new psw is within 6 bytes before
+	 * the faulting psw address (with same DAT, AS settings) and the
+	 * new psw is not a wait psw and the fault was not triggered by
+	 * problem state.
+	 */
+	oldpsw = vcpu->arch.sie_block->gpsw;
+	if (oldpsw.addr - newpsw.addr <= 6 &&
+	    !(newpsw.mask & PSW_MASK_WAIT) &&
+	    !(oldpsw.mask & PSW_MASK_PSTATE) &&
+	    (newpsw.mask & PSW_MASK_ASC) == (oldpsw.mask & PSW_MASK_ASC) &&
+	    (newpsw.mask & PSW_MASK_DAT) == (oldpsw.mask & PSW_MASK_DAT))
+		return -EOPNOTSUPP;
 
 	return kvm_s390_inject_program_int(vcpu, PGM_OPERATION);
 }
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index 6484a250021e..502de74ea984 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -217,7 +217,7 @@ static void allow_cpu_feat(unsigned long nr)
 static inline int plo_test_bit(unsigned char nr)
 {
 	register unsigned long r0 asm("0") = (unsigned long) nr | 0x100;
-	int cc = 3; /* subfunction not available */
+	int cc;
 
 	asm volatile(
 		/* Parameter registers are ignored for "test bit" */
@@ -442,6 +442,9 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
 	struct kvm_memory_slot *memslot;
 	int is_dirty = 0;
 
+	if (kvm_is_ucontrol(kvm))
+		return -EINVAL;
+
 	mutex_lock(&kvm->slots_lock);
 
 	r = -EINVAL;
@@ -505,6 +508,14 @@ static int kvm_vm_ioctl_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap)
 		} else if (MACHINE_HAS_VX) {
 			set_kvm_facility(kvm->arch.model.fac_mask, 129);
 			set_kvm_facility(kvm->arch.model.fac_list, 129);
+			if (test_facility(134)) {
+				set_kvm_facility(kvm->arch.model.fac_mask, 134);
+				set_kvm_facility(kvm->arch.model.fac_list, 134);
+			}
+			if (test_facility(135)) {
+				set_kvm_facility(kvm->arch.model.fac_mask, 135);
+				set_kvm_facility(kvm->arch.model.fac_list, 135);
+			}
 			r = 0;
 		} else
 			r = -EINVAL;
@@ -821,6 +832,13 @@ static int kvm_s390_set_processor(struct kvm *kvm, struct kvm_device_attr *attr)
 		}
 		memcpy(kvm->arch.model.fac_list, proc->fac_list,
 		       S390_ARCH_FAC_LIST_SIZE_BYTE);
+		VM_EVENT(kvm, 3, "SET: guest ibc: 0x%4.4x, guest cpuid: 0x%16.16llx",
+			 kvm->arch.model.ibc,
+			 kvm->arch.model.cpuid);
+		VM_EVENT(kvm, 3, "SET: guest faclist: 0x%16.16llx.%16.16llx.%16.16llx",
+			 kvm->arch.model.fac_list[0],
+			 kvm->arch.model.fac_list[1],
+			 kvm->arch.model.fac_list[2]);
 	} else
 		ret = -EFAULT;
 	kfree(proc);
@@ -894,6 +912,13 @@ static int kvm_s390_get_processor(struct kvm *kvm, struct kvm_device_attr *attr)
 	proc->ibc = kvm->arch.model.ibc;
 	memcpy(&proc->fac_list, kvm->arch.model.fac_list,
 	       S390_ARCH_FAC_LIST_SIZE_BYTE);
+	VM_EVENT(kvm, 3, "GET: guest ibc: 0x%4.4x, guest cpuid: 0x%16.16llx",
+		 kvm->arch.model.ibc,
+		 kvm->arch.model.cpuid);
+	VM_EVENT(kvm, 3, "GET: guest faclist: 0x%16.16llx.%16.16llx.%16.16llx",
+		 kvm->arch.model.fac_list[0],
+		 kvm->arch.model.fac_list[1],
+		 kvm->arch.model.fac_list[2]);
 	if (copy_to_user((void __user *)attr->addr, proc, sizeof(*proc)))
 		ret = -EFAULT;
 	kfree(proc);
@@ -917,6 +942,17 @@ static int kvm_s390_get_machine(struct kvm *kvm, struct kvm_device_attr *attr)
 	       S390_ARCH_FAC_LIST_SIZE_BYTE);
 	memcpy((unsigned long *)&mach->fac_list, S390_lowcore.stfle_fac_list,
 	       sizeof(S390_lowcore.stfle_fac_list));
+	VM_EVENT(kvm, 3, "GET: host ibc:  0x%4.4x, host cpuid:  0x%16.16llx",
+		 kvm->arch.model.ibc,
+		 kvm->arch.model.cpuid);
+	VM_EVENT(kvm, 3, "GET: host facmask:  0x%16.16llx.%16.16llx.%16.16llx",
+		 mach->fac_mask[0],
+		 mach->fac_mask[1],
+		 mach->fac_mask[2]);
+	VM_EVENT(kvm, 3, "GET: host faclist:  0x%16.16llx.%16.16llx.%16.16llx",
+		 mach->fac_list[0],
+		 mach->fac_list[1],
+		 mach->fac_list[2]);
 	if (copy_to_user((void __user *)attr->addr, mach, sizeof(*mach)))
 		ret = -EFAULT;
 	kfree(mach);
@@ -1938,6 +1974,8 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
 
 	if (test_kvm_facility(vcpu->kvm, 8) && sclp.has_pfmfi)
 		vcpu->arch.sie_block->ecb2 |= 0x08;
+	if (test_kvm_facility(vcpu->kvm, 130))
+		vcpu->arch.sie_block->ecb2 |= 0x20;
 	vcpu->arch.sie_block->eca = 0x1002000U;
 	if (sclp.has_cei)
 		vcpu->arch.sie_block->eca |= 0x80000000U;
@@ -2578,7 +2616,7 @@ static int vcpu_post_run_fault_in_sie(struct kvm_vcpu *vcpu)
 	 * to look up the current opcode to get the length of the instruction
 	 * to be able to forward the PSW.
 	 */
-	rc = read_guest_instr(vcpu, &opcode, 1);
+	rc = read_guest_instr(vcpu, vcpu->arch.sie_block->gpsw.addr, &opcode, 1);
 	ilen = insn_length(opcode);
 	if (rc < 0) {
 		return rc;
diff --git a/arch/s390/kvm/kvm-s390.h b/arch/s390/kvm/kvm-s390.h
index 3a4e97f1a9e6..af9fa91a0c91 100644
--- a/arch/s390/kvm/kvm-s390.h
+++ b/arch/s390/kvm/kvm-s390.h
@@ -86,9 +86,7 @@ static inline void kvm_s390_set_prefix(struct kvm_vcpu *vcpu, u32 prefix)
 	kvm_make_request(KVM_REQ_MMU_RELOAD, vcpu);
 }
 
-typedef u8 __bitwise ar_t;
-
-static inline u64 kvm_s390_get_base_disp_s(struct kvm_vcpu *vcpu, ar_t *ar)
+static inline u64 kvm_s390_get_base_disp_s(struct kvm_vcpu *vcpu, u8 *ar)
 {
 	u32 base2 = vcpu->arch.sie_block->ipb >> 28;
 	u32 disp2 = ((vcpu->arch.sie_block->ipb & 0x0fff0000) >> 16);
@@ -101,7 +99,7 @@ static inline u64 kvm_s390_get_base_disp_s(struct kvm_vcpu *vcpu, ar_t *ar)
 
 static inline void kvm_s390_get_base_disp_sse(struct kvm_vcpu *vcpu,
 					      u64 *address1, u64 *address2,
-					      ar_t *ar_b1, ar_t *ar_b2)
+					      u8 *ar_b1, u8 *ar_b2)
 {
 	u32 base1 = (vcpu->arch.sie_block->ipb & 0xf0000000) >> 28;
 	u32 disp1 = (vcpu->arch.sie_block->ipb & 0x0fff0000) >> 16;
@@ -125,7 +123,7 @@ static inline void kvm_s390_get_regs_rre(struct kvm_vcpu *vcpu, int *r1, int *r2
 		*r2 = (vcpu->arch.sie_block->ipb & 0x000f0000) >> 16;
 }
 
-static inline u64 kvm_s390_get_base_disp_rsy(struct kvm_vcpu *vcpu, ar_t *ar)
+static inline u64 kvm_s390_get_base_disp_rsy(struct kvm_vcpu *vcpu, u8 *ar)
 {
 	u32 base2 = vcpu->arch.sie_block->ipb >> 28;
 	u32 disp2 = ((vcpu->arch.sie_block->ipb & 0x0fff0000) >> 16) +
@@ -140,7 +138,7 @@ static inline u64 kvm_s390_get_base_disp_rsy(struct kvm_vcpu *vcpu, ar_t *ar)
 	return (base2 ? vcpu->run->s.regs.gprs[base2] : 0) + (long)(int)disp2;
 }
 
-static inline u64 kvm_s390_get_base_disp_rs(struct kvm_vcpu *vcpu, ar_t *ar)
+static inline u64 kvm_s390_get_base_disp_rs(struct kvm_vcpu *vcpu, u8 *ar)
 {
 	u32 base2 = vcpu->arch.sie_block->ipb >> 28;
 	u32 disp2 = ((vcpu->arch.sie_block->ipb & 0x0fff0000) >> 16);
@@ -379,7 +377,7 @@ int kvm_s390_import_bp_data(struct kvm_vcpu *vcpu,
 void kvm_s390_clear_bp_data(struct kvm_vcpu *vcpu);
 void kvm_s390_prepare_debug_exit(struct kvm_vcpu *vcpu);
 int kvm_s390_handle_per_ifetch_icpt(struct kvm_vcpu *vcpu);
-void kvm_s390_handle_per_event(struct kvm_vcpu *vcpu);
+int kvm_s390_handle_per_event(struct kvm_vcpu *vcpu);
 
 /* support for Basic/Extended SCA handling */
 static inline union ipte_control *kvm_s390_get_ipte_control(struct kvm *kvm)
diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c
index 794503516bd4..fb4b494cde9b 100644
--- a/arch/s390/kvm/priv.c
+++ b/arch/s390/kvm/priv.c
@@ -54,7 +54,7 @@ int kvm_s390_handle_aa(struct kvm_vcpu *vcpu)
 static int handle_set_clock(struct kvm_vcpu *vcpu)
 {
 	int rc;
-	ar_t ar;
+	u8 ar;
 	u64 op2, val;
 
 	if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE)
@@ -79,7 +79,7 @@ static int handle_set_prefix(struct kvm_vcpu *vcpu)
 	u64 operand2;
 	u32 address;
 	int rc;
-	ar_t ar;
+	u8 ar;
 
 	vcpu->stat.instruction_spx++;
 
@@ -117,7 +117,7 @@ static int handle_store_prefix(struct kvm_vcpu *vcpu)
 	u64 operand2;
 	u32 address;
 	int rc;
-	ar_t ar;
+	u8 ar;
 
 	vcpu->stat.instruction_stpx++;
 
@@ -147,7 +147,7 @@ static int handle_store_cpu_address(struct kvm_vcpu *vcpu)
 	u16 vcpu_id = vcpu->vcpu_id;
 	u64 ga;
 	int rc;
-	ar_t ar;
+	u8 ar;
 
 	vcpu->stat.instruction_stap++;
 
@@ -380,7 +380,7 @@ static int handle_tpi(struct kvm_vcpu *vcpu)
 	u32 tpi_data[3];
 	int rc;
 	u64 addr;
-	ar_t ar;
+	u8 ar;
 
 	addr = kvm_s390_get_base_disp_s(vcpu, &ar);
 	if (addr & 3)
@@ -548,7 +548,7 @@ int kvm_s390_handle_lpsw(struct kvm_vcpu *vcpu)
 	psw_compat_t new_psw;
 	u64 addr;
 	int rc;
-	ar_t ar;
+	u8 ar;
 
 	if (gpsw->mask & PSW_MASK_PSTATE)
 		return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP);
@@ -575,7 +575,7 @@ static int handle_lpswe(struct kvm_vcpu *vcpu)
 	psw_t new_psw;
 	u64 addr;
 	int rc;
-	ar_t ar;
+	u8 ar;
 
 	if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE)
 		return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP);
@@ -597,7 +597,7 @@ static int handle_stidp(struct kvm_vcpu *vcpu)
 	u64 stidp_data = vcpu->kvm->arch.model.cpuid;
 	u64 operand2;
 	int rc;
-	ar_t ar;
+	u8 ar;
 
 	vcpu->stat.instruction_stidp++;
 
@@ -644,7 +644,7 @@ static void handle_stsi_3_2_2(struct kvm_vcpu *vcpu, struct sysinfo_3_2_2 *mem)
 	ASCEBC(mem->vm[0].cpi, 16);
 }
 
-static void insert_stsi_usr_data(struct kvm_vcpu *vcpu, u64 addr, ar_t ar,
+static void insert_stsi_usr_data(struct kvm_vcpu *vcpu, u64 addr, u8 ar,
 				 u8 fc, u8 sel1, u16 sel2)
 {
 	vcpu->run->exit_reason = KVM_EXIT_S390_STSI;
@@ -663,7 +663,7 @@ static int handle_stsi(struct kvm_vcpu *vcpu)
 	unsigned long mem = 0;
 	u64 operand2;
 	int rc = 0;
-	ar_t ar;
+	u8 ar;
 
 	vcpu->stat.instruction_stsi++;
 	VCPU_EVENT(vcpu, 3, "STSI: fc: %u sel1: %u sel2: %u", fc, sel1, sel2);
@@ -970,7 +970,7 @@ int kvm_s390_handle_lctl(struct kvm_vcpu *vcpu)
 	int reg, rc, nr_regs;
 	u32 ctl_array[16];
 	u64 ga;
-	ar_t ar;
+	u8 ar;
 
 	vcpu->stat.instruction_lctl++;
 
@@ -1009,7 +1009,7 @@ int kvm_s390_handle_stctl(struct kvm_vcpu *vcpu)
 	int reg, rc, nr_regs;
 	u32 ctl_array[16];
 	u64 ga;
-	ar_t ar;
+	u8 ar;
 
 	vcpu->stat.instruction_stctl++;
 
@@ -1043,7 +1043,7 @@ static int handle_lctlg(struct kvm_vcpu *vcpu)
 	int reg, rc, nr_regs;
 	u64 ctl_array[16];
 	u64 ga;
-	ar_t ar;
+	u8 ar;
 
 	vcpu->stat.instruction_lctlg++;
 
@@ -1081,7 +1081,7 @@ static int handle_stctg(struct kvm_vcpu *vcpu)
 	int reg, rc, nr_regs;
 	u64 ctl_array[16];
 	u64 ga;
-	ar_t ar;
+	u8 ar;
 
 	vcpu->stat.instruction_stctg++;
 
@@ -1132,7 +1132,7 @@ static int handle_tprot(struct kvm_vcpu *vcpu)
 	unsigned long hva, gpa;
 	int ret = 0, cc = 0;
 	bool writable;
-	ar_t ar;
+	u8 ar;
 
 	vcpu->stat.instruction_tprot++;
 
diff --git a/arch/s390/kvm/vsie.c b/arch/s390/kvm/vsie.c
index a9a9d974d9a4..38556e395915 100644
--- a/arch/s390/kvm/vsie.c
+++ b/arch/s390/kvm/vsie.c
@@ -324,6 +324,9 @@ static int shadow_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
 	/* Run-time-Instrumentation */
 	if (test_kvm_facility(vcpu->kvm, 64))
 		scb_s->ecb3 |= scb_o->ecb3 & 0x01U;
+	/* Instruction Execution Prevention */
+	if (test_kvm_facility(vcpu->kvm, 130))
+		scb_s->ecb2 |= scb_o->ecb2 & 0x20U;
 	if (test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_SIIF))
 		scb_s->eca |= scb_o->eca & 0x00000001U;
 	if (test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_IB))
diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c
index beb90f3993e6..b48dc5f1900b 100644
--- a/arch/s390/mm/pgtable.c
+++ b/arch/s390/mm/pgtable.c
@@ -744,7 +744,7 @@ int reset_guest_reference_bit(struct mm_struct *mm, unsigned long addr)
 
 	pgste_set_unlock(ptep, new);
 	pte_unmap_unlock(ptep, ptl);
-	return 0;
+	return cc;
 }
 EXPORT_SYMBOL(reset_guest_reference_bit);
 
diff --git a/arch/s390/tools/gen_facilities.c b/arch/s390/tools/gen_facilities.c
index 8cc53b1e6d03..0cf802de52a1 100644
--- a/arch/s390/tools/gen_facilities.c
+++ b/arch/s390/tools/gen_facilities.c
@@ -80,6 +80,8 @@ static struct facility_def facility_defs[] = {
 			76, /* msa extension 3 */
 			77, /* msa extension 4 */
 			78, /* enhanced-DAT 2 */
+			130, /* instruction-execution-protection */
+			131, /* enhanced-SOP 2 and side-effect */
 			-1  /* END */
 		}
 	},
diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h
index e9cd7befcb76..3e8c287090e4 100644
--- a/arch/x86/include/asm/kvm_emulate.h
+++ b/arch/x86/include/asm/kvm_emulate.h
@@ -441,5 +441,6 @@ int emulator_task_switch(struct x86_emulate_ctxt *ctxt,
 int emulate_int_real(struct x86_emulate_ctxt *ctxt, int irq);
 void emulator_invalidate_register_cache(struct x86_emulate_ctxt *ctxt);
 void emulator_writeback_register_cache(struct x86_emulate_ctxt *ctxt);
+bool emulator_can_use_gpa(struct x86_emulate_ctxt *ctxt);
 
 #endif /* _ASM_X86_KVM_X86_EMULATE_H */
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index a7066dc1a7e9..417502cf42b6 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -115,7 +115,7 @@ static inline gfn_t gfn_to_index(gfn_t gfn, gfn_t base_gfn, int level)
 
 #define KVM_PERMILLE_MMU_PAGES 20
 #define KVM_MIN_ALLOC_MMU_PAGES 64
-#define KVM_MMU_HASH_SHIFT 10
+#define KVM_MMU_HASH_SHIFT 12
 #define KVM_NUM_MMU_PAGES (1 << KVM_MMU_HASH_SHIFT)
 #define KVM_MIN_FREE_MMU_PAGES 5
 #define KVM_REFILL_PAGES 25
@@ -208,6 +208,13 @@ enum {
 				 PFERR_WRITE_MASK |		\
 				 PFERR_PRESENT_MASK)
 
+/*
+ * The mask used to denote special SPTEs, which can be either MMIO SPTEs or
+ * Access Tracking SPTEs. We use bit 62 instead of bit 63 to avoid conflicting
+ * with the SVE bit in EPT PTEs.
+ */
+#define SPTE_SPECIAL_MASK (1ULL << 62)
+
 /* apic attention bits */
 #define KVM_APIC_CHECK_VAPIC	0
 /*
@@ -668,6 +675,9 @@ struct kvm_vcpu_arch {
 
 	int pending_ioapic_eoi;
 	int pending_external_vector;
+
+	/* GPA available (AMD only) */
+	bool gpa_available;
 };
 
 struct kvm_lpage_info {
@@ -716,6 +726,12 @@ struct kvm_hv {
 	HV_REFERENCE_TSC_PAGE tsc_ref;
 };
 
+enum kvm_irqchip_mode {
+	KVM_IRQCHIP_NONE,
+	KVM_IRQCHIP_KERNEL,       /* created with KVM_CREATE_IRQCHIP */
+	KVM_IRQCHIP_SPLIT,        /* created with KVM_CAP_SPLIT_IRQCHIP */
+};
+
 struct kvm_arch {
 	unsigned int n_used_mmu_pages;
 	unsigned int n_requested_mmu_pages;
@@ -788,7 +804,7 @@ struct kvm_arch {
 
 	u64 disabled_quirks;
 
-	bool irqchip_split;
+	enum kvm_irqchip_mode irqchip_mode;
 	u8 nr_reserved_ioapic_pins;
 
 	bool disabled_lapic_found;
@@ -815,6 +831,7 @@ struct kvm_vm_stat {
 	ulong mmu_unsync;
 	ulong remote_tlb_flush;
 	ulong lpages;
+	ulong max_mmu_page_hash_collisions;
 };
 
 struct kvm_vcpu_stat {
@@ -844,6 +861,7 @@ struct kvm_vcpu_stat {
 	u64 hypercalls;
 	u64 irq_injections;
 	u64 nmi_injections;
+	u64 req_event;
 };
 
 struct x86_instruction_info;
@@ -1050,7 +1068,8 @@ void kvm_mmu_setup(struct kvm_vcpu *vcpu);
 void kvm_mmu_init_vm(struct kvm *kvm);
 void kvm_mmu_uninit_vm(struct kvm *kvm);
 void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
-		u64 dirty_mask, u64 nx_mask, u64 x_mask, u64 p_mask);
+		u64 dirty_mask, u64 nx_mask, u64 x_mask, u64 p_mask,
+		u64 acc_track_mask);
 
 void kvm_mmu_reset_context(struct kvm_vcpu *vcpu);
 void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
diff --git a/arch/x86/include/asm/kvmclock.h b/arch/x86/include/asm/kvmclock.h
new file mode 100644
index 000000000000..f260bef63591
--- /dev/null
+++ b/arch/x86/include/asm/kvmclock.h
@@ -0,0 +1,6 @@
+#ifndef _ASM_X86_KVM_CLOCK_H
+#define _ASM_X86_KVM_CLOCK_H
+
+extern struct clocksource kvm_clock;
+
+#endif /* _ASM_X86_KVM_CLOCK_H */
diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index 2b5b2d4b924e..cc54b7026567 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -467,8 +467,16 @@ enum vmcs_field {
 #define VMX_EPT_WRITABLE_MASK			0x2ull
 #define VMX_EPT_EXECUTABLE_MASK			0x4ull
 #define VMX_EPT_IPAT_BIT    			(1ull << 6)
-#define VMX_EPT_ACCESS_BIT				(1ull << 8)
-#define VMX_EPT_DIRTY_BIT				(1ull << 9)
+#define VMX_EPT_ACCESS_BIT			(1ull << 8)
+#define VMX_EPT_DIRTY_BIT			(1ull << 9)
+#define VMX_EPT_RWX_MASK                        (VMX_EPT_READABLE_MASK |       \
+						 VMX_EPT_WRITABLE_MASK |       \
+						 VMX_EPT_EXECUTABLE_MASK)
+#define VMX_EPT_MT_MASK				(7ull << VMX_EPT_MT_EPTE_SHIFT)
+
+/* The mask to use to trigger an EPT Misconfiguration in order to track MMIO */
+#define VMX_EPT_MISCONFIG_WX_VALUE		(VMX_EPT_WRITABLE_MASK |       \
+						 VMX_EPT_EXECUTABLE_MASK)
 
 #define VMX_EPT_IDENTITY_PAGETABLE_ADDR		0xfffbc000ul
 
@@ -500,6 +508,22 @@ struct vmx_msr_entry {
 #define ENTRY_FAIL_VMCS_LINK_PTR	4
 
 /*
+ * Exit Qualifications for EPT Violations
+ */
+#define EPT_VIOLATION_ACC_READ_BIT	0
+#define EPT_VIOLATION_ACC_WRITE_BIT	1
+#define EPT_VIOLATION_ACC_INSTR_BIT	2
+#define EPT_VIOLATION_READABLE_BIT	3
+#define EPT_VIOLATION_WRITABLE_BIT	4
+#define EPT_VIOLATION_EXECUTABLE_BIT	5
+#define EPT_VIOLATION_ACC_READ		(1 << EPT_VIOLATION_ACC_READ_BIT)
+#define EPT_VIOLATION_ACC_WRITE		(1 << EPT_VIOLATION_ACC_WRITE_BIT)
+#define EPT_VIOLATION_ACC_INSTR		(1 << EPT_VIOLATION_ACC_INSTR_BIT)
+#define EPT_VIOLATION_READABLE		(1 << EPT_VIOLATION_READABLE_BIT)
+#define EPT_VIOLATION_WRITABLE		(1 << EPT_VIOLATION_WRITABLE_BIT)
+#define EPT_VIOLATION_EXECUTABLE	(1 << EPT_VIOLATION_EXECUTABLE_BIT)
+
+/*
  * VM-instruction error numbers
  */
 enum vm_instruction_error_number {
diff --git a/arch/x86/include/uapi/asm/kvm_para.h b/arch/x86/include/uapi/asm/kvm_para.h
index 1421a6585126..cff0bb6556f8 100644
--- a/arch/x86/include/uapi/asm/kvm_para.h
+++ b/arch/x86/include/uapi/asm/kvm_para.h
@@ -50,6 +50,15 @@ struct kvm_steal_time {
 	__u32 pad[11];
 };
 
+#define KVM_CLOCK_PAIRING_WALLCLOCK 0
+struct kvm_clock_pairing {
+	__s64 sec;
+	__s64 nsec;
+	__u64 tsc;
+	__u32 flags;
+	__u32 pad[9];
+};
+
 #define KVM_STEAL_ALIGNMENT_BITS 5
 #define KVM_STEAL_VALID_BITS ((-1ULL << (KVM_STEAL_ALIGNMENT_BITS + 1)))
 #define KVM_STEAL_RESERVED_MASK (((1 << KVM_STEAL_ALIGNMENT_BITS) - 1 ) << 1)
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index 542710b99f52..bae6ea6cfb94 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -28,6 +28,7 @@
 
 #include <asm/x86_init.h>
 #include <asm/reboot.h>
+#include <asm/kvmclock.h>
 
 static int kvmclock __ro_after_init = 1;
 static int msr_kvm_system_time = MSR_KVM_SYSTEM_TIME;
@@ -49,6 +50,7 @@ struct pvclock_vsyscall_time_info *pvclock_pvti_cpu0_va(void)
 {
 	return hv_clock;
 }
+EXPORT_SYMBOL_GPL(pvclock_pvti_cpu0_va);
 
 /*
  * The wallclock is the time of day when we booted. Since then, some time may
@@ -174,13 +176,14 @@ bool kvm_check_and_clear_guest_paused(void)
 	return ret;
 }
 
-static struct clocksource kvm_clock = {
+struct clocksource kvm_clock = {
 	.name = "kvm-clock",
 	.read = kvm_clock_get_cycles,
 	.rating = 400,
 	.mask = CLOCKSOURCE_MASK(64),
 	.flags = CLOCK_SOURCE_IS_CONTINUOUS,
 };
+EXPORT_SYMBOL_GPL(kvm_clock);
 
 int kvm_register_clock(char *txt)
 {
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index e85f6bd7b9d5..c0e2036217ad 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -383,7 +383,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
 
 	/* cpuid 7.0.ecx*/
 	const u32 kvm_cpuid_7_0_ecx_x86_features =
-		F(AVX512VBMI) | F(PKU) | 0 /*OSPKE*/;
+		F(AVX512VBMI) | F(PKU) | 0 /*OSPKE*/ | F(AVX512_VPOPCNTDQ);
 
 	/* cpuid 7.0.edx*/
 	const u32 kvm_cpuid_7_0_edx_x86_features =
@@ -861,12 +861,6 @@ void kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx, u32 *ecx, u32 *edx)
 	if (!best)
 		best = check_cpuid_limit(vcpu, function, index);
 
-	/*
-	 * Perfmon not yet supported for L2 guest.
-	 */
-	if (is_guest_mode(vcpu) && function == 0xa)
-		best = NULL;
-
 	if (best) {
 		*eax = best->eax;
 		*ebx = best->ebx;
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index cedbba0f3402..45c7306c8780 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -173,6 +173,7 @@
 #define NearBranch  ((u64)1 << 52)  /* Near branches */
 #define No16	    ((u64)1 << 53)  /* No 16 bit operand */
 #define IncSP       ((u64)1 << 54)  /* SP is incremented before ModRM calc */
+#define TwoMemOp    ((u64)1 << 55)  /* Instruction has two memory operand */
 
 #define DstXacc     (DstAccLo | SrcAccHi | SrcWrite)
 
@@ -4298,7 +4299,7 @@ static const struct opcode group1[] = {
 };
 
 static const struct opcode group1A[] = {
-	I(DstMem | SrcNone | Mov | Stack | IncSP, em_pop), N, N, N, N, N, N, N,
+	I(DstMem | SrcNone | Mov | Stack | IncSP | TwoMemOp, em_pop), N, N, N, N, N, N, N,
 };
 
 static const struct opcode group2[] = {
@@ -4336,7 +4337,7 @@ static const struct opcode group5[] = {
 	I(SrcMemFAddr | ImplicitOps,		em_call_far),
 	I(SrcMem | NearBranch,			em_jmp_abs),
 	I(SrcMemFAddr | ImplicitOps,		em_jmp_far),
-	I(SrcMem | Stack,			em_push), D(Undefined),
+	I(SrcMem | Stack | TwoMemOp,		em_push), D(Undefined),
 };
 
 static const struct opcode group6[] = {
@@ -4556,8 +4557,8 @@ static const struct opcode opcode_table[256] = {
 	/* 0xA0 - 0xA7 */
 	I2bv(DstAcc | SrcMem | Mov | MemAbs, em_mov),
 	I2bv(DstMem | SrcAcc | Mov | MemAbs | PageTable, em_mov),
-	I2bv(SrcSI | DstDI | Mov | String, em_mov),
-	F2bv(SrcSI | DstDI | String | NoWrite, em_cmp_r),
+	I2bv(SrcSI | DstDI | Mov | String | TwoMemOp, em_mov),
+	F2bv(SrcSI | DstDI | String | NoWrite | TwoMemOp, em_cmp_r),
 	/* 0xA8 - 0xAF */
 	F2bv(DstAcc | SrcImm | NoWrite, em_test),
 	I2bv(SrcAcc | DstDI | Mov | String, em_mov),
@@ -5671,3 +5672,14 @@ void emulator_writeback_register_cache(struct x86_emulate_ctxt *ctxt)
 {
 	writeback_registers(ctxt);
 }
+
+bool emulator_can_use_gpa(struct x86_emulate_ctxt *ctxt)
+{
+	if (ctxt->rep_prefix && (ctxt->d & String))
+		return false;
+
+	if (ctxt->d & TwoMemOp)
+		return false;
+
+	return true;
+}
diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c
index 1572c35b4f1a..08b27e0c7b71 100644
--- a/arch/x86/kvm/hyperv.c
+++ b/arch/x86/kvm/hyperv.c
@@ -305,13 +305,13 @@ static int synic_set_irq(struct kvm_vcpu_hv_synic *synic, u32 sint)
 		return -ENOENT;
 
 	memset(&irq, 0, sizeof(irq));
-	irq.dest_id = kvm_apic_id(vcpu->arch.apic);
+	irq.shorthand = APIC_DEST_SELF;
 	irq.dest_mode = APIC_DEST_PHYSICAL;
 	irq.delivery_mode = APIC_DM_FIXED;
 	irq.vector = vector;
 	irq.level = 1;
 
-	ret = kvm_irq_delivery_to_apic(vcpu->kvm, NULL, &irq, NULL);
+	ret = kvm_irq_delivery_to_apic(vcpu->kvm, vcpu->arch.apic, &irq, NULL);
 	trace_kvm_hv_synic_set_irq(vcpu->vcpu_id, sint, irq.vector, ret);
 	return ret;
 }
diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c
index 7cc2360f1848..73ea24d4f119 100644
--- a/arch/x86/kvm/i8259.c
+++ b/arch/x86/kvm/i8259.c
@@ -598,14 +598,14 @@ static const struct kvm_io_device_ops picdev_eclr_ops = {
 	.write    = picdev_eclr_write,
 };
 
-struct kvm_pic *kvm_create_pic(struct kvm *kvm)
+int kvm_pic_init(struct kvm *kvm)
 {
 	struct kvm_pic *s;
 	int ret;
 
 	s = kzalloc(sizeof(struct kvm_pic), GFP_KERNEL);
 	if (!s)
-		return NULL;
+		return -ENOMEM;
 	spin_lock_init(&s->lock);
 	s->kvm = kvm;
 	s->pics[0].elcr_mask = 0xf8;
@@ -635,7 +635,9 @@ struct kvm_pic *kvm_create_pic(struct kvm *kvm)
 
 	mutex_unlock(&kvm->slots_lock);
 
-	return s;
+	kvm->arch.vpic = s;
+
+	return 0;
 
 fail_unreg_1:
 	kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &s->dev_slave);
@@ -648,13 +650,17 @@ fail_unlock:
 
 	kfree(s);
 
-	return NULL;
+	return ret;
 }
 
-void kvm_destroy_pic(struct kvm_pic *vpic)
+void kvm_pic_destroy(struct kvm *kvm)
 {
+	struct kvm_pic *vpic = kvm->arch.vpic;
+
 	kvm_io_bus_unregister_dev(vpic->kvm, KVM_PIO_BUS, &vpic->dev_master);
 	kvm_io_bus_unregister_dev(vpic->kvm, KVM_PIO_BUS, &vpic->dev_slave);
 	kvm_io_bus_unregister_dev(vpic->kvm, KVM_PIO_BUS, &vpic->dev_eclr);
+
+	kvm->arch.vpic = NULL;
 	kfree(vpic);
 }
diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h
index 035731eb3897..40d5b2cf6061 100644
--- a/arch/x86/kvm/irq.h
+++ b/arch/x86/kvm/irq.h
@@ -73,8 +73,8 @@ struct kvm_pic {
 	unsigned long irq_states[PIC_NUM_PINS];
 };
 
-struct kvm_pic *kvm_create_pic(struct kvm *kvm);
-void kvm_destroy_pic(struct kvm_pic *vpic);
+int kvm_pic_init(struct kvm *kvm);
+void kvm_pic_destroy(struct kvm *kvm);
 int kvm_pic_read_irq(struct kvm *kvm);
 void kvm_pic_update_irq(struct kvm_pic *s);
 
@@ -93,18 +93,19 @@ static inline int pic_in_kernel(struct kvm *kvm)
 
 static inline int irqchip_split(struct kvm *kvm)
 {
-	return kvm->arch.irqchip_split;
+	return kvm->arch.irqchip_mode == KVM_IRQCHIP_SPLIT;
 }
 
-static inline int irqchip_in_kernel(struct kvm *kvm)
+static inline int irqchip_kernel(struct kvm *kvm)
 {
-	struct kvm_pic *vpic = pic_irqchip(kvm);
-	bool ret;
+	return kvm->arch.irqchip_mode == KVM_IRQCHIP_KERNEL;
+}
 
-	ret = (vpic != NULL);
-	ret |= irqchip_split(kvm);
+static inline int irqchip_in_kernel(struct kvm *kvm)
+{
+	bool ret = kvm->arch.irqchip_mode != KVM_IRQCHIP_NONE;
 
-	/* Read vpic before kvm->irq_routing.  */
+	/* Matches with wmb after initializing kvm->irq_routing. */
 	smp_rmb();
 	return ret;
 }
diff --git a/arch/x86/kvm/irq_comm.c b/arch/x86/kvm/irq_comm.c
index 6c0191615f23..b96d3893f121 100644
--- a/arch/x86/kvm/irq_comm.c
+++ b/arch/x86/kvm/irq_comm.c
@@ -41,15 +41,6 @@ static int kvm_set_pic_irq(struct kvm_kernel_irq_routing_entry *e,
 			   bool line_status)
 {
 	struct kvm_pic *pic = pic_irqchip(kvm);
-
-	/*
-	 * XXX: rejecting pic routes when pic isn't in use would be better,
-	 * but the default routing table is installed while kvm->arch.vpic is
-	 * NULL and KVM_CREATE_IRQCHIP can race with KVM_IRQ_LINE.
-	 */
-	if (!pic)
-		return -1;
-
 	return kvm_pic_set_irq(pic, e->irqchip.pin, irq_source_id, level);
 }
 
@@ -58,10 +49,6 @@ static int kvm_set_ioapic_irq(struct kvm_kernel_irq_routing_entry *e,
 			      bool line_status)
 {
 	struct kvm_ioapic *ioapic = kvm->arch.vioapic;
-
-	if (!ioapic)
-		return -1;
-
 	return kvm_ioapic_set_irq(ioapic, e->irqchip.pin, irq_source_id, level,
 				line_status);
 }
@@ -297,16 +284,20 @@ int kvm_set_routing_entry(struct kvm *kvm,
 	case KVM_IRQ_ROUTING_IRQCHIP:
 		delta = 0;
 		switch (ue->u.irqchip.irqchip) {
-		case KVM_IRQCHIP_PIC_MASTER:
-			e->set = kvm_set_pic_irq;
-			max_pin = PIC_NUM_PINS;
-			break;
 		case KVM_IRQCHIP_PIC_SLAVE:
+			delta = 8;
+			/* fall through */
+		case KVM_IRQCHIP_PIC_MASTER:
+			if (!pic_in_kernel(kvm))
+				goto out;
+
 			e->set = kvm_set_pic_irq;
 			max_pin = PIC_NUM_PINS;
-			delta = 8;
 			break;
 		case KVM_IRQCHIP_IOAPIC:
+			if (!ioapic_in_kernel(kvm))
+				goto out;
+
 			max_pin = KVM_IOAPIC_NUM_PINS;
 			e->set = kvm_set_ioapic_irq;
 			break;
@@ -409,7 +400,7 @@ int kvm_setup_empty_irq_routing(struct kvm *kvm)
 
 void kvm_arch_post_irq_routing_update(struct kvm *kvm)
 {
-	if (ioapic_in_kernel(kvm) || !irqchip_in_kernel(kvm))
+	if (!irqchip_split(kvm))
 		return;
 	kvm_make_scan_ioapic_request(kvm);
 }
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 2f6ef5121a4c..33b799fd3a6e 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -115,6 +115,16 @@ static inline int apic_enabled(struct kvm_lapic *apic)
 	(LVT_MASK | APIC_MODE_MASK | APIC_INPUT_POLARITY | \
 	 APIC_LVT_REMOTE_IRR | APIC_LVT_LEVEL_TRIGGER)
 
+static inline u8 kvm_xapic_id(struct kvm_lapic *apic)
+{
+	return kvm_lapic_get_reg(apic, APIC_ID) >> 24;
+}
+
+static inline u32 kvm_x2apic_id(struct kvm_lapic *apic)
+{
+	return apic->vcpu->vcpu_id;
+}
+
 static inline bool kvm_apic_map_get_logical_dest(struct kvm_apic_map *map,
 		u32 dest_id, struct kvm_lapic ***cluster, u16 *mask) {
 	switch (map->mode) {
@@ -159,13 +169,13 @@ static void recalculate_apic_map(struct kvm *kvm)
 	struct kvm_apic_map *new, *old = NULL;
 	struct kvm_vcpu *vcpu;
 	int i;
-	u32 max_id = 255;
+	u32 max_id = 255; /* enough space for any xAPIC ID */
 
 	mutex_lock(&kvm->arch.apic_map_lock);
 
 	kvm_for_each_vcpu(i, vcpu, kvm)
 		if (kvm_apic_present(vcpu))
-			max_id = max(max_id, kvm_apic_id(vcpu->arch.apic));
+			max_id = max(max_id, kvm_x2apic_id(vcpu->arch.apic));
 
 	new = kvm_kvzalloc(sizeof(struct kvm_apic_map) +
 	                   sizeof(struct kvm_lapic *) * ((u64)max_id + 1));
@@ -179,16 +189,28 @@ static void recalculate_apic_map(struct kvm *kvm)
 		struct kvm_lapic *apic = vcpu->arch.apic;
 		struct kvm_lapic **cluster;
 		u16 mask;
-		u32 ldr, aid;
+		u32 ldr;
+		u8 xapic_id;
+		u32 x2apic_id;
 
 		if (!kvm_apic_present(vcpu))
 			continue;
 
-		aid = kvm_apic_id(apic);
-		ldr = kvm_lapic_get_reg(apic, APIC_LDR);
+		xapic_id = kvm_xapic_id(apic);
+		x2apic_id = kvm_x2apic_id(apic);
 
-		if (aid <= new->max_apic_id)
-			new->phys_map[aid] = apic;
+		/* Hotplug hack: see kvm_apic_match_physical_addr(), ... */
+		if ((apic_x2apic_mode(apic) || x2apic_id > 0xff) &&
+				x2apic_id <= new->max_apic_id)
+			new->phys_map[x2apic_id] = apic;
+		/*
+		 * ... xAPIC ID of VCPUs with APIC ID > 0xff will wrap-around,
+		 * prevent them from masking VCPUs with APIC ID <= 0xff.
+		 */
+		if (!apic_x2apic_mode(apic) && !new->phys_map[xapic_id])
+			new->phys_map[xapic_id] = apic;
+
+		ldr = kvm_lapic_get_reg(apic, APIC_LDR);
 
 		if (apic_x2apic_mode(apic)) {
 			new->mode |= KVM_APIC_MODE_X2APIC;
@@ -250,6 +272,8 @@ static inline void kvm_apic_set_x2apic_id(struct kvm_lapic *apic, u32 id)
 {
 	u32 ldr = ((id >> 4) << 16) | (1 << (id & 0xf));
 
+	WARN_ON_ONCE(id != apic->vcpu->vcpu_id);
+
 	kvm_lapic_set_reg(apic, APIC_ID, id);
 	kvm_lapic_set_reg(apic, APIC_LDR, ldr);
 	recalculate_apic_map(apic->vcpu->kvm);
@@ -546,7 +570,15 @@ static void pv_eoi_clr_pending(struct kvm_vcpu *vcpu)
 	__clear_bit(KVM_APIC_PV_EOI_PENDING, &vcpu->arch.apic_attention);
 }
 
-static void apic_update_ppr(struct kvm_lapic *apic)
+static int apic_has_interrupt_for_ppr(struct kvm_lapic *apic, u32 ppr)
+{
+	int highest_irr = apic_find_highest_irr(apic);
+	if (highest_irr == -1 || (highest_irr & 0xF0) <= ppr)
+		return -1;
+	return highest_irr;
+}
+
+static bool __apic_update_ppr(struct kvm_lapic *apic, u32 *new_ppr)
 {
 	u32 tpr, isrv, ppr, old_ppr;
 	int isr;
@@ -564,12 +596,27 @@ static void apic_update_ppr(struct kvm_lapic *apic)
 	apic_debug("vlapic %p, ppr 0x%x, isr 0x%x, isrv 0x%x",
 		   apic, ppr, isr, isrv);
 
-	if (old_ppr != ppr) {
+	*new_ppr = ppr;
+	if (old_ppr != ppr)
 		kvm_lapic_set_reg(apic, APIC_PROCPRI, ppr);
-		if (ppr < old_ppr)
-			kvm_make_request(KVM_REQ_EVENT, apic->vcpu);
-	}
+
+	return ppr < old_ppr;
+}
+
+static void apic_update_ppr(struct kvm_lapic *apic)
+{
+	u32 ppr;
+
+	if (__apic_update_ppr(apic, &ppr) &&
+	    apic_has_interrupt_for_ppr(apic, ppr) != -1)
+		kvm_make_request(KVM_REQ_EVENT, apic->vcpu);
+}
+
+void kvm_apic_update_ppr(struct kvm_vcpu *vcpu)
+{
+	apic_update_ppr(vcpu->arch.apic);
 }
+EXPORT_SYMBOL_GPL(kvm_apic_update_ppr);
 
 static void apic_set_tpr(struct kvm_lapic *apic, u32 tpr)
 {
@@ -579,10 +626,8 @@ static void apic_set_tpr(struct kvm_lapic *apic, u32 tpr)
 
 static bool kvm_apic_broadcast(struct kvm_lapic *apic, u32 mda)
 {
-	if (apic_x2apic_mode(apic))
-		return mda == X2APIC_BROADCAST;
-
-	return GET_APIC_DEST_FIELD(mda) == APIC_BROADCAST;
+	return mda == (apic_x2apic_mode(apic) ?
+			X2APIC_BROADCAST : APIC_BROADCAST);
 }
 
 static bool kvm_apic_match_physical_addr(struct kvm_lapic *apic, u32 mda)
@@ -591,9 +636,18 @@ static bool kvm_apic_match_physical_addr(struct kvm_lapic *apic, u32 mda)
 		return true;
 
 	if (apic_x2apic_mode(apic))
-		return mda == kvm_apic_id(apic);
+		return mda == kvm_x2apic_id(apic);
+
+	/*
+	 * Hotplug hack: Make LAPIC in xAPIC mode also accept interrupts as if
+	 * it were in x2APIC mode.  Hotplugged VCPUs start in xAPIC mode and
+	 * this allows unique addressing of VCPUs with APIC ID over 0xff.
+	 * The 0xff condition is needed because writeable xAPIC ID.
+	 */
+	if (kvm_x2apic_id(apic) > 0xff && mda == kvm_x2apic_id(apic))
+		return true;
 
-	return mda == SET_APIC_DEST_FIELD(kvm_apic_id(apic));
+	return mda == kvm_xapic_id(apic);
 }
 
 static bool kvm_apic_match_logical_addr(struct kvm_lapic *apic, u32 mda)
@@ -610,7 +664,6 @@ static bool kvm_apic_match_logical_addr(struct kvm_lapic *apic, u32 mda)
 		       && (logical_id & mda & 0xffff) != 0;
 
 	logical_id = GET_APIC_LOGICAL_ID(logical_id);
-	mda = GET_APIC_DEST_FIELD(mda);
 
 	switch (kvm_lapic_get_reg(apic, APIC_DFR)) {
 	case APIC_DFR_FLAT:
@@ -627,9 +680,9 @@ static bool kvm_apic_match_logical_addr(struct kvm_lapic *apic, u32 mda)
 
 /* The KVM local APIC implementation has two quirks:
  *
- *  - the xAPIC MDA stores the destination at bits 24-31, while this
- *    is not true of struct kvm_lapic_irq's dest_id field.  This is
- *    just a quirk in the API and is not problematic.
+ *  - Real hardware delivers interrupts destined to x2APIC ID > 0xff to LAPICs
+ *    in xAPIC mode if the "destination & 0xff" matches its xAPIC ID.
+ *    KVM doesn't do that aliasing.
  *
  *  - in-kernel IOAPIC messages have to be delivered directly to
  *    x2APIC, because the kernel does not support interrupt remapping.
@@ -645,13 +698,12 @@ static u32 kvm_apic_mda(struct kvm_vcpu *vcpu, unsigned int dest_id,
 		struct kvm_lapic *source, struct kvm_lapic *target)
 {
 	bool ipi = source != NULL;
-	bool x2apic_mda = apic_x2apic_mode(ipi ? source : target);
 
 	if (!vcpu->kvm->arch.x2apic_broadcast_quirk_disabled &&
-	    !ipi && dest_id == APIC_BROADCAST && x2apic_mda)
+	    !ipi && dest_id == APIC_BROADCAST && apic_x2apic_mode(target))
 		return X2APIC_BROADCAST;
 
-	return x2apic_mda ? dest_id : SET_APIC_DEST_FIELD(dest_id);
+	return dest_id;
 }
 
 bool kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source,
@@ -1907,9 +1959,9 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event)
 	vcpu->arch.apic_arb_prio = 0;
 	vcpu->arch.apic_attention = 0;
 
-	apic_debug("%s: vcpu=%p, id=%d, base_msr="
+	apic_debug("%s: vcpu=%p, id=0x%x, base_msr="
 		   "0x%016" PRIx64 ", base_address=0x%0lx.\n", __func__,
-		   vcpu, kvm_apic_id(apic),
+		   vcpu, kvm_lapic_get_reg(apic, APIC_ID),
 		   vcpu->arch.apic_base, apic->base_address);
 }
 
@@ -2021,17 +2073,13 @@ nomem:
 int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu)
 {
 	struct kvm_lapic *apic = vcpu->arch.apic;
-	int highest_irr;
+	u32 ppr;
 
 	if (!apic_enabled(apic))
 		return -1;
 
-	apic_update_ppr(apic);
-	highest_irr = apic_find_highest_irr(apic);
-	if ((highest_irr == -1) ||
-	    ((highest_irr & 0xF0) <= kvm_lapic_get_reg(apic, APIC_PROCPRI)))
-		return -1;
-	return highest_irr;
+	__apic_update_ppr(apic, &ppr);
+	return apic_has_interrupt_for_ppr(apic, ppr);
 }
 
 int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu)
@@ -2067,6 +2115,7 @@ int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu)
 {
 	int vector = kvm_apic_has_interrupt(vcpu);
 	struct kvm_lapic *apic = vcpu->arch.apic;
+	u32 ppr;
 
 	if (vector == -1)
 		return -1;
@@ -2078,13 +2127,23 @@ int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu)
 	 * because the process would deliver it through the IDT.
 	 */
 
-	apic_set_isr(vector, apic);
-	apic_update_ppr(apic);
 	apic_clear_irr(vector, apic);
-
 	if (test_bit(vector, vcpu_to_synic(vcpu)->auto_eoi_bitmap)) {
-		apic_clear_isr(vector, apic);
+		/*
+		 * For auto-EOI interrupts, there might be another pending
+		 * interrupt above PPR, so check whether to raise another
+		 * KVM_REQ_EVENT.
+		 */
 		apic_update_ppr(apic);
+	} else {
+		/*
+		 * For normal interrupts, PPR has been raised and there cannot
+		 * be a higher-priority pending interrupt---except if there was
+		 * a concurrent interrupt injection, but that would have
+		 * triggered KVM_REQ_EVENT already.
+		 */
+		apic_set_isr(vector, apic);
+		__apic_update_ppr(apic, &ppr);
 	}
 
 	return vector;
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index ff8039d61672..05abd837b78a 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -73,6 +73,7 @@ bool kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source,
 
 void __kvm_apic_update_irr(u32 *pir, void *regs);
 void kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir);
+void kvm_apic_update_ppr(struct kvm_vcpu *vcpu);
 int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq,
 		     struct dest_map *dest_map);
 int kvm_apic_local_deliver(struct kvm_lapic *apic, int lvt_type);
@@ -203,17 +204,6 @@ static inline int kvm_lapic_latched_init(struct kvm_vcpu *vcpu)
 	return lapic_in_kernel(vcpu) && test_bit(KVM_APIC_INIT, &vcpu->arch.apic->pending_events);
 }
 
-static inline u32 kvm_apic_id(struct kvm_lapic *apic)
-{
-	/* To avoid a race between apic_base and following APIC_ID update when
-	 * switching to x2apic_mode, the x2apic mode returns initial x2apic id.
-	 */
-	if (apic_x2apic_mode(apic))
-		return apic->vcpu->vcpu_id;
-
-	return kvm_lapic_get_reg(apic, APIC_ID) >> 24;
-}
-
 bool kvm_apic_pending_eoi(struct kvm_vcpu *vcpu, int vector);
 
 void wait_lapic_expire(struct kvm_vcpu *vcpu);
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 7012de4a1fed..2fd7586aad4d 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -37,6 +37,8 @@
 #include <linux/srcu.h>
 #include <linux/slab.h>
 #include <linux/uaccess.h>
+#include <linux/hash.h>
+#include <linux/kern_levels.h>
 
 #include <asm/page.h>
 #include <asm/cmpxchg.h>
@@ -129,6 +131,10 @@ module_param(dbg, bool, 0644);
 #define ACC_USER_MASK    PT_USER_MASK
 #define ACC_ALL          (ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK)
 
+/* The mask for the R/X bits in EPT PTEs */
+#define PT64_EPT_READABLE_MASK			0x1ull
+#define PT64_EPT_EXECUTABLE_MASK		0x4ull
+
 #include <trace/events/kvm.h>
 
 #define CREATE_TRACE_POINTS
@@ -178,15 +184,40 @@ static u64 __read_mostly shadow_dirty_mask;
 static u64 __read_mostly shadow_mmio_mask;
 static u64 __read_mostly shadow_present_mask;
 
+/*
+ * The mask/value to distinguish a PTE that has been marked not-present for
+ * access tracking purposes.
+ * The mask would be either 0 if access tracking is disabled, or
+ * SPTE_SPECIAL_MASK|VMX_EPT_RWX_MASK if access tracking is enabled.
+ */
+static u64 __read_mostly shadow_acc_track_mask;
+static const u64 shadow_acc_track_value = SPTE_SPECIAL_MASK;
+
+/*
+ * The mask/shift to use for saving the original R/X bits when marking the PTE
+ * as not-present for access tracking purposes. We do not save the W bit as the
+ * PTEs being access tracked also need to be dirty tracked, so the W bit will be
+ * restored only when a write is attempted to the page.
+ */
+static const u64 shadow_acc_track_saved_bits_mask = PT64_EPT_READABLE_MASK |
+						    PT64_EPT_EXECUTABLE_MASK;
+static const u64 shadow_acc_track_saved_bits_shift = PT64_SECOND_AVAIL_BITS_SHIFT;
+
 static void mmu_spte_set(u64 *sptep, u64 spte);
 static void mmu_free_roots(struct kvm_vcpu *vcpu);
 
 void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask)
 {
-	shadow_mmio_mask = mmio_mask;
+	shadow_mmio_mask = mmio_mask | SPTE_SPECIAL_MASK;
 }
 EXPORT_SYMBOL_GPL(kvm_mmu_set_mmio_spte_mask);
 
+static inline bool is_access_track_spte(u64 spte)
+{
+	/* Always false if shadow_acc_track_mask is zero.  */
+	return (spte & shadow_acc_track_mask) == shadow_acc_track_value;
+}
+
 /*
  * the low bit of the generation number is always presumed to be zero.
  * This disables mmio caching during memslot updates.  The concept is
@@ -284,17 +315,35 @@ static bool check_mmio_spte(struct kvm_vcpu *vcpu, u64 spte)
 }
 
 void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
-		u64 dirty_mask, u64 nx_mask, u64 x_mask, u64 p_mask)
+		u64 dirty_mask, u64 nx_mask, u64 x_mask, u64 p_mask,
+		u64 acc_track_mask)
 {
+	if (acc_track_mask != 0)
+		acc_track_mask |= SPTE_SPECIAL_MASK;
+
 	shadow_user_mask = user_mask;
 	shadow_accessed_mask = accessed_mask;
 	shadow_dirty_mask = dirty_mask;
 	shadow_nx_mask = nx_mask;
 	shadow_x_mask = x_mask;
 	shadow_present_mask = p_mask;
+	shadow_acc_track_mask = acc_track_mask;
+	WARN_ON(shadow_accessed_mask != 0 && shadow_acc_track_mask != 0);
 }
 EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes);
 
+void kvm_mmu_clear_all_pte_masks(void)
+{
+	shadow_user_mask = 0;
+	shadow_accessed_mask = 0;
+	shadow_dirty_mask = 0;
+	shadow_nx_mask = 0;
+	shadow_x_mask = 0;
+	shadow_mmio_mask = 0;
+	shadow_present_mask = 0;
+	shadow_acc_track_mask = 0;
+}
+
 static int is_cpuid_PSE36(void)
 {
 	return 1;
@@ -307,7 +356,7 @@ static int is_nx(struct kvm_vcpu *vcpu)
 
 static int is_shadow_present_pte(u64 pte)
 {
-	return (pte & 0xFFFFFFFFull) && !is_mmio_spte(pte);
+	return (pte != 0) && !is_mmio_spte(pte);
 }
 
 static int is_large_pte(u64 pte)
@@ -324,6 +373,11 @@ static int is_last_spte(u64 pte, int level)
 	return 0;
 }
 
+static bool is_executable_pte(u64 spte)
+{
+	return (spte & (shadow_x_mask | shadow_nx_mask)) == shadow_x_mask;
+}
+
 static kvm_pfn_t spte_to_pfn(u64 pte)
 {
 	return (pte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT;
@@ -473,7 +527,7 @@ retry:
 }
 #endif
 
-static bool spte_is_locklessly_modifiable(u64 spte)
+static bool spte_can_locklessly_be_made_writable(u64 spte)
 {
 	return (spte & (SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE)) ==
 		(SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE);
@@ -481,36 +535,38 @@ static bool spte_is_locklessly_modifiable(u64 spte)
 
 static bool spte_has_volatile_bits(u64 spte)
 {
+	if (!is_shadow_present_pte(spte))
+		return false;
+
 	/*
 	 * Always atomically update spte if it can be updated
 	 * out of mmu-lock, it can ensure dirty bit is not lost,
 	 * also, it can help us to get a stable is_writable_pte()
 	 * to ensure tlb flush is not missed.
 	 */
-	if (spte_is_locklessly_modifiable(spte))
+	if (spte_can_locklessly_be_made_writable(spte) ||
+	    is_access_track_spte(spte))
 		return true;
 
-	if (!shadow_accessed_mask)
-		return false;
-
-	if (!is_shadow_present_pte(spte))
-		return false;
-
-	if ((spte & shadow_accessed_mask) &&
-	      (!is_writable_pte(spte) || (spte & shadow_dirty_mask)))
-		return false;
+	if (shadow_accessed_mask) {
+		if ((spte & shadow_accessed_mask) == 0 ||
+	    	    (is_writable_pte(spte) && (spte & shadow_dirty_mask) == 0))
+			return true;
+	}
 
-	return true;
+	return false;
 }
 
-static bool spte_is_bit_cleared(u64 old_spte, u64 new_spte, u64 bit_mask)
+static bool is_accessed_spte(u64 spte)
 {
-	return (old_spte & bit_mask) && !(new_spte & bit_mask);
+	return shadow_accessed_mask ? spte & shadow_accessed_mask
+				    : !is_access_track_spte(spte);
 }
 
-static bool spte_is_bit_changed(u64 old_spte, u64 new_spte, u64 bit_mask)
+static bool is_dirty_spte(u64 spte)
 {
-	return (old_spte & bit_mask) != (new_spte & bit_mask);
+	return shadow_dirty_mask ? spte & shadow_dirty_mask
+				 : spte & PT_WRITABLE_MASK;
 }
 
 /* Rules for using mmu_spte_set:
@@ -525,25 +581,19 @@ static void mmu_spte_set(u64 *sptep, u64 new_spte)
 	__set_spte(sptep, new_spte);
 }
 
-/* Rules for using mmu_spte_update:
- * Update the state bits, it means the mapped pfn is not changed.
- *
- * Whenever we overwrite a writable spte with a read-only one we
- * should flush remote TLBs. Otherwise rmap_write_protect
- * will find a read-only spte, even though the writable spte
- * might be cached on a CPU's TLB, the return value indicates this
- * case.
+/*
+ * Update the SPTE (excluding the PFN), but do not track changes in its
+ * accessed/dirty status.
  */
-static bool mmu_spte_update(u64 *sptep, u64 new_spte)
+static u64 mmu_spte_update_no_track(u64 *sptep, u64 new_spte)
 {
 	u64 old_spte = *sptep;
-	bool ret = false;
 
 	WARN_ON(!is_shadow_present_pte(new_spte));
 
 	if (!is_shadow_present_pte(old_spte)) {
 		mmu_spte_set(sptep, new_spte);
-		return ret;
+		return old_spte;
 	}
 
 	if (!spte_has_volatile_bits(old_spte))
@@ -551,45 +601,62 @@ static bool mmu_spte_update(u64 *sptep, u64 new_spte)
 	else
 		old_spte = __update_clear_spte_slow(sptep, new_spte);
 
+	WARN_ON(spte_to_pfn(old_spte) != spte_to_pfn(new_spte));
+
+	return old_spte;
+}
+
+/* Rules for using mmu_spte_update:
+ * Update the state bits, it means the mapped pfn is not changed.
+ *
+ * Whenever we overwrite a writable spte with a read-only one we
+ * should flush remote TLBs. Otherwise rmap_write_protect
+ * will find a read-only spte, even though the writable spte
+ * might be cached on a CPU's TLB, the return value indicates this
+ * case.
+ *
+ * Returns true if the TLB needs to be flushed
+ */
+static bool mmu_spte_update(u64 *sptep, u64 new_spte)
+{
+	bool flush = false;
+	u64 old_spte = mmu_spte_update_no_track(sptep, new_spte);
+
+	if (!is_shadow_present_pte(old_spte))
+		return false;
+
 	/*
 	 * For the spte updated out of mmu-lock is safe, since
 	 * we always atomically update it, see the comments in
 	 * spte_has_volatile_bits().
 	 */
-	if (spte_is_locklessly_modifiable(old_spte) &&
+	if (spte_can_locklessly_be_made_writable(old_spte) &&
 	      !is_writable_pte(new_spte))
-		ret = true;
-
-	if (!shadow_accessed_mask) {
-		/*
-		 * We don't set page dirty when dropping non-writable spte.
-		 * So do it now if the new spte is becoming non-writable.
-		 */
-		if (ret)
-			kvm_set_pfn_dirty(spte_to_pfn(old_spte));
-		return ret;
-	}
+		flush = true;
 
 	/*
-	 * Flush TLB when accessed/dirty bits are changed in the page tables,
+	 * Flush TLB when accessed/dirty states are changed in the page tables,
 	 * to guarantee consistency between TLB and page tables.
 	 */
-	if (spte_is_bit_changed(old_spte, new_spte,
-                                shadow_accessed_mask | shadow_dirty_mask))
-		ret = true;
 
-	if (spte_is_bit_cleared(old_spte, new_spte, shadow_accessed_mask))
+	if (is_accessed_spte(old_spte) && !is_accessed_spte(new_spte)) {
+		flush = true;
 		kvm_set_pfn_accessed(spte_to_pfn(old_spte));
-	if (spte_is_bit_cleared(old_spte, new_spte, shadow_dirty_mask))
+	}
+
+	if (is_dirty_spte(old_spte) && !is_dirty_spte(new_spte)) {
+		flush = true;
 		kvm_set_pfn_dirty(spte_to_pfn(old_spte));
+	}
 
-	return ret;
+	return flush;
 }
 
 /*
  * Rules for using mmu_spte_clear_track_bits:
  * It sets the sptep from present to nonpresent, and track the
  * state bits, it is used to clear the last level sptep.
+ * Returns non-zero if the PTE was previously valid.
  */
 static int mmu_spte_clear_track_bits(u64 *sptep)
 {
@@ -613,11 +680,12 @@ static int mmu_spte_clear_track_bits(u64 *sptep)
 	 */
 	WARN_ON(!kvm_is_reserved_pfn(pfn) && !page_count(pfn_to_page(pfn)));
 
-	if (!shadow_accessed_mask || old_spte & shadow_accessed_mask)
+	if (is_accessed_spte(old_spte))
 		kvm_set_pfn_accessed(pfn);
-	if (old_spte & (shadow_dirty_mask ? shadow_dirty_mask :
-					    PT_WRITABLE_MASK))
+
+	if (is_dirty_spte(old_spte))
 		kvm_set_pfn_dirty(pfn);
+
 	return 1;
 }
 
@@ -636,6 +704,78 @@ static u64 mmu_spte_get_lockless(u64 *sptep)
 	return __get_spte_lockless(sptep);
 }
 
+static u64 mark_spte_for_access_track(u64 spte)
+{
+	if (shadow_accessed_mask != 0)
+		return spte & ~shadow_accessed_mask;
+
+	if (shadow_acc_track_mask == 0 || is_access_track_spte(spte))
+		return spte;
+
+	/*
+	 * Making an Access Tracking PTE will result in removal of write access
+	 * from the PTE. So, verify that we will be able to restore the write
+	 * access in the fast page fault path later on.
+	 */
+	WARN_ONCE((spte & PT_WRITABLE_MASK) &&
+		  !spte_can_locklessly_be_made_writable(spte),
+		  "kvm: Writable SPTE is not locklessly dirty-trackable\n");
+
+	WARN_ONCE(spte & (shadow_acc_track_saved_bits_mask <<
+			  shadow_acc_track_saved_bits_shift),
+		  "kvm: Access Tracking saved bit locations are not zero\n");
+
+	spte |= (spte & shadow_acc_track_saved_bits_mask) <<
+		shadow_acc_track_saved_bits_shift;
+	spte &= ~shadow_acc_track_mask;
+	spte |= shadow_acc_track_value;
+
+	return spte;
+}
+
+/* Restore an acc-track PTE back to a regular PTE */
+static u64 restore_acc_track_spte(u64 spte)
+{
+	u64 new_spte = spte;
+	u64 saved_bits = (spte >> shadow_acc_track_saved_bits_shift)
+			 & shadow_acc_track_saved_bits_mask;
+
+	WARN_ON_ONCE(!is_access_track_spte(spte));
+
+	new_spte &= ~shadow_acc_track_mask;
+	new_spte &= ~(shadow_acc_track_saved_bits_mask <<
+		      shadow_acc_track_saved_bits_shift);
+	new_spte |= saved_bits;
+
+	return new_spte;
+}
+
+/* Returns the Accessed status of the PTE and resets it at the same time. */
+static bool mmu_spte_age(u64 *sptep)
+{
+	u64 spte = mmu_spte_get_lockless(sptep);
+
+	if (!is_accessed_spte(spte))
+		return false;
+
+	if (shadow_accessed_mask) {
+		clear_bit((ffs(shadow_accessed_mask) - 1),
+			  (unsigned long *)sptep);
+	} else {
+		/*
+		 * Capture the dirty status of the page, so that it doesn't get
+		 * lost when the SPTE is marked for access tracking.
+		 */
+		if (is_writable_pte(spte))
+			kvm_set_pfn_dirty(spte_to_pfn(spte));
+
+		spte = mark_spte_for_access_track(spte);
+		mmu_spte_update_no_track(sptep, spte);
+	}
+
+	return true;
+}
+
 static void walk_shadow_page_lockless_begin(struct kvm_vcpu *vcpu)
 {
 	/*
@@ -1212,7 +1352,7 @@ static bool spte_write_protect(u64 *sptep, bool pt_protect)
 	u64 spte = *sptep;
 
 	if (!is_writable_pte(spte) &&
-	      !(pt_protect && spte_is_locklessly_modifiable(spte)))
+	      !(pt_protect && spte_can_locklessly_be_made_writable(spte)))
 		return false;
 
 	rmap_printk("rmap_write_protect: spte %p %llx\n", sptep, *sptep);
@@ -1420,7 +1560,7 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
 restart:
 	for_each_rmap_spte(rmap_head, &iter, sptep) {
 		rmap_printk("kvm_set_pte_rmapp: spte %p %llx gfn %llx (%d)\n",
-			     sptep, *sptep, gfn, level);
+			    sptep, *sptep, gfn, level);
 
 		need_flush = 1;
 
@@ -1433,7 +1573,8 @@ restart:
 
 			new_spte &= ~PT_WRITABLE_MASK;
 			new_spte &= ~SPTE_HOST_WRITEABLE;
-			new_spte &= ~shadow_accessed_mask;
+
+			new_spte = mark_spte_for_access_track(new_spte);
 
 			mmu_spte_clear_track_bits(sptep);
 			mmu_spte_set(sptep, new_spte);
@@ -1595,15 +1736,8 @@ static int kvm_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
 	struct rmap_iterator uninitialized_var(iter);
 	int young = 0;
 
-	BUG_ON(!shadow_accessed_mask);
-
-	for_each_rmap_spte(rmap_head, &iter, sptep) {
-		if (*sptep & shadow_accessed_mask) {
-			young = 1;
-			clear_bit((ffs(shadow_accessed_mask) - 1),
-				 (unsigned long *)sptep);
-		}
-	}
+	for_each_rmap_spte(rmap_head, &iter, sptep)
+		young |= mmu_spte_age(sptep);
 
 	trace_kvm_age_page(gfn, level, slot, young);
 	return young;
@@ -1615,24 +1749,20 @@ static int kvm_test_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
 {
 	u64 *sptep;
 	struct rmap_iterator iter;
-	int young = 0;
 
 	/*
-	 * If there's no access bit in the secondary pte set by the
-	 * hardware it's up to gup-fast/gup to set the access bit in
-	 * the primary pte or in the page structure.
+	 * If there's no access bit in the secondary pte set by the hardware and
+	 * fast access tracking is also not enabled, it's up to gup-fast/gup to
+	 * set the access bit in the primary pte or in the page structure.
 	 */
-	if (!shadow_accessed_mask)
+	if (!shadow_accessed_mask && !shadow_acc_track_mask)
 		goto out;
 
-	for_each_rmap_spte(rmap_head, &iter, sptep) {
-		if (*sptep & shadow_accessed_mask) {
-			young = 1;
-			break;
-		}
-	}
+	for_each_rmap_spte(rmap_head, &iter, sptep)
+		if (is_accessed_spte(*sptep))
+			return 1;
 out:
-	return young;
+	return 0;
 }
 
 #define RMAP_RECYCLE_THRESHOLD 1000
@@ -1660,7 +1790,7 @@ int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end)
 	 * This has some overhead, but not as much as the cost of swapping
 	 * out actively used pages or breaking up actively used hugepages.
 	 */
-	if (!shadow_accessed_mask)
+	if (!shadow_accessed_mask && !shadow_acc_track_mask)
 		return kvm_handle_hva_range(kvm, start, end, 0,
 					    kvm_unmap_rmapp);
 
@@ -1713,7 +1843,7 @@ static void kvm_mmu_free_page(struct kvm_mmu_page *sp)
 
 static unsigned kvm_page_table_hashfn(gfn_t gfn)
 {
-	return gfn & ((1 << KVM_MMU_HASH_SHIFT) - 1);
+	return hash_64(gfn, KVM_MMU_HASH_SHIFT);
 }
 
 static void mmu_page_add_parent_pte(struct kvm_vcpu *vcpu,
@@ -1904,17 +2034,17 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm,
  * since it has been deleted from active_mmu_pages but still can be found
  * at hast list.
  *
- * for_each_gfn_valid_sp() has skipped that kind of pages.
+ * for_each_valid_sp() has skipped that kind of pages.
  */
-#define for_each_gfn_valid_sp(_kvm, _sp, _gfn)				\
+#define for_each_valid_sp(_kvm, _sp, _gfn)				\
 	hlist_for_each_entry(_sp,					\
 	  &(_kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(_gfn)], hash_link) \
-		if ((_sp)->gfn != (_gfn) || is_obsolete_sp((_kvm), (_sp)) \
-			|| (_sp)->role.invalid) {} else
+		if (is_obsolete_sp((_kvm), (_sp)) || (_sp)->role.invalid) {    \
+		} else
 
 #define for_each_gfn_indirect_valid_sp(_kvm, _sp, _gfn)			\
-	for_each_gfn_valid_sp(_kvm, _sp, _gfn)				\
-		if ((_sp)->role.direct) {} else
+	for_each_valid_sp(_kvm, _sp, _gfn)				\
+		if ((_sp)->gfn != (_gfn) || (_sp)->role.direct) {} else
 
 /* @sp->gfn should be write-protected at the call site */
 static bool __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
@@ -2116,6 +2246,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
 	struct kvm_mmu_page *sp;
 	bool need_sync = false;
 	bool flush = false;
+	int collisions = 0;
 	LIST_HEAD(invalid_list);
 
 	role = vcpu->arch.mmu.base_role;
@@ -2130,7 +2261,12 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
 		quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1;
 		role.quadrant = quadrant;
 	}
-	for_each_gfn_valid_sp(vcpu->kvm, sp, gfn) {
+	for_each_valid_sp(vcpu->kvm, sp, gfn) {
+		if (sp->gfn != gfn) {
+			collisions++;
+			continue;
+		}
+
 		if (!need_sync && sp->unsync)
 			need_sync = true;
 
@@ -2153,7 +2289,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
 
 		__clear_sp_write_flooding_count(sp);
 		trace_kvm_mmu_get_page(sp, false);
-		return sp;
+		goto out;
 	}
 
 	++vcpu->kvm->stat.mmu_cache_miss;
@@ -2183,6 +2319,9 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
 	trace_kvm_mmu_get_page(sp, true);
 
 	kvm_mmu_flush_or_zap(vcpu, &invalid_list, false, flush);
+out:
+	if (collisions > vcpu->kvm->stat.max_mmu_page_hash_collisions)
+		vcpu->kvm->stat.max_mmu_page_hash_collisions = collisions;
 	return sp;
 }
 
@@ -2583,6 +2722,9 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
 		spte |= shadow_dirty_mask;
 	}
 
+	if (speculative)
+		spte = mark_spte_for_access_track(spte);
+
 set_pte:
 	if (mmu_spte_update(sptep, spte))
 		kvm_flush_remote_tlbs(vcpu->kvm);
@@ -2636,7 +2778,7 @@ static bool mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, unsigned pte_access,
 	pgprintk("%s: setting spte %llx\n", __func__, *sptep);
 	pgprintk("instantiating %s PTE (%s) at %llx (%llx) addr %p\n",
 		 is_large_pte(*sptep)? "2MB" : "4kB",
-		 *sptep & PT_PRESENT_MASK ?"RW":"R", gfn,
+		 *sptep & PT_WRITABLE_MASK ? "RW" : "R", gfn,
 		 *sptep, sptep);
 	if (!was_rmapped && is_large_pte(*sptep))
 		++vcpu->kvm->stat.lpages;
@@ -2869,33 +3011,43 @@ static bool page_fault_can_be_fast(u32 error_code)
 	if (unlikely(error_code & PFERR_RSVD_MASK))
 		return false;
 
+	/* See if the page fault is due to an NX violation */
+	if (unlikely(((error_code & (PFERR_FETCH_MASK | PFERR_PRESENT_MASK))
+		      == (PFERR_FETCH_MASK | PFERR_PRESENT_MASK))))
+		return false;
+
 	/*
-	 * #PF can be fast only if the shadow page table is present and it
-	 * is caused by write-protect, that means we just need change the
-	 * W bit of the spte which can be done out of mmu-lock.
+	 * #PF can be fast if:
+	 * 1. The shadow page table entry is not present, which could mean that
+	 *    the fault is potentially caused by access tracking (if enabled).
+	 * 2. The shadow page table entry is present and the fault
+	 *    is caused by write-protect, that means we just need change the W
+	 *    bit of the spte which can be done out of mmu-lock.
+	 *
+	 * However, if access tracking is disabled we know that a non-present
+	 * page must be a genuine page fault where we have to create a new SPTE.
+	 * So, if access tracking is disabled, we return true only for write
+	 * accesses to a present page.
 	 */
-	if (!(error_code & PFERR_PRESENT_MASK) ||
-	      !(error_code & PFERR_WRITE_MASK))
-		return false;
 
-	return true;
+	return shadow_acc_track_mask != 0 ||
+	       ((error_code & (PFERR_WRITE_MASK | PFERR_PRESENT_MASK))
+		== (PFERR_WRITE_MASK | PFERR_PRESENT_MASK));
 }
 
+/*
+ * Returns true if the SPTE was fixed successfully. Otherwise,
+ * someone else modified the SPTE from its original value.
+ */
 static bool
 fast_pf_fix_direct_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
-			u64 *sptep, u64 spte)
+			u64 *sptep, u64 old_spte, u64 new_spte)
 {
 	gfn_t gfn;
 
 	WARN_ON(!sp->role.direct);
 
 	/*
-	 * The gfn of direct spte is stable since it is calculated
-	 * by sp->gfn.
-	 */
-	gfn = kvm_mmu_page_get_gfn(sp, sptep - sp->spt);
-
-	/*
 	 * Theoretically we could also set dirty bit (and flush TLB) here in
 	 * order to eliminate unnecessary PML logging. See comments in
 	 * set_spte. But fast_page_fault is very unlikely to happen with PML
@@ -2907,12 +3059,33 @@ fast_pf_fix_direct_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
 	 *
 	 * Compare with set_spte where instead shadow_dirty_mask is set.
 	 */
-	if (cmpxchg64(sptep, spte, spte | PT_WRITABLE_MASK) == spte)
+	if (cmpxchg64(sptep, old_spte, new_spte) != old_spte)
+		return false;
+
+	if (is_writable_pte(new_spte) && !is_writable_pte(old_spte)) {
+		/*
+		 * The gfn of direct spte is stable since it is
+		 * calculated by sp->gfn.
+		 */
+		gfn = kvm_mmu_page_get_gfn(sp, sptep - sp->spt);
 		kvm_vcpu_mark_page_dirty(vcpu, gfn);
+	}
 
 	return true;
 }
 
+static bool is_access_allowed(u32 fault_err_code, u64 spte)
+{
+	if (fault_err_code & PFERR_FETCH_MASK)
+		return is_executable_pte(spte);
+
+	if (fault_err_code & PFERR_WRITE_MASK)
+		return is_writable_pte(spte);
+
+	/* Fault was on Read access */
+	return spte & PT_PRESENT_MASK;
+}
+
 /*
  * Return value:
  * - true: let the vcpu to access on the same address again.
@@ -2923,8 +3096,9 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level,
 {
 	struct kvm_shadow_walk_iterator iterator;
 	struct kvm_mmu_page *sp;
-	bool ret = false;
+	bool fault_handled = false;
 	u64 spte = 0ull;
+	uint retry_count = 0;
 
 	if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
 		return false;
@@ -2933,66 +3107,93 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level,
 		return false;
 
 	walk_shadow_page_lockless_begin(vcpu);
-	for_each_shadow_entry_lockless(vcpu, gva, iterator, spte)
-		if (!is_shadow_present_pte(spte) || iterator.level < level)
+
+	do {
+		u64 new_spte;
+
+		for_each_shadow_entry_lockless(vcpu, gva, iterator, spte)
+			if (!is_shadow_present_pte(spte) ||
+			    iterator.level < level)
+				break;
+
+		sp = page_header(__pa(iterator.sptep));
+		if (!is_last_spte(spte, sp->role.level))
 			break;
 
-	/*
-	 * If the mapping has been changed, let the vcpu fault on the
-	 * same address again.
-	 */
-	if (!is_shadow_present_pte(spte)) {
-		ret = true;
-		goto exit;
-	}
+		/*
+		 * Check whether the memory access that caused the fault would
+		 * still cause it if it were to be performed right now. If not,
+		 * then this is a spurious fault caused by TLB lazily flushed,
+		 * or some other CPU has already fixed the PTE after the
+		 * current CPU took the fault.
+		 *
+		 * Need not check the access of upper level table entries since
+		 * they are always ACC_ALL.
+		 */
+		if (is_access_allowed(error_code, spte)) {
+			fault_handled = true;
+			break;
+		}
 
-	sp = page_header(__pa(iterator.sptep));
-	if (!is_last_spte(spte, sp->role.level))
-		goto exit;
+		new_spte = spte;
 
-	/*
-	 * Check if it is a spurious fault caused by TLB lazily flushed.
-	 *
-	 * Need not check the access of upper level table entries since
-	 * they are always ACC_ALL.
-	 */
-	 if (is_writable_pte(spte)) {
-		ret = true;
-		goto exit;
-	}
+		if (is_access_track_spte(spte))
+			new_spte = restore_acc_track_spte(new_spte);
 
-	/*
-	 * Currently, to simplify the code, only the spte write-protected
-	 * by dirty-log can be fast fixed.
-	 */
-	if (!spte_is_locklessly_modifiable(spte))
-		goto exit;
+		/*
+		 * Currently, to simplify the code, write-protection can
+		 * be removed in the fast path only if the SPTE was
+		 * write-protected for dirty-logging or access tracking.
+		 */
+		if ((error_code & PFERR_WRITE_MASK) &&
+		    spte_can_locklessly_be_made_writable(spte))
+		{
+			new_spte |= PT_WRITABLE_MASK;
 
-	/*
-	 * Do not fix write-permission on the large spte since we only dirty
-	 * the first page into the dirty-bitmap in fast_pf_fix_direct_spte()
-	 * that means other pages are missed if its slot is dirty-logged.
-	 *
-	 * Instead, we let the slow page fault path create a normal spte to
-	 * fix the access.
-	 *
-	 * See the comments in kvm_arch_commit_memory_region().
-	 */
-	if (sp->role.level > PT_PAGE_TABLE_LEVEL)
-		goto exit;
+			/*
+			 * Do not fix write-permission on the large spte.  Since
+			 * we only dirty the first page into the dirty-bitmap in
+			 * fast_pf_fix_direct_spte(), other pages are missed
+			 * if its slot has dirty logging enabled.
+			 *
+			 * Instead, we let the slow page fault path create a
+			 * normal spte to fix the access.
+			 *
+			 * See the comments in kvm_arch_commit_memory_region().
+			 */
+			if (sp->role.level > PT_PAGE_TABLE_LEVEL)
+				break;
+		}
+
+		/* Verify that the fault can be handled in the fast path */
+		if (new_spte == spte ||
+		    !is_access_allowed(error_code, new_spte))
+			break;
+
+		/*
+		 * Currently, fast page fault only works for direct mapping
+		 * since the gfn is not stable for indirect shadow page. See
+		 * Documentation/virtual/kvm/locking.txt to get more detail.
+		 */
+		fault_handled = fast_pf_fix_direct_spte(vcpu, sp,
+							iterator.sptep, spte,
+							new_spte);
+		if (fault_handled)
+			break;
+
+		if (++retry_count > 4) {
+			printk_once(KERN_WARNING
+				"kvm: Fast #PF retrying more than 4 times.\n");
+			break;
+		}
+
+	} while (true);
 
-	/*
-	 * Currently, fast page fault only works for direct mapping since
-	 * the gfn is not stable for indirect shadow page.
-	 * See Documentation/virtual/kvm/locking.txt to get more detail.
-	 */
-	ret = fast_pf_fix_direct_spte(vcpu, sp, iterator.sptep, spte);
-exit:
 	trace_fast_page_fault(vcpu, gva, error_code, iterator.sptep,
-			      spte, ret);
+			      spte, fault_handled);
 	walk_shadow_page_lockless_end(vcpu);
 
-	return ret;
+	return fault_handled;
 }
 
 static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
@@ -5063,6 +5264,8 @@ static void mmu_destroy_caches(void)
 
 int kvm_mmu_module_init(void)
 {
+	kvm_mmu_clear_all_pte_masks();
+
 	pte_list_desc_cache = kmem_cache_create("pte_list_desc",
 					    sizeof(struct pte_list_desc),
 					    0, 0, NULL);
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 08a4d3ab3455..d0414f054bdf 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -4182,6 +4182,8 @@ static int handle_exit(struct kvm_vcpu *vcpu)
 
 	trace_kvm_exit(exit_code, vcpu, KVM_ISA_SVM);
 
+	vcpu->arch.gpa_available = (exit_code == SVM_EXIT_NPF);
+
 	if (!is_cr_intercept(svm, INTERCEPT_CR0_WRITE))
 		vcpu->arch.cr0 = svm->vmcb->save.cr0;
 	if (npt_enabled)
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index a236decb81e4..7c3e42623090 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -4953,7 +4953,7 @@ static bool vmx_get_enable_apicv(void)
 	return enable_apicv;
 }
 
-static int vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu)
+static void vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 	int max_irr;
@@ -4964,19 +4964,15 @@ static int vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu)
 	    vmx->nested.pi_pending) {
 		vmx->nested.pi_pending = false;
 		if (!pi_test_and_clear_on(vmx->nested.pi_desc))
-			return 0;
+			return;
 
 		max_irr = find_last_bit(
 			(unsigned long *)vmx->nested.pi_desc->pir, 256);
 
 		if (max_irr == 256)
-			return 0;
+			return;
 
 		vapic_page = kmap(vmx->nested.virtual_apic_page);
-		if (!vapic_page) {
-			WARN_ON(1);
-			return -ENOMEM;
-		}
 		__kvm_apic_update_irr(vmx->nested.pi_desc->pir, vapic_page);
 		kunmap(vmx->nested.virtual_apic_page);
 
@@ -4987,7 +4983,6 @@ static int vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu)
 			vmcs_write16(GUEST_INTR_STATUS, status);
 		}
 	}
-	return 0;
 }
 
 static inline bool kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu)
@@ -5236,10 +5231,8 @@ static void ept_set_mmio_spte_mask(void)
 	/*
 	 * EPT Misconfigurations can be generated if the value of bits 2:0
 	 * of an EPT paging-structure entry is 110b (write/execute).
-	 * Also, magic bits (0x3ull << 62) is set to quickly identify mmio
-	 * spte.
 	 */
-	kvm_mmu_set_mmio_spte_mask((0x3ull << 62) | 0x6ull);
+	kvm_mmu_set_mmio_spte_mask(VMX_EPT_MISCONFIG_WX_VALUE);
 }
 
 #define VMX_XSS_EXIT_BITMAP 0
@@ -6152,7 +6145,7 @@ static int handle_wrmsr(struct kvm_vcpu *vcpu)
 
 static int handle_tpr_below_threshold(struct kvm_vcpu *vcpu)
 {
-	kvm_make_request(KVM_REQ_EVENT, vcpu);
+	kvm_apic_update_ppr(vcpu);
 	return 1;
 }
 
@@ -6374,14 +6367,20 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu)
 	gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
 	trace_kvm_page_fault(gpa, exit_qualification);
 
-	/* it is a read fault? */
-	error_code = (exit_qualification << 2) & PFERR_USER_MASK;
-	/* it is a write fault? */
-	error_code |= exit_qualification & PFERR_WRITE_MASK;
-	/* It is a fetch fault? */
-	error_code |= (exit_qualification << 2) & PFERR_FETCH_MASK;
-	/* ept page table is present? */
-	error_code |= (exit_qualification & 0x38) != 0;
+	/* Is it a read fault? */
+	error_code = (exit_qualification & EPT_VIOLATION_ACC_READ)
+		     ? PFERR_USER_MASK : 0;
+	/* Is it a write fault? */
+	error_code |= (exit_qualification & EPT_VIOLATION_ACC_WRITE)
+		      ? PFERR_WRITE_MASK : 0;
+	/* Is it a fetch fault? */
+	error_code |= (exit_qualification & EPT_VIOLATION_ACC_INSTR)
+		      ? PFERR_FETCH_MASK : 0;
+	/* ept page table entry is present? */
+	error_code |= (exit_qualification &
+		       (EPT_VIOLATION_READABLE | EPT_VIOLATION_WRITABLE |
+			EPT_VIOLATION_EXECUTABLE))
+		      ? PFERR_PRESENT_MASK : 0;
 
 	vcpu->arch.exit_qualification = exit_qualification;
 
@@ -6572,6 +6571,19 @@ static void wakeup_handler(void)
 	spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, cpu));
 }
 
+void vmx_enable_tdp(void)
+{
+	kvm_mmu_set_mask_ptes(VMX_EPT_READABLE_MASK,
+		enable_ept_ad_bits ? VMX_EPT_ACCESS_BIT : 0ull,
+		enable_ept_ad_bits ? VMX_EPT_DIRTY_BIT : 0ull,
+		0ull, VMX_EPT_EXECUTABLE_MASK,
+		cpu_has_vmx_ept_execute_only() ? 0ull : VMX_EPT_READABLE_MASK,
+		enable_ept_ad_bits ? 0ull : VMX_EPT_RWX_MASK);
+
+	ept_set_mmio_spte_mask();
+	kvm_enable_tdp();
+}
+
 static __init int hardware_setup(void)
 {
 	int r = -ENOMEM, i, msr;
@@ -6697,16 +6709,9 @@ static __init int hardware_setup(void)
 	/* SELF-IPI */
 	vmx_disable_intercept_msr_x2apic(0x83f, MSR_TYPE_W, true);
 
-	if (enable_ept) {
-		kvm_mmu_set_mask_ptes(VMX_EPT_READABLE_MASK,
-			(enable_ept_ad_bits) ? VMX_EPT_ACCESS_BIT : 0ull,
-			(enable_ept_ad_bits) ? VMX_EPT_DIRTY_BIT : 0ull,
-			0ull, VMX_EPT_EXECUTABLE_MASK,
-			cpu_has_vmx_ept_execute_only() ?
-				      0ull : VMX_EPT_READABLE_MASK);
-		ept_set_mmio_spte_mask();
-		kvm_enable_tdp();
-	} else
+	if (enable_ept)
+		vmx_enable_tdp();
+	else
 		kvm_disable_tdp();
 
 	update_ple_window_actual_max();
@@ -7168,9 +7173,6 @@ static int handle_vmon(struct kvm_vcpu *vcpu)
 		return 1;
 	}
 
-	if (nested_vmx_check_vmptr(vcpu, EXIT_REASON_VMON, NULL))
-		return 1;
-
 	if (vmx->nested.vmxon) {
 		nested_vmx_failValid(vcpu, VMXERR_VMXON_IN_VMX_ROOT_OPERATION);
 		return kvm_skip_emulated_instruction(vcpu);
@@ -7182,6 +7184,9 @@ static int handle_vmon(struct kvm_vcpu *vcpu)
 		return 1;
 	}
 
+	if (nested_vmx_check_vmptr(vcpu, EXIT_REASON_VMON, NULL))
+		return 1;
+
 	if (cpu_has_vmx_msr_bitmap()) {
 		vmx->nested.msr_bitmap =
 				(unsigned long *)__get_free_page(GFP_KERNEL);
@@ -8191,8 +8196,6 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu)
 	case EXIT_REASON_TASK_SWITCH:
 		return true;
 	case EXIT_REASON_CPUID:
-		if (kvm_register_read(vcpu, VCPU_REGS_RAX) == 0xa)
-			return false;
 		return true;
 	case EXIT_REASON_HLT:
 		return nested_cpu_has(vmcs12, CPU_BASED_HLT_EXITING);
@@ -9730,11 +9733,6 @@ static inline bool nested_vmx_merge_msr_bitmap(struct kvm_vcpu *vcpu,
 		return false;
 	}
 	msr_bitmap_l1 = (unsigned long *)kmap(page);
-	if (!msr_bitmap_l1) {
-		nested_release_page_clean(page);
-		WARN_ON(1);
-		return false;
-	}
 
 	memset(msr_bitmap_l0, 0xff, PAGE_SIZE);
 
@@ -10696,7 +10694,8 @@ static int vmx_check_nested_events(struct kvm_vcpu *vcpu, bool external_intr)
 		return 0;
 	}
 
-	return vmx_complete_nested_posted_interrupt(vcpu);
+	vmx_complete_nested_posted_interrupt(vcpu);
+	return 0;
 }
 
 static u32 vmx_get_preemption_timer_value(struct kvm_vcpu *vcpu)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index e52c9088660f..10e20afc3314 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -180,6 +180,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
 	{ "insn_emulation_fail", VCPU_STAT(insn_emulation_fail) },
 	{ "irq_injections", VCPU_STAT(irq_injections) },
 	{ "nmi_injections", VCPU_STAT(nmi_injections) },
+	{ "req_event", VCPU_STAT(req_event) },
 	{ "mmu_shadow_zapped", VM_STAT(mmu_shadow_zapped) },
 	{ "mmu_pte_write", VM_STAT(mmu_pte_write) },
 	{ "mmu_pte_updated", VM_STAT(mmu_pte_updated) },
@@ -190,6 +191,8 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
 	{ "mmu_unsync", VM_STAT(mmu_unsync) },
 	{ "remote_tlb_flush", VM_STAT(remote_tlb_flush) },
 	{ "largepages", VM_STAT(lpages) },
+	{ "max_mmu_page_hash_collisions",
+		VM_STAT(max_mmu_page_hash_collisions) },
 	{ NULL }
 };
 
@@ -1139,6 +1142,7 @@ struct pvclock_gtod_data {
 
 	u64		boot_ns;
 	u64		nsec_base;
+	u64		wall_time_sec;
 };
 
 static struct pvclock_gtod_data pvclock_gtod_data;
@@ -1162,6 +1166,8 @@ static void update_pvclock_gtod(struct timekeeper *tk)
 	vdata->boot_ns			= boot_ns;
 	vdata->nsec_base		= tk->tkr_mono.xtime_nsec;
 
+	vdata->wall_time_sec            = tk->xtime_sec;
+
 	write_seqcount_end(&vdata->seq);
 }
 #endif
@@ -1623,6 +1629,28 @@ static int do_monotonic_boot(s64 *t, u64 *cycle_now)
 	return mode;
 }
 
+static int do_realtime(struct timespec *ts, u64 *cycle_now)
+{
+	struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
+	unsigned long seq;
+	int mode;
+	u64 ns;
+
+	do {
+		seq = read_seqcount_begin(&gtod->seq);
+		mode = gtod->clock.vclock_mode;
+		ts->tv_sec = gtod->wall_time_sec;
+		ns = gtod->nsec_base;
+		ns += vgettsc(cycle_now);
+		ns >>= gtod->clock.shift;
+	} while (unlikely(read_seqcount_retry(&gtod->seq, seq)));
+
+	ts->tv_sec += __iter_div_u64_rem(ns, NSEC_PER_SEC, &ns);
+	ts->tv_nsec = ns;
+
+	return mode;
+}
+
 /* returns true if host is using tsc clocksource */
 static bool kvm_get_time_and_clockread(s64 *kernel_ns, u64 *cycle_now)
 {
@@ -1632,6 +1660,17 @@ static bool kvm_get_time_and_clockread(s64 *kernel_ns, u64 *cycle_now)
 
 	return do_monotonic_boot(kernel_ns, cycle_now) == VCLOCK_TSC;
 }
+
+/* returns true if host is using tsc clocksource */
+static bool kvm_get_walltime_and_clockread(struct timespec *ts,
+					   u64 *cycle_now)
+{
+	/* checked again under seqlock below */
+	if (pvclock_gtod_data.clock.vclock_mode != VCLOCK_TSC)
+		return false;
+
+	return do_realtime(ts, cycle_now) == VCLOCK_TSC;
+}
 #endif
 
 /*
@@ -3897,7 +3936,7 @@ static int kvm_vm_ioctl_enable_cap(struct kvm *kvm,
 			goto split_irqchip_unlock;
 		/* Pairs with irqchip_in_kernel. */
 		smp_wmb();
-		kvm->arch.irqchip_split = true;
+		kvm->arch.irqchip_mode = KVM_IRQCHIP_SPLIT;
 		kvm->arch.nr_reserved_ioapic_pins = cap->args[0];
 		r = 0;
 split_irqchip_unlock:
@@ -3960,40 +3999,41 @@ long kvm_arch_vm_ioctl(struct file *filp,
 		r = kvm_vm_ioctl_get_nr_mmu_pages(kvm);
 		break;
 	case KVM_CREATE_IRQCHIP: {
-		struct kvm_pic *vpic;
-
 		mutex_lock(&kvm->lock);
+
 		r = -EEXIST;
-		if (kvm->arch.vpic)
+		if (irqchip_in_kernel(kvm))
 			goto create_irqchip_unlock;
+
 		r = -EINVAL;
 		if (kvm->created_vcpus)
 			goto create_irqchip_unlock;
-		r = -ENOMEM;
-		vpic = kvm_create_pic(kvm);
-		if (vpic) {
-			r = kvm_ioapic_init(kvm);
-			if (r) {
-				mutex_lock(&kvm->slots_lock);
-				kvm_destroy_pic(vpic);
-				mutex_unlock(&kvm->slots_lock);
-				goto create_irqchip_unlock;
-			}
-		} else
+
+		r = kvm_pic_init(kvm);
+		if (r)
 			goto create_irqchip_unlock;
+
+		r = kvm_ioapic_init(kvm);
+		if (r) {
+			mutex_lock(&kvm->slots_lock);
+			kvm_pic_destroy(kvm);
+			mutex_unlock(&kvm->slots_lock);
+			goto create_irqchip_unlock;
+		}
+
 		r = kvm_setup_default_irq_routing(kvm);
 		if (r) {
 			mutex_lock(&kvm->slots_lock);
 			mutex_lock(&kvm->irq_lock);
 			kvm_ioapic_destroy(kvm);
-			kvm_destroy_pic(vpic);
+			kvm_pic_destroy(kvm);
 			mutex_unlock(&kvm->irq_lock);
 			mutex_unlock(&kvm->slots_lock);
 			goto create_irqchip_unlock;
 		}
-		/* Write kvm->irq_routing before kvm->arch.vpic.  */
+		/* Write kvm->irq_routing before enabling irqchip_in_kernel. */
 		smp_wmb();
-		kvm->arch.vpic = vpic;
+		kvm->arch.irqchip_mode = KVM_IRQCHIP_KERNEL;
 	create_irqchip_unlock:
 		mutex_unlock(&kvm->lock);
 		break;
@@ -4029,7 +4069,7 @@ long kvm_arch_vm_ioctl(struct file *filp,
 		}
 
 		r = -ENXIO;
-		if (!irqchip_in_kernel(kvm) || irqchip_split(kvm))
+		if (!irqchip_kernel(kvm))
 			goto get_irqchip_out;
 		r = kvm_vm_ioctl_get_irqchip(kvm, chip);
 		if (r)
@@ -4053,7 +4093,7 @@ long kvm_arch_vm_ioctl(struct file *filp,
 		}
 
 		r = -ENXIO;
-		if (!irqchip_in_kernel(kvm) || irqchip_split(kvm))
+		if (!irqchip_kernel(kvm))
 			goto set_irqchip_out;
 		r = kvm_vm_ioctl_set_irqchip(kvm, chip);
 		if (r)
@@ -4462,6 +4502,21 @@ out:
 }
 EXPORT_SYMBOL_GPL(kvm_write_guest_virt_system);
 
+static int vcpu_is_mmio_gpa(struct kvm_vcpu *vcpu, unsigned long gva,
+			    gpa_t gpa, bool write)
+{
+	/* For APIC access vmexit */
+	if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
+		return 1;
+
+	if (vcpu_match_mmio_gpa(vcpu, gpa)) {
+		trace_vcpu_match_mmio(gva, gpa, write, true);
+		return 1;
+	}
+
+	return 0;
+}
+
 static int vcpu_mmio_gva_to_gpa(struct kvm_vcpu *vcpu, unsigned long gva,
 				gpa_t *gpa, struct x86_exception *exception,
 				bool write)
@@ -4488,16 +4543,7 @@ static int vcpu_mmio_gva_to_gpa(struct kvm_vcpu *vcpu, unsigned long gva,
 	if (*gpa == UNMAPPED_GVA)
 		return -1;
 
-	/* For APIC access vmexit */
-	if ((*gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
-		return 1;
-
-	if (vcpu_match_mmio_gpa(vcpu, *gpa)) {
-		trace_vcpu_match_mmio(gva, *gpa, write, true);
-		return 1;
-	}
-
-	return 0;
+	return vcpu_is_mmio_gpa(vcpu, gva, *gpa, write);
 }
 
 int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
@@ -4594,6 +4640,22 @@ static int emulator_read_write_onepage(unsigned long addr, void *val,
 	int handled, ret;
 	bool write = ops->write;
 	struct kvm_mmio_fragment *frag;
+	struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
+
+	/*
+	 * If the exit was due to a NPF we may already have a GPA.
+	 * If the GPA is present, use it to avoid the GVA to GPA table walk.
+	 * Note, this cannot be used on string operations since string
+	 * operation using rep will only have the initial GPA from the NPF
+	 * occurred.
+	 */
+	if (vcpu->arch.gpa_available &&
+	    emulator_can_use_gpa(ctxt) &&
+	    vcpu_is_mmio_gpa(vcpu, addr, exception->address, write) &&
+	    (addr & ~PAGE_MASK) == (exception->address & ~PAGE_MASK)) {
+		gpa = exception->address;
+		goto mmio;
+	}
 
 	ret = vcpu_mmio_gva_to_gpa(vcpu, addr, &gpa, exception, write);
 
@@ -5610,6 +5672,9 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu,
 	}
 
 restart:
+	/* Save the faulting GPA (cr2) in the address field */
+	ctxt->exception.address = cr2;
+
 	r = x86_emulate_insn(ctxt);
 
 	if (r == EMULATION_INTERCEPTED)
@@ -5924,9 +5989,6 @@ static void kvm_set_mmio_spte_mask(void)
 	 /* Mask the reserved physical address bits. */
 	mask = rsvd_bits(maxphyaddr, 51);
 
-	/* Bit 62 is always reserved for 32bit host. */
-	mask |= 0x3ull << 62;
-
 	/* Set the present bit. */
 	mask |= 1ull;
 
@@ -6025,7 +6087,7 @@ int kvm_arch_init(void *opaque)
 
 	kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK,
 			PT_DIRTY_MASK, PT64_NX_MASK, 0,
-			PT_PRESENT_MASK);
+			PT_PRESENT_MASK, 0);
 	kvm_timer_init();
 
 	perf_register_guest_info_callbacks(&kvm_guest_cbs);
@@ -6087,6 +6149,33 @@ int kvm_emulate_halt(struct kvm_vcpu *vcpu)
 }
 EXPORT_SYMBOL_GPL(kvm_emulate_halt);
 
+static int kvm_pv_clock_pairing(struct kvm_vcpu *vcpu, gpa_t paddr,
+			        unsigned long clock_type)
+{
+	struct kvm_clock_pairing clock_pairing;
+	struct timespec ts;
+	u64 cycle;
+	int ret;
+
+	if (clock_type != KVM_CLOCK_PAIRING_WALLCLOCK)
+		return -KVM_EOPNOTSUPP;
+
+	if (kvm_get_walltime_and_clockread(&ts, &cycle) == false)
+		return -KVM_EOPNOTSUPP;
+
+	clock_pairing.sec = ts.tv_sec;
+	clock_pairing.nsec = ts.tv_nsec;
+	clock_pairing.tsc = kvm_read_l1_tsc(vcpu, cycle);
+	clock_pairing.flags = 0;
+
+	ret = 0;
+	if (kvm_write_guest(vcpu->kvm, paddr, &clock_pairing,
+			    sizeof(struct kvm_clock_pairing)))
+		ret = -KVM_EFAULT;
+
+	return ret;
+}
+
 /*
  * kvm_pv_kick_cpu_op:  Kick a vcpu.
  *
@@ -6151,6 +6240,9 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
 		kvm_pv_kick_cpu_op(vcpu->kvm, a0, a1);
 		ret = 0;
 		break;
+	case KVM_HC_CLOCK_PAIRING:
+		ret = kvm_pv_clock_pairing(vcpu, a0, a1);
+		break;
 	default:
 		ret = -KVM_ENOSYS;
 		break;
@@ -6733,6 +6825,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 	}
 
 	if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) {
+		++vcpu->stat.req_event;
 		kvm_apic_accept_events(vcpu);
 		if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) {
 			r = 1;
diff --git a/drivers/ptp/Kconfig b/drivers/ptp/Kconfig
index bdce33291161..384f661a6496 100644
--- a/drivers/ptp/Kconfig
+++ b/drivers/ptp/Kconfig
@@ -90,4 +90,16 @@ config PTP_1588_CLOCK_PCH
 	  To compile this driver as a module, choose M here: the module
 	  will be called ptp_pch.
 
+config PTP_1588_CLOCK_KVM
+	tristate "KVM virtual PTP clock"
+	depends on PTP_1588_CLOCK
+	depends on KVM_GUEST && X86
+	default y
+	help
+	  This driver adds support for using kvm infrastructure as a PTP
+	  clock. This clock is only useful if you are using KVM guests.
+
+	  To compile this driver as a module, choose M here: the module
+	  will be called ptp_kvm.
+
 endmenu
diff --git a/drivers/ptp/Makefile b/drivers/ptp/Makefile
index 8b58597298de..530736161a8b 100644
--- a/drivers/ptp/Makefile
+++ b/drivers/ptp/Makefile
@@ -6,3 +6,4 @@ ptp-y					:= ptp_clock.o ptp_chardev.o ptp_sysfs.o
 obj-$(CONFIG_PTP_1588_CLOCK)		+= ptp.o
 obj-$(CONFIG_PTP_1588_CLOCK_IXP46X)	+= ptp_ixp46x.o
 obj-$(CONFIG_PTP_1588_CLOCK_PCH)	+= ptp_pch.o
+obj-$(CONFIG_PTP_1588_CLOCK_KVM)	+= ptp_kvm.o
diff --git a/drivers/ptp/ptp_kvm.c b/drivers/ptp/ptp_kvm.c
new file mode 100644
index 000000000000..0a54e8326a90
--- /dev/null
+++ b/drivers/ptp/ptp_kvm.c
@@ -0,0 +1,200 @@
+/*
+ * Virtual PTP 1588 clock for use with KVM guests
+ *
+ * Copyright (C) 2017 Red Hat Inc.
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ */
+#include <linux/device.h>
+#include <linux/err.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <uapi/linux/kvm_para.h>
+#include <asm/kvm_para.h>
+#include <asm/pvclock.h>
+#include <asm/kvmclock.h>
+#include <uapi/asm/kvm_para.h>
+
+#include <linux/ptp_clock_kernel.h>
+
+struct kvm_ptp_clock {
+	struct ptp_clock *ptp_clock;
+	struct ptp_clock_info caps;
+};
+
+DEFINE_SPINLOCK(kvm_ptp_lock);
+
+static struct pvclock_vsyscall_time_info *hv_clock;
+
+static struct kvm_clock_pairing clock_pair;
+static phys_addr_t clock_pair_gpa;
+
+static int ptp_kvm_get_time_fn(ktime_t *device_time,
+			       struct system_counterval_t *system_counter,
+			       void *ctx)
+{
+	unsigned long ret;
+	struct timespec64 tspec;
+	unsigned version;
+	int cpu;
+	struct pvclock_vcpu_time_info *src;
+
+	spin_lock(&kvm_ptp_lock);
+
+	preempt_disable_notrace();
+	cpu = smp_processor_id();
+	src = &hv_clock[cpu].pvti;
+
+	do {
+		/*
+		 * We are using a TSC value read in the hosts
+		 * kvm_hc_clock_pairing handling.
+		 * So any changes to tsc_to_system_mul
+		 * and tsc_shift or any other pvclock
+		 * data invalidate that measurement.
+		 */
+		version = pvclock_read_begin(src);
+
+		ret = kvm_hypercall2(KVM_HC_CLOCK_PAIRING,
+				     clock_pair_gpa,
+				     KVM_CLOCK_PAIRING_WALLCLOCK);
+		if (ret != 0) {
+			pr_err_ratelimited("clock pairing hypercall ret %lu\n", ret);
+			spin_unlock(&kvm_ptp_lock);
+			preempt_enable_notrace();
+			return -EOPNOTSUPP;
+		}
+
+		tspec.tv_sec = clock_pair.sec;
+		tspec.tv_nsec = clock_pair.nsec;
+		ret = __pvclock_read_cycles(src, clock_pair.tsc);
+	} while (pvclock_read_retry(src, version));
+
+	preempt_enable_notrace();
+
+	system_counter->cycles = ret;
+	system_counter->cs = &kvm_clock;
+
+	*device_time = timespec64_to_ktime(tspec);
+
+	spin_unlock(&kvm_ptp_lock);
+
+	return 0;
+}
+
+static int ptp_kvm_getcrosststamp(struct ptp_clock_info *ptp,
+				  struct system_device_crosststamp *xtstamp)
+{
+	return get_device_system_crosststamp(ptp_kvm_get_time_fn, NULL,
+					     NULL, xtstamp);
+}
+
+/*
+ * PTP clock operations
+ */
+
+static int ptp_kvm_adjfreq(struct ptp_clock_info *ptp, s32 ppb)
+{
+	return -EOPNOTSUPP;
+}
+
+static int ptp_kvm_adjtime(struct ptp_clock_info *ptp, s64 delta)
+{
+	return -EOPNOTSUPP;
+}
+
+static int ptp_kvm_settime(struct ptp_clock_info *ptp,
+			   const struct timespec64 *ts)
+{
+	return -EOPNOTSUPP;
+}
+
+static int ptp_kvm_gettime(struct ptp_clock_info *ptp, struct timespec64 *ts)
+{
+	unsigned long ret;
+	struct timespec64 tspec;
+
+	spin_lock(&kvm_ptp_lock);
+
+	ret = kvm_hypercall2(KVM_HC_CLOCK_PAIRING,
+			     clock_pair_gpa,
+			     KVM_CLOCK_PAIRING_WALLCLOCK);
+	if (ret != 0) {
+		pr_err_ratelimited("clock offset hypercall ret %lu\n", ret);
+		spin_unlock(&kvm_ptp_lock);
+		return -EOPNOTSUPP;
+	}
+
+	tspec.tv_sec = clock_pair.sec;
+	tspec.tv_nsec = clock_pair.nsec;
+	spin_unlock(&kvm_ptp_lock);
+
+	memcpy(ts, &tspec, sizeof(struct timespec64));
+
+	return 0;
+}
+
+static int ptp_kvm_enable(struct ptp_clock_info *ptp,
+			  struct ptp_clock_request *rq, int on)
+{
+	return -EOPNOTSUPP;
+}
+
+static struct ptp_clock_info ptp_kvm_caps = {
+	.owner		= THIS_MODULE,
+	.name		= "KVM virtual PTP",
+	.max_adj	= 0,
+	.n_ext_ts	= 0,
+	.n_pins		= 0,
+	.pps		= 0,
+	.adjfreq	= ptp_kvm_adjfreq,
+	.adjtime	= ptp_kvm_adjtime,
+	.gettime64	= ptp_kvm_gettime,
+	.settime64	= ptp_kvm_settime,
+	.enable		= ptp_kvm_enable,
+	.getcrosststamp = ptp_kvm_getcrosststamp,
+};
+
+/* module operations */
+
+static struct kvm_ptp_clock kvm_ptp_clock;
+
+static void __exit ptp_kvm_exit(void)
+{
+	ptp_clock_unregister(kvm_ptp_clock.ptp_clock);
+}
+
+static int __init ptp_kvm_init(void)
+{
+	clock_pair_gpa = slow_virt_to_phys(&clock_pair);
+	hv_clock = pvclock_pvti_cpu0_va();
+
+	if (!hv_clock)
+		return -ENODEV;
+
+	kvm_ptp_clock.caps = ptp_kvm_caps;
+
+	kvm_ptp_clock.ptp_clock = ptp_clock_register(&kvm_ptp_clock.caps, NULL);
+
+	if (IS_ERR(kvm_ptp_clock.ptp_clock))
+		return PTR_ERR(kvm_ptp_clock.ptp_clock);
+
+	return 0;
+}
+
+module_init(ptp_kvm_init);
+module_exit(ptp_kvm_exit);
+
+MODULE_AUTHOR("Marcelo Tosatti <mtosatti@redhat.com>");
+MODULE_DESCRIPTION("PTP clock using KVMCLOCK");
+MODULE_LICENSE("GPL");
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index cac48eda1075..7964b970b9ad 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -685,6 +685,13 @@ struct kvm_ppc_smmu_info {
 	struct kvm_ppc_one_seg_page_size sps[KVM_PPC_PAGE_SIZES_MAX_SZ];
 };
 
+/* for KVM_PPC_RESIZE_HPT_{PREPARE,COMMIT} */
+struct kvm_ppc_resize_hpt {
+	__u64 flags;
+	__u32 shift;
+	__u32 pad;
+};
+
 #define KVMIO 0xAE
 
 /* machine type bits, to be used as argument to KVM_CREATE_VM */
@@ -871,6 +878,9 @@ struct kvm_ppc_smmu_info {
 #define KVM_CAP_S390_USER_INSTR0 130
 #define KVM_CAP_MSI_DEVID 131
 #define KVM_CAP_PPC_HTM 132
+#define KVM_CAP_SPAPR_RESIZE_HPT 133
+#define KVM_CAP_PPC_MMU_RADIX 134
+#define KVM_CAP_PPC_MMU_HASH_V3 135
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
@@ -1187,6 +1197,13 @@ struct kvm_s390_ucas_mapping {
 #define KVM_ARM_SET_DEVICE_ADDR	  _IOW(KVMIO,  0xab, struct kvm_arm_device_addr)
 /* Available with KVM_CAP_PPC_RTAS */
 #define KVM_PPC_RTAS_DEFINE_TOKEN _IOW(KVMIO,  0xac, struct kvm_rtas_token_args)
+/* Available with KVM_CAP_SPAPR_RESIZE_HPT */
+#define KVM_PPC_RESIZE_HPT_PREPARE _IOR(KVMIO, 0xad, struct kvm_ppc_resize_hpt)
+#define KVM_PPC_RESIZE_HPT_COMMIT  _IOR(KVMIO, 0xae, struct kvm_ppc_resize_hpt)
+/* Available with KVM_CAP_PPC_RADIX_MMU or KVM_CAP_PPC_HASH_MMU_V3 */
+#define KVM_PPC_CONFIGURE_V3_MMU  _IOW(KVMIO,  0xaf, struct kvm_ppc_mmuv3_cfg)
+/* Available with KVM_CAP_PPC_RADIX_MMU */
+#define KVM_PPC_GET_RMMU_INFO	  _IOW(KVMIO,  0xb0, struct kvm_ppc_rmmu_info)
 
 /* ioctl for vm fd */
 #define KVM_CREATE_DEVICE	  _IOWR(KVMIO,  0xe0, struct kvm_create_device)
diff --git a/include/uapi/linux/kvm_para.h b/include/uapi/linux/kvm_para.h
index bf6cd7d5cac2..fed506aeff62 100644
--- a/include/uapi/linux/kvm_para.h
+++ b/include/uapi/linux/kvm_para.h
@@ -14,6 +14,7 @@
 #define KVM_EFAULT		EFAULT
 #define KVM_E2BIG		E2BIG
 #define KVM_EPERM		EPERM
+#define KVM_EOPNOTSUPP		95
 
 #define KVM_HC_VAPIC_POLL_IRQ		1
 #define KVM_HC_MMU_OP			2
@@ -23,6 +24,7 @@
 #define KVM_HC_MIPS_GET_CLOCK_FREQ	6
 #define KVM_HC_MIPS_EXIT_VM		7
 #define KVM_HC_MIPS_CONSOLE_OUTPUT	8
+#define KVM_HC_CLOCK_PAIRING		9
 
 /*
  * hypercalls use architecture specific
author	Stephen Rothwell <sfr@canb.auug.org.au>	2017-02-09 14:22:45 +1100
committer	Stephen Rothwell <sfr@canb.auug.org.au>	2017-02-09 14:22:45 +1100
commit	f494e0835d2db55f49ba1c97ce7f85b1fccacb45 (patch)
tree	16d13f85796efa859a3736b32966bc9173692e54
parent	e15246babb490d1285a70591b5544d57c427c4d5 (diff)
parent	a0e136d436ded817c0aade72efdefa56a00b4e5e (diff)