86 files changed, 3823 insertions, 844 deletions
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 15f79c27748d..aa8341e73b35 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -1829,6 +1829,18 @@
 			for all guests.
 			Default is 1 (enabled) if in 64-bit or 32-bit PAE mode.
 
+	kvm-arm.vgic_v3_group0_trap=
+			[KVM,ARM] Trap guest accesses to GICv3 group-0
+			system registers
+
+	kvm-arm.vgic_v3_group1_trap=
+			[KVM,ARM] Trap guest accesses to GICv3 group-1
+			system registers
+
+	kvm-arm.vgic_v3_common_trap=
+			[KVM,ARM] Trap guest accesses to GICv3 common
+			system registers
+
 	kvm-intel.ept=	[KVM,Intel] Disable extended page tables
 			(virtualized MMU) support on capable Intel chips.
 			Default is 1 (enabled)
diff --git a/Documentation/arm64/silicon-errata.txt b/Documentation/arm64/silicon-errata.txt
index 10f2dddbf449..f5f93dca54b7 100644
--- a/Documentation/arm64/silicon-errata.txt
+++ b/Documentation/arm64/silicon-errata.txt
@@ -62,6 +62,7 @@ stable kernels.
 | Cavium         | ThunderX GICv3  | #23154          | CAVIUM_ERRATUM_23154        |
 | Cavium         | ThunderX Core   | #27456          | CAVIUM_ERRATUM_27456        |
 | Cavium         | ThunderX SMMUv2 | #27704          | N/A                         |
+| Cavium         | ThunderX Core   | #30115          | CAVIUM_ERRATUM_30115        |
 |                |                 |                 |                             |
 | Freescale/NXP  | LS2080A/LS1043A | A-008585        | FSL_ERRATUM_A008585         |
 |                |                 |                 |                             |
diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
index 63875dbb6ef5..3a9831b72945 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -3255,6 +3255,141 @@ Otherwise, if the MCE is a corrected error, KVM will just
 store it in the corresponding bank (provided this bank is
 not holding a previously reported uncorrected error).
 
+4.107 KVM_S390_GET_CMMA_BITS
+
+Capability: KVM_CAP_S390_CMMA_MIGRATION
+Architectures: s390
+Type: vm ioctl
+Parameters: struct kvm_s390_cmma_log (in, out)
+Returns: 0 on success, a negative value on error
+
+This ioctl is used to get the values of the CMMA bits on the s390
+architecture. It is meant to be used in two scenarios:
+- During live migration to save the CMMA values. Live migration needs
+  to be enabled via the KVM_REQ_START_MIGRATION VM property.
+- To non-destructively peek at the CMMA values, with the flag
+  KVM_S390_CMMA_PEEK set.
+
+The ioctl takes parameters via the kvm_s390_cmma_log struct. The desired
+values are written to a buffer whose location is indicated via the "values"
+member in the kvm_s390_cmma_log struct.  The values in the input struct are
+also updated as needed.
+Each CMMA value takes up one byte.
+
+struct kvm_s390_cmma_log {
+	__u64 start_gfn;
+	__u32 count;
+	__u32 flags;
+	union {
+		__u64 remaining;
+		__u64 mask;
+	};
+	__u64 values;
+};
+
+start_gfn is the number of the first guest frame whose CMMA values are
+to be retrieved,
+
+count is the length of the buffer in bytes,
+
+values points to the buffer where the result will be written to.
+
+If count is greater than KVM_S390_SKEYS_MAX, then it is considered to be
+KVM_S390_SKEYS_MAX. KVM_S390_SKEYS_MAX is re-used for consistency with
+other ioctls.
+
+The result is written in the buffer pointed to by the field values, and
+the values of the input parameter are updated as follows.
+
+Depending on the flags, different actions are performed. The only
+supported flag so far is KVM_S390_CMMA_PEEK.
+
+The default behaviour if KVM_S390_CMMA_PEEK is not set is:
+start_gfn will indicate the first page frame whose CMMA bits were dirty.
+It is not necessarily the same as the one passed as input, as clean pages
+are skipped.
+
+count will indicate the number of bytes actually written in the buffer.
+It can (and very often will) be smaller than the input value, since the
+buffer is only filled until 16 bytes of clean values are found (which
+are then not copied in the buffer). Since a CMMA migration block needs
+the base address and the length, for a total of 16 bytes, we will send
+back some clean data if there is some dirty data afterwards, as long as
+the size of the clean data does not exceed the size of the header. This
+allows to minimize the amount of data to be saved or transferred over
+the network at the expense of more roundtrips to userspace. The next
+invocation of the ioctl will skip over all the clean values, saving
+potentially more than just the 16 bytes we found.
+
+If KVM_S390_CMMA_PEEK is set:
+the existing storage attributes are read even when not in migration
+mode, and no other action is performed;
+
+the output start_gfn will be equal to the input start_gfn,
+
+the output count will be equal to the input count, except if the end of
+memory has been reached.
+
+In both cases:
+the field "remaining" will indicate the total number of dirty CMMA values
+still remaining, or 0 if KVM_S390_CMMA_PEEK is set and migration mode is
+not enabled.
+
+mask is unused.
+
+values points to the userspace buffer where the result will be stored.
+
+This ioctl can fail with -ENOMEM if not enough memory can be allocated to
+complete the task, with -ENXIO if CMMA is not enabled, with -EINVAL if
+KVM_S390_CMMA_PEEK is not set but migration mode was not enabled, with
+-EFAULT if the userspace address is invalid or if no page table is
+present for the addresses (e.g. when using hugepages).
+
+4.108 KVM_S390_SET_CMMA_BITS
+
+Capability: KVM_CAP_S390_CMMA_MIGRATION
+Architectures: s390
+Type: vm ioctl
+Parameters: struct kvm_s390_cmma_log (in)
+Returns: 0 on success, a negative value on error
+
+This ioctl is used to set the values of the CMMA bits on the s390
+architecture. It is meant to be used during live migration to restore
+the CMMA values, but there are no restrictions on its use.
+The ioctl takes parameters via the kvm_s390_cmma_values struct.
+Each CMMA value takes up one byte.
+
+struct kvm_s390_cmma_log {
+	__u64 start_gfn;
+	__u32 count;
+	__u32 flags;
+	union {
+		__u64 remaining;
+		__u64 mask;
+	};
+	__u64 values;
+};
+
+start_gfn indicates the starting guest frame number,
+
+count indicates how many values are to be considered in the buffer,
+
+flags is not used and must be 0.
+
+mask indicates which PGSTE bits are to be considered.
+
+remaining is not used.
+
+values points to the buffer in userspace where to store the values.
+
+This ioctl can fail with -ENOMEM if not enough memory can be allocated to
+complete the task, with -ENXIO if CMMA is not enabled, with -EINVAL if
+the count field is too large (e.g. more than KVM_S390_CMMA_SIZE_MAX) or
+if the flags field was not 0, with -EFAULT if the userspace address is
+invalid, if invalid pages are written to (e.g. after the end of memory)
+or if no page table is present for the addresses (e.g. when using
+hugepages).
+
 5. The kvm_run structure
 ------------------------
 
diff --git a/Documentation/virtual/kvm/devices/s390_flic.txt b/Documentation/virtual/kvm/devices/s390_flic.txt
index c2518cea8ab4..2f1cbf1301d2 100644
--- a/Documentation/virtual/kvm/devices/s390_flic.txt
+++ b/Documentation/virtual/kvm/devices/s390_flic.txt
@@ -16,6 +16,7 @@ FLIC provides support to
 - register and modify adapter interrupt sources (KVM_DEV_FLIC_ADAPTER_*)
 - modify AIS (adapter-interruption-suppression) mode state (KVM_DEV_FLIC_AISM)
 - inject adapter interrupts on a specified adapter (KVM_DEV_FLIC_AIRQ_INJECT)
+- get/set all AIS mode states (KVM_DEV_FLIC_AISM_ALL)
 
 Groups:
   KVM_DEV_FLIC_ENQUEUE
@@ -136,6 +137,20 @@ struct kvm_s390_ais_req {
     an isc according to the adapter-interruption-suppression mode on condition
     that the AIS capability is enabled.
 
+  KVM_DEV_FLIC_AISM_ALL
+    Gets or sets the adapter-interruption-suppression mode for all ISCs. Takes
+    a kvm_s390_ais_all describing:
+
+struct kvm_s390_ais_all {
+       __u8 simm; /* Single-Interruption-Mode mask */
+       __u8 nimm; /* No-Interruption-Mode mask *
+};
+
+    simm contains Single-Interruption-Mode mask for all ISCs, nimm contains
+    No-Interruption-Mode mask for all ISCs. Each bit in simm and nimm corresponds
+    to an ISC (MSB0 bit 0 to ISC 0 and so on). The combination of simm bit and
+    nimm bit presents AIS mode for a ISC.
+
 Note: The KVM_SET_DEVICE_ATTR/KVM_GET_DEVICE_ATTR device ioctls executed on
 FLIC with an unknown group or attribute gives the error code EINVAL (instead of
 ENXIO, as specified in the API documentation). It is not possible to conclude
diff --git a/Documentation/virtual/kvm/devices/vcpu.txt b/Documentation/virtual/kvm/devices/vcpu.txt
index 02f50686c418..2b5dab16c4f2 100644
--- a/Documentation/virtual/kvm/devices/vcpu.txt
+++ b/Documentation/virtual/kvm/devices/vcpu.txt
@@ -16,7 +16,9 @@ Parameters: in kvm_device_attr.addr the address for PMU overflow interrupt is a
 Returns: -EBUSY: The PMU overflow interrupt is already set
          -ENXIO: The overflow interrupt not set when attempting to get it
          -ENODEV: PMUv3 not supported
-         -EINVAL: Invalid PMU overflow interrupt number supplied
+         -EINVAL: Invalid PMU overflow interrupt number supplied or
+                  trying to set the IRQ number without using an in-kernel
+                  irqchip.
 
 A value describing the PMUv3 (Performance Monitor Unit v3) overflow interrupt
 number for this vcpu. This interrupt could be a PPI or SPI, but the interrupt
@@ -25,11 +27,36 @@ all vcpus, while as an SPI it must be a separate number per vcpu.
 
 1.2 ATTRIBUTE: KVM_ARM_VCPU_PMU_V3_INIT
 Parameters: no additional parameter in kvm_device_attr.addr
-Returns: -ENODEV: PMUv3 not supported
-         -ENXIO: PMUv3 not properly configured as required prior to calling this
-                 attribute
+Returns: -ENODEV: PMUv3 not supported or GIC not initialized
+         -ENXIO: PMUv3 not properly configured or in-kernel irqchip not
+                 configured as required prior to calling this attribute
          -EBUSY: PMUv3 already initialized
 
-Request the initialization of the PMUv3.  This must be done after creating the
-in-kernel irqchip.  Creating a PMU with a userspace irqchip is currently not
-supported.
+Request the initialization of the PMUv3.  If using the PMUv3 with an in-kernel
+virtual GIC implementation, this must be done after initializing the in-kernel
+irqchip.
+
+
+2. GROUP: KVM_ARM_VCPU_TIMER_CTRL
+Architectures: ARM,ARM64
+
+2.1. ATTRIBUTE: KVM_ARM_VCPU_TIMER_IRQ_VTIMER
+2.2. ATTRIBUTE: KVM_ARM_VCPU_TIMER_IRQ_PTIMER
+Parameters: in kvm_device_attr.addr the address for the timer interrupt is a
+            pointer to an int
+Returns: -EINVAL: Invalid timer interrupt number
+         -EBUSY:  One or more VCPUs has already run
+
+A value describing the architected timer interrupt number when connected to an
+in-kernel virtual GIC.  These must be a PPI (16 <= intid < 32).  Setting the
+attribute overrides the default values (see below).
+
+KVM_ARM_VCPU_TIMER_IRQ_VTIMER: The EL1 virtual timer intid (default: 27)
+KVM_ARM_VCPU_TIMER_IRQ_PTIMER: The EL1 physical timer intid (default: 30)
+
+Setting the same PPI for different timers will prevent the VCPUs from running.
+Setting the interrupt number on a VCPU configures all VCPUs created at that
+time to use the number provided for a given timer, overwriting any previously
+configured values on other VCPUs.  Userspace should configure the interrupt
+numbers on at least one VCPU after creating all VCPUs and before running any
+VCPUs.
diff --git a/Documentation/virtual/kvm/devices/vm.txt b/Documentation/virtual/kvm/devices/vm.txt
index 575ccb022aac..903fc926860b 100644
--- a/Documentation/virtual/kvm/devices/vm.txt
+++ b/Documentation/virtual/kvm/devices/vm.txt
@@ -222,3 +222,36 @@ Allows user space to disable dea key wrapping, clearing the wrapping key.
 
 Parameters: none
 Returns:    0
+
+5. GROUP: KVM_S390_VM_MIGRATION
+Architectures: s390
+
+5.1. ATTRIBUTE: KVM_S390_VM_MIGRATION_STOP (w/o)
+
+Allows userspace to stop migration mode, needed for PGSTE migration.
+Setting this attribute when migration mode is not active will have no
+effects.
+
+Parameters: none
+Returns:    0
+
+5.2. ATTRIBUTE: KVM_S390_VM_MIGRATION_START (w/o)
+
+Allows userspace to start migration mode, needed for PGSTE migration.
+Setting this attribute when migration mode is already active will have
+no effects.
+
+Parameters: none
+Returns:    -ENOMEM if there is not enough free memory to start migration mode
+	    -EINVAL if the state of the VM is invalid (e.g. no memory defined)
+	    0 in case of success.
+
+5.3. ATTRIBUTE: KVM_S390_VM_MIGRATION_STATUS (r/o)
+
+Allows userspace to query the status of migration mode.
+
+Parameters: address of a buffer in user space to store the data (u64) to;
+	    the data itself is either 0 if migration mode is disabled or 1
+	    if it is enabled
+Returns:    -EFAULT if the given address is not accessible from kernel space
+	    0 in case of success.
diff --git a/Documentation/virtual/kvm/vcpu-requests.rst b/Documentation/virtual/kvm/vcpu-requests.rst
new file mode 100644
index 000000000000..5feb3706a7ae
--- /dev/null
+++ b/Documentation/virtual/kvm/vcpu-requests.rst
@@ -0,0 +1,307 @@
+=================
+KVM VCPU Requests
+=================
+
+Overview
+========
+
+KVM supports an internal API enabling threads to request a VCPU thread to
+perform some activity.  For example, a thread may request a VCPU to flush
+its TLB with a VCPU request.  The API consists of the following functions::
+
+  /* Check if any requests are pending for VCPU @vcpu. */
+  bool kvm_request_pending(struct kvm_vcpu *vcpu);
+
+  /* Check if VCPU @vcpu has request @req pending. */
+  bool kvm_test_request(int req, struct kvm_vcpu *vcpu);
+
+  /* Clear request @req for VCPU @vcpu. */
+  void kvm_clear_request(int req, struct kvm_vcpu *vcpu);
+
+  /*
+   * Check if VCPU @vcpu has request @req pending. When the request is
+   * pending it will be cleared and a memory barrier, which pairs with
+   * another in kvm_make_request(), will be issued.
+   */
+  bool kvm_check_request(int req, struct kvm_vcpu *vcpu);
+
+  /*
+   * Make request @req of VCPU @vcpu. Issues a memory barrier, which pairs
+   * with another in kvm_check_request(), prior to setting the request.
+   */
+  void kvm_make_request(int req, struct kvm_vcpu *vcpu);
+
+  /* Make request @req of all VCPUs of the VM with struct kvm @kvm. */
+  bool kvm_make_all_cpus_request(struct kvm *kvm, unsigned int req);
+
+Typically a requester wants the VCPU to perform the activity as soon
+as possible after making the request.  This means most requests
+(kvm_make_request() calls) are followed by a call to kvm_vcpu_kick(),
+and kvm_make_all_cpus_request() has the kicking of all VCPUs built
+into it.
+
+VCPU Kicks
+----------
+
+The goal of a VCPU kick is to bring a VCPU thread out of guest mode in
+order to perform some KVM maintenance.  To do so, an IPI is sent, forcing
+a guest mode exit.  However, a VCPU thread may not be in guest mode at the
+time of the kick.  Therefore, depending on the mode and state of the VCPU
+thread, there are two other actions a kick may take.  All three actions
+are listed below:
+
+1) Send an IPI.  This forces a guest mode exit.
+2) Waking a sleeping VCPU.  Sleeping VCPUs are VCPU threads outside guest
+   mode that wait on waitqueues.  Waking them removes the threads from
+   the waitqueues, allowing the threads to run again.  This behavior
+   may be suppressed, see KVM_REQUEST_NO_WAKEUP below.
+3) Nothing.  When the VCPU is not in guest mode and the VCPU thread is not
+   sleeping, then there is nothing to do.
+
+VCPU Mode
+---------
+
+VCPUs have a mode state, ``vcpu->mode``, that is used to track whether the
+guest is running in guest mode or not, as well as some specific
+outside guest mode states.  The architecture may use ``vcpu->mode`` to
+ensure VCPU requests are seen by VCPUs (see "Ensuring Requests Are Seen"),
+as well as to avoid sending unnecessary IPIs (see "IPI Reduction"), and
+even to ensure IPI acknowledgements are waited upon (see "Waiting for
+Acknowledgements").  The following modes are defined:
+
+OUTSIDE_GUEST_MODE
+
+  The VCPU thread is outside guest mode.
+
+IN_GUEST_MODE
+
+  The VCPU thread is in guest mode.
+
+EXITING_GUEST_MODE
+
+  The VCPU thread is transitioning from IN_GUEST_MODE to
+  OUTSIDE_GUEST_MODE.
+
+READING_SHADOW_PAGE_TABLES
+
+  The VCPU thread is outside guest mode, but it wants the sender of
+  certain VCPU requests, namely KVM_REQ_TLB_FLUSH, to wait until the VCPU
+  thread is done reading the page tables.
+
+VCPU Request Internals
+======================
+
+VCPU requests are simply bit indices of the ``vcpu->requests`` bitmap.
+This means general bitops, like those documented in [atomic-ops]_ could
+also be used, e.g. ::
+
+  clear_bit(KVM_REQ_UNHALT & KVM_REQUEST_MASK, &vcpu->requests);
+
+However, VCPU request users should refrain from doing so, as it would
+break the abstraction.  The first 8 bits are reserved for architecture
+independent requests, all additional bits are available for architecture
+dependent requests.
+
+Architecture Independent Requests
+---------------------------------
+
+KVM_REQ_TLB_FLUSH
+
+  KVM's common MMU notifier may need to flush all of a guest's TLB
+  entries, calling kvm_flush_remote_tlbs() to do so.  Architectures that
+  choose to use the common kvm_flush_remote_tlbs() implementation will
+  need to handle this VCPU request.
+
+KVM_REQ_MMU_RELOAD
+
+  When shadow page tables are used and memory slots are removed it's
+  necessary to inform each VCPU to completely refresh the tables.  This
+  request is used for that.
+
+KVM_REQ_PENDING_TIMER
+
+  This request may be made from a timer handler run on the host on behalf
+  of a VCPU.  It informs the VCPU thread to inject a timer interrupt.
+
+KVM_REQ_UNHALT
+
+  This request may be made from the KVM common function kvm_vcpu_block(),
+  which is used to emulate an instruction that causes a CPU to halt until
+  one of an architectural specific set of events and/or interrupts is
+  received (determined by checking kvm_arch_vcpu_runnable()).  When that
+  event or interrupt arrives kvm_vcpu_block() makes the request.  This is
+  in contrast to when kvm_vcpu_block() returns due to any other reason,
+  such as a pending signal, which does not indicate the VCPU's halt
+  emulation should stop, and therefore does not make the request.
+
+KVM_REQUEST_MASK
+----------------
+
+VCPU requests should be masked by KVM_REQUEST_MASK before using them with
+bitops.  This is because only the lower 8 bits are used to represent the
+request's number.  The upper bits are used as flags.  Currently only two
+flags are defined.
+
+VCPU Request Flags
+------------------
+
+KVM_REQUEST_NO_WAKEUP
+
+  This flag is applied to requests that only need immediate attention
+  from VCPUs running in guest mode.  That is, sleeping VCPUs do not need
+  to be awaken for these requests.  Sleeping VCPUs will handle the
+  requests when they are awaken later for some other reason.
+
+KVM_REQUEST_WAIT
+
+  When requests with this flag are made with kvm_make_all_cpus_request(),
+  then the caller will wait for each VCPU to acknowledge its IPI before
+  proceeding.  This flag only applies to VCPUs that would receive IPIs.
+  If, for example, the VCPU is sleeping, so no IPI is necessary, then
+  the requesting thread does not wait.  This means that this flag may be
+  safely combined with KVM_REQUEST_NO_WAKEUP.  See "Waiting for
+  Acknowledgements" for more information about requests with
+  KVM_REQUEST_WAIT.
+
+VCPU Requests with Associated State
+===================================
+
+Requesters that want the receiving VCPU to handle new state need to ensure
+the newly written state is observable to the receiving VCPU thread's CPU
+by the time it observes the request.  This means a write memory barrier
+must be inserted after writing the new state and before setting the VCPU
+request bit.  Additionally, on the receiving VCPU thread's side, a
+corresponding read barrier must be inserted after reading the request bit
+and before proceeding to read the new state associated with it.  See
+scenario 3, Message and Flag, of [lwn-mb]_ and the kernel documentation
+[memory-barriers]_.
+
+The pair of functions, kvm_check_request() and kvm_make_request(), provide
+the memory barriers, allowing this requirement to be handled internally by
+the API.
+
+Ensuring Requests Are Seen
+==========================
+
+When making requests to VCPUs, we want to avoid the receiving VCPU
+executing in guest mode for an arbitrary long time without handling the
+request.  We can be sure this won't happen as long as we ensure the VCPU
+thread checks kvm_request_pending() before entering guest mode and that a
+kick will send an IPI to force an exit from guest mode when necessary.
+Extra care must be taken to cover the period after the VCPU thread's last
+kvm_request_pending() check and before it has entered guest mode, as kick
+IPIs will only trigger guest mode exits for VCPU threads that are in guest
+mode or at least have already disabled interrupts in order to prepare to
+enter guest mode.  This means that an optimized implementation (see "IPI
+Reduction") must be certain when it's safe to not send the IPI.  One
+solution, which all architectures except s390 apply, is to:
+
+- set ``vcpu->mode`` to IN_GUEST_MODE between disabling the interrupts and
+  the last kvm_request_pending() check;
+- enable interrupts atomically when entering the guest.
+
+This solution also requires memory barriers to be placed carefully in both
+the requesting thread and the receiving VCPU.  With the memory barriers we
+can exclude the possibility of a VCPU thread observing
+!kvm_request_pending() on its last check and then not receiving an IPI for
+the next request made of it, even if the request is made immediately after
+the check.  This is done by way of the Dekker memory barrier pattern
+(scenario 10 of [lwn-mb]_).  As the Dekker pattern requires two variables,
+this solution pairs ``vcpu->mode`` with ``vcpu->requests``.  Substituting
+them into the pattern gives::
+
+  CPU1                                    CPU2
+  =================                       =================
+  local_irq_disable();
+  WRITE_ONCE(vcpu->mode, IN_GUEST_MODE);  kvm_make_request(REQ, vcpu);
+  smp_mb();                               smp_mb();
+  if (kvm_request_pending(vcpu)) {        if (READ_ONCE(vcpu->mode) ==
+                                              IN_GUEST_MODE) {
+      ...abort guest entry...                 ...send IPI...
+  }                                       }
+
+As stated above, the IPI is only useful for VCPU threads in guest mode or
+that have already disabled interrupts.  This is why this specific case of
+the Dekker pattern has been extended to disable interrupts before setting
+``vcpu->mode`` to IN_GUEST_MODE.  WRITE_ONCE() and READ_ONCE() are used to
+pedantically implement the memory barrier pattern, guaranteeing the
+compiler doesn't interfere with ``vcpu->mode``'s carefully planned
+accesses.
+
+IPI Reduction
+-------------
+
+As only one IPI is needed to get a VCPU to check for any/all requests,
+then they may be coalesced.  This is easily done by having the first IPI
+sending kick also change the VCPU mode to something !IN_GUEST_MODE.  The
+transitional state, EXITING_GUEST_MODE, is used for this purpose.
+
+Waiting for Acknowledgements
+----------------------------
+
+Some requests, those with the KVM_REQUEST_WAIT flag set, require IPIs to
+be sent, and the acknowledgements to be waited upon, even when the target
+VCPU threads are in modes other than IN_GUEST_MODE.  For example, one case
+is when a target VCPU thread is in READING_SHADOW_PAGE_TABLES mode, which
+is set after disabling interrupts.  To support these cases, the
+KVM_REQUEST_WAIT flag changes the condition for sending an IPI from
+checking that the VCPU is IN_GUEST_MODE to checking that it is not
+OUTSIDE_GUEST_MODE.
+
+Request-less VCPU Kicks
+-----------------------
+
+As the determination of whether or not to send an IPI depends on the
+two-variable Dekker memory barrier pattern, then it's clear that
+request-less VCPU kicks are almost never correct.  Without the assurance
+that a non-IPI generating kick will still result in an action by the
+receiving VCPU, as the final kvm_request_pending() check does for
+request-accompanying kicks, then the kick may not do anything useful at
+all.  If, for instance, a request-less kick was made to a VCPU that was
+just about to set its mode to IN_GUEST_MODE, meaning no IPI is sent, then
+the VCPU thread may continue its entry without actually having done
+whatever it was the kick was meant to initiate.
+
+One exception is x86's posted interrupt mechanism.  In this case, however,
+even the request-less VCPU kick is coupled with the same
+local_irq_disable() + smp_mb() pattern described above; the ON bit
+(Outstanding Notification) in the posted interrupt descriptor takes the
+role of ``vcpu->requests``.  When sending a posted interrupt, PIR.ON is
+set before reading ``vcpu->mode``; dually, in the VCPU thread,
+vmx_sync_pir_to_irr() reads PIR after setting ``vcpu->mode`` to
+IN_GUEST_MODE.
+
+Additional Considerations
+=========================
+
+Sleeping VCPUs
+--------------
+
+VCPU threads may need to consider requests before and/or after calling
+functions that may put them to sleep, e.g. kvm_vcpu_block().  Whether they
+do or not, and, if they do, which requests need consideration, is
+architecture dependent.  kvm_vcpu_block() calls kvm_arch_vcpu_runnable()
+to check if it should awaken.  One reason to do so is to provide
+architectures a function where requests may be checked if necessary.
+
+Clearing Requests
+-----------------
+
+Generally it only makes sense for the receiving VCPU thread to clear a
+request.  However, in some circumstances, such as when the requesting
+thread and the receiving VCPU thread are executed serially, such as when
+they are the same thread, or when they are using some form of concurrency
+control to temporarily execute synchronously, then it's possible to know
+that the request may be cleared immediately, rather than waiting for the
+receiving VCPU thread to handle the request in VCPU RUN.  The only current
+examples of this are kvm_vcpu_block() calls made by VCPUs to block
+themselves.  A possible side-effect of that call is to make the
+KVM_REQ_UNHALT request, which may then be cleared immediately when the
+VCPU returns from the call.
+
+References
+==========
+
+.. [atomic-ops] Documentation/core-api/atomic_ops.rst
+.. [memory-barriers] Documentation/memory-barriers.txt
+.. [lwn-mb] https://lwn.net/Articles/573436/
diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h
index f0e66577ce05..127e2dd2e21c 100644
--- a/arch/arm/include/asm/kvm_host.h
+++ b/arch/arm/include/asm/kvm_host.h
@@ -44,7 +44,9 @@
 #define KVM_MAX_VCPUS VGIC_V2_MAX_CPUS
 #endif
 
-#define KVM_REQ_VCPU_EXIT	(8 | KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP)
+#define KVM_REQ_SLEEP \
+	KVM_ARCH_REQ_FLAGS(0, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP)
+#define KVM_REQ_IRQ_PENDING	KVM_ARCH_REQ(1)
 
 u32 *kvm_vcpu_reg(struct kvm_vcpu *vcpu, u8 reg_num, u32 mode);
 int __attribute_const__ kvm_target_cpu(void);
@@ -233,8 +235,6 @@ struct kvm_vcpu *kvm_arm_get_running_vcpu(void);
 struct kvm_vcpu __percpu **kvm_get_running_vcpus(void);
 void kvm_arm_halt_guest(struct kvm *kvm);
 void kvm_arm_resume_guest(struct kvm *kvm);
-void kvm_arm_halt_vcpu(struct kvm_vcpu *vcpu);
-void kvm_arm_resume_vcpu(struct kvm_vcpu *vcpu);
 
 int kvm_arm_copy_coproc_indices(struct kvm_vcpu *vcpu, u64 __user *uindices);
 unsigned long kvm_arm_num_coproc_regs(struct kvm_vcpu *vcpu);
@@ -291,20 +291,12 @@ static inline void kvm_arm_init_debug(void) {}
 static inline void kvm_arm_setup_debug(struct kvm_vcpu *vcpu) {}
 static inline void kvm_arm_clear_debug(struct kvm_vcpu *vcpu) {}
 static inline void kvm_arm_reset_debug_ptr(struct kvm_vcpu *vcpu) {}
-static inline int kvm_arm_vcpu_arch_set_attr(struct kvm_vcpu *vcpu,
-					     struct kvm_device_attr *attr)
-{
-	return -ENXIO;
-}
-static inline int kvm_arm_vcpu_arch_get_attr(struct kvm_vcpu *vcpu,
-					     struct kvm_device_attr *attr)
-{
-	return -ENXIO;
-}
-static inline int kvm_arm_vcpu_arch_has_attr(struct kvm_vcpu *vcpu,
-					     struct kvm_device_attr *attr)
-{
-	return -ENXIO;
-}
+
+int kvm_arm_vcpu_arch_set_attr(struct kvm_vcpu *vcpu,
+			       struct kvm_device_attr *attr);
+int kvm_arm_vcpu_arch_get_attr(struct kvm_vcpu *vcpu,
+			       struct kvm_device_attr *attr);
+int kvm_arm_vcpu_arch_has_attr(struct kvm_vcpu *vcpu,
+			       struct kvm_device_attr *attr);
 
 #endif /* __ARM_KVM_HOST_H__ */
diff --git a/arch/arm/include/uapi/asm/kvm.h b/arch/arm/include/uapi/asm/kvm.h
index 5e3c673fa3f4..5db2d4c6a55f 100644
--- a/arch/arm/include/uapi/asm/kvm.h
+++ b/arch/arm/include/uapi/asm/kvm.h
@@ -203,6 +203,14 @@ struct kvm_arch_memory_slot {
 #define KVM_DEV_ARM_VGIC_LINE_LEVEL_INTID_MASK 0x3ff
 #define VGIC_LEVEL_INFO_LINE_LEVEL	0
 
+/* Device Control API on vcpu fd */
+#define KVM_ARM_VCPU_PMU_V3_CTRL	0
+#define   KVM_ARM_VCPU_PMU_V3_IRQ	0
+#define   KVM_ARM_VCPU_PMU_V3_INIT	1
+#define KVM_ARM_VCPU_TIMER_CTRL		1
+#define   KVM_ARM_VCPU_TIMER_IRQ_VTIMER		0
+#define   KVM_ARM_VCPU_TIMER_IRQ_PTIMER		1
+
 #define   KVM_DEV_ARM_VGIC_CTRL_INIT		0
 #define   KVM_DEV_ARM_ITS_SAVE_TABLES		1
 #define   KVM_DEV_ARM_ITS_RESTORE_TABLES	2
diff --git a/arch/arm/kvm/guest.c b/arch/arm/kvm/guest.c
index fa6182a40941..1e0784ebbfd6 100644
--- a/arch/arm/kvm/guest.c
+++ b/arch/arm/kvm/guest.c
@@ -301,3 +301,54 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
 {
 	return -EINVAL;
 }
+
+int kvm_arm_vcpu_arch_set_attr(struct kvm_vcpu *vcpu,
+			       struct kvm_device_attr *attr)
+{
+	int ret;
+
+	switch (attr->group) {
+	case KVM_ARM_VCPU_TIMER_CTRL:
+		ret = kvm_arm_timer_set_attr(vcpu, attr);
+		break;
+	default:
+		ret = -ENXIO;
+		break;
+	}
+
+	return ret;
+}
+
+int kvm_arm_vcpu_arch_get_attr(struct kvm_vcpu *vcpu,
+			       struct kvm_device_attr *attr)
+{
+	int ret;
+
+	switch (attr->group) {
+	case KVM_ARM_VCPU_TIMER_CTRL:
+		ret = kvm_arm_timer_get_attr(vcpu, attr);
+		break;
+	default:
+		ret = -ENXIO;
+		break;
+	}
+
+	return ret;
+}
+
+int kvm_arm_vcpu_arch_has_attr(struct kvm_vcpu *vcpu,
+			       struct kvm_device_attr *attr)
+{
+	int ret;
+
+	switch (attr->group) {
+	case KVM_ARM_VCPU_TIMER_CTRL:
+		ret = kvm_arm_timer_has_attr(vcpu, attr);
+		break;
+	default:
+		ret = -ENXIO;
+		break;
+	}
+
+	return ret;
+}
diff --git a/arch/arm/kvm/handle_exit.c b/arch/arm/kvm/handle_exit.c
index f86a9aaef462..54442e375354 100644
--- a/arch/arm/kvm/handle_exit.c
+++ b/arch/arm/kvm/handle_exit.c
@@ -72,6 +72,7 @@ static int kvm_handle_wfx(struct kvm_vcpu *vcpu, struct kvm_run *run)
 		trace_kvm_wfx(*vcpu_pc(vcpu), false);
 		vcpu->stat.wfi_exit_stat++;
 		kvm_vcpu_block(vcpu);
+		kvm_clear_request(KVM_REQ_UNHALT, vcpu);
 	}
 
 	kvm_skip_instr(vcpu, kvm_vcpu_trap_il_is32bit(vcpu));
diff --git a/arch/arm/kvm/hyp/switch.c b/arch/arm/kvm/hyp/switch.c
index 624a510d31df..ebd2dd46adf7 100644
--- a/arch/arm/kvm/hyp/switch.c
+++ b/arch/arm/kvm/hyp/switch.c
@@ -237,8 +237,10 @@ void __hyp_text __noreturn __hyp_panic(int cause)
 
 		vcpu = (struct kvm_vcpu *)read_sysreg(HTPIDR);
 		host_ctxt = kern_hyp_va(vcpu->arch.host_cpu_context);
+		__timer_save_state(vcpu);
 		__deactivate_traps(vcpu);
 		__deactivate_vm(vcpu);
+		__banked_restore_state(host_ctxt);
 		__sysreg_restore_state(host_ctxt);
 	}
 
diff --git a/arch/arm/kvm/init.S b/arch/arm/kvm/init.S
index 570ed4a9c261..5386528665b5 100644
--- a/arch/arm/kvm/init.S
+++ b/arch/arm/kvm/init.S
@@ -104,7 +104,6 @@ __do_hyp_init:
 	@  - Write permission implies XN: disabled
 	@  - Instruction cache: enabled
 	@  - Data/Unified cache: enabled
-	@  - Memory alignment checks: enabled
 	@  - MMU: enabled (this code must be run from an identity mapping)
 	mrc	p15, 4, r0, c1, c0, 0	@ HSCR
 	ldr	r2, =HSCTLR_MASK
@@ -112,8 +111,8 @@ __do_hyp_init:
 	mrc	p15, 0, r1, c1, c0, 0	@ SCTLR
 	ldr	r2, =(HSCTLR_EE | HSCTLR_FI | HSCTLR_I | HSCTLR_C)
 	and	r1, r1, r2
- ARM(	ldr	r2, =(HSCTLR_M | HSCTLR_A)			)
- THUMB(	ldr	r2, =(HSCTLR_M | HSCTLR_A | HSCTLR_TE)		)
+ ARM(	ldr	r2, =(HSCTLR_M)					)
+ THUMB(	ldr	r2, =(HSCTLR_M | HSCTLR_TE)			)
 	orr	r1, r1, r2
 	orr	r0, r0, r1
 	mcr	p15, 4, r0, c1, c0, 0	@ HSCR
diff --git a/arch/arm/kvm/reset.c b/arch/arm/kvm/reset.c
index 1da8b2d14550..5ed0c3ee33d6 100644
--- a/arch/arm/kvm/reset.c
+++ b/arch/arm/kvm/reset.c
@@ -37,16 +37,6 @@ static struct kvm_regs cortexa_regs_reset = {
 	.usr_regs.ARM_cpsr = SVC_MODE | PSR_A_BIT | PSR_I_BIT | PSR_F_BIT,
 };
 
-static const struct kvm_irq_level cortexa_ptimer_irq = {
-	{ .irq = 30 },
-	.level = 1,
-};
-
-static const struct kvm_irq_level cortexa_vtimer_irq = {
-	{ .irq = 27 },
-	.level = 1,
-};
-
 
 /*******************************************************************************
  * Exported reset function
@@ -62,16 +52,12 @@ static const struct kvm_irq_level cortexa_vtimer_irq = {
 int kvm_reset_vcpu(struct kvm_vcpu *vcpu)
 {
 	struct kvm_regs *reset_regs;
-	const struct kvm_irq_level *cpu_vtimer_irq;
-	const struct kvm_irq_level *cpu_ptimer_irq;
 
 	switch (vcpu->arch.target) {
 	case KVM_ARM_TARGET_CORTEX_A7:
 	case KVM_ARM_TARGET_CORTEX_A15:
 		reset_regs = &cortexa_regs_reset;
 		vcpu->arch.midr = read_cpuid_id();
-		cpu_vtimer_irq = &cortexa_vtimer_irq;
-		cpu_ptimer_irq = &cortexa_ptimer_irq;
 		break;
 	default:
 		return -ENODEV;
@@ -84,5 +70,5 @@ int kvm_reset_vcpu(struct kvm_vcpu *vcpu)
 	kvm_reset_coprocs(vcpu);
 
 	/* Reset arch_timer context */
-	return kvm_timer_vcpu_reset(vcpu, cpu_vtimer_irq, cpu_ptimer_irq);
+	return kvm_timer_vcpu_reset(vcpu);
 }
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 3dcd7ec69bca..6252365b0c96 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -480,6 +480,17 @@ config CAVIUM_ERRATUM_27456
 
 	  If unsure, say Y.
 
+config CAVIUM_ERRATUM_30115
+	bool "Cavium erratum 30115: Guest may disable interrupts in host"
+	default y
+	help
+	  On ThunderX T88 pass 1.x through 2.2, T81 pass 1.0 through
+	  1.2, and T83 Pass 1.0, KVM guest execution may disable
+	  interrupts in host. Trapping both GICv3 group-0 and group-1
+	  accesses sidesteps the issue.
+
+	  If unsure, say Y.
+
 config QCOM_FALKOR_ERRATUM_1003
 	bool "Falkor E1003: Incorrect translation due to ASID change"
 	default y
diff --git a/arch/arm64/include/asm/arch_gicv3.h b/arch/arm64/include/asm/arch_gicv3.h
index 1a98bc8602a2..8cef47fa2218 100644
--- a/arch/arm64/include/asm/arch_gicv3.h
+++ b/arch/arm64/include/asm/arch_gicv3.h
@@ -89,7 +89,7 @@ static inline void gic_write_ctlr(u32 val)
 
 static inline void gic_write_grpen1(u32 val)
 {
-	write_sysreg_s(val, SYS_ICC_GRPEN1_EL1);
+	write_sysreg_s(val, SYS_ICC_IGRPEN1_EL1);
 	isb();
 }
 
diff --git a/arch/arm64/include/asm/cpucaps.h b/arch/arm64/include/asm/cpucaps.h
index b3aab8a17868..8d2272c6822c 100644
--- a/arch/arm64/include/asm/cpucaps.h
+++ b/arch/arm64/include/asm/cpucaps.h
@@ -38,7 +38,8 @@
 #define ARM64_WORKAROUND_REPEAT_TLBI		17
 #define ARM64_WORKAROUND_QCOM_FALKOR_E1003	18
 #define ARM64_WORKAROUND_858921			19
+#define ARM64_WORKAROUND_CAVIUM_30115		20
 
-#define ARM64_NCAPS				20
+#define ARM64_NCAPS				21
 
 #endif /* __ASM_CPUCAPS_H */
diff --git a/arch/arm64/include/asm/cputype.h b/arch/arm64/include/asm/cputype.h
index 0984d1b3a8f2..235e77d98261 100644
--- a/arch/arm64/include/asm/cputype.h
+++ b/arch/arm64/include/asm/cputype.h
@@ -86,6 +86,7 @@
 
 #define CAVIUM_CPU_PART_THUNDERX	0x0A1
 #define CAVIUM_CPU_PART_THUNDERX_81XX	0x0A2
+#define CAVIUM_CPU_PART_THUNDERX_83XX	0x0A3
 
 #define BRCM_CPU_PART_VULCAN		0x516
 
@@ -96,6 +97,7 @@
 #define MIDR_CORTEX_A73 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A73)
 #define MIDR_THUNDERX	MIDR_CPU_MODEL(ARM_CPU_IMP_CAVIUM, CAVIUM_CPU_PART_THUNDERX)
 #define MIDR_THUNDERX_81XX MIDR_CPU_MODEL(ARM_CPU_IMP_CAVIUM, CAVIUM_CPU_PART_THUNDERX_81XX)
+#define MIDR_THUNDERX_83XX MIDR_CPU_MODEL(ARM_CPU_IMP_CAVIUM, CAVIUM_CPU_PART_THUNDERX_83XX)
 #define MIDR_QCOM_FALKOR_V1 MIDR_CPU_MODEL(ARM_CPU_IMP_QCOM, QCOM_CPU_PART_FALKOR_V1)
 
 #ifndef __ASSEMBLY__
diff --git a/arch/arm64/include/asm/esr.h b/arch/arm64/include/asm/esr.h
index 85997c0e5443..e7d8e281ff62 100644
--- a/arch/arm64/include/asm/esr.h
+++ b/arch/arm64/include/asm/esr.h
@@ -19,6 +19,7 @@
 #define __ASM_ESR_H
 
 #include <asm/memory.h>
+#include <asm/sysreg.h>
 
 #define ESR_ELx_EC_UNKNOWN	(0x00)
 #define ESR_ELx_EC_WFx		(0x01)
@@ -181,6 +182,29 @@
 #define ESR_ELx_SYS64_ISS_SYS_CNTFRQ	(ESR_ELx_SYS64_ISS_SYS_VAL(3, 3, 0, 14, 0) | \
 					 ESR_ELx_SYS64_ISS_DIR_READ)
 
+#define esr_sys64_to_sysreg(e)					\
+	sys_reg((((e) & ESR_ELx_SYS64_ISS_OP0_MASK) >>		\
+		 ESR_ELx_SYS64_ISS_OP0_SHIFT),			\
+		(((e) & ESR_ELx_SYS64_ISS_OP1_MASK) >>		\
+		 ESR_ELx_SYS64_ISS_OP1_SHIFT),			\
+		(((e) & ESR_ELx_SYS64_ISS_CRN_MASK) >>		\
+		 ESR_ELx_SYS64_ISS_CRN_SHIFT),			\
+		(((e) & ESR_ELx_SYS64_ISS_CRM_MASK) >>		\
+		 ESR_ELx_SYS64_ISS_CRM_SHIFT),			\
+		(((e) & ESR_ELx_SYS64_ISS_OP2_MASK) >>		\
+		 ESR_ELx_SYS64_ISS_OP2_SHIFT))
+
+#define esr_cp15_to_sysreg(e)					\
+	sys_reg(3,						\
+		(((e) & ESR_ELx_SYS64_ISS_OP1_MASK) >>		\
+		 ESR_ELx_SYS64_ISS_OP1_SHIFT),			\
+		(((e) & ESR_ELx_SYS64_ISS_CRN_MASK) >>		\
+		 ESR_ELx_SYS64_ISS_CRN_SHIFT),			\
+		(((e) & ESR_ELx_SYS64_ISS_CRM_MASK) >>		\
+		 ESR_ELx_SYS64_ISS_CRM_SHIFT),			\
+		(((e) & ESR_ELx_SYS64_ISS_OP2_MASK) >>		\
+		 ESR_ELx_SYS64_ISS_OP2_SHIFT))
+
 #ifndef __ASSEMBLY__
 #include <asm/types.h>
 
diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
index 1f252a95bc02..d68630007b14 100644
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@@ -42,7 +42,9 @@
 
 #define KVM_VCPU_MAX_FEATURES 4
 
-#define KVM_REQ_VCPU_EXIT	(8 | KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP)
+#define KVM_REQ_SLEEP \
+	KVM_ARCH_REQ_FLAGS(0, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP)
+#define KVM_REQ_IRQ_PENDING	KVM_ARCH_REQ(1)
 
 int __attribute_const__ kvm_target_cpu(void);
 int kvm_reset_vcpu(struct kvm_vcpu *vcpu);
@@ -334,8 +336,6 @@ struct kvm_vcpu *kvm_arm_get_running_vcpu(void);
 struct kvm_vcpu * __percpu *kvm_get_running_vcpus(void);
 void kvm_arm_halt_guest(struct kvm *kvm);
 void kvm_arm_resume_guest(struct kvm *kvm);
-void kvm_arm_halt_vcpu(struct kvm_vcpu *vcpu);
-void kvm_arm_resume_vcpu(struct kvm_vcpu *vcpu);
 
 u64 __kvm_call_hyp(void *hypfn, ...);
 #define kvm_call_hyp(f, ...) __kvm_call_hyp(kvm_ksym_ref(f), ##__VA_ARGS__)
diff --git a/arch/arm64/include/asm/kvm_hyp.h b/arch/arm64/include/asm/kvm_hyp.h
index b18e852d27e8..4572a9b560fa 100644
--- a/arch/arm64/include/asm/kvm_hyp.h
+++ b/arch/arm64/include/asm/kvm_hyp.h
@@ -127,6 +127,7 @@ int __vgic_v2_perform_cpuif_access(struct kvm_vcpu *vcpu);
 
 void __vgic_v3_save_state(struct kvm_vcpu *vcpu);
 void __vgic_v3_restore_state(struct kvm_vcpu *vcpu);
+int __vgic_v3_perform_cpuif_access(struct kvm_vcpu *vcpu);
 
 void __timer_save_state(struct kvm_vcpu *vcpu);
 void __timer_restore_state(struct kvm_vcpu *vcpu);
diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h
index 15c142ce991c..16e44fa9b3b6 100644
--- a/arch/arm64/include/asm/sysreg.h
+++ b/arch/arm64/include/asm/sysreg.h
@@ -180,14 +180,31 @@
 
 #define SYS_VBAR_EL1			sys_reg(3, 0, 12, 0, 0)
 
+#define SYS_ICC_IAR0_EL1		sys_reg(3, 0, 12, 8, 0)
+#define SYS_ICC_EOIR0_EL1		sys_reg(3, 0, 12, 8, 1)
+#define SYS_ICC_HPPIR0_EL1		sys_reg(3, 0, 12, 8, 2)
+#define SYS_ICC_BPR0_EL1		sys_reg(3, 0, 12, 8, 3)
+#define SYS_ICC_AP0Rn_EL1(n)		sys_reg(3, 0, 12, 8, 4 | n)
+#define SYS_ICC_AP0R0_EL1		SYS_ICC_AP0Rn_EL1(0)
+#define SYS_ICC_AP0R1_EL1		SYS_ICC_AP0Rn_EL1(1)
+#define SYS_ICC_AP0R2_EL1		SYS_ICC_AP0Rn_EL1(2)
+#define SYS_ICC_AP0R3_EL1		SYS_ICC_AP0Rn_EL1(3)
+#define SYS_ICC_AP1Rn_EL1(n)		sys_reg(3, 0, 12, 9, n)
+#define SYS_ICC_AP1R0_EL1		SYS_ICC_AP1Rn_EL1(0)
+#define SYS_ICC_AP1R1_EL1		SYS_ICC_AP1Rn_EL1(1)
+#define SYS_ICC_AP1R2_EL1		SYS_ICC_AP1Rn_EL1(2)
+#define SYS_ICC_AP1R3_EL1		SYS_ICC_AP1Rn_EL1(3)
 #define SYS_ICC_DIR_EL1			sys_reg(3, 0, 12, 11, 1)
+#define SYS_ICC_RPR_EL1			sys_reg(3, 0, 12, 11, 3)
 #define SYS_ICC_SGI1R_EL1		sys_reg(3, 0, 12, 11, 5)
 #define SYS_ICC_IAR1_EL1		sys_reg(3, 0, 12, 12, 0)
 #define SYS_ICC_EOIR1_EL1		sys_reg(3, 0, 12, 12, 1)
+#define SYS_ICC_HPPIR1_EL1		sys_reg(3, 0, 12, 12, 2)
 #define SYS_ICC_BPR1_EL1		sys_reg(3, 0, 12, 12, 3)
 #define SYS_ICC_CTLR_EL1		sys_reg(3, 0, 12, 12, 4)
 #define SYS_ICC_SRE_EL1			sys_reg(3, 0, 12, 12, 5)
-#define SYS_ICC_GRPEN1_EL1		sys_reg(3, 0, 12, 12, 7)
+#define SYS_ICC_IGRPEN0_EL1		sys_reg(3, 0, 12, 12, 6)
+#define SYS_ICC_IGRPEN1_EL1		sys_reg(3, 0, 12, 12, 7)
 
 #define SYS_CONTEXTIDR_EL1		sys_reg(3, 0, 13, 0, 1)
 #define SYS_TPIDR_EL1			sys_reg(3, 0, 13, 0, 4)
@@ -286,6 +303,10 @@
 #define SCTLR_ELx_A	(1 << 1)
 #define SCTLR_ELx_M	1
 
+#define SCTLR_EL2_RES1	((1 << 4)  | (1 << 5)  | (1 << 11) | (1 << 16) | \
+			 (1 << 18) | (1 << 22) | (1 << 23) | (1 << 28) | \
+			 (1 << 29))
+
 #define SCTLR_ELx_FLAGS	(SCTLR_ELx_M | SCTLR_ELx_A | SCTLR_ELx_C | \
 			 SCTLR_ELx_SA | SCTLR_ELx_I)
 
diff --git a/arch/arm64/include/uapi/asm/kvm.h b/arch/arm64/include/uapi/asm/kvm.h
index 70eea2ecc663..9f3ca24bbcc6 100644
--- a/arch/arm64/include/uapi/asm/kvm.h
+++ b/arch/arm64/include/uapi/asm/kvm.h
@@ -232,6 +232,9 @@ struct kvm_arch_memory_slot {
 #define KVM_ARM_VCPU_PMU_V3_CTRL	0
 #define   KVM_ARM_VCPU_PMU_V3_IRQ	0
 #define   KVM_ARM_VCPU_PMU_V3_INIT	1
+#define KVM_ARM_VCPU_TIMER_CTRL		1
+#define   KVM_ARM_VCPU_TIMER_IRQ_VTIMER		0
+#define   KVM_ARM_VCPU_TIMER_IRQ_PTIMER		1
 
 /* KVM_IRQ_LINE irq field index values */
 #define KVM_ARM_IRQ_TYPE_SHIFT		24
diff --git a/arch/arm64/kernel/cpu_errata.c b/arch/arm64/kernel/cpu_errata.c
index 2ed2a7657711..0e27f86ee709 100644
--- a/arch/arm64/kernel/cpu_errata.c
+++ b/arch/arm64/kernel/cpu_errata.c
@@ -133,6 +133,27 @@ const struct arm64_cpu_capabilities arm64_errata[] = {
 		MIDR_RANGE(MIDR_THUNDERX_81XX, 0x00, 0x00),
 	},
 #endif
+#ifdef CONFIG_CAVIUM_ERRATUM_30115
+	{
+	/* Cavium ThunderX, T88 pass 1.x - 2.2 */
+		.desc = "Cavium erratum 30115",
+		.capability = ARM64_WORKAROUND_CAVIUM_30115,
+		MIDR_RANGE(MIDR_THUNDERX, 0x00,
+			   (1 << MIDR_VARIANT_SHIFT) | 2),
+	},
+	{
+	/* Cavium ThunderX, T81 pass 1.0 - 1.2 */
+		.desc = "Cavium erratum 30115",
+		.capability = ARM64_WORKAROUND_CAVIUM_30115,
+		MIDR_RANGE(MIDR_THUNDERX_81XX, 0x00, 0x02),
+	},
+	{
+	/* Cavium ThunderX, T83 pass 1.0 */
+		.desc = "Cavium erratum 30115",
+		.capability = ARM64_WORKAROUND_CAVIUM_30115,
+		MIDR_RANGE(MIDR_THUNDERX_83XX, 0x00, 0x00),
+	},
+#endif
 	{
 		.desc = "Mismatched cache line size",
 		.capability = ARM64_MISMATCHED_CACHE_LINE_SIZE,
diff --git a/arch/arm64/kvm/guest.c b/arch/arm64/kvm/guest.c
index b37446a8ffdb..5c7f657dd207 100644
--- a/arch/arm64/kvm/guest.c
+++ b/arch/arm64/kvm/guest.c
@@ -390,6 +390,9 @@ int kvm_arm_vcpu_arch_set_attr(struct kvm_vcpu *vcpu,
 	case KVM_ARM_VCPU_PMU_V3_CTRL:
 		ret = kvm_arm_pmu_v3_set_attr(vcpu, attr);
 		break;
+	case KVM_ARM_VCPU_TIMER_CTRL:
+		ret = kvm_arm_timer_set_attr(vcpu, attr);
+		break;
 	default:
 		ret = -ENXIO;
 		break;
@@ -407,6 +410,9 @@ int kvm_arm_vcpu_arch_get_attr(struct kvm_vcpu *vcpu,
 	case KVM_ARM_VCPU_PMU_V3_CTRL:
 		ret = kvm_arm_pmu_v3_get_attr(vcpu, attr);
 		break;
+	case KVM_ARM_VCPU_TIMER_CTRL:
+		ret = kvm_arm_timer_get_attr(vcpu, attr);
+		break;
 	default:
 		ret = -ENXIO;
 		break;
@@ -424,6 +430,9 @@ int kvm_arm_vcpu_arch_has_attr(struct kvm_vcpu *vcpu,
 	case KVM_ARM_VCPU_PMU_V3_CTRL:
 		ret = kvm_arm_pmu_v3_has_attr(vcpu, attr);
 		break;
+	case KVM_ARM_VCPU_TIMER_CTRL:
+		ret = kvm_arm_timer_has_attr(vcpu, attr);
+		break;
 	default:
 		ret = -ENXIO;
 		break;
diff --git a/arch/arm64/kvm/handle_exit.c b/arch/arm64/kvm/handle_exit.c
index fa1b18e364fc..17d8a1677a0b 100644
--- a/arch/arm64/kvm/handle_exit.c
+++ b/arch/arm64/kvm/handle_exit.c
@@ -89,6 +89,7 @@ static int kvm_handle_wfx(struct kvm_vcpu *vcpu, struct kvm_run *run)
 		trace_kvm_wfx_arm64(*vcpu_pc(vcpu), false);
 		vcpu->stat.wfi_exit_stat++;
 		kvm_vcpu_block(vcpu);
+		kvm_clear_request(KVM_REQ_UNHALT, vcpu);
 	}
 
 	kvm_skip_instr(vcpu, kvm_vcpu_trap_il_is32bit(vcpu));
diff --git a/arch/arm64/kvm/hyp-init.S b/arch/arm64/kvm/hyp-init.S
index 839425c24b1c..3f9615582377 100644
--- a/arch/arm64/kvm/hyp-init.S
+++ b/arch/arm64/kvm/hyp-init.S
@@ -106,10 +106,13 @@ __do_hyp_init:
 	tlbi	alle2
 	dsb	sy
 
-	mrs	x4, sctlr_el2
-	and	x4, x4, #SCTLR_ELx_EE	// preserve endianness of EL2
-	ldr	x5, =SCTLR_ELx_FLAGS
-	orr	x4, x4, x5
+	/*
+	 * Preserve all the RES1 bits while setting the default flags,
+	 * as well as the EE bit on BE. Drop the A flag since the compiler
+	 * is allowed to generate unaligned accesses.
+	 */
+	ldr	x4, =(SCTLR_EL2_RES1 | (SCTLR_ELx_FLAGS & ~SCTLR_ELx_A))
+CPU_BE(	orr	x4, x4, #SCTLR_ELx_EE)
 	msr	sctlr_el2, x4
 	isb
 
diff --git a/arch/arm64/kvm/hyp/switch.c b/arch/arm64/kvm/hyp/switch.c
index aede1658aeda..945e79c641c4 100644
--- a/arch/arm64/kvm/hyp/switch.c
+++ b/arch/arm64/kvm/hyp/switch.c
@@ -350,6 +350,20 @@ again:
 		}
 	}
 
+	if (static_branch_unlikely(&vgic_v3_cpuif_trap) &&
+	    exit_code == ARM_EXCEPTION_TRAP &&
+	    (kvm_vcpu_trap_get_class(vcpu) == ESR_ELx_EC_SYS64 ||
+	     kvm_vcpu_trap_get_class(vcpu) == ESR_ELx_EC_CP15_32)) {
+		int ret = __vgic_v3_perform_cpuif_access(vcpu);
+
+		if (ret == 1) {
+			__skip_instr(vcpu);
+			goto again;
+		}
+
+		/* 0 falls through to be handled out of EL2 */
+	}
+
 	fp_enabled = __fpsimd_enabled();
 
 	__sysreg_save_guest_state(guest_ctxt);
@@ -422,6 +436,7 @@ void __hyp_text __noreturn __hyp_panic(void)
 
 		vcpu = (struct kvm_vcpu *)read_sysreg(tpidr_el2);
 		host_ctxt = kern_hyp_va(vcpu->arch.host_cpu_context);
+		__timer_save_state(vcpu);
 		__deactivate_traps(vcpu);
 		__deactivate_vm(vcpu);
 		__sysreg_restore_host_state(host_ctxt);
diff --git a/arch/arm64/kvm/reset.c b/arch/arm64/kvm/reset.c
index 561badf93de8..3256b9228e75 100644
--- a/arch/arm64/kvm/reset.c
+++ b/arch/arm64/kvm/reset.c
@@ -46,16 +46,6 @@ static const struct kvm_regs default_regs_reset32 = {
 			COMPAT_PSR_I_BIT | COMPAT_PSR_F_BIT),
 };
 
-static const struct kvm_irq_level default_ptimer_irq = {
-	.irq	= 30,
-	.level	= 1,
-};
-
-static const struct kvm_irq_level default_vtimer_irq = {
-	.irq	= 27,
-	.level	= 1,
-};
-
 static bool cpu_has_32bit_el1(void)
 {
 	u64 pfr0;
@@ -108,8 +98,6 @@ int kvm_arch_dev_ioctl_check_extension(struct kvm *kvm, long ext)
  */
 int kvm_reset_vcpu(struct kvm_vcpu *vcpu)
 {
-	const struct kvm_irq_level *cpu_vtimer_irq;
-	const struct kvm_irq_level *cpu_ptimer_irq;
 	const struct kvm_regs *cpu_reset;
 
 	switch (vcpu->arch.target) {
@@ -122,8 +110,6 @@ int kvm_reset_vcpu(struct kvm_vcpu *vcpu)
 			cpu_reset = &default_regs_reset;
 		}
 
-		cpu_vtimer_irq = &default_vtimer_irq;
-		cpu_ptimer_irq = &default_ptimer_irq;
 		break;
 	}
 
@@ -137,5 +123,5 @@ int kvm_reset_vcpu(struct kvm_vcpu *vcpu)
 	kvm_pmu_vcpu_reset(vcpu);
 
 	/* Reset timer */
-	return kvm_timer_vcpu_reset(vcpu, cpu_vtimer_irq, cpu_ptimer_irq);
+	return kvm_timer_vcpu_reset(vcpu);
 }
diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c
index 0fe27024a2e1..77862881ae86 100644
--- a/arch/arm64/kvm/sys_regs.c
+++ b/arch/arm64/kvm/sys_regs.c
@@ -56,7 +56,8 @@
  */
 
 static bool read_from_write_only(struct kvm_vcpu *vcpu,
-				 const struct sys_reg_params *params)
+				 struct sys_reg_params *params,
+				 const struct sys_reg_desc *r)
 {
 	WARN_ONCE(1, "Unexpected sys_reg read to write-only register\n");
 	print_sys_reg_instr(params);
@@ -64,6 +65,16 @@ static bool read_from_write_only(struct kvm_vcpu *vcpu,
 	return false;
 }
 
+static bool write_to_read_only(struct kvm_vcpu *vcpu,
+			       struct sys_reg_params *params,
+			       const struct sys_reg_desc *r)
+{
+	WARN_ONCE(1, "Unexpected sys_reg write to read-only register\n");
+	print_sys_reg_instr(params);
+	kvm_inject_undefined(vcpu);
+	return false;
+}
+
 /* 3 bits per cache level, as per CLIDR, but non-existent caches always 0 */
 static u32 cache_levels;
 
@@ -93,7 +104,7 @@ static bool access_dcsw(struct kvm_vcpu *vcpu,
 			const struct sys_reg_desc *r)
 {
 	if (!p->is_write)
-		return read_from_write_only(vcpu, p);
+		return read_from_write_only(vcpu, p, r);
 
 	kvm_set_way_flush(vcpu);
 	return true;
@@ -135,7 +146,7 @@ static bool access_gic_sgi(struct kvm_vcpu *vcpu,
 			   const struct sys_reg_desc *r)
 {
 	if (!p->is_write)
-		return read_from_write_only(vcpu, p);
+		return read_from_write_only(vcpu, p, r);
 
 	vgic_v3_dispatch_sgi(vcpu, p->regval);
 
@@ -773,7 +784,7 @@ static bool access_pmswinc(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
 		return trap_raz_wi(vcpu, p, r);
 
 	if (!p->is_write)
-		return read_from_write_only(vcpu, p);
+		return read_from_write_only(vcpu, p, r);
 
 	if (pmu_write_swinc_el0_disabled(vcpu))
 		return false;
@@ -953,7 +964,15 @@ static const struct sys_reg_desc sys_reg_descs[] = {
 
 	{ SYS_DESC(SYS_VBAR_EL1), NULL, reset_val, VBAR_EL1, 0 },
 
+	{ SYS_DESC(SYS_ICC_IAR0_EL1), write_to_read_only },
+	{ SYS_DESC(SYS_ICC_EOIR0_EL1), read_from_write_only },
+	{ SYS_DESC(SYS_ICC_HPPIR0_EL1), write_to_read_only },
+	{ SYS_DESC(SYS_ICC_DIR_EL1), read_from_write_only },
+	{ SYS_DESC(SYS_ICC_RPR_EL1), write_to_read_only },
 	{ SYS_DESC(SYS_ICC_SGI1R_EL1), access_gic_sgi },
+	{ SYS_DESC(SYS_ICC_IAR1_EL1), write_to_read_only },
+	{ SYS_DESC(SYS_ICC_EOIR1_EL1), read_from_write_only },
+	{ SYS_DESC(SYS_ICC_HPPIR1_EL1), write_to_read_only },
 	{ SYS_DESC(SYS_ICC_SRE_EL1), access_gic_sre },
 
 	{ SYS_DESC(SYS_CONTEXTIDR_EL1), access_vm_reg, reset_val, CONTEXTIDR_EL1, 0 },
diff --git a/arch/arm64/kvm/vgic-sys-reg-v3.c b/arch/arm64/kvm/vgic-sys-reg-v3.c
index 79f37e37d367..116786d2e8e8 100644
--- a/arch/arm64/kvm/vgic-sys-reg-v3.c
+++ b/arch/arm64/kvm/vgic-sys-reg-v3.c
@@ -65,8 +65,8 @@ static bool access_gic_ctlr(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
 		 * Here set VMCR.CTLR in ICC_CTLR_EL1 layout.
 		 * The vgic_set_vmcr() will convert to ICH_VMCR layout.
 		 */
-		vmcr.ctlr = val & ICC_CTLR_EL1_CBPR_MASK;
-		vmcr.ctlr |= val & ICC_CTLR_EL1_EOImode_MASK;
+		vmcr.cbpr = (val & ICC_CTLR_EL1_CBPR_MASK) >> ICC_CTLR_EL1_CBPR_SHIFT;
+		vmcr.eoim = (val & ICC_CTLR_EL1_EOImode_MASK) >> ICC_CTLR_EL1_EOImode_SHIFT;
 		vgic_set_vmcr(vcpu, &vmcr);
 	} else {
 		val = 0;
@@ -83,8 +83,8 @@ static bool access_gic_ctlr(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
 		 * The VMCR.CTLR value is in ICC_CTLR_EL1 layout.
 		 * Extract it directly using ICC_CTLR_EL1 reg definitions.
 		 */
-		val |= vmcr.ctlr & ICC_CTLR_EL1_CBPR_MASK;
-		val |= vmcr.ctlr & ICC_CTLR_EL1_EOImode_MASK;
+		val |= (vmcr.cbpr << ICC_CTLR_EL1_CBPR_SHIFT) & ICC_CTLR_EL1_CBPR_MASK;
+		val |= (vmcr.eoim << ICC_CTLR_EL1_EOImode_SHIFT) & ICC_CTLR_EL1_EOImode_MASK;
 
 		p->regval = val;
 	}
@@ -135,7 +135,7 @@ static bool access_gic_bpr1(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
 		p->regval = 0;
 
 	vgic_get_vmcr(vcpu, &vmcr);
-	if (!((vmcr.ctlr & ICH_VMCR_CBPR_MASK) >> ICH_VMCR_CBPR_SHIFT)) {
+	if (!vmcr.cbpr) {
 		if (p->is_write) {
 			vmcr.abpr = (p->regval & ICC_BPR1_EL1_MASK) >>
 				     ICC_BPR1_EL1_SHIFT;
@@ -268,36 +268,21 @@ static bool access_gic_sre(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
 	return true;
 }
 static const struct sys_reg_desc gic_v3_icc_reg_descs[] = {
-	/* ICC_PMR_EL1 */
-	{ Op0(3), Op1(0), CRn(4), CRm(6), Op2(0), access_gic_pmr },
-	/* ICC_BPR0_EL1 */
-	{ Op0(3), Op1(0), CRn(12), CRm(8), Op2(3), access_gic_bpr0 },
-	/* ICC_AP0R0_EL1 */
-	{ Op0(3), Op1(0), CRn(12), CRm(8), Op2(4), access_gic_ap0r },
-	/* ICC_AP0R1_EL1 */
-	{ Op0(3), Op1(0), CRn(12), CRm(8), Op2(5), access_gic_ap0r },
-	/* ICC_AP0R2_EL1 */
-	{ Op0(3), Op1(0), CRn(12), CRm(8), Op2(6), access_gic_ap0r },
-	/* ICC_AP0R3_EL1 */
-	{ Op0(3), Op1(0), CRn(12), CRm(8), Op2(7), access_gic_ap0r },
-	/* ICC_AP1R0_EL1 */
-	{ Op0(3), Op1(0), CRn(12), CRm(9), Op2(0), access_gic_ap1r },
-	/* ICC_AP1R1_EL1 */
-	{ Op0(3), Op1(0), CRn(12), CRm(9), Op2(1), access_gic_ap1r },
-	/* ICC_AP1R2_EL1 */
-	{ Op0(3), Op1(0), CRn(12), CRm(9), Op2(2), access_gic_ap1r },
-	/* ICC_AP1R3_EL1 */
-	{ Op0(3), Op1(0), CRn(12), CRm(9), Op2(3), access_gic_ap1r },
-	/* ICC_BPR1_EL1 */
-	{ Op0(3), Op1(0), CRn(12), CRm(12), Op2(3), access_gic_bpr1 },
-	/* ICC_CTLR_EL1 */
-	{ Op0(3), Op1(0), CRn(12), CRm(12), Op2(4), access_gic_ctlr },
-	/* ICC_SRE_EL1 */
-	{ Op0(3), Op1(0), CRn(12), CRm(12), Op2(5), access_gic_sre },
-	/* ICC_IGRPEN0_EL1 */
-	{ Op0(3), Op1(0), CRn(12), CRm(12), Op2(6), access_gic_grpen0 },
-	/* ICC_GRPEN1_EL1 */
-	{ Op0(3), Op1(0), CRn(12), CRm(12), Op2(7), access_gic_grpen1 },
+	{ SYS_DESC(SYS_ICC_PMR_EL1), access_gic_pmr },
+	{ SYS_DESC(SYS_ICC_BPR0_EL1), access_gic_bpr0 },
+	{ SYS_DESC(SYS_ICC_AP0R0_EL1), access_gic_ap0r },
+	{ SYS_DESC(SYS_ICC_AP0R1_EL1), access_gic_ap0r },
+	{ SYS_DESC(SYS_ICC_AP0R2_EL1), access_gic_ap0r },
+	{ SYS_DESC(SYS_ICC_AP0R3_EL1), access_gic_ap0r },
+	{ SYS_DESC(SYS_ICC_AP1R0_EL1), access_gic_ap1r },
+	{ SYS_DESC(SYS_ICC_AP1R1_EL1), access_gic_ap1r },
+	{ SYS_DESC(SYS_ICC_AP1R2_EL1), access_gic_ap1r },
+	{ SYS_DESC(SYS_ICC_AP1R3_EL1), access_gic_ap1r },
+	{ SYS_DESC(SYS_ICC_BPR1_EL1), access_gic_bpr1 },
+	{ SYS_DESC(SYS_ICC_CTLR_EL1), access_gic_ctlr },
+	{ SYS_DESC(SYS_ICC_SRE_EL1), access_gic_sre },
+	{ SYS_DESC(SYS_ICC_IGRPEN0_EL1), access_gic_grpen0 },
+	{ SYS_DESC(SYS_ICC_IGRPEN1_EL1), access_gic_grpen1 },
 };
 
 int vgic_v3_has_cpu_sysregs_attr(struct kvm_vcpu *vcpu, bool is_write, u64 id,
diff --git a/arch/mips/kvm/trap_emul.c b/arch/mips/kvm/trap_emul.c
index a563759fd142..6a0d7040d882 100644
--- a/arch/mips/kvm/trap_emul.c
+++ b/arch/mips/kvm/trap_emul.c
@@ -1094,7 +1094,7 @@ static void kvm_trap_emul_check_requests(struct kvm_vcpu *vcpu, int cpu,
 	struct mm_struct *mm;
 	int i;
 
-	if (likely(!vcpu->requests))
+	if (likely(!kvm_request_pending(vcpu)))
 		return;
 
 	if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu)) {
diff --git a/arch/mips/kvm/vz.c b/arch/mips/kvm/vz.c
index 71d8856ade64..74805035edc8 100644
--- a/arch/mips/kvm/vz.c
+++ b/arch/mips/kvm/vz.c
@@ -2337,7 +2337,7 @@ static int kvm_vz_check_requests(struct kvm_vcpu *vcpu, int cpu)
 	int ret = 0;
 	int i;
 
-	if (!vcpu->requests)
+	if (!kvm_request_pending(vcpu))
 		return 0;
 
 	if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu)) {
diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index 7d64f99ea3b8..8b3f1238d07f 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -53,8 +53,8 @@
 #define KVM_IRQCHIP_NUM_PINS     256
 
 /* PPC-specific vcpu->requests bit members */
-#define KVM_REQ_WATCHDOG           8
-#define KVM_REQ_EPR_EXIT           9
+#define KVM_REQ_WATCHDOG	KVM_ARCH_REQ(0)
+#define KVM_REQ_EPR_EXIT	KVM_ARCH_REQ(1)
 
 #include <linux/mmu_notifier.h>
 
diff --git a/arch/powerpc/kvm/book3s_xive.c b/arch/powerpc/kvm/book3s_xive.c
index ffe1da95033a..08b200a0bbce 100644
--- a/arch/powerpc/kvm/book3s_xive.c
+++ b/arch/powerpc/kvm/book3s_xive.c
@@ -1257,8 +1257,8 @@ static void xive_pre_save_scan(struct kvmppc_xive *xive)
 		if (!xc)
 			continue;
 		for (j = 0; j < KVMPPC_XIVE_Q_COUNT; j++) {
-			if (xc->queues[i].qpage)
-				xive_pre_save_queue(xive, &xc->queues[i]);
+			if (xc->queues[j].qpage)
+				xive_pre_save_queue(xive, &xc->queues[j]);
 		}
 	}
 
diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c
index 3eaac3809977..071b87ee682f 100644
--- a/arch/powerpc/kvm/booke.c
+++ b/arch/powerpc/kvm/booke.c
@@ -687,7 +687,7 @@ int kvmppc_core_prepare_to_enter(struct kvm_vcpu *vcpu)
 
 	kvmppc_core_check_exceptions(vcpu);
 
-	if (vcpu->requests) {
+	if (kvm_request_pending(vcpu)) {
 		/* Exception delivery raised request; start over */
 		return 1;
 	}
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index b14736f3870b..1a75c0b5f4ca 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -55,8 +55,7 @@ EXPORT_SYMBOL_GPL(kvmppc_pr_ops);
 
 int kvm_arch_vcpu_runnable(struct kvm_vcpu *v)
 {
-	return !!(v->arch.pending_exceptions) ||
-	       v->requests;
+	return !!(v->arch.pending_exceptions) || kvm_request_pending(v);
 }
 
 int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu)
@@ -108,7 +107,7 @@ int kvmppc_prepare_to_enter(struct kvm_vcpu *vcpu)
 		 */
 		smp_mb();
 
-		if (vcpu->requests) {
+		if (kvm_request_pending(vcpu)) {
 			/* Make sure we process requests preemptable */
 			local_irq_enable();
 			trace_kvm_check_requests(vcpu);
diff --git a/arch/s390/include/asm/ctl_reg.h b/arch/s390/include/asm/ctl_reg.h
index d0441ad2a990..e508dff92535 100644
--- a/arch/s390/include/asm/ctl_reg.h
+++ b/arch/s390/include/asm/ctl_reg.h
@@ -59,7 +59,9 @@ union ctlreg0 {
 		unsigned long lap  : 1; /* Low-address-protection control */
 		unsigned long	   : 4;
 		unsigned long edat : 1; /* Enhanced-DAT-enablement control */
-		unsigned long	   : 4;
+		unsigned long	   : 2;
+		unsigned long iep  : 1; /* Instruction-Execution-Protection */
+		unsigned long	   : 1;
 		unsigned long afp  : 1; /* AFP-register control */
 		unsigned long vx   : 1; /* Vector enablement control */
 		unsigned long	   : 7;
diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h
index 426614a882a9..495aedbaf447 100644
--- a/arch/s390/include/asm/kvm_host.h
+++ b/arch/s390/include/asm/kvm_host.h
@@ -42,9 +42,11 @@
 #define KVM_HALT_POLL_NS_DEFAULT 80000
 
 /* s390-specific vcpu->requests bit members */
-#define KVM_REQ_ENABLE_IBS         8
-#define KVM_REQ_DISABLE_IBS        9
-#define KVM_REQ_ICPT_OPEREXC       10
+#define KVM_REQ_ENABLE_IBS	KVM_ARCH_REQ(0)
+#define KVM_REQ_DISABLE_IBS	KVM_ARCH_REQ(1)
+#define KVM_REQ_ICPT_OPEREXC	KVM_ARCH_REQ(2)
+#define KVM_REQ_START_MIGRATION KVM_ARCH_REQ(3)
+#define KVM_REQ_STOP_MIGRATION  KVM_ARCH_REQ(4)
 
 #define SIGP_CTRL_C		0x80
 #define SIGP_CTRL_SCN_MASK	0x3f
@@ -56,7 +58,7 @@ union bsca_sigp_ctrl {
 		__u8 r : 1;
 		__u8 scn : 6;
 	};
-} __packed;
+};
 
 union esca_sigp_ctrl {
 	__u16 value;
@@ -65,14 +67,14 @@ union esca_sigp_ctrl {
 		__u8 reserved: 7;
 		__u8 scn;
 	};
-} __packed;
+};
 
 struct esca_entry {
 	union esca_sigp_ctrl sigp_ctrl;
 	__u16   reserved1[3];
 	__u64   sda;
 	__u64   reserved2[6];
-} __packed;
+};
 
 struct bsca_entry {
 	__u8	reserved0;
@@ -80,7 +82,7 @@ struct bsca_entry {
 	__u16	reserved[3];
 	__u64	sda;
 	__u64	reserved2[2];
-} __attribute__((packed));
+};
 
 union ipte_control {
 	unsigned long val;
@@ -97,7 +99,7 @@ struct bsca_block {
 	__u64	mcn;
 	__u64	reserved2;
 	struct bsca_entry cpu[KVM_S390_BSCA_CPU_SLOTS];
-} __attribute__((packed));
+};
 
 struct esca_block {
 	union ipte_control ipte_control;
@@ -105,7 +107,21 @@ struct esca_block {
 	__u64   mcn[4];
 	__u64   reserved2[20];
 	struct esca_entry cpu[KVM_S390_ESCA_CPU_SLOTS];
-} __packed;
+};
+
+/*
+ * This struct is used to store some machine check info from lowcore
+ * for machine checks that happen while the guest is running.
+ * This info in host's lowcore might be overwritten by a second machine
+ * check from host when host is in the machine check's high-level handling.
+ * The size is 24 bytes.
+ */
+struct mcck_volatile_info {
+	__u64 mcic;
+	__u64 failing_storage_address;
+	__u32 ext_damage_code;
+	__u32 reserved;
+};
 
 #define CPUSTAT_STOPPED    0x80000000
 #define CPUSTAT_WAIT       0x10000000
@@ -260,14 +276,15 @@ struct kvm_s390_sie_block {
 
 struct kvm_s390_itdb {
 	__u8	data[256];
-} __packed;
+};
 
 struct sie_page {
 	struct kvm_s390_sie_block sie_block;
-	__u8 reserved200[1024];		/* 0x0200 */
+	struct mcck_volatile_info mcck_info;	/* 0x0200 */
+	__u8 reserved218[1000];		/* 0x0218 */
 	struct kvm_s390_itdb itdb;	/* 0x0600 */
 	__u8 reserved700[2304];		/* 0x0700 */
-} __packed;
+};
 
 struct kvm_vcpu_stat {
 	u64 exit_userspace;
@@ -681,7 +698,7 @@ struct sie_page2 {
 	__u64 fac_list[S390_ARCH_FAC_LIST_SIZE_U64];	/* 0x0000 */
 	struct kvm_s390_crypto_cb crycb;		/* 0x0800 */
 	u8 reserved900[0x1000 - 0x900];			/* 0x0900 */
-} __packed;
+};
 
 struct kvm_s390_vsie {
 	struct mutex mutex;
@@ -691,6 +708,12 @@ struct kvm_s390_vsie {
 	struct page *pages[KVM_MAX_VCPUS];
 };
 
+struct kvm_s390_migration_state {
+	unsigned long bitmap_size;	/* in bits (number of guest pages) */
+	atomic64_t dirty_pages;		/* number of dirty pages */
+	unsigned long *pgste_bitmap;
+};
+
 struct kvm_arch{
 	void *sca;
 	int use_esca;
@@ -718,6 +741,7 @@ struct kvm_arch{
 	struct kvm_s390_crypto crypto;
 	struct kvm_s390_vsie vsie;
 	u64 epoch;
+	struct kvm_s390_migration_state *migration_state;
 	/* subset of available cpu features enabled by user space */
 	DECLARE_BITMAP(cpu_feat, KVM_S390_VM_CPU_FEAT_NR_BITS);
 };
diff --git a/arch/s390/include/asm/nmi.h b/arch/s390/include/asm/nmi.h
index e3e8895f5d3e..9d91cf3e427f 100644
--- a/arch/s390/include/asm/nmi.h
+++ b/arch/s390/include/asm/nmi.h
@@ -14,11 +14,24 @@
 #include <linux/const.h>
 #include <linux/types.h>
 
+#define MCIC_SUBCLASS_MASK	(1ULL<<63 | 1ULL<<62 | 1ULL<<61 | \
+				1ULL<<59 | 1ULL<<58 | 1ULL<<56 | \
+				1ULL<<55 | 1ULL<<54 | 1ULL<<53 | \
+				1ULL<<52 | 1ULL<<47 | 1ULL<<46 | \
+				1ULL<<45 | 1ULL<<44)
 #define MCCK_CODE_SYSTEM_DAMAGE		_BITUL(63)
+#define MCCK_CODE_EXT_DAMAGE		_BITUL(63 - 5)
+#define MCCK_CODE_CP			_BITUL(63 - 9)
 #define MCCK_CODE_CPU_TIMER_VALID	_BITUL(63 - 46)
 #define MCCK_CODE_PSW_MWP_VALID		_BITUL(63 - 20)
 #define MCCK_CODE_PSW_IA_VALID		_BITUL(63 - 23)
 
+#define MCCK_CR14_CR_PENDING_SUB_MASK	(1 << 28)
+#define MCCK_CR14_RECOVERY_SUB_MASK	(1 << 27)
+#define MCCK_CR14_DEGRAD_SUB_MASK	(1 << 26)
+#define MCCK_CR14_EXT_DAMAGE_SUB_MASK	(1 << 25)
+#define MCCK_CR14_WARN_SUB_MASK		(1 << 24)
+
 #ifndef __ASSEMBLY__
 
 union mci {
diff --git a/arch/s390/include/asm/processor.h b/arch/s390/include/asm/processor.h
index 60d395fdc864..5b1b247dfbca 100644
--- a/arch/s390/include/asm/processor.h
+++ b/arch/s390/include/asm/processor.h
@@ -20,6 +20,7 @@
 #define CIF_FPU			4	/* restore FPU registers */
 #define CIF_IGNORE_IRQ		5	/* ignore interrupt (for udelay) */
 #define CIF_ENABLED_WAIT	6	/* in enabled wait state */
+#define CIF_MCCK_GUEST		7	/* machine check happening in guest */
 
 #define _CIF_MCCK_PENDING	_BITUL(CIF_MCCK_PENDING)
 #define _CIF_ASCE_PRIMARY	_BITUL(CIF_ASCE_PRIMARY)
@@ -28,6 +29,7 @@
 #define _CIF_FPU		_BITUL(CIF_FPU)
 #define _CIF_IGNORE_IRQ		_BITUL(CIF_IGNORE_IRQ)
 #define _CIF_ENABLED_WAIT	_BITUL(CIF_ENABLED_WAIT)
+#define _CIF_MCCK_GUEST		_BITUL(CIF_MCCK_GUEST)
 
 #ifndef __ASSEMBLY__
 
diff --git a/arch/s390/include/uapi/asm/kvm.h b/arch/s390/include/uapi/asm/kvm.h
index 3dd2a1d308dd..69d09c39bbcd 100644
--- a/arch/s390/include/uapi/asm/kvm.h
+++ b/arch/s390/include/uapi/asm/kvm.h
@@ -28,6 +28,7 @@
 #define KVM_DEV_FLIC_CLEAR_IO_IRQ	8
 #define KVM_DEV_FLIC_AISM		9
 #define KVM_DEV_FLIC_AIRQ_INJECT	10
+#define KVM_DEV_FLIC_AISM_ALL		11
 /*
  * We can have up to 4*64k pending subchannels + 8 adapter interrupts,
  * as well as up  to ASYNC_PF_PER_VCPU*KVM_MAX_VCPUS pfault done interrupts.
@@ -53,6 +54,11 @@ struct kvm_s390_ais_req {
 	__u16 mode;
 };
 
+struct kvm_s390_ais_all {
+	__u8 simm;
+	__u8 nimm;
+};
+
 #define KVM_S390_IO_ADAPTER_MASK 1
 #define KVM_S390_IO_ADAPTER_MAP 2
 #define KVM_S390_IO_ADAPTER_UNMAP 3
@@ -70,6 +76,7 @@ struct kvm_s390_io_adapter_req {
 #define KVM_S390_VM_TOD			1
 #define KVM_S390_VM_CRYPTO		2
 #define KVM_S390_VM_CPU_MODEL		3
+#define KVM_S390_VM_MIGRATION		4
 
 /* kvm attributes for mem_ctrl */
 #define KVM_S390_VM_MEM_ENABLE_CMMA	0
@@ -151,6 +158,11 @@ struct kvm_s390_vm_cpu_subfunc {
 #define KVM_S390_VM_CRYPTO_DISABLE_AES_KW	2
 #define KVM_S390_VM_CRYPTO_DISABLE_DEA_KW	3
 
+/* kvm attributes for migration mode */
+#define KVM_S390_VM_MIGRATION_STOP	0
+#define KVM_S390_VM_MIGRATION_START	1
+#define KVM_S390_VM_MIGRATION_STATUS	2
+
 /* for KVM_GET_REGS and KVM_SET_REGS */
 struct kvm_regs {
 	/* general purpose regs for s390 */
diff --git a/arch/s390/kernel/asm-offsets.c b/arch/s390/kernel/asm-offsets.c
index 6bb29633e1f1..b65c414b6c0e 100644
--- a/arch/s390/kernel/asm-offsets.c
+++ b/arch/s390/kernel/asm-offsets.c
@@ -58,6 +58,9 @@ int main(void)
 	OFFSET(__SF_BACKCHAIN, stack_frame, back_chain);
 	OFFSET(__SF_GPRS, stack_frame, gprs);
 	OFFSET(__SF_EMPTY, stack_frame, empty1);
+	OFFSET(__SF_SIE_CONTROL, stack_frame, empty1[0]);
+	OFFSET(__SF_SIE_SAVEAREA, stack_frame, empty1[1]);
+	OFFSET(__SF_SIE_REASON, stack_frame, empty1[2]);
 	BLANK();
 	/* timeval/timezone offsets for use by vdso */
 	OFFSET(__VDSO_UPD_COUNT, vdso_data, tb_update_count);
diff --git a/arch/s390/kernel/entry.S b/arch/s390/kernel/entry.S
index e408d9cc5b96..3b4b158382ea 100644
--- a/arch/s390/kernel/entry.S
+++ b/arch/s390/kernel/entry.S
@@ -225,6 +225,7 @@ ENTRY(sie64a)
 	jnz	.Lsie_skip
 	TSTMSK	__LC_CPU_FLAGS,_CIF_FPU
 	jo	.Lsie_skip			# exit if fp/vx regs changed
+.Lsie_entry:
 	sie	0(%r14)
 .Lsie_skip:
 	ni	__SIE_PROG0C+3(%r14),0xfe	# no longer in SIE
@@ -1104,7 +1105,13 @@ cleanup_critical:
 	.quad	.Lsie_done
 
 .Lcleanup_sie:
-	lg	%r9,__SF_EMPTY(%r15)		# get control block pointer
+	cghi    %r11,__LC_SAVE_AREA_ASYNC 	#Is this in normal interrupt?
+	je      1f
+	slg     %r9,BASED(.Lsie_crit_mcck_start)
+	clg     %r9,BASED(.Lsie_crit_mcck_length)
+	jh      1f
+	oi      __LC_CPU_FLAGS+7, _CIF_MCCK_GUEST
+1:	lg	%r9,__SF_EMPTY(%r15)		# get control block pointer
 	ni	__SIE_PROG0C+3(%r9),0xfe	# no longer in SIE
 	lctlg	%c1,%c1,__LC_USER_ASCE		# load primary asce
 	larl	%r9,sie_exit			# skip forward to sie_exit
@@ -1289,6 +1296,10 @@ cleanup_critical:
 	.quad	.Lsie_gmap
 .Lsie_critical_length:
 	.quad	.Lsie_done - .Lsie_gmap
+.Lsie_crit_mcck_start:
+	.quad   .Lsie_entry
+.Lsie_crit_mcck_length:
+	.quad   .Lsie_skip - .Lsie_entry
 #endif
 
 	.section .rodata, "a"
diff --git a/arch/s390/kernel/nmi.c b/arch/s390/kernel/nmi.c
index 985589523970..31d03a84126c 100644
--- a/arch/s390/kernel/nmi.c
+++ b/arch/s390/kernel/nmi.c
@@ -25,6 +25,8 @@
 #include <asm/crw.h>
 #include <asm/switch_to.h>
 #include <asm/ctl_reg.h>
+#include <asm/asm-offsets.h>
+#include <linux/kvm_host.h>
 
 struct mcck_struct {
 	unsigned int kill_task : 1;
@@ -274,12 +276,39 @@ static int notrace s390_validate_registers(union mci mci, int umode)
 	return kill_task;
 }
 
+/*
+ * Backup the guest's machine check info to its description block
+ */
+static void notrace s390_backup_mcck_info(struct pt_regs *regs)
+{
+	struct mcck_volatile_info *mcck_backup;
+	struct sie_page *sie_page;
+
+	/* r14 contains the sie block, which was set in sie64a */
+	struct kvm_s390_sie_block *sie_block =
+			(struct kvm_s390_sie_block *) regs->gprs[14];
+
+	if (sie_block == NULL)
+		/* Something's seriously wrong, stop system. */
+		s390_handle_damage();
+
+	sie_page = container_of(sie_block, struct sie_page, sie_block);
+	mcck_backup = &sie_page->mcck_info;
+	mcck_backup->mcic = S390_lowcore.mcck_interruption_code &
+				~(MCCK_CODE_CP | MCCK_CODE_EXT_DAMAGE);
+	mcck_backup->ext_damage_code = S390_lowcore.external_damage_code;
+	mcck_backup->failing_storage_address
+			= S390_lowcore.failing_storage_address;
+}
+
 #define MAX_IPD_COUNT	29
 #define MAX_IPD_TIME	(5 * 60 * USEC_PER_SEC) /* 5 minutes */
 
 #define ED_STP_ISLAND	6	/* External damage STP island check */
 #define ED_STP_SYNC	7	/* External damage STP sync check */
 
+#define MCCK_CODE_NO_GUEST	(MCCK_CODE_CP | MCCK_CODE_EXT_DAMAGE)
+
 /*
  * machine check handler.
  */
@@ -291,6 +320,7 @@ void notrace s390_do_machine_check(struct pt_regs *regs)
 	struct mcck_struct *mcck;
 	unsigned long long tmp;
 	union mci mci;
+	unsigned long mcck_dam_code;
 
 	nmi_enter();
 	inc_irq_stat(NMI_NMI);
@@ -301,7 +331,13 @@ void notrace s390_do_machine_check(struct pt_regs *regs)
 		/* System damage -> stopping machine */
 		s390_handle_damage();
 	}
-	if (mci.pd) {
+
+	/*
+	 * Reinject the instruction processing damages' machine checks
+	 * including Delayed Access Exception into the guest
+	 * instead of damaging the host if they happen in the guest.
+	 */
+	if (mci.pd && !test_cpu_flag(CIF_MCCK_GUEST)) {
 		if (mci.b) {
 			/* Processing backup -> verify if we can survive this */
 			u64 z_mcic, o_mcic, t_mcic;
@@ -345,6 +381,14 @@ void notrace s390_do_machine_check(struct pt_regs *regs)
 		mcck->mcck_code = mci.val;
 		set_cpu_flag(CIF_MCCK_PENDING);
 	}
+
+	/*
+	 * Backup the machine check's info if it happens when the guest
+	 * is running.
+	 */
+	if (test_cpu_flag(CIF_MCCK_GUEST))
+		s390_backup_mcck_info(regs);
+
 	if (mci.cd) {
 		/* Timing facility damage */
 		s390_handle_damage();
@@ -358,15 +402,22 @@ void notrace s390_do_machine_check(struct pt_regs *regs)
 		if (mcck->stp_queue)
 			set_cpu_flag(CIF_MCCK_PENDING);
 	}
-	if (mci.se)
-		/* Storage error uncorrected */
-		s390_handle_damage();
-	if (mci.ke)
-		/* Storage key-error uncorrected */
-		s390_handle_damage();
-	if (mci.ds && mci.fa)
-		/* Storage degradation */
-		s390_handle_damage();
+
+	/*
+	 * Reinject storage related machine checks into the guest if they
+	 * happen when the guest is running.
+	 */
+	if (!test_cpu_flag(CIF_MCCK_GUEST)) {
+		if (mci.se)
+			/* Storage error uncorrected */
+			s390_handle_damage();
+		if (mci.ke)
+			/* Storage key-error uncorrected */
+			s390_handle_damage();
+		if (mci.ds && mci.fa)
+			/* Storage degradation */
+			s390_handle_damage();
+	}
 	if (mci.cp) {
 		/* Channel report word pending */
 		mcck->channel_report = 1;
@@ -377,6 +428,19 @@ void notrace s390_do_machine_check(struct pt_regs *regs)
 		mcck->warning = 1;
 		set_cpu_flag(CIF_MCCK_PENDING);
 	}
+
+	/*
+	 * If there are only Channel Report Pending and External Damage
+	 * machine checks, they will not be reinjected into the guest
+	 * because they refer to host conditions only.
+	 */
+	mcck_dam_code = (mci.val & MCIC_SUBCLASS_MASK);
+	if (test_cpu_flag(CIF_MCCK_GUEST) &&
+	(mcck_dam_code & MCCK_CODE_NO_GUEST) != mcck_dam_code) {
+		/* Set exit reason code for host's later handling */
+		*((long *)(regs->gprs[15] + __SF_SIE_REASON)) = -EINTR;
+	}
+	clear_cpu_flag(CIF_MCCK_GUEST);
 	nmi_exit();
 }
 
diff --git a/arch/s390/kvm/gaccess.c b/arch/s390/kvm/gaccess.c
index 9da243d94cc3..17e3a4e71bc9 100644
--- a/arch/s390/kvm/gaccess.c
+++ b/arch/s390/kvm/gaccess.c
@@ -89,7 +89,7 @@ struct region3_table_entry_fc1 {
 	unsigned long f  : 1; /* Fetch-Protection Bit */
 	unsigned long fc : 1; /* Format-Control */
 	unsigned long p  : 1; /* DAT-Protection Bit */
-	unsigned long co : 1; /* Change-Recording Override */
+	unsigned long iep: 1; /* Instruction-Execution-Protection */
 	unsigned long	 : 2;
 	unsigned long i  : 1; /* Region-Invalid Bit */
 	unsigned long cr : 1; /* Common-Region Bit */
@@ -131,7 +131,7 @@ struct segment_entry_fc1 {
 	unsigned long f  : 1; /* Fetch-Protection Bit */
 	unsigned long fc : 1; /* Format-Control */
 	unsigned long p  : 1; /* DAT-Protection Bit */
-	unsigned long co : 1; /* Change-Recording Override */
+	unsigned long iep: 1; /* Instruction-Execution-Protection */
 	unsigned long	 : 2;
 	unsigned long i  : 1; /* Segment-Invalid Bit */
 	unsigned long cs : 1; /* Common-Segment Bit */
@@ -168,7 +168,8 @@ union page_table_entry {
 		unsigned long z  : 1; /* Zero Bit */
 		unsigned long i  : 1; /* Page-Invalid Bit */
 		unsigned long p  : 1; /* DAT-Protection Bit */
-		unsigned long	 : 9;
+		unsigned long iep: 1; /* Instruction-Execution-Protection */
+		unsigned long	 : 8;
 	};
 };
 
@@ -241,7 +242,7 @@ struct ale {
 	unsigned long asteo  : 25; /* ASN-Second-Table-Entry Origin */
 	unsigned long        : 6;
 	unsigned long astesn : 32; /* ASTE Sequence Number */
-} __packed;
+};
 
 struct aste {
 	unsigned long i      : 1; /* ASX-Invalid Bit */
@@ -257,7 +258,7 @@ struct aste {
 	unsigned long ald    : 32;
 	unsigned long astesn : 32;
 	/* .. more fields there */
-} __packed;
+};
 
 int ipte_lock_held(struct kvm_vcpu *vcpu)
 {
@@ -485,6 +486,7 @@ enum prot_type {
 	PROT_TYPE_KEYC = 1,
 	PROT_TYPE_ALC  = 2,
 	PROT_TYPE_DAT  = 3,
+	PROT_TYPE_IEP  = 4,
 };
 
 static int trans_exc(struct kvm_vcpu *vcpu, int code, unsigned long gva,
@@ -500,6 +502,9 @@ static int trans_exc(struct kvm_vcpu *vcpu, int code, unsigned long gva,
 	switch (code) {
 	case PGM_PROTECTION:
 		switch (prot) {
+		case PROT_TYPE_IEP:
+			tec->b61 = 1;
+			/* FALL THROUGH */
 		case PROT_TYPE_LA:
 			tec->b56 = 1;
 			break;
@@ -591,6 +596,7 @@ static int deref_table(struct kvm *kvm, unsigned long gpa, unsigned long *val)
  * @gpa: points to where guest physical (absolute) address should be stored
  * @asce: effective asce
  * @mode: indicates the access mode to be used
+ * @prot: returns the type for protection exceptions
  *
  * Translate a guest virtual address into a guest absolute address by means
  * of dynamic address translation as specified by the architecture.
@@ -606,19 +612,21 @@ static int deref_table(struct kvm *kvm, unsigned long gpa, unsigned long *val)
  */
 static unsigned long guest_translate(struct kvm_vcpu *vcpu, unsigned long gva,
 				     unsigned long *gpa, const union asce asce,
-				     enum gacc_mode mode)
+				     enum gacc_mode mode, enum prot_type *prot)
 {
 	union vaddress vaddr = {.addr = gva};
 	union raddress raddr = {.addr = gva};
 	union page_table_entry pte;
 	int dat_protection = 0;
+	int iep_protection = 0;
 	union ctlreg0 ctlreg0;
 	unsigned long ptr;
-	int edat1, edat2;
+	int edat1, edat2, iep;
 
 	ctlreg0.val = vcpu->arch.sie_block->gcr[0];
 	edat1 = ctlreg0.edat && test_kvm_facility(vcpu->kvm, 8);
 	edat2 = edat1 && test_kvm_facility(vcpu->kvm, 78);
+	iep = ctlreg0.iep && test_kvm_facility(vcpu->kvm, 130);
 	if (asce.r)
 		goto real_address;
 	ptr = asce.origin * 4096;
@@ -702,6 +710,7 @@ static unsigned long guest_translate(struct kvm_vcpu *vcpu, unsigned long gva,
 			return PGM_TRANSLATION_SPEC;
 		if (rtte.fc && edat2) {
 			dat_protection |= rtte.fc1.p;
+			iep_protection = rtte.fc1.iep;
 			raddr.rfaa = rtte.fc1.rfaa;
 			goto absolute_address;
 		}
@@ -729,6 +738,7 @@ static unsigned long guest_translate(struct kvm_vcpu *vcpu, unsigned long gva,
 			return PGM_TRANSLATION_SPEC;
 		if (ste.fc && edat1) {
 			dat_protection |= ste.fc1.p;
+			iep_protection = ste.fc1.iep;
 			raddr.sfaa = ste.fc1.sfaa;
 			goto absolute_address;
 		}
@@ -745,12 +755,19 @@ static unsigned long guest_translate(struct kvm_vcpu *vcpu, unsigned long gva,
 	if (pte.z)
 		return PGM_TRANSLATION_SPEC;
 	dat_protection |= pte.p;
+	iep_protection = pte.iep;
 	raddr.pfra = pte.pfra;
 real_address:
 	raddr.addr = kvm_s390_real_to_abs(vcpu, raddr.addr);
 absolute_address:
-	if (mode == GACC_STORE && dat_protection)
+	if (mode == GACC_STORE && dat_protection) {
+		*prot = PROT_TYPE_DAT;
 		return PGM_PROTECTION;
+	}
+	if (mode == GACC_IFETCH && iep_protection && iep) {
+		*prot = PROT_TYPE_IEP;
+		return PGM_PROTECTION;
+	}
 	if (kvm_is_error_gpa(vcpu->kvm, raddr.addr))
 		return PGM_ADDRESSING;
 	*gpa = raddr.addr;
@@ -782,6 +799,7 @@ static int guest_page_range(struct kvm_vcpu *vcpu, unsigned long ga, u8 ar,
 {
 	psw_t *psw = &vcpu->arch.sie_block->gpsw;
 	int lap_enabled, rc = 0;
+	enum prot_type prot;
 
 	lap_enabled = low_address_protection_enabled(vcpu, asce);
 	while (nr_pages) {
@@ -791,7 +809,7 @@ static int guest_page_range(struct kvm_vcpu *vcpu, unsigned long ga, u8 ar,
 					 PROT_TYPE_LA);
 		ga &= PAGE_MASK;
 		if (psw_bits(*psw).t) {
-			rc = guest_translate(vcpu, ga, pages, asce, mode);
+			rc = guest_translate(vcpu, ga, pages, asce, mode, &prot);
 			if (rc < 0)
 				return rc;
 		} else {
@@ -800,7 +818,7 @@ static int guest_page_range(struct kvm_vcpu *vcpu, unsigned long ga, u8 ar,
 				rc = PGM_ADDRESSING;
 		}
 		if (rc)
-			return trans_exc(vcpu, rc, ga, ar, mode, PROT_TYPE_DAT);
+			return trans_exc(vcpu, rc, ga, ar, mode, prot);
 		ga += PAGE_SIZE;
 		pages++;
 		nr_pages--;
@@ -886,6 +904,7 @@ int guest_translate_address(struct kvm_vcpu *vcpu, unsigned long gva, u8 ar,
 			    unsigned long *gpa, enum gacc_mode mode)
 {
 	psw_t *psw = &vcpu->arch.sie_block->gpsw;
+	enum prot_type prot;
 	union asce asce;
 	int rc;
 
@@ -900,9 +919,9 @@ int guest_translate_address(struct kvm_vcpu *vcpu, unsigned long gva, u8 ar,
 	}
 
 	if (psw_bits(*psw).t && !asce.r) {	/* Use DAT? */
-		rc = guest_translate(vcpu, gva, gpa, asce, mode);
+		rc = guest_translate(vcpu, gva, gpa, asce, mode, &prot);
 		if (rc > 0)
-			return trans_exc(vcpu, rc, gva, 0, mode, PROT_TYPE_DAT);
+			return trans_exc(vcpu, rc, gva, 0, mode, prot);
 	} else {
 		*gpa = kvm_s390_real_to_abs(vcpu, gva);
 		if (kvm_is_error_gpa(vcpu->kvm, *gpa))
diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c
index caf15c8a8948..f2c78fc1852d 100644
--- a/arch/s390/kvm/interrupt.c
+++ b/arch/s390/kvm/interrupt.c
@@ -251,8 +251,13 @@ static unsigned long deliverable_irqs(struct kvm_vcpu *vcpu)
 		__clear_bit(IRQ_PEND_EXT_SERVICE, &active_mask);
 	if (psw_mchk_disabled(vcpu))
 		active_mask &= ~IRQ_PEND_MCHK_MASK;
+	/*
+	 * Check both floating and local interrupt's cr14 because
+	 * bit IRQ_PEND_MCHK_REP could be set in both cases.
+	 */
 	if (!(vcpu->arch.sie_block->gcr[14] &
-	      vcpu->kvm->arch.float_int.mchk.cr14))
+	   (vcpu->kvm->arch.float_int.mchk.cr14 |
+	   vcpu->arch.local_int.irq.mchk.cr14)))
 		__clear_bit(IRQ_PEND_MCHK_REP, &active_mask);
 
 	/*
@@ -1876,6 +1881,28 @@ out:
 	return ret < 0 ? ret : n;
 }
 
+static int flic_ais_mode_get_all(struct kvm *kvm, struct kvm_device_attr *attr)
+{
+	struct kvm_s390_float_interrupt *fi = &kvm->arch.float_int;
+	struct kvm_s390_ais_all ais;
+
+	if (attr->attr < sizeof(ais))
+		return -EINVAL;
+
+	if (!test_kvm_facility(kvm, 72))
+		return -ENOTSUPP;
+
+	mutex_lock(&fi->ais_lock);
+	ais.simm = fi->simm;
+	ais.nimm = fi->nimm;
+	mutex_unlock(&fi->ais_lock);
+
+	if (copy_to_user((void __user *)attr->addr, &ais, sizeof(ais)))
+		return -EFAULT;
+
+	return 0;
+}
+
 static int flic_get_attr(struct kvm_device *dev, struct kvm_device_attr *attr)
 {
 	int r;
@@ -1885,6 +1912,9 @@ static int flic_get_attr(struct kvm_device *dev, struct kvm_device_attr *attr)
 		r = get_all_floating_irqs(dev->kvm, (u8 __user *) attr->addr,
 					  attr->attr);
 		break;
+	case KVM_DEV_FLIC_AISM_ALL:
+		r = flic_ais_mode_get_all(dev->kvm, attr);
+		break;
 	default:
 		r = -EINVAL;
 	}
@@ -2235,6 +2265,25 @@ static int flic_inject_airq(struct kvm *kvm, struct kvm_device_attr *attr)
 	return kvm_s390_inject_airq(kvm, adapter);
 }
 
+static int flic_ais_mode_set_all(struct kvm *kvm, struct kvm_device_attr *attr)
+{
+	struct kvm_s390_float_interrupt *fi = &kvm->arch.float_int;
+	struct kvm_s390_ais_all ais;
+
+	if (!test_kvm_facility(kvm, 72))
+		return -ENOTSUPP;
+
+	if (copy_from_user(&ais, (void __user *)attr->addr, sizeof(ais)))
+		return -EFAULT;
+
+	mutex_lock(&fi->ais_lock);
+	fi->simm = ais.simm;
+	fi->nimm = ais.nimm;
+	mutex_unlock(&fi->ais_lock);
+
+	return 0;
+}
+
 static int flic_set_attr(struct kvm_device *dev, struct kvm_device_attr *attr)
 {
 	int r = 0;
@@ -2277,6 +2326,9 @@ static int flic_set_attr(struct kvm_device *dev, struct kvm_device_attr *attr)
 	case KVM_DEV_FLIC_AIRQ_INJECT:
 		r = flic_inject_airq(dev->kvm, attr);
 		break;
+	case KVM_DEV_FLIC_AISM_ALL:
+		r = flic_ais_mode_set_all(dev->kvm, attr);
+		break;
 	default:
 		r = -EINVAL;
 	}
@@ -2298,6 +2350,7 @@ static int flic_has_attr(struct kvm_device *dev,
 	case KVM_DEV_FLIC_CLEAR_IO_IRQ:
 	case KVM_DEV_FLIC_AISM:
 	case KVM_DEV_FLIC_AIRQ_INJECT:
+	case KVM_DEV_FLIC_AISM_ALL:
 		return 0;
 	}
 	return -ENXIO;
@@ -2415,6 +2468,42 @@ static int set_adapter_int(struct kvm_kernel_irq_routing_entry *e,
 	return ret;
 }
 
+/*
+ * Inject the machine check to the guest.
+ */
+void kvm_s390_reinject_machine_check(struct kvm_vcpu *vcpu,
+				     struct mcck_volatile_info *mcck_info)
+{
+	struct kvm_s390_interrupt_info inti;
+	struct kvm_s390_irq irq;
+	struct kvm_s390_mchk_info *mchk;
+	union mci mci;
+	__u64 cr14 = 0;         /* upper bits are not used */
+
+	mci.val = mcck_info->mcic;
+	if (mci.sr)
+		cr14 |= MCCK_CR14_RECOVERY_SUB_MASK;
+	if (mci.dg)
+		cr14 |= MCCK_CR14_DEGRAD_SUB_MASK;
+	if (mci.w)
+		cr14 |= MCCK_CR14_WARN_SUB_MASK;
+
+	mchk = mci.ck ? &inti.mchk : &irq.u.mchk;
+	mchk->cr14 = cr14;
+	mchk->mcic = mcck_info->mcic;
+	mchk->ext_damage_code = mcck_info->ext_damage_code;
+	mchk->failing_storage_address = mcck_info->failing_storage_address;
+	if (mci.ck) {
+		/* Inject the floating machine check */
+		inti.type = KVM_S390_MCHK;
+		WARN_ON_ONCE(__inject_vm(vcpu->kvm, &inti));
+	} else {
+		/* Inject the machine check to specified vcpu */
+		irq.type = KVM_S390_MCHK;
+		WARN_ON_ONCE(kvm_s390_inject_vcpu(vcpu, &irq));
+	}
+}
+
 int kvm_set_routing_entry(struct kvm *kvm,
 			  struct kvm_kernel_irq_routing_entry *e,
 			  const struct kvm_irq_routing_entry *ue)
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index 689ac48361c6..ef6419654c16 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -30,6 +30,7 @@
 #include <linux/vmalloc.h>
 #include <linux/bitmap.h>
 #include <linux/sched/signal.h>
+#include <linux/string.h>
 
 #include <asm/asm-offsets.h>
 #include <asm/lowcore.h>
@@ -386,6 +387,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
 	case KVM_CAP_S390_SKEYS:
 	case KVM_CAP_S390_IRQ_STATE:
 	case KVM_CAP_S390_USER_INSTR0:
+	case KVM_CAP_S390_CMMA_MIGRATION:
 	case KVM_CAP_S390_AIS:
 		r = 1;
 		break;
@@ -750,6 +752,129 @@ static int kvm_s390_vm_set_crypto(struct kvm *kvm, struct kvm_device_attr *attr)
 	return 0;
 }
 
+static void kvm_s390_sync_request_broadcast(struct kvm *kvm, int req)
+{
+	int cx;
+	struct kvm_vcpu *vcpu;
+
+	kvm_for_each_vcpu(cx, vcpu, kvm)
+		kvm_s390_sync_request(req, vcpu);
+}
+
+/*
+ * Must be called with kvm->srcu held to avoid races on memslots, and with
+ * kvm->lock to avoid races with ourselves and kvm_s390_vm_stop_migration.
+ */
+static int kvm_s390_vm_start_migration(struct kvm *kvm)
+{
+	struct kvm_s390_migration_state *mgs;
+	struct kvm_memory_slot *ms;
+	/* should be the only one */
+	struct kvm_memslots *slots;
+	unsigned long ram_pages;
+	int slotnr;
+
+	/* migration mode already enabled */
+	if (kvm->arch.migration_state)
+		return 0;
+
+	slots = kvm_memslots(kvm);
+	if (!slots || !slots->used_slots)
+		return -EINVAL;
+
+	mgs = kzalloc(sizeof(*mgs), GFP_KERNEL);
+	if (!mgs)
+		return -ENOMEM;
+	kvm->arch.migration_state = mgs;
+
+	if (kvm->arch.use_cmma) {
+		/*
+		 * Get the last slot. They should be sorted by base_gfn, so the
+		 * last slot is also the one at the end of the address space.
+		 * We have verified above that at least one slot is present.
+		 */
+		ms = slots->memslots + slots->used_slots - 1;
+		/* round up so we only use full longs */
+		ram_pages = roundup(ms->base_gfn + ms->npages, BITS_PER_LONG);
+		/* allocate enough bytes to store all the bits */
+		mgs->pgste_bitmap = vmalloc(ram_pages / 8);
+		if (!mgs->pgste_bitmap) {
+			kfree(mgs);
+			kvm->arch.migration_state = NULL;
+			return -ENOMEM;
+		}
+
+		mgs->bitmap_size = ram_pages;
+		atomic64_set(&mgs->dirty_pages, ram_pages);
+		/* mark all the pages in active slots as dirty */
+		for (slotnr = 0; slotnr < slots->used_slots; slotnr++) {
+			ms = slots->memslots + slotnr;
+			bitmap_set(mgs->pgste_bitmap, ms->base_gfn, ms->npages);
+		}
+
+		kvm_s390_sync_request_broadcast(kvm, KVM_REQ_START_MIGRATION);
+	}
+	return 0;
+}
+
+/*
+ * Must be called with kvm->lock to avoid races with ourselves and
+ * kvm_s390_vm_start_migration.
+ */
+static int kvm_s390_vm_stop_migration(struct kvm *kvm)
+{
+	struct kvm_s390_migration_state *mgs;
+
+	/* migration mode already disabled */
+	if (!kvm->arch.migration_state)
+		return 0;
+	mgs = kvm->arch.migration_state;
+	kvm->arch.migration_state = NULL;
+
+	if (kvm->arch.use_cmma) {
+		kvm_s390_sync_request_broadcast(kvm, KVM_REQ_STOP_MIGRATION);
+		vfree(mgs->pgste_bitmap);
+	}
+	kfree(mgs);
+	return 0;
+}
+
+static int kvm_s390_vm_set_migration(struct kvm *kvm,
+				     struct kvm_device_attr *attr)
+{
+	int idx, res = -ENXIO;
+
+	mutex_lock(&kvm->lock);
+	switch (attr->attr) {
+	case KVM_S390_VM_MIGRATION_START:
+		idx = srcu_read_lock(&kvm->srcu);
+		res = kvm_s390_vm_start_migration(kvm);
+		srcu_read_unlock(&kvm->srcu, idx);
+		break;
+	case KVM_S390_VM_MIGRATION_STOP:
+		res = kvm_s390_vm_stop_migration(kvm);
+		break;
+	default:
+		break;
+	}
+	mutex_unlock(&kvm->lock);
+
+	return res;
+}
+
+static int kvm_s390_vm_get_migration(struct kvm *kvm,
+				     struct kvm_device_attr *attr)
+{
+	u64 mig = (kvm->arch.migration_state != NULL);
+
+	if (attr->attr != KVM_S390_VM_MIGRATION_STATUS)
+		return -ENXIO;
+
+	if (copy_to_user((void __user *)attr->addr, &mig, sizeof(mig)))
+		return -EFAULT;
+	return 0;
+}
+
 static int kvm_s390_set_tod_high(struct kvm *kvm, struct kvm_device_attr *attr)
 {
 	u8 gtod_high;
@@ -1090,6 +1215,9 @@ static int kvm_s390_vm_set_attr(struct kvm *kvm, struct kvm_device_attr *attr)
 	case KVM_S390_VM_CRYPTO:
 		ret = kvm_s390_vm_set_crypto(kvm, attr);
 		break;
+	case KVM_S390_VM_MIGRATION:
+		ret = kvm_s390_vm_set_migration(kvm, attr);
+		break;
 	default:
 		ret = -ENXIO;
 		break;
@@ -1112,6 +1240,9 @@ static int kvm_s390_vm_get_attr(struct kvm *kvm, struct kvm_device_attr *attr)
 	case KVM_S390_VM_CPU_MODEL:
 		ret = kvm_s390_get_cpu_model(kvm, attr);
 		break;
+	case KVM_S390_VM_MIGRATION:
+		ret = kvm_s390_vm_get_migration(kvm, attr);
+		break;
 	default:
 		ret = -ENXIO;
 		break;
@@ -1179,6 +1310,9 @@ static int kvm_s390_vm_has_attr(struct kvm *kvm, struct kvm_device_attr *attr)
 			break;
 		}
 		break;
+	case KVM_S390_VM_MIGRATION:
+		ret = 0;
+		break;
 	default:
 		ret = -ENXIO;
 		break;
@@ -1286,6 +1420,182 @@ out:
 	return r;
 }
 
+/*
+ * Base address and length must be sent at the start of each block, therefore
+ * it's cheaper to send some clean data, as long as it's less than the size of
+ * two longs.
+ */
+#define KVM_S390_MAX_BIT_DISTANCE (2 * sizeof(void *))
+/* for consistency */
+#define KVM_S390_CMMA_SIZE_MAX ((u32)KVM_S390_SKEYS_MAX)
+
+/*
+ * This function searches for the next page with dirty CMMA attributes, and
+ * saves the attributes in the buffer up to either the end of the buffer or
+ * until a block of at least KVM_S390_MAX_BIT_DISTANCE clean bits is found;
+ * no trailing clean bytes are saved.
+ * In case no dirty bits were found, or if CMMA was not enabled or used, the
+ * output buffer will indicate 0 as length.
+ */
+static int kvm_s390_get_cmma_bits(struct kvm *kvm,
+				  struct kvm_s390_cmma_log *args)
+{
+	struct kvm_s390_migration_state *s = kvm->arch.migration_state;
+	unsigned long bufsize, hva, pgstev, i, next, cur;
+	int srcu_idx, peek, r = 0, rr;
+	u8 *res;
+
+	cur = args->start_gfn;
+	i = next = pgstev = 0;
+
+	if (unlikely(!kvm->arch.use_cmma))
+		return -ENXIO;
+	/* Invalid/unsupported flags were specified */
+	if (args->flags & ~KVM_S390_CMMA_PEEK)
+		return -EINVAL;
+	/* Migration mode query, and we are not doing a migration */
+	peek = !!(args->flags & KVM_S390_CMMA_PEEK);
+	if (!peek && !s)
+		return -EINVAL;
+	/* CMMA is disabled or was not used, or the buffer has length zero */
+	bufsize = min(args->count, KVM_S390_CMMA_SIZE_MAX);
+	if (!bufsize || !kvm->mm->context.use_cmma) {
+		memset(args, 0, sizeof(*args));
+		return 0;
+	}
+
+	if (!peek) {
+		/* We are not peeking, and there are no dirty pages */
+		if (!atomic64_read(&s->dirty_pages)) {
+			memset(args, 0, sizeof(*args));
+			return 0;
+		}
+		cur = find_next_bit(s->pgste_bitmap, s->bitmap_size,
+				    args->start_gfn);
+		if (cur >= s->bitmap_size)	/* nothing found, loop back */
+			cur = find_next_bit(s->pgste_bitmap, s->bitmap_size, 0);
+		if (cur >= s->bitmap_size) {	/* again! (very unlikely) */
+			memset(args, 0, sizeof(*args));
+			return 0;
+		}
+		next = find_next_bit(s->pgste_bitmap, s->bitmap_size, cur + 1);
+	}
+
+	res = vmalloc(bufsize);
+	if (!res)
+		return -ENOMEM;
+
+	args->start_gfn = cur;
+
+	down_read(&kvm->mm->mmap_sem);
+	srcu_idx = srcu_read_lock(&kvm->srcu);
+	while (i < bufsize) {
+		hva = gfn_to_hva(kvm, cur);
+		if (kvm_is_error_hva(hva)) {
+			r = -EFAULT;
+			break;
+		}
+		/* decrement only if we actually flipped the bit to 0 */
+		if (!peek && test_and_clear_bit(cur, s->pgste_bitmap))
+			atomic64_dec(&s->dirty_pages);
+		r = get_pgste(kvm->mm, hva, &pgstev);
+		if (r < 0)
+			pgstev = 0;
+		/* save the value */
+		res[i++] = (pgstev >> 24) & 0x3;
+		/*
+		 * if the next bit is too far away, stop.
+		 * if we reached the previous "next", find the next one
+		 */
+		if (!peek) {
+			if (next > cur + KVM_S390_MAX_BIT_DISTANCE)
+				break;
+			if (cur == next)
+				next = find_next_bit(s->pgste_bitmap,
+						     s->bitmap_size, cur + 1);
+		/* reached the end of the bitmap or of the buffer, stop */
+			if ((next >= s->bitmap_size) ||
+			    (next >= args->start_gfn + bufsize))
+				break;
+		}
+		cur++;
+	}
+	srcu_read_unlock(&kvm->srcu, srcu_idx);
+	up_read(&kvm->mm->mmap_sem);
+	args->count = i;
+	args->remaining = s ? atomic64_read(&s->dirty_pages) : 0;
+
+	rr = copy_to_user((void __user *)args->values, res, args->count);
+	if (rr)
+		r = -EFAULT;
+
+	vfree(res);
+	return r;
+}
+
+/*
+ * This function sets the CMMA attributes for the given pages. If the input
+ * buffer has zero length, no action is taken, otherwise the attributes are
+ * set and the mm->context.use_cmma flag is set.
+ */
+static int kvm_s390_set_cmma_bits(struct kvm *kvm,
+				  const struct kvm_s390_cmma_log *args)
+{
+	unsigned long hva, mask, pgstev, i;
+	uint8_t *bits;
+	int srcu_idx, r = 0;
+
+	mask = args->mask;
+
+	if (!kvm->arch.use_cmma)
+		return -ENXIO;
+	/* invalid/unsupported flags */
+	if (args->flags != 0)
+		return -EINVAL;
+	/* Enforce sane limit on memory allocation */
+	if (args->count > KVM_S390_CMMA_SIZE_MAX)
+		return -EINVAL;
+	/* Nothing to do */
+	if (args->count == 0)
+		return 0;
+
+	bits = vmalloc(sizeof(*bits) * args->count);
+	if (!bits)
+		return -ENOMEM;
+
+	r = copy_from_user(bits, (void __user *)args->values, args->count);
+	if (r) {
+		r = -EFAULT;
+		goto out;
+	}
+
+	down_read(&kvm->mm->mmap_sem);
+	srcu_idx = srcu_read_lock(&kvm->srcu);
+	for (i = 0; i < args->count; i++) {
+		hva = gfn_to_hva(kvm, args->start_gfn + i);
+		if (kvm_is_error_hva(hva)) {
+			r = -EFAULT;
+			break;
+		}
+
+		pgstev = bits[i];
+		pgstev = pgstev << 24;
+		mask &= _PGSTE_GPS_USAGE_MASK;
+		set_pgste_bits(kvm->mm, hva, mask, pgstev);
+	}
+	srcu_read_unlock(&kvm->srcu, srcu_idx);
+	up_read(&kvm->mm->mmap_sem);
+
+	if (!kvm->mm->context.use_cmma) {
+		down_write(&kvm->mm->mmap_sem);
+		kvm->mm->context.use_cmma = 1;
+		up_write(&kvm->mm->mmap_sem);
+	}
+out:
+	vfree(bits);
+	return r;
+}
+
 long kvm_arch_vm_ioctl(struct file *filp,
 		       unsigned int ioctl, unsigned long arg)
 {
@@ -1364,6 +1674,29 @@ long kvm_arch_vm_ioctl(struct file *filp,
 		r = kvm_s390_set_skeys(kvm, &args);
 		break;
 	}
+	case KVM_S390_GET_CMMA_BITS: {
+		struct kvm_s390_cmma_log args;
+
+		r = -EFAULT;
+		if (copy_from_user(&args, argp, sizeof(args)))
+			break;
+		r = kvm_s390_get_cmma_bits(kvm, &args);
+		if (!r) {
+			r = copy_to_user(argp, &args, sizeof(args));
+			if (r)
+				r = -EFAULT;
+		}
+		break;
+	}
+	case KVM_S390_SET_CMMA_BITS: {
+		struct kvm_s390_cmma_log args;
+
+		r = -EFAULT;
+		if (copy_from_user(&args, argp, sizeof(args)))
+			break;
+		r = kvm_s390_set_cmma_bits(kvm, &args);
+		break;
+	}
 	default:
 		r = -ENOTTY;
 	}
@@ -1633,6 +1966,10 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
 	kvm_s390_destroy_adapters(kvm);
 	kvm_s390_clear_float_irqs(kvm);
 	kvm_s390_vsie_destroy(kvm);
+	if (kvm->arch.migration_state) {
+		vfree(kvm->arch.migration_state->pgste_bitmap);
+		kfree(kvm->arch.migration_state);
+	}
 	KVM_EVENT(3, "vm 0x%pK destroyed", kvm);
 }
 
@@ -1977,7 +2314,6 @@ int kvm_s390_vcpu_setup_cmma(struct kvm_vcpu *vcpu)
 	if (!vcpu->arch.sie_block->cbrlo)
 		return -ENOMEM;
 
-	vcpu->arch.sie_block->ecb2 |= ECB2_CMMA;
 	vcpu->arch.sie_block->ecb2 &= ~ECB2_PFMFI;
 	return 0;
 }
@@ -2069,6 +2405,7 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm,
 	if (!vcpu)
 		goto out;
 
+	BUILD_BUG_ON(sizeof(struct sie_page) != 4096);
 	sie_page = (struct sie_page *) get_zeroed_page(GFP_KERNEL);
 	if (!sie_page)
 		goto out_free_cpu;
@@ -2440,7 +2777,7 @@ static int kvm_s390_handle_requests(struct kvm_vcpu *vcpu)
 {
 retry:
 	kvm_s390_vcpu_request_handled(vcpu);
-	if (!vcpu->requests)
+	if (!kvm_request_pending(vcpu))
 		return 0;
 	/*
 	 * We use MMU_RELOAD just to re-arm the ipte notifier for the
@@ -2489,6 +2826,27 @@ retry:
 		goto retry;
 	}
 
+	if (kvm_check_request(KVM_REQ_START_MIGRATION, vcpu)) {
+		/*
+		 * Disable CMMA virtualization; we will emulate the ESSA
+		 * instruction manually, in order to provide additional
+		 * functionalities needed for live migration.
+		 */
+		vcpu->arch.sie_block->ecb2 &= ~ECB2_CMMA;
+		goto retry;
+	}
+
+	if (kvm_check_request(KVM_REQ_STOP_MIGRATION, vcpu)) {
+		/*
+		 * Re-enable CMMA virtualization if CMMA is available and
+		 * was used.
+		 */
+		if ((vcpu->kvm->arch.use_cmma) &&
+		    (vcpu->kvm->mm->context.use_cmma))
+			vcpu->arch.sie_block->ecb2 |= ECB2_CMMA;
+		goto retry;
+	}
+
 	/* nothing to do, just clear the request */
 	kvm_clear_request(KVM_REQ_UNHALT, vcpu);
 
@@ -2683,6 +3041,9 @@ static int vcpu_post_run_fault_in_sie(struct kvm_vcpu *vcpu)
 
 static int vcpu_post_run(struct kvm_vcpu *vcpu, int exit_reason)
 {
+	struct mcck_volatile_info *mcck_info;
+	struct sie_page *sie_page;
+
 	VCPU_EVENT(vcpu, 6, "exit sie icptcode %d",
 		   vcpu->arch.sie_block->icptcode);
 	trace_kvm_s390_sie_exit(vcpu, vcpu->arch.sie_block->icptcode);
@@ -2693,6 +3054,15 @@ static int vcpu_post_run(struct kvm_vcpu *vcpu, int exit_reason)
 	vcpu->run->s.regs.gprs[14] = vcpu->arch.sie_block->gg14;
 	vcpu->run->s.regs.gprs[15] = vcpu->arch.sie_block->gg15;
 
+	if (exit_reason == -EINTR) {
+		VCPU_EVENT(vcpu, 3, "%s", "machine check");
+		sie_page = container_of(vcpu->arch.sie_block,
+					struct sie_page, sie_block);
+		mcck_info = &sie_page->mcck_info;
+		kvm_s390_reinject_machine_check(vcpu, mcck_info);
+		return 0;
+	}
+
 	if (vcpu->arch.sie_block->icptcode > 0) {
 		int rc = kvm_handle_sie_intercept(vcpu);
 
diff --git a/arch/s390/kvm/kvm-s390.h b/arch/s390/kvm/kvm-s390.h
index 55f5c8457d6d..6fedc8bc7a37 100644
--- a/arch/s390/kvm/kvm-s390.h
+++ b/arch/s390/kvm/kvm-s390.h
@@ -397,4 +397,6 @@ static inline int kvm_s390_use_sca_entries(void)
 	 */
 	return sclp.has_sigpif;
 }
+void kvm_s390_reinject_machine_check(struct kvm_vcpu *vcpu,
+				     struct mcck_volatile_info *mcck_info);
 #endif
diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c
index c03106c428cf..a226c459809b 100644
--- a/arch/s390/kvm/priv.c
+++ b/arch/s390/kvm/priv.c
@@ -24,6 +24,7 @@
 #include <asm/ebcdic.h>
 #include <asm/sysinfo.h>
 #include <asm/pgtable.h>
+#include <asm/page-states.h>
 #include <asm/pgalloc.h>
 #include <asm/gmap.h>
 #include <asm/io.h>
@@ -949,13 +950,72 @@ static int handle_pfmf(struct kvm_vcpu *vcpu)
 	return 0;
 }
 
+static inline int do_essa(struct kvm_vcpu *vcpu, const int orc)
+{
+	struct kvm_s390_migration_state *ms = vcpu->kvm->arch.migration_state;
+	int r1, r2, nappended, entries;
+	unsigned long gfn, hva, res, pgstev, ptev;
+	unsigned long *cbrlo;
+
+	/*
+	 * We don't need to set SD.FPF.SK to 1 here, because if we have a
+	 * machine check here we either handle it or crash
+	 */
+
+	kvm_s390_get_regs_rre(vcpu, &r1, &r2);
+	gfn = vcpu->run->s.regs.gprs[r2] >> PAGE_SHIFT;
+	hva = gfn_to_hva(vcpu->kvm, gfn);
+	entries = (vcpu->arch.sie_block->cbrlo & ~PAGE_MASK) >> 3;
+
+	if (kvm_is_error_hva(hva))
+		return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
+
+	nappended = pgste_perform_essa(vcpu->kvm->mm, hva, orc, &ptev, &pgstev);
+	if (nappended < 0) {
+		res = orc ? 0x10 : 0;
+		vcpu->run->s.regs.gprs[r1] = res; /* Exception Indication */
+		return 0;
+	}
+	res = (pgstev & _PGSTE_GPS_USAGE_MASK) >> 22;
+	/*
+	 * Set the block-content state part of the result. 0 means resident, so
+	 * nothing to do if the page is valid. 2 is for preserved pages
+	 * (non-present and non-zero), and 3 for zero pages (non-present and
+	 * zero).
+	 */
+	if (ptev & _PAGE_INVALID) {
+		res |= 2;
+		if (pgstev & _PGSTE_GPS_ZERO)
+			res |= 1;
+	}
+	vcpu->run->s.regs.gprs[r1] = res;
+	/*
+	 * It is possible that all the normal 511 slots were full, in which case
+	 * we will now write in the 512th slot, which is reserved for host use.
+	 * In both cases we let the normal essa handling code process all the
+	 * slots, including the reserved one, if needed.
+	 */
+	if (nappended > 0) {
+		cbrlo = phys_to_virt(vcpu->arch.sie_block->cbrlo & PAGE_MASK);
+		cbrlo[entries] = gfn << PAGE_SHIFT;
+	}
+
+	if (orc) {
+		/* increment only if we are really flipping the bit to 1 */
+		if (!test_and_set_bit(gfn, ms->pgste_bitmap))
+			atomic64_inc(&ms->dirty_pages);
+	}
+
+	return nappended;
+}
+
 static int handle_essa(struct kvm_vcpu *vcpu)
 {
 	/* entries expected to be 1FF */
 	int entries = (vcpu->arch.sie_block->cbrlo & ~PAGE_MASK) >> 3;
 	unsigned long *cbrlo;
 	struct gmap *gmap;
-	int i;
+	int i, orc;
 
 	VCPU_EVENT(vcpu, 4, "ESSA: release %d pages", entries);
 	gmap = vcpu->arch.gmap;
@@ -965,12 +1025,45 @@ static int handle_essa(struct kvm_vcpu *vcpu)
 
 	if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE)
 		return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP);
-
-	if (((vcpu->arch.sie_block->ipb & 0xf0000000) >> 28) > 6)
+	/* Check for invalid operation request code */
+	orc = (vcpu->arch.sie_block->ipb & 0xf0000000) >> 28;
+	if (orc > ESSA_MAX)
 		return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
 
-	/* Retry the ESSA instruction */
-	kvm_s390_retry_instr(vcpu);
+	if (likely(!vcpu->kvm->arch.migration_state)) {
+		/*
+		 * CMMA is enabled in the KVM settings, but is disabled in
+		 * the SIE block and in the mm_context, and we are not doing
+		 * a migration. Enable CMMA in the mm_context.
+		 * Since we need to take a write lock to write to the context
+		 * to avoid races with storage keys handling, we check if the
+		 * value really needs to be written to; if the value is
+		 * already correct, we do nothing and avoid the lock.
+		 */
+		if (vcpu->kvm->mm->context.use_cmma == 0) {
+			down_write(&vcpu->kvm->mm->mmap_sem);
+			vcpu->kvm->mm->context.use_cmma = 1;
+			up_write(&vcpu->kvm->mm->mmap_sem);
+		}
+		/*
+		 * If we are here, we are supposed to have CMMA enabled in
+		 * the SIE block. Enabling CMMA works on a per-CPU basis,
+		 * while the context use_cmma flag is per process.
+		 * It's possible that the context flag is enabled and the
+		 * SIE flag is not, so we set the flag always; if it was
+		 * already set, nothing changes, otherwise we enable it
+		 * on this CPU too.
+		 */
+		vcpu->arch.sie_block->ecb2 |= ECB2_CMMA;
+		/* Retry the ESSA instruction */
+		kvm_s390_retry_instr(vcpu);
+	} else {
+		/* Account for the possible extra cbrl entry */
+		i = do_essa(vcpu, orc);
+		if (i < 0)
+			return i;
+		entries += i;
+	}
 	vcpu->arch.sie_block->cbrlo &= PAGE_MASK;	/* reset nceo */
 	cbrlo = phys_to_virt(vcpu->arch.sie_block->cbrlo);
 	down_read(&gmap->mm->mmap_sem);
diff --git a/arch/s390/kvm/vsie.c b/arch/s390/kvm/vsie.c
index 4719ecb9ab42..715c19c45d9a 100644
--- a/arch/s390/kvm/vsie.c
+++ b/arch/s390/kvm/vsie.c
@@ -26,16 +26,21 @@
 
 struct vsie_page {
 	struct kvm_s390_sie_block scb_s;	/* 0x0000 */
+	/*
+	 * the backup info for machine check. ensure it's at
+	 * the same offset as that in struct sie_page!
+	 */
+	struct mcck_volatile_info mcck_info;    /* 0x0200 */
 	/* the pinned originial scb */
-	struct kvm_s390_sie_block *scb_o;	/* 0x0200 */
+	struct kvm_s390_sie_block *scb_o;	/* 0x0218 */
 	/* the shadow gmap in use by the vsie_page */
-	struct gmap *gmap;			/* 0x0208 */
+	struct gmap *gmap;			/* 0x0220 */
 	/* address of the last reported fault to guest2 */
-	unsigned long fault_addr;		/* 0x0210 */
-	__u8 reserved[0x0700 - 0x0218];		/* 0x0218 */
+	unsigned long fault_addr;		/* 0x0228 */
+	__u8 reserved[0x0700 - 0x0230];		/* 0x0230 */
 	struct kvm_s390_crypto_cb crycb;	/* 0x0700 */
 	__u8 fac[S390_ARCH_FAC_LIST_SIZE_BYTE];	/* 0x0800 */
-} __packed;
+};
 
 /* trigger a validity icpt for the given scb */
 static int set_validity_icpt(struct kvm_s390_sie_block *scb,
@@ -801,6 +806,8 @@ static int do_vsie_run(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
 {
 	struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s;
 	struct kvm_s390_sie_block *scb_o = vsie_page->scb_o;
+	struct mcck_volatile_info *mcck_info;
+	struct sie_page *sie_page;
 	int rc;
 
 	handle_last_fault(vcpu, vsie_page);
@@ -822,6 +829,14 @@ static int do_vsie_run(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
 	local_irq_enable();
 	vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
 
+	if (rc == -EINTR) {
+		VCPU_EVENT(vcpu, 3, "%s", "machine check");
+		sie_page = container_of(scb_s, struct sie_page, sie_block);
+		mcck_info = &sie_page->mcck_info;
+		kvm_s390_reinject_machine_check(vcpu, mcck_info);
+		return 0;
+	}
+
 	if (rc > 0)
 		rc = 0; /* we could still have an icpt */
 	else if (rc == -EFAULT)
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 695605eb1dfb..9be890893885 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -48,28 +48,31 @@
 #define KVM_IRQCHIP_NUM_PINS  KVM_IOAPIC_NUM_PINS
 
 /* x86-specific vcpu->requests bit members */
-#define KVM_REQ_MIGRATE_TIMER      8
-#define KVM_REQ_REPORT_TPR_ACCESS  9
-#define KVM_REQ_TRIPLE_FAULT      10
-#define KVM_REQ_MMU_SYNC          11
-#define KVM_REQ_CLOCK_UPDATE      12
-#define KVM_REQ_EVENT             14
-#define KVM_REQ_APF_HALT          15
-#define KVM_REQ_STEAL_UPDATE      16
-#define KVM_REQ_NMI               17
-#define KVM_REQ_PMU               18
-#define KVM_REQ_PMI               19
-#define KVM_REQ_SMI               20
-#define KVM_REQ_MASTERCLOCK_UPDATE 21
-#define KVM_REQ_MCLOCK_INPROGRESS (22 | KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP)
-#define KVM_REQ_SCAN_IOAPIC       (23 | KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP)
-#define KVM_REQ_GLOBAL_CLOCK_UPDATE 24
-#define KVM_REQ_APIC_PAGE_RELOAD  (25 | KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP)
-#define KVM_REQ_HV_CRASH          26
-#define KVM_REQ_IOAPIC_EOI_EXIT   27
-#define KVM_REQ_HV_RESET          28
-#define KVM_REQ_HV_EXIT           29
-#define KVM_REQ_HV_STIMER         30
+#define KVM_REQ_MIGRATE_TIMER		KVM_ARCH_REQ(0)
+#define KVM_REQ_REPORT_TPR_ACCESS	KVM_ARCH_REQ(1)
+#define KVM_REQ_TRIPLE_FAULT		KVM_ARCH_REQ(2)
+#define KVM_REQ_MMU_SYNC		KVM_ARCH_REQ(3)
+#define KVM_REQ_CLOCK_UPDATE		KVM_ARCH_REQ(4)
+#define KVM_REQ_EVENT			KVM_ARCH_REQ(6)
+#define KVM_REQ_APF_HALT		KVM_ARCH_REQ(7)
+#define KVM_REQ_STEAL_UPDATE		KVM_ARCH_REQ(8)
+#define KVM_REQ_NMI			KVM_ARCH_REQ(9)
+#define KVM_REQ_PMU			KVM_ARCH_REQ(10)
+#define KVM_REQ_PMI			KVM_ARCH_REQ(11)
+#define KVM_REQ_SMI			KVM_ARCH_REQ(12)
+#define KVM_REQ_MASTERCLOCK_UPDATE	KVM_ARCH_REQ(13)
+#define KVM_REQ_MCLOCK_INPROGRESS \
+	KVM_ARCH_REQ_FLAGS(14, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP)
+#define KVM_REQ_SCAN_IOAPIC \
+	KVM_ARCH_REQ_FLAGS(15, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP)
+#define KVM_REQ_GLOBAL_CLOCK_UPDATE	KVM_ARCH_REQ(16)
+#define KVM_REQ_APIC_PAGE_RELOAD \
+	KVM_ARCH_REQ_FLAGS(17, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP)
+#define KVM_REQ_HV_CRASH		KVM_ARCH_REQ(18)
+#define KVM_REQ_IOAPIC_EOI_EXIT		KVM_ARCH_REQ(19)
+#define KVM_REQ_HV_RESET		KVM_ARCH_REQ(20)
+#define KVM_REQ_HV_EXIT			KVM_ARCH_REQ(21)
+#define KVM_REQ_HV_STIMER		KVM_ARCH_REQ(22)
 
 #define CR0_RESERVED_BITS                                               \
 	(~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index 673f9ac50f6d..dbf266b0d14a 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -420,6 +420,8 @@
 #define MSR_IA32_TSC_ADJUST             0x0000003b
 #define MSR_IA32_BNDCFGS		0x00000d90
 
+#define MSR_IA32_BNDCFGS_RSVD		0x00000ffc
+
 #define MSR_IA32_XSS			0x00000da0
 
 #define FEATURE_CONTROL_LOCKED				(1<<0)
diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h
index a6fd40aade7c..da6728383052 100644
--- a/arch/x86/kvm/cpuid.h
+++ b/arch/x86/kvm/cpuid.h
@@ -144,6 +144,14 @@ static inline bool guest_cpuid_has_rtm(struct kvm_vcpu *vcpu)
 	return best && (best->ebx & bit(X86_FEATURE_RTM));
 }
 
+static inline bool guest_cpuid_has_mpx(struct kvm_vcpu *vcpu)
+{
+	struct kvm_cpuid_entry2 *best;
+
+	best = kvm_find_cpuid_entry(vcpu, 7, 0);
+	return best && (best->ebx & bit(X86_FEATURE_MPX));
+}
+
 static inline bool guest_cpuid_has_rdtscp(struct kvm_vcpu *vcpu)
 {
 	struct kvm_cpuid_entry2 *best;
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 0816ab2e8adc..4a38b9656391 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -900,7 +900,7 @@ static __always_inline int do_insn_fetch_bytes(struct x86_emulate_ctxt *ctxt,
 	if (rc != X86EMUL_CONTINUE)					\
 		goto done;						\
 	ctxt->_eip += sizeof(_type);					\
-	_x = *(_type __aligned(1) *) ctxt->fetch.ptr;			\
+	memcpy(&_x, ctxt->fetch.ptr, sizeof(_type));			\
 	ctxt->fetch.ptr += sizeof(_type);				\
 	_x;								\
 })
@@ -3941,6 +3941,25 @@ static int check_fxsr(struct x86_emulate_ctxt *ctxt)
 }
 
 /*
+ * Hardware doesn't save and restore XMM 0-7 without CR4.OSFXSR, but does save
+ * and restore MXCSR.
+ */
+static size_t __fxstate_size(int nregs)
+{
+	return offsetof(struct fxregs_state, xmm_space[0]) + nregs * 16;
+}
+
+static inline size_t fxstate_size(struct x86_emulate_ctxt *ctxt)
+{
+	bool cr4_osfxsr;
+	if (ctxt->mode == X86EMUL_MODE_PROT64)
+		return __fxstate_size(16);
+
+	cr4_osfxsr = ctxt->ops->get_cr(ctxt, 4) & X86_CR4_OSFXSR;
+	return __fxstate_size(cr4_osfxsr ? 8 : 0);
+}
+
+/*
  * FXSAVE and FXRSTOR have 4 different formats depending on execution mode,
  *  1) 16 bit mode
  *  2) 32 bit mode
@@ -3961,7 +3980,6 @@ static int check_fxsr(struct x86_emulate_ctxt *ctxt)
 static int em_fxsave(struct x86_emulate_ctxt *ctxt)
 {
 	struct fxregs_state fx_state;
-	size_t size;
 	int rc;
 
 	rc = check_fxsr(ctxt);
@@ -3977,68 +3995,42 @@ static int em_fxsave(struct x86_emulate_ctxt *ctxt)
 	if (rc != X86EMUL_CONTINUE)
 		return rc;
 
-	if (ctxt->ops->get_cr(ctxt, 4) & X86_CR4_OSFXSR)
-		size = offsetof(struct fxregs_state, xmm_space[8 * 16/4]);
-	else
-		size = offsetof(struct fxregs_state, xmm_space[0]);
-
-	return segmented_write_std(ctxt, ctxt->memop.addr.mem, &fx_state, size);
-}
-
-static int fxrstor_fixup(struct x86_emulate_ctxt *ctxt,
-		struct fxregs_state *new)
-{
-	int rc = X86EMUL_CONTINUE;
-	struct fxregs_state old;
-
-	rc = asm_safe("fxsave %[fx]", , [fx] "+m"(old));
-	if (rc != X86EMUL_CONTINUE)
-		return rc;
-
-	/*
-	 * 64 bit host will restore XMM 8-15, which is not correct on non-64
-	 * bit guests.  Load the current values in order to preserve 64 bit
-	 * XMMs after fxrstor.
-	 */
-#ifdef CONFIG_X86_64
-	/* XXX: accessing XMM 8-15 very awkwardly */
-	memcpy(&new->xmm_space[8 * 16/4], &old.xmm_space[8 * 16/4], 8 * 16);
-#endif
-
-	/*
-	 * Hardware doesn't save and restore XMM 0-7 without CR4.OSFXSR, but
-	 * does save and restore MXCSR.
-	 */
-	if (!(ctxt->ops->get_cr(ctxt, 4) & X86_CR4_OSFXSR))
-		memcpy(new->xmm_space, old.xmm_space, 8 * 16);
-
-	return rc;
+	return segmented_write_std(ctxt, ctxt->memop.addr.mem, &fx_state,
+		                   fxstate_size(ctxt));
 }
 
 static int em_fxrstor(struct x86_emulate_ctxt *ctxt)
 {
 	struct fxregs_state fx_state;
 	int rc;
+	size_t size;
 
 	rc = check_fxsr(ctxt);
 	if (rc != X86EMUL_CONTINUE)
 		return rc;
 
-	rc = segmented_read_std(ctxt, ctxt->memop.addr.mem, &fx_state, 512);
-	if (rc != X86EMUL_CONTINUE)
-		return rc;
+	ctxt->ops->get_fpu(ctxt);
 
-	if (fx_state.mxcsr >> 16)
-		return emulate_gp(ctxt, 0);
+	size = fxstate_size(ctxt);
+	if (size < __fxstate_size(16)) {
+		rc = asm_safe("fxsave %[fx]", , [fx] "+m"(fx_state));
+		if (rc != X86EMUL_CONTINUE)
+			goto out;
+	}
 
-	ctxt->ops->get_fpu(ctxt);
+	rc = segmented_read_std(ctxt, ctxt->memop.addr.mem, &fx_state, size);
+	if (rc != X86EMUL_CONTINUE)
+		goto out;
 
-	if (ctxt->mode < X86EMUL_MODE_PROT64)
-		rc = fxrstor_fixup(ctxt, &fx_state);
+	if (fx_state.mxcsr >> 16) {
+		rc = emulate_gp(ctxt, 0);
+		goto out;
+	}
 
 	if (rc == X86EMUL_CONTINUE)
 		rc = asm_safe("fxrstor %[fx]", : [fx] "m"(fx_state));
 
+out:
 	ctxt->ops->put_fpu(ctxt);
 
 	return rc;
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index c329d2894905..2819d4c123eb 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -1495,31 +1495,65 @@ EXPORT_SYMBOL_GPL(kvm_lapic_hv_timer_in_use);
 
 static void cancel_hv_timer(struct kvm_lapic *apic)
 {
+	WARN_ON(!apic->lapic_timer.hv_timer_in_use);
+	preempt_disable();
 	kvm_x86_ops->cancel_hv_timer(apic->vcpu);
 	apic->lapic_timer.hv_timer_in_use = false;
+	preempt_enable();
 }
 
 static bool start_hv_timer(struct kvm_lapic *apic)
 {
-	u64 tscdeadline = apic->lapic_timer.tscdeadline;
+	struct kvm_timer *ktimer = &apic->lapic_timer;
+	int r;
 
-	if ((atomic_read(&apic->lapic_timer.pending) &&
-		!apic_lvtt_period(apic)) ||
-		kvm_x86_ops->set_hv_timer(apic->vcpu, tscdeadline)) {
-		if (apic->lapic_timer.hv_timer_in_use)
-			cancel_hv_timer(apic);
-	} else {
-		apic->lapic_timer.hv_timer_in_use = true;
-		hrtimer_cancel(&apic->lapic_timer.timer);
+	if (!kvm_x86_ops->set_hv_timer)
+		return false;
+
+	if (!apic_lvtt_period(apic) && atomic_read(&ktimer->pending))
+		return false;
 
-		/* In case the sw timer triggered in the window */
-		if (atomic_read(&apic->lapic_timer.pending) &&
-			!apic_lvtt_period(apic))
-			cancel_hv_timer(apic);
+	r = kvm_x86_ops->set_hv_timer(apic->vcpu, ktimer->tscdeadline);
+	if (r < 0)
+		return false;
+
+	ktimer->hv_timer_in_use = true;
+	hrtimer_cancel(&ktimer->timer);
+
+	/*
+	 * Also recheck ktimer->pending, in case the sw timer triggered in
+	 * the window.  For periodic timer, leave the hv timer running for
+	 * simplicity, and the deadline will be recomputed on the next vmexit.
+	 */
+	if (!apic_lvtt_period(apic) && (r || atomic_read(&ktimer->pending))) {
+		if (r)
+			apic_timer_expired(apic);
+		return false;
 	}
-	trace_kvm_hv_timer_state(apic->vcpu->vcpu_id,
-			apic->lapic_timer.hv_timer_in_use);
-	return apic->lapic_timer.hv_timer_in_use;
+
+	trace_kvm_hv_timer_state(apic->vcpu->vcpu_id, true);
+	return true;
+}
+
+static void start_sw_timer(struct kvm_lapic *apic)
+{
+	struct kvm_timer *ktimer = &apic->lapic_timer;
+	if (apic->lapic_timer.hv_timer_in_use)
+		cancel_hv_timer(apic);
+	if (!apic_lvtt_period(apic) && atomic_read(&ktimer->pending))
+		return;
+
+	if (apic_lvtt_period(apic) || apic_lvtt_oneshot(apic))
+		start_sw_period(apic);
+	else if (apic_lvtt_tscdeadline(apic))
+		start_sw_tscdeadline(apic);
+	trace_kvm_hv_timer_state(apic->vcpu->vcpu_id, false);
+}
+
+static void restart_apic_timer(struct kvm_lapic *apic)
+{
+	if (!start_hv_timer(apic))
+		start_sw_timer(apic);
 }
 
 void kvm_lapic_expired_hv_timer(struct kvm_vcpu *vcpu)
@@ -1533,19 +1567,14 @@ void kvm_lapic_expired_hv_timer(struct kvm_vcpu *vcpu)
 
 	if (apic_lvtt_period(apic) && apic->lapic_timer.period) {
 		advance_periodic_target_expiration(apic);
-		if (!start_hv_timer(apic))
-			start_sw_period(apic);
+		restart_apic_timer(apic);
 	}
 }
 EXPORT_SYMBOL_GPL(kvm_lapic_expired_hv_timer);
 
 void kvm_lapic_switch_to_hv_timer(struct kvm_vcpu *vcpu)
 {
-	struct kvm_lapic *apic = vcpu->arch.apic;
-
-	WARN_ON(apic->lapic_timer.hv_timer_in_use);
-
-	start_hv_timer(apic);
+	restart_apic_timer(vcpu->arch.apic);
 }
 EXPORT_SYMBOL_GPL(kvm_lapic_switch_to_hv_timer);
 
@@ -1554,33 +1583,28 @@ void kvm_lapic_switch_to_sw_timer(struct kvm_vcpu *vcpu)
 	struct kvm_lapic *apic = vcpu->arch.apic;
 
 	/* Possibly the TSC deadline timer is not enabled yet */
-	if (!apic->lapic_timer.hv_timer_in_use)
-		return;
-
-	cancel_hv_timer(apic);
+	if (apic->lapic_timer.hv_timer_in_use)
+		start_sw_timer(apic);
+}
+EXPORT_SYMBOL_GPL(kvm_lapic_switch_to_sw_timer);
 
-	if (atomic_read(&apic->lapic_timer.pending))
-		return;
+void kvm_lapic_restart_hv_timer(struct kvm_vcpu *vcpu)
+{
+	struct kvm_lapic *apic = vcpu->arch.apic;
 
-	if (apic_lvtt_period(apic) || apic_lvtt_oneshot(apic))
-		start_sw_period(apic);
-	else if (apic_lvtt_tscdeadline(apic))
-		start_sw_tscdeadline(apic);
+	WARN_ON(!apic->lapic_timer.hv_timer_in_use);
+	restart_apic_timer(apic);
 }
-EXPORT_SYMBOL_GPL(kvm_lapic_switch_to_sw_timer);
 
 static void start_apic_timer(struct kvm_lapic *apic)
 {
 	atomic_set(&apic->lapic_timer.pending, 0);
 
-	if (apic_lvtt_period(apic) || apic_lvtt_oneshot(apic)) {
-		if (set_target_expiration(apic) &&
-			!(kvm_x86_ops->set_hv_timer && start_hv_timer(apic)))
-			start_sw_period(apic);
-	} else if (apic_lvtt_tscdeadline(apic)) {
-		if (!(kvm_x86_ops->set_hv_timer && start_hv_timer(apic)))
-			start_sw_tscdeadline(apic);
-	}
+	if ((apic_lvtt_period(apic) || apic_lvtt_oneshot(apic))
+	    && !set_target_expiration(apic))
+		return;
+
+	restart_apic_timer(apic);
 }
 
 static void apic_manage_nmi_watchdog(struct kvm_lapic *apic, u32 lvt0_val)
@@ -1811,16 +1835,6 @@ void kvm_free_lapic(struct kvm_vcpu *vcpu)
  * LAPIC interface
  *----------------------------------------------------------------------
  */
-u64 kvm_get_lapic_target_expiration_tsc(struct kvm_vcpu *vcpu)
-{
-	struct kvm_lapic *apic = vcpu->arch.apic;
-
-	if (!lapic_in_kernel(vcpu))
-		return 0;
-
-	return apic->lapic_timer.tscdeadline;
-}
-
 u64 kvm_get_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu)
 {
 	struct kvm_lapic *apic = vcpu->arch.apic;
@@ -1934,7 +1948,8 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event)
 	for (i = 0; i < KVM_APIC_LVT_NUM; i++)
 		kvm_lapic_set_reg(apic, APIC_LVTT + 0x10 * i, APIC_LVT_MASKED);
 	apic_update_lvtt(apic);
-	if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_LINT0_REENABLED))
+	if (kvm_vcpu_is_reset_bsp(vcpu) &&
+	    kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_LINT0_REENABLED))
 		kvm_lapic_set_reg(apic, APIC_LVT0,
 			     SET_APIC_DELIVERY_MODE(0, APIC_MODE_EXTINT));
 	apic_manage_nmi_watchdog(apic, kvm_lapic_get_reg(apic, APIC_LVT0));
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index bcbe811f3b97..29caa2c3dff9 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -87,7 +87,6 @@ int kvm_apic_get_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s);
 int kvm_apic_set_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s);
 int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu);
 
-u64 kvm_get_lapic_target_expiration_tsc(struct kvm_vcpu *vcpu);
 u64 kvm_get_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu);
 void kvm_set_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu, u64 data);
 
@@ -216,4 +215,5 @@ void kvm_lapic_switch_to_sw_timer(struct kvm_vcpu *vcpu);
 void kvm_lapic_switch_to_hv_timer(struct kvm_vcpu *vcpu);
 void kvm_lapic_expired_hv_timer(struct kvm_vcpu *vcpu);
 bool kvm_lapic_hv_timer_in_use(struct kvm_vcpu *vcpu);
+void kvm_lapic_restart_hv_timer(struct kvm_vcpu *vcpu);
 #endif
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 183ddb235fb4..03df7c1da581 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -189,6 +189,7 @@ struct vcpu_svm {
 	struct nested_state nested;
 
 	bool nmi_singlestep;
+	u64 nmi_singlestep_guest_rflags;
 
 	unsigned int3_injected;
 	unsigned long int3_rip;
@@ -963,6 +964,18 @@ static void svm_disable_lbrv(struct vcpu_svm *svm)
 	set_msr_interception(msrpm, MSR_IA32_LASTINTTOIP, 0, 0);
 }
 
+static void disable_nmi_singlestep(struct vcpu_svm *svm)
+{
+	svm->nmi_singlestep = false;
+	if (!(svm->vcpu.guest_debug & KVM_GUESTDBG_SINGLESTEP)) {
+		/* Clear our flags if they were not set by the guest */
+		if (!(svm->nmi_singlestep_guest_rflags & X86_EFLAGS_TF))
+			svm->vmcb->save.rflags &= ~X86_EFLAGS_TF;
+		if (!(svm->nmi_singlestep_guest_rflags & X86_EFLAGS_RF))
+			svm->vmcb->save.rflags &= ~X86_EFLAGS_RF;
+	}
+}
+
 /* Note:
  * This hash table is used to map VM_ID to a struct kvm_arch,
  * when handling AMD IOMMU GALOG notification to schedule in
@@ -1712,11 +1725,24 @@ static void svm_vcpu_unblocking(struct kvm_vcpu *vcpu)
 
 static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu)
 {
-	return to_svm(vcpu)->vmcb->save.rflags;
+	struct vcpu_svm *svm = to_svm(vcpu);
+	unsigned long rflags = svm->vmcb->save.rflags;
+
+	if (svm->nmi_singlestep) {
+		/* Hide our flags if they were not set by the guest */
+		if (!(svm->nmi_singlestep_guest_rflags & X86_EFLAGS_TF))
+			rflags &= ~X86_EFLAGS_TF;
+		if (!(svm->nmi_singlestep_guest_rflags & X86_EFLAGS_RF))
+			rflags &= ~X86_EFLAGS_RF;
+	}
+	return rflags;
 }
 
 static void svm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
 {
+	if (to_svm(vcpu)->nmi_singlestep)
+		rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF);
+
        /*
         * Any change of EFLAGS.VM is accompanied by a reload of SS
         * (caused by either a task switch or an inter-privilege IRET),
@@ -1807,7 +1833,7 @@ static void svm_get_segment(struct kvm_vcpu *vcpu,
 	 * AMD's VMCB does not have an explicit unusable field, so emulate it
 	 * for cross vendor migration purposes by "not present"
 	 */
-	var->unusable = !var->present || (var->type == 0);
+	var->unusable = !var->present;
 
 	switch (seg) {
 	case VCPU_SREG_TR:
@@ -1840,6 +1866,7 @@ static void svm_get_segment(struct kvm_vcpu *vcpu,
 		 */
 		if (var->unusable)
 			var->db = 0;
+		/* This is symmetric with svm_set_segment() */
 		var->dpl = to_svm(vcpu)->vmcb->save.cpl;
 		break;
 	}
@@ -1980,18 +2007,14 @@ static void svm_set_segment(struct kvm_vcpu *vcpu,
 	s->base = var->base;
 	s->limit = var->limit;
 	s->selector = var->selector;
-	if (var->unusable)
-		s->attrib = 0;
-	else {
-		s->attrib = (var->type & SVM_SELECTOR_TYPE_MASK);
-		s->attrib |= (var->s & 1) << SVM_SELECTOR_S_SHIFT;
-		s->attrib |= (var->dpl & 3) << SVM_SELECTOR_DPL_SHIFT;
-		s->attrib |= (var->present & 1) << SVM_SELECTOR_P_SHIFT;
-		s->attrib |= (var->avl & 1) << SVM_SELECTOR_AVL_SHIFT;
-		s->attrib |= (var->l & 1) << SVM_SELECTOR_L_SHIFT;
-		s->attrib |= (var->db & 1) << SVM_SELECTOR_DB_SHIFT;
-		s->attrib |= (var->g & 1) << SVM_SELECTOR_G_SHIFT;
-	}
+	s->attrib = (var->type & SVM_SELECTOR_TYPE_MASK);
+	s->attrib |= (var->s & 1) << SVM_SELECTOR_S_SHIFT;
+	s->attrib |= (var->dpl & 3) << SVM_SELECTOR_DPL_SHIFT;
+	s->attrib |= ((var->present & 1) && !var->unusable) << SVM_SELECTOR_P_SHIFT;
+	s->attrib |= (var->avl & 1) << SVM_SELECTOR_AVL_SHIFT;
+	s->attrib |= (var->l & 1) << SVM_SELECTOR_L_SHIFT;
+	s->attrib |= (var->db & 1) << SVM_SELECTOR_DB_SHIFT;
+	s->attrib |= (var->g & 1) << SVM_SELECTOR_G_SHIFT;
 
 	/*
 	 * This is always accurate, except if SYSRET returned to a segment
@@ -2000,7 +2023,8 @@ static void svm_set_segment(struct kvm_vcpu *vcpu,
 	 * would entail passing the CPL to userspace and back.
 	 */
 	if (seg == VCPU_SREG_SS)
-		svm->vmcb->save.cpl = (s->attrib >> SVM_SELECTOR_DPL_SHIFT) & 3;
+		/* This is symmetric with svm_get_segment() */
+		svm->vmcb->save.cpl = (var->dpl & 3);
 
 	mark_dirty(svm->vmcb, VMCB_SEG);
 }
@@ -2113,10 +2137,7 @@ static int db_interception(struct vcpu_svm *svm)
 	}
 
 	if (svm->nmi_singlestep) {
-		svm->nmi_singlestep = false;
-		if (!(svm->vcpu.guest_debug & KVM_GUESTDBG_SINGLESTEP))
-			svm->vmcb->save.rflags &=
-				~(X86_EFLAGS_TF | X86_EFLAGS_RF);
+		disable_nmi_singlestep(svm);
 	}
 
 	if (svm->vcpu.guest_debug &
@@ -2371,8 +2392,8 @@ static void nested_svm_uninit_mmu_context(struct kvm_vcpu *vcpu)
 
 static int nested_svm_check_permissions(struct vcpu_svm *svm)
 {
-	if (!(svm->vcpu.arch.efer & EFER_SVME)
-	    || !is_paging(&svm->vcpu)) {
+	if (!(svm->vcpu.arch.efer & EFER_SVME) ||
+	    !is_paging(&svm->vcpu)) {
 		kvm_queue_exception(&svm->vcpu, UD_VECTOR);
 		return 1;
 	}
@@ -2382,7 +2403,7 @@ static int nested_svm_check_permissions(struct vcpu_svm *svm)
 		return 1;
 	}
 
-       return 0;
+	return 0;
 }
 
 static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr,
@@ -2535,6 +2556,31 @@ static int nested_svm_exit_handled_msr(struct vcpu_svm *svm)
 	return (value & mask) ? NESTED_EXIT_DONE : NESTED_EXIT_HOST;
 }
 
+/* DB exceptions for our internal use must not cause vmexit */
+static int nested_svm_intercept_db(struct vcpu_svm *svm)
+{
+	unsigned long dr6;
+
+	/* if we're not singlestepping, it's not ours */
+	if (!svm->nmi_singlestep)
+		return NESTED_EXIT_DONE;
+
+	/* if it's not a singlestep exception, it's not ours */
+	if (kvm_get_dr(&svm->vcpu, 6, &dr6))
+		return NESTED_EXIT_DONE;
+	if (!(dr6 & DR6_BS))
+		return NESTED_EXIT_DONE;
+
+	/* if the guest is singlestepping, it should get the vmexit */
+	if (svm->nmi_singlestep_guest_rflags & X86_EFLAGS_TF) {
+		disable_nmi_singlestep(svm);
+		return NESTED_EXIT_DONE;
+	}
+
+	/* it's ours, the nested hypervisor must not see this one */
+	return NESTED_EXIT_HOST;
+}
+
 static int nested_svm_exit_special(struct vcpu_svm *svm)
 {
 	u32 exit_code = svm->vmcb->control.exit_code;
@@ -2590,8 +2636,12 @@ static int nested_svm_intercept(struct vcpu_svm *svm)
 	}
 	case SVM_EXIT_EXCP_BASE ... SVM_EXIT_EXCP_BASE + 0x1f: {
 		u32 excp_bits = 1 << (exit_code - SVM_EXIT_EXCP_BASE);
-		if (svm->nested.intercept_exceptions & excp_bits)
-			vmexit = NESTED_EXIT_DONE;
+		if (svm->nested.intercept_exceptions & excp_bits) {
+			if (exit_code == SVM_EXIT_EXCP_BASE + DB_VECTOR)
+				vmexit = nested_svm_intercept_db(svm);
+			else
+				vmexit = NESTED_EXIT_DONE;
+		}
 		/* async page fault always cause vmexit */
 		else if ((exit_code == SVM_EXIT_EXCP_BASE + PF_VECTOR) &&
 			 svm->apf_reason != 0)
@@ -4628,10 +4678,17 @@ static void enable_nmi_window(struct kvm_vcpu *vcpu)
 	    == HF_NMI_MASK)
 		return; /* IRET will cause a vm exit */
 
+	if ((svm->vcpu.arch.hflags & HF_GIF_MASK) == 0)
+		return; /* STGI will cause a vm exit */
+
+	if (svm->nested.exit_required)
+		return; /* we're not going to run the guest yet */
+
 	/*
 	 * Something prevents NMI from been injected. Single step over possible
 	 * problem (IRET or exception injection or interrupt shadow)
 	 */
+	svm->nmi_singlestep_guest_rflags = svm_get_rflags(vcpu);
 	svm->nmi_singlestep = true;
 	svm->vmcb->save.rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF);
 }
@@ -4772,6 +4829,22 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu)
 	if (unlikely(svm->nested.exit_required))
 		return;
 
+	/*
+	 * Disable singlestep if we're injecting an interrupt/exception.
+	 * We don't want our modified rflags to be pushed on the stack where
+	 * we might not be able to easily reset them if we disabled NMI
+	 * singlestep later.
+	 */
+	if (svm->nmi_singlestep && svm->vmcb->control.event_inj) {
+		/*
+		 * Event injection happens before external interrupts cause a
+		 * vmexit and interrupts are disabled here, so smp_send_reschedule
+		 * is enough to force an immediate vmexit.
+		 */
+		disable_nmi_singlestep(svm);
+		smp_send_reschedule(vcpu->cpu);
+	}
+
 	pre_svm_run(svm);
 
 	sync_lapic_to_cr8(vcpu);
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 72f78396bc09..92ddea08f999 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -2769,7 +2769,7 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx)
 		if (enable_ept_ad_bits) {
 			vmx->nested.nested_vmx_secondary_ctls_high |=
 				SECONDARY_EXEC_ENABLE_PML;
-		       vmx->nested.nested_vmx_ept_caps |= VMX_EPT_AD_BIT;
+			vmx->nested.nested_vmx_ept_caps |= VMX_EPT_AD_BIT;
 		}
 	} else
 		vmx->nested.nested_vmx_ept_caps = 0;
@@ -3195,7 +3195,7 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 		msr_info->data = vmcs_readl(GUEST_SYSENTER_ESP);
 		break;
 	case MSR_IA32_BNDCFGS:
-		if (!kvm_mpx_supported())
+		if (!kvm_mpx_supported() || !guest_cpuid_has_mpx(vcpu))
 			return 1;
 		msr_info->data = vmcs_read64(GUEST_BNDCFGS);
 		break;
@@ -3277,7 +3277,10 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 		vmcs_writel(GUEST_SYSENTER_ESP, data);
 		break;
 	case MSR_IA32_BNDCFGS:
-		if (!kvm_mpx_supported())
+		if (!kvm_mpx_supported() || !guest_cpuid_has_mpx(vcpu))
+			return 1;
+		if (is_noncanonical_address(data & PAGE_MASK) ||
+		    (data & MSR_IA32_BNDCFGS_RSVD))
 			return 1;
 		vmcs_write64(GUEST_BNDCFGS, data);
 		break;
@@ -6547,7 +6550,6 @@ static __init int hardware_setup(void)
 	vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_CS, false);
 	vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_ESP, false);
 	vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_EIP, false);
-	vmx_disable_intercept_for_msr(MSR_IA32_BNDCFGS, true);
 
 	memcpy(vmx_msr_bitmap_legacy_x2apic_apicv,
 			vmx_msr_bitmap_legacy, PAGE_SIZE);
@@ -6914,97 +6916,21 @@ static int get_vmx_mem_address(struct kvm_vcpu *vcpu,
 	return 0;
 }
 
-/*
- * This function performs the various checks including
- * - if it's 4KB aligned
- * - No bits beyond the physical address width are set
- * - Returns 0 on success or else 1
- * (Intel SDM Section 30.3)
- */
-static int nested_vmx_check_vmptr(struct kvm_vcpu *vcpu, int exit_reason,
-				  gpa_t *vmpointer)
+static int nested_vmx_get_vmptr(struct kvm_vcpu *vcpu, gpa_t *vmpointer)
 {
 	gva_t gva;
-	gpa_t vmptr;
 	struct x86_exception e;
-	struct page *page;
-	struct vcpu_vmx *vmx = to_vmx(vcpu);
-	int maxphyaddr = cpuid_maxphyaddr(vcpu);
 
 	if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION),
 			vmcs_read32(VMX_INSTRUCTION_INFO), false, &gva))
 		return 1;
 
-	if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva, &vmptr,
-				sizeof(vmptr), &e)) {
+	if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva, vmpointer,
+				sizeof(*vmpointer), &e)) {
 		kvm_inject_page_fault(vcpu, &e);
 		return 1;
 	}
 
-	switch (exit_reason) {
-	case EXIT_REASON_VMON:
-		/*
-		 * SDM 3: 24.11.5
-		 * The first 4 bytes of VMXON region contain the supported
-		 * VMCS revision identifier
-		 *
-		 * Note - IA32_VMX_BASIC[48] will never be 1
-		 * for the nested case;
-		 * which replaces physical address width with 32
-		 *
-		 */
-		if (!PAGE_ALIGNED(vmptr) || (vmptr >> maxphyaddr)) {
-			nested_vmx_failInvalid(vcpu);
-			return kvm_skip_emulated_instruction(vcpu);
-		}
-
-		page = nested_get_page(vcpu, vmptr);
-		if (page == NULL) {
-			nested_vmx_failInvalid(vcpu);
-			return kvm_skip_emulated_instruction(vcpu);
-		}
-		if (*(u32 *)kmap(page) != VMCS12_REVISION) {
-			kunmap(page);
-			nested_release_page_clean(page);
-			nested_vmx_failInvalid(vcpu);
-			return kvm_skip_emulated_instruction(vcpu);
-		}
-		kunmap(page);
-		nested_release_page_clean(page);
-		vmx->nested.vmxon_ptr = vmptr;
-		break;
-	case EXIT_REASON_VMCLEAR:
-		if (!PAGE_ALIGNED(vmptr) || (vmptr >> maxphyaddr)) {
-			nested_vmx_failValid(vcpu,
-					     VMXERR_VMCLEAR_INVALID_ADDRESS);
-			return kvm_skip_emulated_instruction(vcpu);
-		}
-
-		if (vmptr == vmx->nested.vmxon_ptr) {
-			nested_vmx_failValid(vcpu,
-					     VMXERR_VMCLEAR_VMXON_POINTER);
-			return kvm_skip_emulated_instruction(vcpu);
-		}
-		break;
-	case EXIT_REASON_VMPTRLD:
-		if (!PAGE_ALIGNED(vmptr) || (vmptr >> maxphyaddr)) {
-			nested_vmx_failValid(vcpu,
-					     VMXERR_VMPTRLD_INVALID_ADDRESS);
-			return kvm_skip_emulated_instruction(vcpu);
-		}
-
-		if (vmptr == vmx->nested.vmxon_ptr) {
-			nested_vmx_failValid(vcpu,
-					     VMXERR_VMPTRLD_VMXON_POINTER);
-			return kvm_skip_emulated_instruction(vcpu);
-		}
-		break;
-	default:
-		return 1; /* shouldn't happen */
-	}
-
-	if (vmpointer)
-		*vmpointer = vmptr;
 	return 0;
 }
 
@@ -7066,6 +6992,8 @@ out_msr_bitmap:
 static int handle_vmon(struct kvm_vcpu *vcpu)
 {
 	int ret;
+	gpa_t vmptr;
+	struct page *page;
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 	const u64 VMXON_NEEDED_FEATURES = FEATURE_CONTROL_LOCKED
 		| FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX;
@@ -7095,9 +7023,37 @@ static int handle_vmon(struct kvm_vcpu *vcpu)
 		return 1;
 	}
 
-	if (nested_vmx_check_vmptr(vcpu, EXIT_REASON_VMON, NULL))
+	if (nested_vmx_get_vmptr(vcpu, &vmptr))
 		return 1;
- 
+
+	/*
+	 * SDM 3: 24.11.5
+	 * The first 4 bytes of VMXON region contain the supported
+	 * VMCS revision identifier
+	 *
+	 * Note - IA32_VMX_BASIC[48] will never be 1 for the nested case;
+	 * which replaces physical address width with 32
+	 */
+	if (!PAGE_ALIGNED(vmptr) || (vmptr >> cpuid_maxphyaddr(vcpu))) {
+		nested_vmx_failInvalid(vcpu);
+		return kvm_skip_emulated_instruction(vcpu);
+	}
+
+	page = nested_get_page(vcpu, vmptr);
+	if (page == NULL) {
+		nested_vmx_failInvalid(vcpu);
+		return kvm_skip_emulated_instruction(vcpu);
+	}
+	if (*(u32 *)kmap(page) != VMCS12_REVISION) {
+		kunmap(page);
+		nested_release_page_clean(page);
+		nested_vmx_failInvalid(vcpu);
+		return kvm_skip_emulated_instruction(vcpu);
+	}
+	kunmap(page);
+	nested_release_page_clean(page);
+
+	vmx->nested.vmxon_ptr = vmptr;
 	ret = enter_vmx_operation(vcpu);
 	if (ret)
 		return ret;
@@ -7213,9 +7169,19 @@ static int handle_vmclear(struct kvm_vcpu *vcpu)
 	if (!nested_vmx_check_permission(vcpu))
 		return 1;
 
-	if (nested_vmx_check_vmptr(vcpu, EXIT_REASON_VMCLEAR, &vmptr))
+	if (nested_vmx_get_vmptr(vcpu, &vmptr))
 		return 1;
 
+	if (!PAGE_ALIGNED(vmptr) || (vmptr >> cpuid_maxphyaddr(vcpu))) {
+		nested_vmx_failValid(vcpu, VMXERR_VMCLEAR_INVALID_ADDRESS);
+		return kvm_skip_emulated_instruction(vcpu);
+	}
+
+	if (vmptr == vmx->nested.vmxon_ptr) {
+		nested_vmx_failValid(vcpu, VMXERR_VMCLEAR_VMXON_POINTER);
+		return kvm_skip_emulated_instruction(vcpu);
+	}
+
 	if (vmptr == vmx->nested.current_vmptr)
 		nested_release_vmcs12(vmx);
 
@@ -7545,9 +7511,19 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu)
 	if (!nested_vmx_check_permission(vcpu))
 		return 1;
 
-	if (nested_vmx_check_vmptr(vcpu, EXIT_REASON_VMPTRLD, &vmptr))
+	if (nested_vmx_get_vmptr(vcpu, &vmptr))
 		return 1;
 
+	if (!PAGE_ALIGNED(vmptr) || (vmptr >> cpuid_maxphyaddr(vcpu))) {
+		nested_vmx_failValid(vcpu, VMXERR_VMPTRLD_INVALID_ADDRESS);
+		return kvm_skip_emulated_instruction(vcpu);
+	}
+
+	if (vmptr == vmx->nested.vmxon_ptr) {
+		nested_vmx_failValid(vcpu, VMXERR_VMPTRLD_VMXON_POINTER);
+		return kvm_skip_emulated_instruction(vcpu);
+	}
+
 	if (vmx->nested.current_vmptr != vmptr) {
 		struct vmcs12 *new_vmcs12;
 		struct page *page;
@@ -7677,7 +7653,10 @@ static int handle_invvpid(struct kvm_vcpu *vcpu)
 	unsigned long type, types;
 	gva_t gva;
 	struct x86_exception e;
-	int vpid;
+	struct {
+		u64 vpid;
+		u64 gla;
+	} operand;
 
 	if (!(vmx->nested.nested_vmx_secondary_ctls_high &
 	      SECONDARY_EXEC_ENABLE_VPID) ||
@@ -7707,17 +7686,28 @@ static int handle_invvpid(struct kvm_vcpu *vcpu)
 	if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION),
 			vmx_instruction_info, false, &gva))
 		return 1;
-	if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva, &vpid,
-				sizeof(u32), &e)) {
+	if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva, &operand,
+				sizeof(operand), &e)) {
 		kvm_inject_page_fault(vcpu, &e);
 		return 1;
 	}
+	if (operand.vpid >> 16) {
+		nested_vmx_failValid(vcpu,
+			VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
+		return kvm_skip_emulated_instruction(vcpu);
+	}
 
 	switch (type) {
 	case VMX_VPID_EXTENT_INDIVIDUAL_ADDR:
+		if (is_noncanonical_address(operand.gla)) {
+			nested_vmx_failValid(vcpu,
+				VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
+			return kvm_skip_emulated_instruction(vcpu);
+		}
+		/* fall through */
 	case VMX_VPID_EXTENT_SINGLE_CONTEXT:
 	case VMX_VPID_EXTENT_SINGLE_NON_GLOBAL:
-		if (!vpid) {
+		if (!operand.vpid) {
 			nested_vmx_failValid(vcpu,
 				VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
 			return kvm_skip_emulated_instruction(vcpu);
@@ -7913,11 +7903,13 @@ static bool nested_vmx_exit_handled_cr(struct kvm_vcpu *vcpu,
 {
 	unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
 	int cr = exit_qualification & 15;
-	int reg = (exit_qualification >> 8) & 15;
-	unsigned long val = kvm_register_readl(vcpu, reg);
+	int reg;
+	unsigned long val;
 
 	switch ((exit_qualification >> 4) & 3) {
 	case 0: /* mov to cr */
+		reg = (exit_qualification >> 8) & 15;
+		val = kvm_register_readl(vcpu, reg);
 		switch (cr) {
 		case 0:
 			if (vmcs12->cr0_guest_host_mask &
@@ -7972,6 +7964,7 @@ static bool nested_vmx_exit_handled_cr(struct kvm_vcpu *vcpu,
 		 * lmsw can change bits 1..3 of cr0, and only set bit 0 of
 		 * cr0. Other attempted changes are ignored, with no exit.
 		 */
+		val = (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f;
 		if (vmcs12->cr0_guest_host_mask & 0xe &
 		    (val ^ vmcs12->cr0_read_shadow))
 			return true;
@@ -10733,8 +10726,7 @@ static void sync_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
 		vmcs12->guest_pdptr3 = vmcs_read64(GUEST_PDPTR3);
 	}
 
-	if (nested_cpu_has_ept(vmcs12))
-		vmcs12->guest_linear_address = vmcs_readl(GUEST_LINEAR_ADDRESS);
+	vmcs12->guest_linear_address = vmcs_readl(GUEST_LINEAR_ADDRESS);
 
 	if (nested_cpu_has_vid(vmcs12))
 		vmcs12->guest_intr_status = vmcs_read16(GUEST_INTR_STATUS);
@@ -10759,8 +10751,6 @@ static void sync_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
 	vmcs12->guest_sysenter_eip = vmcs_readl(GUEST_SYSENTER_EIP);
 	if (kvm_mpx_supported())
 		vmcs12->guest_bndcfgs = vmcs_read64(GUEST_BNDCFGS);
-	if (nested_cpu_has_xsaves(vmcs12))
-		vmcs12->xss_exit_bitmap = vmcs_read64(XSS_EXIT_BITMAP);
 }
 
 /*
@@ -11157,7 +11147,8 @@ static int vmx_set_hv_timer(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc)
 	vmx->hv_deadline_tsc = tscl + delta_tsc;
 	vmcs_set_bits(PIN_BASED_VM_EXEC_CONTROL,
 			PIN_BASED_VMX_PREEMPTION_TIMER);
-	return 0;
+
+	return delta_tsc == 0;
 }
 
 static void vmx_cancel_hv_timer(struct kvm_vcpu *vcpu)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 02363e37d4a6..f2bd155883dd 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -2841,10 +2841,10 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 			kvm_vcpu_write_tsc_offset(vcpu, offset);
 			vcpu->arch.tsc_catchup = 1;
 		}
-		if (kvm_lapic_hv_timer_in_use(vcpu) &&
-				kvm_x86_ops->set_hv_timer(vcpu,
-					kvm_get_lapic_target_expiration_tsc(vcpu)))
-			kvm_lapic_switch_to_sw_timer(vcpu);
+
+		if (kvm_lapic_hv_timer_in_use(vcpu))
+			kvm_lapic_restart_hv_timer(vcpu);
+
 		/*
 		 * On a host with synchronized TSC, there is no need to update
 		 * kvmclock on vcpu->cpu migration
@@ -6731,7 +6731,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 
 	bool req_immediate_exit = false;
 
-	if (vcpu->requests) {
+	if (kvm_request_pending(vcpu)) {
 		if (kvm_check_request(KVM_REQ_MMU_RELOAD, vcpu))
 			kvm_mmu_unload(vcpu);
 		if (kvm_check_request(KVM_REQ_MIGRATE_TIMER, vcpu))
@@ -6895,7 +6895,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 			kvm_x86_ops->sync_pir_to_irr(vcpu);
 	}
 
-	if (vcpu->mode == EXITING_GUEST_MODE || vcpu->requests
+	if (vcpu->mode == EXITING_GUEST_MODE || kvm_request_pending(vcpu)
 	    || need_resched() || signal_pending(current)) {
 		vcpu->mode = OUTSIDE_GUEST_MODE;
 		smp_wmb();
@@ -8394,10 +8394,13 @@ static inline bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu)
 	if (vcpu->arch.pv.pv_unhalted)
 		return true;
 
-	if (atomic_read(&vcpu->arch.nmi_queued))
+	if (kvm_test_request(KVM_REQ_NMI, vcpu) ||
+	    (vcpu->arch.nmi_pending &&
+	     kvm_x86_ops->nmi_allowed(vcpu)))
 		return true;
 
-	if (kvm_test_request(KVM_REQ_SMI, vcpu))
+	if (kvm_test_request(KVM_REQ_SMI, vcpu) ||
+	    (vcpu->arch.smi_pending && !is_smm(vcpu)))
 		return true;
 
 	if (kvm_arch_interrupt_allowed(vcpu) &&
diff --git a/include/kvm/arm_arch_timer.h b/include/kvm/arm_arch_timer.h
index 295584f31a4e..f0053f884b4a 100644
--- a/include/kvm/arm_arch_timer.h
+++ b/include/kvm/arm_arch_timer.h
@@ -57,9 +57,7 @@ struct arch_timer_cpu {
 
 int kvm_timer_hyp_init(void);
 int kvm_timer_enable(struct kvm_vcpu *vcpu);
-int kvm_timer_vcpu_reset(struct kvm_vcpu *vcpu,
-			 const struct kvm_irq_level *virt_irq,
-			 const struct kvm_irq_level *phys_irq);
+int kvm_timer_vcpu_reset(struct kvm_vcpu *vcpu);
 void kvm_timer_vcpu_init(struct kvm_vcpu *vcpu);
 void kvm_timer_flush_hwstate(struct kvm_vcpu *vcpu);
 void kvm_timer_sync_hwstate(struct kvm_vcpu *vcpu);
@@ -70,6 +68,10 @@ void kvm_timer_vcpu_terminate(struct kvm_vcpu *vcpu);
 u64 kvm_arm_timer_get_reg(struct kvm_vcpu *, u64 regid);
 int kvm_arm_timer_set_reg(struct kvm_vcpu *, u64 regid, u64 value);
 
+int kvm_arm_timer_set_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr);
+int kvm_arm_timer_get_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr);
+int kvm_arm_timer_has_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr);
+
 bool kvm_timer_should_fire(struct arch_timer_context *timer_ctx);
 void kvm_timer_schedule(struct kvm_vcpu *vcpu);
 void kvm_timer_unschedule(struct kvm_vcpu *vcpu);
diff --git a/include/kvm/arm_pmu.h b/include/kvm/arm_pmu.h
index 1ab4633adf4f..f6e030617467 100644
--- a/include/kvm/arm_pmu.h
+++ b/include/kvm/arm_pmu.h
@@ -35,6 +35,7 @@ struct kvm_pmu {
 	int irq_num;
 	struct kvm_pmc pmc[ARMV8_PMU_MAX_COUNTERS];
 	bool ready;
+	bool created;
 	bool irq_level;
 };
 
@@ -63,6 +64,7 @@ int kvm_arm_pmu_v3_get_attr(struct kvm_vcpu *vcpu,
 			    struct kvm_device_attr *attr);
 int kvm_arm_pmu_v3_has_attr(struct kvm_vcpu *vcpu,
 			    struct kvm_device_attr *attr);
+int kvm_arm_pmu_v3_enable(struct kvm_vcpu *vcpu);
 #else
 struct kvm_pmu {
 };
@@ -112,6 +114,10 @@ static inline int kvm_arm_pmu_v3_has_attr(struct kvm_vcpu *vcpu,
 {
 	return -ENXIO;
 }
+static inline int kvm_arm_pmu_v3_enable(struct kvm_vcpu *vcpu)
+{
+	return 0;
+}
 #endif
 
 #endif
diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h
index ef718586321c..34dba516ef24 100644
--- a/include/kvm/arm_vgic.h
+++ b/include/kvm/arm_vgic.h
@@ -38,6 +38,10 @@
 #define VGIC_MIN_LPI		8192
 #define KVM_IRQCHIP_NUM_PINS	(1020 - 32)
 
+#define irq_is_ppi(irq) ((irq) >= VGIC_NR_SGIS && (irq) < VGIC_NR_PRIVATE_IRQS)
+#define irq_is_spi(irq) ((irq) >= VGIC_NR_PRIVATE_IRQS && \
+			 (irq) <= VGIC_MAX_SPI)
+
 enum vgic_type {
 	VGIC_V2,		/* Good ol' GICv2 */
 	VGIC_V3,		/* New fancy GICv3 */
@@ -119,6 +123,9 @@ struct vgic_irq {
 	u8 source;			/* GICv2 SGIs only */
 	u8 priority;
 	enum vgic_irq_config config;	/* Level or edge */
+
+	void *owner;			/* Opaque pointer to reserve an interrupt
+					   for in-kernel devices. */
 };
 
 struct vgic_register_region;
@@ -285,6 +292,7 @@ struct vgic_cpu {
 };
 
 extern struct static_key_false vgic_v2_cpuif_trap;
+extern struct static_key_false vgic_v3_cpuif_trap;
 
 int kvm_vgic_addr(struct kvm *kvm, unsigned long type, u64 *addr, bool write);
 void kvm_vgic_early_init(struct kvm *kvm);
@@ -298,9 +306,7 @@ int kvm_vgic_hyp_init(void);
 void kvm_vgic_init_cpu_hardware(void);
 
 int kvm_vgic_inject_irq(struct kvm *kvm, int cpuid, unsigned int intid,
-			bool level);
-int kvm_vgic_inject_mapped_irq(struct kvm *kvm, int cpuid, unsigned int intid,
-			       bool level);
+			bool level, void *owner);
 int kvm_vgic_map_phys_irq(struct kvm_vcpu *vcpu, u32 virt_irq, u32 phys_irq);
 int kvm_vgic_unmap_phys_irq(struct kvm_vcpu *vcpu, unsigned int virt_irq);
 bool kvm_vgic_map_is_active(struct kvm_vcpu *vcpu, unsigned int virt_irq);
@@ -341,4 +347,6 @@ int kvm_send_userspace_msi(struct kvm *kvm, struct kvm_msi *msi);
  */
 int kvm_vgic_setup_default_irq_routing(struct kvm *kvm);
 
+int kvm_vgic_set_owner(struct kvm_vcpu *vcpu, unsigned int intid, void *owner);
+
 #endif /* __KVM_ARM_VGIC_H */
diff --git a/include/linux/irqchip/arm-gic-v3.h b/include/linux/irqchip/arm-gic-v3.h
index fffb91202bc9..6a1f87ff94e2 100644
--- a/include/linux/irqchip/arm-gic-v3.h
+++ b/include/linux/irqchip/arm-gic-v3.h
@@ -405,6 +405,7 @@
 #define ICH_LR_PHYS_ID_SHIFT		32
 #define ICH_LR_PHYS_ID_MASK		(0x3ffULL << ICH_LR_PHYS_ID_SHIFT)
 #define ICH_LR_PRIORITY_SHIFT		48
+#define ICH_LR_PRIORITY_MASK		(0xffULL << ICH_LR_PRIORITY_SHIFT)
 
 /* These are for GICv2 emulation only */
 #define GICH_LR_VIRTUALID		(0x3ffUL << 0)
@@ -416,7 +417,16 @@
 
 #define ICH_HCR_EN			(1 << 0)
 #define ICH_HCR_UIE			(1 << 1)
-
+#define ICH_HCR_TC			(1 << 10)
+#define ICH_HCR_TALL0			(1 << 11)
+#define ICH_HCR_TALL1			(1 << 12)
+#define ICH_HCR_EOIcount_SHIFT		27
+#define ICH_HCR_EOIcount_MASK		(0x1f << ICH_HCR_EOIcount_SHIFT)
+
+#define ICH_VMCR_ACK_CTL_SHIFT		2
+#define ICH_VMCR_ACK_CTL_MASK		(1 << ICH_VMCR_ACK_CTL_SHIFT)
+#define ICH_VMCR_FIQ_EN_SHIFT		3
+#define ICH_VMCR_FIQ_EN_MASK		(1 << ICH_VMCR_FIQ_EN_SHIFT)
 #define ICH_VMCR_CBPR_SHIFT		4
 #define ICH_VMCR_CBPR_MASK		(1 << ICH_VMCR_CBPR_SHIFT)
 #define ICH_VMCR_EOIM_SHIFT		9
diff --git a/include/linux/irqchip/arm-gic.h b/include/linux/irqchip/arm-gic.h
index dc30f3d057eb..d3453ee072fc 100644
--- a/include/linux/irqchip/arm-gic.h
+++ b/include/linux/irqchip/arm-gic.h
@@ -25,7 +25,18 @@
 #define GICC_ENABLE			0x1
 #define GICC_INT_PRI_THRESHOLD		0xf0
 
-#define GIC_CPU_CTRL_EOImodeNS		(1 << 9)
+#define GIC_CPU_CTRL_EnableGrp0_SHIFT	0
+#define GIC_CPU_CTRL_EnableGrp0		(1 << GIC_CPU_CTRL_EnableGrp0_SHIFT)
+#define GIC_CPU_CTRL_EnableGrp1_SHIFT	1
+#define GIC_CPU_CTRL_EnableGrp1		(1 << GIC_CPU_CTRL_EnableGrp1_SHIFT)
+#define GIC_CPU_CTRL_AckCtl_SHIFT	2
+#define GIC_CPU_CTRL_AckCtl		(1 << GIC_CPU_CTRL_AckCtl_SHIFT)
+#define GIC_CPU_CTRL_FIQEn_SHIFT	3
+#define GIC_CPU_CTRL_FIQEn		(1 << GIC_CPU_CTRL_FIQEn_SHIFT)
+#define GIC_CPU_CTRL_CBPR_SHIFT		4
+#define GIC_CPU_CTRL_CBPR		(1 << GIC_CPU_CTRL_CBPR_SHIFT)
+#define GIC_CPU_CTRL_EOImodeNS_SHIFT	9
+#define GIC_CPU_CTRL_EOImodeNS		(1 << GIC_CPU_CTRL_EOImodeNS_SHIFT)
 
 #define GICC_IAR_INT_ID_MASK		0x3ff
 #define GICC_INT_SPURIOUS		1023
@@ -84,8 +95,19 @@
 #define GICH_LR_EOI			(1 << 19)
 #define GICH_LR_HW			(1 << 31)
 
-#define GICH_VMCR_CTRL_SHIFT		0
-#define GICH_VMCR_CTRL_MASK		(0x21f << GICH_VMCR_CTRL_SHIFT)
+#define GICH_VMCR_ENABLE_GRP0_SHIFT	0
+#define GICH_VMCR_ENABLE_GRP0_MASK	(1 << GICH_VMCR_ENABLE_GRP0_SHIFT)
+#define GICH_VMCR_ENABLE_GRP1_SHIFT	1
+#define GICH_VMCR_ENABLE_GRP1_MASK	(1 << GICH_VMCR_ENABLE_GRP1_SHIFT)
+#define GICH_VMCR_ACK_CTL_SHIFT		2
+#define GICH_VMCR_ACK_CTL_MASK		(1 << GICH_VMCR_ACK_CTL_SHIFT)
+#define GICH_VMCR_FIQ_EN_SHIFT		3
+#define GICH_VMCR_FIQ_EN_MASK		(1 << GICH_VMCR_FIQ_EN_SHIFT)
+#define GICH_VMCR_CBPR_SHIFT		4
+#define GICH_VMCR_CBPR_MASK		(1 << GICH_VMCR_CBPR_SHIFT)
+#define GICH_VMCR_EOI_MODE_SHIFT	9
+#define GICH_VMCR_EOI_MODE_MASK		(1 << GICH_VMCR_EOI_MODE_SHIFT)
+
 #define GICH_VMCR_PRIMASK_SHIFT		27
 #define GICH_VMCR_PRIMASK_MASK		(0x1f << GICH_VMCR_PRIMASK_SHIFT)
 #define GICH_VMCR_BINPOINT_SHIFT	21
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 8c0664309815..0b50e7b35ed4 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -126,6 +126,13 @@ static inline bool is_error_page(struct page *page)
 #define KVM_REQ_MMU_RELOAD        (1 | KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP)
 #define KVM_REQ_PENDING_TIMER     2
 #define KVM_REQ_UNHALT            3
+#define KVM_REQUEST_ARCH_BASE     8
+
+#define KVM_ARCH_REQ_FLAGS(nr, flags) ({ \
+	BUILD_BUG_ON((unsigned)(nr) >= 32 - KVM_REQUEST_ARCH_BASE); \
+	(unsigned)(((nr) + KVM_REQUEST_ARCH_BASE) | (flags)); \
+})
+#define KVM_ARCH_REQ(nr)           KVM_ARCH_REQ_FLAGS(nr, 0)
 
 #define KVM_USERSPACE_IRQ_SOURCE_ID		0
 #define KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID	1
@@ -1098,6 +1105,11 @@ static inline void kvm_make_request(int req, struct kvm_vcpu *vcpu)
 	set_bit(req & KVM_REQUEST_MASK, &vcpu->requests);
 }
 
+static inline bool kvm_request_pending(struct kvm_vcpu *vcpu)
+{
+	return READ_ONCE(vcpu->requests);
+}
+
 static inline bool kvm_test_request(int req, struct kvm_vcpu *vcpu)
 {
 	return test_bit(req & KVM_REQUEST_MASK, &vcpu->requests);
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index 36ea74aa3422..c0b6dfec5f87 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -155,6 +155,35 @@ struct kvm_s390_skeys {
 	__u32 reserved[9];
 };
 
+#define KVM_S390_CMMA_PEEK (1 << 0)
+
+/**
+ * kvm_s390_cmma_log - Used for CMMA migration.
+ *
+ * Used both for input and output.
+ *
+ * @start_gfn: Guest page number to start from.
+ * @count: Size of the result buffer.
+ * @flags: Control operation mode via KVM_S390_CMMA_* flags
+ * @remaining: Used with KVM_S390_GET_CMMA_BITS. Indicates how many dirty
+ *             pages are still remaining.
+ * @mask: Used with KVM_S390_SET_CMMA_BITS. Bitmap of bits to actually set
+ *        in the PGSTE.
+ * @values: Pointer to the values buffer.
+ *
+ * Used in KVM_S390_{G,S}ET_CMMA_BITS ioctls.
+ */
+struct kvm_s390_cmma_log {
+	__u64 start_gfn;
+	__u32 count;
+	__u32 flags;
+	union {
+		__u64 remaining;
+		__u64 mask;
+	};
+	__u64 values;
+};
+
 struct kvm_hyperv_exit {
 #define KVM_EXIT_HYPERV_SYNIC          1
 #define KVM_EXIT_HYPERV_HCALL          2
@@ -895,8 +924,9 @@ struct kvm_ppc_resize_hpt {
 #define KVM_CAP_SPAPR_TCE_VFIO 142
 #define KVM_CAP_X86_GUEST_MWAIT 143
 #define KVM_CAP_ARM_USER_IRQ 144
-#define KVM_CAP_PPC_FWNMI 145
-#define KVM_CAP_PPC_SMT_POSSIBLE 146
+#define KVM_CAP_S390_CMMA_MIGRATION 145
+#define KVM_CAP_PPC_FWNMI 146
+#define KVM_CAP_PPC_SMT_POSSIBLE 147
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
@@ -1320,6 +1350,9 @@ struct kvm_s390_ucas_mapping {
 #define KVM_S390_GET_IRQ_STATE	  _IOW(KVMIO, 0xb6, struct kvm_s390_irq_state)
 /* Available with KVM_CAP_X86_SMM */
 #define KVM_SMI                   _IO(KVMIO,   0xb7)
+/* Available with KVM_CAP_S390_CMMA_MIGRATION */
+#define KVM_S390_GET_CMMA_BITS      _IOW(KVMIO, 0xb8, struct kvm_s390_cmma_log)
+#define KVM_S390_SET_CMMA_BITS      _IOW(KVMIO, 0xb9, struct kvm_s390_cmma_log)
 
 #define KVM_DEV_ASSIGN_ENABLE_IOMMU	(1 << 0)
 #define KVM_DEV_ASSIGN_PCI_2_3		(1 << 1)
diff --git a/tools/kvm/kvm_stat/kvm_stat b/tools/kvm/kvm_stat/kvm_stat
index 8f74ed8e7237..dd8f00cfb8b4 100755
--- a/tools/kvm/kvm_stat/kvm_stat
+++ b/tools/kvm/kvm_stat/kvm_stat
@@ -295,114 +295,6 @@ class ArchS390(Arch):
 ARCH = Arch.get_arch()
 
 
-def walkdir(path):
-    """Returns os.walk() data for specified directory.
-
-    As it is only a wrapper it returns the same 3-tuple of (dirpath,
-    dirnames, filenames).
-    """
-    return next(os.walk(path))
-
-
-def parse_int_list(list_string):
-    """Returns an int list from a string of comma separated integers and
-    integer ranges."""
-    integers = []
-    members = list_string.split(',')
-
-    for member in members:
-        if '-' not in member:
-            integers.append(int(member))
-        else:
-            int_range = member.split('-')
-            integers.extend(range(int(int_range[0]),
-                                  int(int_range[1]) + 1))
-
-    return integers
-
-
-def get_pid_from_gname(gname):
-    """Fuzzy function to convert guest name to QEMU process pid.
-
-    Returns a list of potential pids, can be empty if no match found.
-    Throws an exception on processing errors.
-
-    """
-    pids = []
-    try:
-        child = subprocess.Popen(['ps', '-A', '--format', 'pid,args'],
-                                 stdout=subprocess.PIPE)
-    except:
-        raise Exception
-    for line in child.stdout:
-        line = line.lstrip().split(' ', 1)
-        # perform a sanity check before calling the more expensive
-        # function to possibly extract the guest name
-        if ' -name ' in line[1] and gname == get_gname_from_pid(line[0]):
-            pids.append(int(line[0]))
-    child.stdout.close()
-
-    return pids
-
-
-def get_gname_from_pid(pid):
-    """Returns the guest name for a QEMU process pid.
-
-    Extracts the guest name from the QEMU comma line by processing the '-name'
-    option. Will also handle names specified out of sequence.
-
-    """
-    name = ''
-    try:
-        line = open('/proc/{}/cmdline'.format(pid), 'rb').read().split('\0')
-        parms = line[line.index('-name') + 1].split(',')
-        while '' in parms:
-            # commas are escaped (i.e. ',,'), hence e.g. 'foo,bar' results in
-            # ['foo', '', 'bar'], which we revert here
-            idx = parms.index('')
-            parms[idx - 1] += ',' + parms[idx + 1]
-            del parms[idx:idx+2]
-        # the '-name' switch allows for two ways to specify the guest name,
-        # where the plain name overrides the name specified via 'guest='
-        for arg in parms:
-            if '=' not in arg:
-                name = arg
-                break
-            if arg[:6] == 'guest=':
-                name = arg[6:]
-    except (ValueError, IOError, IndexError):
-        pass
-
-    return name
-
-
-def get_online_cpus():
-    """Returns a list of cpu id integers."""
-    with open('/sys/devices/system/cpu/online') as cpu_list:
-        cpu_string = cpu_list.readline()
-        return parse_int_list(cpu_string)
-
-
-def get_filters():
-    """Returns a dict of trace events, their filter ids and
-    the values that can be filtered.
-
-    Trace events can be filtered for special values by setting a
-    filter string via an ioctl. The string normally has the format
-    identifier==value. For each filter a new event will be created, to
-    be able to distinguish the events.
-
-    """
-    filters = {}
-    filters['kvm_userspace_exit'] = ('reason', USERSPACE_EXIT_REASONS)
-    if ARCH.exit_reasons:
-        filters['kvm_exit'] = ('exit_reason', ARCH.exit_reasons)
-    return filters
-
-libc = ctypes.CDLL('libc.so.6', use_errno=True)
-syscall = libc.syscall
-
-
 class perf_event_attr(ctypes.Structure):
     """Struct that holds the necessary data to set up a trace event.
 
@@ -432,25 +324,6 @@ class perf_event_attr(ctypes.Structure):
         self.read_format = PERF_FORMAT_GROUP
 
 
-def perf_event_open(attr, pid, cpu, group_fd, flags):
-    """Wrapper for the sys_perf_evt_open() syscall.
-
-    Used to set up performance events, returns a file descriptor or -1
-    on error.
-
-    Attributes are:
-    - syscall number
-    - struct perf_event_attr *
-    - pid or -1 to monitor all pids
-    - cpu number or -1 to monitor all cpus
-    - The file descriptor of the group leader or -1 to create a group.
-    - flags
-
-    """
-    return syscall(ARCH.sc_perf_evt_open, ctypes.pointer(attr),
-                   ctypes.c_int(pid), ctypes.c_int(cpu),
-                   ctypes.c_int(group_fd), ctypes.c_long(flags))
-
 PERF_TYPE_TRACEPOINT = 2
 PERF_FORMAT_GROUP = 1 << 3
 
@@ -495,6 +368,8 @@ class Event(object):
     """Represents a performance event and manages its life cycle."""
     def __init__(self, name, group, trace_cpu, trace_pid, trace_point,
                  trace_filter, trace_set='kvm'):
+        self.libc = ctypes.CDLL('libc.so.6', use_errno=True)
+        self.syscall = self.libc.syscall
         self.name = name
         self.fd = None
         self.setup_event(group, trace_cpu, trace_pid, trace_point,
@@ -511,6 +386,25 @@ class Event(object):
         if self.fd:
             os.close(self.fd)
 
+    def perf_event_open(self, attr, pid, cpu, group_fd, flags):
+        """Wrapper for the sys_perf_evt_open() syscall.
+
+        Used to set up performance events, returns a file descriptor or -1
+        on error.
+
+        Attributes are:
+        - syscall number
+        - struct perf_event_attr *
+        - pid or -1 to monitor all pids
+        - cpu number or -1 to monitor all cpus
+        - The file descriptor of the group leader or -1 to create a group.
+        - flags
+
+        """
+        return self.syscall(ARCH.sc_perf_evt_open, ctypes.pointer(attr),
+                            ctypes.c_int(pid), ctypes.c_int(cpu),
+                            ctypes.c_int(group_fd), ctypes.c_long(flags))
+
     def setup_event_attribute(self, trace_set, trace_point):
         """Returns an initialized ctype perf_event_attr struct."""
 
@@ -539,8 +433,8 @@ class Event(object):
         if group.events:
             group_leader = group.events[0].fd
 
-        fd = perf_event_open(event_attr, trace_pid,
-                             trace_cpu, group_leader, 0)
+        fd = self.perf_event_open(event_attr, trace_pid,
+                                  trace_cpu, group_leader, 0)
         if fd == -1:
             err = ctypes.get_errno()
             raise OSError(err, os.strerror(err),
@@ -575,17 +469,53 @@ class Event(object):
         fcntl.ioctl(self.fd, ARCH.ioctl_numbers['RESET'], 0)
 
 
-class TracepointProvider(object):
+class Provider(object):
+    """Encapsulates functionalities used by all providers."""
+    @staticmethod
+    def is_field_wanted(fields_filter, field):
+        """Indicate whether field is valid according to fields_filter."""
+        if not fields_filter:
+            return True
+        return re.match(fields_filter, field) is not None
+
+    @staticmethod
+    def walkdir(path):
+        """Returns os.walk() data for specified directory.
+
+        As it is only a wrapper it returns the same 3-tuple of (dirpath,
+        dirnames, filenames).
+        """
+        return next(os.walk(path))
+
+
+class TracepointProvider(Provider):
     """Data provider for the stats class.
 
     Manages the events/groups from which it acquires its data.
 
     """
-    def __init__(self):
+    def __init__(self, pid, fields_filter):
         self.group_leaders = []
-        self.filters = get_filters()
-        self._fields = self.get_available_fields()
-        self._pid = 0
+        self.filters = self.get_filters()
+        self.update_fields(fields_filter)
+        self.pid = pid
+
+    @staticmethod
+    def get_filters():
+        """Returns a dict of trace events, their filter ids and
+        the values that can be filtered.
+
+        Trace events can be filtered for special values by setting a
+        filter string via an ioctl. The string normally has the format
+        identifier==value. For each filter a new event will be created, to
+        be able to distinguish the events.
+
+        """
+        filters = {}
+        filters['kvm_userspace_exit'] = ('reason', USERSPACE_EXIT_REASONS)
+        if ARCH.exit_reasons:
+            filters['kvm_exit'] = ('exit_reason', ARCH.exit_reasons)
+        return filters
 
     def get_available_fields(self):
         """Returns a list of available event's of format 'event name(filter
@@ -603,7 +533,7 @@ class TracepointProvider(object):
 
         """
         path = os.path.join(PATH_DEBUGFS_TRACING, 'events', 'kvm')
-        fields = walkdir(path)[1]
+        fields = self.walkdir(path)[1]
         extra = []
         for field in fields:
             if field in self.filters:
@@ -613,6 +543,34 @@ class TracepointProvider(object):
         fields += extra
         return fields
 
+    def update_fields(self, fields_filter):
+        """Refresh fields, applying fields_filter"""
+        self._fields = [field for field in self.get_available_fields()
+                        if self.is_field_wanted(fields_filter, field)]
+
+    @staticmethod
+    def get_online_cpus():
+        """Returns a list of cpu id integers."""
+        def parse_int_list(list_string):
+            """Returns an int list from a string of comma separated integers and
+            integer ranges."""
+            integers = []
+            members = list_string.split(',')
+
+            for member in members:
+                if '-' not in member:
+                    integers.append(int(member))
+                else:
+                    int_range = member.split('-')
+                    integers.extend(range(int(int_range[0]),
+                                          int(int_range[1]) + 1))
+
+            return integers
+
+        with open('/sys/devices/system/cpu/online') as cpu_list:
+            cpu_string = cpu_list.readline()
+            return parse_int_list(cpu_string)
+
     def setup_traces(self):
         """Creates all event and group objects needed to be able to retrieve
         data."""
@@ -621,9 +579,9 @@ class TracepointProvider(object):
             # Fetch list of all threads of the monitored pid, as qemu
             # starts a thread for each vcpu.
             path = os.path.join('/proc', str(self._pid), 'task')
-            groupids = walkdir(path)[1]
+            groupids = self.walkdir(path)[1]
         else:
-            groupids = get_online_cpus()
+            groupids = self.get_online_cpus()
 
         # The constant is needed as a buffer for python libs, std
         # streams and other files that the script opens.
@@ -671,9 +629,6 @@ class TracepointProvider(object):
 
             self.group_leaders.append(group)
 
-    def available_fields(self):
-        return self.get_available_fields()
-
     @property
     def fields(self):
         return self._fields
@@ -707,7 +662,7 @@ class TracepointProvider(object):
         self.setup_traces()
         self.fields = self._fields
 
-    def read(self):
+    def read(self, by_guest=0):
         """Returns 'event name: current value' for all enabled events."""
         ret = defaultdict(int)
         for group in self.group_leaders:
@@ -723,16 +678,17 @@ class TracepointProvider(object):
                 event.reset()
 
 
-class DebugfsProvider(object):
+class DebugfsProvider(Provider):
     """Provides data from the files that KVM creates in the kvm debugfs
     folder."""
-    def __init__(self):
-        self._fields = self.get_available_fields()
+    def __init__(self, pid, fields_filter, include_past):
+        self.update_fields(fields_filter)
         self._baseline = {}
-        self._pid = 0
         self.do_read = True
         self.paths = []
-        self.reset()
+        self.pid = pid
+        if include_past:
+            self.restore()
 
     def get_available_fields(self):
         """"Returns a list of available fields.
@@ -740,7 +696,12 @@ class DebugfsProvider(object):
         The fields are all available KVM debugfs files
 
         """
-        return walkdir(PATH_DEBUGFS_KVM)[2]
+        return self.walkdir(PATH_DEBUGFS_KVM)[2]
+
+    def update_fields(self, fields_filter):
+        """Refresh fields, applying fields_filter"""
+        self._fields = [field for field in self.get_available_fields()
+                        if self.is_field_wanted(fields_filter, field)]
 
     @property
     def fields(self):
@@ -757,10 +718,9 @@ class DebugfsProvider(object):
 
     @pid.setter
     def pid(self, pid):
+        self._pid = pid
         if pid != 0:
-            self._pid = pid
-
-            vms = walkdir(PATH_DEBUGFS_KVM)[1]
+            vms = self.walkdir(PATH_DEBUGFS_KVM)[1]
             if len(vms) == 0:
                 self.do_read = False
 
@@ -771,8 +731,15 @@ class DebugfsProvider(object):
             self.do_read = True
         self.reset()
 
-    def read(self, reset=0):
-        """Returns a dict with format:'file name / field -> current value'."""
+    def read(self, reset=0, by_guest=0):
+        """Returns a dict with format:'file name / field -> current value'.
+
+        Parameter 'reset':
+          0   plain read
+          1   reset field counts to 0
+          2   restore the original field counts
+
+        """
         results = {}
 
         # If no debugfs filtering support is available, then don't read.
@@ -789,12 +756,22 @@ class DebugfsProvider(object):
             for field in self._fields:
                 value = self.read_field(field, path)
                 key = path + field
-                if reset:
+                if reset == 1:
                     self._baseline[key] = value
+                if reset == 2:
+                    self._baseline[key] = 0
                 if self._baseline.get(key, -1) == -1:
                     self._baseline[key] = value
-                results[field] = (results.get(field, 0) + value -
-                                  self._baseline.get(key, 0))
+                increment = (results.get(field, 0) + value -
+                             self._baseline.get(key, 0))
+                if by_guest:
+                    pid = key.split('-')[0]
+                    if pid in results:
+                        results[pid] += increment
+                    else:
+                        results[pid] = increment
+                else:
+                    results[field] = increment
 
         return results
 
@@ -813,6 +790,11 @@ class DebugfsProvider(object):
         self._baseline = {}
         self.read(1)
 
+    def restore(self):
+        """Reset field counters"""
+        self._baseline = {}
+        self.read(2)
+
 
 class Stats(object):
     """Manages the data providers and the data they provide.
@@ -821,33 +803,32 @@ class Stats(object):
     provider data.
 
     """
-    def __init__(self, providers, pid, fields=None):
-        self.providers = providers
-        self._pid_filter = pid
-        self._fields_filter = fields
+    def __init__(self, options):
+        self.providers = self.get_providers(options)
+        self._pid_filter = options.pid
+        self._fields_filter = options.fields
         self.values = {}
-        self.update_provider_pid()
-        self.update_provider_filters()
+
+    @staticmethod
+    def get_providers(options):
+        """Returns a list of data providers depending on the passed options."""
+        providers = []
+
+        if options.debugfs:
+            providers.append(DebugfsProvider(options.pid, options.fields,
+                                             options.dbgfs_include_past))
+        if options.tracepoints or not providers:
+            providers.append(TracepointProvider(options.pid, options.fields))
+
+        return providers
 
     def update_provider_filters(self):
         """Propagates fields filters to providers."""
-        def wanted(key):
-            if not self._fields_filter:
-                return True
-            return re.match(self._fields_filter, key) is not None
-
         # As we reset the counters when updating the fields we can
         # also clear the cache of old values.
         self.values = {}
         for provider in self.providers:
-            provider_fields = [key for key in provider.get_available_fields()
-                               if wanted(key)]
-            provider.fields = provider_fields
-
-    def update_provider_pid(self):
-        """Propagates pid filters to providers."""
-        for provider in self.providers:
-            provider.pid = self._pid_filter
+            provider.update_fields(self._fields_filter)
 
     def reset(self):
         self.values = {}
@@ -873,27 +854,52 @@ class Stats(object):
         if pid != self._pid_filter:
             self._pid_filter = pid
             self.values = {}
-            self.update_provider_pid()
+            for provider in self.providers:
+                provider.pid = self._pid_filter
 
-    def get(self):
+    def get(self, by_guest=0):
         """Returns a dict with field -> (value, delta to last value) of all
         provider data."""
         for provider in self.providers:
-            new = provider.read()
-            for key in provider.fields:
+            new = provider.read(by_guest=by_guest)
+            for key in new if by_guest else provider.fields:
                 oldval = self.values.get(key, (0, 0))[0]
                 newval = new.get(key, 0)
                 newdelta = newval - oldval
                 self.values[key] = (newval, newdelta)
         return self.values
 
-LABEL_WIDTH = 40
-NUMBER_WIDTH = 10
-DELAY_INITIAL = 0.25
-DELAY_REGULAR = 3.0
+    def toggle_display_guests(self, to_pid):
+        """Toggle between collection of stats by individual event and by
+        guest pid
+
+        Events reported by DebugfsProvider change when switching to/from
+        reading by guest values. Hence we have to remove the excess event
+        names from self.values.
+
+        """
+        if any(isinstance(ins, TracepointProvider) for ins in self.providers):
+            return 1
+        if to_pid:
+            for provider in self.providers:
+                if isinstance(provider, DebugfsProvider):
+                    for key in provider.fields:
+                        if key in self.values.keys():
+                            del self.values[key]
+        else:
+            oldvals = self.values.copy()
+            for key in oldvals:
+                if key.isdigit():
+                    del self.values[key]
+        # Update oldval (see get())
+        self.get(to_pid)
+        return 0
+
+DELAY_DEFAULT = 3.0
 MAX_GUEST_NAME_LEN = 48
 MAX_REGEX_LEN = 44
 DEFAULT_REGEX = r'^[^\(]*$'
+SORT_DEFAULT = 0
 
 
 class Tui(object):
@@ -901,7 +907,10 @@ class Tui(object):
     def __init__(self, stats):
         self.stats = stats
         self.screen = None
-        self.update_drilldown()
+        self._delay_initial = 0.25
+        self._delay_regular = DELAY_DEFAULT
+        self._sorting = SORT_DEFAULT
+        self._display_guests = 0
 
     def __enter__(self):
         """Initialises curses for later use.  Based on curses.wrapper
@@ -929,7 +938,7 @@ class Tui(object):
         return self
 
     def __exit__(self, *exception):
-        """Resets the terminal to its normal state.  Based on curses.wrappre
+        """Resets the terminal to its normal state.  Based on curses.wrapper
            implementation from the Python standard library."""
         if self.screen:
             self.screen.keypad(0)
@@ -937,6 +946,86 @@ class Tui(object):
             curses.nocbreak()
             curses.endwin()
 
+    def get_all_gnames(self):
+        """Returns a list of (pid, gname) tuples of all running guests"""
+        res = []
+        try:
+            child = subprocess.Popen(['ps', '-A', '--format', 'pid,args'],
+                                     stdout=subprocess.PIPE)
+        except:
+            raise Exception
+        for line in child.stdout:
+            line = line.lstrip().split(' ', 1)
+            # perform a sanity check before calling the more expensive
+            # function to possibly extract the guest name
+            if ' -name ' in line[1]:
+                res.append((line[0], self.get_gname_from_pid(line[0])))
+        child.stdout.close()
+
+        return res
+
+    def print_all_gnames(self, row):
+        """Print a list of all running guests along with their pids."""
+        self.screen.addstr(row, 2, '%8s  %-60s' %
+                           ('Pid', 'Guest Name (fuzzy list, might be '
+                            'inaccurate!)'),
+                           curses.A_UNDERLINE)
+        row += 1
+        try:
+            for line in self.get_all_gnames():
+                self.screen.addstr(row, 2, '%8s  %-60s' % (line[0], line[1]))
+                row += 1
+                if row >= self.screen.getmaxyx()[0]:
+                    break
+        except Exception:
+            self.screen.addstr(row + 1, 2, 'Not available')
+
+    def get_pid_from_gname(self, gname):
+        """Fuzzy function to convert guest name to QEMU process pid.
+
+        Returns a list of potential pids, can be empty if no match found.
+        Throws an exception on processing errors.
+
+        """
+        pids = []
+        for line in self.get_all_gnames():
+            if gname == line[1]:
+                pids.append(int(line[0]))
+
+        return pids
+
+    @staticmethod
+    def get_gname_from_pid(pid):
+        """Returns the guest name for a QEMU process pid.
+
+        Extracts the guest name from the QEMU comma line by processing the
+        '-name' option. Will also handle names specified out of sequence.
+
+        """
+        name = ''
+        try:
+            line = open('/proc/{}/cmdline'
+                        .format(pid), 'rb').read().split('\0')
+            parms = line[line.index('-name') + 1].split(',')
+            while '' in parms:
+                # commas are escaped (i.e. ',,'), hence e.g. 'foo,bar' results
+                # in # ['foo', '', 'bar'], which we revert here
+                idx = parms.index('')
+                parms[idx - 1] += ',' + parms[idx + 1]
+                del parms[idx:idx+2]
+            # the '-name' switch allows for two ways to specify the guest name,
+            # where the plain name overrides the name specified via 'guest='
+            for arg in parms:
+                if '=' not in arg:
+                    name = arg
+                    break
+                if arg[:6] == 'guest=':
+                    name = arg[6:]
+        except (ValueError, IOError, IndexError):
+            pass
+
+        return name
+
     def update_drilldown(self):
         """Sets or removes a filter that only allows fields without braces."""
         if not self.stats.fields_filter:
@@ -954,7 +1043,7 @@ class Tui(object):
         if pid is None:
             pid = self.stats.pid_filter
         self.screen.erase()
-        gname = get_gname_from_pid(pid)
+        gname = self.get_gname_from_pid(pid)
         if gname:
             gname = ('({})'.format(gname[:MAX_GUEST_NAME_LEN] + '...'
                                    if len(gname) > MAX_GUEST_NAME_LEN
@@ -970,13 +1059,13 @@ class Tui(object):
             if len(regex) > MAX_REGEX_LEN:
                 regex = regex[:MAX_REGEX_LEN] + '...'
             self.screen.addstr(1, 17, 'regex filter: {0}'.format(regex))
-        self.screen.addstr(2, 1, 'Event')
-        self.screen.addstr(2, 1 + LABEL_WIDTH + NUMBER_WIDTH -
-                           len('Total'), 'Total')
-        self.screen.addstr(2, 1 + LABEL_WIDTH + NUMBER_WIDTH + 7 -
-                           len('%Total'), '%Total')
-        self.screen.addstr(2, 1 + LABEL_WIDTH + NUMBER_WIDTH + 7 + 8 -
-                           len('Current'), 'Current')
+        if self._display_guests:
+            col_name = 'Guest Name'
+        else:
+            col_name = 'Event'
+        self.screen.addstr(2, 1, '%-40s %10s%7s %8s' %
+                           (col_name, 'Total', '%Total', 'CurAvg/s'),
+                           curses.A_STANDOUT)
         self.screen.addstr(4, 1, 'Collecting data...')
         self.screen.refresh()
 
@@ -984,16 +1073,25 @@ class Tui(object):
         row = 3
         self.screen.move(row, 0)
         self.screen.clrtobot()
-        stats = self.stats.get()
+        stats = self.stats.get(self._display_guests)
 
-        def sortkey(x):
+        def sortCurAvg(x):
+            # sort by current events if available
             if stats[x][1]:
                 return (-stats[x][1], -stats[x][0])
             else:
                 return (0, -stats[x][0])
+
+        def sortTotal(x):
+            # sort by totals
+            return (0, -stats[x][0])
         total = 0.
         for val in stats.values():
             total += val[0]
+        if self._sorting == SORT_DEFAULT:
+            sortkey = sortCurAvg
+        else:
+            sortkey = sortTotal
         for key in sorted(stats.keys(), key=sortkey):
 
             if row >= self.screen.getmaxyx()[0]:
@@ -1001,18 +1099,61 @@ class Tui(object):
             values = stats[key]
             if not values[0] and not values[1]:
                 break
-            col = 1
-            self.screen.addstr(row, col, key)
-            col += LABEL_WIDTH
-            self.screen.addstr(row, col, '%10d' % (values[0],))
-            col += NUMBER_WIDTH
-            self.screen.addstr(row, col, '%7.1f' % (values[0] * 100 / total,))
-            col += 7
-            if values[1] is not None:
-                self.screen.addstr(row, col, '%8d' % (values[1] / sleeptime,))
+            if values[0] is not None:
+                cur = int(round(values[1] / sleeptime)) if values[1] else ''
+                if self._display_guests:
+                    key = self.get_gname_from_pid(key)
+                self.screen.addstr(row, 1, '%-40s %10d%7.1f %8s' %
+                                   (key, values[0], values[0] * 100 / total,
+                                    cur))
             row += 1
+        if row == 3:
+            self.screen.addstr(4, 1, 'No matching events reported yet')
         self.screen.refresh()
 
+    def show_msg(self, text):
+        """Display message centered text and exit on key press"""
+        hint = 'Press any key to continue'
+        curses.cbreak()
+        self.screen.erase()
+        (x, term_width) = self.screen.getmaxyx()
+        row = 2
+        for line in text:
+            start = (term_width - len(line)) / 2
+            self.screen.addstr(row, start, line)
+            row += 1
+        self.screen.addstr(row + 1, (term_width - len(hint)) / 2, hint,
+                           curses.A_STANDOUT)
+        self.screen.getkey()
+
+    def show_help_interactive(self):
+        """Display help with list of interactive commands"""
+        msg = ('   b     toggle events by guests (debugfs only, honors'
+               ' filters)',
+               '   c     clear filter',
+               '   f     filter by regular expression',
+               '   g     filter by guest name',
+               '   h     display interactive commands reference',
+               '   o     toggle sorting order (Total vs CurAvg/s)',
+               '   p     filter by PID',
+               '   q     quit',
+               '   r     reset stats',
+               '   s     set update interval',
+               '   x     toggle reporting of stats for individual child trace'
+               ' events',
+               'Any other key refreshes statistics immediately')
+        curses.cbreak()
+        self.screen.erase()
+        self.screen.addstr(0, 0, "Interactive commands reference",
+                           curses.A_BOLD)
+        self.screen.addstr(2, 0, "Press any key to exit", curses.A_STANDOUT)
+        row = 4
+        for line in msg:
+            self.screen.addstr(row, 0, line)
+            row += 1
+        self.screen.getkey()
+        self.refresh_header()
+
     def show_filter_selection(self):
         """Draws filter selection mask.
 
@@ -1059,6 +1200,7 @@ class Tui(object):
                                'This might limit the shown data to the trace '
                                'statistics.')
             self.screen.addstr(5, 0, msg)
+            self.print_all_gnames(7)
 
             curses.echo()
             self.screen.addstr(3, 0, "Pid [0 or pid]: ")
@@ -1077,10 +1219,40 @@ class Tui(object):
                 self.refresh_header(pid)
                 self.update_pid(pid)
                 break
-
             except ValueError:
                 msg = '"' + str(pid) + '": Not a valid pid'
-                continue
+
+    def show_set_update_interval(self):
+        """Draws update interval selection mask."""
+        msg = ''
+        while True:
+            self.screen.erase()
+            self.screen.addstr(0, 0, 'Set update interval (defaults to %fs).' %
+                               DELAY_DEFAULT, curses.A_BOLD)
+            self.screen.addstr(4, 0, msg)
+            self.screen.addstr(2, 0, 'Change delay from %.1fs to ' %
+                               self._delay_regular)
+            curses.echo()
+            val = self.screen.getstr()
+            curses.noecho()
+
+            try:
+                if len(val) > 0:
+                    delay = float(val)
+                    if delay < 0.1:
+                        msg = '"' + str(val) + '": Value must be >=0.1'
+                        continue
+                    if delay > 25.5:
+                        msg = '"' + str(val) + '": Value must be <=25.5'
+                        continue
+                else:
+                    delay = DELAY_DEFAULT
+                self._delay_regular = delay
+                break
+
+            except ValueError:
+                msg = '"' + str(val) + '": Invalid value'
+        self.refresh_header()
 
     def show_vm_selection_by_guest_name(self):
         """Draws guest selection mask.
@@ -1098,6 +1270,7 @@ class Tui(object):
                                'This might limit the shown data to the trace '
                                'statistics.')
             self.screen.addstr(5, 0, msg)
+            self.print_all_gnames(7)
             curses.echo()
             self.screen.addstr(3, 0, "Guest [ENTER or guest]: ")
             gname = self.screen.getstr()
@@ -1110,7 +1283,7 @@ class Tui(object):
             else:
                 pids = []
                 try:
-                    pids = get_pid_from_gname(gname)
+                    pids = self.get_pid_from_gname(gname)
                 except:
                     msg = '"' + gname + '": Internal error while searching, ' \
                           'use pid filter instead'
@@ -1128,38 +1301,60 @@ class Tui(object):
 
     def show_stats(self):
         """Refreshes the screen and processes user input."""
-        sleeptime = DELAY_INITIAL
+        sleeptime = self._delay_initial
         self.refresh_header()
+        start = 0.0  # result based on init value never appears on screen
         while True:
-            self.refresh_body(sleeptime)
+            self.refresh_body(time.time() - start)
             curses.halfdelay(int(sleeptime * 10))
-            sleeptime = DELAY_REGULAR
+            start = time.time()
+            sleeptime = self._delay_regular
             try:
                 char = self.screen.getkey()
-                if char == 'x':
+                if char == 'b':
+                    self._display_guests = not self._display_guests
+                    if self.stats.toggle_display_guests(self._display_guests):
+                        self.show_msg(['Command not available with tracepoints'
+                                       ' enabled', 'Restart with debugfs only '
+                                       '(see option \'-d\') and try again!'])
+                        self._display_guests = not self._display_guests
                     self.refresh_header()
-                    self.update_drilldown()
-                    sleeptime = DELAY_INITIAL
-                if char == 'q':
-                    break
                 if char == 'c':
                     self.stats.fields_filter = DEFAULT_REGEX
                     self.refresh_header(0)
                     self.update_pid(0)
-                    sleeptime = DELAY_INITIAL
                 if char == 'f':
+                    curses.curs_set(1)
                     self.show_filter_selection()
-                    sleeptime = DELAY_INITIAL
+                    curses.curs_set(0)
+                    sleeptime = self._delay_initial
                 if char == 'g':
+                    curses.curs_set(1)
                     self.show_vm_selection_by_guest_name()
-                    sleeptime = DELAY_INITIAL
+                    curses.curs_set(0)
+                    sleeptime = self._delay_initial
+                if char == 'h':
+                    self.show_help_interactive()
+                if char == 'o':
+                    self._sorting = not self._sorting
                 if char == 'p':
+                    curses.curs_set(1)
                     self.show_vm_selection_by_pid()
-                    sleeptime = DELAY_INITIAL
+                    curses.curs_set(0)
+                    sleeptime = self._delay_initial
+                if char == 'q':
+                    break
                 if char == 'r':
-                    self.refresh_header()
                     self.stats.reset()
-                    sleeptime = DELAY_INITIAL
+                if char == 's':
+                    curses.curs_set(1)
+                    self.show_set_update_interval()
+                    curses.curs_set(0)
+                    sleeptime = self._delay_initial
+                if char == 'x':
+                    self.update_drilldown()
+                    # prevents display of current values on next refresh
+                    self.stats.get()
             except KeyboardInterrupt:
                 break
             except curses.error:
@@ -1227,13 +1422,17 @@ Requirements:
   the large number of files that are possibly opened.
 
 Interactive Commands:
+   b     toggle events by guests (debugfs only, honors filters)
    c     clear filter
    f     filter by regular expression
    g     filter by guest name
+   h     display interactive commands reference
+   o     toggle sorting order (Total vs CurAvg/s)
    p     filter by PID
    q     quit
-   x     toggle reporting of stats for individual child trace events
    r     reset stats
+   s     set update interval
+   x     toggle reporting of stats for individual child trace events
 Press any other key to refresh statistics immediately.
 """
 
@@ -1246,7 +1445,7 @@ Press any other key to refresh statistics immediately.
 
     def cb_guest_to_pid(option, opt, val, parser):
         try:
-            pids = get_pid_from_gname(val)
+            pids = Tui.get_pid_from_gname(val)
         except:
             raise optparse.OptionValueError('Error while searching for guest '
                                             '"{}", use "-p" to specify a pid '
@@ -1268,6 +1467,13 @@ Press any other key to refresh statistics immediately.
                          dest='once',
                          help='run in batch mode for one second',
                          )
+    optparser.add_option('-i', '--debugfs-include-past',
+                         action='store_true',
+                         default=False,
+                         dest='dbgfs_include_past',
+                         help='include all available data on past events for '
+                              'debugfs',
+                         )
     optparser.add_option('-l', '--log',
                          action='store_true',
                          default=False,
@@ -1288,7 +1494,7 @@ Press any other key to refresh statistics immediately.
                          )
     optparser.add_option('-f', '--fields',
                          action='store',
-                         default=None,
+                         default=DEFAULT_REGEX,
                          dest='fields',
                          help='fields to display (regex)',
                          )
@@ -1311,20 +1517,6 @@ Press any other key to refresh statistics immediately.
     return options
 
 
-def get_providers(options):
-    """Returns a list of data providers depending on the passed options."""
-    providers = []
-
-    if options.tracepoints:
-        providers.append(TracepointProvider())
-    if options.debugfs:
-        providers.append(DebugfsProvider())
-    if len(providers) == 0:
-        providers.append(TracepointProvider())
-
-    return providers
-
-
 def check_access(options):
     """Exits if the current user can't access all needed directories."""
     if not os.path.exists('/sys/kernel/debug'):
@@ -1365,8 +1557,7 @@ def main():
         sys.stderr.write('Did you use a (unsupported) tid instead of a pid?\n')
         sys.exit('Specified pid does not exist.')
 
-    providers = get_providers(options)
-    stats = Stats(providers, options.pid, fields=options.fields)
+    stats = Stats(options)
 
     if options.log:
         log(stats)
diff --git a/tools/kvm/kvm_stat/kvm_stat.txt b/tools/kvm/kvm_stat/kvm_stat.txt
index 109431bdc63c..e5cf836be8a1 100644
--- a/tools/kvm/kvm_stat/kvm_stat.txt
+++ b/tools/kvm/kvm_stat/kvm_stat.txt
@@ -29,18 +29,26 @@ meaning of events.
 INTERACTIVE COMMANDS
 --------------------
 [horizontal]
+*b*::	toggle events by guests (debugfs only, honors filters)
+
 *c*::	clear filter
 
 *f*::	filter by regular expression
 
 *g*::	filter by guest name
 
+*h*::	display interactive commands reference
+
+*o*::   toggle sorting order (Total vs CurAvg/s)
+
 *p*::	filter by PID
 
 *q*::	quit
 
 *r*::	reset stats
 
+*s*::   set update interval
+
 *x*::	toggle reporting of stats for child trace events
 
 Press any other key to refresh statistics immediately.
@@ -64,6 +72,10 @@ OPTIONS
 --debugfs::
 	retrieve statistics from debugfs
 
+-i::
+--debugfs-include-past::
+	include all available data on past events for debugfs
+
 -p<pid>::
 --pid=<pid>::
 	limit statistics to one virtual machine (pid)
diff --git a/virt/kvm/arm/aarch32.c b/virt/kvm/arm/aarch32.c
index 528af4b2d09e..79c7c357804b 100644
--- a/virt/kvm/arm/aarch32.c
+++ b/virt/kvm/arm/aarch32.c
@@ -60,7 +60,7 @@ static const unsigned short cc_map[16] = {
 /*
  * Check if a trapped instruction should have been executed or not.
  */
-bool kvm_condition_valid32(const struct kvm_vcpu *vcpu)
+bool __hyp_text kvm_condition_valid32(const struct kvm_vcpu *vcpu)
 {
 	unsigned long cpsr;
 	u32 cpsr_cond;
diff --git a/virt/kvm/arm/arch_timer.c b/virt/kvm/arm/arch_timer.c
index 5976609ef27c..8e89d63005c7 100644
--- a/virt/kvm/arm/arch_timer.c
+++ b/virt/kvm/arm/arch_timer.c
@@ -21,6 +21,7 @@
 #include <linux/kvm_host.h>
 #include <linux/interrupt.h>
 #include <linux/irq.h>
+#include <linux/uaccess.h>
 
 #include <clocksource/arm_arch_timer.h>
 #include <asm/arch_timer.h>
@@ -35,6 +36,16 @@ static struct timecounter *timecounter;
 static unsigned int host_vtimer_irq;
 static u32 host_vtimer_irq_flags;
 
+static const struct kvm_irq_level default_ptimer_irq = {
+	.irq	= 30,
+	.level	= 1,
+};
+
+static const struct kvm_irq_level default_vtimer_irq = {
+	.irq	= 27,
+	.level	= 1,
+};
+
 void kvm_timer_vcpu_put(struct kvm_vcpu *vcpu)
 {
 	vcpu_vtimer(vcpu)->active_cleared_last = false;
@@ -95,7 +106,7 @@ static void kvm_timer_inject_irq_work(struct work_struct *work)
 	 * If the vcpu is blocked we want to wake it up so that it will see
 	 * the timer has expired when entering the guest.
 	 */
-	kvm_vcpu_kick(vcpu);
+	kvm_vcpu_wake_up(vcpu);
 }
 
 static u64 kvm_timer_compute_delta(struct arch_timer_context *timer_ctx)
@@ -215,7 +226,8 @@ static void kvm_timer_update_irq(struct kvm_vcpu *vcpu, bool new_level,
 	if (likely(irqchip_in_kernel(vcpu->kvm))) {
 		ret = kvm_vgic_inject_irq(vcpu->kvm, vcpu->vcpu_id,
 					  timer_ctx->irq.irq,
-					  timer_ctx->irq.level);
+					  timer_ctx->irq.level,
+					  timer_ctx);
 		WARN_ON(ret);
 	}
 }
@@ -445,23 +457,12 @@ void kvm_timer_sync_hwstate(struct kvm_vcpu *vcpu)
 	kvm_timer_update_state(vcpu);
 }
 
-int kvm_timer_vcpu_reset(struct kvm_vcpu *vcpu,
-			 const struct kvm_irq_level *virt_irq,
-			 const struct kvm_irq_level *phys_irq)
+int kvm_timer_vcpu_reset(struct kvm_vcpu *vcpu)
 {
 	struct arch_timer_context *vtimer = vcpu_vtimer(vcpu);
 	struct arch_timer_context *ptimer = vcpu_ptimer(vcpu);
 
 	/*
-	 * The vcpu timer irq number cannot be determined in
-	 * kvm_timer_vcpu_init() because it is called much before
-	 * kvm_vcpu_set_target(). To handle this, we determine
-	 * vcpu timer irq number when the vcpu is reset.
-	 */
-	vtimer->irq.irq = virt_irq->irq;
-	ptimer->irq.irq = phys_irq->irq;
-
-	/*
 	 * The bits in CNTV_CTL are architecturally reset to UNKNOWN for ARMv8
 	 * and to 0 for ARMv7.  We provide an implementation that always
 	 * resets the timer to be disabled and unmasked and is compliant with
@@ -496,6 +497,8 @@ static void update_vtimer_cntvoff(struct kvm_vcpu *vcpu, u64 cntvoff)
 void kvm_timer_vcpu_init(struct kvm_vcpu *vcpu)
 {
 	struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu;
+	struct arch_timer_context *vtimer = vcpu_vtimer(vcpu);
+	struct arch_timer_context *ptimer = vcpu_ptimer(vcpu);
 
 	/* Synchronize cntvoff across all vtimers of a VM. */
 	update_vtimer_cntvoff(vcpu, kvm_phys_timer_read());
@@ -504,6 +507,9 @@ void kvm_timer_vcpu_init(struct kvm_vcpu *vcpu)
 	INIT_WORK(&timer->expired, kvm_timer_inject_irq_work);
 	hrtimer_init(&timer->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
 	timer->timer.function = kvm_timer_expire;
+
+	vtimer->irq.irq = default_vtimer_irq.irq;
+	ptimer->irq.irq = default_ptimer_irq.irq;
 }
 
 static void kvm_timer_init_interrupt(void *info)
@@ -613,6 +619,30 @@ void kvm_timer_vcpu_terminate(struct kvm_vcpu *vcpu)
 	kvm_vgic_unmap_phys_irq(vcpu, vtimer->irq.irq);
 }
 
+static bool timer_irqs_are_valid(struct kvm_vcpu *vcpu)
+{
+	int vtimer_irq, ptimer_irq;
+	int i, ret;
+
+	vtimer_irq = vcpu_vtimer(vcpu)->irq.irq;
+	ret = kvm_vgic_set_owner(vcpu, vtimer_irq, vcpu_vtimer(vcpu));
+	if (ret)
+		return false;
+
+	ptimer_irq = vcpu_ptimer(vcpu)->irq.irq;
+	ret = kvm_vgic_set_owner(vcpu, ptimer_irq, vcpu_ptimer(vcpu));
+	if (ret)
+		return false;
+
+	kvm_for_each_vcpu(i, vcpu, vcpu->kvm) {
+		if (vcpu_vtimer(vcpu)->irq.irq != vtimer_irq ||
+		    vcpu_ptimer(vcpu)->irq.irq != ptimer_irq)
+			return false;
+	}
+
+	return true;
+}
+
 int kvm_timer_enable(struct kvm_vcpu *vcpu)
 {
 	struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu;
@@ -632,6 +662,11 @@ int kvm_timer_enable(struct kvm_vcpu *vcpu)
 	if (!vgic_initialized(vcpu->kvm))
 		return -ENODEV;
 
+	if (!timer_irqs_are_valid(vcpu)) {
+		kvm_debug("incorrectly configured timer irqs\n");
+		return -EINVAL;
+	}
+
 	/*
 	 * Find the physical IRQ number corresponding to the host_vtimer_irq
 	 */
@@ -681,3 +716,79 @@ void kvm_timer_init_vhe(void)
 	val |= (CNTHCTL_EL1PCTEN << cnthctl_shift);
 	write_sysreg(val, cnthctl_el2);
 }
+
+static void set_timer_irqs(struct kvm *kvm, int vtimer_irq, int ptimer_irq)
+{
+	struct kvm_vcpu *vcpu;
+	int i;
+
+	kvm_for_each_vcpu(i, vcpu, kvm) {
+		vcpu_vtimer(vcpu)->irq.irq = vtimer_irq;
+		vcpu_ptimer(vcpu)->irq.irq = ptimer_irq;
+	}
+}
+
+int kvm_arm_timer_set_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr)
+{
+	int __user *uaddr = (int __user *)(long)attr->addr;
+	struct arch_timer_context *vtimer = vcpu_vtimer(vcpu);
+	struct arch_timer_context *ptimer = vcpu_ptimer(vcpu);
+	int irq;
+
+	if (!irqchip_in_kernel(vcpu->kvm))
+		return -EINVAL;
+
+	if (get_user(irq, uaddr))
+		return -EFAULT;
+
+	if (!(irq_is_ppi(irq)))
+		return -EINVAL;
+
+	if (vcpu->arch.timer_cpu.enabled)
+		return -EBUSY;
+
+	switch (attr->attr) {
+	case KVM_ARM_VCPU_TIMER_IRQ_VTIMER:
+		set_timer_irqs(vcpu->kvm, irq, ptimer->irq.irq);
+		break;
+	case KVM_ARM_VCPU_TIMER_IRQ_PTIMER:
+		set_timer_irqs(vcpu->kvm, vtimer->irq.irq, irq);
+		break;
+	default:
+		return -ENXIO;
+	}
+
+	return 0;
+}
+
+int kvm_arm_timer_get_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr)
+{
+	int __user *uaddr = (int __user *)(long)attr->addr;
+	struct arch_timer_context *timer;
+	int irq;
+
+	switch (attr->attr) {
+	case KVM_ARM_VCPU_TIMER_IRQ_VTIMER:
+		timer = vcpu_vtimer(vcpu);
+		break;
+	case KVM_ARM_VCPU_TIMER_IRQ_PTIMER:
+		timer = vcpu_ptimer(vcpu);
+		break;
+	default:
+		return -ENXIO;
+	}
+
+	irq = timer->irq.irq;
+	return put_user(irq, uaddr);
+}
+
+int kvm_arm_timer_has_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr)
+{
+	switch (attr->attr) {
+	case KVM_ARM_VCPU_TIMER_IRQ_VTIMER:
+	case KVM_ARM_VCPU_TIMER_IRQ_PTIMER:
+		return 0;
+	}
+
+	return -ENXIO;
+}
diff --git a/virt/kvm/arm/arm.c b/virt/kvm/arm/arm.c
index 3417e184c8e1..a39a1e161e63 100644
--- a/virt/kvm/arm/arm.c
+++ b/virt/kvm/arm/arm.c
@@ -368,6 +368,13 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
 	kvm_timer_vcpu_put(vcpu);
 }
 
+static void vcpu_power_off(struct kvm_vcpu *vcpu)
+{
+	vcpu->arch.power_off = true;
+	kvm_make_request(KVM_REQ_SLEEP, vcpu);
+	kvm_vcpu_kick(vcpu);
+}
+
 int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu,
 				    struct kvm_mp_state *mp_state)
 {
@@ -387,7 +394,7 @@ int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
 		vcpu->arch.power_off = false;
 		break;
 	case KVM_MP_STATE_STOPPED:
-		vcpu->arch.power_off = true;
+		vcpu_power_off(vcpu);
 		break;
 	default:
 		return -EINVAL;
@@ -520,6 +527,10 @@ static int kvm_vcpu_first_run_init(struct kvm_vcpu *vcpu)
 	}
 
 	ret = kvm_timer_enable(vcpu);
+	if (ret)
+		return ret;
+
+	ret = kvm_arm_pmu_v3_enable(vcpu);
 
 	return ret;
 }
@@ -536,21 +547,7 @@ void kvm_arm_halt_guest(struct kvm *kvm)
 
 	kvm_for_each_vcpu(i, vcpu, kvm)
 		vcpu->arch.pause = true;
-	kvm_make_all_cpus_request(kvm, KVM_REQ_VCPU_EXIT);
-}
-
-void kvm_arm_halt_vcpu(struct kvm_vcpu *vcpu)
-{
-	vcpu->arch.pause = true;
-	kvm_vcpu_kick(vcpu);
-}
-
-void kvm_arm_resume_vcpu(struct kvm_vcpu *vcpu)
-{
-	struct swait_queue_head *wq = kvm_arch_vcpu_wq(vcpu);
-
-	vcpu->arch.pause = false;
-	swake_up(wq);
+	kvm_make_all_cpus_request(kvm, KVM_REQ_SLEEP);
 }
 
 void kvm_arm_resume_guest(struct kvm *kvm)
@@ -558,16 +555,23 @@ void kvm_arm_resume_guest(struct kvm *kvm)
 	int i;
 	struct kvm_vcpu *vcpu;
 
-	kvm_for_each_vcpu(i, vcpu, kvm)
-		kvm_arm_resume_vcpu(vcpu);
+	kvm_for_each_vcpu(i, vcpu, kvm) {
+		vcpu->arch.pause = false;
+		swake_up(kvm_arch_vcpu_wq(vcpu));
+	}
 }
 
-static void vcpu_sleep(struct kvm_vcpu *vcpu)
+static void vcpu_req_sleep(struct kvm_vcpu *vcpu)
 {
 	struct swait_queue_head *wq = kvm_arch_vcpu_wq(vcpu);
 
 	swait_event_interruptible(*wq, ((!vcpu->arch.power_off) &&
 				       (!vcpu->arch.pause)));
+
+	if (vcpu->arch.power_off || vcpu->arch.pause) {
+		/* Awaken to handle a signal, request we sleep again later. */
+		kvm_make_request(KVM_REQ_SLEEP, vcpu);
+	}
 }
 
 static int kvm_vcpu_initialized(struct kvm_vcpu *vcpu)
@@ -575,6 +579,20 @@ static int kvm_vcpu_initialized(struct kvm_vcpu *vcpu)
 	return vcpu->arch.target >= 0;
 }
 
+static void check_vcpu_requests(struct kvm_vcpu *vcpu)
+{
+	if (kvm_request_pending(vcpu)) {
+		if (kvm_check_request(KVM_REQ_SLEEP, vcpu))
+			vcpu_req_sleep(vcpu);
+
+		/*
+		 * Clear IRQ_PENDING requests that were made to guarantee
+		 * that a VCPU sees new virtual interrupts.
+		 */
+		kvm_check_request(KVM_REQ_IRQ_PENDING, vcpu);
+	}
+}
+
 /**
  * kvm_arch_vcpu_ioctl_run - the main VCPU run function to execute guest code
  * @vcpu:	The VCPU pointer
@@ -620,8 +638,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
 
 		update_vttbr(vcpu->kvm);
 
-		if (vcpu->arch.power_off || vcpu->arch.pause)
-			vcpu_sleep(vcpu);
+		check_vcpu_requests(vcpu);
 
 		/*
 		 * Preparing the interrupts to be injected also
@@ -650,8 +667,17 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
 			run->exit_reason = KVM_EXIT_INTR;
 		}
 
+		/*
+		 * Ensure we set mode to IN_GUEST_MODE after we disable
+		 * interrupts and before the final VCPU requests check.
+		 * See the comment in kvm_vcpu_exiting_guest_mode() and
+		 * Documentation/virtual/kvm/vcpu-requests.rst
+		 */
+		smp_store_mb(vcpu->mode, IN_GUEST_MODE);
+
 		if (ret <= 0 || need_new_vmid_gen(vcpu->kvm) ||
-			vcpu->arch.power_off || vcpu->arch.pause) {
+		    kvm_request_pending(vcpu)) {
+			vcpu->mode = OUTSIDE_GUEST_MODE;
 			local_irq_enable();
 			kvm_pmu_sync_hwstate(vcpu);
 			kvm_timer_sync_hwstate(vcpu);
@@ -667,7 +693,6 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
 		 */
 		trace_kvm_entry(*vcpu_pc(vcpu));
 		guest_enter_irqoff();
-		vcpu->mode = IN_GUEST_MODE;
 
 		ret = kvm_call_hyp(__kvm_vcpu_run, vcpu);
 
@@ -756,6 +781,7 @@ static int vcpu_interrupt_line(struct kvm_vcpu *vcpu, int number, bool level)
 	 * trigger a world-switch round on the running physical CPU to set the
 	 * virtual IRQ/FIQ fields in the HCR appropriately.
 	 */
+	kvm_make_request(KVM_REQ_IRQ_PENDING, vcpu);
 	kvm_vcpu_kick(vcpu);
 
 	return 0;
@@ -806,7 +832,7 @@ int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_level,
 		if (irq_num < VGIC_NR_SGIS || irq_num >= VGIC_NR_PRIVATE_IRQS)
 			return -EINVAL;
 
-		return kvm_vgic_inject_irq(kvm, vcpu->vcpu_id, irq_num, level);
+		return kvm_vgic_inject_irq(kvm, vcpu->vcpu_id, irq_num, level, NULL);
 	case KVM_ARM_IRQ_TYPE_SPI:
 		if (!irqchip_in_kernel(kvm))
 			return -ENXIO;
@@ -814,7 +840,7 @@ int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_level,
 		if (irq_num < VGIC_NR_PRIVATE_IRQS)
 			return -EINVAL;
 
-		return kvm_vgic_inject_irq(kvm, 0, irq_num, level);
+		return kvm_vgic_inject_irq(kvm, 0, irq_num, level, NULL);
 	}
 
 	return -EINVAL;
@@ -884,7 +910,7 @@ static int kvm_arch_vcpu_ioctl_vcpu_init(struct kvm_vcpu *vcpu,
 	 * Handle the "start in power-off" case.
 	 */
 	if (test_bit(KVM_ARM_VCPU_POWER_OFF, vcpu->arch.features))
-		vcpu->arch.power_off = true;
+		vcpu_power_off(vcpu);
 	else
 		vcpu->arch.power_off = false;
 
@@ -1115,9 +1141,6 @@ static void cpu_init_hyp_mode(void *dummy)
 	__cpu_init_hyp_mode(pgd_ptr, hyp_stack_ptr, vector_ptr);
 	__cpu_init_stage2();
 
-	if (is_kernel_in_hyp_mode())
-		kvm_timer_init_vhe();
-
 	kvm_arm_init_debug();
 }
 
@@ -1137,6 +1160,7 @@ static void cpu_hyp_reinit(void)
 		 * event was cancelled before the CPU was reset.
 		 */
 		__cpu_init_stage2();
+		kvm_timer_init_vhe();
 	} else {
 		cpu_init_hyp_mode(NULL);
 	}
diff --git a/virt/kvm/arm/hyp/vgic-v3-sr.c b/virt/kvm/arm/hyp/vgic-v3-sr.c
index 32c3295929b0..91728faa13fd 100644
--- a/virt/kvm/arm/hyp/vgic-v3-sr.c
+++ b/virt/kvm/arm/hyp/vgic-v3-sr.c
@@ -19,10 +19,12 @@
 #include <linux/irqchip/arm-gic-v3.h>
 #include <linux/kvm_host.h>
 
+#include <asm/kvm_emulate.h>
 #include <asm/kvm_hyp.h>
 
 #define vtr_to_max_lr_idx(v)		((v) & 0xf)
-#define vtr_to_nr_pre_bits(v)		(((u32)(v) >> 26) + 1)
+#define vtr_to_nr_pre_bits(v)		((((u32)(v) >> 26) & 7) + 1)
+#define vtr_to_nr_apr_regs(v)		(1 << (vtr_to_nr_pre_bits(v) - 5))
 
 static u64 __hyp_text __gic_v3_get_lr(unsigned int lr)
 {
@@ -118,6 +120,90 @@ static void __hyp_text __gic_v3_set_lr(u64 val, int lr)
 	}
 }
 
+static void __hyp_text __vgic_v3_write_ap0rn(u32 val, int n)
+{
+	switch (n) {
+	case 0:
+		write_gicreg(val, ICH_AP0R0_EL2);
+		break;
+	case 1:
+		write_gicreg(val, ICH_AP0R1_EL2);
+		break;
+	case 2:
+		write_gicreg(val, ICH_AP0R2_EL2);
+		break;
+	case 3:
+		write_gicreg(val, ICH_AP0R3_EL2);
+		break;
+	}
+}
+
+static void __hyp_text __vgic_v3_write_ap1rn(u32 val, int n)
+{
+	switch (n) {
+	case 0:
+		write_gicreg(val, ICH_AP1R0_EL2);
+		break;
+	case 1:
+		write_gicreg(val, ICH_AP1R1_EL2);
+		break;
+	case 2:
+		write_gicreg(val, ICH_AP1R2_EL2);
+		break;
+	case 3:
+		write_gicreg(val, ICH_AP1R3_EL2);
+		break;
+	}
+}
+
+static u32 __hyp_text __vgic_v3_read_ap0rn(int n)
+{
+	u32 val;
+
+	switch (n) {
+	case 0:
+		val = read_gicreg(ICH_AP0R0_EL2);
+		break;
+	case 1:
+		val = read_gicreg(ICH_AP0R1_EL2);
+		break;
+	case 2:
+		val = read_gicreg(ICH_AP0R2_EL2);
+		break;
+	case 3:
+		val = read_gicreg(ICH_AP0R3_EL2);
+		break;
+	default:
+		unreachable();
+	}
+
+	return val;
+}
+
+static u32 __hyp_text __vgic_v3_read_ap1rn(int n)
+{
+	u32 val;
+
+	switch (n) {
+	case 0:
+		val = read_gicreg(ICH_AP1R0_EL2);
+		break;
+	case 1:
+		val = read_gicreg(ICH_AP1R1_EL2);
+		break;
+	case 2:
+		val = read_gicreg(ICH_AP1R2_EL2);
+		break;
+	case 3:
+		val = read_gicreg(ICH_AP1R3_EL2);
+		break;
+	default:
+		unreachable();
+	}
+
+	return val;
+}
+
 void __hyp_text __vgic_v3_save_state(struct kvm_vcpu *vcpu)
 {
 	struct vgic_v3_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v3;
@@ -154,24 +240,27 @@ void __hyp_text __vgic_v3_save_state(struct kvm_vcpu *vcpu)
 
 		switch (nr_pre_bits) {
 		case 7:
-			cpu_if->vgic_ap0r[3] = read_gicreg(ICH_AP0R3_EL2);
-			cpu_if->vgic_ap0r[2] = read_gicreg(ICH_AP0R2_EL2);
+			cpu_if->vgic_ap0r[3] = __vgic_v3_read_ap0rn(3);
+			cpu_if->vgic_ap0r[2] = __vgic_v3_read_ap0rn(2);
 		case 6:
-			cpu_if->vgic_ap0r[1] = read_gicreg(ICH_AP0R1_EL2);
+			cpu_if->vgic_ap0r[1] = __vgic_v3_read_ap0rn(1);
 		default:
-			cpu_if->vgic_ap0r[0] = read_gicreg(ICH_AP0R0_EL2);
+			cpu_if->vgic_ap0r[0] = __vgic_v3_read_ap0rn(0);
 		}
 
 		switch (nr_pre_bits) {
 		case 7:
-			cpu_if->vgic_ap1r[3] = read_gicreg(ICH_AP1R3_EL2);
-			cpu_if->vgic_ap1r[2] = read_gicreg(ICH_AP1R2_EL2);
+			cpu_if->vgic_ap1r[3] = __vgic_v3_read_ap1rn(3);
+			cpu_if->vgic_ap1r[2] = __vgic_v3_read_ap1rn(2);
 		case 6:
-			cpu_if->vgic_ap1r[1] = read_gicreg(ICH_AP1R1_EL2);
+			cpu_if->vgic_ap1r[1] = __vgic_v3_read_ap1rn(1);
 		default:
-			cpu_if->vgic_ap1r[0] = read_gicreg(ICH_AP1R0_EL2);
+			cpu_if->vgic_ap1r[0] = __vgic_v3_read_ap1rn(0);
 		}
 	} else {
+		if (static_branch_unlikely(&vgic_v3_cpuif_trap))
+			write_gicreg(0, ICH_HCR_EL2);
+
 		cpu_if->vgic_elrsr = 0xffff;
 		cpu_if->vgic_ap0r[0] = 0;
 		cpu_if->vgic_ap0r[1] = 0;
@@ -224,26 +313,34 @@ void __hyp_text __vgic_v3_restore_state(struct kvm_vcpu *vcpu)
 
 		switch (nr_pre_bits) {
 		case 7:
-			write_gicreg(cpu_if->vgic_ap0r[3], ICH_AP0R3_EL2);
-			write_gicreg(cpu_if->vgic_ap0r[2], ICH_AP0R2_EL2);
+			__vgic_v3_write_ap0rn(cpu_if->vgic_ap0r[3], 3);
+			__vgic_v3_write_ap0rn(cpu_if->vgic_ap0r[2], 2);
 		case 6:
-			write_gicreg(cpu_if->vgic_ap0r[1], ICH_AP0R1_EL2);
+			__vgic_v3_write_ap0rn(cpu_if->vgic_ap0r[1], 1);
 		default:
-			write_gicreg(cpu_if->vgic_ap0r[0], ICH_AP0R0_EL2);
+			__vgic_v3_write_ap0rn(cpu_if->vgic_ap0r[0], 0);
 		}
 
 		switch (nr_pre_bits) {
 		case 7:
-			write_gicreg(cpu_if->vgic_ap1r[3], ICH_AP1R3_EL2);
-			write_gicreg(cpu_if->vgic_ap1r[2], ICH_AP1R2_EL2);
+			__vgic_v3_write_ap1rn(cpu_if->vgic_ap1r[3], 3);
+			__vgic_v3_write_ap1rn(cpu_if->vgic_ap1r[2], 2);
 		case 6:
-			write_gicreg(cpu_if->vgic_ap1r[1], ICH_AP1R1_EL2);
+			__vgic_v3_write_ap1rn(cpu_if->vgic_ap1r[1], 1);
 		default:
-			write_gicreg(cpu_if->vgic_ap1r[0], ICH_AP1R0_EL2);
+			__vgic_v3_write_ap1rn(cpu_if->vgic_ap1r[0], 0);
 		}
 
 		for (i = 0; i < used_lrs; i++)
 			__gic_v3_set_lr(cpu_if->vgic_lr[i], i);
+	} else {
+		/*
+		 * If we need to trap system registers, we must write
+		 * ICH_HCR_EL2 anyway, even if no interrupts are being
+		 * injected,
+		 */
+		if (static_branch_unlikely(&vgic_v3_cpuif_trap))
+			write_gicreg(cpu_if->vgic_hcr, ICH_HCR_EL2);
 	}
 
 	/*
@@ -287,3 +384,697 @@ void __hyp_text __vgic_v3_write_vmcr(u32 vmcr)
 {
 	write_gicreg(vmcr, ICH_VMCR_EL2);
 }
+
+#ifdef CONFIG_ARM64
+
+static int __hyp_text __vgic_v3_bpr_min(void)
+{
+	/* See Pseudocode for VPriorityGroup */
+	return 8 - vtr_to_nr_pre_bits(read_gicreg(ICH_VTR_EL2));
+}
+
+static int __hyp_text __vgic_v3_get_group(struct kvm_vcpu *vcpu)
+{
+	u32 esr = kvm_vcpu_get_hsr(vcpu);
+	u8 crm = (esr & ESR_ELx_SYS64_ISS_CRM_MASK) >> ESR_ELx_SYS64_ISS_CRM_SHIFT;
+
+	return crm != 8;
+}
+
+#define GICv3_IDLE_PRIORITY	0xff
+
+static int __hyp_text __vgic_v3_highest_priority_lr(struct kvm_vcpu *vcpu,
+						    u32 vmcr,
+						    u64 *lr_val)
+{
+	unsigned int used_lrs = vcpu->arch.vgic_cpu.used_lrs;
+	u8 priority = GICv3_IDLE_PRIORITY;
+	int i, lr = -1;
+
+	for (i = 0; i < used_lrs; i++) {
+		u64 val = __gic_v3_get_lr(i);
+		u8 lr_prio = (val & ICH_LR_PRIORITY_MASK) >> ICH_LR_PRIORITY_SHIFT;
+
+		/* Not pending in the state? */
+		if ((val & ICH_LR_STATE) != ICH_LR_PENDING_BIT)
+			continue;
+
+		/* Group-0 interrupt, but Group-0 disabled? */
+		if (!(val & ICH_LR_GROUP) && !(vmcr & ICH_VMCR_ENG0_MASK))
+			continue;
+
+		/* Group-1 interrupt, but Group-1 disabled? */
+		if ((val & ICH_LR_GROUP) && !(vmcr & ICH_VMCR_ENG1_MASK))
+			continue;
+
+		/* Not the highest priority? */
+		if (lr_prio >= priority)
+			continue;
+
+		/* This is a candidate */
+		priority = lr_prio;
+		*lr_val = val;
+		lr = i;
+	}
+
+	if (lr == -1)
+		*lr_val = ICC_IAR1_EL1_SPURIOUS;
+
+	return lr;
+}
+
+static int __hyp_text __vgic_v3_find_active_lr(struct kvm_vcpu *vcpu,
+					       int intid, u64 *lr_val)
+{
+	unsigned int used_lrs = vcpu->arch.vgic_cpu.used_lrs;
+	int i;
+
+	for (i = 0; i < used_lrs; i++) {
+		u64 val = __gic_v3_get_lr(i);
+
+		if ((val & ICH_LR_VIRTUAL_ID_MASK) == intid &&
+		    (val & ICH_LR_ACTIVE_BIT)) {
+			*lr_val = val;
+			return i;
+		}
+	}
+
+	*lr_val = ICC_IAR1_EL1_SPURIOUS;
+	return -1;
+}
+
+static int __hyp_text __vgic_v3_get_highest_active_priority(void)
+{
+	u8 nr_apr_regs = vtr_to_nr_apr_regs(read_gicreg(ICH_VTR_EL2));
+	u32 hap = 0;
+	int i;
+
+	for (i = 0; i < nr_apr_regs; i++) {
+		u32 val;
+
+		/*
+		 * The ICH_AP0Rn_EL2 and ICH_AP1Rn_EL2 registers
+		 * contain the active priority levels for this VCPU
+		 * for the maximum number of supported priority
+		 * levels, and we return the full priority level only
+		 * if the BPR is programmed to its minimum, otherwise
+		 * we return a combination of the priority level and
+		 * subpriority, as determined by the setting of the
+		 * BPR, but without the full subpriority.
+		 */
+		val  = __vgic_v3_read_ap0rn(i);
+		val |= __vgic_v3_read_ap1rn(i);
+		if (!val) {
+			hap += 32;
+			continue;
+		}
+
+		return (hap + __ffs(val)) << __vgic_v3_bpr_min();
+	}
+
+	return GICv3_IDLE_PRIORITY;
+}
+
+static unsigned int __hyp_text __vgic_v3_get_bpr0(u32 vmcr)
+{
+	return (vmcr & ICH_VMCR_BPR0_MASK) >> ICH_VMCR_BPR0_SHIFT;
+}
+
+static unsigned int __hyp_text __vgic_v3_get_bpr1(u32 vmcr)
+{
+	unsigned int bpr;
+
+	if (vmcr & ICH_VMCR_CBPR_MASK) {
+		bpr = __vgic_v3_get_bpr0(vmcr);
+		if (bpr < 7)
+			bpr++;
+	} else {
+		bpr = (vmcr & ICH_VMCR_BPR1_MASK) >> ICH_VMCR_BPR1_SHIFT;
+	}
+
+	return bpr;
+}
+
+/*
+ * Convert a priority to a preemption level, taking the relevant BPR
+ * into account by zeroing the sub-priority bits.
+ */
+static u8 __hyp_text __vgic_v3_pri_to_pre(u8 pri, u32 vmcr, int grp)
+{
+	unsigned int bpr;
+
+	if (!grp)
+		bpr = __vgic_v3_get_bpr0(vmcr) + 1;
+	else
+		bpr = __vgic_v3_get_bpr1(vmcr);
+
+	return pri & (GENMASK(7, 0) << bpr);
+}
+
+/*
+ * The priority value is independent of any of the BPR values, so we
+ * normalize it using the minumal BPR value. This guarantees that no
+ * matter what the guest does with its BPR, we can always set/get the
+ * same value of a priority.
+ */
+static void __hyp_text __vgic_v3_set_active_priority(u8 pri, u32 vmcr, int grp)
+{
+	u8 pre, ap;
+	u32 val;
+	int apr;
+
+	pre = __vgic_v3_pri_to_pre(pri, vmcr, grp);
+	ap = pre >> __vgic_v3_bpr_min();
+	apr = ap / 32;
+
+	if (!grp) {
+		val = __vgic_v3_read_ap0rn(apr);
+		__vgic_v3_write_ap0rn(val | BIT(ap % 32), apr);
+	} else {
+		val = __vgic_v3_read_ap1rn(apr);
+		__vgic_v3_write_ap1rn(val | BIT(ap % 32), apr);
+	}
+}
+
+static int __hyp_text __vgic_v3_clear_highest_active_priority(void)
+{
+	u8 nr_apr_regs = vtr_to_nr_apr_regs(read_gicreg(ICH_VTR_EL2));
+	u32 hap = 0;
+	int i;
+
+	for (i = 0; i < nr_apr_regs; i++) {
+		u32 ap0, ap1;
+		int c0, c1;
+
+		ap0 = __vgic_v3_read_ap0rn(i);
+		ap1 = __vgic_v3_read_ap1rn(i);
+		if (!ap0 && !ap1) {
+			hap += 32;
+			continue;
+		}
+
+		c0 = ap0 ? __ffs(ap0) : 32;
+		c1 = ap1 ? __ffs(ap1) : 32;
+
+		/* Always clear the LSB, which is the highest priority */
+		if (c0 < c1) {
+			ap0 &= ~BIT(c0);
+			__vgic_v3_write_ap0rn(ap0, i);
+			hap += c0;
+		} else {
+			ap1 &= ~BIT(c1);
+			__vgic_v3_write_ap1rn(ap1, i);
+			hap += c1;
+		}
+
+		/* Rescale to 8 bits of priority */
+		return hap << __vgic_v3_bpr_min();
+	}
+
+	return GICv3_IDLE_PRIORITY;
+}
+
+static void __hyp_text __vgic_v3_read_iar(struct kvm_vcpu *vcpu, u32 vmcr, int rt)
+{
+	u64 lr_val;
+	u8 lr_prio, pmr;
+	int lr, grp;
+
+	grp = __vgic_v3_get_group(vcpu);
+
+	lr = __vgic_v3_highest_priority_lr(vcpu, vmcr, &lr_val);
+	if (lr < 0)
+		goto spurious;
+
+	if (grp != !!(lr_val & ICH_LR_GROUP))
+		goto spurious;
+
+	pmr = (vmcr & ICH_VMCR_PMR_MASK) >> ICH_VMCR_PMR_SHIFT;
+	lr_prio = (lr_val & ICH_LR_PRIORITY_MASK) >> ICH_LR_PRIORITY_SHIFT;
+	if (pmr <= lr_prio)
+		goto spurious;
+
+	if (__vgic_v3_get_highest_active_priority() <= __vgic_v3_pri_to_pre(lr_prio, vmcr, grp))
+		goto spurious;
+
+	lr_val &= ~ICH_LR_STATE;
+	/* No active state for LPIs */
+	if ((lr_val & ICH_LR_VIRTUAL_ID_MASK) <= VGIC_MAX_SPI)
+		lr_val |= ICH_LR_ACTIVE_BIT;
+	__gic_v3_set_lr(lr_val, lr);
+	__vgic_v3_set_active_priority(lr_prio, vmcr, grp);
+	vcpu_set_reg(vcpu, rt, lr_val & ICH_LR_VIRTUAL_ID_MASK);
+	return;
+
+spurious:
+	vcpu_set_reg(vcpu, rt, ICC_IAR1_EL1_SPURIOUS);
+}
+
+static void __hyp_text __vgic_v3_clear_active_lr(int lr, u64 lr_val)
+{
+	lr_val &= ~ICH_LR_ACTIVE_BIT;
+	if (lr_val & ICH_LR_HW) {
+		u32 pid;
+
+		pid = (lr_val & ICH_LR_PHYS_ID_MASK) >> ICH_LR_PHYS_ID_SHIFT;
+		gic_write_dir(pid);
+	}
+
+	__gic_v3_set_lr(lr_val, lr);
+}
+
+static void __hyp_text __vgic_v3_bump_eoicount(void)
+{
+	u32 hcr;
+
+	hcr = read_gicreg(ICH_HCR_EL2);
+	hcr += 1 << ICH_HCR_EOIcount_SHIFT;
+	write_gicreg(hcr, ICH_HCR_EL2);
+}
+
+static void __hyp_text __vgic_v3_write_dir(struct kvm_vcpu *vcpu,
+					   u32 vmcr, int rt)
+{
+	u32 vid = vcpu_get_reg(vcpu, rt);
+	u64 lr_val;
+	int lr;
+
+	/* EOImode == 0, nothing to be done here */
+	if (!(vmcr & ICH_VMCR_EOIM_MASK))
+		return;
+
+	/* No deactivate to be performed on an LPI */
+	if (vid >= VGIC_MIN_LPI)
+		return;
+
+	lr = __vgic_v3_find_active_lr(vcpu, vid, &lr_val);
+	if (lr == -1) {
+		__vgic_v3_bump_eoicount();
+		return;
+	}
+
+	__vgic_v3_clear_active_lr(lr, lr_val);
+}
+
+static void __hyp_text __vgic_v3_write_eoir(struct kvm_vcpu *vcpu, u32 vmcr, int rt)
+{
+	u32 vid = vcpu_get_reg(vcpu, rt);
+	u64 lr_val;
+	u8 lr_prio, act_prio;
+	int lr, grp;
+
+	grp = __vgic_v3_get_group(vcpu);
+
+	/* Drop priority in any case */
+	act_prio = __vgic_v3_clear_highest_active_priority();
+
+	/* If EOIing an LPI, no deactivate to be performed */
+	if (vid >= VGIC_MIN_LPI)
+		return;
+
+	/* EOImode == 1, nothing to be done here */
+	if (vmcr & ICH_VMCR_EOIM_MASK)
+		return;
+
+	lr = __vgic_v3_find_active_lr(vcpu, vid, &lr_val);
+	if (lr == -1) {
+		__vgic_v3_bump_eoicount();
+		return;
+	}
+
+	lr_prio = (lr_val & ICH_LR_PRIORITY_MASK) >> ICH_LR_PRIORITY_SHIFT;
+
+	/* If priorities or group do not match, the guest has fscked-up. */
+	if (grp != !!(lr_val & ICH_LR_GROUP) ||
+	    __vgic_v3_pri_to_pre(lr_prio, vmcr, grp) != act_prio)
+		return;
+
+	/* Let's now perform the deactivation */
+	__vgic_v3_clear_active_lr(lr, lr_val);
+}
+
+static void __hyp_text __vgic_v3_read_igrpen0(struct kvm_vcpu *vcpu, u32 vmcr, int rt)
+{
+	vcpu_set_reg(vcpu, rt, !!(vmcr & ICH_VMCR_ENG0_MASK));
+}
+
+static void __hyp_text __vgic_v3_read_igrpen1(struct kvm_vcpu *vcpu, u32 vmcr, int rt)
+{
+	vcpu_set_reg(vcpu, rt, !!(vmcr & ICH_VMCR_ENG1_MASK));
+}
+
+static void __hyp_text __vgic_v3_write_igrpen0(struct kvm_vcpu *vcpu, u32 vmcr, int rt)
+{
+	u64 val = vcpu_get_reg(vcpu, rt);
+
+	if (val & 1)
+		vmcr |= ICH_VMCR_ENG0_MASK;
+	else
+		vmcr &= ~ICH_VMCR_ENG0_MASK;
+
+	__vgic_v3_write_vmcr(vmcr);
+}
+
+static void __hyp_text __vgic_v3_write_igrpen1(struct kvm_vcpu *vcpu, u32 vmcr, int rt)
+{
+	u64 val = vcpu_get_reg(vcpu, rt);
+
+	if (val & 1)
+		vmcr |= ICH_VMCR_ENG1_MASK;
+	else
+		vmcr &= ~ICH_VMCR_ENG1_MASK;
+
+	__vgic_v3_write_vmcr(vmcr);
+}
+
+static void __hyp_text __vgic_v3_read_bpr0(struct kvm_vcpu *vcpu, u32 vmcr, int rt)
+{
+	vcpu_set_reg(vcpu, rt, __vgic_v3_get_bpr0(vmcr));
+}
+
+static void __hyp_text __vgic_v3_read_bpr1(struct kvm_vcpu *vcpu, u32 vmcr, int rt)
+{
+	vcpu_set_reg(vcpu, rt, __vgic_v3_get_bpr1(vmcr));
+}
+
+static void __hyp_text __vgic_v3_write_bpr0(struct kvm_vcpu *vcpu, u32 vmcr, int rt)
+{
+	u64 val = vcpu_get_reg(vcpu, rt);
+	u8 bpr_min = __vgic_v3_bpr_min() - 1;
+
+	/* Enforce BPR limiting */
+	if (val < bpr_min)
+		val = bpr_min;
+
+	val <<= ICH_VMCR_BPR0_SHIFT;
+	val &= ICH_VMCR_BPR0_MASK;
+	vmcr &= ~ICH_VMCR_BPR0_MASK;
+	vmcr |= val;
+
+	__vgic_v3_write_vmcr(vmcr);
+}
+
+static void __hyp_text __vgic_v3_write_bpr1(struct kvm_vcpu *vcpu, u32 vmcr, int rt)
+{
+	u64 val = vcpu_get_reg(vcpu, rt);
+	u8 bpr_min = __vgic_v3_bpr_min();
+
+	if (vmcr & ICH_VMCR_CBPR_MASK)
+		return;
+
+	/* Enforce BPR limiting */
+	if (val < bpr_min)
+		val = bpr_min;
+
+	val <<= ICH_VMCR_BPR1_SHIFT;
+	val &= ICH_VMCR_BPR1_MASK;
+	vmcr &= ~ICH_VMCR_BPR1_MASK;
+	vmcr |= val;
+
+	__vgic_v3_write_vmcr(vmcr);
+}
+
+static void __hyp_text __vgic_v3_read_apxrn(struct kvm_vcpu *vcpu, int rt, int n)
+{
+	u32 val;
+
+	if (!__vgic_v3_get_group(vcpu))
+		val = __vgic_v3_read_ap0rn(n);
+	else
+		val = __vgic_v3_read_ap1rn(n);
+
+	vcpu_set_reg(vcpu, rt, val);
+}
+
+static void __hyp_text __vgic_v3_write_apxrn(struct kvm_vcpu *vcpu, int rt, int n)
+{
+	u32 val = vcpu_get_reg(vcpu, rt);
+
+	if (!__vgic_v3_get_group(vcpu))
+		__vgic_v3_write_ap0rn(val, n);
+	else
+		__vgic_v3_write_ap1rn(val, n);
+}
+
+static void __hyp_text __vgic_v3_read_apxr0(struct kvm_vcpu *vcpu,
+					    u32 vmcr, int rt)
+{
+	__vgic_v3_read_apxrn(vcpu, rt, 0);
+}
+
+static void __hyp_text __vgic_v3_read_apxr1(struct kvm_vcpu *vcpu,
+					    u32 vmcr, int rt)
+{
+	__vgic_v3_read_apxrn(vcpu, rt, 1);
+}
+
+static void __hyp_text __vgic_v3_read_apxr2(struct kvm_vcpu *vcpu,
+					    u32 vmcr, int rt)
+{
+	__vgic_v3_read_apxrn(vcpu, rt, 2);
+}
+
+static void __hyp_text __vgic_v3_read_apxr3(struct kvm_vcpu *vcpu,
+					    u32 vmcr, int rt)
+{
+	__vgic_v3_read_apxrn(vcpu, rt, 3);
+}
+
+static void __hyp_text __vgic_v3_write_apxr0(struct kvm_vcpu *vcpu,
+					     u32 vmcr, int rt)
+{
+	__vgic_v3_write_apxrn(vcpu, rt, 0);
+}
+
+static void __hyp_text __vgic_v3_write_apxr1(struct kvm_vcpu *vcpu,
+					     u32 vmcr, int rt)
+{
+	__vgic_v3_write_apxrn(vcpu, rt, 1);
+}
+
+static void __hyp_text __vgic_v3_write_apxr2(struct kvm_vcpu *vcpu,
+					     u32 vmcr, int rt)
+{
+	__vgic_v3_write_apxrn(vcpu, rt, 2);
+}
+
+static void __hyp_text __vgic_v3_write_apxr3(struct kvm_vcpu *vcpu,
+					     u32 vmcr, int rt)
+{
+	__vgic_v3_write_apxrn(vcpu, rt, 3);
+}
+
+static void __hyp_text __vgic_v3_read_hppir(struct kvm_vcpu *vcpu,
+					    u32 vmcr, int rt)
+{
+	u64 lr_val;
+	int lr, lr_grp, grp;
+
+	grp = __vgic_v3_get_group(vcpu);
+
+	lr = __vgic_v3_highest_priority_lr(vcpu, vmcr, &lr_val);
+	if (lr == -1)
+		goto spurious;
+
+	lr_grp = !!(lr_val & ICH_LR_GROUP);
+	if (lr_grp != grp)
+		lr_val = ICC_IAR1_EL1_SPURIOUS;
+
+spurious:
+	vcpu_set_reg(vcpu, rt, lr_val & ICH_LR_VIRTUAL_ID_MASK);
+}
+
+static void __hyp_text __vgic_v3_read_pmr(struct kvm_vcpu *vcpu,
+					  u32 vmcr, int rt)
+{
+	vmcr &= ICH_VMCR_PMR_MASK;
+	vmcr >>= ICH_VMCR_PMR_SHIFT;
+	vcpu_set_reg(vcpu, rt, vmcr);
+}
+
+static void __hyp_text __vgic_v3_write_pmr(struct kvm_vcpu *vcpu,
+					   u32 vmcr, int rt)
+{
+	u32 val = vcpu_get_reg(vcpu, rt);
+
+	val <<= ICH_VMCR_PMR_SHIFT;
+	val &= ICH_VMCR_PMR_MASK;
+	vmcr &= ~ICH_VMCR_PMR_MASK;
+	vmcr |= val;
+
+	write_gicreg(vmcr, ICH_VMCR_EL2);
+}
+
+static void __hyp_text __vgic_v3_read_rpr(struct kvm_vcpu *vcpu,
+					  u32 vmcr, int rt)
+{
+	u32 val = __vgic_v3_get_highest_active_priority();
+	vcpu_set_reg(vcpu, rt, val);
+}
+
+static void __hyp_text __vgic_v3_read_ctlr(struct kvm_vcpu *vcpu,
+					   u32 vmcr, int rt)
+{
+	u32 vtr, val;
+
+	vtr = read_gicreg(ICH_VTR_EL2);
+	/* PRIbits */
+	val = ((vtr >> 29) & 7) << ICC_CTLR_EL1_PRI_BITS_SHIFT;
+	/* IDbits */
+	val |= ((vtr >> 23) & 7) << ICC_CTLR_EL1_ID_BITS_SHIFT;
+	/* SEIS */
+	val |= ((vtr >> 22) & 1) << ICC_CTLR_EL1_SEIS_SHIFT;
+	/* A3V */
+	val |= ((vtr >> 21) & 1) << ICC_CTLR_EL1_A3V_SHIFT;
+	/* EOImode */
+	val |= ((vmcr & ICH_VMCR_EOIM_MASK) >> ICH_VMCR_EOIM_SHIFT) << ICC_CTLR_EL1_EOImode_SHIFT;
+	/* CBPR */
+	val |= (vmcr & ICH_VMCR_CBPR_MASK) >> ICH_VMCR_CBPR_SHIFT;
+
+	vcpu_set_reg(vcpu, rt, val);
+}
+
+static void __hyp_text __vgic_v3_write_ctlr(struct kvm_vcpu *vcpu,
+					    u32 vmcr, int rt)
+{
+	u32 val = vcpu_get_reg(vcpu, rt);
+
+	if (val & ICC_CTLR_EL1_CBPR_MASK)
+		vmcr |= ICH_VMCR_CBPR_MASK;
+	else
+		vmcr &= ~ICH_VMCR_CBPR_MASK;
+
+	if (val & ICC_CTLR_EL1_EOImode_MASK)
+		vmcr |= ICH_VMCR_EOIM_MASK;
+	else
+		vmcr &= ~ICH_VMCR_EOIM_MASK;
+
+	write_gicreg(vmcr, ICH_VMCR_EL2);
+}
+
+int __hyp_text __vgic_v3_perform_cpuif_access(struct kvm_vcpu *vcpu)
+{
+	int rt;
+	u32 esr;
+	u32 vmcr;
+	void (*fn)(struct kvm_vcpu *, u32, int);
+	bool is_read;
+	u32 sysreg;
+
+	esr = kvm_vcpu_get_hsr(vcpu);
+	if (vcpu_mode_is_32bit(vcpu)) {
+		if (!kvm_condition_valid(vcpu))
+			return 1;
+
+		sysreg = esr_cp15_to_sysreg(esr);
+	} else {
+		sysreg = esr_sys64_to_sysreg(esr);
+	}
+
+	is_read = (esr & ESR_ELx_SYS64_ISS_DIR_MASK) == ESR_ELx_SYS64_ISS_DIR_READ;
+
+	switch (sysreg) {
+	case SYS_ICC_IAR0_EL1:
+	case SYS_ICC_IAR1_EL1:
+		if (unlikely(!is_read))
+			return 0;
+		fn = __vgic_v3_read_iar;
+		break;
+	case SYS_ICC_EOIR0_EL1:
+	case SYS_ICC_EOIR1_EL1:
+		if (unlikely(is_read))
+			return 0;
+		fn = __vgic_v3_write_eoir;
+		break;
+	case SYS_ICC_IGRPEN1_EL1:
+		if (is_read)
+			fn = __vgic_v3_read_igrpen1;
+		else
+			fn = __vgic_v3_write_igrpen1;
+		break;
+	case SYS_ICC_BPR1_EL1:
+		if (is_read)
+			fn = __vgic_v3_read_bpr1;
+		else
+			fn = __vgic_v3_write_bpr1;
+		break;
+	case SYS_ICC_AP0Rn_EL1(0):
+	case SYS_ICC_AP1Rn_EL1(0):
+		if (is_read)
+			fn = __vgic_v3_read_apxr0;
+		else
+			fn = __vgic_v3_write_apxr0;
+		break;
+	case SYS_ICC_AP0Rn_EL1(1):
+	case SYS_ICC_AP1Rn_EL1(1):
+		if (is_read)
+			fn = __vgic_v3_read_apxr1;
+		else
+			fn = __vgic_v3_write_apxr1;
+		break;
+	case SYS_ICC_AP0Rn_EL1(2):
+	case SYS_ICC_AP1Rn_EL1(2):
+		if (is_read)
+			fn = __vgic_v3_read_apxr2;
+		else
+			fn = __vgic_v3_write_apxr2;
+		break;
+	case SYS_ICC_AP0Rn_EL1(3):
+	case SYS_ICC_AP1Rn_EL1(3):
+		if (is_read)
+			fn = __vgic_v3_read_apxr3;
+		else
+			fn = __vgic_v3_write_apxr3;
+		break;
+	case SYS_ICC_HPPIR0_EL1:
+	case SYS_ICC_HPPIR1_EL1:
+		if (unlikely(!is_read))
+			return 0;
+		fn = __vgic_v3_read_hppir;
+		break;
+	case SYS_ICC_IGRPEN0_EL1:
+		if (is_read)
+			fn = __vgic_v3_read_igrpen0;
+		else
+			fn = __vgic_v3_write_igrpen0;
+		break;
+	case SYS_ICC_BPR0_EL1:
+		if (is_read)
+			fn = __vgic_v3_read_bpr0;
+		else
+			fn = __vgic_v3_write_bpr0;
+		break;
+	case SYS_ICC_DIR_EL1:
+		if (unlikely(is_read))
+			return 0;
+		fn = __vgic_v3_write_dir;
+		break;
+	case SYS_ICC_RPR_EL1:
+		if (unlikely(!is_read))
+			return 0;
+		fn = __vgic_v3_read_rpr;
+		break;
+	case SYS_ICC_CTLR_EL1:
+		if (is_read)
+			fn = __vgic_v3_read_ctlr;
+		else
+			fn = __vgic_v3_write_ctlr;
+		break;
+	case SYS_ICC_PMR_EL1:
+		if (is_read)
+			fn = __vgic_v3_read_pmr;
+		else
+			fn = __vgic_v3_write_pmr;
+		break;
+	default:
+		return 0;
+	}
+
+	vmcr = __vgic_v3_read_vmcr();
+	rt = kvm_vcpu_sys_get_rt(vcpu);
+	fn(vcpu, vmcr, rt);
+
+	return 1;
+}
+
+#endif
diff --git a/virt/kvm/arm/mmu.c b/virt/kvm/arm/mmu.c
index a2d63247d1bb..f2d5b6cf06ae 100644
--- a/virt/kvm/arm/mmu.c
+++ b/virt/kvm/arm/mmu.c
@@ -20,6 +20,7 @@
 #include <linux/kvm_host.h>
 #include <linux/io.h>
 #include <linux/hugetlb.h>
+#include <linux/sched/signal.h>
 #include <trace/events/kvm.h>
 #include <asm/pgalloc.h>
 #include <asm/cacheflush.h>
@@ -879,6 +880,9 @@ static pmd_t *stage2_get_pmd(struct kvm *kvm, struct kvm_mmu_memory_cache *cache
 	pmd_t *pmd;
 
 	pud = stage2_get_pud(kvm, cache, addr);
+	if (!pud)
+		return NULL;
+
 	if (stage2_pud_none(*pud)) {
 		if (!cache)
 			return NULL;
@@ -1258,6 +1262,24 @@ static void coherent_cache_guest_page(struct kvm_vcpu *vcpu, kvm_pfn_t pfn,
 	__coherent_cache_guest_page(vcpu, pfn, size);
 }
 
+static void kvm_send_hwpoison_signal(unsigned long address,
+				     struct vm_area_struct *vma)
+{
+	siginfo_t info;
+
+	info.si_signo   = SIGBUS;
+	info.si_errno   = 0;
+	info.si_code    = BUS_MCEERR_AR;
+	info.si_addr    = (void __user *)address;
+
+	if (is_vm_hugetlb_page(vma))
+		info.si_addr_lsb = huge_page_shift(hstate_vma(vma));
+	else
+		info.si_addr_lsb = PAGE_SHIFT;
+
+	send_sig_info(SIGBUS, &info, current);
+}
+
 static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 			  struct kvm_memory_slot *memslot, unsigned long hva,
 			  unsigned long fault_status)
@@ -1327,6 +1349,10 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 	smp_rmb();
 
 	pfn = gfn_to_pfn_prot(kvm, gfn, write_fault, &writable);
+	if (pfn == KVM_PFN_ERR_HWPOISON) {
+		kvm_send_hwpoison_signal(hva, vma);
+		return 0;
+	}
 	if (is_error_noslot_pfn(pfn))
 		return -EFAULT;
 
diff --git a/virt/kvm/arm/pmu.c b/virt/kvm/arm/pmu.c
index 4b43e7f3b158..fc8a723ff387 100644
--- a/virt/kvm/arm/pmu.c
+++ b/virt/kvm/arm/pmu.c
@@ -203,6 +203,24 @@ static u64 kvm_pmu_overflow_status(struct kvm_vcpu *vcpu)
 	return reg;
 }
 
+static void kvm_pmu_check_overflow(struct kvm_vcpu *vcpu)
+{
+	struct kvm_pmu *pmu = &vcpu->arch.pmu;
+	bool overflow = !!kvm_pmu_overflow_status(vcpu);
+
+	if (pmu->irq_level == overflow)
+		return;
+
+	pmu->irq_level = overflow;
+
+	if (likely(irqchip_in_kernel(vcpu->kvm))) {
+		int ret = kvm_vgic_inject_irq(vcpu->kvm, vcpu->vcpu_id,
+					      pmu->irq_num, overflow,
+					      &vcpu->arch.pmu);
+		WARN_ON(ret);
+	}
+}
+
 /**
  * kvm_pmu_overflow_set - set PMU overflow interrupt
  * @vcpu: The vcpu pointer
@@ -210,37 +228,18 @@ static u64 kvm_pmu_overflow_status(struct kvm_vcpu *vcpu)
  */
 void kvm_pmu_overflow_set(struct kvm_vcpu *vcpu, u64 val)
 {
-	u64 reg;
-
 	if (val == 0)
 		return;
 
 	vcpu_sys_reg(vcpu, PMOVSSET_EL0) |= val;
-	reg = kvm_pmu_overflow_status(vcpu);
-	if (reg != 0)
-		kvm_vcpu_kick(vcpu);
+	kvm_pmu_check_overflow(vcpu);
 }
 
 static void kvm_pmu_update_state(struct kvm_vcpu *vcpu)
 {
-	struct kvm_pmu *pmu = &vcpu->arch.pmu;
-	bool overflow;
-
 	if (!kvm_arm_pmu_v3_ready(vcpu))
 		return;
-
-	overflow = !!kvm_pmu_overflow_status(vcpu);
-	if (pmu->irq_level == overflow)
-		return;
-
-	pmu->irq_level = overflow;
-
-	if (likely(irqchip_in_kernel(vcpu->kvm))) {
-		int ret;
-		ret = kvm_vgic_inject_irq(vcpu->kvm, vcpu->vcpu_id,
-					  pmu->irq_num, overflow);
-		WARN_ON(ret);
-	}
+	kvm_pmu_check_overflow(vcpu);
 }
 
 bool kvm_pmu_should_notify_user(struct kvm_vcpu *vcpu)
@@ -451,34 +450,74 @@ bool kvm_arm_support_pmu_v3(void)
 	return (perf_num_counters() > 0);
 }
 
-static int kvm_arm_pmu_v3_init(struct kvm_vcpu *vcpu)
+int kvm_arm_pmu_v3_enable(struct kvm_vcpu *vcpu)
 {
-	if (!kvm_arm_support_pmu_v3())
-		return -ENODEV;
+	if (!vcpu->arch.pmu.created)
+		return 0;
 
 	/*
-	 * We currently require an in-kernel VGIC to use the PMU emulation,
-	 * because we do not support forwarding PMU overflow interrupts to
-	 * userspace yet.
+	 * A valid interrupt configuration for the PMU is either to have a
+	 * properly configured interrupt number and using an in-kernel
+	 * irqchip, or to not have an in-kernel GIC and not set an IRQ.
 	 */
-	if (!irqchip_in_kernel(vcpu->kvm) || !vgic_initialized(vcpu->kvm))
+	if (irqchip_in_kernel(vcpu->kvm)) {
+		int irq = vcpu->arch.pmu.irq_num;
+		if (!kvm_arm_pmu_irq_initialized(vcpu))
+			return -EINVAL;
+
+		/*
+		 * If we are using an in-kernel vgic, at this point we know
+		 * the vgic will be initialized, so we can check the PMU irq
+		 * number against the dimensions of the vgic and make sure
+		 * it's valid.
+		 */
+		if (!irq_is_ppi(irq) && !vgic_valid_spi(vcpu->kvm, irq))
+			return -EINVAL;
+	} else if (kvm_arm_pmu_irq_initialized(vcpu)) {
+		   return -EINVAL;
+	}
+
+	kvm_pmu_vcpu_reset(vcpu);
+	vcpu->arch.pmu.ready = true;
+
+	return 0;
+}
+
+static int kvm_arm_pmu_v3_init(struct kvm_vcpu *vcpu)
+{
+	if (!kvm_arm_support_pmu_v3())
 		return -ENODEV;
 
-	if (!test_bit(KVM_ARM_VCPU_PMU_V3, vcpu->arch.features) ||
-	    !kvm_arm_pmu_irq_initialized(vcpu))
+	if (!test_bit(KVM_ARM_VCPU_PMU_V3, vcpu->arch.features))
 		return -ENXIO;
 
-	if (kvm_arm_pmu_v3_ready(vcpu))
+	if (vcpu->arch.pmu.created)
 		return -EBUSY;
 
-	kvm_pmu_vcpu_reset(vcpu);
-	vcpu->arch.pmu.ready = true;
+	if (irqchip_in_kernel(vcpu->kvm)) {
+		int ret;
+
+		/*
+		 * If using the PMU with an in-kernel virtual GIC
+		 * implementation, we require the GIC to be already
+		 * initialized when initializing the PMU.
+		 */
+		if (!vgic_initialized(vcpu->kvm))
+			return -ENODEV;
+
+		if (!kvm_arm_pmu_irq_initialized(vcpu))
+			return -ENXIO;
 
+		ret = kvm_vgic_set_owner(vcpu, vcpu->arch.pmu.irq_num,
+					 &vcpu->arch.pmu);
+		if (ret)
+			return ret;
+	}
+
+	vcpu->arch.pmu.created = true;
 	return 0;
 }
 
-#define irq_is_ppi(irq) ((irq) >= VGIC_NR_SGIS && (irq) < VGIC_NR_PRIVATE_IRQS)
-
 /*
  * For one VM the interrupt type must be same for each vcpu.
  * As a PPI, the interrupt number is the same for all vcpus,
@@ -512,6 +551,9 @@ int kvm_arm_pmu_v3_set_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr)
 		int __user *uaddr = (int __user *)(long)attr->addr;
 		int irq;
 
+		if (!irqchip_in_kernel(vcpu->kvm))
+			return -EINVAL;
+
 		if (!test_bit(KVM_ARM_VCPU_PMU_V3, vcpu->arch.features))
 			return -ENODEV;
 
@@ -519,7 +561,7 @@ int kvm_arm_pmu_v3_set_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr)
 			return -EFAULT;
 
 		/* The PMU overflow interrupt can be a PPI or a valid SPI. */
-		if (!(irq_is_ppi(irq) || vgic_valid_spi(vcpu->kvm, irq)))
+		if (!(irq_is_ppi(irq) || irq_is_spi(irq)))
 			return -EINVAL;
 
 		if (!pmu_irq_is_valid(vcpu->kvm, irq))
@@ -546,6 +588,9 @@ int kvm_arm_pmu_v3_get_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr)
 		int __user *uaddr = (int __user *)(long)attr->addr;
 		int irq;
 
+		if (!irqchip_in_kernel(vcpu->kvm))
+			return -EINVAL;
+
 		if (!test_bit(KVM_ARM_VCPU_PMU_V3, vcpu->arch.features))
 			return -ENODEV;
 
diff --git a/virt/kvm/arm/psci.c b/virt/kvm/arm/psci.c
index a08d7a93aebb..f1e363bab5e8 100644
--- a/virt/kvm/arm/psci.c
+++ b/virt/kvm/arm/psci.c
@@ -57,6 +57,7 @@ static unsigned long kvm_psci_vcpu_suspend(struct kvm_vcpu *vcpu)
 	 * for KVM will preserve the register state.
 	 */
 	kvm_vcpu_block(vcpu);
+	kvm_clear_request(KVM_REQ_UNHALT, vcpu);
 
 	return PSCI_RET_SUCCESS;
 }
@@ -64,6 +65,8 @@ static unsigned long kvm_psci_vcpu_suspend(struct kvm_vcpu *vcpu)
 static void kvm_psci_vcpu_off(struct kvm_vcpu *vcpu)
 {
 	vcpu->arch.power_off = true;
+	kvm_make_request(KVM_REQ_SLEEP, vcpu);
+	kvm_vcpu_kick(vcpu);
 }
 
 static unsigned long kvm_psci_vcpu_on(struct kvm_vcpu *source_vcpu)
@@ -178,10 +181,9 @@ static void kvm_prepare_system_event(struct kvm_vcpu *vcpu, u32 type)
 	 * after this call is handled and before the VCPUs have been
 	 * re-initialized.
 	 */
-	kvm_for_each_vcpu(i, tmp, vcpu->kvm) {
+	kvm_for_each_vcpu(i, tmp, vcpu->kvm)
 		tmp->arch.power_off = true;
-		kvm_vcpu_kick(tmp);
-	}
+	kvm_make_all_cpus_request(vcpu->kvm, KVM_REQ_SLEEP);
 
 	memset(&vcpu->run->system_event, 0, sizeof(vcpu->run->system_event));
 	vcpu->run->system_event.type = type;
diff --git a/virt/kvm/arm/vgic/vgic-irqfd.c b/virt/kvm/arm/vgic/vgic-irqfd.c
index f138ed2e9c63..b7baf581611a 100644
--- a/virt/kvm/arm/vgic/vgic-irqfd.c
+++ b/virt/kvm/arm/vgic/vgic-irqfd.c
@@ -34,7 +34,7 @@ static int vgic_irqfd_set_irq(struct kvm_kernel_irq_routing_entry *e,
 
 	if (!vgic_valid_spi(kvm, spi_id))
 		return -EINVAL;
-	return kvm_vgic_inject_irq(kvm, 0, spi_id, level);
+	return kvm_vgic_inject_irq(kvm, 0, spi_id, level, NULL);
 }
 
 /**
diff --git a/virt/kvm/arm/vgic/vgic-mmio-v2.c b/virt/kvm/arm/vgic/vgic-mmio-v2.c
index 0a4283ed9aa7..37522e65eb53 100644
--- a/virt/kvm/arm/vgic/vgic-mmio-v2.c
+++ b/virt/kvm/arm/vgic/vgic-mmio-v2.c
@@ -226,7 +226,13 @@ static unsigned long vgic_mmio_read_vcpuif(struct kvm_vcpu *vcpu,
 
 	switch (addr & 0xff) {
 	case GIC_CPU_CTRL:
-		val = vmcr.ctlr;
+		val = vmcr.grpen0 << GIC_CPU_CTRL_EnableGrp0_SHIFT;
+		val |= vmcr.grpen1 << GIC_CPU_CTRL_EnableGrp1_SHIFT;
+		val |= vmcr.ackctl << GIC_CPU_CTRL_AckCtl_SHIFT;
+		val |= vmcr.fiqen << GIC_CPU_CTRL_FIQEn_SHIFT;
+		val |= vmcr.cbpr << GIC_CPU_CTRL_CBPR_SHIFT;
+		val |= vmcr.eoim << GIC_CPU_CTRL_EOImodeNS_SHIFT;
+
 		break;
 	case GIC_CPU_PRIMASK:
 		/*
@@ -267,7 +273,13 @@ static void vgic_mmio_write_vcpuif(struct kvm_vcpu *vcpu,
 
 	switch (addr & 0xff) {
 	case GIC_CPU_CTRL:
-		vmcr.ctlr = val;
+		vmcr.grpen0 = !!(val & GIC_CPU_CTRL_EnableGrp0);
+		vmcr.grpen1 = !!(val & GIC_CPU_CTRL_EnableGrp1);
+		vmcr.ackctl = !!(val & GIC_CPU_CTRL_AckCtl);
+		vmcr.fiqen = !!(val & GIC_CPU_CTRL_FIQEn);
+		vmcr.cbpr = !!(val & GIC_CPU_CTRL_CBPR);
+		vmcr.eoim = !!(val & GIC_CPU_CTRL_EOImodeNS);
+
 		break;
 	case GIC_CPU_PRIMASK:
 		/*
@@ -296,34 +308,36 @@ static const struct vgic_register_region vgic_v2_dist_registers[] = {
 		vgic_mmio_read_v2_misc, vgic_mmio_write_v2_misc, 12,
 		VGIC_ACCESS_32bit),
 	REGISTER_DESC_WITH_BITS_PER_IRQ(GIC_DIST_IGROUP,
-		vgic_mmio_read_rao, vgic_mmio_write_wi, 1,
+		vgic_mmio_read_rao, vgic_mmio_write_wi, NULL, NULL, 1,
 		VGIC_ACCESS_32bit),
 	REGISTER_DESC_WITH_BITS_PER_IRQ(GIC_DIST_ENABLE_SET,
-		vgic_mmio_read_enable, vgic_mmio_write_senable, 1,
+		vgic_mmio_read_enable, vgic_mmio_write_senable, NULL, NULL, 1,
 		VGIC_ACCESS_32bit),
 	REGISTER_DESC_WITH_BITS_PER_IRQ(GIC_DIST_ENABLE_CLEAR,
-		vgic_mmio_read_enable, vgic_mmio_write_cenable, 1,
+		vgic_mmio_read_enable, vgic_mmio_write_cenable, NULL, NULL, 1,
 		VGIC_ACCESS_32bit),
 	REGISTER_DESC_WITH_BITS_PER_IRQ(GIC_DIST_PENDING_SET,
-		vgic_mmio_read_pending, vgic_mmio_write_spending, 1,
+		vgic_mmio_read_pending, vgic_mmio_write_spending, NULL, NULL, 1,
 		VGIC_ACCESS_32bit),
 	REGISTER_DESC_WITH_BITS_PER_IRQ(GIC_DIST_PENDING_CLEAR,
-		vgic_mmio_read_pending, vgic_mmio_write_cpending, 1,
+		vgic_mmio_read_pending, vgic_mmio_write_cpending, NULL, NULL, 1,
 		VGIC_ACCESS_32bit),
 	REGISTER_DESC_WITH_BITS_PER_IRQ(GIC_DIST_ACTIVE_SET,
-		vgic_mmio_read_active, vgic_mmio_write_sactive, 1,
+		vgic_mmio_read_active, vgic_mmio_write_sactive,
+		NULL, vgic_mmio_uaccess_write_sactive, 1,
 		VGIC_ACCESS_32bit),
 	REGISTER_DESC_WITH_BITS_PER_IRQ(GIC_DIST_ACTIVE_CLEAR,
-		vgic_mmio_read_active, vgic_mmio_write_cactive, 1,
+		vgic_mmio_read_active, vgic_mmio_write_cactive,
+		NULL, vgic_mmio_uaccess_write_cactive, 1,
 		VGIC_ACCESS_32bit),
 	REGISTER_DESC_WITH_BITS_PER_IRQ(GIC_DIST_PRI,
-		vgic_mmio_read_priority, vgic_mmio_write_priority, 8,
-		VGIC_ACCESS_32bit | VGIC_ACCESS_8bit),
+		vgic_mmio_read_priority, vgic_mmio_write_priority, NULL, NULL,
+		8, VGIC_ACCESS_32bit | VGIC_ACCESS_8bit),
 	REGISTER_DESC_WITH_BITS_PER_IRQ(GIC_DIST_TARGET,
-		vgic_mmio_read_target, vgic_mmio_write_target, 8,
+		vgic_mmio_read_target, vgic_mmio_write_target, NULL, NULL, 8,
 		VGIC_ACCESS_32bit | VGIC_ACCESS_8bit),
 	REGISTER_DESC_WITH_BITS_PER_IRQ(GIC_DIST_CONFIG,
-		vgic_mmio_read_config, vgic_mmio_write_config, 2,
+		vgic_mmio_read_config, vgic_mmio_write_config, NULL, NULL, 2,
 		VGIC_ACCESS_32bit),
 	REGISTER_DESC_WITH_LENGTH(GIC_DIST_SOFTINT,
 		vgic_mmio_read_raz, vgic_mmio_write_sgir, 4,
diff --git a/virt/kvm/arm/vgic/vgic-mmio-v3.c b/virt/kvm/arm/vgic/vgic-mmio-v3.c
index 201d5e2e973d..714fa3933546 100644
--- a/virt/kvm/arm/vgic/vgic-mmio-v3.c
+++ b/virt/kvm/arm/vgic/vgic-mmio-v3.c
@@ -456,11 +456,13 @@ static const struct vgic_register_region vgic_v3_dist_registers[] = {
 		vgic_mmio_read_raz, vgic_mmio_write_wi, 1,
 		VGIC_ACCESS_32bit),
 	REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_ISACTIVER,
-		vgic_mmio_read_active, vgic_mmio_write_sactive, NULL, NULL, 1,
+		vgic_mmio_read_active, vgic_mmio_write_sactive,
+		NULL, vgic_mmio_uaccess_write_sactive, 1,
 		VGIC_ACCESS_32bit),
 	REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_ICACTIVER,
-		vgic_mmio_read_active, vgic_mmio_write_cactive, NULL, NULL, 1,
-		VGIC_ACCESS_32bit),
+		vgic_mmio_read_active, vgic_mmio_write_cactive,
+		NULL, vgic_mmio_uaccess_write_cactive,
+		1, VGIC_ACCESS_32bit),
 	REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_IPRIORITYR,
 		vgic_mmio_read_priority, vgic_mmio_write_priority, NULL, NULL,
 		8, VGIC_ACCESS_32bit | VGIC_ACCESS_8bit),
@@ -526,12 +528,14 @@ static const struct vgic_register_region vgic_v3_sgibase_registers[] = {
 		vgic_mmio_read_pending, vgic_mmio_write_cpending,
 		vgic_mmio_read_raz, vgic_mmio_write_wi, 4,
 		VGIC_ACCESS_32bit),
-	REGISTER_DESC_WITH_LENGTH(GICR_ISACTIVER0,
-		vgic_mmio_read_active, vgic_mmio_write_sactive, 4,
-		VGIC_ACCESS_32bit),
-	REGISTER_DESC_WITH_LENGTH(GICR_ICACTIVER0,
-		vgic_mmio_read_active, vgic_mmio_write_cactive, 4,
-		VGIC_ACCESS_32bit),
+	REGISTER_DESC_WITH_LENGTH_UACCESS(GICR_ISACTIVER0,
+		vgic_mmio_read_active, vgic_mmio_write_sactive,
+		NULL, vgic_mmio_uaccess_write_sactive,
+		4, VGIC_ACCESS_32bit),
+	REGISTER_DESC_WITH_LENGTH_UACCESS(GICR_ICACTIVER0,
+		vgic_mmio_read_active, vgic_mmio_write_cactive,
+		NULL, vgic_mmio_uaccess_write_cactive,
+		4, VGIC_ACCESS_32bit),
 	REGISTER_DESC_WITH_LENGTH(GICR_IPRIORITYR0,
 		vgic_mmio_read_priority, vgic_mmio_write_priority, 32,
 		VGIC_ACCESS_32bit | VGIC_ACCESS_8bit),
diff --git a/virt/kvm/arm/vgic/vgic-mmio.c b/virt/kvm/arm/vgic/vgic-mmio.c
index 1c17b2a2f105..c1e4bdd66131 100644
--- a/virt/kvm/arm/vgic/vgic-mmio.c
+++ b/virt/kvm/arm/vgic/vgic-mmio.c
@@ -231,56 +231,94 @@ static void vgic_mmio_change_active(struct kvm_vcpu *vcpu, struct vgic_irq *irq,
  * be migrated while we don't hold the IRQ locks and we don't want to be
  * chasing moving targets.
  *
- * For private interrupts, we only have to make sure the single and only VCPU
- * that can potentially queue the IRQ is stopped.
+ * For private interrupts we don't have to do anything because userspace
+ * accesses to the VGIC state already require all VCPUs to be stopped, and
+ * only the VCPU itself can modify its private interrupts active state, which
+ * guarantees that the VCPU is not running.
  */
 static void vgic_change_active_prepare(struct kvm_vcpu *vcpu, u32 intid)
 {
-	if (intid < VGIC_NR_PRIVATE_IRQS)
-		kvm_arm_halt_vcpu(vcpu);
-	else
+	if (intid > VGIC_NR_PRIVATE_IRQS)
 		kvm_arm_halt_guest(vcpu->kvm);
 }
 
 /* See vgic_change_active_prepare */
 static void vgic_change_active_finish(struct kvm_vcpu *vcpu, u32 intid)
 {
-	if (intid < VGIC_NR_PRIVATE_IRQS)
-		kvm_arm_resume_vcpu(vcpu);
-	else
+	if (intid > VGIC_NR_PRIVATE_IRQS)
 		kvm_arm_resume_guest(vcpu->kvm);
 }
 
-void vgic_mmio_write_cactive(struct kvm_vcpu *vcpu,
-			     gpa_t addr, unsigned int len,
-			     unsigned long val)
+static void __vgic_mmio_write_cactive(struct kvm_vcpu *vcpu,
+				      gpa_t addr, unsigned int len,
+				      unsigned long val)
 {
 	u32 intid = VGIC_ADDR_TO_INTID(addr, 1);
 	int i;
 
-	vgic_change_active_prepare(vcpu, intid);
 	for_each_set_bit(i, &val, len * 8) {
 		struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
 		vgic_mmio_change_active(vcpu, irq, false);
 		vgic_put_irq(vcpu->kvm, irq);
 	}
-	vgic_change_active_finish(vcpu, intid);
 }
 
-void vgic_mmio_write_sactive(struct kvm_vcpu *vcpu,
+void vgic_mmio_write_cactive(struct kvm_vcpu *vcpu,
 			     gpa_t addr, unsigned int len,
 			     unsigned long val)
 {
 	u32 intid = VGIC_ADDR_TO_INTID(addr, 1);
-	int i;
 
+	mutex_lock(&vcpu->kvm->lock);
 	vgic_change_active_prepare(vcpu, intid);
+
+	__vgic_mmio_write_cactive(vcpu, addr, len, val);
+
+	vgic_change_active_finish(vcpu, intid);
+	mutex_unlock(&vcpu->kvm->lock);
+}
+
+void vgic_mmio_uaccess_write_cactive(struct kvm_vcpu *vcpu,
+				     gpa_t addr, unsigned int len,
+				     unsigned long val)
+{
+	__vgic_mmio_write_cactive(vcpu, addr, len, val);
+}
+
+static void __vgic_mmio_write_sactive(struct kvm_vcpu *vcpu,
+				      gpa_t addr, unsigned int len,
+				      unsigned long val)
+{
+	u32 intid = VGIC_ADDR_TO_INTID(addr, 1);
+	int i;
+
 	for_each_set_bit(i, &val, len * 8) {
 		struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
 		vgic_mmio_change_active(vcpu, irq, true);
 		vgic_put_irq(vcpu->kvm, irq);
 	}
+}
+
+void vgic_mmio_write_sactive(struct kvm_vcpu *vcpu,
+			     gpa_t addr, unsigned int len,
+			     unsigned long val)
+{
+	u32 intid = VGIC_ADDR_TO_INTID(addr, 1);
+
+	mutex_lock(&vcpu->kvm->lock);
+	vgic_change_active_prepare(vcpu, intid);
+
+	__vgic_mmio_write_sactive(vcpu, addr, len, val);
+
 	vgic_change_active_finish(vcpu, intid);
+	mutex_unlock(&vcpu->kvm->lock);
+}
+
+void vgic_mmio_uaccess_write_sactive(struct kvm_vcpu *vcpu,
+				     gpa_t addr, unsigned int len,
+				     unsigned long val)
+{
+	__vgic_mmio_write_sactive(vcpu, addr, len, val);
 }
 
 unsigned long vgic_mmio_read_priority(struct kvm_vcpu *vcpu,
diff --git a/virt/kvm/arm/vgic/vgic-mmio.h b/virt/kvm/arm/vgic/vgic-mmio.h
index ea4171acdef3..5693f6df45ec 100644
--- a/virt/kvm/arm/vgic/vgic-mmio.h
+++ b/virt/kvm/arm/vgic/vgic-mmio.h
@@ -75,7 +75,7 @@ extern struct kvm_io_device_ops kvm_io_gic_ops;
  * The _WITH_LENGTH version instantiates registers with a fixed length
  * and is mutually exclusive with the _PER_IRQ version.
  */
-#define REGISTER_DESC_WITH_BITS_PER_IRQ(off, rd, wr, bpi, acc)		\
+#define REGISTER_DESC_WITH_BITS_PER_IRQ(off, rd, wr, ur, uw, bpi, acc)	\
 	{								\
 		.reg_offset = off,					\
 		.bits_per_irq = bpi,					\
@@ -83,6 +83,8 @@ extern struct kvm_io_device_ops kvm_io_gic_ops;
 		.access_flags = acc,					\
 		.read = rd,						\
 		.write = wr,						\
+		.uaccess_read = ur,					\
+		.uaccess_write = uw,					\
 	}
 
 #define REGISTER_DESC_WITH_LENGTH(off, rd, wr, length, acc)		\
@@ -165,6 +167,14 @@ void vgic_mmio_write_sactive(struct kvm_vcpu *vcpu,
 			     gpa_t addr, unsigned int len,
 			     unsigned long val);
 
+void vgic_mmio_uaccess_write_cactive(struct kvm_vcpu *vcpu,
+				     gpa_t addr, unsigned int len,
+				     unsigned long val);
+
+void vgic_mmio_uaccess_write_sactive(struct kvm_vcpu *vcpu,
+				     gpa_t addr, unsigned int len,
+				     unsigned long val);
+
 unsigned long vgic_mmio_read_priority(struct kvm_vcpu *vcpu,
 				      gpa_t addr, unsigned int len);
 
diff --git a/virt/kvm/arm/vgic/vgic-v2.c b/virt/kvm/arm/vgic/vgic-v2.c
index 504b4bd0d651..e4187e52bb26 100644
--- a/virt/kvm/arm/vgic/vgic-v2.c
+++ b/virt/kvm/arm/vgic/vgic-v2.c
@@ -177,7 +177,18 @@ void vgic_v2_set_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcrp)
 	struct vgic_v2_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v2;
 	u32 vmcr;
 
-	vmcr  = (vmcrp->ctlr << GICH_VMCR_CTRL_SHIFT) & GICH_VMCR_CTRL_MASK;
+	vmcr = (vmcrp->grpen0 << GICH_VMCR_ENABLE_GRP0_SHIFT) &
+		GICH_VMCR_ENABLE_GRP0_MASK;
+	vmcr |= (vmcrp->grpen1 << GICH_VMCR_ENABLE_GRP1_SHIFT) &
+		GICH_VMCR_ENABLE_GRP1_MASK;
+	vmcr |= (vmcrp->ackctl << GICH_VMCR_ACK_CTL_SHIFT) &
+		GICH_VMCR_ACK_CTL_MASK;
+	vmcr |= (vmcrp->fiqen << GICH_VMCR_FIQ_EN_SHIFT) &
+		GICH_VMCR_FIQ_EN_MASK;
+	vmcr |= (vmcrp->cbpr << GICH_VMCR_CBPR_SHIFT) &
+		GICH_VMCR_CBPR_MASK;
+	vmcr |= (vmcrp->eoim << GICH_VMCR_EOI_MODE_SHIFT) &
+		GICH_VMCR_EOI_MODE_MASK;
 	vmcr |= (vmcrp->abpr << GICH_VMCR_ALIAS_BINPOINT_SHIFT) &
 		GICH_VMCR_ALIAS_BINPOINT_MASK;
 	vmcr |= (vmcrp->bpr << GICH_VMCR_BINPOINT_SHIFT) &
@@ -195,8 +206,19 @@ void vgic_v2_get_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcrp)
 
 	vmcr = cpu_if->vgic_vmcr;
 
-	vmcrp->ctlr = (vmcr & GICH_VMCR_CTRL_MASK) >>
-			GICH_VMCR_CTRL_SHIFT;
+	vmcrp->grpen0 = (vmcr & GICH_VMCR_ENABLE_GRP0_MASK) >>
+		GICH_VMCR_ENABLE_GRP0_SHIFT;
+	vmcrp->grpen1 = (vmcr & GICH_VMCR_ENABLE_GRP1_MASK) >>
+		GICH_VMCR_ENABLE_GRP1_SHIFT;
+	vmcrp->ackctl = (vmcr & GICH_VMCR_ACK_CTL_MASK) >>
+		GICH_VMCR_ACK_CTL_SHIFT;
+	vmcrp->fiqen = (vmcr & GICH_VMCR_FIQ_EN_MASK) >>
+		GICH_VMCR_FIQ_EN_SHIFT;
+	vmcrp->cbpr = (vmcr & GICH_VMCR_CBPR_MASK) >>
+		GICH_VMCR_CBPR_SHIFT;
+	vmcrp->eoim = (vmcr & GICH_VMCR_EOI_MODE_MASK) >>
+		GICH_VMCR_EOI_MODE_SHIFT;
+
 	vmcrp->abpr = (vmcr & GICH_VMCR_ALIAS_BINPOINT_MASK) >>
 			GICH_VMCR_ALIAS_BINPOINT_SHIFT;
 	vmcrp->bpr  = (vmcr & GICH_VMCR_BINPOINT_MASK) >>
diff --git a/virt/kvm/arm/vgic/vgic-v3.c b/virt/kvm/arm/vgic/vgic-v3.c
index 6fe3f003636a..96ea597db0e7 100644
--- a/virt/kvm/arm/vgic/vgic-v3.c
+++ b/virt/kvm/arm/vgic/vgic-v3.c
@@ -21,6 +21,10 @@
 
 #include "vgic.h"
 
+static bool group0_trap;
+static bool group1_trap;
+static bool common_trap;
+
 void vgic_v3_set_underflow(struct kvm_vcpu *vcpu)
 {
 	struct vgic_v3_cpu_if *cpuif = &vcpu->arch.vgic_cpu.vgic_v3;
@@ -159,15 +163,24 @@ void vgic_v3_clear_lr(struct kvm_vcpu *vcpu, int lr)
 void vgic_v3_set_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcrp)
 {
 	struct vgic_v3_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v3;
+	u32 model = vcpu->kvm->arch.vgic.vgic_model;
 	u32 vmcr;
 
-	/*
-	 * Ignore the FIQen bit, because GIC emulation always implies
-	 * SRE=1 which means the vFIQEn bit is also RES1.
-	 */
-	vmcr = ((vmcrp->ctlr >> ICC_CTLR_EL1_EOImode_SHIFT) <<
-		 ICH_VMCR_EOIM_SHIFT) & ICH_VMCR_EOIM_MASK;
-	vmcr |= (vmcrp->ctlr << ICH_VMCR_CBPR_SHIFT) & ICH_VMCR_CBPR_MASK;
+	if (model == KVM_DEV_TYPE_ARM_VGIC_V2) {
+		vmcr = (vmcrp->ackctl << ICH_VMCR_ACK_CTL_SHIFT) &
+			ICH_VMCR_ACK_CTL_MASK;
+		vmcr |= (vmcrp->fiqen << ICH_VMCR_FIQ_EN_SHIFT) &
+			ICH_VMCR_FIQ_EN_MASK;
+	} else {
+		/*
+		 * When emulating GICv3 on GICv3 with SRE=1 on the
+		 * VFIQEn bit is RES1 and the VAckCtl bit is RES0.
+		 */
+		vmcr = ICH_VMCR_FIQ_EN_MASK;
+	}
+
+	vmcr |= (vmcrp->cbpr << ICH_VMCR_CBPR_SHIFT) & ICH_VMCR_CBPR_MASK;
+	vmcr |= (vmcrp->eoim << ICH_VMCR_EOIM_SHIFT) & ICH_VMCR_EOIM_MASK;
 	vmcr |= (vmcrp->abpr << ICH_VMCR_BPR1_SHIFT) & ICH_VMCR_BPR1_MASK;
 	vmcr |= (vmcrp->bpr << ICH_VMCR_BPR0_SHIFT) & ICH_VMCR_BPR0_MASK;
 	vmcr |= (vmcrp->pmr << ICH_VMCR_PMR_SHIFT) & ICH_VMCR_PMR_MASK;
@@ -180,17 +193,27 @@ void vgic_v3_set_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcrp)
 void vgic_v3_get_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcrp)
 {
 	struct vgic_v3_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v3;
+	u32 model = vcpu->kvm->arch.vgic.vgic_model;
 	u32 vmcr;
 
 	vmcr = cpu_if->vgic_vmcr;
 
-	/*
-	 * Ignore the FIQen bit, because GIC emulation always implies
-	 * SRE=1 which means the vFIQEn bit is also RES1.
-	 */
-	vmcrp->ctlr = ((vmcr >> ICH_VMCR_EOIM_SHIFT) <<
-			ICC_CTLR_EL1_EOImode_SHIFT) & ICC_CTLR_EL1_EOImode_MASK;
-	vmcrp->ctlr |= (vmcr & ICH_VMCR_CBPR_MASK) >> ICH_VMCR_CBPR_SHIFT;
+	if (model == KVM_DEV_TYPE_ARM_VGIC_V2) {
+		vmcrp->ackctl = (vmcr & ICH_VMCR_ACK_CTL_MASK) >>
+			ICH_VMCR_ACK_CTL_SHIFT;
+		vmcrp->fiqen = (vmcr & ICH_VMCR_FIQ_EN_MASK) >>
+			ICH_VMCR_FIQ_EN_SHIFT;
+	} else {
+		/*
+		 * When emulating GICv3 on GICv3 with SRE=1 on the
+		 * VFIQEn bit is RES1 and the VAckCtl bit is RES0.
+		 */
+		vmcrp->fiqen = 1;
+		vmcrp->ackctl = 0;
+	}
+
+	vmcrp->cbpr = (vmcr & ICH_VMCR_CBPR_MASK) >> ICH_VMCR_CBPR_SHIFT;
+	vmcrp->eoim = (vmcr & ICH_VMCR_EOIM_MASK) >> ICH_VMCR_EOIM_SHIFT;
 	vmcrp->abpr = (vmcr & ICH_VMCR_BPR1_MASK) >> ICH_VMCR_BPR1_SHIFT;
 	vmcrp->bpr  = (vmcr & ICH_VMCR_BPR0_MASK) >> ICH_VMCR_BPR0_SHIFT;
 	vmcrp->pmr  = (vmcr & ICH_VMCR_PMR_MASK) >> ICH_VMCR_PMR_SHIFT;
@@ -239,6 +262,12 @@ void vgic_v3_enable(struct kvm_vcpu *vcpu)
 
 	/* Get the show on the road... */
 	vgic_v3->vgic_hcr = ICH_HCR_EN;
+	if (group0_trap)
+		vgic_v3->vgic_hcr |= ICH_HCR_TALL0;
+	if (group1_trap)
+		vgic_v3->vgic_hcr |= ICH_HCR_TALL1;
+	if (common_trap)
+		vgic_v3->vgic_hcr |= ICH_HCR_TC;
 }
 
 int vgic_v3_lpi_sync_pending_status(struct kvm *kvm, struct vgic_irq *irq)
@@ -410,6 +439,26 @@ out:
 	return ret;
 }
 
+DEFINE_STATIC_KEY_FALSE(vgic_v3_cpuif_trap);
+
+static int __init early_group0_trap_cfg(char *buf)
+{
+	return strtobool(buf, &group0_trap);
+}
+early_param("kvm-arm.vgic_v3_group0_trap", early_group0_trap_cfg);
+
+static int __init early_group1_trap_cfg(char *buf)
+{
+	return strtobool(buf, &group1_trap);
+}
+early_param("kvm-arm.vgic_v3_group1_trap", early_group1_trap_cfg);
+
+static int __init early_common_trap_cfg(char *buf)
+{
+	return strtobool(buf, &common_trap);
+}
+early_param("kvm-arm.vgic_v3_common_trap", early_common_trap_cfg);
+
 /**
  * vgic_v3_probe - probe for a GICv3 compatible interrupt controller in DT
  * @node:	pointer to the DT node
@@ -461,6 +510,21 @@ int vgic_v3_probe(const struct gic_kvm_info *info)
 	if (kvm_vgic_global_state.vcpu_base == 0)
 		kvm_info("disabling GICv2 emulation\n");
 
+#ifdef CONFIG_ARM64
+	if (cpus_have_const_cap(ARM64_WORKAROUND_CAVIUM_30115)) {
+		group0_trap = true;
+		group1_trap = true;
+	}
+#endif
+
+	if (group0_trap || group1_trap || common_trap) {
+		kvm_info("GICv3 sysreg trapping enabled ([%s%s%s], reduced performance)\n",
+			 group0_trap ? "G0" : "",
+			 group1_trap ? "G1" : "",
+			 common_trap ? "C"  : "");
+		static_branch_enable(&vgic_v3_cpuif_trap);
+	}
+
 	kvm_vgic_global_state.vctrl_base = NULL;
 	kvm_vgic_global_state.type = VGIC_V3;
 	kvm_vgic_global_state.max_gic_vcpus = VGIC_V3_MAX_CPUS;
diff --git a/virt/kvm/arm/vgic/vgic.c b/virt/kvm/arm/vgic/vgic.c
index 83b24d20ff8f..fed717e07938 100644
--- a/virt/kvm/arm/vgic/vgic.c
+++ b/virt/kvm/arm/vgic/vgic.c
@@ -35,11 +35,12 @@ struct vgic_global kvm_vgic_global_state __ro_after_init = {
 
 /*
  * Locking order is always:
- * its->cmd_lock (mutex)
- *   its->its_lock (mutex)
- *     vgic_cpu->ap_list_lock
- *       kvm->lpi_list_lock
- *         vgic_irq->irq_lock
+ * kvm->lock (mutex)
+ *   its->cmd_lock (mutex)
+ *     its->its_lock (mutex)
+ *       vgic_cpu->ap_list_lock
+ *         kvm->lpi_list_lock
+ *           vgic_irq->irq_lock
  *
  * If you need to take multiple locks, always take the upper lock first,
  * then the lower ones, e.g. first take the its_lock, then the irq_lock.
@@ -234,10 +235,14 @@ static void vgic_sort_ap_list(struct kvm_vcpu *vcpu)
 
 /*
  * Only valid injection if changing level for level-triggered IRQs or for a
- * rising edge.
+ * rising edge, and in-kernel connected IRQ lines can only be controlled by
+ * their owner.
  */
-static bool vgic_validate_injection(struct vgic_irq *irq, bool level)
+static bool vgic_validate_injection(struct vgic_irq *irq, bool level, void *owner)
 {
+	if (irq->owner != owner)
+		return false;
+
 	switch (irq->config) {
 	case VGIC_CONFIG_LEVEL:
 		return irq->line_level != level;
@@ -285,8 +290,10 @@ retry:
 		 * won't see this one until it exits for some other
 		 * reason.
 		 */
-		if (vcpu)
+		if (vcpu) {
+			kvm_make_request(KVM_REQ_IRQ_PENDING, vcpu);
 			kvm_vcpu_kick(vcpu);
+		}
 		return false;
 	}
 
@@ -332,6 +339,7 @@ retry:
 	spin_unlock(&irq->irq_lock);
 	spin_unlock(&vcpu->arch.vgic_cpu.ap_list_lock);
 
+	kvm_make_request(KVM_REQ_IRQ_PENDING, vcpu);
 	kvm_vcpu_kick(vcpu);
 
 	return true;
@@ -346,13 +354,16 @@ retry:
  *			      false: to ignore the call
  *	     Level-sensitive  true:  raise the input signal
  *			      false: lower the input signal
+ * @owner:   The opaque pointer to the owner of the IRQ being raised to verify
+ *           that the caller is allowed to inject this IRQ.  Userspace
+ *           injections will have owner == NULL.
  *
  * The VGIC is not concerned with devices being active-LOW or active-HIGH for
  * level-sensitive interrupts.  You can think of the level parameter as 1
  * being HIGH and 0 being LOW and all devices being active-HIGH.
  */
 int kvm_vgic_inject_irq(struct kvm *kvm, int cpuid, unsigned int intid,
-			bool level)
+			bool level, void *owner)
 {
 	struct kvm_vcpu *vcpu;
 	struct vgic_irq *irq;
@@ -374,7 +385,7 @@ int kvm_vgic_inject_irq(struct kvm *kvm, int cpuid, unsigned int intid,
 
 	spin_lock(&irq->irq_lock);
 
-	if (!vgic_validate_injection(irq, level)) {
+	if (!vgic_validate_injection(irq, level, owner)) {
 		/* Nothing to see here, move along... */
 		spin_unlock(&irq->irq_lock);
 		vgic_put_irq(kvm, irq);
@@ -431,6 +442,39 @@ int kvm_vgic_unmap_phys_irq(struct kvm_vcpu *vcpu, unsigned int virt_irq)
 }
 
 /**
+ * kvm_vgic_set_owner - Set the owner of an interrupt for a VM
+ *
+ * @vcpu:   Pointer to the VCPU (used for PPIs)
+ * @intid:  The virtual INTID identifying the interrupt (PPI or SPI)
+ * @owner:  Opaque pointer to the owner
+ *
+ * Returns 0 if intid is not already used by another in-kernel device and the
+ * owner is set, otherwise returns an error code.
+ */
+int kvm_vgic_set_owner(struct kvm_vcpu *vcpu, unsigned int intid, void *owner)
+{
+	struct vgic_irq *irq;
+	int ret = 0;
+
+	if (!vgic_initialized(vcpu->kvm))
+		return -EAGAIN;
+
+	/* SGIs and LPIs cannot be wired up to any device */
+	if (!irq_is_ppi(intid) && !vgic_valid_spi(vcpu->kvm, intid))
+		return -EINVAL;
+
+	irq = vgic_get_irq(vcpu->kvm, vcpu, intid);
+	spin_lock(&irq->irq_lock);
+	if (irq->owner && irq->owner != owner)
+		ret = -EEXIST;
+	else
+		irq->owner = owner;
+	spin_unlock(&irq->irq_lock);
+
+	return ret;
+}
+
+/**
  * vgic_prune_ap_list - Remove non-relevant interrupts from the list
  *
  * @vcpu: The VCPU pointer
@@ -721,8 +765,10 @@ void vgic_kick_vcpus(struct kvm *kvm)
 	 * a good kick...
 	 */
 	kvm_for_each_vcpu(c, vcpu, kvm) {
-		if (kvm_vgic_vcpu_pending_irq(vcpu))
+		if (kvm_vgic_vcpu_pending_irq(vcpu)) {
+			kvm_make_request(KVM_REQ_IRQ_PENDING, vcpu);
 			kvm_vcpu_kick(vcpu);
+		}
 	}
 }
 
diff --git a/virt/kvm/arm/vgic/vgic.h b/virt/kvm/arm/vgic/vgic.h
index da83e4caa272..bba7fa22a7f7 100644
--- a/virt/kvm/arm/vgic/vgic.h
+++ b/virt/kvm/arm/vgic/vgic.h
@@ -111,14 +111,18 @@ static inline bool irq_is_pending(struct vgic_irq *irq)
  * registers regardless of the hardware backed GIC used.
  */
 struct vgic_vmcr {
-	u32	ctlr;
+	u32	grpen0;
+	u32	grpen1;
+
+	u32	ackctl;
+	u32	fiqen;
+	u32	cbpr;
+	u32	eoim;
+
 	u32	abpr;
 	u32	bpr;
 	u32	pmr;  /* Priority mask field in the GICC_PMR and
 		       * ICC_PMR_EL1 priority field format */
-	/* Below member variable are valid only for GICv3 */
-	u32	grpen0;
-	u32	grpen1;
 };
 
 struct vgic_reg_attr {
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index f0fe9d02f6bb..19f0ecb9b93e 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -73,17 +73,17 @@ MODULE_LICENSE("GPL");
 
 /* Architectures should define their poll value according to the halt latency */
 unsigned int halt_poll_ns = KVM_HALT_POLL_NS_DEFAULT;
-module_param(halt_poll_ns, uint, S_IRUGO | S_IWUSR);
+module_param(halt_poll_ns, uint, 0644);
 EXPORT_SYMBOL_GPL(halt_poll_ns);
 
 /* Default doubles per-vcpu halt_poll_ns. */
 unsigned int halt_poll_ns_grow = 2;
-module_param(halt_poll_ns_grow, uint, S_IRUGO | S_IWUSR);
+module_param(halt_poll_ns_grow, uint, 0644);
 EXPORT_SYMBOL_GPL(halt_poll_ns_grow);
 
 /* Default resets per-vcpu halt_poll_ns . */
 unsigned int halt_poll_ns_shrink;
-module_param(halt_poll_ns_shrink, uint, S_IRUGO | S_IWUSR);
+module_param(halt_poll_ns_shrink, uint, 0644);
 EXPORT_SYMBOL_GPL(halt_poll_ns_shrink);
 
 /*
@@ -3191,6 +3191,12 @@ static int kvm_dev_ioctl_create_vm(unsigned long type)
 		return PTR_ERR(file);
 	}
 
+	/*
+	 * Don't call kvm_put_kvm anymore at this point; file->f_op is
+	 * already set, with ->release() being kvm_vm_release().  In error
+	 * cases it will be called by the final fput(file) and will take
+	 * care of doing kvm_put_kvm(kvm).
+	 */
 	if (kvm_create_vm_debugfs(kvm, r) < 0) {
 		put_unused_fd(r);
 		fput(file);