diff options
Diffstat (limited to 'virt/kvm')
-rw-r--r-- | virt/kvm/arm/arch_timer.c | 8 | ||||
-rw-r--r-- | virt/kvm/arm/arm.c | 49 | ||||
-rw-r--r-- | virt/kvm/arm/hypercalls.c | 71 | ||||
-rw-r--r-- | virt/kvm/arm/mmio.c | 9 | ||||
-rw-r--r-- | virt/kvm/arm/pmu.c | 48 | ||||
-rw-r--r-- | virt/kvm/arm/psci.c | 84 | ||||
-rw-r--r-- | virt/kvm/arm/pvtime.c | 131 | ||||
-rw-r--r-- | virt/kvm/arm/vgic/trace.h | 2 | ||||
-rw-r--r-- | virt/kvm/arm/vgic/vgic-init.c | 1 | ||||
-rw-r--r-- | virt/kvm/arm/vgic/vgic-its.c | 3 | ||||
-rw-r--r-- | virt/kvm/arm/vgic/vgic-v3.c | 12 | ||||
-rw-r--r-- | virt/kvm/arm/vgic/vgic-v4.c | 59 | ||||
-rw-r--r-- | virt/kvm/arm/vgic/vgic.c | 4 | ||||
-rw-r--r-- | virt/kvm/arm/vgic/vgic.h | 2 | ||||
-rw-r--r-- | virt/kvm/coalesced_mmio.c | 8 | ||||
-rw-r--r-- | virt/kvm/kvm_main.c | 282 |
16 files changed, 563 insertions, 210 deletions
diff --git a/virt/kvm/arm/arch_timer.c b/virt/kvm/arm/arch_timer.c index e2bb5bd60227..f182b2380345 100644 --- a/virt/kvm/arm/arch_timer.c +++ b/virt/kvm/arm/arch_timer.c @@ -80,7 +80,7 @@ static inline bool userspace_irqchip(struct kvm *kvm) static void soft_timer_start(struct hrtimer *hrt, u64 ns) { hrtimer_start(hrt, ktime_add_ns(ktime_get(), ns), - HRTIMER_MODE_ABS); + HRTIMER_MODE_ABS_HARD); } static void soft_timer_cancel(struct hrtimer *hrt) @@ -697,11 +697,11 @@ void kvm_timer_vcpu_init(struct kvm_vcpu *vcpu) update_vtimer_cntvoff(vcpu, kvm_phys_timer_read()); ptimer->cntvoff = 0; - hrtimer_init(&timer->bg_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); + hrtimer_init(&timer->bg_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD); timer->bg_timer.function = kvm_bg_timer_expire; - hrtimer_init(&vtimer->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); - hrtimer_init(&ptimer->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); + hrtimer_init(&vtimer->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD); + hrtimer_init(&ptimer->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD); vtimer->hrtimer.function = kvm_hrtimer_expire; ptimer->hrtimer.function = kvm_hrtimer_expire; diff --git a/virt/kvm/arm/arm.c b/virt/kvm/arm/arm.c index 86c6aa1cb58e..12e0280291ce 100644 --- a/virt/kvm/arm/arm.c +++ b/virt/kvm/arm/arm.c @@ -40,6 +40,10 @@ #include <asm/kvm_coproc.h> #include <asm/sections.h> +#include <kvm/arm_hypercalls.h> +#include <kvm/arm_pmu.h> +#include <kvm/arm_psci.h> + #ifdef REQUIRES_VIRT __asm__(".arch_extension virt"); #endif @@ -98,6 +102,26 @@ int kvm_arch_check_processor_compat(void) return 0; } +int kvm_vm_ioctl_enable_cap(struct kvm *kvm, + struct kvm_enable_cap *cap) +{ + int r; + + if (cap->flags) + return -EINVAL; + + switch (cap->cap) { + case KVM_CAP_ARM_NISV_TO_USER: + r = 0; + kvm->arch.return_nisv_io_abort_to_user = true; + break; + default: + r = -EINVAL; + break; + } + + return r; +} /** * kvm_arch_init_vm - initializes a VM data structure @@ -197,6 +221,8 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_IMMEDIATE_EXIT: case KVM_CAP_VCPU_EVENTS: case KVM_CAP_ARM_IRQ_LINE_LAYOUT_2: + case KVM_CAP_ARM_NISV_TO_USER: + case KVM_CAP_ARM_INJECT_EXT_DABT: r = 1; break; case KVM_CAP_ARM_SET_DEVICE_ADDR: @@ -322,20 +348,24 @@ void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu) /* * If we're about to block (most likely because we've just hit a * WFI), we need to sync back the state of the GIC CPU interface - * so that we have the lastest PMR and group enables. This ensures + * so that we have the latest PMR and group enables. This ensures * that kvm_arch_vcpu_runnable has up-to-date data to decide * whether we have pending interrupts. + * + * For the same reason, we want to tell GICv4 that we need + * doorbells to be signalled, should an interrupt become pending. */ preempt_disable(); kvm_vgic_vmcr_sync(vcpu); + vgic_v4_put(vcpu, true); preempt_enable(); - - kvm_vgic_v4_enable_doorbell(vcpu); } void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu) { - kvm_vgic_v4_disable_doorbell(vcpu); + preempt_disable(); + vgic_v4_load(vcpu); + preempt_enable(); } int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) @@ -351,6 +381,8 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) kvm_arm_reset_debug_ptr(vcpu); + kvm_arm_pvtime_vcpu_init(&vcpu->arch); + return kvm_vgic_vcpu_init(vcpu); } @@ -380,11 +412,13 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) kvm_vcpu_load_sysregs(vcpu); kvm_arch_vcpu_load_fp(vcpu); kvm_vcpu_pmu_restore_guest(vcpu); + if (kvm_arm_is_pvtime_enabled(&vcpu->arch)) + kvm_make_request(KVM_REQ_RECORD_STEAL, vcpu); if (single_task_running()) - vcpu_clear_wfe_traps(vcpu); + vcpu_clear_wfx_traps(vcpu); else - vcpu_set_wfe_traps(vcpu); + vcpu_set_wfx_traps(vcpu); vcpu_ptrauth_setup_lazy(vcpu); } @@ -645,6 +679,9 @@ static void check_vcpu_requests(struct kvm_vcpu *vcpu) * that a VCPU sees new virtual interrupts. */ kvm_check_request(KVM_REQ_IRQ_PENDING, vcpu); + + if (kvm_check_request(KVM_REQ_RECORD_STEAL, vcpu)) + kvm_update_stolen_time(vcpu); } } diff --git a/virt/kvm/arm/hypercalls.c b/virt/kvm/arm/hypercalls.c new file mode 100644 index 000000000000..550dfa3e53cd --- /dev/null +++ b/virt/kvm/arm/hypercalls.c @@ -0,0 +1,71 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (C) 2019 Arm Ltd. + +#include <linux/arm-smccc.h> +#include <linux/kvm_host.h> + +#include <asm/kvm_emulate.h> + +#include <kvm/arm_hypercalls.h> +#include <kvm/arm_psci.h> + +int kvm_hvc_call_handler(struct kvm_vcpu *vcpu) +{ + u32 func_id = smccc_get_function(vcpu); + long val = SMCCC_RET_NOT_SUPPORTED; + u32 feature; + gpa_t gpa; + + switch (func_id) { + case ARM_SMCCC_VERSION_FUNC_ID: + val = ARM_SMCCC_VERSION_1_1; + break; + case ARM_SMCCC_ARCH_FEATURES_FUNC_ID: + feature = smccc_get_arg1(vcpu); + switch (feature) { + case ARM_SMCCC_ARCH_WORKAROUND_1: + switch (kvm_arm_harden_branch_predictor()) { + case KVM_BP_HARDEN_UNKNOWN: + break; + case KVM_BP_HARDEN_WA_NEEDED: + val = SMCCC_RET_SUCCESS; + break; + case KVM_BP_HARDEN_NOT_REQUIRED: + val = SMCCC_RET_NOT_REQUIRED; + break; + } + break; + case ARM_SMCCC_ARCH_WORKAROUND_2: + switch (kvm_arm_have_ssbd()) { + case KVM_SSBD_FORCE_DISABLE: + case KVM_SSBD_UNKNOWN: + break; + case KVM_SSBD_KERNEL: + val = SMCCC_RET_SUCCESS; + break; + case KVM_SSBD_FORCE_ENABLE: + case KVM_SSBD_MITIGATED: + val = SMCCC_RET_NOT_REQUIRED; + break; + } + break; + case ARM_SMCCC_HV_PV_TIME_FEATURES: + val = SMCCC_RET_SUCCESS; + break; + } + break; + case ARM_SMCCC_HV_PV_TIME_FEATURES: + val = kvm_hypercall_pv_features(vcpu); + break; + case ARM_SMCCC_HV_PV_TIME_ST: + gpa = kvm_init_stolen_time(vcpu); + if (gpa != GPA_INVALID) + val = gpa; + break; + default: + return kvm_psci_call(vcpu); + } + + smccc_set_retval(vcpu, val, 0, 0, 0); + return 1; +} diff --git a/virt/kvm/arm/mmio.c b/virt/kvm/arm/mmio.c index 6af5c91337f2..70d3b449692c 100644 --- a/virt/kvm/arm/mmio.c +++ b/virt/kvm/arm/mmio.c @@ -167,7 +167,14 @@ int io_mem_abort(struct kvm_vcpu *vcpu, struct kvm_run *run, if (ret) return ret; } else { - kvm_err("load/store instruction decoding not implemented\n"); + if (vcpu->kvm->arch.return_nisv_io_abort_to_user) { + run->exit_reason = KVM_EXIT_ARM_NISV; + run->arm_nisv.esr_iss = kvm_vcpu_dabt_iss_nisv_sanitized(vcpu); + run->arm_nisv.fault_ipa = fault_ipa; + return 0; + } + + kvm_pr_unimpl("Data abort outside memslots with no valid syndrome info\n"); return -ENOSYS; } diff --git a/virt/kvm/arm/pmu.c b/virt/kvm/arm/pmu.c index 362a01886bab..8731dfeced8b 100644 --- a/virt/kvm/arm/pmu.c +++ b/virt/kvm/arm/pmu.c @@ -8,6 +8,7 @@ #include <linux/kvm.h> #include <linux/kvm_host.h> #include <linux/perf_event.h> +#include <linux/perf/arm_pmu.h> #include <linux/uaccess.h> #include <asm/kvm_emulate.h> #include <kvm/arm_pmu.h> @@ -146,8 +147,7 @@ u64 kvm_pmu_get_counter_value(struct kvm_vcpu *vcpu, u64 select_idx) if (kvm_pmu_pmc_is_chained(pmc) && kvm_pmu_idx_is_high_counter(select_idx)) counter = upper_32_bits(counter); - - else if (!kvm_pmu_idx_is_64bit(vcpu, select_idx)) + else if (select_idx != ARMV8_PMU_CYCLE_IDX) counter = lower_32_bits(counter); return counter; @@ -193,7 +193,7 @@ static void kvm_pmu_release_perf_event(struct kvm_pmc *pmc) */ static void kvm_pmu_stop_counter(struct kvm_vcpu *vcpu, struct kvm_pmc *pmc) { - u64 counter, reg; + u64 counter, reg, val; pmc = kvm_pmu_get_canonical_pmc(pmc); if (!pmc->perf_event) @@ -201,16 +201,19 @@ static void kvm_pmu_stop_counter(struct kvm_vcpu *vcpu, struct kvm_pmc *pmc) counter = kvm_pmu_get_pair_counter_value(vcpu, pmc); - if (kvm_pmu_pmc_is_chained(pmc)) { - reg = PMEVCNTR0_EL0 + pmc->idx; - __vcpu_sys_reg(vcpu, reg) = lower_32_bits(counter); - __vcpu_sys_reg(vcpu, reg + 1) = upper_32_bits(counter); + if (pmc->idx == ARMV8_PMU_CYCLE_IDX) { + reg = PMCCNTR_EL0; + val = counter; } else { - reg = (pmc->idx == ARMV8_PMU_CYCLE_IDX) - ? PMCCNTR_EL0 : PMEVCNTR0_EL0 + pmc->idx; - __vcpu_sys_reg(vcpu, reg) = lower_32_bits(counter); + reg = PMEVCNTR0_EL0 + pmc->idx; + val = lower_32_bits(counter); } + __vcpu_sys_reg(vcpu, reg) = val; + + if (kvm_pmu_pmc_is_chained(pmc)) + __vcpu_sys_reg(vcpu, reg + 1) = upper_32_bits(counter); + kvm_pmu_release_perf_event(pmc); } @@ -440,8 +443,25 @@ static void kvm_pmu_perf_overflow(struct perf_event *perf_event, struct pt_regs *regs) { struct kvm_pmc *pmc = perf_event->overflow_handler_context; + struct arm_pmu *cpu_pmu = to_arm_pmu(perf_event->pmu); struct kvm_vcpu *vcpu = kvm_pmc_to_vcpu(pmc); int idx = pmc->idx; + u64 period; + + cpu_pmu->pmu.stop(perf_event, PERF_EF_UPDATE); + + /* + * Reset the sample period to the architectural limit, + * i.e. the point where the counter overflows. + */ + period = -(local64_read(&perf_event->count)); + + if (!kvm_pmu_idx_is_64bit(vcpu, pmc->idx)) + period &= GENMASK(31, 0); + + local64_set(&perf_event->hw.period_left, 0); + perf_event->attr.sample_period = period; + perf_event->hw.sample_period = period; __vcpu_sys_reg(vcpu, PMOVSSET_EL0) |= BIT(idx); @@ -449,6 +469,8 @@ static void kvm_pmu_perf_overflow(struct perf_event *perf_event, kvm_make_request(KVM_REQ_IRQ_PENDING, vcpu); kvm_vcpu_kick(vcpu); } + + cpu_pmu->pmu.start(perf_event, PERF_EF_RELOAD); } /** @@ -567,12 +589,12 @@ static void kvm_pmu_create_perf_event(struct kvm_vcpu *vcpu, u64 select_idx) * high counter. */ attr.sample_period = (-counter) & GENMASK(63, 0); + if (kvm_pmu_counter_is_enabled(vcpu, pmc->idx + 1)) + attr.config1 |= PERF_ATTR_CFG1_KVM_PMU_CHAINED; + event = perf_event_create_kernel_counter(&attr, -1, current, kvm_pmu_perf_overflow, pmc + 1); - - if (kvm_pmu_counter_is_enabled(vcpu, pmc->idx + 1)) - attr.config1 |= PERF_ATTR_CFG1_KVM_PMU_CHAINED; } else { /* The initial sample period (overflow count) of an event. */ if (kvm_pmu_idx_is_64bit(vcpu, pmc->idx)) diff --git a/virt/kvm/arm/psci.c b/virt/kvm/arm/psci.c index 87927f7e1ee7..17e2bdd4b76f 100644 --- a/virt/kvm/arm/psci.c +++ b/virt/kvm/arm/psci.c @@ -15,6 +15,7 @@ #include <asm/kvm_host.h> #include <kvm/arm_psci.h> +#include <kvm/arm_hypercalls.h> /* * This is an implementation of the Power State Coordination Interface @@ -23,38 +24,6 @@ #define AFFINITY_MASK(level) ~((0x1UL << ((level) * MPIDR_LEVEL_BITS)) - 1) -static u32 smccc_get_function(struct kvm_vcpu *vcpu) -{ - return vcpu_get_reg(vcpu, 0); -} - -static unsigned long smccc_get_arg1(struct kvm_vcpu *vcpu) -{ - return vcpu_get_reg(vcpu, 1); -} - -static unsigned long smccc_get_arg2(struct kvm_vcpu *vcpu) -{ - return vcpu_get_reg(vcpu, 2); -} - -static unsigned long smccc_get_arg3(struct kvm_vcpu *vcpu) -{ - return vcpu_get_reg(vcpu, 3); -} - -static void smccc_set_retval(struct kvm_vcpu *vcpu, - unsigned long a0, - unsigned long a1, - unsigned long a2, - unsigned long a3) -{ - vcpu_set_reg(vcpu, 0, a0); - vcpu_set_reg(vcpu, 1, a1); - vcpu_set_reg(vcpu, 2, a2); - vcpu_set_reg(vcpu, 3, a3); -} - static unsigned long psci_affinity_mask(unsigned long affinity_level) { if (affinity_level <= 3) @@ -373,7 +342,7 @@ static int kvm_psci_0_1_call(struct kvm_vcpu *vcpu) * Errors: * -EINVAL: Unrecognized PSCI function */ -static int kvm_psci_call(struct kvm_vcpu *vcpu) +int kvm_psci_call(struct kvm_vcpu *vcpu) { switch (kvm_psci_version(vcpu, vcpu->kvm)) { case KVM_ARM_PSCI_1_0: @@ -387,55 +356,6 @@ static int kvm_psci_call(struct kvm_vcpu *vcpu) }; } -int kvm_hvc_call_handler(struct kvm_vcpu *vcpu) -{ - u32 func_id = smccc_get_function(vcpu); - u32 val = SMCCC_RET_NOT_SUPPORTED; - u32 feature; - - switch (func_id) { - case ARM_SMCCC_VERSION_FUNC_ID: - val = ARM_SMCCC_VERSION_1_1; - break; - case ARM_SMCCC_ARCH_FEATURES_FUNC_ID: - feature = smccc_get_arg1(vcpu); - switch(feature) { - case ARM_SMCCC_ARCH_WORKAROUND_1: - switch (kvm_arm_harden_branch_predictor()) { - case KVM_BP_HARDEN_UNKNOWN: - break; - case KVM_BP_HARDEN_WA_NEEDED: - val = SMCCC_RET_SUCCESS; - break; - case KVM_BP_HARDEN_NOT_REQUIRED: - val = SMCCC_RET_NOT_REQUIRED; - break; - } - break; - case ARM_SMCCC_ARCH_WORKAROUND_2: - switch (kvm_arm_have_ssbd()) { - case KVM_SSBD_FORCE_DISABLE: - case KVM_SSBD_UNKNOWN: - break; - case KVM_SSBD_KERNEL: - val = SMCCC_RET_SUCCESS; - break; - case KVM_SSBD_FORCE_ENABLE: - case KVM_SSBD_MITIGATED: - val = SMCCC_RET_NOT_REQUIRED; - break; - } - break; - } - break; - default: - return kvm_psci_call(vcpu); - } - - smccc_set_retval(vcpu, val, 0, 0, 0); - return 1; -} - int kvm_arm_get_fw_num_regs(struct kvm_vcpu *vcpu) { return 3; /* PSCI version and two workaround registers */ diff --git a/virt/kvm/arm/pvtime.c b/virt/kvm/arm/pvtime.c new file mode 100644 index 000000000000..1e0f4c284888 --- /dev/null +++ b/virt/kvm/arm/pvtime.c @@ -0,0 +1,131 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (C) 2019 Arm Ltd. + +#include <linux/arm-smccc.h> +#include <linux/kvm_host.h> + +#include <asm/kvm_mmu.h> +#include <asm/pvclock-abi.h> + +#include <kvm/arm_hypercalls.h> + +void kvm_update_stolen_time(struct kvm_vcpu *vcpu) +{ + struct kvm *kvm = vcpu->kvm; + u64 steal; + __le64 steal_le; + u64 offset; + int idx; + u64 base = vcpu->arch.steal.base; + + if (base == GPA_INVALID) + return; + + /* Let's do the local bookkeeping */ + steal = vcpu->arch.steal.steal; + steal += current->sched_info.run_delay - vcpu->arch.steal.last_steal; + vcpu->arch.steal.last_steal = current->sched_info.run_delay; + vcpu->arch.steal.steal = steal; + + steal_le = cpu_to_le64(steal); + idx = srcu_read_lock(&kvm->srcu); + offset = offsetof(struct pvclock_vcpu_stolen_time, stolen_time); + kvm_put_guest(kvm, base + offset, steal_le, u64); + srcu_read_unlock(&kvm->srcu, idx); +} + +long kvm_hypercall_pv_features(struct kvm_vcpu *vcpu) +{ + u32 feature = smccc_get_arg1(vcpu); + long val = SMCCC_RET_NOT_SUPPORTED; + + switch (feature) { + case ARM_SMCCC_HV_PV_TIME_FEATURES: + case ARM_SMCCC_HV_PV_TIME_ST: + val = SMCCC_RET_SUCCESS; + break; + } + + return val; +} + +gpa_t kvm_init_stolen_time(struct kvm_vcpu *vcpu) +{ + struct pvclock_vcpu_stolen_time init_values = {}; + struct kvm *kvm = vcpu->kvm; + u64 base = vcpu->arch.steal.base; + int idx; + + if (base == GPA_INVALID) + return base; + + /* + * Start counting stolen time from the time the guest requests + * the feature enabled. + */ + vcpu->arch.steal.steal = 0; + vcpu->arch.steal.last_steal = current->sched_info.run_delay; + + idx = srcu_read_lock(&kvm->srcu); + kvm_write_guest(kvm, base, &init_values, sizeof(init_values)); + srcu_read_unlock(&kvm->srcu, idx); + + return base; +} + +int kvm_arm_pvtime_set_attr(struct kvm_vcpu *vcpu, + struct kvm_device_attr *attr) +{ + u64 __user *user = (u64 __user *)attr->addr; + struct kvm *kvm = vcpu->kvm; + u64 ipa; + int ret = 0; + int idx; + + if (attr->attr != KVM_ARM_VCPU_PVTIME_IPA) + return -ENXIO; + + if (get_user(ipa, user)) + return -EFAULT; + if (!IS_ALIGNED(ipa, 64)) + return -EINVAL; + if (vcpu->arch.steal.base != GPA_INVALID) + return -EEXIST; + + /* Check the address is in a valid memslot */ + idx = srcu_read_lock(&kvm->srcu); + if (kvm_is_error_hva(gfn_to_hva(kvm, ipa >> PAGE_SHIFT))) + ret = -EINVAL; + srcu_read_unlock(&kvm->srcu, idx); + + if (!ret) + vcpu->arch.steal.base = ipa; + + return ret; +} + +int kvm_arm_pvtime_get_attr(struct kvm_vcpu *vcpu, + struct kvm_device_attr *attr) +{ + u64 __user *user = (u64 __user *)attr->addr; + u64 ipa; + + if (attr->attr != KVM_ARM_VCPU_PVTIME_IPA) + return -ENXIO; + + ipa = vcpu->arch.steal.base; + + if (put_user(ipa, user)) + return -EFAULT; + return 0; +} + +int kvm_arm_pvtime_has_attr(struct kvm_vcpu *vcpu, + struct kvm_device_attr *attr) +{ + switch (attr->attr) { + case KVM_ARM_VCPU_PVTIME_IPA: + return 0; + } + return -ENXIO; +} diff --git a/virt/kvm/arm/vgic/trace.h b/virt/kvm/arm/vgic/trace.h index 55fed77a9f73..4fd4f6db181b 100644 --- a/virt/kvm/arm/vgic/trace.h +++ b/virt/kvm/arm/vgic/trace.h @@ -30,7 +30,7 @@ TRACE_EVENT(vgic_update_irq_pending, #endif /* _TRACE_VGIC_H */ #undef TRACE_INCLUDE_PATH -#define TRACE_INCLUDE_PATH ../../../virt/kvm/arm/vgic +#define TRACE_INCLUDE_PATH ../../virt/kvm/arm/vgic #undef TRACE_INCLUDE_FILE #define TRACE_INCLUDE_FILE trace diff --git a/virt/kvm/arm/vgic/vgic-init.c b/virt/kvm/arm/vgic/vgic-init.c index 6f50c429196d..b3c5de48064c 100644 --- a/virt/kvm/arm/vgic/vgic-init.c +++ b/virt/kvm/arm/vgic/vgic-init.c @@ -203,6 +203,7 @@ int kvm_vgic_vcpu_init(struct kvm_vcpu *vcpu) INIT_LIST_HEAD(&vgic_cpu->ap_list_head); raw_spin_lock_init(&vgic_cpu->ap_list_lock); + atomic_set(&vgic_cpu->vgic_v3.its_vpe.vlpi_count, 0); /* * Enable and configure all SGIs to be edge-triggered and diff --git a/virt/kvm/arm/vgic/vgic-its.c b/virt/kvm/arm/vgic/vgic-its.c index 2be6b66b3856..98c7360d9fb7 100644 --- a/virt/kvm/arm/vgic/vgic-its.c +++ b/virt/kvm/arm/vgic/vgic-its.c @@ -360,7 +360,10 @@ static int update_affinity(struct vgic_irq *irq, struct kvm_vcpu *vcpu) if (ret) return ret; + if (map.vpe) + atomic_dec(&map.vpe->vlpi_count); map.vpe = &vcpu->arch.vgic_cpu.vgic_v3.its_vpe; + atomic_inc(&map.vpe->vlpi_count); ret = its_map_vlpi(irq->host_irq, &map); } diff --git a/virt/kvm/arm/vgic/vgic-v3.c b/virt/kvm/arm/vgic/vgic-v3.c index 8d69f007dd0c..f45635a6f0ec 100644 --- a/virt/kvm/arm/vgic/vgic-v3.c +++ b/virt/kvm/arm/vgic/vgic-v3.c @@ -357,14 +357,14 @@ retry: } /** - * vgic_its_save_pending_tables - Save the pending tables into guest RAM + * vgic_v3_save_pending_tables - Save the pending tables into guest RAM * kvm lock and all vcpu lock must be held */ int vgic_v3_save_pending_tables(struct kvm *kvm) { struct vgic_dist *dist = &kvm->arch.vgic; - int last_byte_offset = -1; struct vgic_irq *irq; + gpa_t last_ptr = ~(gpa_t)0; int ret; u8 val; @@ -384,11 +384,11 @@ int vgic_v3_save_pending_tables(struct kvm *kvm) bit_nr = irq->intid % BITS_PER_BYTE; ptr = pendbase + byte_offset; - if (byte_offset != last_byte_offset) { + if (ptr != last_ptr) { ret = kvm_read_guest_lock(kvm, ptr, &val, 1); if (ret) return ret; - last_byte_offset = byte_offset; + last_ptr = ptr; } stored = val & (1U << bit_nr); @@ -664,6 +664,8 @@ void vgic_v3_load(struct kvm_vcpu *vcpu) if (has_vhe()) __vgic_v3_activate_traps(vcpu); + + WARN_ON(vgic_v4_load(vcpu)); } void vgic_v3_vmcr_sync(struct kvm_vcpu *vcpu) @@ -676,6 +678,8 @@ void vgic_v3_vmcr_sync(struct kvm_vcpu *vcpu) void vgic_v3_put(struct kvm_vcpu *vcpu) { + WARN_ON(vgic_v4_put(vcpu, false)); + vgic_v3_vmcr_sync(vcpu); kvm_call_hyp(__vgic_v3_save_aprs, vcpu); diff --git a/virt/kvm/arm/vgic/vgic-v4.c b/virt/kvm/arm/vgic/vgic-v4.c index 477af6aebb97..46f875589c47 100644 --- a/virt/kvm/arm/vgic/vgic-v4.c +++ b/virt/kvm/arm/vgic/vgic-v4.c @@ -85,6 +85,10 @@ static irqreturn_t vgic_v4_doorbell_handler(int irq, void *info) { struct kvm_vcpu *vcpu = info; + /* We got the message, no need to fire again */ + if (!irqd_irq_disabled(&irq_to_desc(irq)->irq_data)) + disable_irq_nosync(irq); + vcpu->arch.vgic_cpu.vgic_v3.its_vpe.pending_last = true; kvm_make_request(KVM_REQ_IRQ_PENDING, vcpu); kvm_vcpu_kick(vcpu); @@ -192,20 +196,30 @@ void vgic_v4_teardown(struct kvm *kvm) its_vm->vpes = NULL; } -int vgic_v4_sync_hwstate(struct kvm_vcpu *vcpu) +int vgic_v4_put(struct kvm_vcpu *vcpu, bool need_db) { - if (!vgic_supports_direct_msis(vcpu->kvm)) + struct its_vpe *vpe = &vcpu->arch.vgic_cpu.vgic_v3.its_vpe; + struct irq_desc *desc = irq_to_desc(vpe->irq); + + if (!vgic_supports_direct_msis(vcpu->kvm) || !vpe->resident) return 0; - return its_schedule_vpe(&vcpu->arch.vgic_cpu.vgic_v3.its_vpe, false); + /* + * If blocking, a doorbell is required. Undo the nested + * disable_irq() calls... + */ + while (need_db && irqd_irq_disabled(&desc->irq_data)) + enable_irq(vpe->irq); + + return its_schedule_vpe(vpe, false); } -int vgic_v4_flush_hwstate(struct kvm_vcpu *vcpu) +int vgic_v4_load(struct kvm_vcpu *vcpu) { - int irq = vcpu->arch.vgic_cpu.vgic_v3.its_vpe.irq; + struct its_vpe *vpe = &vcpu->arch.vgic_cpu.vgic_v3.its_vpe; int err; - if (!vgic_supports_direct_msis(vcpu->kvm)) + if (!vgic_supports_direct_msis(vcpu->kvm) || vpe->resident) return 0; /* @@ -214,11 +228,14 @@ int vgic_v4_flush_hwstate(struct kvm_vcpu *vcpu) * doc in drivers/irqchip/irq-gic-v4.c to understand how this * turns into a VMOVP command at the ITS level. */ - err = irq_set_affinity(irq, cpumask_of(smp_processor_id())); + err = irq_set_affinity(vpe->irq, cpumask_of(smp_processor_id())); if (err) return err; - err = its_schedule_vpe(&vcpu->arch.vgic_cpu.vgic_v3.its_vpe, true); + /* Disabled the doorbell, as we're about to enter the guest */ + disable_irq_nosync(vpe->irq); + + err = its_schedule_vpe(vpe, true); if (err) return err; @@ -226,9 +243,7 @@ int vgic_v4_flush_hwstate(struct kvm_vcpu *vcpu) * Now that the VPE is resident, let's get rid of a potential * doorbell interrupt that would still be pending. */ - err = irq_set_irqchip_state(irq, IRQCHIP_STATE_PENDING, false); - - return err; + return irq_set_irqchip_state(vpe->irq, IRQCHIP_STATE_PENDING, false); } static struct vgic_its *vgic_get_its(struct kvm *kvm, @@ -266,7 +281,7 @@ int kvm_vgic_v4_set_forwarding(struct kvm *kvm, int virq, mutex_lock(&its->its_lock); - /* Perform then actual DevID/EventID -> LPI translation. */ + /* Perform the actual DevID/EventID -> LPI translation. */ ret = vgic_its_resolve_lpi(kvm, its, irq_entry->msi.devid, irq_entry->msi.data, &irq); if (ret) @@ -294,6 +309,7 @@ int kvm_vgic_v4_set_forwarding(struct kvm *kvm, int virq, irq->hw = true; irq->host_irq = virq; + atomic_inc(&map.vpe->vlpi_count); out: mutex_unlock(&its->its_lock); @@ -327,6 +343,7 @@ int kvm_vgic_v4_unset_forwarding(struct kvm *kvm, int virq, WARN_ON(!(irq->hw && irq->host_irq == virq)); if (irq->hw) { + atomic_dec(&irq->target_vcpu->arch.vgic_cpu.vgic_v3.its_vpe.vlpi_count); irq->hw = false; ret = its_unmap_vlpi(virq); } @@ -335,21 +352,3 @@ out: mutex_unlock(&its->its_lock); return ret; } - -void kvm_vgic_v4_enable_doorbell(struct kvm_vcpu *vcpu) -{ - if (vgic_supports_direct_msis(vcpu->kvm)) { - int irq = vcpu->arch.vgic_cpu.vgic_v3.its_vpe.irq; - if (irq) - enable_irq(irq); - } -} - -void kvm_vgic_v4_disable_doorbell(struct kvm_vcpu *vcpu) -{ - if (vgic_supports_direct_msis(vcpu->kvm)) { - int irq = vcpu->arch.vgic_cpu.vgic_v3.its_vpe.irq; - if (irq) - disable_irq(irq); - } -} diff --git a/virt/kvm/arm/vgic/vgic.c b/virt/kvm/arm/vgic/vgic.c index 45a870cb63f5..99b02ca730a8 100644 --- a/virt/kvm/arm/vgic/vgic.c +++ b/virt/kvm/arm/vgic/vgic.c @@ -857,8 +857,6 @@ void kvm_vgic_sync_hwstate(struct kvm_vcpu *vcpu) { struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; - WARN_ON(vgic_v4_sync_hwstate(vcpu)); - /* An empty ap_list_head implies used_lrs == 0 */ if (list_empty(&vcpu->arch.vgic_cpu.ap_list_head)) return; @@ -882,8 +880,6 @@ static inline void vgic_restore_state(struct kvm_vcpu *vcpu) /* Flush our emulation state into the GIC hardware before entering the guest. */ void kvm_vgic_flush_hwstate(struct kvm_vcpu *vcpu) { - WARN_ON(vgic_v4_flush_hwstate(vcpu)); - /* * If there are no virtual interrupts active or pending for this * VCPU, then there is no work to do and we can bail out without diff --git a/virt/kvm/arm/vgic/vgic.h b/virt/kvm/arm/vgic/vgic.h index 83066a81b16a..c7fefd6b1c80 100644 --- a/virt/kvm/arm/vgic/vgic.h +++ b/virt/kvm/arm/vgic/vgic.h @@ -316,7 +316,5 @@ void vgic_its_invalidate_cache(struct kvm *kvm); bool vgic_supports_direct_msis(struct kvm *kvm); int vgic_v4_init(struct kvm *kvm); void vgic_v4_teardown(struct kvm *kvm); -int vgic_v4_sync_hwstate(struct kvm_vcpu *vcpu); -int vgic_v4_flush_hwstate(struct kvm_vcpu *vcpu); #endif diff --git a/virt/kvm/coalesced_mmio.c b/virt/kvm/coalesced_mmio.c index 8ffd07e2a160..00c747dbc82e 100644 --- a/virt/kvm/coalesced_mmio.c +++ b/virt/kvm/coalesced_mmio.c @@ -110,14 +110,11 @@ static const struct kvm_io_device_ops coalesced_mmio_ops = { int kvm_coalesced_mmio_init(struct kvm *kvm) { struct page *page; - int ret; - ret = -ENOMEM; page = alloc_page(GFP_KERNEL | __GFP_ZERO); if (!page) - goto out_err; + return -ENOMEM; - ret = 0; kvm->coalesced_mmio_ring = page_address(page); /* @@ -128,8 +125,7 @@ int kvm_coalesced_mmio_init(struct kvm *kvm) spin_lock_init(&kvm->ring_lock); INIT_LIST_HEAD(&kvm->coalesced_zones); -out_err: - return ret; + return 0; } void kvm_coalesced_mmio_free(struct kvm *kvm) diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index e6de3159e682..00268290dcbd 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -50,6 +50,7 @@ #include <linux/bsearch.h> #include <linux/io.h> #include <linux/lockdep.h> +#include <linux/kthread.h> #include <asm/processor.h> #include <asm/ioctl.h> @@ -121,9 +122,22 @@ static long kvm_vcpu_compat_ioctl(struct file *file, unsigned int ioctl, unsigned long arg); #define KVM_COMPAT(c) .compat_ioctl = (c) #else +/* + * For architectures that don't implement a compat infrastructure, + * adopt a double line of defense: + * - Prevent a compat task from opening /dev/kvm + * - If the open has been done by a 64bit task, and the KVM fd + * passed to a compat task, let the ioctls fail. + */ static long kvm_no_compat_ioctl(struct file *file, unsigned int ioctl, unsigned long arg) { return -EINVAL; } -#define KVM_COMPAT(c) .compat_ioctl = kvm_no_compat_ioctl + +static int kvm_no_compat_open(struct inode *inode, struct file *file) +{ + return is_compat_task() ? -ENODEV : 0; +} +#define KVM_COMPAT(c) .compat_ioctl = kvm_no_compat_ioctl, \ + .open = kvm_no_compat_open #endif static int hardware_enable_all(void); static void hardware_disable_all(void); @@ -149,10 +163,30 @@ __weak int kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm, return 0; } +bool kvm_is_zone_device_pfn(kvm_pfn_t pfn) +{ + /* + * The metadata used by is_zone_device_page() to determine whether or + * not a page is ZONE_DEVICE is guaranteed to be valid if and only if + * the device has been pinned, e.g. by get_user_pages(). WARN if the + * page_count() is zero to help detect bad usage of this helper. + */ + if (!pfn_valid(pfn) || WARN_ON_ONCE(!page_count(pfn_to_page(pfn)))) + return false; + + return is_zone_device_page(pfn_to_page(pfn)); +} + bool kvm_is_reserved_pfn(kvm_pfn_t pfn) { + /* + * ZONE_DEVICE pages currently set PG_reserved, but from a refcounting + * perspective they are "normal" pages, albeit with slightly different + * usage rules. + */ if (pfn_valid(pfn)) - return PageReserved(pfn_to_page(pfn)); + return PageReserved(pfn_to_page(pfn)) && + !kvm_is_zone_device_pfn(pfn); return true; } @@ -617,17 +651,36 @@ static int kvm_create_vm_debugfs(struct kvm *kvm, int fd) stat_data->kvm = kvm; stat_data->offset = p->offset; + stat_data->mode = p->mode ? p->mode : 0644; kvm->debugfs_stat_data[p - debugfs_entries] = stat_data; - debugfs_create_file(p->name, 0644, kvm->debugfs_dentry, + debugfs_create_file(p->name, stat_data->mode, kvm->debugfs_dentry, stat_data, stat_fops_per_vm[p->kind]); } return 0; } +/* + * Called after the VM is otherwise initialized, but just before adding it to + * the vm_list. + */ +int __weak kvm_arch_post_init_vm(struct kvm *kvm) +{ + return 0; +} + +/* + * Called just after removing the VM from the vm_list, but before doing any + * other destruction. + */ +void __weak kvm_arch_pre_destroy_vm(struct kvm *kvm) +{ +} + static struct kvm *kvm_create_vm(unsigned long type) { - int r, i; struct kvm *kvm = kvm_arch_alloc_vm(); + int r = -ENOMEM; + int i; if (!kvm) return ERR_PTR(-ENOMEM); @@ -639,46 +692,51 @@ static struct kvm *kvm_create_vm(unsigned long type) mutex_init(&kvm->lock); mutex_init(&kvm->irq_lock); mutex_init(&kvm->slots_lock); - refcount_set(&kvm->users_count, 1); INIT_LIST_HEAD(&kvm->devices); - r = kvm_arch_init_vm(kvm, type); - if (r) - goto out_err_no_disable; - - r = hardware_enable_all(); - if (r) - goto out_err_no_disable; - -#ifdef CONFIG_HAVE_KVM_IRQFD - INIT_HLIST_HEAD(&kvm->irq_ack_notifier_list); -#endif - BUILD_BUG_ON(KVM_MEM_SLOTS_NUM > SHRT_MAX); - r = -ENOMEM; + if (init_srcu_struct(&kvm->srcu)) + goto out_err_no_srcu; + if (init_srcu_struct(&kvm->irq_srcu)) + goto out_err_no_irq_srcu; + + refcount_set(&kvm->users_count, 1); for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) { struct kvm_memslots *slots = kvm_alloc_memslots(); + if (!slots) - goto out_err_no_srcu; + goto out_err_no_arch_destroy_vm; /* Generations must be different for each address space. */ slots->generation = i; rcu_assign_pointer(kvm->memslots[i], slots); } - if (init_srcu_struct(&kvm->srcu)) - goto out_err_no_srcu; - if (init_srcu_struct(&kvm->irq_srcu)) - goto out_err_no_irq_srcu; for (i = 0; i < KVM_NR_BUSES; i++) { rcu_assign_pointer(kvm->buses[i], kzalloc(sizeof(struct kvm_io_bus), GFP_KERNEL_ACCOUNT)); if (!kvm->buses[i]) - goto out_err; + goto out_err_no_arch_destroy_vm; } + r = kvm_arch_init_vm(kvm, type); + if (r) + goto out_err_no_arch_destroy_vm; + + r = hardware_enable_all(); + if (r) + goto out_err_no_disable; + +#ifdef CONFIG_HAVE_KVM_IRQFD + INIT_HLIST_HEAD(&kvm->irq_ack_notifier_list); +#endif + r = kvm_init_mmu_notifier(kvm); if (r) + goto out_err_no_mmu_notifier; + + r = kvm_arch_post_init_vm(kvm); + if (r) goto out_err; mutex_lock(&kvm_lock); @@ -690,17 +748,24 @@ static struct kvm *kvm_create_vm(unsigned long type) return kvm; out_err: - cleanup_srcu_struct(&kvm->irq_srcu); -out_err_no_irq_srcu: - cleanup_srcu_struct(&kvm->srcu); -out_err_no_srcu: +#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER) + if (kvm->mmu_notifier.ops) + mmu_notifier_unregister(&kvm->mmu_notifier, current->mm); +#endif +out_err_no_mmu_notifier: hardware_disable_all(); out_err_no_disable: - refcount_set(&kvm->users_count, 0); + kvm_arch_destroy_vm(kvm); +out_err_no_arch_destroy_vm: + WARN_ON_ONCE(!refcount_dec_and_test(&kvm->users_count)); for (i = 0; i < KVM_NR_BUSES; i++) kfree(kvm_get_bus(kvm, i)); for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) kvm_free_memslots(kvm, __kvm_memslots(kvm, i)); + cleanup_srcu_struct(&kvm->irq_srcu); +out_err_no_irq_srcu: + cleanup_srcu_struct(&kvm->srcu); +out_err_no_srcu: kvm_arch_free_vm(kvm); mmdrop(current->mm); return ERR_PTR(r); @@ -732,6 +797,8 @@ static void kvm_destroy_vm(struct kvm *kvm) mutex_lock(&kvm_lock); list_del(&kvm->vm_list); mutex_unlock(&kvm_lock); + kvm_arch_pre_destroy_vm(kvm); + kvm_free_irq_routing(kvm); for (i = 0; i < KVM_NR_BUSES; i++) { struct kvm_io_bus *bus = kvm_get_bus(kvm, i); @@ -771,6 +838,18 @@ void kvm_put_kvm(struct kvm *kvm) } EXPORT_SYMBOL_GPL(kvm_put_kvm); +/* + * Used to put a reference that was taken on behalf of an object associated + * with a user-visible file descriptor, e.g. a vcpu or device, if installation + * of the new file descriptor fails and the reference cannot be transferred to + * its final owner. In such cases, the caller is still actively using @kvm and + * will fail miserably if the refcount unexpectedly hits zero. + */ +void kvm_put_kvm_no_destroy(struct kvm *kvm) +{ + WARN_ON(refcount_dec_and_test(&kvm->users_count)); +} +EXPORT_SYMBOL_GPL(kvm_put_kvm_no_destroy); static int kvm_vm_release(struct inode *inode, struct file *filp) { @@ -1852,7 +1931,7 @@ EXPORT_SYMBOL_GPL(kvm_release_pfn_dirty); void kvm_set_pfn_dirty(kvm_pfn_t pfn) { - if (!kvm_is_reserved_pfn(pfn)) { + if (!kvm_is_reserved_pfn(pfn) && !kvm_is_zone_device_pfn(pfn)) { struct page *page = pfn_to_page(pfn); SetPageDirty(page); @@ -1862,7 +1941,7 @@ EXPORT_SYMBOL_GPL(kvm_set_pfn_dirty); void kvm_set_pfn_accessed(kvm_pfn_t pfn) { - if (!kvm_is_reserved_pfn(pfn)) + if (!kvm_is_reserved_pfn(pfn) && !kvm_is_zone_device_pfn(pfn)) mark_page_accessed(pfn_to_page(pfn)); } EXPORT_SYMBOL_GPL(kvm_set_pfn_accessed); @@ -2359,20 +2438,23 @@ out: kvm_arch_vcpu_unblocking(vcpu); block_ns = ktime_to_ns(cur) - ktime_to_ns(start); - if (!vcpu_valid_wakeup(vcpu)) - shrink_halt_poll_ns(vcpu); - else if (halt_poll_ns) { - if (block_ns <= vcpu->halt_poll_ns) - ; - /* we had a long block, shrink polling */ - else if (vcpu->halt_poll_ns && block_ns > halt_poll_ns) + if (!kvm_arch_no_poll(vcpu)) { + if (!vcpu_valid_wakeup(vcpu)) { shrink_halt_poll_ns(vcpu); - /* we had a short halt and our poll time is too small */ - else if (vcpu->halt_poll_ns < halt_poll_ns && - block_ns < halt_poll_ns) - grow_halt_poll_ns(vcpu); - } else - vcpu->halt_poll_ns = 0; + } else if (halt_poll_ns) { + if (block_ns <= vcpu->halt_poll_ns) + ; + /* we had a long block, shrink polling */ + else if (vcpu->halt_poll_ns && block_ns > halt_poll_ns) + shrink_halt_poll_ns(vcpu); + /* we had a short halt and our poll time is too small */ + else if (vcpu->halt_poll_ns < halt_poll_ns && + block_ns < halt_poll_ns) + grow_halt_poll_ns(vcpu); + } else { + vcpu->halt_poll_ns = 0; + } + } trace_kvm_vcpu_wakeup(block_ns, waited, vcpu_valid_wakeup(vcpu)); kvm_arch_vcpu_block_finish(vcpu); @@ -2669,17 +2751,18 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id) goto unlock_vcpu_destroy; } - BUG_ON(kvm->vcpus[atomic_read(&kvm->online_vcpus)]); + vcpu->vcpu_idx = atomic_read(&kvm->online_vcpus); + BUG_ON(kvm->vcpus[vcpu->vcpu_idx]); /* Now it's all set up, let userspace reach it */ kvm_get_kvm(kvm); r = create_vcpu_fd(vcpu); if (r < 0) { - kvm_put_kvm(kvm); + kvm_put_kvm_no_destroy(kvm); goto unlock_vcpu_destroy; } - kvm->vcpus[atomic_read(&kvm->online_vcpus)] = vcpu; + kvm->vcpus[vcpu->vcpu_idx] = vcpu; /* * Pairs with smp_rmb() in kvm_get_vcpu. Write kvm->vcpus @@ -3045,14 +3128,14 @@ struct kvm_device *kvm_device_from_filp(struct file *filp) return filp->private_data; } -static struct kvm_device_ops *kvm_device_ops_table[KVM_DEV_TYPE_MAX] = { +static const struct kvm_device_ops *kvm_device_ops_table[KVM_DEV_TYPE_MAX] = { #ifdef CONFIG_KVM_MPIC [KVM_DEV_TYPE_FSL_MPIC_20] = &kvm_mpic_ops, [KVM_DEV_TYPE_FSL_MPIC_42] = &kvm_mpic_ops, #endif }; -int kvm_register_device_ops(struct kvm_device_ops *ops, u32 type) +int kvm_register_device_ops(const struct kvm_device_ops *ops, u32 type) { if (type >= ARRAY_SIZE(kvm_device_ops_table)) return -ENOSPC; @@ -3073,7 +3156,7 @@ void kvm_unregister_device_ops(u32 type) static int kvm_ioctl_create_device(struct kvm *kvm, struct kvm_create_device *cd) { - struct kvm_device_ops *ops = NULL; + const struct kvm_device_ops *ops = NULL; struct kvm_device *dev; bool test = cd->flags & KVM_CREATE_DEVICE_TEST; int type; @@ -3113,7 +3196,7 @@ static int kvm_ioctl_create_device(struct kvm *kvm, kvm_get_kvm(kvm); ret = anon_inode_getfd(ops->name, &kvm_device_fops, dev, O_RDWR | O_CLOEXEC); if (ret < 0) { - kvm_put_kvm(kvm); + kvm_put_kvm_no_destroy(kvm); mutex_lock(&kvm->lock); list_del(&dev->vm_node); mutex_unlock(&kvm->lock); @@ -3929,7 +4012,9 @@ static int kvm_debugfs_open(struct inode *inode, struct file *file, if (!refcount_inc_not_zero(&stat_data->kvm->users_count)) return -ENOENT; - if (simple_attr_open(inode, file, get, set, fmt)) { + if (simple_attr_open(inode, file, get, + stat_data->mode & S_IWUGO ? set : NULL, + fmt)) { kvm_put_kvm(stat_data->kvm); return -ENOMEM; } @@ -4177,7 +4262,8 @@ static void kvm_init_debug(void) kvm_debugfs_num_entries = 0; for (p = debugfs_entries; p->name; ++p, kvm_debugfs_num_entries++) { - debugfs_create_file(p->name, 0644, kvm_debugfs_dir, + int mode = p->mode ? p->mode : 0644; + debugfs_create_file(p->name, mode, kvm_debugfs_dir, (void *)(long)p->offset, stat_fops[p->kind]); } @@ -4268,12 +4354,12 @@ int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align, r = kvm_arch_hardware_setup(); if (r < 0) - goto out_free_0a; + goto out_free_1; for_each_online_cpu(cpu) { smp_call_function_single(cpu, check_processor_compat, &r, 1); if (r < 0) - goto out_free_1; + goto out_free_2; } r = cpuhp_setup_state_nocalls(CPUHP_AP_KVM_STARTING, "kvm/cpu:starting", @@ -4330,9 +4416,8 @@ out_free_3: unregister_reboot_notifier(&kvm_reboot_notifier); cpuhp_remove_state_nocalls(CPUHP_AP_KVM_STARTING); out_free_2: -out_free_1: kvm_arch_hardware_unsetup(); -out_free_0a: +out_free_1: free_cpumask_var(cpus_hardware_enabled); out_free_0: kvm_irqfd_exit(); @@ -4360,3 +4445,86 @@ void kvm_exit(void) kvm_vfio_ops_exit(); } EXPORT_SYMBOL_GPL(kvm_exit); + +struct kvm_vm_worker_thread_context { + struct kvm *kvm; + struct task_struct *parent; + struct completion init_done; + kvm_vm_thread_fn_t thread_fn; + uintptr_t data; + int err; +}; + +static int kvm_vm_worker_thread(void *context) +{ + /* + * The init_context is allocated on the stack of the parent thread, so + * we have to locally copy anything that is needed beyond initialization + */ + struct kvm_vm_worker_thread_context *init_context = context; + struct kvm *kvm = init_context->kvm; + kvm_vm_thread_fn_t thread_fn = init_context->thread_fn; + uintptr_t data = init_context->data; + int err; + + err = kthread_park(current); + /* kthread_park(current) is never supposed to return an error */ + WARN_ON(err != 0); + if (err) + goto init_complete; + + err = cgroup_attach_task_all(init_context->parent, current); + if (err) { + kvm_err("%s: cgroup_attach_task_all failed with err %d\n", + __func__, err); + goto init_complete; + } + + set_user_nice(current, task_nice(init_context->parent)); + +init_complete: + init_context->err = err; + complete(&init_context->init_done); + init_context = NULL; + + if (err) + return err; + + /* Wait to be woken up by the spawner before proceeding. */ + kthread_parkme(); + + if (!kthread_should_stop()) + err = thread_fn(kvm, data); + + return err; +} + +int kvm_vm_create_worker_thread(struct kvm *kvm, kvm_vm_thread_fn_t thread_fn, + uintptr_t data, const char *name, + struct task_struct **thread_ptr) +{ + struct kvm_vm_worker_thread_context init_context = {}; + struct task_struct *thread; + + *thread_ptr = NULL; + init_context.kvm = kvm; + init_context.parent = current; + init_context.thread_fn = thread_fn; + init_context.data = data; + init_completion(&init_context.init_done); + + thread = kthread_run(kvm_vm_worker_thread, &init_context, + "%s-%d", name, task_pid_nr(current)); + if (IS_ERR(thread)) + return PTR_ERR(thread); + + /* kthread_run is never supposed to return NULL */ + WARN_ON(thread == NULL); + + wait_for_completion(&init_context.init_done); + + if (!init_context.err) + *thread_ptr = thread; + + return init_context.err; +} |