diff options
82 files changed, 3333 insertions, 896 deletions
diff --git a/Documentation/filesystems/ext4/journal.rst b/Documentation/filesystems/ext4/journal.rst index 805a1e9ea3a5..849d5b119eb8 100644 --- a/Documentation/filesystems/ext4/journal.rst +++ b/Documentation/filesystems/ext4/journal.rst @@ -256,6 +256,10 @@ which is 1024 bytes long: - s\_padding2 - * - 0x54 + - \_\_be32 + - s\_num\_fc\_blocks + - Number of fast commit blocks in the journal. + * - 0x58 - \_\_u32 - s\_padding[42] - @@ -310,6 +314,8 @@ The journal incompat features are any combination of the following: - This journal uses v3 of the checksum on-disk format. This is the same as v2, but the journal block tag size is fixed regardless of the size of block numbers. (JBD2\_FEATURE\_INCOMPAT\_CSUM\_V3) + * - 0x20 + - Journal has fast commit blocks. (JBD2\_FEATURE\_INCOMPAT\_FAST\_COMMIT) .. _jbd2_checksum_type: diff --git a/Documentation/filesystems/ext4/super.rst b/Documentation/filesystems/ext4/super.rst index 93e55d7c1d40..2eb1ab20498d 100644 --- a/Documentation/filesystems/ext4/super.rst +++ b/Documentation/filesystems/ext4/super.rst @@ -596,6 +596,13 @@ following: - Sparse Super Block, v2. If this flag is set, the SB field s\_backup\_bgs points to the two block groups that contain backup superblocks (COMPAT\_SPARSE\_SUPER2). + * - 0x400 + - Fast commits supported. Although fast commits blocks are + backward incompatible, fast commit blocks are not always + present in the journal. If fast commit blocks are present in + the journal, JBD2 incompat feature + (JBD2\_FEATURE\_INCOMPAT\_FAST\_COMMIT) gets + set (COMPAT\_FAST\_COMMIT). .. _super_incompat: diff --git a/Documentation/filesystems/journalling.rst b/Documentation/filesystems/journalling.rst index 5a5f70b4063e..e18f90ffc6fd 100644 --- a/Documentation/filesystems/journalling.rst +++ b/Documentation/filesystems/journalling.rst @@ -136,10 +136,8 @@ Fast commits ~~~~~~~~~~~~ JBD2 to also allows you to perform file-system specific delta commits known as -fast commits. In order to use fast commits, you first need to call -:c:func:`jbd2_fc_init` and tell how many blocks at the end of journal -area should be reserved for fast commits. Along with that, you will also need -to set following callbacks that perform correspodning work: +fast commits. In order to use fast commits, you will need to set following +callbacks that perform correspodning work: `journal->j_fc_cleanup_cb`: Cleanup function called after every full commit and fast commit. diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index 36d5f1f3c6dd..e00a66d72372 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -6367,7 +6367,7 @@ accesses that would usually trigger a #GP by KVM into the guest will instead get bounced to user space through the KVM_EXIT_X86_RDMSR and KVM_EXIT_X86_WRMSR exit notifications. -8.25 KVM_X86_SET_MSR_FILTER +8.27 KVM_X86_SET_MSR_FILTER --------------------------- :Architectures: x86 @@ -6381,8 +6381,7 @@ In combination with KVM_CAP_X86_USER_SPACE_MSR, this allows user space to trap and emulate MSRs that are outside of the scope of KVM as well as limit the attack surface on KVM's MSR emulation code. - -8.26 KVM_CAP_ENFORCE_PV_CPUID +8.28 KVM_CAP_ENFORCE_PV_CPUID ----------------------------- Architectures: x86 diff --git a/MAINTAINERS b/MAINTAINERS index 3da6d8c154e4..94ac10a153c7 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -6614,6 +6614,7 @@ Q: http://patchwork.ozlabs.org/project/linux-ext4/list/ T: git git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4.git F: Documentation/filesystems/ext4/ F: fs/ext4/ +F: include/trace/events/ext4.h Extended Verification Module (EVM) M: Mimi Zohar <zohar@linux.ibm.com> diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index 57972bdb213a..1a01da9fdc99 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -788,10 +788,12 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, } switch (vma_shift) { +#ifndef __PAGETABLE_PMD_FOLDED case PUD_SHIFT: if (fault_supports_stage2_huge_mapping(memslot, hva, PUD_SIZE)) break; fallthrough; +#endif case CONT_PMD_SHIFT: vma_shift = PMD_SHIFT; fallthrough; diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index fb12d3ef423a..d0868d0e8ff4 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -1069,7 +1069,7 @@ static bool trap_ptrauth(struct kvm_vcpu *vcpu, static unsigned int ptrauth_visibility(const struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd) { - return vcpu_has_ptrauth(vcpu) ? 0 : REG_HIDDEN_USER | REG_HIDDEN_GUEST; + return vcpu_has_ptrauth(vcpu) ? 0 : REG_HIDDEN; } #define __PTRAUTH_KEY(k) \ @@ -1153,6 +1153,22 @@ static u64 read_id_reg(const struct kvm_vcpu *vcpu, return val; } +static unsigned int id_visibility(const struct kvm_vcpu *vcpu, + const struct sys_reg_desc *r) +{ + u32 id = sys_reg((u32)r->Op0, (u32)r->Op1, + (u32)r->CRn, (u32)r->CRm, (u32)r->Op2); + + switch (id) { + case SYS_ID_AA64ZFR0_EL1: + if (!vcpu_has_sve(vcpu)) + return REG_RAZ; + break; + } + + return 0; +} + /* cpufeature ID register access trap handlers */ static bool __access_id_reg(struct kvm_vcpu *vcpu, @@ -1171,7 +1187,9 @@ static bool access_id_reg(struct kvm_vcpu *vcpu, struct sys_reg_params *p, const struct sys_reg_desc *r) { - return __access_id_reg(vcpu, p, r, false); + bool raz = sysreg_visible_as_raz(vcpu, r); + + return __access_id_reg(vcpu, p, r, raz); } static bool access_raz_id_reg(struct kvm_vcpu *vcpu, @@ -1192,72 +1210,7 @@ static unsigned int sve_visibility(const struct kvm_vcpu *vcpu, if (vcpu_has_sve(vcpu)) return 0; - return REG_HIDDEN_USER | REG_HIDDEN_GUEST; -} - -/* Visibility overrides for SVE-specific ID registers */ -static unsigned int sve_id_visibility(const struct kvm_vcpu *vcpu, - const struct sys_reg_desc *rd) -{ - if (vcpu_has_sve(vcpu)) - return 0; - - return REG_HIDDEN_USER; -} - -/* Generate the emulated ID_AA64ZFR0_EL1 value exposed to the guest */ -static u64 guest_id_aa64zfr0_el1(const struct kvm_vcpu *vcpu) -{ - if (!vcpu_has_sve(vcpu)) - return 0; - - return read_sanitised_ftr_reg(SYS_ID_AA64ZFR0_EL1); -} - -static bool access_id_aa64zfr0_el1(struct kvm_vcpu *vcpu, - struct sys_reg_params *p, - const struct sys_reg_desc *rd) -{ - if (p->is_write) - return write_to_read_only(vcpu, p, rd); - - p->regval = guest_id_aa64zfr0_el1(vcpu); - return true; -} - -static int get_id_aa64zfr0_el1(struct kvm_vcpu *vcpu, - const struct sys_reg_desc *rd, - const struct kvm_one_reg *reg, void __user *uaddr) -{ - u64 val; - - if (WARN_ON(!vcpu_has_sve(vcpu))) - return -ENOENT; - - val = guest_id_aa64zfr0_el1(vcpu); - return reg_to_user(uaddr, &val, reg->id); -} - -static int set_id_aa64zfr0_el1(struct kvm_vcpu *vcpu, - const struct sys_reg_desc *rd, - const struct kvm_one_reg *reg, void __user *uaddr) -{ - const u64 id = sys_reg_to_index(rd); - int err; - u64 val; - - if (WARN_ON(!vcpu_has_sve(vcpu))) - return -ENOENT; - - err = reg_from_user(&val, uaddr, id); - if (err) - return err; - - /* This is what we mean by invariant: you can't change it. */ - if (val != guest_id_aa64zfr0_el1(vcpu)) - return -EINVAL; - - return 0; + return REG_HIDDEN; } /* @@ -1299,13 +1252,17 @@ static int __set_id_reg(const struct kvm_vcpu *vcpu, static int get_id_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, const struct kvm_one_reg *reg, void __user *uaddr) { - return __get_id_reg(vcpu, rd, uaddr, false); + bool raz = sysreg_visible_as_raz(vcpu, rd); + + return __get_id_reg(vcpu, rd, uaddr, raz); } static int set_id_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, const struct kvm_one_reg *reg, void __user *uaddr) { - return __set_id_reg(vcpu, rd, uaddr, false); + bool raz = sysreg_visible_as_raz(vcpu, rd); + + return __set_id_reg(vcpu, rd, uaddr, raz); } static int get_raz_id_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, @@ -1397,6 +1354,7 @@ static bool access_mte_regs(struct kvm_vcpu *vcpu, struct sys_reg_params *p, .access = access_id_reg, \ .get_user = get_id_reg, \ .set_user = set_id_reg, \ + .visibility = id_visibility, \ } /* @@ -1518,7 +1476,7 @@ static const struct sys_reg_desc sys_reg_descs[] = { ID_SANITISED(ID_AA64PFR1_EL1), ID_UNALLOCATED(4,2), ID_UNALLOCATED(4,3), - { SYS_DESC(SYS_ID_AA64ZFR0_EL1), access_id_aa64zfr0_el1, .get_user = get_id_aa64zfr0_el1, .set_user = set_id_aa64zfr0_el1, .visibility = sve_id_visibility }, + ID_SANITISED(ID_AA64ZFR0_EL1), ID_UNALLOCATED(4,5), ID_UNALLOCATED(4,6), ID_UNALLOCATED(4,7), @@ -2185,7 +2143,7 @@ static void perform_access(struct kvm_vcpu *vcpu, trace_kvm_sys_access(*vcpu_pc(vcpu), params, r); /* Check for regs disabled by runtime config */ - if (sysreg_hidden_from_guest(vcpu, r)) { + if (sysreg_hidden(vcpu, r)) { kvm_inject_undefined(vcpu); return; } @@ -2684,7 +2642,7 @@ int kvm_arm_sys_reg_get_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg return get_invariant_sys_reg(reg->id, uaddr); /* Check for regs disabled by runtime config */ - if (sysreg_hidden_from_user(vcpu, r)) + if (sysreg_hidden(vcpu, r)) return -ENOENT; if (r->get_user) @@ -2709,7 +2667,7 @@ int kvm_arm_sys_reg_set_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg return set_invariant_sys_reg(reg->id, uaddr); /* Check for regs disabled by runtime config */ - if (sysreg_hidden_from_user(vcpu, r)) + if (sysreg_hidden(vcpu, r)) return -ENOENT; if (r->set_user) @@ -2780,7 +2738,7 @@ static int walk_one_sys_reg(const struct kvm_vcpu *vcpu, if (!(rd->reg || rd->get_user)) return 0; - if (sysreg_hidden_from_user(vcpu, rd)) + if (sysreg_hidden(vcpu, rd)) return 0; if (!copy_reg_to_user(rd, uind)) diff --git a/arch/arm64/kvm/sys_regs.h b/arch/arm64/kvm/sys_regs.h index 5a6fc30f5989..0f95964339b1 100644 --- a/arch/arm64/kvm/sys_regs.h +++ b/arch/arm64/kvm/sys_regs.h @@ -59,8 +59,8 @@ struct sys_reg_desc { const struct sys_reg_desc *rd); }; -#define REG_HIDDEN_USER (1 << 0) /* hidden from userspace ioctls */ -#define REG_HIDDEN_GUEST (1 << 1) /* hidden from guest */ +#define REG_HIDDEN (1 << 0) /* hidden from userspace and guest */ +#define REG_RAZ (1 << 1) /* RAZ from userspace and guest */ static __printf(2, 3) inline void print_sys_reg_msg(const struct sys_reg_params *p, @@ -111,22 +111,22 @@ static inline void reset_val(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r __vcpu_sys_reg(vcpu, r->reg) = r->val; } -static inline bool sysreg_hidden_from_guest(const struct kvm_vcpu *vcpu, - const struct sys_reg_desc *r) +static inline bool sysreg_hidden(const struct kvm_vcpu *vcpu, + const struct sys_reg_desc *r) { if (likely(!r->visibility)) return false; - return r->visibility(vcpu, r) & REG_HIDDEN_GUEST; + return r->visibility(vcpu, r) & REG_HIDDEN; } -static inline bool sysreg_hidden_from_user(const struct kvm_vcpu *vcpu, - const struct sys_reg_desc *r) +static inline bool sysreg_visible_as_raz(const struct kvm_vcpu *vcpu, + const struct sys_reg_desc *r) { if (likely(!r->visibility)) return false; - return r->visibility(vcpu, r) & REG_HIDDEN_USER; + return r->visibility(vcpu, r) & REG_RAZ; } static inline int cmp_sys_reg(const struct sys_reg_desc *i1, diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 06a278b3701d..d50041f570e8 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -90,6 +90,20 @@ static int kvm_check_cpuid(struct kvm_cpuid_entry2 *entries, int nent) return 0; } +void kvm_update_pv_runtime(struct kvm_vcpu *vcpu) +{ + struct kvm_cpuid_entry2 *best; + + best = kvm_find_cpuid_entry(vcpu, KVM_CPUID_FEATURES, 0); + + /* + * save the feature bitmap to avoid cpuid lookup for every PV + * operation + */ + if (best) + vcpu->arch.pv_cpuid.features = best->eax; +} + void kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu) { struct kvm_cpuid_entry2 *best; @@ -124,13 +138,6 @@ void kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu) (best->eax & (1 << KVM_FEATURE_PV_UNHALT))) best->eax &= ~(1 << KVM_FEATURE_PV_UNHALT); - /* - * save the feature bitmap to avoid cpuid lookup for every PV - * operation - */ - if (best) - vcpu->arch.pv_cpuid.features = best->eax; - if (!kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT)) { best = kvm_find_cpuid_entry(vcpu, 0x1, 0); if (best) @@ -162,6 +169,8 @@ static void kvm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) vcpu->arch.guest_supported_xcr0 = (best->eax | ((u64)best->edx << 32)) & supported_xcr0; + kvm_update_pv_runtime(vcpu); + vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu); kvm_mmu_reset_context(vcpu); diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h index bf8577947ed2..f7a6e8f83783 100644 --- a/arch/x86/kvm/cpuid.h +++ b/arch/x86/kvm/cpuid.h @@ -11,6 +11,7 @@ extern u32 kvm_cpu_caps[NCAPINTS] __read_mostly; void kvm_set_cpu_caps(void); void kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu); +void kvm_update_pv_runtime(struct kvm_vcpu *vcpu); struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu, u32 function, u32 index); int kvm_dev_ioctl_get_cpuid(struct kvm_cpuid2 *cpuid, diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 1f96adff8dc4..5bb1939b65d8 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -856,12 +856,14 @@ static int pte_list_add(struct kvm_vcpu *vcpu, u64 *spte, } else { rmap_printk("pte_list_add: %p %llx many->many\n", spte, *spte); desc = (struct pte_list_desc *)(rmap_head->val & ~1ul); - while (desc->sptes[PTE_LIST_EXT-1] && desc->more) { - desc = desc->more; + while (desc->sptes[PTE_LIST_EXT-1]) { count += PTE_LIST_EXT; - } - if (desc->sptes[PTE_LIST_EXT-1]) { - desc->more = mmu_alloc_pte_list_desc(vcpu); + + if (!desc->more) { + desc->more = mmu_alloc_pte_list_desc(vcpu); + desc = desc->more; + break; + } desc = desc->more; } for (i = 0; desc->sptes[i]; ++i) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index f5ede41bf9e6..447edc0d1d5a 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -255,11 +255,10 @@ static struct kmem_cache *x86_emulator_cache; /* * When called, it means the previous get/set msr reached an invalid msr. - * Return 0 if we want to ignore/silent this failed msr access, or 1 if we want - * to fail the caller. + * Return true if we want to ignore/silent this failed msr access. */ -static int kvm_msr_ignored_check(struct kvm_vcpu *vcpu, u32 msr, - u64 data, bool write) +static bool kvm_msr_ignored_check(struct kvm_vcpu *vcpu, u32 msr, + u64 data, bool write) { const char *op = write ? "wrmsr" : "rdmsr"; @@ -268,11 +267,11 @@ static int kvm_msr_ignored_check(struct kvm_vcpu *vcpu, u32 msr, kvm_pr_unimpl("ignored %s: 0x%x data 0x%llx\n", op, msr, data); /* Mask the error */ - return 0; + return true; } else { kvm_debug_ratelimited("unhandled %s: 0x%x data 0x%llx\n", op, msr, data); - return -ENOENT; + return false; } } @@ -1416,7 +1415,8 @@ static int do_get_msr_feature(struct kvm_vcpu *vcpu, unsigned index, u64 *data) if (r == KVM_MSR_RET_INVALID) { /* Unconditionally clear the output for simplicity */ *data = 0; - r = kvm_msr_ignored_check(vcpu, index, 0, false); + if (kvm_msr_ignored_check(vcpu, index, 0, false)) + r = 0; } if (r) @@ -1540,7 +1540,7 @@ static int __kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data, struct msr_data msr; if (!host_initiated && !kvm_msr_allowed(vcpu, index, KVM_MSR_FILTER_WRITE)) - return -EPERM; + return KVM_MSR_RET_FILTERED; switch (index) { case MSR_FS_BASE: @@ -1581,7 +1581,8 @@ static int kvm_set_msr_ignored_check(struct kvm_vcpu *vcpu, int ret = __kvm_set_msr(vcpu, index, data, host_initiated); if (ret == KVM_MSR_RET_INVALID) - ret = kvm_msr_ignored_check(vcpu, index, data, true); + if (kvm_msr_ignored_check(vcpu, index, data, true)) + ret = 0; return ret; } @@ -1599,7 +1600,7 @@ int __kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data, int ret; if (!host_initiated && !kvm_msr_allowed(vcpu, index, KVM_MSR_FILTER_READ)) - return -EPERM; + return KVM_MSR_RET_FILTERED; msr.index = index; msr.host_initiated = host_initiated; @@ -1618,7 +1619,8 @@ static int kvm_get_msr_ignored_check(struct kvm_vcpu *vcpu, if (ret == KVM_MSR_RET_INVALID) { /* Unconditionally clear *data for simplicity */ *data = 0; - ret = kvm_msr_ignored_check(vcpu, index, 0, false); + if (kvm_msr_ignored_check(vcpu, index, 0, false)) + ret = 0; } return ret; @@ -1662,9 +1664,9 @@ static int complete_emulated_wrmsr(struct kvm_vcpu *vcpu) static u64 kvm_msr_reason(int r) { switch (r) { - case -ENOENT: + case KVM_MSR_RET_INVALID: return KVM_MSR_EXIT_REASON_UNKNOWN; - case -EPERM: + case KVM_MSR_RET_FILTERED: return KVM_MSR_EXIT_REASON_FILTER; default: return KVM_MSR_EXIT_REASON_INVAL; @@ -1965,7 +1967,7 @@ static void kvm_write_system_time(struct kvm_vcpu *vcpu, gpa_t system_time, struct kvm_arch *ka = &vcpu->kvm->arch; if (vcpu->vcpu_id == 0 && !host_initiated) { - if (ka->boot_vcpu_runs_old_kvmclock && old_msr) + if (ka->boot_vcpu_runs_old_kvmclock != old_msr) kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu); ka->boot_vcpu_runs_old_kvmclock = old_msr; @@ -3063,9 +3065,9 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) /* Values other than LBR and BTF are vendor-specific, thus reserved and should throw a #GP */ return 1; - } - vcpu_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTLMSR 0x%llx, nop\n", - __func__, data); + } else if (report_ignored_msrs) + vcpu_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTLMSR 0x%llx, nop\n", + __func__, data); break; case 0x200 ... 0x2ff: return kvm_mtrr_set_msr(vcpu, msr, data); @@ -3463,29 +3465,63 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) msr_info->data = vcpu->arch.efer; break; case MSR_KVM_WALL_CLOCK: + if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE)) + return 1; + + msr_info->data = vcpu->kvm->arch.wall_clock; + break; case MSR_KVM_WALL_CLOCK_NEW: + if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE2)) + return 1; + msr_info->data = vcpu->kvm->arch.wall_clock; break; case MSR_KVM_SYSTEM_TIME: + if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE)) + return 1; + + msr_info->data = vcpu->arch.time; + break; case MSR_KVM_SYSTEM_TIME_NEW: + if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE2)) + return 1; + msr_info->data = vcpu->arch.time; break; case MSR_KVM_ASYNC_PF_EN: + if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF)) + return 1; + msr_info->data = vcpu->arch.apf.msr_en_val; break; case MSR_KVM_ASYNC_PF_INT: + if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF_INT)) + return 1; + msr_info->data = vcpu->arch.apf.msr_int_val; break; case MSR_KVM_ASYNC_PF_ACK: + if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF)) + return 1; + msr_info->data = 0; break; case MSR_KVM_STEAL_TIME: + if (!guest_pv_has(vcpu, KVM_FEATURE_STEAL_TIME)) + return 1; + msr_info->data = vcpu->arch.st.msr_val; break; case MSR_KVM_PV_EOI_EN: + if (!guest_pv_has(vcpu, KVM_FEATURE_PV_EOI)) + return 1; + msr_info->data = vcpu->arch.pv_eoi.msr_val; break; case MSR_KVM_POLL_CONTROL: + if (!guest_pv_has(vcpu, KVM_FEATURE_POLL_CONTROL)) + return 1; + msr_info->data = vcpu->arch.msr_kvm_poll_control; break; case MSR_IA32_P5_MC_ADDR: @@ -4575,6 +4611,8 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu, case KVM_CAP_ENFORCE_PV_FEATURE_CPUID: vcpu->arch.pv_cpuid.enforce = cap->args[0]; + if (vcpu->arch.pv_cpuid.enforce) + kvm_update_pv_runtime(vcpu); return 0; diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index 3900ab0c6004..e7ca622a468f 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -376,7 +376,13 @@ int kvm_handle_memory_failure(struct kvm_vcpu *vcpu, int r, int kvm_handle_invpcid(struct kvm_vcpu *vcpu, unsigned long type, gva_t gva); bool kvm_msr_allowed(struct kvm_vcpu *vcpu, u32 index, u32 type); -#define KVM_MSR_RET_INVALID 2 +/* + * Internal error codes that are used to indicate that MSR emulation encountered + * an error that should result in #GP in the guest, unless userspace + * handles it. + */ +#define KVM_MSR_RET_INVALID 2 /* in-kernel MSR emulation #GP condition */ +#define KVM_MSR_RET_FILTERED 3 /* #GP due to userspace MSR filter */ #define __cr4_reserved_bits(__cpu_has, __c) \ ({ \ diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 1b1ca63e6bbe..c6622011d493 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -3818,9 +3818,8 @@ bounce_map_single(struct device *dev, phys_addr_t paddr, size_t size, * page aligned, we don't need to use a bounce page. */ if (!IS_ALIGNED(paddr | size, VTD_PAGE_SIZE)) { - tlb_addr = swiotlb_tbl_map_single(dev, - phys_to_dma_unencrypted(dev, io_tlb_start), - paddr, size, aligned_size, dir, attrs); + tlb_addr = swiotlb_tbl_map_single(dev, paddr, size, + aligned_size, dir, attrs); if (tlb_addr == DMA_MAPPING_ERROR) { goto swiotlb_error; } else { diff --git a/drivers/powercap/powercap_sys.c b/drivers/powercap/powercap_sys.c index f808c5fa9838..3f0b8e2ef3d4 100644 --- a/drivers/powercap/powercap_sys.c +++ b/drivers/powercap/powercap_sys.c @@ -367,9 +367,9 @@ static void create_power_zone_common_attributes( &dev_attr_max_energy_range_uj.attr; if (power_zone->ops->get_energy_uj) { if (power_zone->ops->reset_energy_uj) - dev_attr_energy_uj.attr.mode = S_IWUSR | S_IRUGO; + dev_attr_energy_uj.attr.mode = S_IWUSR | S_IRUSR; else - dev_attr_energy_uj.attr.mode = S_IRUGO; + dev_attr_energy_uj.attr.mode = S_IRUSR; power_zone->zone_dev_attrs[count++] = &dev_attr_energy_uj.attr; } diff --git a/drivers/xen/swiotlb-xen.c b/drivers/xen/swiotlb-xen.c index 71ce1b7a23d1..2b385c1b4a99 100644 --- a/drivers/xen/swiotlb-xen.c +++ b/drivers/xen/swiotlb-xen.c @@ -395,8 +395,7 @@ static dma_addr_t xen_swiotlb_map_page(struct device *dev, struct page *page, */ trace_swiotlb_bounced(dev, dev_addr, size, swiotlb_force); - map = swiotlb_tbl_map_single(dev, virt_to_phys(xen_io_tlb_start), - phys, size, size, dir, attrs); + map = swiotlb_tbl_map_single(dev, phys, size, size, dir, attrs); if (map == (phys_addr_t)DMA_MAPPING_ERROR) return DMA_MAPPING_ERROR; diff --git a/fs/btrfs/block-rsv.c b/fs/btrfs/block-rsv.c index 7e1549a84fcc..bc920afe23bf 100644 --- a/fs/btrfs/block-rsv.c +++ b/fs/btrfs/block-rsv.c @@ -511,7 +511,8 @@ again: /*DEFAULT_RATELIMIT_BURST*/ 1); if (__ratelimit(&_rs)) WARN(1, KERN_DEBUG - "BTRFS: block rsv returned %d\n", ret); + "BTRFS: block rsv %d returned %d\n", + block_rsv->type, ret); } try_reserve: ret = btrfs_reserve_metadata_bytes(root, block_rsv, blocksize, diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index 5b9e3f3ace22..10638537b9ef 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -91,6 +91,17 @@ int btrfs_init_dev_replace(struct btrfs_fs_info *fs_info) ret = btrfs_search_slot(NULL, dev_root, &key, path, 0, 0); if (ret) { no_valid_dev_replace_entry_found: + /* + * We don't have a replace item or it's corrupted. If there is + * a replace target, fail the mount. + */ + if (btrfs_find_device(fs_info->fs_devices, + BTRFS_DEV_REPLACE_DEVID, NULL, NULL, false)) { + btrfs_err(fs_info, + "found replace target device without a valid replace item"); + ret = -EUCLEAN; + goto out; + } ret = 0; dev_replace->replace_state = BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED; @@ -143,8 +154,19 @@ no_valid_dev_replace_entry_found: case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED: case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED: case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED: - dev_replace->srcdev = NULL; - dev_replace->tgtdev = NULL; + /* + * We don't have an active replace item but if there is a + * replace target, fail the mount. + */ + if (btrfs_find_device(fs_info->fs_devices, + BTRFS_DEV_REPLACE_DEVID, NULL, NULL, false)) { + btrfs_err(fs_info, + "replace devid present without an active replace item"); + ret = -EUCLEAN; + } else { + dev_replace->srcdev = NULL; + dev_replace->tgtdev = NULL; + } break; case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED: case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED: diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index ab408a23ba32..69a384145dc6 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -1274,6 +1274,7 @@ static int cluster_pages_for_defrag(struct inode *inode, u64 page_start; u64 page_end; u64 page_cnt; + u64 start = (u64)start_index << PAGE_SHIFT; int ret; int i; int i_done; @@ -1290,8 +1291,7 @@ static int cluster_pages_for_defrag(struct inode *inode, page_cnt = min_t(u64, (u64)num_pages, (u64)file_end - start_index + 1); ret = btrfs_delalloc_reserve_space(BTRFS_I(inode), &data_reserved, - start_index << PAGE_SHIFT, - page_cnt << PAGE_SHIFT); + start, page_cnt << PAGE_SHIFT); if (ret) return ret; i_done = 0; @@ -1380,8 +1380,7 @@ again: btrfs_mod_outstanding_extents(BTRFS_I(inode), 1); spin_unlock(&BTRFS_I(inode)->lock); btrfs_delalloc_release_space(BTRFS_I(inode), data_reserved, - start_index << PAGE_SHIFT, - (page_cnt - i_done) << PAGE_SHIFT, true); + start, (page_cnt - i_done) << PAGE_SHIFT, true); } @@ -1408,8 +1407,7 @@ out: put_page(pages[i]); } btrfs_delalloc_release_space(BTRFS_I(inode), data_reserved, - start_index << PAGE_SHIFT, - page_cnt << PAGE_SHIFT, true); + start, page_cnt << PAGE_SHIFT, true); btrfs_delalloc_release_extents(BTRFS_I(inode), page_cnt << PAGE_SHIFT); extent_changeset_free(data_reserved); return ret; diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index c54ea6586632..77c54749f432 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -3435,24 +3435,20 @@ static int qgroup_unreserve_range(struct btrfs_inode *inode, { struct rb_node *node; struct rb_node *next; - struct ulist_node *entry = NULL; + struct ulist_node *entry; int ret = 0; node = reserved->range_changed.root.rb_node; + if (!node) + return 0; while (node) { entry = rb_entry(node, struct ulist_node, rb_node); if (entry->val < start) node = node->rb_right; - else if (entry) - node = node->rb_left; else - break; + node = node->rb_left; } - /* Empty changeset */ - if (!entry) - return 0; - if (entry->val > start && rb_prev(&entry->rb_node)) entry = rb_entry(rb_prev(&entry->rb_node), struct ulist_node, rb_node); diff --git a/fs/btrfs/ref-verify.c b/fs/btrfs/ref-verify.c index 7f03dbe5b609..78693d3dd15b 100644 --- a/fs/btrfs/ref-verify.c +++ b/fs/btrfs/ref-verify.c @@ -860,6 +860,7 @@ int btrfs_ref_tree_mod(struct btrfs_fs_info *fs_info, "dropping a ref for a root that doesn't have a ref on the block"); dump_block_entry(fs_info, be); dump_ref_action(fs_info, ra); + kfree(ref); kfree(ra); goto out_unlock; } diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 3602806d71bd..9ba92d86da0b 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -1648,6 +1648,7 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc, struct btrfs_root_item *root_item; struct btrfs_path *path; struct extent_buffer *leaf; + int reserve_level; int level; int max_level; int replaced = 0; @@ -1696,7 +1697,8 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc, * Thus the needed metadata size is at most root_level * nodesize, * and * 2 since we have two trees to COW. */ - min_reserved = fs_info->nodesize * btrfs_root_level(root_item) * 2; + reserve_level = max_t(int, 1, btrfs_root_level(root_item)); + min_reserved = fs_info->nodesize * reserve_level * 2; memset(&next_key, 0, sizeof(next_key)); while (1) { diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index cf63f1e27a27..e71e7586e9eb 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -3866,8 +3866,9 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, if (!is_dev_replace && !readonly && !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state)) { mutex_unlock(&fs_info->fs_devices->device_list_mutex); - btrfs_err_in_rcu(fs_info, "scrub: device %s is not writable", - rcu_str_deref(dev->name)); + btrfs_err_in_rcu(fs_info, + "scrub on devid %llu: filesystem on %s is not writable", + devid, rcu_str_deref(dev->name)); ret = -EROFS; goto out; } diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index b1e48078c318..a6406b3b8c2b 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -1056,22 +1056,13 @@ static void __btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices, continue; } - if (device->devid == BTRFS_DEV_REPLACE_DEVID) { - /* - * In the first step, keep the device which has - * the correct fsid and the devid that is used - * for the dev_replace procedure. - * In the second step, the dev_replace state is - * read from the device tree and it is known - * whether the procedure is really active or - * not, which means whether this device is - * used or whether it should be removed. - */ - if (step == 0 || test_bit(BTRFS_DEV_STATE_REPLACE_TGT, - &device->dev_state)) { - continue; - } - } + /* + * We have already validated the presence of BTRFS_DEV_REPLACE_DEVID, + * in btrfs_init_dev_replace() so just continue. + */ + if (device->devid == BTRFS_DEV_REPLACE_DEVID) + continue; + if (device->bdev) { blkdev_put(device->bdev, device->mode); device->bdev = NULL; @@ -1080,9 +1071,6 @@ static void __btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices, if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) { list_del_init(&device->dev_alloc_list); clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state); - if (!test_bit(BTRFS_DEV_STATE_REPLACE_TGT, - &device->dev_state)) - fs_devices->rw_devices--; } list_del_init(&device->dev_list); fs_devices->num_devices--; diff --git a/fs/crypto/keysetup.c b/fs/crypto/keysetup.c index d3c3e5d9b41f..d595abb8ef90 100644 --- a/fs/crypto/keysetup.c +++ b/fs/crypto/keysetup.c @@ -269,9 +269,7 @@ unlock: * New inodes may not have an inode number assigned yet. * Hashing their inode number is delayed until later. */ - if (ci->ci_inode->i_ino == 0) - WARN_ON(!(ci->ci_inode->i_state & I_CREATING)); - else + if (ci->ci_inode->i_ino) fscrypt_hash_inode_number(ci, mk); return 0; } diff --git a/fs/erofs/inode.c b/fs/erofs/inode.c index 139d0bed42f8..3e21c0e8adae 100644 --- a/fs/erofs/inode.c +++ b/fs/erofs/inode.c @@ -107,11 +107,9 @@ static struct page *erofs_read_inode(struct inode *inode, i_gid_write(inode, le32_to_cpu(die->i_gid)); set_nlink(inode, le32_to_cpu(die->i_nlink)); - /* ns timestamp */ - inode->i_mtime.tv_sec = inode->i_ctime.tv_sec = - le64_to_cpu(die->i_ctime); - inode->i_mtime.tv_nsec = inode->i_ctime.tv_nsec = - le32_to_cpu(die->i_ctime_nsec); + /* extended inode has its own timestamp */ + inode->i_ctime.tv_sec = le64_to_cpu(die->i_ctime); + inode->i_ctime.tv_nsec = le32_to_cpu(die->i_ctime_nsec); inode->i_size = le64_to_cpu(die->i_size); @@ -149,11 +147,9 @@ static struct page *erofs_read_inode(struct inode *inode, i_gid_write(inode, le16_to_cpu(dic->i_gid)); set_nlink(inode, le16_to_cpu(dic->i_nlink)); - /* use build time to derive all file time */ - inode->i_mtime.tv_sec = inode->i_ctime.tv_sec = - sbi->build_time; - inode->i_mtime.tv_nsec = inode->i_ctime.tv_nsec = - sbi->build_time_nsec; + /* use build time for compact inodes */ + inode->i_ctime.tv_sec = sbi->build_time; + inode->i_ctime.tv_nsec = sbi->build_time_nsec; inode->i_size = le32_to_cpu(dic->i_size); if (erofs_inode_is_data_compressed(vi->datalayout)) @@ -167,6 +163,11 @@ static struct page *erofs_read_inode(struct inode *inode, goto err_out; } + inode->i_mtime.tv_sec = inode->i_ctime.tv_sec; + inode->i_atime.tv_sec = inode->i_ctime.tv_sec; + inode->i_mtime.tv_nsec = inode->i_ctime.tv_nsec; + inode->i_atime.tv_nsec = inode->i_ctime.tv_nsec; + if (!nblks) /* measure inode.i_blocks as generic filesystems */ inode->i_blocks = roundup(inode->i_size, EROFS_BLKSIZ) >> 9; diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index 50912a5420b4..86fd3bf62af6 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -1078,8 +1078,11 @@ out_allocpage: cond_resched(); goto repeat; } - set_page_private(page, (unsigned long)pcl); - SetPagePrivate(page); + + if (tocache) { + set_page_private(page, (unsigned long)pcl); + SetPagePrivate(page); + } out: /* the only exit (for tracing and debugging) */ return page; } diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 45fcdbf538d1..1b399cafb15a 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1028,9 +1028,6 @@ struct ext4_inode_info { * protected by sbi->s_fc_lock. */ - /* Fast commit subtid when this inode was committed */ - unsigned int i_fc_committed_subtid; - /* Start of lblk range that needs to be committed in this fast commit */ ext4_lblk_t i_fc_lblk_start; @@ -1422,16 +1419,6 @@ struct ext4_super_block { #ifdef __KERNEL__ -/* - * run-time mount flags - */ -#define EXT4_MF_MNTDIR_SAMPLED 0x0001 -#define EXT4_MF_FS_ABORTED 0x0002 /* Fatal error detected */ -#define EXT4_MF_FC_INELIGIBLE 0x0004 /* Fast commit ineligible */ -#define EXT4_MF_FC_COMMITTING 0x0008 /* File system underoing a fast - * commit. - */ - #ifdef CONFIG_FS_ENCRYPTION #define DUMMY_ENCRYPTION_ENABLED(sbi) ((sbi)->s_dummy_enc_policy.policy != NULL) #else @@ -1466,7 +1453,7 @@ struct ext4_sb_info { struct buffer_head * __rcu *s_group_desc; unsigned int s_mount_opt; unsigned int s_mount_opt2; - unsigned int s_mount_flags; + unsigned long s_mount_flags; unsigned int s_def_mount_opt; ext4_fsblk_t s_sb_block; atomic64_t s_resv_clusters; @@ -1695,6 +1682,34 @@ static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino) }) /* + * run-time mount flags + */ +enum { + EXT4_MF_MNTDIR_SAMPLED, + EXT4_MF_FS_ABORTED, /* Fatal error detected */ + EXT4_MF_FC_INELIGIBLE, /* Fast commit ineligible */ + EXT4_MF_FC_COMMITTING /* File system underoing a fast + * commit. + */ +}; + +static inline void ext4_set_mount_flag(struct super_block *sb, int bit) +{ + set_bit(bit, &EXT4_SB(sb)->s_mount_flags); +} + +static inline void ext4_clear_mount_flag(struct super_block *sb, int bit) +{ + clear_bit(bit, &EXT4_SB(sb)->s_mount_flags); +} + +static inline int ext4_test_mount_flag(struct super_block *sb, int bit) +{ + return test_bit(bit, &EXT4_SB(sb)->s_mount_flags); +} + + +/* * Simulate_fail codes */ #define EXT4_SIM_BBITMAP_EIO 1 @@ -1863,6 +1878,13 @@ static inline bool ext4_verity_in_progress(struct inode *inode) #define EXT4_FEATURE_COMPAT_RESIZE_INODE 0x0010 #define EXT4_FEATURE_COMPAT_DIR_INDEX 0x0020 #define EXT4_FEATURE_COMPAT_SPARSE_SUPER2 0x0200 +/* + * The reason why "FAST_COMMIT" is a compat feature is that, FS becomes + * incompatible only if fast commit blocks are present in the FS. Since we + * clear the journal (and thus the fast commit blocks), we don't mark FS as + * incompatible. We also have a JBD2 incompat feature, which gets set when + * there are fast commit blocks present in the journal. + */ #define EXT4_FEATURE_COMPAT_FAST_COMMIT 0x0400 #define EXT4_FEATURE_COMPAT_STABLE_INODES 0x0800 @@ -2731,12 +2753,16 @@ extern void ext4_end_bitmap_read(struct buffer_head *bh, int uptodate); int ext4_fc_info_show(struct seq_file *seq, void *v); void ext4_fc_init(struct super_block *sb, journal_t *journal); void ext4_fc_init_inode(struct inode *inode); -void ext4_fc_track_range(struct inode *inode, ext4_lblk_t start, +void ext4_fc_track_range(handle_t *handle, struct inode *inode, ext4_lblk_t start, ext4_lblk_t end); -void ext4_fc_track_unlink(struct inode *inode, struct dentry *dentry); -void ext4_fc_track_link(struct inode *inode, struct dentry *dentry); -void ext4_fc_track_create(struct inode *inode, struct dentry *dentry); -void ext4_fc_track_inode(struct inode *inode); +void __ext4_fc_track_unlink(handle_t *handle, struct inode *inode, + struct dentry *dentry); +void __ext4_fc_track_link(handle_t *handle, struct inode *inode, + struct dentry *dentry); +void ext4_fc_track_unlink(handle_t *handle, struct dentry *dentry); +void ext4_fc_track_link(handle_t *handle, struct dentry *dentry); +void ext4_fc_track_create(handle_t *handle, struct dentry *dentry); +void ext4_fc_track_inode(handle_t *handle, struct inode *inode); void ext4_fc_mark_ineligible(struct super_block *sb, int reason); void ext4_fc_start_ineligible(struct super_block *sb, int reason); void ext4_fc_stop_ineligible(struct super_block *sb); @@ -3452,7 +3478,7 @@ extern int ext4_handle_dirty_dirblock(handle_t *handle, struct inode *inode, extern int ext4_ci_compare(const struct inode *parent, const struct qstr *fname, const struct qstr *entry, bool quick); -extern int __ext4_unlink(struct inode *dir, const struct qstr *d_name, +extern int __ext4_unlink(handle_t *handle, struct inode *dir, const struct qstr *d_name, struct inode *inode); extern int __ext4_link(struct inode *dir, struct inode *inode, struct dentry *dentry); diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 57cfa28919a9..17d7096b3212 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -3724,7 +3724,6 @@ static int ext4_convert_unwritten_extents_endio(handle_t *handle, err = ext4_ext_dirty(handle, inode, path + path->p_depth); out: ext4_ext_show_leaf(inode, path); - ext4_fc_track_range(inode, ee_block, ee_block + ee_len - 1); return err; } @@ -3796,7 +3795,6 @@ convert_initialized_extent(handle_t *handle, struct inode *inode, if (*allocated > map->m_len) *allocated = map->m_len; map->m_len = *allocated; - ext4_fc_track_range(inode, ee_block, ee_block + ee_len - 1); return 0; } @@ -4329,7 +4327,6 @@ got_allocated_blocks: map->m_len = ar.len; allocated = map->m_len; ext4_ext_show_leaf(inode, path); - ext4_fc_track_range(inode, map->m_lblk, map->m_lblk + map->m_len - 1); out: ext4_ext_drop_refs(path); kfree(path); @@ -4602,7 +4599,7 @@ static long ext4_zero_range(struct file *file, loff_t offset, ret = ext4_mark_inode_dirty(handle, inode); if (unlikely(ret)) goto out_handle; - ext4_fc_track_range(inode, offset >> inode->i_sb->s_blocksize_bits, + ext4_fc_track_range(handle, inode, offset >> inode->i_sb->s_blocksize_bits, (offset + len - 1) >> inode->i_sb->s_blocksize_bits); /* Zero out partial block at the edges of the range */ ret = ext4_zero_partial_blocks(handle, inode, offset, len); @@ -4651,8 +4648,6 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len) FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE | FALLOC_FL_INSERT_RANGE)) return -EOPNOTSUPP; - ext4_fc_track_range(inode, offset >> blkbits, - (offset + len - 1) >> blkbits); ext4_fc_start_update(inode); diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c index 8d43058386c3..f2033e13a273 100644 --- a/fs/ext4/fast_commit.c +++ b/fs/ext4/fast_commit.c @@ -83,7 +83,7 @@ * * Atomicity of commits * -------------------- - * In order to gaurantee atomicity during the commit operation, fast commit + * In order to guarantee atomicity during the commit operation, fast commit * uses "EXT4_FC_TAG_TAIL" tag that marks a fast commit as complete. Tail * tag contains CRC of the contents and TID of the transaction after which * this fast commit should be applied. Recovery code replays fast commit @@ -152,7 +152,31 @@ void ext4_fc_init_inode(struct inode *inode) INIT_LIST_HEAD(&ei->i_fc_list); init_waitqueue_head(&ei->i_fc_wait); atomic_set(&ei->i_fc_updates, 0); - ei->i_fc_committed_subtid = 0; +} + +/* This function must be called with sbi->s_fc_lock held. */ +static void ext4_fc_wait_committing_inode(struct inode *inode) +__releases(&EXT4_SB(inode->i_sb)->s_fc_lock) +{ + wait_queue_head_t *wq; + struct ext4_inode_info *ei = EXT4_I(inode); + +#if (BITS_PER_LONG < 64) + DEFINE_WAIT_BIT(wait, &ei->i_state_flags, + EXT4_STATE_FC_COMMITTING); + wq = bit_waitqueue(&ei->i_state_flags, + EXT4_STATE_FC_COMMITTING); +#else + DEFINE_WAIT_BIT(wait, &ei->i_flags, + EXT4_STATE_FC_COMMITTING); + wq = bit_waitqueue(&ei->i_flags, + EXT4_STATE_FC_COMMITTING); +#endif + lockdep_assert_held(&EXT4_SB(inode->i_sb)->s_fc_lock); + prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE); + spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock); + schedule(); + finish_wait(wq, &wait.wq_entry); } /* @@ -176,22 +200,7 @@ restart: goto out; if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) { - wait_queue_head_t *wq; -#if (BITS_PER_LONG < 64) - DEFINE_WAIT_BIT(wait, &ei->i_state_flags, - EXT4_STATE_FC_COMMITTING); - wq = bit_waitqueue(&ei->i_state_flags, - EXT4_STATE_FC_COMMITTING); -#else - DEFINE_WAIT_BIT(wait, &ei->i_flags, - EXT4_STATE_FC_COMMITTING); - wq = bit_waitqueue(&ei->i_flags, - EXT4_STATE_FC_COMMITTING); -#endif - prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE); - spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock); - schedule(); - finish_wait(wq, &wait.wq_entry); + ext4_fc_wait_committing_inode(inode); goto restart; } out: @@ -234,26 +243,10 @@ restart: } if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) { - wait_queue_head_t *wq; -#if (BITS_PER_LONG < 64) - DEFINE_WAIT_BIT(wait, &ei->i_state_flags, - EXT4_STATE_FC_COMMITTING); - wq = bit_waitqueue(&ei->i_state_flags, - EXT4_STATE_FC_COMMITTING); -#else - DEFINE_WAIT_BIT(wait, &ei->i_flags, - EXT4_STATE_FC_COMMITTING); - wq = bit_waitqueue(&ei->i_flags, - EXT4_STATE_FC_COMMITTING); -#endif - prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE); - spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock); - schedule(); - finish_wait(wq, &wait.wq_entry); + ext4_fc_wait_committing_inode(inode); goto restart; } - if (!list_empty(&ei->i_fc_list)) - list_del_init(&ei->i_fc_list); + list_del_init(&ei->i_fc_list); spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock); } @@ -269,7 +262,7 @@ void ext4_fc_mark_ineligible(struct super_block *sb, int reason) (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY)) return; - sbi->s_mount_flags |= EXT4_MF_FC_INELIGIBLE; + ext4_set_mount_flag(sb, EXT4_MF_FC_INELIGIBLE); WARN_ON(reason >= EXT4_FC_REASON_MAX); sbi->s_fc_stats.fc_ineligible_reason_count[reason]++; } @@ -302,14 +295,14 @@ void ext4_fc_stop_ineligible(struct super_block *sb) (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY)) return; - EXT4_SB(sb)->s_mount_flags |= EXT4_MF_FC_INELIGIBLE; + ext4_set_mount_flag(sb, EXT4_MF_FC_INELIGIBLE); atomic_dec(&EXT4_SB(sb)->s_fc_ineligible_updates); } static inline int ext4_fc_is_ineligible(struct super_block *sb) { - return (EXT4_SB(sb)->s_mount_flags & EXT4_MF_FC_INELIGIBLE) || - atomic_read(&EXT4_SB(sb)->s_fc_ineligible_updates); + return (ext4_test_mount_flag(sb, EXT4_MF_FC_INELIGIBLE) || + atomic_read(&EXT4_SB(sb)->s_fc_ineligible_updates)); } /* @@ -323,13 +316,14 @@ static inline int ext4_fc_is_ineligible(struct super_block *sb) * If enqueue is set, this function enqueues the inode in fast commit list. */ static int ext4_fc_track_template( - struct inode *inode, int (*__fc_track_fn)(struct inode *, void *, bool), + handle_t *handle, struct inode *inode, + int (*__fc_track_fn)(struct inode *, void *, bool), void *args, int enqueue) { - tid_t running_txn_tid; bool update = false; struct ext4_inode_info *ei = EXT4_I(inode); struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + tid_t tid = 0; int ret; if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) || @@ -339,15 +333,13 @@ static int ext4_fc_track_template( if (ext4_fc_is_ineligible(inode->i_sb)) return -EINVAL; - running_txn_tid = sbi->s_journal ? - sbi->s_journal->j_commit_sequence + 1 : 0; - + tid = handle->h_transaction->t_tid; mutex_lock(&ei->i_fc_lock); - if (running_txn_tid == ei->i_sync_tid) { + if (tid == ei->i_sync_tid) { update = true; } else { ext4_fc_reset_inode(inode); - ei->i_sync_tid = running_txn_tid; + ei->i_sync_tid = tid; } ret = __fc_track_fn(inode, args, update); mutex_unlock(&ei->i_fc_lock); @@ -358,7 +350,7 @@ static int ext4_fc_track_template( spin_lock(&sbi->s_fc_lock); if (list_empty(&EXT4_I(inode)->i_fc_list)) list_add_tail(&EXT4_I(inode)->i_fc_list, - (sbi->s_mount_flags & EXT4_MF_FC_COMMITTING) ? + (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_COMMITTING)) ? &sbi->s_fc_q[FC_Q_STAGING] : &sbi->s_fc_q[FC_Q_MAIN]); spin_unlock(&sbi->s_fc_lock); @@ -384,7 +376,7 @@ static int __track_dentry_update(struct inode *inode, void *arg, bool update) mutex_unlock(&ei->i_fc_lock); node = kmem_cache_alloc(ext4_fc_dentry_cachep, GFP_NOFS); if (!node) { - ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_MEM); + ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_NOMEM); mutex_lock(&ei->i_fc_lock); return -ENOMEM; } @@ -397,7 +389,7 @@ static int __track_dentry_update(struct inode *inode, void *arg, bool update) if (!node->fcd_name.name) { kmem_cache_free(ext4_fc_dentry_cachep, node); ext4_fc_mark_ineligible(inode->i_sb, - EXT4_FC_REASON_MEM); + EXT4_FC_REASON_NOMEM); mutex_lock(&ei->i_fc_lock); return -ENOMEM; } @@ -411,7 +403,7 @@ static int __track_dentry_update(struct inode *inode, void *arg, bool update) node->fcd_name.len = dentry->d_name.len; spin_lock(&sbi->s_fc_lock); - if (sbi->s_mount_flags & EXT4_MF_FC_COMMITTING) + if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_COMMITTING)) list_add_tail(&node->fcd_list, &sbi->s_fc_dentry_q[FC_Q_STAGING]); else @@ -422,7 +414,8 @@ static int __track_dentry_update(struct inode *inode, void *arg, bool update) return 0; } -void ext4_fc_track_unlink(struct inode *inode, struct dentry *dentry) +void __ext4_fc_track_unlink(handle_t *handle, + struct inode *inode, struct dentry *dentry) { struct __track_dentry_update_args args; int ret; @@ -430,12 +423,18 @@ void ext4_fc_track_unlink(struct inode *inode, struct dentry *dentry) args.dentry = dentry; args.op = EXT4_FC_TAG_UNLINK; - ret = ext4_fc_track_template(inode, __track_dentry_update, + ret = ext4_fc_track_template(handle, inode, __track_dentry_update, (void *)&args, 0); trace_ext4_fc_track_unlink(inode, dentry, ret); } -void ext4_fc_track_link(struct inode *inode, struct dentry *dentry) +void ext4_fc_track_unlink(handle_t *handle, struct dentry *dentry) +{ + __ext4_fc_track_unlink(handle, d_inode(dentry), dentry); +} + +void __ext4_fc_track_link(handle_t *handle, + struct inode *inode, struct dentry *dentry) { struct __track_dentry_update_args args; int ret; @@ -443,20 +442,26 @@ void ext4_fc_track_link(struct inode *inode, struct dentry *dentry) args.dentry = dentry; args.op = EXT4_FC_TAG_LINK; - ret = ext4_fc_track_template(inode, __track_dentry_update, + ret = ext4_fc_track_template(handle, inode, __track_dentry_update, (void *)&args, 0); trace_ext4_fc_track_link(inode, dentry, ret); } -void ext4_fc_track_create(struct inode *inode, struct dentry *dentry) +void ext4_fc_track_link(handle_t *handle, struct dentry *dentry) +{ + __ext4_fc_track_link(handle, d_inode(dentry), dentry); +} + +void ext4_fc_track_create(handle_t *handle, struct dentry *dentry) { struct __track_dentry_update_args args; + struct inode *inode = d_inode(dentry); int ret; args.dentry = dentry; args.op = EXT4_FC_TAG_CREAT; - ret = ext4_fc_track_template(inode, __track_dentry_update, + ret = ext4_fc_track_template(handle, inode, __track_dentry_update, (void *)&args, 0); trace_ext4_fc_track_create(inode, dentry, ret); } @@ -472,14 +477,20 @@ static int __track_inode(struct inode *inode, void *arg, bool update) return 0; } -void ext4_fc_track_inode(struct inode *inode) +void ext4_fc_track_inode(handle_t *handle, struct inode *inode) { int ret; if (S_ISDIR(inode->i_mode)) return; - ret = ext4_fc_track_template(inode, __track_inode, NULL, 1); + if (ext4_should_journal_data(inode)) { + ext4_fc_mark_ineligible(inode->i_sb, + EXT4_FC_REASON_INODE_JOURNAL_DATA); + return; + } + + ret = ext4_fc_track_template(handle, inode, __track_inode, NULL, 1); trace_ext4_fc_track_inode(inode, ret); } @@ -515,7 +526,7 @@ static int __track_range(struct inode *inode, void *arg, bool update) return 0; } -void ext4_fc_track_range(struct inode *inode, ext4_lblk_t start, +void ext4_fc_track_range(handle_t *handle, struct inode *inode, ext4_lblk_t start, ext4_lblk_t end) { struct __track_range_args args; @@ -527,7 +538,7 @@ void ext4_fc_track_range(struct inode *inode, ext4_lblk_t start, args.start = start; args.end = end; - ret = ext4_fc_track_template(inode, __track_range, &args, 1); + ret = ext4_fc_track_template(handle, inode, __track_range, &args, 1); trace_ext4_fc_track_range(inode, start, end, ret); } @@ -537,10 +548,11 @@ static void ext4_fc_submit_bh(struct super_block *sb) int write_flags = REQ_SYNC; struct buffer_head *bh = EXT4_SB(sb)->s_fc_bh; + /* TODO: REQ_FUA | REQ_PREFLUSH is unnecessarily expensive. */ if (test_opt(sb, BARRIER)) write_flags |= REQ_FUA | REQ_PREFLUSH; lock_buffer(bh); - clear_buffer_dirty(bh); + set_buffer_dirty(bh); set_buffer_uptodate(bh); bh->b_end_io = ext4_end_buffer_io_sync; submit_bh(REQ_OP_WRITE, write_flags, bh); @@ -846,7 +858,7 @@ static int ext4_fc_submit_inode_data_all(journal_t *journal) int ret = 0; spin_lock(&sbi->s_fc_lock); - sbi->s_mount_flags |= EXT4_MF_FC_COMMITTING; + ext4_set_mount_flag(sb, EXT4_MF_FC_COMMITTING); list_for_each(pos, &sbi->s_fc_q[FC_Q_MAIN]) { ei = list_entry(pos, struct ext4_inode_info, i_fc_list); ext4_set_inode_state(&ei->vfs_inode, EXT4_STATE_FC_COMMITTING); @@ -900,6 +912,8 @@ static int ext4_fc_wait_inode_data_all(journal_t *journal) /* Commit all the directory entry updates */ static int ext4_fc_commit_dentry_updates(journal_t *journal, u32 *crc) +__acquires(&sbi->s_fc_lock) +__releases(&sbi->s_fc_lock) { struct super_block *sb = (struct super_block *)(journal->j_private); struct ext4_sb_info *sbi = EXT4_SB(sb); @@ -996,6 +1010,13 @@ static int ext4_fc_perform_commit(journal_t *journal) if (ret) return ret; + /* + * If file system device is different from journal device, issue a cache + * flush before we start writing fast commit blocks. + */ + if (journal->j_fs_dev != journal->j_dev) + blkdev_issue_flush(journal->j_fs_dev, GFP_NOFS); + blk_start_plug(&plug); if (sbi->s_fc_bytes == 0) { /* @@ -1031,8 +1052,6 @@ static int ext4_fc_perform_commit(journal_t *journal) if (ret) goto out; spin_lock(&sbi->s_fc_lock); - EXT4_I(inode)->i_fc_committed_subtid = - atomic_read(&sbi->s_fc_subtid); } spin_unlock(&sbi->s_fc_lock); @@ -1131,7 +1150,7 @@ out: "Fast commit ended with blks = %d, reason = %d, subtid - %d", nblks, reason, subtid); if (reason == EXT4_FC_REASON_FC_FAILED) - return jbd2_fc_end_commit_fallback(journal, commit_tid); + return jbd2_fc_end_commit_fallback(journal); if (reason == EXT4_FC_REASON_FC_START_FAILED || reason == EXT4_FC_REASON_INELIGIBLE) return jbd2_complete_transaction(journal, commit_tid); @@ -1190,8 +1209,8 @@ static void ext4_fc_cleanup(journal_t *journal, int full) list_splice_init(&sbi->s_fc_q[FC_Q_STAGING], &sbi->s_fc_q[FC_Q_STAGING]); - sbi->s_mount_flags &= ~EXT4_MF_FC_COMMITTING; - sbi->s_mount_flags &= ~EXT4_MF_FC_INELIGIBLE; + ext4_clear_mount_flag(sb, EXT4_MF_FC_COMMITTING); + ext4_clear_mount_flag(sb, EXT4_MF_FC_INELIGIBLE); if (full) sbi->s_fc_bytes = 0; @@ -1263,7 +1282,7 @@ static int ext4_fc_replay_unlink(struct super_block *sb, struct ext4_fc_tl *tl) return 0; } - ret = __ext4_unlink(old_parent, &entry, inode); + ret = __ext4_unlink(NULL, old_parent, &entry, inode); /* -ENOENT ok coz it might not exist anymore. */ if (ret == -ENOENT) ret = 0; @@ -2079,8 +2098,6 @@ static int ext4_fc_replay(journal_t *journal, struct buffer_head *bh, void ext4_fc_init(struct super_block *sb, journal_t *journal) { - int num_fc_blocks; - /* * We set replay callback even if fast commit disabled because we may * could still have fast commit blocks that need to be replayed even if @@ -2090,21 +2107,9 @@ void ext4_fc_init(struct super_block *sb, journal_t *journal) if (!test_opt2(sb, JOURNAL_FAST_COMMIT)) return; journal->j_fc_cleanup_callback = ext4_fc_cleanup; - if (!buffer_uptodate(journal->j_sb_buffer) - && ext4_read_bh_lock(journal->j_sb_buffer, REQ_META | REQ_PRIO, - true)) { - ext4_msg(sb, KERN_ERR, "I/O error on journal"); - return; - } - num_fc_blocks = be32_to_cpu(journal->j_superblock->s_num_fc_blks); - if (jbd2_fc_init(journal, num_fc_blocks ? num_fc_blocks : - EXT4_NUM_FC_BLKS)) { - pr_warn("Error while enabling fast commits, turning off."); - ext4_clear_feature_fast_commit(sb); - } } -const char *fc_ineligible_reasons[] = { +static const char *fc_ineligible_reasons[] = { "Extended attributes changed", "Cross rename", "Journal flag changed", @@ -2113,6 +2118,7 @@ const char *fc_ineligible_reasons[] = { "Resize", "Dir renamed", "Falloc range op", + "Data journalling", "FC Commit Failed" }; diff --git a/fs/ext4/fast_commit.h b/fs/ext4/fast_commit.h index 06907d485989..3a6e5a1fa1b8 100644 --- a/fs/ext4/fast_commit.h +++ b/fs/ext4/fast_commit.h @@ -3,9 +3,6 @@ #ifndef __FAST_COMMIT_H__ #define __FAST_COMMIT_H__ -/* Number of blocks in journal area to allocate for fast commits */ -#define EXT4_NUM_FC_BLKS 256 - /* Fast commit tags */ #define EXT4_FC_TAG_ADD_RANGE 0x0001 #define EXT4_FC_TAG_DEL_RANGE 0x0002 @@ -100,11 +97,12 @@ enum { EXT4_FC_REASON_XATTR = 0, EXT4_FC_REASON_CROSS_RENAME, EXT4_FC_REASON_JOURNAL_FLAG_CHANGE, - EXT4_FC_REASON_MEM, + EXT4_FC_REASON_NOMEM, EXT4_FC_REASON_SWAP_BOOT, EXT4_FC_REASON_RESIZE, EXT4_FC_REASON_RENAME_DIR, EXT4_FC_REASON_FALLOC_RANGE, + EXT4_FC_REASON_INODE_JOURNAL_DATA, EXT4_FC_COMMIT_FAILED, EXT4_FC_REASON_MAX }; diff --git a/fs/ext4/file.c b/fs/ext4/file.c index d85412d12e3a..3ed8c048fb12 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -761,7 +761,6 @@ static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma) if (!daxdev_mapping_supported(vma, dax_dev)) return -EOPNOTSUPP; - ext4_fc_start_update(inode); file_accessed(file); if (IS_DAX(file_inode(file))) { vma->vm_ops = &ext4_dax_vm_ops; @@ -769,7 +768,6 @@ static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma) } else { vma->vm_ops = &ext4_file_vm_ops; } - ext4_fc_stop_update(inode); return 0; } @@ -782,13 +780,13 @@ static int ext4_sample_last_mounted(struct super_block *sb, handle_t *handle; int err; - if (likely(sbi->s_mount_flags & EXT4_MF_MNTDIR_SAMPLED)) + if (likely(ext4_test_mount_flag(sb, EXT4_MF_MNTDIR_SAMPLED))) return 0; if (sb_rdonly(sb) || !sb_start_intwrite_trylock(sb)) return 0; - sbi->s_mount_flags |= EXT4_MF_MNTDIR_SAMPLED; + ext4_set_mount_flag(sb, EXT4_MF_MNTDIR_SAMPLED); /* * Sample where the filesystem has been mounted and * store it in the superblock for sysadmin convenience diff --git a/fs/ext4/fsmap.c b/fs/ext4/fsmap.c index b232c2767534..4c2a9fe30067 100644 --- a/fs/ext4/fsmap.c +++ b/fs/ext4/fsmap.c @@ -280,7 +280,7 @@ static int ext4_getfsmap_logdev(struct super_block *sb, struct ext4_fsmap *keys, /* Fabricate an rmap entry for the external log device. */ irec.fmr_physical = journal->j_blk_offset; - irec.fmr_length = journal->j_maxlen; + irec.fmr_length = journal->j_total_len; irec.fmr_owner = EXT4_FMR_OWN_LOG; irec.fmr_flags = 0; diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c index 81a545fd14a3..a42ca95840f2 100644 --- a/fs/ext4/fsync.c +++ b/fs/ext4/fsync.c @@ -143,7 +143,7 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync) if (sb_rdonly(inode->i_sb)) { /* Make sure that we read updated s_mount_flags value */ smp_rmb(); - if (sbi->s_mount_flags & EXT4_MF_FS_ABORTED) + if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FS_ABORTED)) ret = -EROFS; goto out; } diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index caa51473207d..b41512d1badc 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -1880,6 +1880,7 @@ int ext4_inline_data_truncate(struct inode *inode, int *has_inline) ext4_write_lock_xattr(inode, &no_expand); if (!ext4_has_inline_data(inode)) { + ext4_write_unlock_xattr(inode, &no_expand); *has_inline = 0; ext4_journal_stop(handle); return 0; diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index b96a18679a27..0d8385aea898 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -327,6 +327,8 @@ stop_handle: ext4_xattr_inode_array_free(ea_inode_array); return; no_delete: + if (!list_empty(&EXT4_I(inode)->i_fc_list)) + ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_NOMEM); ext4_clear_inode(inode); /* We must guarantee clearing of inode... */ } @@ -730,7 +732,7 @@ out_sem: if (ret) return ret; } - ext4_fc_track_range(inode, map->m_lblk, + ext4_fc_track_range(handle, inode, map->m_lblk, map->m_lblk + map->m_len - 1); } @@ -2440,7 +2442,7 @@ static int mpage_map_and_submit_extent(handle_t *handle, struct super_block *sb = inode->i_sb; if (ext4_forced_shutdown(EXT4_SB(sb)) || - EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED) + ext4_test_mount_flag(sb, EXT4_MF_FS_ABORTED)) goto invalidate_dirty_pages; /* * Let the uper layers retry transient errors. @@ -2674,7 +2676,7 @@ static int ext4_writepages(struct address_space *mapping, * the stack trace. */ if (unlikely(ext4_forced_shutdown(EXT4_SB(mapping->host->i_sb)) || - sbi->s_mount_flags & EXT4_MF_FS_ABORTED)) { + ext4_test_mount_flag(inode->i_sb, EXT4_MF_FS_ABORTED))) { ret = -EROFS; goto out_writepages; } @@ -3310,8 +3312,7 @@ static bool ext4_inode_datasync_dirty(struct inode *inode) EXT4_I(inode)->i_datasync_tid)) return false; if (test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT)) - return atomic_read(&EXT4_SB(inode->i_sb)->s_fc_subtid) < - EXT4_I(inode)->i_fc_committed_subtid; + return !list_empty(&EXT4_I(inode)->i_fc_list); return true; } @@ -4109,7 +4110,7 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length) up_write(&EXT4_I(inode)->i_data_sem); } - ext4_fc_track_range(inode, first_block, stop_block); + ext4_fc_track_range(handle, inode, first_block, stop_block); if (IS_SYNC(inode)) ext4_handle_sync(handle); @@ -5442,14 +5443,14 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr) } if (shrink) - ext4_fc_track_range(inode, + ext4_fc_track_range(handle, inode, (attr->ia_size > 0 ? attr->ia_size - 1 : 0) >> inode->i_sb->s_blocksize_bits, (oldsize > 0 ? oldsize - 1 : 0) >> inode->i_sb->s_blocksize_bits); else ext4_fc_track_range( - inode, + handle, inode, (oldsize > 0 ? oldsize - 1 : oldsize) >> inode->i_sb->s_blocksize_bits, (attr->ia_size > 0 ? attr->ia_size - 1 : 0) >> @@ -5699,7 +5700,7 @@ int ext4_mark_iloc_dirty(handle_t *handle, put_bh(iloc->bh); return -EIO; } - ext4_fc_track_inode(inode); + ext4_fc_track_inode(handle, inode); if (IS_I_VERSION(inode)) inode_inc_iversion(inode); diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 85abbfb98cbe..24af9ed5c3e5 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -4477,7 +4477,7 @@ static inline void ext4_mb_show_pa(struct super_block *sb) { ext4_group_t i, ngroups; - if (EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED) + if (ext4_test_mount_flag(sb, EXT4_MF_FS_ABORTED)) return; ngroups = ext4_get_groups_count(sb); @@ -4508,7 +4508,7 @@ static void ext4_mb_show_ac(struct ext4_allocation_context *ac) { struct super_block *sb = ac->ac_sb; - if (EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED) + if (ext4_test_mount_flag(sb, EXT4_MF_FS_ABORTED)) return; mb_debug(sb, "Can't allocate:" @@ -5167,7 +5167,7 @@ static ext4_fsblk_t ext4_mb_new_blocks_simple(handle_t *handle, struct super_block *sb = ar->inode->i_sb; ext4_group_t group; ext4_grpblk_t blkoff; - int i; + int i = sb->s_blocksize; ext4_fsblk_t goal, block; struct ext4_super_block *es = EXT4_SB(sb)->s_es; diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index f458d1d81d96..33509266f5a0 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -2606,7 +2606,7 @@ static int ext4_create(struct inode *dir, struct dentry *dentry, umode_t mode, bool excl) { handle_t *handle; - struct inode *inode, *inode_save; + struct inode *inode; int err, credits, retries = 0; err = dquot_initialize(dir); @@ -2624,11 +2624,9 @@ retry: inode->i_op = &ext4_file_inode_operations; inode->i_fop = &ext4_file_operations; ext4_set_aops(inode); - inode_save = inode; - ihold(inode_save); err = ext4_add_nondir(handle, dentry, &inode); - ext4_fc_track_create(inode_save, dentry); - iput(inode_save); + if (!err) + ext4_fc_track_create(handle, dentry); } if (handle) ext4_journal_stop(handle); @@ -2643,7 +2641,7 @@ static int ext4_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t rdev) { handle_t *handle; - struct inode *inode, *inode_save; + struct inode *inode; int err, credits, retries = 0; err = dquot_initialize(dir); @@ -2660,12 +2658,9 @@ retry: if (!IS_ERR(inode)) { init_special_inode(inode, inode->i_mode, rdev); inode->i_op = &ext4_special_inode_operations; - inode_save = inode; - ihold(inode_save); err = ext4_add_nondir(handle, dentry, &inode); if (!err) - ext4_fc_track_create(inode_save, dentry); - iput(inode_save); + ext4_fc_track_create(handle, dentry); } if (handle) ext4_journal_stop(handle); @@ -2829,7 +2824,6 @@ out_clear_inode: iput(inode); goto out_retry; } - ext4_fc_track_create(inode, dentry); ext4_inc_count(dir); ext4_update_dx_flag(dir); @@ -2837,6 +2831,7 @@ out_clear_inode: if (err) goto out_clear_inode; d_instantiate_new(dentry, inode); + ext4_fc_track_create(handle, dentry); if (IS_DIRSYNC(dir)) ext4_handle_sync(handle); @@ -3171,7 +3166,7 @@ static int ext4_rmdir(struct inode *dir, struct dentry *dentry) goto end_rmdir; ext4_dec_count(dir); ext4_update_dx_flag(dir); - ext4_fc_track_unlink(inode, dentry); + ext4_fc_track_unlink(handle, dentry); retval = ext4_mark_inode_dirty(handle, dir); #ifdef CONFIG_UNICODE @@ -3192,13 +3187,12 @@ end_rmdir: return retval; } -int __ext4_unlink(struct inode *dir, const struct qstr *d_name, +int __ext4_unlink(handle_t *handle, struct inode *dir, const struct qstr *d_name, struct inode *inode) { int retval = -ENOENT; struct buffer_head *bh; struct ext4_dir_entry_2 *de; - handle_t *handle = NULL; int skip_remove_dentry = 0; bh = ext4_find_entry(dir, d_name, &de, NULL); @@ -3217,14 +3211,7 @@ int __ext4_unlink(struct inode *dir, const struct qstr *d_name, if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY) skip_remove_dentry = 1; else - goto out_bh; - } - - handle = ext4_journal_start(dir, EXT4_HT_DIR, - EXT4_DATA_TRANS_BLOCKS(dir->i_sb)); - if (IS_ERR(handle)) { - retval = PTR_ERR(handle); - goto out_bh; + goto out; } if (IS_DIRSYNC(dir)) @@ -3233,12 +3220,12 @@ int __ext4_unlink(struct inode *dir, const struct qstr *d_name, if (!skip_remove_dentry) { retval = ext4_delete_entry(handle, dir, de, bh); if (retval) - goto out_handle; + goto out; dir->i_ctime = dir->i_mtime = current_time(dir); ext4_update_dx_flag(dir); retval = ext4_mark_inode_dirty(handle, dir); if (retval) - goto out_handle; + goto out; } else { retval = 0; } @@ -3252,15 +3239,14 @@ int __ext4_unlink(struct inode *dir, const struct qstr *d_name, inode->i_ctime = current_time(inode); retval = ext4_mark_inode_dirty(handle, inode); -out_handle: - ext4_journal_stop(handle); -out_bh: +out: brelse(bh); return retval; } static int ext4_unlink(struct inode *dir, struct dentry *dentry) { + handle_t *handle; int retval; if (unlikely(ext4_forced_shutdown(EXT4_SB(dir->i_sb)))) @@ -3278,9 +3264,16 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry) if (retval) goto out_trace; - retval = __ext4_unlink(dir, &dentry->d_name, d_inode(dentry)); + handle = ext4_journal_start(dir, EXT4_HT_DIR, + EXT4_DATA_TRANS_BLOCKS(dir->i_sb)); + if (IS_ERR(handle)) { + retval = PTR_ERR(handle); + goto out_trace; + } + + retval = __ext4_unlink(handle, dir, &dentry->d_name, d_inode(dentry)); if (!retval) - ext4_fc_track_unlink(d_inode(dentry), dentry); + ext4_fc_track_unlink(handle, dentry); #ifdef CONFIG_UNICODE /* VFS negative dentries are incompatible with Encoding and * Case-insensitiveness. Eventually we'll want avoid @@ -3291,6 +3284,8 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry) if (IS_CASEFOLDED(dir)) d_invalidate(dentry); #endif + if (handle) + ext4_journal_stop(handle); out_trace: trace_ext4_unlink_exit(dentry, retval); @@ -3447,7 +3442,6 @@ retry: err = ext4_add_entry(handle, dentry, inode); if (!err) { - ext4_fc_track_link(inode, dentry); err = ext4_mark_inode_dirty(handle, inode); /* this can happen only for tmpfile being * linked the first time @@ -3455,6 +3449,7 @@ retry: if (inode->i_nlink == 1) ext4_orphan_del(handle, inode); d_instantiate(dentry, inode); + ext4_fc_track_link(handle, dentry); } else { drop_nlink(inode); iput(inode); @@ -3915,9 +3910,9 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, EXT4_FC_REASON_RENAME_DIR); } else { if (new.inode) - ext4_fc_track_unlink(new.inode, new.dentry); - ext4_fc_track_link(old.inode, new.dentry); - ext4_fc_track_unlink(old.inode, old.dentry); + ext4_fc_track_unlink(handle, new.dentry); + __ext4_fc_track_link(handle, old.inode, new.dentry); + __ext4_fc_track_unlink(handle, old.inode, old.dentry); } if (new.inode) { diff --git a/fs/ext4/super.c b/fs/ext4/super.c index ef4734b40e2a..c3b864588a0b 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -686,7 +686,7 @@ static void ext4_handle_error(struct super_block *sb) if (!test_opt(sb, ERRORS_CONT)) { journal_t *journal = EXT4_SB(sb)->s_journal; - EXT4_SB(sb)->s_mount_flags |= EXT4_MF_FS_ABORTED; + ext4_set_mount_flag(sb, EXT4_MF_FS_ABORTED); if (journal) jbd2_journal_abort(journal, -EIO); } @@ -904,7 +904,7 @@ void __ext4_abort(struct super_block *sb, const char *function, va_end(args); if (sb_rdonly(sb) == 0) { - EXT4_SB(sb)->s_mount_flags |= EXT4_MF_FS_ABORTED; + ext4_set_mount_flag(sb, EXT4_MF_FS_ABORTED); if (EXT4_SB(sb)->s_journal) jbd2_journal_abort(EXT4_SB(sb)->s_journal, -EIO); @@ -1716,11 +1716,10 @@ enum { Opt_dioread_nolock, Opt_dioread_lock, Opt_discard, Opt_nodiscard, Opt_init_itable, Opt_noinit_itable, Opt_max_dir_size_kb, Opt_nojournal_checksum, Opt_nombcache, - Opt_prefetch_block_bitmaps, Opt_no_fc, + Opt_prefetch_block_bitmaps, #ifdef CONFIG_EXT4_DEBUG - Opt_fc_debug_max_replay, + Opt_fc_debug_max_replay, Opt_fc_debug_force #endif - Opt_fc_debug_force }; static const match_table_t tokens = { @@ -1807,9 +1806,8 @@ static const match_table_t tokens = { {Opt_init_itable, "init_itable=%u"}, {Opt_init_itable, "init_itable"}, {Opt_noinit_itable, "noinit_itable"}, - {Opt_no_fc, "no_fc"}, - {Opt_fc_debug_force, "fc_debug_force"}, #ifdef CONFIG_EXT4_DEBUG + {Opt_fc_debug_force, "fc_debug_force"}, {Opt_fc_debug_max_replay, "fc_debug_max_replay=%u"}, #endif {Opt_max_dir_size_kb, "max_dir_size_kb=%u"}, @@ -2027,8 +2025,8 @@ static const struct mount_opts { {Opt_noquota, (EXT4_MOUNT_QUOTA | EXT4_MOUNT_USRQUOTA | EXT4_MOUNT_GRPQUOTA | EXT4_MOUNT_PRJQUOTA), MOPT_CLEAR | MOPT_Q}, - {Opt_usrjquota, 0, MOPT_Q}, - {Opt_grpjquota, 0, MOPT_Q}, + {Opt_usrjquota, 0, MOPT_Q | MOPT_STRING}, + {Opt_grpjquota, 0, MOPT_Q | MOPT_STRING}, {Opt_offusrjquota, 0, MOPT_Q}, {Opt_offgrpjquota, 0, MOPT_Q}, {Opt_jqfmt_vfsold, QFMT_VFS_OLD, MOPT_QFMT}, @@ -2039,11 +2037,9 @@ static const struct mount_opts { {Opt_nombcache, EXT4_MOUNT_NO_MBCACHE, MOPT_SET}, {Opt_prefetch_block_bitmaps, EXT4_MOUNT_PREFETCH_BLOCK_BITMAPS, MOPT_SET}, - {Opt_no_fc, EXT4_MOUNT2_JOURNAL_FAST_COMMIT, - MOPT_CLEAR | MOPT_2 | MOPT_EXT4_ONLY}, +#ifdef CONFIG_EXT4_DEBUG {Opt_fc_debug_force, EXT4_MOUNT2_JOURNAL_FAST_COMMIT, MOPT_SET | MOPT_2 | MOPT_EXT4_ONLY}, -#ifdef CONFIG_EXT4_DEBUG {Opt_fc_debug_max_replay, 0, MOPT_GTE0}, #endif {Opt_err, 0, 0} @@ -2153,7 +2149,7 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token, ext4_msg(sb, KERN_WARNING, "Ignoring removed %s option", opt); return 1; case Opt_abort: - sbi->s_mount_flags |= EXT4_MF_FS_ABORTED; + ext4_set_mount_flag(sb, EXT4_MF_FS_ABORTED); return 1; case Opt_i_version: sb->s_flags |= SB_I_VERSION; @@ -3976,7 +3972,7 @@ int ext4_calculate_overhead(struct super_block *sb) * loaded or not */ if (sbi->s_journal && !sbi->s_journal_bdev) - overhead += EXT4_NUM_B2C(sbi, sbi->s_journal->j_maxlen); + overhead += EXT4_NUM_B2C(sbi, sbi->s_journal->j_total_len); else if (ext4_has_feature_journal(sb) && !sbi->s_journal && j_inum) { /* j_inum for internal journal is non-zero */ j_inode = ext4_get_journal_inode(sb, j_inum); @@ -4340,9 +4336,10 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) #endif if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) { - printk_once(KERN_WARNING "EXT4-fs: Warning: mounting with data=journal disables delayed allocation, dioread_nolock, and O_DIRECT support!\n"); + printk_once(KERN_WARNING "EXT4-fs: Warning: mounting with data=journal disables delayed allocation, dioread_nolock, O_DIRECT and fast_commit support!\n"); /* can't mount with both data=journal and dioread_nolock. */ clear_opt(sb, DIOREAD_NOLOCK); + clear_opt2(sb, JOURNAL_FAST_COMMIT); if (test_opt2(sb, EXPLICIT_DELALLOC)) { ext4_msg(sb, KERN_ERR, "can't mount with " "both data=journal and delalloc"); @@ -4777,8 +4774,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) INIT_LIST_HEAD(&sbi->s_fc_dentry_q[FC_Q_MAIN]); INIT_LIST_HEAD(&sbi->s_fc_dentry_q[FC_Q_STAGING]); sbi->s_fc_bytes = 0; - sbi->s_mount_flags &= ~EXT4_MF_FC_INELIGIBLE; - sbi->s_mount_flags &= ~EXT4_MF_FC_COMMITTING; + ext4_clear_mount_flag(sb, EXT4_MF_FC_INELIGIBLE); + ext4_clear_mount_flag(sb, EXT4_MF_FC_COMMITTING); spin_lock_init(&sbi->s_fc_lock); memset(&sbi->s_fc_stats, 0, sizeof(sbi->s_fc_stats)); sbi->s_fc_replay_state.fc_regions = NULL; @@ -4857,6 +4854,14 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) goto failed_mount_wq; } + if (test_opt2(sb, JOURNAL_FAST_COMMIT) && + !jbd2_journal_set_features(EXT4_SB(sb)->s_journal, 0, 0, + JBD2_FEATURE_INCOMPAT_FAST_COMMIT)) { + ext4_msg(sb, KERN_ERR, + "Failed to set fast commit journal feature"); + goto failed_mount_wq; + } + /* We have now updated the journal if required, so we can * validate the data journaling mode. */ switch (test_opt(sb, DATA_FLAGS)) { @@ -5872,7 +5877,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) goto restore_opts; } - if (sbi->s_mount_flags & EXT4_MF_FS_ABORTED) + if (ext4_test_mount_flag(sb, EXT4_MF_FS_ABORTED)) ext4_abort(sb, EXT4_ERR_ESHUTDOWN, "Abort forced by user"); sb->s_flags = (sb->s_flags & ~SB_POSIXACL) | @@ -5886,7 +5891,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) } if ((bool)(*flags & SB_RDONLY) != sb_rdonly(sb)) { - if (sbi->s_mount_flags & EXT4_MF_FS_ABORTED) { + if (ext4_test_mount_flag(sb, EXT4_MF_FS_ABORTED)) { err = -EROFS; goto restore_opts; } @@ -6560,10 +6565,6 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type, brelse(bh); out: if (inode->i_size < off + len) { - ext4_fc_track_range(inode, - (inode->i_size > 0 ? inode->i_size - 1 : 0) - >> inode->i_sb->s_blocksize_bits, - (off + len) >> inode->i_sb->s_blocksize_bits); i_size_write(inode, off + len); EXT4_I(inode)->i_disksize = inode->i_size; err2 = ext4_mark_inode_dirty(handle, inode); diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c index 263f02ad8ebf..472932b9e6bc 100644 --- a/fs/jbd2/checkpoint.c +++ b/fs/jbd2/checkpoint.c @@ -106,6 +106,8 @@ static int __try_to_free_cp_buf(struct journal_head *jh) * for a checkpoint to free up some space in the log. */ void __jbd2_log_wait_for_space(journal_t *journal) +__acquires(&journal->j_state_lock) +__releases(&journal->j_state_lock) { int nblocks, space_left; /* assert_spin_locked(&journal->j_state_lock); */ diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index fa688e163a80..b121d7d434c6 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -450,6 +450,15 @@ void jbd2_journal_commit_transaction(journal_t *journal) schedule(); write_lock(&journal->j_state_lock); finish_wait(&journal->j_fc_wait, &wait); + /* + * TODO: by blocking fast commits here, we are increasing + * fsync() latency slightly. Strictly speaking, we don't need + * to block fast commits until the transaction enters T_FLUSH + * state. So an optimization is possible where we block new fast + * commits here and wait for existing ones to complete + * just before we enter T_FLUSH. That way, the existing fast + * commits and this full commit can proceed parallely. + */ } write_unlock(&journal->j_state_lock); @@ -801,7 +810,7 @@ start_journal_io: if (first_block < journal->j_tail) freed += journal->j_last - journal->j_first; /* Update tail only if we free significant amount of space */ - if (freed < journal->j_maxlen / 4) + if (freed < jbd2_journal_get_max_txn_bufs(journal)) update_tail = 0; } J_ASSERT(commit_transaction->t_state == T_COMMIT); diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 0c7c42bd530f..0c3d5e3b24b2 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -727,6 +727,8 @@ int jbd2_log_wait_commit(journal_t *journal, tid_t tid) */ int jbd2_fc_begin_commit(journal_t *journal, tid_t tid) { + if (unlikely(is_journal_aborted(journal))) + return -EIO; /* * Fast commits only allowed if at least one full commit has * been processed. @@ -734,10 +736,12 @@ int jbd2_fc_begin_commit(journal_t *journal, tid_t tid) if (!journal->j_stats.ts_tid) return -EINVAL; - if (tid <= journal->j_commit_sequence) + write_lock(&journal->j_state_lock); + if (tid <= journal->j_commit_sequence) { + write_unlock(&journal->j_state_lock); return -EALREADY; + } - write_lock(&journal->j_state_lock); if (journal->j_flags & JBD2_FULL_COMMIT_ONGOING || (journal->j_flags & JBD2_FAST_COMMIT_ONGOING)) { DEFINE_WAIT(wait); @@ -777,13 +781,19 @@ static int __jbd2_fc_end_commit(journal_t *journal, tid_t tid, bool fallback) int jbd2_fc_end_commit(journal_t *journal) { - return __jbd2_fc_end_commit(journal, 0, 0); + return __jbd2_fc_end_commit(journal, 0, false); } EXPORT_SYMBOL(jbd2_fc_end_commit); -int jbd2_fc_end_commit_fallback(journal_t *journal, tid_t tid) +int jbd2_fc_end_commit_fallback(journal_t *journal) { - return __jbd2_fc_end_commit(journal, tid, 1); + tid_t tid; + + read_lock(&journal->j_state_lock); + tid = journal->j_running_transaction ? + journal->j_running_transaction->t_tid : 0; + read_unlock(&journal->j_state_lock); + return __jbd2_fc_end_commit(journal, tid, true); } EXPORT_SYMBOL(jbd2_fc_end_commit_fallback); @@ -865,7 +875,6 @@ int jbd2_fc_get_buf(journal_t *journal, struct buffer_head **bh_out) int fc_off; *bh_out = NULL; - write_lock(&journal->j_state_lock); if (journal->j_fc_off + journal->j_fc_first < journal->j_fc_last) { fc_off = journal->j_fc_off; @@ -874,7 +883,6 @@ int jbd2_fc_get_buf(journal_t *journal, struct buffer_head **bh_out) } else { ret = -EINVAL; } - write_unlock(&journal->j_state_lock); if (ret) return ret; @@ -887,11 +895,7 @@ int jbd2_fc_get_buf(journal_t *journal, struct buffer_head **bh_out) if (!bh) return -ENOMEM; - lock_buffer(bh); - clear_buffer_uptodate(bh); - set_buffer_dirty(bh); - unlock_buffer(bh); journal->j_fc_wbuf[fc_off] = bh; *bh_out = bh; @@ -909,9 +913,7 @@ int jbd2_fc_wait_bufs(journal_t *journal, int num_blks) struct buffer_head *bh; int i, j_fc_off; - read_lock(&journal->j_state_lock); j_fc_off = journal->j_fc_off; - read_unlock(&journal->j_state_lock); /* * Wait in reverse order to minimize chances of us being woken up before @@ -939,9 +941,7 @@ int jbd2_fc_release_bufs(journal_t *journal) struct buffer_head *bh; int i, j_fc_off; - read_lock(&journal->j_state_lock); j_fc_off = journal->j_fc_off; - read_unlock(&journal->j_state_lock); /* * Wait in reverse order to minimize chances of us being woken up before @@ -1348,23 +1348,16 @@ static journal_t *journal_init_common(struct block_device *bdev, journal->j_dev = bdev; journal->j_fs_dev = fs_dev; journal->j_blk_offset = start; - journal->j_maxlen = len; + journal->j_total_len = len; /* We need enough buffers to write out full descriptor block. */ n = journal->j_blocksize / jbd2_min_tag_size(); journal->j_wbufsize = n; + journal->j_fc_wbuf = NULL; journal->j_wbuf = kmalloc_array(n, sizeof(struct buffer_head *), GFP_KERNEL); if (!journal->j_wbuf) goto err_cleanup; - if (journal->j_fc_wbufsize > 0) { - journal->j_fc_wbuf = kmalloc_array(journal->j_fc_wbufsize, - sizeof(struct buffer_head *), - GFP_KERNEL); - if (!journal->j_fc_wbuf) - goto err_cleanup; - } - bh = getblk_unmovable(journal->j_dev, start, journal->j_blocksize); if (!bh) { pr_err("%s: Cannot get buffer for journal superblock\n", @@ -1378,23 +1371,11 @@ static journal_t *journal_init_common(struct block_device *bdev, err_cleanup: kfree(journal->j_wbuf); - kfree(journal->j_fc_wbuf); jbd2_journal_destroy_revoke(journal); kfree(journal); return NULL; } -int jbd2_fc_init(journal_t *journal, int num_fc_blks) -{ - journal->j_fc_wbufsize = num_fc_blks; - journal->j_fc_wbuf = kmalloc_array(journal->j_fc_wbufsize, - sizeof(struct buffer_head *), GFP_KERNEL); - if (!journal->j_fc_wbuf) - return -ENOMEM; - return 0; -} -EXPORT_SYMBOL(jbd2_fc_init); - /* jbd2_journal_init_dev and jbd2_journal_init_inode: * * Create a journal structure assigned some fixed set of disk blocks to @@ -1512,16 +1493,7 @@ static int journal_reset(journal_t *journal) } journal->j_first = first; - - if (jbd2_has_feature_fast_commit(journal) && - journal->j_fc_wbufsize > 0) { - journal->j_fc_last = last; - journal->j_last = last - journal->j_fc_wbufsize; - journal->j_fc_first = journal->j_last + 1; - journal->j_fc_off = 0; - } else { - journal->j_last = last; - } + journal->j_last = last; journal->j_head = journal->j_first; journal->j_tail = journal->j_first; @@ -1531,7 +1503,14 @@ static int journal_reset(journal_t *journal) journal->j_commit_sequence = journal->j_transaction_sequence - 1; journal->j_commit_request = journal->j_commit_sequence; - journal->j_max_transaction_buffers = journal->j_maxlen / 4; + journal->j_max_transaction_buffers = jbd2_journal_get_max_txn_bufs(journal); + + /* + * Now that journal recovery is done, turn fast commits off here. This + * way, if fast commit was enabled before the crash but if now FS has + * disabled it, we don't enable fast commits. + */ + jbd2_clear_feature_fast_commit(journal); /* * As a special case, if the on-disk copy is already marked as needing @@ -1792,15 +1771,15 @@ static int journal_get_superblock(journal_t *journal) goto out; } - if (be32_to_cpu(sb->s_maxlen) < journal->j_maxlen) - journal->j_maxlen = be32_to_cpu(sb->s_maxlen); - else if (be32_to_cpu(sb->s_maxlen) > journal->j_maxlen) { + if (be32_to_cpu(sb->s_maxlen) < journal->j_total_len) + journal->j_total_len = be32_to_cpu(sb->s_maxlen); + else if (be32_to_cpu(sb->s_maxlen) > journal->j_total_len) { printk(KERN_WARNING "JBD2: journal file too short\n"); goto out; } if (be32_to_cpu(sb->s_first) == 0 || - be32_to_cpu(sb->s_first) >= journal->j_maxlen) { + be32_to_cpu(sb->s_first) >= journal->j_total_len) { printk(KERN_WARNING "JBD2: Invalid start block of journal: %u\n", be32_to_cpu(sb->s_first)); @@ -1872,6 +1851,7 @@ static int load_superblock(journal_t *journal) { int err; journal_superblock_t *sb; + int num_fc_blocks; err = journal_get_superblock(journal); if (err) @@ -1883,15 +1863,17 @@ static int load_superblock(journal_t *journal) journal->j_tail = be32_to_cpu(sb->s_start); journal->j_first = be32_to_cpu(sb->s_first); journal->j_errno = be32_to_cpu(sb->s_errno); + journal->j_last = be32_to_cpu(sb->s_maxlen); - if (jbd2_has_feature_fast_commit(journal) && - journal->j_fc_wbufsize > 0) { + if (jbd2_has_feature_fast_commit(journal)) { journal->j_fc_last = be32_to_cpu(sb->s_maxlen); - journal->j_last = journal->j_fc_last - journal->j_fc_wbufsize; + num_fc_blocks = be32_to_cpu(sb->s_num_fc_blks); + if (!num_fc_blocks) + num_fc_blocks = JBD2_MIN_FC_BLOCKS; + if (journal->j_last - num_fc_blocks >= JBD2_MIN_JOURNAL_BLOCKS) + journal->j_last = journal->j_fc_last - num_fc_blocks; journal->j_fc_first = journal->j_last + 1; journal->j_fc_off = 0; - } else { - journal->j_last = be32_to_cpu(sb->s_maxlen); } return 0; @@ -1954,9 +1936,6 @@ int jbd2_journal_load(journal_t *journal) */ journal->j_flags &= ~JBD2_ABORT; - if (journal->j_fc_wbufsize > 0) - jbd2_journal_set_features(journal, 0, 0, - JBD2_FEATURE_INCOMPAT_FAST_COMMIT); /* OK, we've finished with the dynamic journal bits: * reinitialise the dynamic contents of the superblock in memory * and reset them on disk. */ @@ -2040,8 +2019,7 @@ int jbd2_journal_destroy(journal_t *journal) jbd2_journal_destroy_revoke(journal); if (journal->j_chksum_driver) crypto_free_shash(journal->j_chksum_driver); - if (journal->j_fc_wbufsize > 0) - kfree(journal->j_fc_wbuf); + kfree(journal->j_fc_wbuf); kfree(journal->j_wbuf); kfree(journal); @@ -2116,6 +2094,37 @@ int jbd2_journal_check_available_features(journal_t *journal, unsigned long comp return 0; } +static int +jbd2_journal_initialize_fast_commit(journal_t *journal) +{ + journal_superblock_t *sb = journal->j_superblock; + unsigned long long num_fc_blks; + + num_fc_blks = be32_to_cpu(sb->s_num_fc_blks); + if (num_fc_blks == 0) + num_fc_blks = JBD2_MIN_FC_BLOCKS; + if (journal->j_last - num_fc_blks < JBD2_MIN_JOURNAL_BLOCKS) + return -ENOSPC; + + /* Are we called twice? */ + WARN_ON(journal->j_fc_wbuf != NULL); + journal->j_fc_wbuf = kmalloc_array(num_fc_blks, + sizeof(struct buffer_head *), GFP_KERNEL); + if (!journal->j_fc_wbuf) + return -ENOMEM; + + journal->j_fc_wbufsize = num_fc_blks; + journal->j_fc_last = journal->j_last; + journal->j_last = journal->j_fc_last - num_fc_blks; + journal->j_fc_first = journal->j_last + 1; + journal->j_fc_off = 0; + journal->j_free = journal->j_last - journal->j_first; + journal->j_max_transaction_buffers = + jbd2_journal_get_max_txn_bufs(journal); + + return 0; +} + /** * int jbd2_journal_set_features() - Mark a given journal feature in the superblock * @journal: Journal to act on. @@ -2159,6 +2168,13 @@ int jbd2_journal_set_features(journal_t *journal, unsigned long compat, sb = journal->j_superblock; + if (incompat & JBD2_FEATURE_INCOMPAT_FAST_COMMIT) { + if (jbd2_journal_initialize_fast_commit(journal)) { + pr_err("JBD2: Cannot enable fast commits.\n"); + return 0; + } + } + /* Load the checksum driver if necessary */ if ((journal->j_chksum_driver == NULL) && INCOMPAT_FEATURE_ON(JBD2_FEATURE_INCOMPAT_CSUM_V3)) { diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c index eb2606133cd8..dc0694fcfcd1 100644 --- a/fs/jbd2/recovery.c +++ b/fs/jbd2/recovery.c @@ -74,8 +74,8 @@ static int do_readahead(journal_t *journal, unsigned int start) /* Do up to 128K of readahead */ max = start + (128 * 1024 / journal->j_blocksize); - if (max > journal->j_maxlen) - max = journal->j_maxlen; + if (max > journal->j_total_len) + max = journal->j_total_len; /* Do the readahead itself. We'll submit MAXBUF buffer_heads at * a time to the block device IO layer. */ @@ -134,7 +134,7 @@ static int jread(struct buffer_head **bhp, journal_t *journal, *bhp = NULL; - if (offset >= journal->j_maxlen) { + if (offset >= journal->j_total_len) { printk(KERN_ERR "JBD2: corrupted journal superblock\n"); return -EFSCORRUPTED; } diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index 43985738aa86..d54f04674e8e 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -195,8 +195,10 @@ static void wait_transaction_switching(journal_t *journal) DEFINE_WAIT(wait); if (WARN_ON(!journal->j_running_transaction || - journal->j_running_transaction->t_state != T_SWITCH)) + journal->j_running_transaction->t_state != T_SWITCH)) { + read_unlock(&journal->j_state_lock); return; + } prepare_to_wait(&journal->j_wait_transaction_locked, &wait, TASK_UNINTERRUPTIBLE); read_unlock(&journal->j_state_lock); diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c index 14468613d150..a633044b0dc1 100644 --- a/fs/nfsd/nfs3proc.c +++ b/fs/nfsd/nfs3proc.c @@ -316,10 +316,6 @@ nfsd3_proc_mknod(struct svc_rqst *rqstp) fh_copy(&resp->dirfh, &argp->fh); fh_init(&resp->fh, NFS3_FHSIZE); - if (argp->ftype == 0 || argp->ftype >= NF3BAD) { - resp->status = nfserr_inval; - goto out; - } if (argp->ftype == NF3CHR || argp->ftype == NF3BLK) { rdev = MKDEV(argp->major, argp->minor); if (MAJOR(rdev) != argp->major || @@ -328,7 +324,7 @@ nfsd3_proc_mknod(struct svc_rqst *rqstp) goto out; } } else if (argp->ftype != NF3SOCK && argp->ftype != NF3FIFO) { - resp->status = nfserr_inval; + resp->status = nfserr_badtype; goto out; } diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c index 9c23b6acf234..2277f83da250 100644 --- a/fs/nfsd/nfs3xdr.c +++ b/fs/nfsd/nfs3xdr.c @@ -1114,6 +1114,7 @@ nfs3svc_encode_pathconfres(struct svc_rqst *rqstp, __be32 *p) { struct nfsd3_pathconfres *resp = rqstp->rq_resp; + *p++ = resp->status; *p++ = xdr_zero; /* no post_op_attr */ if (resp->status == 0) { diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index ad2fa1a8e7ad..e83b21778816 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -1299,7 +1299,7 @@ nfsd4_cleanup_inter_ssc(struct vfsmount *ss_mnt, struct nfsd_file *src, struct nfsd_file *dst) { nfs42_ssc_close(src->nf_file); - nfsd_file_put(src); + /* 'src' is freed by nfsd4_do_async_copy */ nfsd_file_put(dst); mntput(ss_mnt); } @@ -1486,6 +1486,7 @@ do_callback: cb_copy = kzalloc(sizeof(struct nfsd4_copy), GFP_KERNEL); if (!cb_copy) goto out; + refcount_set(&cb_copy->refcount, 1); memcpy(&cb_copy->cp_res, ©->cp_res, sizeof(copy->cp_res)); cb_copy->cp_clp = copy->cp_clp; cb_copy->nfserr = copy->nfserr; diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index b9a9d69dde7e..db52e843002a 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -877,7 +877,7 @@ int ocfs2_journal_init(struct ocfs2_journal *journal, int *dirty) goto done; } - trace_ocfs2_journal_init_maxlen(j_journal->j_maxlen); + trace_ocfs2_journal_init_maxlen(j_journal->j_total_len); *dirty = (le32_to_cpu(di->id1.journal1.ij_flags) & OCFS2_JOURNAL_DIRTY_FL); diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index 1d5566af48ac..1c49fd62ff2e 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -68,6 +68,7 @@ extern void *jbd2_alloc(size_t size, gfp_t flags); extern void jbd2_free(void *ptr, size_t size); #define JBD2_MIN_JOURNAL_BLOCKS 1024 +#define JBD2_MIN_FC_BLOCKS 256 #ifdef __KERNEL__ @@ -944,8 +945,9 @@ struct journal_s /** * @j_fc_off: * - * Number of fast commit blocks currently allocated. - * [j_state_lock]. + * Number of fast commit blocks currently allocated. Accessed only + * during fast commit. Currently only process can do fast commit, so + * this field is not protected by any lock. */ unsigned long j_fc_off; @@ -988,9 +990,9 @@ struct journal_s struct block_device *j_fs_dev; /** - * @j_maxlen: Total maximum capacity of the journal region on disk. + * @j_total_len: Total maximum capacity of the journal region on disk. */ - unsigned int j_maxlen; + unsigned int j_total_len; /** * @j_reserved_credits: @@ -1108,8 +1110,9 @@ struct journal_s struct buffer_head **j_wbuf; /** - * @j_fc_wbuf: Array of fast commit bhs for - * jbd2_journal_commit_transaction. + * @j_fc_wbuf: Array of fast commit bhs for fast commit. Accessed only + * during a fast commit. Currently only process can do fast commit, so + * this field is not protected by any lock. */ struct buffer_head **j_fc_wbuf; @@ -1614,16 +1617,20 @@ extern void __jbd2_journal_drop_transaction(journal_t *, transaction_t *); extern int jbd2_cleanup_journal_tail(journal_t *); /* Fast commit related APIs */ -int jbd2_fc_init(journal_t *journal, int num_fc_blks); int jbd2_fc_begin_commit(journal_t *journal, tid_t tid); int jbd2_fc_end_commit(journal_t *journal); -int jbd2_fc_end_commit_fallback(journal_t *journal, tid_t tid); +int jbd2_fc_end_commit_fallback(journal_t *journal); int jbd2_fc_get_buf(journal_t *journal, struct buffer_head **bh_out); int jbd2_submit_inode_data(struct jbd2_inode *jinode); int jbd2_wait_inode_data(journal_t *journal, struct jbd2_inode *jinode); int jbd2_fc_wait_bufs(journal_t *journal, int num_blks); int jbd2_fc_release_bufs(journal_t *journal); +static inline int jbd2_journal_get_max_txn_bufs(journal_t *journal) +{ + return (journal->j_total_len - journal->j_fc_wbufsize) / 4; +} + /* * is_journal_abort * diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h index 513913ff7486..3bb72266a75a 100644 --- a/include/linux/swiotlb.h +++ b/include/linux/swiotlb.h @@ -45,13 +45,9 @@ enum dma_sync_target { SYNC_FOR_DEVICE = 1, }; -extern phys_addr_t swiotlb_tbl_map_single(struct device *hwdev, - dma_addr_t tbl_dma_addr, - phys_addr_t phys, - size_t mapping_size, - size_t alloc_size, - enum dma_data_direction dir, - unsigned long attrs); +phys_addr_t swiotlb_tbl_map_single(struct device *hwdev, phys_addr_t phys, + size_t mapping_size, size_t alloc_size, + enum dma_data_direction dir, unsigned long attrs); extern void swiotlb_tbl_unmap_single(struct device *hwdev, phys_addr_t tlb_addr, diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h index b14314fcf732..70ae5497b73a 100644 --- a/include/trace/events/ext4.h +++ b/include/trace/events/ext4.h @@ -100,11 +100,12 @@ TRACE_DEFINE_ENUM(ES_REFERENCED_B); { EXT4_FC_REASON_XATTR, "XATTR"}, \ { EXT4_FC_REASON_CROSS_RENAME, "CROSS_RENAME"}, \ { EXT4_FC_REASON_JOURNAL_FLAG_CHANGE, "JOURNAL_FLAG_CHANGE"}, \ - { EXT4_FC_REASON_MEM, "NO_MEM"}, \ + { EXT4_FC_REASON_NOMEM, "NO_MEM"}, \ { EXT4_FC_REASON_SWAP_BOOT, "SWAP_BOOT"}, \ { EXT4_FC_REASON_RESIZE, "RESIZE"}, \ { EXT4_FC_REASON_RENAME_DIR, "RENAME_DIR"}, \ - { EXT4_FC_REASON_FALLOC_RANGE, "FALLOC_RANGE"}) + { EXT4_FC_REASON_FALLOC_RANGE, "FALLOC_RANGE"}, \ + { EXT4_FC_REASON_INODE_JOURNAL_DATA, "INODE_JOURNAL_DATA"}) TRACE_EVENT(ext4_other_inode_update_time, TP_PROTO(struct inode *inode, ino_t orig_ino), @@ -2917,17 +2918,18 @@ TRACE_EVENT(ext4_fc_stats, ), TP_printk("dev %d:%d fc ineligible reasons:\n" - "%s:%d, %s:%d, %s:%d, %s:%d, %s:%d, %s:%d, %s:%d, %s,%d; " + "%s:%d, %s:%d, %s:%d, %s:%d, %s:%d, %s:%d, %s:%d, %s:%d, %s:%d; " "num_commits:%ld, ineligible: %ld, numblks: %ld", MAJOR(__entry->dev), MINOR(__entry->dev), FC_REASON_NAME_STAT(EXT4_FC_REASON_XATTR), FC_REASON_NAME_STAT(EXT4_FC_REASON_CROSS_RENAME), FC_REASON_NAME_STAT(EXT4_FC_REASON_JOURNAL_FLAG_CHANGE), - FC_REASON_NAME_STAT(EXT4_FC_REASON_MEM), + FC_REASON_NAME_STAT(EXT4_FC_REASON_NOMEM), FC_REASON_NAME_STAT(EXT4_FC_REASON_SWAP_BOOT), FC_REASON_NAME_STAT(EXT4_FC_REASON_RESIZE), FC_REASON_NAME_STAT(EXT4_FC_REASON_RENAME_DIR), FC_REASON_NAME_STAT(EXT4_FC_REASON_FALLOC_RANGE), + FC_REASON_NAME_STAT(EXT4_FC_REASON_INODE_JOURNAL_DATA), __entry->sbi->s_fc_stats.fc_num_commits, __entry->sbi->s_fc_stats.fc_ineligible_commits, __entry->sbi->s_fc_stats.fc_numblks) diff --git a/include/trace/events/sunrpc.h b/include/trace/events/sunrpc.h index f45b3c01370c..2477014e3fa6 100644 --- a/include/trace/events/sunrpc.h +++ b/include/trace/events/sunrpc.h @@ -655,10 +655,10 @@ TRACE_EVENT(rpc_xdr_overflow, __field(size_t, tail_len) __field(unsigned int, page_len) __field(unsigned int, len) - __string(progname, - xdr->rqst->rq_task->tk_client->cl_program->name) - __string(procedure, - xdr->rqst->rq_task->tk_msg.rpc_proc->p_name) + __string(progname, xdr->rqst ? + xdr->rqst->rq_task->tk_client->cl_program->name : "unknown") + __string(procedure, xdr->rqst ? + xdr->rqst->rq_task->tk_msg.rpc_proc->p_name : "unknown") ), TP_fast_assign( diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index b4eea0abc3f0..781b9dca197c 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -229,6 +229,7 @@ int __init swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, int verbose) io_tlb_orig_addr[i] = INVALID_PHYS_ADDR; } io_tlb_index = 0; + no_iotlb_memory = false; if (verbose) swiotlb_print_info(); @@ -260,9 +261,11 @@ swiotlb_init(int verbose) if (vstart && !swiotlb_init_with_tbl(vstart, io_tlb_nslabs, verbose)) return; - if (io_tlb_start) + if (io_tlb_start) { memblock_free_early(io_tlb_start, PAGE_ALIGN(io_tlb_nslabs << IO_TLB_SHIFT)); + io_tlb_start = 0; + } pr_warn("Cannot allocate buffer"); no_iotlb_memory = true; } @@ -360,6 +363,7 @@ swiotlb_late_init_with_tbl(char *tlb, unsigned long nslabs) io_tlb_orig_addr[i] = INVALID_PHYS_ADDR; } io_tlb_index = 0; + no_iotlb_memory = false; swiotlb_print_info(); @@ -441,14 +445,11 @@ static void swiotlb_bounce(phys_addr_t orig_addr, phys_addr_t tlb_addr, } } -phys_addr_t swiotlb_tbl_map_single(struct device *hwdev, - dma_addr_t tbl_dma_addr, - phys_addr_t orig_addr, - size_t mapping_size, - size_t alloc_size, - enum dma_data_direction dir, - unsigned long attrs) +phys_addr_t swiotlb_tbl_map_single(struct device *hwdev, phys_addr_t orig_addr, + size_t mapping_size, size_t alloc_size, + enum dma_data_direction dir, unsigned long attrs) { + dma_addr_t tbl_dma_addr = phys_to_dma_unencrypted(hwdev, io_tlb_start); unsigned long flags; phys_addr_t tlb_addr; unsigned int nslots, stride, index, wrap; @@ -667,9 +668,8 @@ dma_addr_t swiotlb_map(struct device *dev, phys_addr_t paddr, size_t size, trace_swiotlb_bounced(dev, phys_to_dma(dev, paddr), size, swiotlb_force); - swiotlb_addr = swiotlb_tbl_map_single(dev, - phys_to_dma_unencrypted(dev, io_tlb_start), - paddr, size, size, dir, attrs); + swiotlb_addr = swiotlb_tbl_map_single(dev, paddr, size, size, dir, + attrs); if (swiotlb_addr == (phys_addr_t)DMA_MAPPING_ERROR) return DMA_MAPPING_ERROR; diff --git a/kernel/exit.c b/kernel/exit.c index 87a2d515de0d..1f236ed375f8 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -454,7 +454,10 @@ static void exit_mm(void) mmap_read_unlock(mm); self.task = current; - self.next = xchg(&core_state->dumper.next, &self); + if (self.task->flags & PF_SIGNALED) + self.next = xchg(&core_state->dumper.next, &self); + else + self.task = NULL; /* * Implies mb(), the result of xchg() must be visible * to core_state->dumper. diff --git a/net/sunrpc/sysctl.c b/net/sunrpc/sysctl.c index a18b36b5422d..3aad6ef18504 100644 --- a/net/sunrpc/sysctl.c +++ b/net/sunrpc/sysctl.c @@ -63,19 +63,20 @@ static int proc_do_xprt(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { char tmpbuf[256]; - size_t len; + ssize_t len; - if ((*ppos && !write) || !*lenp) { + if (write || *ppos) { *lenp = 0; return 0; } len = svc_print_xprts(tmpbuf, sizeof(tmpbuf)); - *lenp = memory_read_from_buffer(buffer, *lenp, ppos, tmpbuf, len); + len = memory_read_from_buffer(buffer, *lenp, ppos, tmpbuf, len); - if (*lenp < 0) { + if (len < 0) { *lenp = 0; return -EINVAL; } + *lenp = len; return 0; } diff --git a/tools/power/x86/turbostat/Makefile b/tools/power/x86/turbostat/Makefile index 2b6551269e43..f3e3c94ab9bd 100644 --- a/tools/power/x86/turbostat/Makefile +++ b/tools/power/x86/turbostat/Makefile @@ -12,11 +12,12 @@ turbostat : turbostat.c override CFLAGS += -O2 -Wall -I../../../include override CFLAGS += -DMSRHEADER='"../../../../arch/x86/include/asm/msr-index.h"' override CFLAGS += -DINTEL_FAMILY_HEADER='"../../../../arch/x86/include/asm/intel-family.h"' +override CFLAGS += -D_FILE_OFFSET_BITS=64 override CFLAGS += -D_FORTIFY_SOURCE=2 %: %.c @mkdir -p $(BUILD_OUTPUT) - $(CC) $(CFLAGS) $< -o $(BUILD_OUTPUT)/$@ $(LDFLAGS) -lcap + $(CC) $(CFLAGS) $< -o $(BUILD_OUTPUT)/$@ $(LDFLAGS) -lcap -lrt .PHONY : clean clean : diff --git a/tools/power/x86/turbostat/turbostat.8 b/tools/power/x86/turbostat/turbostat.8 index a6db83a88e85..f6b7e85b121c 100644 --- a/tools/power/x86/turbostat/turbostat.8 +++ b/tools/power/x86/turbostat/turbostat.8 @@ -335,7 +335,7 @@ that they count at TSC rate, which is true on all processors tested to date. .SH REFERENCES Volume 3B: System Programming Guide" -http://www.intel.com/products/processor/manuals/ +https://www.intel.com/products/processor/manuals/ .SH FILES .ta diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 33b370865d16..f3a1746f7f45 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -79,6 +79,7 @@ unsigned long long gfx_cur_rc6_ms; unsigned long long cpuidle_cur_cpu_lpi_us; unsigned long long cpuidle_cur_sys_lpi_us; unsigned int gfx_cur_mhz; +unsigned int gfx_act_mhz; unsigned int tcc_activation_temp; unsigned int tcc_activation_temp_override; double rapl_power_units, rapl_time_units; @@ -210,13 +211,14 @@ struct pkg_data { unsigned long long pkg_both_core_gfxe_c0; long long gfx_rc6_ms; unsigned int gfx_mhz; + unsigned int gfx_act_mhz; unsigned int package_id; - unsigned int energy_pkg; /* MSR_PKG_ENERGY_STATUS */ - unsigned int energy_dram; /* MSR_DRAM_ENERGY_STATUS */ - unsigned int energy_cores; /* MSR_PP0_ENERGY_STATUS */ - unsigned int energy_gfx; /* MSR_PP1_ENERGY_STATUS */ - unsigned int rapl_pkg_perf_status; /* MSR_PKG_PERF_STATUS */ - unsigned int rapl_dram_perf_status; /* MSR_DRAM_PERF_STATUS */ + unsigned long long energy_pkg; /* MSR_PKG_ENERGY_STATUS */ + unsigned long long energy_dram; /* MSR_DRAM_ENERGY_STATUS */ + unsigned long long energy_cores; /* MSR_PP0_ENERGY_STATUS */ + unsigned long long energy_gfx; /* MSR_PP1_ENERGY_STATUS */ + unsigned long long rapl_pkg_perf_status; /* MSR_PKG_PERF_STATUS */ + unsigned long long rapl_dram_perf_status; /* MSR_DRAM_PERF_STATUS */ unsigned int pkg_temp_c; unsigned long long counter[MAX_ADDED_COUNTERS]; } *package_even, *package_odd; @@ -259,6 +261,113 @@ struct msr_counter { #define SYSFS_PERCPU (1 << 1) }; +/* + * The accumulated sum of MSR is defined as a monotonic + * increasing MSR, it will be accumulated periodically, + * despite its register's bit width. + */ +enum { + IDX_PKG_ENERGY, + IDX_DRAM_ENERGY, + IDX_PP0_ENERGY, + IDX_PP1_ENERGY, + IDX_PKG_PERF, + IDX_DRAM_PERF, + IDX_COUNT, +}; + +int get_msr_sum(int cpu, off_t offset, unsigned long long *msr); + +struct msr_sum_array { + /* get_msr_sum() = sum + (get_msr() - last) */ + struct { + /*The accumulated MSR value is updated by the timer*/ + unsigned long long sum; + /*The MSR footprint recorded in last timer*/ + unsigned long long last; + } entries[IDX_COUNT]; +}; + +/* The percpu MSR sum array.*/ +struct msr_sum_array *per_cpu_msr_sum; + +int idx_to_offset(int idx) +{ + int offset; + + switch (idx) { + case IDX_PKG_ENERGY: + offset = MSR_PKG_ENERGY_STATUS; + break; + case IDX_DRAM_ENERGY: + offset = MSR_DRAM_ENERGY_STATUS; + break; + case IDX_PP0_ENERGY: + offset = MSR_PP0_ENERGY_STATUS; + break; + case IDX_PP1_ENERGY: + offset = MSR_PP1_ENERGY_STATUS; + break; + case IDX_PKG_PERF: + offset = MSR_PKG_PERF_STATUS; + break; + case IDX_DRAM_PERF: + offset = MSR_DRAM_PERF_STATUS; + break; + default: + offset = -1; + } + return offset; +} + +int offset_to_idx(int offset) +{ + int idx; + + switch (offset) { + case MSR_PKG_ENERGY_STATUS: + idx = IDX_PKG_ENERGY; + break; + case MSR_DRAM_ENERGY_STATUS: + idx = IDX_DRAM_ENERGY; + break; + case MSR_PP0_ENERGY_STATUS: + idx = IDX_PP0_ENERGY; + break; + case MSR_PP1_ENERGY_STATUS: + idx = IDX_PP1_ENERGY; + break; + case MSR_PKG_PERF_STATUS: + idx = IDX_PKG_PERF; + break; + case MSR_DRAM_PERF_STATUS: + idx = IDX_DRAM_PERF; + break; + default: + idx = -1; + } + return idx; +} + +int idx_valid(int idx) +{ + switch (idx) { + case IDX_PKG_ENERGY: + return do_rapl & RAPL_PKG; + case IDX_DRAM_ENERGY: + return do_rapl & RAPL_DRAM; + case IDX_PP0_ENERGY: + return do_rapl & RAPL_CORES_ENERGY_STATUS; + case IDX_PP1_ENERGY: + return do_rapl & RAPL_GFX; + case IDX_PKG_PERF: + return do_rapl & RAPL_PKG_PERF_STATUS; + case IDX_DRAM_PERF: + return do_rapl & RAPL_DRAM_PERF_STATUS; + default: + return 0; + } +} struct sys_counters { unsigned int added_thread_counters; unsigned int added_core_counters; @@ -451,6 +560,7 @@ struct msr_counter bic[] = { { 0x0, "APIC" }, { 0x0, "X2APIC" }, { 0x0, "Die" }, + { 0x0, "GFXAMHz" }, }; #define MAX_BIC (sizeof(bic) / sizeof(struct msr_counter)) @@ -505,6 +615,7 @@ struct msr_counter bic[] = { #define BIC_APIC (1ULL << 48) #define BIC_X2APIC (1ULL << 49) #define BIC_Die (1ULL << 50) +#define BIC_GFXACTMHz (1ULL << 51) #define BIC_DISABLED_BY_DEFAULT (BIC_USEC | BIC_TOD | BIC_APIC | BIC_X2APIC) @@ -724,6 +835,9 @@ void print_header(char *delim) if (DO_BIC(BIC_GFXMHz)) outp += sprintf(outp, "%sGFXMHz", (printed++ ? delim : "")); + if (DO_BIC(BIC_GFXACTMHz)) + outp += sprintf(outp, "%sGFXAMHz", (printed++ ? delim : "")); + if (DO_BIC(BIC_Totl_c0)) outp += sprintf(outp, "%sTotl%%C0", (printed++ ? delim : "")); if (DO_BIC(BIC_Any_c0)) @@ -858,13 +972,13 @@ int dump_counters(struct thread_data *t, struct core_data *c, outp += sprintf(outp, "pc10: %016llX\n", p->pc10); outp += sprintf(outp, "cpu_lpi: %016llX\n", p->cpu_lpi); outp += sprintf(outp, "sys_lpi: %016llX\n", p->sys_lpi); - outp += sprintf(outp, "Joules PKG: %0X\n", p->energy_pkg); - outp += sprintf(outp, "Joules COR: %0X\n", p->energy_cores); - outp += sprintf(outp, "Joules GFX: %0X\n", p->energy_gfx); - outp += sprintf(outp, "Joules RAM: %0X\n", p->energy_dram); - outp += sprintf(outp, "Throttle PKG: %0X\n", + outp += sprintf(outp, "Joules PKG: %0llX\n", p->energy_pkg); + outp += sprintf(outp, "Joules COR: %0llX\n", p->energy_cores); + outp += sprintf(outp, "Joules GFX: %0llX\n", p->energy_gfx); + outp += sprintf(outp, "Joules RAM: %0llX\n", p->energy_dram); + outp += sprintf(outp, "Throttle PKG: %0llX\n", p->rapl_pkg_perf_status); - outp += sprintf(outp, "Throttle RAM: %0X\n", + outp += sprintf(outp, "Throttle RAM: %0llX\n", p->rapl_dram_perf_status); outp += sprintf(outp, "PTM: %dC\n", p->pkg_temp_c); @@ -1062,14 +1176,7 @@ int format_counters(struct thread_data *t, struct core_data *c, } } - /* - * If measurement interval exceeds minimum RAPL Joule Counter range, - * indicate that results are suspect by printing "**" in fraction place. - */ - if (interval_float < rapl_joule_counter_range) - fmt8 = "%s%.2f"; - else - fmt8 = "%6.0f**"; + fmt8 = "%s%.2f"; if (DO_BIC(BIC_CorWatt) && (do_rapl & RAPL_PER_CORE_ENERGY)) outp += sprintf(outp, fmt8, (printed++ ? delim : ""), c->core_energy * rapl_energy_units / interval_float); @@ -1098,6 +1205,10 @@ int format_counters(struct thread_data *t, struct core_data *c, if (DO_BIC(BIC_GFXMHz)) outp += sprintf(outp, "%s%d", (printed++ ? delim : ""), p->gfx_mhz); + /* GFXACTMHz */ + if (DO_BIC(BIC_GFXACTMHz)) + outp += sprintf(outp, "%s%d", (printed++ ? delim : ""), p->gfx_act_mhz); + /* Totl%C0, Any%C0 GFX%C0 CPUGFX% */ if (DO_BIC(BIC_Totl_c0)) outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * p->pkg_wtd_core_c0/tsc); @@ -1210,11 +1321,7 @@ void format_all_counters(struct thread_data *t, struct core_data *c, struct pkg_ } #define DELTA_WRAP32(new, old) \ - if (new > old) { \ - old = new - old; \ - } else { \ - old = 0x100000000 + new - old; \ - } + old = ((((unsigned long long)new << 32) - ((unsigned long long)old << 32)) >> 32); int delta_package(struct pkg_data *new, struct pkg_data *old) @@ -1253,13 +1360,14 @@ delta_package(struct pkg_data *new, struct pkg_data *old) old->gfx_rc6_ms = new->gfx_rc6_ms - old->gfx_rc6_ms; old->gfx_mhz = new->gfx_mhz; + old->gfx_act_mhz = new->gfx_act_mhz; - DELTA_WRAP32(new->energy_pkg, old->energy_pkg); - DELTA_WRAP32(new->energy_cores, old->energy_cores); - DELTA_WRAP32(new->energy_gfx, old->energy_gfx); - DELTA_WRAP32(new->energy_dram, old->energy_dram); - DELTA_WRAP32(new->rapl_pkg_perf_status, old->rapl_pkg_perf_status); - DELTA_WRAP32(new->rapl_dram_perf_status, old->rapl_dram_perf_status); + old->energy_pkg = new->energy_pkg - old->energy_pkg; + old->energy_cores = new->energy_cores - old->energy_cores; + old->energy_gfx = new->energy_gfx - old->energy_gfx; + old->energy_dram = new->energy_dram - old->energy_dram; + old->rapl_pkg_perf_status = new->rapl_pkg_perf_status - old->rapl_pkg_perf_status; + old->rapl_dram_perf_status = new->rapl_dram_perf_status - old->rapl_dram_perf_status; for (i = 0, mp = sys.pp; mp; i++, mp = mp->next) { if (mp->format == FORMAT_RAW) @@ -1469,6 +1577,7 @@ void clear_counters(struct thread_data *t, struct core_data *c, struct pkg_data p->gfx_rc6_ms = 0; p->gfx_mhz = 0; + p->gfx_act_mhz = 0; for (i = 0, mp = sys.tp; mp; i++, mp = mp->next) t->counter[i] = 0; @@ -1564,6 +1673,7 @@ int sum_counters(struct thread_data *t, struct core_data *c, average.packages.gfx_rc6_ms = p->gfx_rc6_ms; average.packages.gfx_mhz = p->gfx_mhz; + average.packages.gfx_act_mhz = p->gfx_act_mhz; average.packages.pkg_temp_c = MAX(average.packages.pkg_temp_c, p->pkg_temp_c); @@ -1784,7 +1894,7 @@ int get_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p) int i; if (cpu_migrate(cpu)) { - fprintf(outf, "Could not migrate to CPU %d\n", cpu); + fprintf(outf, "get_counters: Could not migrate to CPU %d\n", cpu); return -1; } @@ -1966,39 +2076,39 @@ retry: p->sys_lpi = cpuidle_cur_sys_lpi_us; if (do_rapl & RAPL_PKG) { - if (get_msr(cpu, MSR_PKG_ENERGY_STATUS, &msr)) + if (get_msr_sum(cpu, MSR_PKG_ENERGY_STATUS, &msr)) return -13; - p->energy_pkg = msr & 0xFFFFFFFF; + p->energy_pkg = msr; } if (do_rapl & RAPL_CORES_ENERGY_STATUS) { - if (get_msr(cpu, MSR_PP0_ENERGY_STATUS, &msr)) + if (get_msr_sum(cpu, MSR_PP0_ENERGY_STATUS, &msr)) return -14; - p->energy_cores = msr & 0xFFFFFFFF; + p->energy_cores = msr; } if (do_rapl & RAPL_DRAM) { - if (get_msr(cpu, MSR_DRAM_ENERGY_STATUS, &msr)) + if (get_msr_sum(cpu, MSR_DRAM_ENERGY_STATUS, &msr)) return -15; - p->energy_dram = msr & 0xFFFFFFFF; + p->energy_dram = msr; } if (do_rapl & RAPL_GFX) { - if (get_msr(cpu, MSR_PP1_ENERGY_STATUS, &msr)) + if (get_msr_sum(cpu, MSR_PP1_ENERGY_STATUS, &msr)) return -16; - p->energy_gfx = msr & 0xFFFFFFFF; + p->energy_gfx = msr; } if (do_rapl & RAPL_PKG_PERF_STATUS) { - if (get_msr(cpu, MSR_PKG_PERF_STATUS, &msr)) + if (get_msr_sum(cpu, MSR_PKG_PERF_STATUS, &msr)) return -16; - p->rapl_pkg_perf_status = msr & 0xFFFFFFFF; + p->rapl_pkg_perf_status = msr; } if (do_rapl & RAPL_DRAM_PERF_STATUS) { - if (get_msr(cpu, MSR_DRAM_PERF_STATUS, &msr)) + if (get_msr_sum(cpu, MSR_DRAM_PERF_STATUS, &msr)) return -16; - p->rapl_dram_perf_status = msr & 0xFFFFFFFF; + p->rapl_dram_perf_status = msr; } if (do_rapl & RAPL_AMD_F17H) { - if (get_msr(cpu, MSR_PKG_ENERGY_STAT, &msr)) + if (get_msr_sum(cpu, MSR_PKG_ENERGY_STAT, &msr)) return -13; - p->energy_pkg = msr & 0xFFFFFFFF; + p->energy_pkg = msr; } if (DO_BIC(BIC_PkgTmp)) { if (get_msr(cpu, MSR_IA32_PACKAGE_THERM_STATUS, &msr)) @@ -2012,6 +2122,9 @@ retry: if (DO_BIC(BIC_GFXMHz)) p->gfx_mhz = gfx_cur_mhz; + if (DO_BIC(BIC_GFXACTMHz)) + p->gfx_act_mhz = gfx_act_mhz; + for (i = 0, mp = sys.pp; mp; i++, mp = mp->next) { if (get_mp(cpu, mp, &p->counter[i])) return -10; @@ -2173,6 +2286,7 @@ int has_turbo_ratio_group_limits(int family, int model) case INTEL_FAM6_ATOM_GOLDMONT: case INTEL_FAM6_SKYLAKE_X: case INTEL_FAM6_ATOM_GOLDMONT_D: + case INTEL_FAM6_ATOM_TREMONT_D: return 1; } return 0; @@ -2650,7 +2764,12 @@ int get_thread_siblings(struct cpu_topology *thiscpu) sprintf(path, "/sys/devices/system/cpu/cpu%d/topology/thread_siblings", cpu); - filep = fopen_or_die(path, "r"); + filep = fopen(path, "r"); + + if (!filep) { + warnx("%s: open failed", path); + return -1; + } do { offset -= BITMASK_SIZE; if (fscanf(filep, "%lx%c", &map, &character) != 2) @@ -2763,18 +2882,25 @@ void re_initialize(void) { free_all_buffers(); setup_all_buffers(); - printf("turbostat: re-initialized with num_cpus %d\n", topo.num_cpus); + fprintf(outf, "turbostat: re-initialized with num_cpus %d\n", topo.num_cpus); } void set_max_cpu_num(void) { FILE *filep; + int base_cpu; unsigned long dummy; + char pathname[64]; + + base_cpu = sched_getcpu(); + if (base_cpu < 0) + err(1, "cannot find calling cpu ID"); + sprintf(pathname, + "/sys/devices/system/cpu/cpu%d/topology/thread_siblings", + base_cpu); + filep = fopen_or_die(pathname, "r"); topo.max_cpu_num = 0; - filep = fopen_or_die( - "/sys/devices/system/cpu/cpu0/topology/thread_siblings", - "r"); while (fscanf(filep, "%lx,", &dummy) == 1) topo.max_cpu_num += BITMASK_SIZE; fclose(filep); @@ -2916,6 +3042,33 @@ int snapshot_gfx_mhz(void) } /* + * snapshot_gfx_cur_mhz() + * + * record snapshot of + * /sys/class/graphics/fb0/device/drm/card0/gt_act_freq_mhz + * + * return 1 if config change requires a restart, else return 0 + */ +int snapshot_gfx_act_mhz(void) +{ + static FILE *fp; + int retval; + + if (fp == NULL) + fp = fopen_or_die("/sys/class/graphics/fb0/device/drm/card0/gt_act_freq_mhz", "r"); + else { + rewind(fp); + fflush(fp); + } + + retval = fscanf(fp, "%d", &gfx_act_mhz); + if (retval != 1) + err(1, "GFX ACT MHz"); + + return 0; +} + +/* * snapshot_cpu_lpi() * * record snapshot of @@ -2980,6 +3133,9 @@ int snapshot_proc_sysfs_files(void) if (DO_BIC(BIC_GFXMHz)) snapshot_gfx_mhz(); + if (DO_BIC(BIC_GFXACTMHz)) + snapshot_gfx_act_mhz(); + if (DO_BIC(BIC_CPU_LPI)) snapshot_cpu_lpi_us(); @@ -3057,6 +3213,111 @@ void do_sleep(void) } } +int get_msr_sum(int cpu, off_t offset, unsigned long long *msr) +{ + int ret, idx; + unsigned long long msr_cur, msr_last; + + if (!per_cpu_msr_sum) + return 1; + + idx = offset_to_idx(offset); + if (idx < 0) + return idx; + /* get_msr_sum() = sum + (get_msr() - last) */ + ret = get_msr(cpu, offset, &msr_cur); + if (ret) + return ret; + msr_last = per_cpu_msr_sum[cpu].entries[idx].last; + DELTA_WRAP32(msr_cur, msr_last); + *msr = msr_last + per_cpu_msr_sum[cpu].entries[idx].sum; + + return 0; +} + +timer_t timerid; + +/* Timer callback, update the sum of MSRs periodically. */ +static int update_msr_sum(struct thread_data *t, struct core_data *c, struct pkg_data *p) +{ + int i, ret; + int cpu = t->cpu_id; + + for (i = IDX_PKG_ENERGY; i < IDX_COUNT; i++) { + unsigned long long msr_cur, msr_last; + int offset; + + if (!idx_valid(i)) + continue; + offset = idx_to_offset(i); + if (offset < 0) + continue; + ret = get_msr(cpu, offset, &msr_cur); + if (ret) { + fprintf(outf, "Can not update msr(0x%x)\n", offset); + continue; + } + + msr_last = per_cpu_msr_sum[cpu].entries[i].last; + per_cpu_msr_sum[cpu].entries[i].last = msr_cur & 0xffffffff; + + DELTA_WRAP32(msr_cur, msr_last); + per_cpu_msr_sum[cpu].entries[i].sum += msr_last; + } + return 0; +} + +static void +msr_record_handler(union sigval v) +{ + for_all_cpus(update_msr_sum, EVEN_COUNTERS); +} + +void msr_sum_record(void) +{ + struct itimerspec its; + struct sigevent sev; + + per_cpu_msr_sum = calloc(topo.max_cpu_num + 1, sizeof(struct msr_sum_array)); + if (!per_cpu_msr_sum) { + fprintf(outf, "Can not allocate memory for long time MSR.\n"); + return; + } + /* + * Signal handler might be restricted, so use thread notifier instead. + */ + memset(&sev, 0, sizeof(struct sigevent)); + sev.sigev_notify = SIGEV_THREAD; + sev.sigev_notify_function = msr_record_handler; + + sev.sigev_value.sival_ptr = &timerid; + if (timer_create(CLOCK_REALTIME, &sev, &timerid) == -1) { + fprintf(outf, "Can not create timer.\n"); + goto release_msr; + } + + its.it_value.tv_sec = 0; + its.it_value.tv_nsec = 1; + /* + * A wraparound time has been calculated early. + * Some sources state that the peak power for a + * microprocessor is usually 1.5 times the TDP rating, + * use 2 * TDP for safety. + */ + its.it_interval.tv_sec = rapl_joule_counter_range / 2; + its.it_interval.tv_nsec = 0; + + if (timer_settime(timerid, 0, &its, NULL) == -1) { + fprintf(outf, "Can not set timer.\n"); + goto release_timer; + } + return; + + release_timer: + timer_delete(timerid); + release_msr: + free(per_cpu_msr_sum); +} void turbostat_loop() { @@ -3075,7 +3336,7 @@ restart: if (retval < -1) { exit(retval); } else if (retval == -1) { - if (restarted > 1) { + if (restarted > 10) { exit(retval); } re_initialize(); @@ -3279,6 +3540,7 @@ int probe_nhm_msrs(unsigned int family, unsigned int model) case INTEL_FAM6_ATOM_GOLDMONT_PLUS: case INTEL_FAM6_ATOM_GOLDMONT_D: /* DNV */ case INTEL_FAM6_ATOM_TREMONT: /* EHL */ + case INTEL_FAM6_ATOM_TREMONT_D: /* JVL */ pkg_cstate_limits = glm_pkg_cstate_limits; break; default: @@ -3361,6 +3623,17 @@ int is_ehl(unsigned int family, unsigned int model) } return 0; } +int is_jvl(unsigned int family, unsigned int model) +{ + if (!genuine_intel) + return 0; + + switch (model) { + case INTEL_FAM6_ATOM_TREMONT_D: + return 1; + } + return 0; +} int has_turbo_ratio_limit(unsigned int family, unsigned int model) { @@ -3475,6 +3748,20 @@ int has_config_tdp(unsigned int family, unsigned int model) } static void +remove_underbar(char *s) +{ + char *to = s; + + while (*s) { + if (*s != '_') + *to++ = *s; + s++; + } + + *to = 0; +} + +static void dump_cstate_pstate_config_info(unsigned int family, unsigned int model) { if (!do_nhm_platform_info) @@ -3530,9 +3817,6 @@ dump_sysfs_cstate_config(void) int state; char *sp; - if (!DO_BIC(BIC_sysfs)) - return; - if (access("/sys/devices/system/cpu/cpuidle", R_OK)) { fprintf(outf, "cpuidle not loaded\n"); return; @@ -3559,6 +3843,8 @@ dump_sysfs_cstate_config(void) *sp = '\0'; fclose(input); + remove_underbar(name_buf); + sprintf(path, "/sys/devices/system/cpu/cpu%d/cpuidle/state%d/desc", base_cpu, state); input = fopen(path, "r"); @@ -3645,7 +3931,7 @@ int print_epb(struct thread_data *t, struct core_data *c, struct pkg_data *p) return 0; if (cpu_migrate(cpu)) { - fprintf(outf, "Could not migrate to CPU %d\n", cpu); + fprintf(outf, "print_epb: Could not migrate to CPU %d\n", cpu); return -1; } @@ -3689,7 +3975,7 @@ int print_hwp(struct thread_data *t, struct core_data *c, struct pkg_data *p) return 0; if (cpu_migrate(cpu)) { - fprintf(outf, "Could not migrate to CPU %d\n", cpu); + fprintf(outf, "print_hwp: Could not migrate to CPU %d\n", cpu); return -1; } @@ -3777,7 +4063,7 @@ int print_perf_limit(struct thread_data *t, struct core_data *c, struct pkg_data return 0; if (cpu_migrate(cpu)) { - fprintf(outf, "Could not migrate to CPU %d\n", cpu); + fprintf(outf, "print_perf_limit: Could not migrate to CPU %d\n", cpu); return -1; } @@ -3881,13 +4167,8 @@ double get_tdp_intel(unsigned int model) double get_tdp_amd(unsigned int family) { - switch (family) { - case 0x17: - case 0x18: - default: - /* This is the max stock TDP of HEDT/Server Fam17h chips */ - return 250.0; - } + /* This is the max stock TDP of HEDT/Server Fam17h+ chips */ + return 280.0; } /* @@ -3959,6 +4240,14 @@ void rapl_probe_intel(unsigned int family, unsigned int model) BIC_PRESENT(BIC_GFXWatt); } break; + case INTEL_FAM6_ATOM_TREMONT_D: /* JVL */ + do_rapl = RAPL_PKG | RAPL_PKG_PERF_STATUS | RAPL_PKG_POWER_INFO; + BIC_PRESENT(BIC_PKG__); + if (rapl_joules) + BIC_PRESENT(BIC_Pkg_J); + else + BIC_PRESENT(BIC_PkgWatt); + break; case INTEL_FAM6_SKYLAKE_L: /* SKL */ case INTEL_FAM6_CANNONLAKE_L: /* CNL */ do_rapl = RAPL_PKG | RAPL_CORES | RAPL_CORE_POLICY | RAPL_DRAM | RAPL_DRAM_PERF_STATUS | RAPL_PKG_PERF_STATUS | RAPL_GFX | RAPL_PKG_POWER_INFO; @@ -4069,27 +4358,20 @@ void rapl_probe_amd(unsigned int family, unsigned int model) if (max_extended_level >= 0x80000007) { __cpuid(0x80000007, eax, ebx, ecx, edx); - /* RAPL (Fam 17h) */ + /* RAPL (Fam 17h+) */ has_rapl = edx & (1 << 14); } - if (!has_rapl) + if (!has_rapl || family < 0x17) return; - switch (family) { - case 0x17: /* Zen, Zen+ */ - case 0x18: /* Hygon Dhyana */ - do_rapl = RAPL_AMD_F17H | RAPL_PER_CORE_ENERGY; - if (rapl_joules) { - BIC_PRESENT(BIC_Pkg_J); - BIC_PRESENT(BIC_Cor_J); - } else { - BIC_PRESENT(BIC_PkgWatt); - BIC_PRESENT(BIC_CorWatt); - } - break; - default: - return; + do_rapl = RAPL_AMD_F17H | RAPL_PER_CORE_ENERGY; + if (rapl_joules) { + BIC_PRESENT(BIC_Pkg_J); + BIC_PRESENT(BIC_Cor_J); + } else { + BIC_PRESENT(BIC_PkgWatt); + BIC_PRESENT(BIC_CorWatt); } if (get_msr(base_cpu, MSR_RAPL_PWR_UNIT, &msr)) @@ -4162,7 +4444,7 @@ int print_thermal(struct thread_data *t, struct core_data *c, struct pkg_data *p return 0; if (cpu_migrate(cpu)) { - fprintf(outf, "Could not migrate to CPU %d\n", cpu); + fprintf(outf, "print_thermal: Could not migrate to CPU %d\n", cpu); return -1; } @@ -4234,7 +4516,7 @@ int print_rapl(struct thread_data *t, struct core_data *c, struct pkg_data *p) cpu = t->cpu_id; if (cpu_migrate(cpu)) { - fprintf(outf, "Could not migrate to CPU %d\n", cpu); + fprintf(outf, "print_rapl: Could not migrate to CPU %d\n", cpu); return -1; } @@ -4361,6 +4643,7 @@ int has_snb_msrs(unsigned int family, unsigned int model) case INTEL_FAM6_ATOM_GOLDMONT_PLUS: case INTEL_FAM6_ATOM_GOLDMONT_D: /* DNV */ case INTEL_FAM6_ATOM_TREMONT: /* EHL */ + case INTEL_FAM6_ATOM_TREMONT_D: /* JVL */ return 1; } return 0; @@ -4507,12 +4790,33 @@ double discover_bclk(unsigned int family, unsigned int model) * below this value, including the Digital Thermal Sensor (DTS), * Package Thermal Management Sensor (PTM), and thermal event thresholds. */ -int set_temperature_target(struct thread_data *t, struct core_data *c, struct pkg_data *p) +int read_tcc_activation_temp() { unsigned long long msr; - unsigned int target_c_local; - int cpu; + unsigned int tcc, target_c, offset_c; + /* Temperature Target MSR is Nehalem and newer only */ + if (!do_nhm_platform_info) + return 0; + + if (get_msr(base_cpu, MSR_IA32_TEMPERATURE_TARGET, &msr)) + return 0; + + target_c = (msr >> 16) & 0xFF; + + offset_c = (msr >> 24) & 0xF; + + tcc = target_c - offset_c; + + if (!quiet) + fprintf(outf, "cpu%d: MSR_IA32_TEMPERATURE_TARGET: 0x%08llx (%d C) (%d default - %d offset)\n", + base_cpu, msr, tcc, target_c, offset_c); + + return tcc; +} + +int set_temperature_target(struct thread_data *t, struct core_data *c, struct pkg_data *p) +{ /* tcc_activation_temp is used only for dts or ptm */ if (!(do_dts || do_ptm)) return 0; @@ -4521,43 +4825,18 @@ int set_temperature_target(struct thread_data *t, struct core_data *c, struct pk if (!(t->flags & CPU_IS_FIRST_THREAD_IN_CORE) || !(t->flags & CPU_IS_FIRST_CORE_IN_PACKAGE)) return 0; - cpu = t->cpu_id; - if (cpu_migrate(cpu)) { - fprintf(outf, "Could not migrate to CPU %d\n", cpu); - return -1; - } - if (tcc_activation_temp_override != 0) { tcc_activation_temp = tcc_activation_temp_override; - fprintf(outf, "cpu%d: Using cmdline TCC Target (%d C)\n", - cpu, tcc_activation_temp); + fprintf(outf, "Using cmdline TCC Target (%d C)\n", tcc_activation_temp); return 0; } - /* Temperature Target MSR is Nehalem and newer only */ - if (!do_nhm_platform_info) - goto guess; - - if (get_msr(base_cpu, MSR_IA32_TEMPERATURE_TARGET, &msr)) - goto guess; - - target_c_local = (msr >> 16) & 0xFF; - - if (!quiet) - fprintf(outf, "cpu%d: MSR_IA32_TEMPERATURE_TARGET: 0x%08llx (%d C)\n", - cpu, msr, target_c_local); - - if (!target_c_local) - goto guess; - - tcc_activation_temp = target_c_local; - - return 0; + tcc_activation_temp = read_tcc_activation_temp(); + if (tcc_activation_temp) + return 0; -guess: tcc_activation_temp = TJMAX_DEFAULT; - fprintf(outf, "cpu%d: Guessing tjMax %d C, Please use -T to specify\n", - cpu, tcc_activation_temp); + fprintf(outf, "Guessing tjMax %d C, Please use -T to specify\n", tcc_activation_temp); return 0; } @@ -4685,19 +4964,46 @@ unsigned int intel_model_duplicates(unsigned int model) case INTEL_FAM6_ICELAKE_NNPI: case INTEL_FAM6_TIGERLAKE_L: case INTEL_FAM6_TIGERLAKE: + case INTEL_FAM6_ROCKETLAKE: + case INTEL_FAM6_LAKEFIELD: + case INTEL_FAM6_ALDERLAKE: return INTEL_FAM6_CANNONLAKE_L; - case INTEL_FAM6_ATOM_TREMONT_D: - return INTEL_FAM6_ATOM_GOLDMONT_D; - case INTEL_FAM6_ATOM_TREMONT_L: return INTEL_FAM6_ATOM_TREMONT; case INTEL_FAM6_ICELAKE_X: + case INTEL_FAM6_SAPPHIRERAPIDS_X: return INTEL_FAM6_SKYLAKE_X; } return model; } + +void print_dev_latency(void) +{ + char *path = "/dev/cpu_dma_latency"; + int fd; + int value; + int retval; + + fd = open(path, O_RDONLY); + if (fd < 0) { + warn("fopen %s\n", path); + return; + } + + retval = read(fd, (void *)&value, sizeof(int)); + if (retval != sizeof(int)) { + warn("read %s\n", path); + close(fd); + return; + } + fprintf(outf, "/dev/cpu_dma_latency: %d usec (%s)\n", + value, value == 2000000000 ? "default" : "constrained"); + + close(fd); +} + void process_cpuid() { unsigned int eax, ebx, ecx, edx; @@ -4916,6 +5222,14 @@ void process_cpuid() BIC_PRESENT(BIC_Mod_c6); use_c1_residency_msr = 1; } + if (is_jvl(family, model)) { + BIC_NOT_PRESENT(BIC_CPU_c3); + BIC_NOT_PRESENT(BIC_CPU_c7); + BIC_NOT_PRESENT(BIC_Pkgpc2); + BIC_NOT_PRESENT(BIC_Pkgpc3); + BIC_NOT_PRESENT(BIC_Pkgpc6); + BIC_NOT_PRESENT(BIC_Pkgpc7); + } if (is_dnv(family, model)) { BIC_PRESENT(BIC_CPU_c1); BIC_NOT_PRESENT(BIC_CPU_c3); @@ -4935,9 +5249,12 @@ void process_cpuid() BIC_NOT_PRESENT(BIC_Pkgpc7); } if (has_c8910_msrs(family, model)) { - BIC_PRESENT(BIC_Pkgpc8); - BIC_PRESENT(BIC_Pkgpc9); - BIC_PRESENT(BIC_Pkgpc10); + if (pkg_cstate_limit >= PCL__8) + BIC_PRESENT(BIC_Pkgpc8); + if (pkg_cstate_limit >= PCL__9) + BIC_PRESENT(BIC_Pkgpc9); + if (pkg_cstate_limit >= PCL_10) + BIC_PRESENT(BIC_Pkgpc10); } do_irtl_hsw = has_c8910_msrs(family, model); if (has_skl_msrs(family, model)) { @@ -4967,6 +5284,8 @@ void process_cpuid() dump_cstate_pstate_config_info(family, model); if (!quiet) + print_dev_latency(); + if (!quiet) dump_sysfs_cstate_config(); if (!quiet) dump_sysfs_pstate_config(); @@ -4980,6 +5299,9 @@ void process_cpuid() if (!access("/sys/class/graphics/fb0/device/drm/card0/gt_cur_freq_mhz", R_OK)) BIC_PRESENT(BIC_GFXMHz); + if (!access("/sys/class/graphics/fb0/device/drm/card0/gt_act_freq_mhz", R_OK)) + BIC_PRESENT(BIC_GFXACTMHz); + if (!access("/sys/devices/system/cpu/cpuidle/low_power_idle_cpu_residency_us", R_OK)) BIC_PRESENT(BIC_CPU_LPI); else @@ -5390,7 +5712,7 @@ int get_and_dump_counters(void) } void print_version() { - fprintf(outf, "turbostat version 20.03.20" + fprintf(outf, "turbostat version 20.09.30" " - Len Brown <lenb@kernel.org>\n"); } @@ -5597,6 +5919,8 @@ void probe_sysfs(void) *sp = '%'; *(sp + 1) = '\0'; + remove_underbar(name_buf); + fclose(input); sprintf(path, "cpuidle/state%d/time", state); @@ -5624,6 +5948,8 @@ void probe_sysfs(void) *sp = '\0'; fclose(input); + remove_underbar(name_buf); + sprintf(path, "cpuidle/state%d/usage", state); if (is_deferred_skip(name_buf)) @@ -5868,6 +6194,7 @@ int main(int argc, char **argv) return 0; } + msr_sum_record(); /* * if any params left, it must be a command to fork */ diff --git a/tools/power/x86/x86_energy_perf_policy/x86_energy_perf_policy.c b/tools/power/x86/x86_energy_perf_policy/x86_energy_perf_policy.c index 3fe1eed900d4..ff6c6661f075 100644 --- a/tools/power/x86/x86_energy_perf_policy/x86_energy_perf_policy.c +++ b/tools/power/x86/x86_energy_perf_policy/x86_energy_perf_policy.c @@ -622,6 +622,57 @@ void cmdline(int argc, char **argv) } } +/* + * Open a file, and exit on failure + */ +FILE *fopen_or_die(const char *path, const char *mode) +{ + FILE *filep = fopen(path, "r"); + + if (!filep) + err(1, "%s: open failed", path); + return filep; +} + +void err_on_hypervisor(void) +{ + FILE *cpuinfo; + char *flags, *hypervisor; + char *buffer; + + /* On VMs /proc/cpuinfo contains a "flags" entry for hypervisor */ + cpuinfo = fopen_or_die("/proc/cpuinfo", "ro"); + + buffer = malloc(4096); + if (!buffer) { + fclose(cpuinfo); + err(-ENOMEM, "buffer malloc fail"); + } + + if (!fread(buffer, 1024, 1, cpuinfo)) { + fclose(cpuinfo); + free(buffer); + err(1, "Reading /proc/cpuinfo failed"); + } + + flags = strstr(buffer, "flags"); + rewind(cpuinfo); + fseek(cpuinfo, flags - buffer, SEEK_SET); + if (!fgets(buffer, 4096, cpuinfo)) { + fclose(cpuinfo); + free(buffer); + err(1, "Reading /proc/cpuinfo failed"); + } + fclose(cpuinfo); + + hypervisor = strstr(buffer, "hypervisor"); + + free(buffer); + + if (hypervisor) + err(-1, + "not supported on this virtual machine"); +} int get_msr(int cpu, int offset, unsigned long long *msr) { @@ -635,8 +686,10 @@ int get_msr(int cpu, int offset, unsigned long long *msr) err(-1, "%s open failed, try chown or chmod +r /dev/cpu/*/msr, or run as root", pathname); retval = pread(fd, msr, sizeof(*msr), offset); - if (retval != sizeof(*msr)) + if (retval != sizeof(*msr)) { + err_on_hypervisor(); err(-1, "%s offset 0x%llx read failed", pathname, (unsigned long long)offset); + } if (debug > 1) fprintf(stderr, "get_msr(cpu%d, 0x%X, 0x%llX)\n", cpu, offset, *msr); @@ -1086,18 +1139,6 @@ int update_cpu_msrs(int cpu) return 0; } -/* - * Open a file, and exit on failure - */ -FILE *fopen_or_die(const char *path, const char *mode) -{ - FILE *filep = fopen(path, "r"); - - if (!filep) - err(1, "%s: open failed", path); - return filep; -} - unsigned int get_pkg_num(int cpu) { FILE *fp; diff --git a/tools/testing/selftests/kvm/.gitignore b/tools/testing/selftests/kvm/.gitignore index d2c2d6205008..7a2c242b7152 100644 --- a/tools/testing/selftests/kvm/.gitignore +++ b/tools/testing/selftests/kvm/.gitignore @@ -1,10 +1,13 @@ # SPDX-License-Identifier: GPL-2.0-only +/aarch64/get-reg-list +/aarch64/get-reg-list-sve /s390x/memop /s390x/resets /s390x/sync_regs_test /x86_64/cr4_cpuid_sync_test /x86_64/debug_regs /x86_64/evmcs_test +/x86_64/kvm_pv_test /x86_64/hyperv_cpuid /x86_64/mmio_warning_test /x86_64/platform_info_test @@ -24,6 +27,7 @@ /clear_dirty_log_test /demand_paging_test /dirty_log_test +/dirty_log_perf_test /kvm_create_max_vcpus /set_memory_region_test /steal_time diff --git a/tools/testing/selftests/kvm/Makefile b/tools/testing/selftests/kvm/Makefile index 30afbad36cd5..3d14ef77755e 100644 --- a/tools/testing/selftests/kvm/Makefile +++ b/tools/testing/selftests/kvm/Makefile @@ -34,13 +34,14 @@ ifeq ($(ARCH),s390) endif LIBKVM = lib/assert.c lib/elf.c lib/io.c lib/kvm_util.c lib/sparsebit.c lib/test_util.c -LIBKVM_x86_64 = lib/x86_64/processor.c lib/x86_64/vmx.c lib/x86_64/svm.c lib/x86_64/ucall.c +LIBKVM_x86_64 = lib/x86_64/processor.c lib/x86_64/vmx.c lib/x86_64/svm.c lib/x86_64/ucall.c lib/x86_64/handlers.S LIBKVM_aarch64 = lib/aarch64/processor.c lib/aarch64/ucall.c LIBKVM_s390x = lib/s390x/processor.c lib/s390x/ucall.c TEST_GEN_PROGS_x86_64 = x86_64/cr4_cpuid_sync_test TEST_GEN_PROGS_x86_64 += x86_64/evmcs_test TEST_GEN_PROGS_x86_64 += x86_64/hyperv_cpuid +TEST_GEN_PROGS_x86_64 += x86_64/kvm_pv_test TEST_GEN_PROGS_x86_64 += x86_64/mmio_warning_test TEST_GEN_PROGS_x86_64 += x86_64/platform_info_test TEST_GEN_PROGS_x86_64 += x86_64/set_sregs_test @@ -58,14 +59,15 @@ TEST_GEN_PROGS_x86_64 += x86_64/xss_msr_test TEST_GEN_PROGS_x86_64 += x86_64/debug_regs TEST_GEN_PROGS_x86_64 += x86_64/tsc_msrs_test TEST_GEN_PROGS_x86_64 += x86_64/user_msr_test -TEST_GEN_PROGS_x86_64 += clear_dirty_log_test TEST_GEN_PROGS_x86_64 += demand_paging_test TEST_GEN_PROGS_x86_64 += dirty_log_test +TEST_GEN_PROGS_x86_64 += dirty_log_perf_test TEST_GEN_PROGS_x86_64 += kvm_create_max_vcpus TEST_GEN_PROGS_x86_64 += set_memory_region_test TEST_GEN_PROGS_x86_64 += steal_time -TEST_GEN_PROGS_aarch64 += clear_dirty_log_test +TEST_GEN_PROGS_aarch64 += aarch64/get-reg-list +TEST_GEN_PROGS_aarch64 += aarch64/get-reg-list-sve TEST_GEN_PROGS_aarch64 += demand_paging_test TEST_GEN_PROGS_aarch64 += dirty_log_test TEST_GEN_PROGS_aarch64 += kvm_create_max_vcpus @@ -111,14 +113,21 @@ LDFLAGS += -pthread $(no-pie-option) $(pgste-option) include ../lib.mk STATIC_LIBS := $(OUTPUT)/libkvm.a -LIBKVM_OBJ := $(patsubst %.c, $(OUTPUT)/%.o, $(LIBKVM)) -EXTRA_CLEAN += $(LIBKVM_OBJ) $(STATIC_LIBS) cscope.* +LIBKVM_C := $(filter %.c,$(LIBKVM)) +LIBKVM_S := $(filter %.S,$(LIBKVM)) +LIBKVM_C_OBJ := $(patsubst %.c, $(OUTPUT)/%.o, $(LIBKVM_C)) +LIBKVM_S_OBJ := $(patsubst %.S, $(OUTPUT)/%.o, $(LIBKVM_S)) +EXTRA_CLEAN += $(LIBKVM_C_OBJ) $(LIBKVM_S_OBJ) $(STATIC_LIBS) cscope.* + +x := $(shell mkdir -p $(sort $(dir $(LIBKVM_C_OBJ) $(LIBKVM_S_OBJ)))) +$(LIBKVM_C_OBJ): $(OUTPUT)/%.o: %.c + $(CC) $(CFLAGS) $(CPPFLAGS) $(TARGET_ARCH) -c $< -o $@ -x := $(shell mkdir -p $(sort $(dir $(LIBKVM_OBJ)))) -$(LIBKVM_OBJ): $(OUTPUT)/%.o: %.c +$(LIBKVM_S_OBJ): $(OUTPUT)/%.o: %.S $(CC) $(CFLAGS) $(CPPFLAGS) $(TARGET_ARCH) -c $< -o $@ -$(OUTPUT)/libkvm.a: $(LIBKVM_OBJ) +LIBKVM_OBJS = $(LIBKVM_C_OBJ) $(LIBKVM_S_OBJ) +$(OUTPUT)/libkvm.a: $(LIBKVM_OBJS) $(AR) crs $@ $^ x := $(shell mkdir -p $(sort $(dir $(TEST_GEN_PROGS)))) diff --git a/tools/testing/selftests/kvm/aarch64/get-reg-list-sve.c b/tools/testing/selftests/kvm/aarch64/get-reg-list-sve.c new file mode 100644 index 000000000000..efba76682b4b --- /dev/null +++ b/tools/testing/selftests/kvm/aarch64/get-reg-list-sve.c @@ -0,0 +1,3 @@ +// SPDX-License-Identifier: GPL-2.0 +#define REG_LIST_SVE +#include "get-reg-list.c" diff --git a/tools/testing/selftests/kvm/aarch64/get-reg-list.c b/tools/testing/selftests/kvm/aarch64/get-reg-list.c new file mode 100644 index 000000000000..33218a395d9f --- /dev/null +++ b/tools/testing/selftests/kvm/aarch64/get-reg-list.c @@ -0,0 +1,841 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Check for KVM_GET_REG_LIST regressions. + * + * Copyright (C) 2020, Red Hat, Inc. + * + * When attempting to migrate from a host with an older kernel to a host + * with a newer kernel we allow the newer kernel on the destination to + * list new registers with get-reg-list. We assume they'll be unused, at + * least until the guest reboots, and so they're relatively harmless. + * However, if the destination host with the newer kernel is missing + * registers which the source host with the older kernel has, then that's + * a regression in get-reg-list. This test checks for that regression by + * checking the current list against a blessed list. We should never have + * missing registers, but if new ones appear then they can probably be + * added to the blessed list. A completely new blessed list can be created + * by running the test with the --list command line argument. + * + * Note, the blessed list should be created from the oldest possible + * kernel. We can't go older than v4.15, though, because that's the first + * release to expose the ID system registers in KVM_GET_REG_LIST, see + * commit 93390c0a1b20 ("arm64: KVM: Hide unsupported AArch64 CPU features + * from guests"). Also, one must use the --core-reg-fixup command line + * option when running on an older kernel that doesn't include df205b5c6328 + * ("KVM: arm64: Filter out invalid core register IDs in KVM_GET_REG_LIST") + */ +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include "kvm_util.h" +#include "test_util.h" +#include "processor.h" + +#ifdef REG_LIST_SVE +#define reg_list_sve() (true) +#else +#define reg_list_sve() (false) +#endif + +#define REG_MASK (KVM_REG_ARCH_MASK | KVM_REG_SIZE_MASK | KVM_REG_ARM_COPROC_MASK) + +#define for_each_reg(i) \ + for ((i) = 0; (i) < reg_list->n; ++(i)) + +#define for_each_missing_reg(i) \ + for ((i) = 0; (i) < blessed_n; ++(i)) \ + if (!find_reg(reg_list->reg, reg_list->n, blessed_reg[i])) + +#define for_each_new_reg(i) \ + for ((i) = 0; (i) < reg_list->n; ++(i)) \ + if (!find_reg(blessed_reg, blessed_n, reg_list->reg[i])) + + +static struct kvm_reg_list *reg_list; + +static __u64 base_regs[], vregs[], sve_regs[], rejects_set[]; +static __u64 base_regs_n, vregs_n, sve_regs_n, rejects_set_n; +static __u64 *blessed_reg, blessed_n; + +static bool find_reg(__u64 regs[], __u64 nr_regs, __u64 reg) +{ + int i; + + for (i = 0; i < nr_regs; ++i) + if (reg == regs[i]) + return true; + return false; +} + +static const char *str_with_index(const char *template, __u64 index) +{ + char *str, *p; + int n; + + str = strdup(template); + p = strstr(str, "##"); + n = sprintf(p, "%lld", index); + strcat(p + n, strstr(template, "##") + 2); + + return (const char *)str; +} + +#define CORE_REGS_XX_NR_WORDS 2 +#define CORE_SPSR_XX_NR_WORDS 2 +#define CORE_FPREGS_XX_NR_WORDS 4 + +static const char *core_id_to_str(__u64 id) +{ + __u64 core_off = id & ~REG_MASK, idx; + + /* + * core_off is the offset into struct kvm_regs + */ + switch (core_off) { + case KVM_REG_ARM_CORE_REG(regs.regs[0]) ... + KVM_REG_ARM_CORE_REG(regs.regs[30]): + idx = (core_off - KVM_REG_ARM_CORE_REG(regs.regs[0])) / CORE_REGS_XX_NR_WORDS; + TEST_ASSERT(idx < 31, "Unexpected regs.regs index: %lld", idx); + return str_with_index("KVM_REG_ARM_CORE_REG(regs.regs[##])", idx); + case KVM_REG_ARM_CORE_REG(regs.sp): + return "KVM_REG_ARM_CORE_REG(regs.sp)"; + case KVM_REG_ARM_CORE_REG(regs.pc): + return "KVM_REG_ARM_CORE_REG(regs.pc)"; + case KVM_REG_ARM_CORE_REG(regs.pstate): + return "KVM_REG_ARM_CORE_REG(regs.pstate)"; + case KVM_REG_ARM_CORE_REG(sp_el1): + return "KVM_REG_ARM_CORE_REG(sp_el1)"; + case KVM_REG_ARM_CORE_REG(elr_el1): + return "KVM_REG_ARM_CORE_REG(elr_el1)"; + case KVM_REG_ARM_CORE_REG(spsr[0]) ... + KVM_REG_ARM_CORE_REG(spsr[KVM_NR_SPSR - 1]): + idx = (core_off - KVM_REG_ARM_CORE_REG(spsr[0])) / CORE_SPSR_XX_NR_WORDS; + TEST_ASSERT(idx < KVM_NR_SPSR, "Unexpected spsr index: %lld", idx); + return str_with_index("KVM_REG_ARM_CORE_REG(spsr[##])", idx); + case KVM_REG_ARM_CORE_REG(fp_regs.vregs[0]) ... + KVM_REG_ARM_CORE_REG(fp_regs.vregs[31]): + idx = (core_off - KVM_REG_ARM_CORE_REG(fp_regs.vregs[0])) / CORE_FPREGS_XX_NR_WORDS; + TEST_ASSERT(idx < 32, "Unexpected fp_regs.vregs index: %lld", idx); + return str_with_index("KVM_REG_ARM_CORE_REG(fp_regs.vregs[##])", idx); + case KVM_REG_ARM_CORE_REG(fp_regs.fpsr): + return "KVM_REG_ARM_CORE_REG(fp_regs.fpsr)"; + case KVM_REG_ARM_CORE_REG(fp_regs.fpcr): + return "KVM_REG_ARM_CORE_REG(fp_regs.fpcr)"; + } + + TEST_FAIL("Unknown core reg id: 0x%llx", id); + return NULL; +} + +static const char *sve_id_to_str(__u64 id) +{ + __u64 sve_off, n, i; + + if (id == KVM_REG_ARM64_SVE_VLS) + return "KVM_REG_ARM64_SVE_VLS"; + + sve_off = id & ~(REG_MASK | ((1ULL << 5) - 1)); + i = id & (KVM_ARM64_SVE_MAX_SLICES - 1); + + TEST_ASSERT(i == 0, "Currently we don't expect slice > 0, reg id 0x%llx", id); + + switch (sve_off) { + case KVM_REG_ARM64_SVE_ZREG_BASE ... + KVM_REG_ARM64_SVE_ZREG_BASE + (1ULL << 5) * KVM_ARM64_SVE_NUM_ZREGS - 1: + n = (id >> 5) & (KVM_ARM64_SVE_NUM_ZREGS - 1); + TEST_ASSERT(id == KVM_REG_ARM64_SVE_ZREG(n, 0), + "Unexpected bits set in SVE ZREG id: 0x%llx", id); + return str_with_index("KVM_REG_ARM64_SVE_ZREG(##, 0)", n); + case KVM_REG_ARM64_SVE_PREG_BASE ... + KVM_REG_ARM64_SVE_PREG_BASE + (1ULL << 5) * KVM_ARM64_SVE_NUM_PREGS - 1: + n = (id >> 5) & (KVM_ARM64_SVE_NUM_PREGS - 1); + TEST_ASSERT(id == KVM_REG_ARM64_SVE_PREG(n, 0), + "Unexpected bits set in SVE PREG id: 0x%llx", id); + return str_with_index("KVM_REG_ARM64_SVE_PREG(##, 0)", n); + case KVM_REG_ARM64_SVE_FFR_BASE: + TEST_ASSERT(id == KVM_REG_ARM64_SVE_FFR(0), + "Unexpected bits set in SVE FFR id: 0x%llx", id); + return "KVM_REG_ARM64_SVE_FFR(0)"; + } + + return NULL; +} + +static void print_reg(__u64 id) +{ + unsigned op0, op1, crn, crm, op2; + const char *reg_size = NULL; + + TEST_ASSERT((id & KVM_REG_ARCH_MASK) == KVM_REG_ARM64, + "KVM_REG_ARM64 missing in reg id: 0x%llx", id); + + switch (id & KVM_REG_SIZE_MASK) { + case KVM_REG_SIZE_U8: + reg_size = "KVM_REG_SIZE_U8"; + break; + case KVM_REG_SIZE_U16: + reg_size = "KVM_REG_SIZE_U16"; + break; + case KVM_REG_SIZE_U32: + reg_size = "KVM_REG_SIZE_U32"; + break; + case KVM_REG_SIZE_U64: + reg_size = "KVM_REG_SIZE_U64"; + break; + case KVM_REG_SIZE_U128: + reg_size = "KVM_REG_SIZE_U128"; + break; + case KVM_REG_SIZE_U256: + reg_size = "KVM_REG_SIZE_U256"; + break; + case KVM_REG_SIZE_U512: + reg_size = "KVM_REG_SIZE_U512"; + break; + case KVM_REG_SIZE_U1024: + reg_size = "KVM_REG_SIZE_U1024"; + break; + case KVM_REG_SIZE_U2048: + reg_size = "KVM_REG_SIZE_U2048"; + break; + default: + TEST_FAIL("Unexpected reg size: 0x%llx in reg id: 0x%llx", + (id & KVM_REG_SIZE_MASK) >> KVM_REG_SIZE_SHIFT, id); + } + + switch (id & KVM_REG_ARM_COPROC_MASK) { + case KVM_REG_ARM_CORE: + printf("\tKVM_REG_ARM64 | %s | KVM_REG_ARM_CORE | %s,\n", reg_size, core_id_to_str(id)); + break; + case KVM_REG_ARM_DEMUX: + TEST_ASSERT(!(id & ~(REG_MASK | KVM_REG_ARM_DEMUX_ID_MASK | KVM_REG_ARM_DEMUX_VAL_MASK)), + "Unexpected bits set in DEMUX reg id: 0x%llx", id); + printf("\tKVM_REG_ARM64 | %s | KVM_REG_ARM_DEMUX | KVM_REG_ARM_DEMUX_ID_CCSIDR | %lld,\n", + reg_size, id & KVM_REG_ARM_DEMUX_VAL_MASK); + break; + case KVM_REG_ARM64_SYSREG: + op0 = (id & KVM_REG_ARM64_SYSREG_OP0_MASK) >> KVM_REG_ARM64_SYSREG_OP0_SHIFT; + op1 = (id & KVM_REG_ARM64_SYSREG_OP1_MASK) >> KVM_REG_ARM64_SYSREG_OP1_SHIFT; + crn = (id & KVM_REG_ARM64_SYSREG_CRN_MASK) >> KVM_REG_ARM64_SYSREG_CRN_SHIFT; + crm = (id & KVM_REG_ARM64_SYSREG_CRM_MASK) >> KVM_REG_ARM64_SYSREG_CRM_SHIFT; + op2 = (id & KVM_REG_ARM64_SYSREG_OP2_MASK) >> KVM_REG_ARM64_SYSREG_OP2_SHIFT; + TEST_ASSERT(id == ARM64_SYS_REG(op0, op1, crn, crm, op2), + "Unexpected bits set in SYSREG reg id: 0x%llx", id); + printf("\tARM64_SYS_REG(%d, %d, %d, %d, %d),\n", op0, op1, crn, crm, op2); + break; + case KVM_REG_ARM_FW: + TEST_ASSERT(id == KVM_REG_ARM_FW_REG(id & 0xffff), + "Unexpected bits set in FW reg id: 0x%llx", id); + printf("\tKVM_REG_ARM_FW_REG(%lld),\n", id & 0xffff); + break; + case KVM_REG_ARM64_SVE: + if (reg_list_sve()) + printf("\t%s,\n", sve_id_to_str(id)); + else + TEST_FAIL("KVM_REG_ARM64_SVE is an unexpected coproc type in reg id: 0x%llx", id); + break; + default: + TEST_FAIL("Unexpected coproc type: 0x%llx in reg id: 0x%llx", + (id & KVM_REG_ARM_COPROC_MASK) >> KVM_REG_ARM_COPROC_SHIFT, id); + } +} + +/* + * Older kernels listed each 32-bit word of CORE registers separately. + * For 64 and 128-bit registers we need to ignore the extra words. We + * also need to fixup the sizes, because the older kernels stated all + * registers were 64-bit, even when they weren't. + */ +static void core_reg_fixup(void) +{ + struct kvm_reg_list *tmp; + __u64 id, core_off; + int i; + + tmp = calloc(1, sizeof(*tmp) + reg_list->n * sizeof(__u64)); + + for (i = 0; i < reg_list->n; ++i) { + id = reg_list->reg[i]; + + if ((id & KVM_REG_ARM_COPROC_MASK) != KVM_REG_ARM_CORE) { + tmp->reg[tmp->n++] = id; + continue; + } + + core_off = id & ~REG_MASK; + + switch (core_off) { + case 0x52: case 0xd2: case 0xd6: + /* + * These offsets are pointing at padding. + * We need to ignore them too. + */ + continue; + case KVM_REG_ARM_CORE_REG(fp_regs.vregs[0]) ... + KVM_REG_ARM_CORE_REG(fp_regs.vregs[31]): + if (core_off & 3) + continue; + id &= ~KVM_REG_SIZE_MASK; + id |= KVM_REG_SIZE_U128; + tmp->reg[tmp->n++] = id; + continue; + case KVM_REG_ARM_CORE_REG(fp_regs.fpsr): + case KVM_REG_ARM_CORE_REG(fp_regs.fpcr): + id &= ~KVM_REG_SIZE_MASK; + id |= KVM_REG_SIZE_U32; + tmp->reg[tmp->n++] = id; + continue; + default: + if (core_off & 1) + continue; + tmp->reg[tmp->n++] = id; + break; + } + } + + free(reg_list); + reg_list = tmp; +} + +static void prepare_vcpu_init(struct kvm_vcpu_init *init) +{ + if (reg_list_sve()) + init->features[0] |= 1 << KVM_ARM_VCPU_SVE; +} + +static void finalize_vcpu(struct kvm_vm *vm, uint32_t vcpuid) +{ + int feature; + + if (reg_list_sve()) { + feature = KVM_ARM_VCPU_SVE; + vcpu_ioctl(vm, vcpuid, KVM_ARM_VCPU_FINALIZE, &feature); + } +} + +static void check_supported(void) +{ + if (reg_list_sve() && !kvm_check_cap(KVM_CAP_ARM_SVE)) { + fprintf(stderr, "SVE not available, skipping tests\n"); + exit(KSFT_SKIP); + } +} + +int main(int ac, char **av) +{ + struct kvm_vcpu_init init = { .target = -1, }; + int new_regs = 0, missing_regs = 0, i; + int failed_get = 0, failed_set = 0, failed_reject = 0; + bool print_list = false, fixup_core_regs = false; + struct kvm_vm *vm; + __u64 *vec_regs; + + check_supported(); + + for (i = 1; i < ac; ++i) { + if (strcmp(av[i], "--core-reg-fixup") == 0) + fixup_core_regs = true; + else if (strcmp(av[i], "--list") == 0) + print_list = true; + else + fprintf(stderr, "Ignoring unknown option: %s\n", av[i]); + } + + vm = vm_create(VM_MODE_DEFAULT, DEFAULT_GUEST_PHY_PAGES, O_RDWR); + prepare_vcpu_init(&init); + aarch64_vcpu_add_default(vm, 0, &init, NULL); + finalize_vcpu(vm, 0); + + reg_list = vcpu_get_reg_list(vm, 0); + + if (fixup_core_regs) + core_reg_fixup(); + + if (print_list) { + putchar('\n'); + for_each_reg(i) + print_reg(reg_list->reg[i]); + putchar('\n'); + return 0; + } + + /* + * We only test that we can get the register and then write back the + * same value. Some registers may allow other values to be written + * back, but others only allow some bits to be changed, and at least + * for ID registers set will fail if the value does not exactly match + * what was returned by get. If registers that allow other values to + * be written need to have the other values tested, then we should + * create a new set of tests for those in a new independent test + * executable. + */ + for_each_reg(i) { + uint8_t addr[2048 / 8]; + struct kvm_one_reg reg = { + .id = reg_list->reg[i], + .addr = (__u64)&addr, + }; + int ret; + + ret = _vcpu_ioctl(vm, 0, KVM_GET_ONE_REG, ®); + if (ret) { + puts("Failed to get "); + print_reg(reg.id); + putchar('\n'); + ++failed_get; + } + + /* rejects_set registers are rejected after KVM_ARM_VCPU_FINALIZE */ + if (find_reg(rejects_set, rejects_set_n, reg.id)) { + ret = _vcpu_ioctl(vm, 0, KVM_SET_ONE_REG, ®); + if (ret != -1 || errno != EPERM) { + printf("Failed to reject (ret=%d, errno=%d) ", ret, errno); + print_reg(reg.id); + putchar('\n'); + ++failed_reject; + } + continue; + } + + ret = _vcpu_ioctl(vm, 0, KVM_SET_ONE_REG, ®); + if (ret) { + puts("Failed to set "); + print_reg(reg.id); + putchar('\n'); + ++failed_set; + } + } + + if (reg_list_sve()) { + blessed_n = base_regs_n + sve_regs_n; + vec_regs = sve_regs; + } else { + blessed_n = base_regs_n + vregs_n; + vec_regs = vregs; + } + + blessed_reg = calloc(blessed_n, sizeof(__u64)); + for (i = 0; i < base_regs_n; ++i) + blessed_reg[i] = base_regs[i]; + for (i = 0; i < blessed_n - base_regs_n; ++i) + blessed_reg[base_regs_n + i] = vec_regs[i]; + + for_each_new_reg(i) + ++new_regs; + + for_each_missing_reg(i) + ++missing_regs; + + if (new_regs || missing_regs) { + printf("Number blessed registers: %5lld\n", blessed_n); + printf("Number registers: %5lld\n", reg_list->n); + } + + if (new_regs) { + printf("\nThere are %d new registers.\n" + "Consider adding them to the blessed reg " + "list with the following lines:\n\n", new_regs); + for_each_new_reg(i) + print_reg(reg_list->reg[i]); + putchar('\n'); + } + + if (missing_regs) { + printf("\nThere are %d missing registers.\n" + "The following lines are missing registers:\n\n", missing_regs); + for_each_missing_reg(i) + print_reg(blessed_reg[i]); + putchar('\n'); + } + + TEST_ASSERT(!missing_regs && !failed_get && !failed_set && !failed_reject, + "There are %d missing registers; " + "%d registers failed get; %d registers failed set; %d registers failed reject", + missing_regs, failed_get, failed_set, failed_reject); + + return 0; +} + +/* + * The current blessed list was primed with the output of kernel version + * v4.15 with --core-reg-fixup and then later updated with new registers. + */ +static __u64 base_regs[] = { + KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[0]), + KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[1]), + KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[2]), + KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[3]), + KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[4]), + KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[5]), + KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[6]), + KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[7]), + KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[8]), + KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[9]), + KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[10]), + KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[11]), + KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[12]), + KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[13]), + KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[14]), + KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[15]), + KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[16]), + KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[17]), + KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[18]), + KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[19]), + KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[20]), + KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[21]), + KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[22]), + KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[23]), + KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[24]), + KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[25]), + KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[26]), + KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[27]), + KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[28]), + KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[29]), + KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[30]), + KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.sp), + KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.pc), + KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.pstate), + KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(sp_el1), + KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(elr_el1), + KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(spsr[0]), + KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(spsr[1]), + KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(spsr[2]), + KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(spsr[3]), + KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(spsr[4]), + KVM_REG_ARM64 | KVM_REG_SIZE_U32 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.fpsr), + KVM_REG_ARM64 | KVM_REG_SIZE_U32 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.fpcr), + KVM_REG_ARM_FW_REG(0), + KVM_REG_ARM_FW_REG(1), + KVM_REG_ARM_FW_REG(2), + ARM64_SYS_REG(3, 3, 14, 3, 1), /* CNTV_CTL_EL0 */ + ARM64_SYS_REG(3, 3, 14, 3, 2), /* CNTV_CVAL_EL0 */ + ARM64_SYS_REG(3, 3, 14, 0, 2), + ARM64_SYS_REG(3, 0, 0, 0, 0), /* MIDR_EL1 */ + ARM64_SYS_REG(3, 0, 0, 0, 6), /* REVIDR_EL1 */ + ARM64_SYS_REG(3, 1, 0, 0, 1), /* CLIDR_EL1 */ + ARM64_SYS_REG(3, 1, 0, 0, 7), /* AIDR_EL1 */ + ARM64_SYS_REG(3, 3, 0, 0, 1), /* CTR_EL0 */ + ARM64_SYS_REG(2, 0, 0, 0, 4), + ARM64_SYS_REG(2, 0, 0, 0, 5), + ARM64_SYS_REG(2, 0, 0, 0, 6), + ARM64_SYS_REG(2, 0, 0, 0, 7), + ARM64_SYS_REG(2, 0, 0, 1, 4), + ARM64_SYS_REG(2, 0, 0, 1, 5), + ARM64_SYS_REG(2, 0, 0, 1, 6), + ARM64_SYS_REG(2, 0, 0, 1, 7), + ARM64_SYS_REG(2, 0, 0, 2, 0), /* MDCCINT_EL1 */ + ARM64_SYS_REG(2, 0, 0, 2, 2), /* MDSCR_EL1 */ + ARM64_SYS_REG(2, 0, 0, 2, 4), + ARM64_SYS_REG(2, 0, 0, 2, 5), + ARM64_SYS_REG(2, 0, 0, 2, 6), + ARM64_SYS_REG(2, 0, 0, 2, 7), + ARM64_SYS_REG(2, 0, 0, 3, 4), + ARM64_SYS_REG(2, 0, 0, 3, 5), + ARM64_SYS_REG(2, 0, 0, 3, 6), + ARM64_SYS_REG(2, 0, 0, 3, 7), + ARM64_SYS_REG(2, 0, 0, 4, 4), + ARM64_SYS_REG(2, 0, 0, 4, 5), + ARM64_SYS_REG(2, 0, 0, 4, 6), + ARM64_SYS_REG(2, 0, 0, 4, 7), + ARM64_SYS_REG(2, 0, 0, 5, 4), + ARM64_SYS_REG(2, 0, 0, 5, 5), + ARM64_SYS_REG(2, 0, 0, 5, 6), + ARM64_SYS_REG(2, 0, 0, 5, 7), + ARM64_SYS_REG(2, 0, 0, 6, 4), + ARM64_SYS_REG(2, 0, 0, 6, 5), + ARM64_SYS_REG(2, 0, 0, 6, 6), + ARM64_SYS_REG(2, 0, 0, 6, 7), + ARM64_SYS_REG(2, 0, 0, 7, 4), + ARM64_SYS_REG(2, 0, 0, 7, 5), + ARM64_SYS_REG(2, 0, 0, 7, 6), + ARM64_SYS_REG(2, 0, 0, 7, 7), + ARM64_SYS_REG(2, 0, 0, 8, 4), + ARM64_SYS_REG(2, 0, 0, 8, 5), + ARM64_SYS_REG(2, 0, 0, 8, 6), + ARM64_SYS_REG(2, 0, 0, 8, 7), + ARM64_SYS_REG(2, 0, 0, 9, 4), + ARM64_SYS_REG(2, 0, 0, 9, 5), + ARM64_SYS_REG(2, 0, 0, 9, 6), + ARM64_SYS_REG(2, 0, 0, 9, 7), + ARM64_SYS_REG(2, 0, 0, 10, 4), + ARM64_SYS_REG(2, 0, 0, 10, 5), + ARM64_SYS_REG(2, 0, 0, 10, 6), + ARM64_SYS_REG(2, 0, 0, 10, 7), + ARM64_SYS_REG(2, 0, 0, 11, 4), + ARM64_SYS_REG(2, 0, 0, 11, 5), + ARM64_SYS_REG(2, 0, 0, 11, 6), + ARM64_SYS_REG(2, 0, 0, 11, 7), + ARM64_SYS_REG(2, 0, 0, 12, 4), + ARM64_SYS_REG(2, 0, 0, 12, 5), + ARM64_SYS_REG(2, 0, 0, 12, 6), + ARM64_SYS_REG(2, 0, 0, 12, 7), + ARM64_SYS_REG(2, 0, 0, 13, 4), + ARM64_SYS_REG(2, 0, 0, 13, 5), + ARM64_SYS_REG(2, 0, 0, 13, 6), + ARM64_SYS_REG(2, 0, 0, 13, 7), + ARM64_SYS_REG(2, 0, 0, 14, 4), + ARM64_SYS_REG(2, 0, 0, 14, 5), + ARM64_SYS_REG(2, 0, 0, 14, 6), + ARM64_SYS_REG(2, 0, 0, 14, 7), + ARM64_SYS_REG(2, 0, 0, 15, 4), + ARM64_SYS_REG(2, 0, 0, 15, 5), + ARM64_SYS_REG(2, 0, 0, 15, 6), + ARM64_SYS_REG(2, 0, 0, 15, 7), + ARM64_SYS_REG(2, 4, 0, 7, 0), /* DBGVCR32_EL2 */ + ARM64_SYS_REG(3, 0, 0, 0, 5), /* MPIDR_EL1 */ + ARM64_SYS_REG(3, 0, 0, 1, 0), /* ID_PFR0_EL1 */ + ARM64_SYS_REG(3, 0, 0, 1, 1), /* ID_PFR1_EL1 */ + ARM64_SYS_REG(3, 0, 0, 1, 2), /* ID_DFR0_EL1 */ + ARM64_SYS_REG(3, 0, 0, 1, 3), /* ID_AFR0_EL1 */ + ARM64_SYS_REG(3, 0, 0, 1, 4), /* ID_MMFR0_EL1 */ + ARM64_SYS_REG(3, 0, 0, 1, 5), /* ID_MMFR1_EL1 */ + ARM64_SYS_REG(3, 0, 0, 1, 6), /* ID_MMFR2_EL1 */ + ARM64_SYS_REG(3, 0, 0, 1, 7), /* ID_MMFR3_EL1 */ + ARM64_SYS_REG(3, 0, 0, 2, 0), /* ID_ISAR0_EL1 */ + ARM64_SYS_REG(3, 0, 0, 2, 1), /* ID_ISAR1_EL1 */ + ARM64_SYS_REG(3, 0, 0, 2, 2), /* ID_ISAR2_EL1 */ + ARM64_SYS_REG(3, 0, 0, 2, 3), /* ID_ISAR3_EL1 */ + ARM64_SYS_REG(3, 0, 0, 2, 4), /* ID_ISAR4_EL1 */ + ARM64_SYS_REG(3, 0, 0, 2, 5), /* ID_ISAR5_EL1 */ + ARM64_SYS_REG(3, 0, 0, 2, 6), /* ID_MMFR4_EL1 */ + ARM64_SYS_REG(3, 0, 0, 2, 7), /* ID_ISAR6_EL1 */ + ARM64_SYS_REG(3, 0, 0, 3, 0), /* MVFR0_EL1 */ + ARM64_SYS_REG(3, 0, 0, 3, 1), /* MVFR1_EL1 */ + ARM64_SYS_REG(3, 0, 0, 3, 2), /* MVFR2_EL1 */ + ARM64_SYS_REG(3, 0, 0, 3, 3), + ARM64_SYS_REG(3, 0, 0, 3, 4), /* ID_PFR2_EL1 */ + ARM64_SYS_REG(3, 0, 0, 3, 5), /* ID_DFR1_EL1 */ + ARM64_SYS_REG(3, 0, 0, 3, 6), /* ID_MMFR5_EL1 */ + ARM64_SYS_REG(3, 0, 0, 3, 7), + ARM64_SYS_REG(3, 0, 0, 4, 0), /* ID_AA64PFR0_EL1 */ + ARM64_SYS_REG(3, 0, 0, 4, 1), /* ID_AA64PFR1_EL1 */ + ARM64_SYS_REG(3, 0, 0, 4, 2), + ARM64_SYS_REG(3, 0, 0, 4, 3), + ARM64_SYS_REG(3, 0, 0, 4, 4), /* ID_AA64ZFR0_EL1 */ + ARM64_SYS_REG(3, 0, 0, 4, 5), + ARM64_SYS_REG(3, 0, 0, 4, 6), + ARM64_SYS_REG(3, 0, 0, 4, 7), + ARM64_SYS_REG(3, 0, 0, 5, 0), /* ID_AA64DFR0_EL1 */ + ARM64_SYS_REG(3, 0, 0, 5, 1), /* ID_AA64DFR1_EL1 */ + ARM64_SYS_REG(3, 0, 0, 5, 2), + ARM64_SYS_REG(3, 0, 0, 5, 3), + ARM64_SYS_REG(3, 0, 0, 5, 4), /* ID_AA64AFR0_EL1 */ + ARM64_SYS_REG(3, 0, 0, 5, 5), /* ID_AA64AFR1_EL1 */ + ARM64_SYS_REG(3, 0, 0, 5, 6), + ARM64_SYS_REG(3, 0, 0, 5, 7), + ARM64_SYS_REG(3, 0, 0, 6, 0), /* ID_AA64ISAR0_EL1 */ + ARM64_SYS_REG(3, 0, 0, 6, 1), /* ID_AA64ISAR1_EL1 */ + ARM64_SYS_REG(3, 0, 0, 6, 2), + ARM64_SYS_REG(3, 0, 0, 6, 3), + ARM64_SYS_REG(3, 0, 0, 6, 4), + ARM64_SYS_REG(3, 0, 0, 6, 5), + ARM64_SYS_REG(3, 0, 0, 6, 6), + ARM64_SYS_REG(3, 0, 0, 6, 7), + ARM64_SYS_REG(3, 0, 0, 7, 0), /* ID_AA64MMFR0_EL1 */ + ARM64_SYS_REG(3, 0, 0, 7, 1), /* ID_AA64MMFR1_EL1 */ + ARM64_SYS_REG(3, 0, 0, 7, 2), /* ID_AA64MMFR2_EL1 */ + ARM64_SYS_REG(3, 0, 0, 7, 3), + ARM64_SYS_REG(3, 0, 0, 7, 4), + ARM64_SYS_REG(3, 0, 0, 7, 5), + ARM64_SYS_REG(3, 0, 0, 7, 6), + ARM64_SYS_REG(3, 0, 0, 7, 7), + ARM64_SYS_REG(3, 0, 1, 0, 0), /* SCTLR_EL1 */ + ARM64_SYS_REG(3, 0, 1, 0, 1), /* ACTLR_EL1 */ + ARM64_SYS_REG(3, 0, 1, 0, 2), /* CPACR_EL1 */ + ARM64_SYS_REG(3, 0, 2, 0, 0), /* TTBR0_EL1 */ + ARM64_SYS_REG(3, 0, 2, 0, 1), /* TTBR1_EL1 */ + ARM64_SYS_REG(3, 0, 2, 0, 2), /* TCR_EL1 */ + ARM64_SYS_REG(3, 0, 5, 1, 0), /* AFSR0_EL1 */ + ARM64_SYS_REG(3, 0, 5, 1, 1), /* AFSR1_EL1 */ + ARM64_SYS_REG(3, 0, 5, 2, 0), /* ESR_EL1 */ + ARM64_SYS_REG(3, 0, 6, 0, 0), /* FAR_EL1 */ + ARM64_SYS_REG(3, 0, 7, 4, 0), /* PAR_EL1 */ + ARM64_SYS_REG(3, 0, 9, 14, 1), /* PMINTENSET_EL1 */ + ARM64_SYS_REG(3, 0, 9, 14, 2), /* PMINTENCLR_EL1 */ + ARM64_SYS_REG(3, 0, 10, 2, 0), /* MAIR_EL1 */ + ARM64_SYS_REG(3, 0, 10, 3, 0), /* AMAIR_EL1 */ + ARM64_SYS_REG(3, 0, 12, 0, 0), /* VBAR_EL1 */ + ARM64_SYS_REG(3, 0, 12, 1, 1), /* DISR_EL1 */ + ARM64_SYS_REG(3, 0, 13, 0, 1), /* CONTEXTIDR_EL1 */ + ARM64_SYS_REG(3, 0, 13, 0, 4), /* TPIDR_EL1 */ + ARM64_SYS_REG(3, 0, 14, 1, 0), /* CNTKCTL_EL1 */ + ARM64_SYS_REG(3, 2, 0, 0, 0), /* CSSELR_EL1 */ + ARM64_SYS_REG(3, 3, 9, 12, 0), /* PMCR_EL0 */ + ARM64_SYS_REG(3, 3, 9, 12, 1), /* PMCNTENSET_EL0 */ + ARM64_SYS_REG(3, 3, 9, 12, 2), /* PMCNTENCLR_EL0 */ + ARM64_SYS_REG(3, 3, 9, 12, 3), /* PMOVSCLR_EL0 */ + ARM64_SYS_REG(3, 3, 9, 12, 4), /* PMSWINC_EL0 */ + ARM64_SYS_REG(3, 3, 9, 12, 5), /* PMSELR_EL0 */ + ARM64_SYS_REG(3, 3, 9, 13, 0), /* PMCCNTR_EL0 */ + ARM64_SYS_REG(3, 3, 9, 14, 0), /* PMUSERENR_EL0 */ + ARM64_SYS_REG(3, 3, 9, 14, 3), /* PMOVSSET_EL0 */ + ARM64_SYS_REG(3, 3, 13, 0, 2), /* TPIDR_EL0 */ + ARM64_SYS_REG(3, 3, 13, 0, 3), /* TPIDRRO_EL0 */ + ARM64_SYS_REG(3, 3, 14, 8, 0), + ARM64_SYS_REG(3, 3, 14, 8, 1), + ARM64_SYS_REG(3, 3, 14, 8, 2), + ARM64_SYS_REG(3, 3, 14, 8, 3), + ARM64_SYS_REG(3, 3, 14, 8, 4), + ARM64_SYS_REG(3, 3, 14, 8, 5), + ARM64_SYS_REG(3, 3, 14, 8, 6), + ARM64_SYS_REG(3, 3, 14, 8, 7), + ARM64_SYS_REG(3, 3, 14, 9, 0), + ARM64_SYS_REG(3, 3, 14, 9, 1), + ARM64_SYS_REG(3, 3, 14, 9, 2), + ARM64_SYS_REG(3, 3, 14, 9, 3), + ARM64_SYS_REG(3, 3, 14, 9, 4), + ARM64_SYS_REG(3, 3, 14, 9, 5), + ARM64_SYS_REG(3, 3, 14, 9, 6), + ARM64_SYS_REG(3, 3, 14, 9, 7), + ARM64_SYS_REG(3, 3, 14, 10, 0), + ARM64_SYS_REG(3, 3, 14, 10, 1), + ARM64_SYS_REG(3, 3, 14, 10, 2), + ARM64_SYS_REG(3, 3, 14, 10, 3), + ARM64_SYS_REG(3, 3, 14, 10, 4), + ARM64_SYS_REG(3, 3, 14, 10, 5), + ARM64_SYS_REG(3, 3, 14, 10, 6), + ARM64_SYS_REG(3, 3, 14, 10, 7), + ARM64_SYS_REG(3, 3, 14, 11, 0), + ARM64_SYS_REG(3, 3, 14, 11, 1), + ARM64_SYS_REG(3, 3, 14, 11, 2), + ARM64_SYS_REG(3, 3, 14, 11, 3), + ARM64_SYS_REG(3, 3, 14, 11, 4), + ARM64_SYS_REG(3, 3, 14, 11, 5), + ARM64_SYS_REG(3, 3, 14, 11, 6), + ARM64_SYS_REG(3, 3, 14, 12, 0), + ARM64_SYS_REG(3, 3, 14, 12, 1), + ARM64_SYS_REG(3, 3, 14, 12, 2), + ARM64_SYS_REG(3, 3, 14, 12, 3), + ARM64_SYS_REG(3, 3, 14, 12, 4), + ARM64_SYS_REG(3, 3, 14, 12, 5), + ARM64_SYS_REG(3, 3, 14, 12, 6), + ARM64_SYS_REG(3, 3, 14, 12, 7), + ARM64_SYS_REG(3, 3, 14, 13, 0), + ARM64_SYS_REG(3, 3, 14, 13, 1), + ARM64_SYS_REG(3, 3, 14, 13, 2), + ARM64_SYS_REG(3, 3, 14, 13, 3), + ARM64_SYS_REG(3, 3, 14, 13, 4), + ARM64_SYS_REG(3, 3, 14, 13, 5), + ARM64_SYS_REG(3, 3, 14, 13, 6), + ARM64_SYS_REG(3, 3, 14, 13, 7), + ARM64_SYS_REG(3, 3, 14, 14, 0), + ARM64_SYS_REG(3, 3, 14, 14, 1), + ARM64_SYS_REG(3, 3, 14, 14, 2), + ARM64_SYS_REG(3, 3, 14, 14, 3), + ARM64_SYS_REG(3, 3, 14, 14, 4), + ARM64_SYS_REG(3, 3, 14, 14, 5), + ARM64_SYS_REG(3, 3, 14, 14, 6), + ARM64_SYS_REG(3, 3, 14, 14, 7), + ARM64_SYS_REG(3, 3, 14, 15, 0), + ARM64_SYS_REG(3, 3, 14, 15, 1), + ARM64_SYS_REG(3, 3, 14, 15, 2), + ARM64_SYS_REG(3, 3, 14, 15, 3), + ARM64_SYS_REG(3, 3, 14, 15, 4), + ARM64_SYS_REG(3, 3, 14, 15, 5), + ARM64_SYS_REG(3, 3, 14, 15, 6), + ARM64_SYS_REG(3, 3, 14, 15, 7), /* PMCCFILTR_EL0 */ + ARM64_SYS_REG(3, 4, 3, 0, 0), /* DACR32_EL2 */ + ARM64_SYS_REG(3, 4, 5, 0, 1), /* IFSR32_EL2 */ + ARM64_SYS_REG(3, 4, 5, 3, 0), /* FPEXC32_EL2 */ + KVM_REG_ARM64 | KVM_REG_SIZE_U32 | KVM_REG_ARM_DEMUX | KVM_REG_ARM_DEMUX_ID_CCSIDR | 0, + KVM_REG_ARM64 | KVM_REG_SIZE_U32 | KVM_REG_ARM_DEMUX | KVM_REG_ARM_DEMUX_ID_CCSIDR | 1, + KVM_REG_ARM64 | KVM_REG_SIZE_U32 | KVM_REG_ARM_DEMUX | KVM_REG_ARM_DEMUX_ID_CCSIDR | 2, +}; +static __u64 base_regs_n = ARRAY_SIZE(base_regs); + +static __u64 vregs[] = { + KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[0]), + KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[1]), + KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[2]), + KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[3]), + KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[4]), + KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[5]), + KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[6]), + KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[7]), + KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[8]), + KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[9]), + KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[10]), + KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[11]), + KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[12]), + KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[13]), + KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[14]), + KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[15]), + KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[16]), + KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[17]), + KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[18]), + KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[19]), + KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[20]), + KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[21]), + KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[22]), + KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[23]), + KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[24]), + KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[25]), + KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[26]), + KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[27]), + KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[28]), + KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[29]), + KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[30]), + KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[31]), +}; +static __u64 vregs_n = ARRAY_SIZE(vregs); + +static __u64 sve_regs[] = { + KVM_REG_ARM64_SVE_VLS, + KVM_REG_ARM64_SVE_ZREG(0, 0), + KVM_REG_ARM64_SVE_ZREG(1, 0), + KVM_REG_ARM64_SVE_ZREG(2, 0), + KVM_REG_ARM64_SVE_ZREG(3, 0), + KVM_REG_ARM64_SVE_ZREG(4, 0), + KVM_REG_ARM64_SVE_ZREG(5, 0), + KVM_REG_ARM64_SVE_ZREG(6, 0), + KVM_REG_ARM64_SVE_ZREG(7, 0), + KVM_REG_ARM64_SVE_ZREG(8, 0), + KVM_REG_ARM64_SVE_ZREG(9, 0), + KVM_REG_ARM64_SVE_ZREG(10, 0), + KVM_REG_ARM64_SVE_ZREG(11, 0), + KVM_REG_ARM64_SVE_ZREG(12, 0), + KVM_REG_ARM64_SVE_ZREG(13, 0), + KVM_REG_ARM64_SVE_ZREG(14, 0), + KVM_REG_ARM64_SVE_ZREG(15, 0), + KVM_REG_ARM64_SVE_ZREG(16, 0), + KVM_REG_ARM64_SVE_ZREG(17, 0), + KVM_REG_ARM64_SVE_ZREG(18, 0), + KVM_REG_ARM64_SVE_ZREG(19, 0), + KVM_REG_ARM64_SVE_ZREG(20, 0), + KVM_REG_ARM64_SVE_ZREG(21, 0), + KVM_REG_ARM64_SVE_ZREG(22, 0), + KVM_REG_ARM64_SVE_ZREG(23, 0), + KVM_REG_ARM64_SVE_ZREG(24, 0), + KVM_REG_ARM64_SVE_ZREG(25, 0), + KVM_REG_ARM64_SVE_ZREG(26, 0), + KVM_REG_ARM64_SVE_ZREG(27, 0), + KVM_REG_ARM64_SVE_ZREG(28, 0), + KVM_REG_ARM64_SVE_ZREG(29, 0), + KVM_REG_ARM64_SVE_ZREG(30, 0), + KVM_REG_ARM64_SVE_ZREG(31, 0), + KVM_REG_ARM64_SVE_PREG(0, 0), + KVM_REG_ARM64_SVE_PREG(1, 0), + KVM_REG_ARM64_SVE_PREG(2, 0), + KVM_REG_ARM64_SVE_PREG(3, 0), + KVM_REG_ARM64_SVE_PREG(4, 0), + KVM_REG_ARM64_SVE_PREG(5, 0), + KVM_REG_ARM64_SVE_PREG(6, 0), + KVM_REG_ARM64_SVE_PREG(7, 0), + KVM_REG_ARM64_SVE_PREG(8, 0), + KVM_REG_ARM64_SVE_PREG(9, 0), + KVM_REG_ARM64_SVE_PREG(10, 0), + KVM_REG_ARM64_SVE_PREG(11, 0), + KVM_REG_ARM64_SVE_PREG(12, 0), + KVM_REG_ARM64_SVE_PREG(13, 0), + KVM_REG_ARM64_SVE_PREG(14, 0), + KVM_REG_ARM64_SVE_PREG(15, 0), + KVM_REG_ARM64_SVE_FFR(0), + ARM64_SYS_REG(3, 0, 1, 2, 0), /* ZCR_EL1 */ +}; +static __u64 sve_regs_n = ARRAY_SIZE(sve_regs); + +static __u64 rejects_set[] = { +#ifdef REG_LIST_SVE + KVM_REG_ARM64_SVE_VLS, +#endif +}; +static __u64 rejects_set_n = ARRAY_SIZE(rejects_set); diff --git a/tools/testing/selftests/kvm/clear_dirty_log_test.c b/tools/testing/selftests/kvm/clear_dirty_log_test.c deleted file mode 100644 index 11672ec6f74e..000000000000 --- a/tools/testing/selftests/kvm/clear_dirty_log_test.c +++ /dev/null @@ -1,6 +0,0 @@ -#define USE_CLEAR_DIRTY_LOG -#define KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE (1 << 0) -#define KVM_DIRTY_LOG_INITIALLY_SET (1 << 1) -#define KVM_DIRTY_LOG_MANUAL_CAPS (KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE | \ - KVM_DIRTY_LOG_INITIALLY_SET) -#include "dirty_log_test.c" diff --git a/tools/testing/selftests/kvm/demand_paging_test.c b/tools/testing/selftests/kvm/demand_paging_test.c index 360cd3ea4cd6..3d96a7bfaff3 100644 --- a/tools/testing/selftests/kvm/demand_paging_test.c +++ b/tools/testing/selftests/kvm/demand_paging_test.c @@ -21,20 +21,12 @@ #include <linux/bitops.h> #include <linux/userfaultfd.h> -#include "test_util.h" -#include "kvm_util.h" +#include "perf_test_util.h" #include "processor.h" +#include "test_util.h" #ifdef __NR_userfaultfd -/* The memory slot index demand page */ -#define TEST_MEM_SLOT_INDEX 1 - -/* Default guest test virtual memory offset */ -#define DEFAULT_GUEST_TEST_MEM 0xc0000000 - -#define DEFAULT_GUEST_TEST_MEM_SIZE (1 << 30) /* 1G */ - #ifdef PRINT_PER_PAGE_UPDATES #define PER_PAGE_DEBUG(...) printf(__VA_ARGS__) #else @@ -47,77 +39,17 @@ #define PER_VCPU_DEBUG(...) _no_printf(__VA_ARGS__) #endif -#define MAX_VCPUS 512 - -/* - * Guest/Host shared variables. Ensure addr_gva2hva() and/or - * sync_global_to/from_guest() are used when accessing from - * the host. READ/WRITE_ONCE() should also be used with anything - * that may change. - */ -static uint64_t host_page_size; -static uint64_t guest_page_size; - static char *guest_data_prototype; -/* - * Guest physical memory offset of the testing memory slot. - * This will be set to the topmost valid physical address minus - * the test memory size. - */ -static uint64_t guest_test_phys_mem; - -/* - * Guest virtual memory offset of the testing memory slot. - * Must not conflict with identity mapped test code. - */ -static uint64_t guest_test_virt_mem = DEFAULT_GUEST_TEST_MEM; - -struct vcpu_args { - uint64_t gva; - uint64_t pages; - - /* Only used by the host userspace part of the vCPU thread */ - int vcpu_id; - struct kvm_vm *vm; -}; - -static struct vcpu_args vcpu_args[MAX_VCPUS]; - -/* - * Continuously write to the first 8 bytes of each page in the demand paging - * memory region. - */ -static void guest_code(uint32_t vcpu_id) -{ - uint64_t gva; - uint64_t pages; - int i; - - /* Make sure vCPU args data structure is not corrupt. */ - GUEST_ASSERT(vcpu_args[vcpu_id].vcpu_id == vcpu_id); - - gva = vcpu_args[vcpu_id].gva; - pages = vcpu_args[vcpu_id].pages; - - for (i = 0; i < pages; i++) { - uint64_t addr = gva + (i * guest_page_size); - - addr &= ~(host_page_size - 1); - *(uint64_t *)addr = 0x0123456789ABCDEF; - } - - GUEST_SYNC(1); -} - static void *vcpu_worker(void *data) { int ret; - struct vcpu_args *args = (struct vcpu_args *)data; - struct kvm_vm *vm = args->vm; - int vcpu_id = args->vcpu_id; + struct vcpu_args *vcpu_args = (struct vcpu_args *)data; + int vcpu_id = vcpu_args->vcpu_id; + struct kvm_vm *vm = perf_test_args.vm; struct kvm_run *run; - struct timespec start, end, ts_diff; + struct timespec start; + struct timespec ts_diff; vcpu_args_set(vm, vcpu_id, 1, vcpu_id); run = vcpu_state(vm, vcpu_id); @@ -133,52 +65,18 @@ static void *vcpu_worker(void *data) exit_reason_str(run->exit_reason)); } - clock_gettime(CLOCK_MONOTONIC, &end); - ts_diff = timespec_sub(end, start); + ts_diff = timespec_diff_now(start); PER_VCPU_DEBUG("vCPU %d execution time: %ld.%.9lds\n", vcpu_id, ts_diff.tv_sec, ts_diff.tv_nsec); return NULL; } -#define PAGE_SHIFT_4K 12 -#define PTES_PER_4K_PT 512 - -static struct kvm_vm *create_vm(enum vm_guest_mode mode, int vcpus, - uint64_t vcpu_memory_bytes) -{ - struct kvm_vm *vm; - uint64_t pages = DEFAULT_GUEST_PHY_PAGES; - - /* Account for a few pages per-vCPU for stacks */ - pages += DEFAULT_STACK_PGS * vcpus; - - /* - * Reserve twice the ammount of memory needed to map the test region and - * the page table / stacks region, at 4k, for page tables. Do the - * calculation with 4K page size: the smallest of all archs. (e.g., 64K - * page size guest will need even less memory for page tables). - */ - pages += (2 * pages) / PTES_PER_4K_PT; - pages += ((2 * vcpus * vcpu_memory_bytes) >> PAGE_SHIFT_4K) / - PTES_PER_4K_PT; - pages = vm_adjust_num_guest_pages(mode, pages); - - pr_info("Testing guest mode: %s\n", vm_guest_mode_string(mode)); - - vm = _vm_create(mode, pages, O_RDWR); - kvm_vm_elf_load(vm, program_invocation_name, 0, 0); -#ifdef __x86_64__ - vm_create_irqchip(vm); -#endif - return vm; -} - static int handle_uffd_page_request(int uffd, uint64_t addr) { pid_t tid; struct timespec start; - struct timespec end; + struct timespec ts_diff; struct uffdio_copy copy; int r; @@ -186,7 +84,7 @@ static int handle_uffd_page_request(int uffd, uint64_t addr) copy.src = (uint64_t)guest_data_prototype; copy.dst = addr; - copy.len = host_page_size; + copy.len = perf_test_args.host_page_size; copy.mode = 0; clock_gettime(CLOCK_MONOTONIC, &start); @@ -198,12 +96,12 @@ static int handle_uffd_page_request(int uffd, uint64_t addr) return r; } - clock_gettime(CLOCK_MONOTONIC, &end); + ts_diff = timespec_diff_now(start); PER_PAGE_DEBUG("UFFDIO_COPY %d \t%ld ns\n", tid, - timespec_to_ns(timespec_sub(end, start))); + timespec_to_ns(ts_diff)); PER_PAGE_DEBUG("Paged in %ld bytes at 0x%lx from thread %d\n", - host_page_size, addr, tid); + perf_test_args.host_page_size, addr, tid); return 0; } @@ -223,7 +121,8 @@ static void *uffd_handler_thread_fn(void *arg) int pipefd = uffd_args->pipefd; useconds_t delay = uffd_args->delay; int64_t pages = 0; - struct timespec start, end, ts_diff; + struct timespec start; + struct timespec ts_diff; clock_gettime(CLOCK_MONOTONIC, &start); while (!quit_uffd_thread) { @@ -292,8 +191,7 @@ static void *uffd_handler_thread_fn(void *arg) pages++; } - clock_gettime(CLOCK_MONOTONIC, &end); - ts_diff = timespec_sub(end, start); + ts_diff = timespec_diff_now(start); PER_VCPU_DEBUG("userfaulted %ld pages over %ld.%.9lds. (%f/sec)\n", pages, ts_diff.tv_sec, ts_diff.tv_nsec, pages / ((double)ts_diff.tv_sec + (double)ts_diff.tv_nsec / 100000000.0)); @@ -351,99 +249,54 @@ static int setup_demand_paging(struct kvm_vm *vm, } static void run_test(enum vm_guest_mode mode, bool use_uffd, - useconds_t uffd_delay, int vcpus, - uint64_t vcpu_memory_bytes) + useconds_t uffd_delay) { pthread_t *vcpu_threads; pthread_t *uffd_handler_threads = NULL; struct uffd_handler_args *uffd_args = NULL; - struct timespec start, end, ts_diff; + struct timespec start; + struct timespec ts_diff; int *pipefds = NULL; struct kvm_vm *vm; - uint64_t guest_num_pages; int vcpu_id; int r; - vm = create_vm(mode, vcpus, vcpu_memory_bytes); - - guest_page_size = vm_get_page_size(vm); - - TEST_ASSERT(vcpu_memory_bytes % guest_page_size == 0, - "Guest memory size is not guest page size aligned."); - - guest_num_pages = (vcpus * vcpu_memory_bytes) / guest_page_size; - guest_num_pages = vm_adjust_num_guest_pages(mode, guest_num_pages); - - /* - * If there should be more memory in the guest test region than there - * can be pages in the guest, it will definitely cause problems. - */ - TEST_ASSERT(guest_num_pages < vm_get_max_gfn(vm), - "Requested more guest memory than address space allows.\n" - " guest pages: %lx max gfn: %x vcpus: %d wss: %lx]\n", - guest_num_pages, vm_get_max_gfn(vm), vcpus, - vcpu_memory_bytes); - - host_page_size = getpagesize(); - TEST_ASSERT(vcpu_memory_bytes % host_page_size == 0, - "Guest memory size is not host page size aligned."); + vm = create_vm(mode, nr_vcpus, guest_percpu_mem_size); - guest_test_phys_mem = (vm_get_max_gfn(vm) - guest_num_pages) * - guest_page_size; - guest_test_phys_mem &= ~(host_page_size - 1); + perf_test_args.wr_fract = 1; -#ifdef __s390x__ - /* Align to 1M (segment size) */ - guest_test_phys_mem &= ~((1 << 20) - 1); -#endif - - pr_info("guest physical test memory offset: 0x%lx\n", guest_test_phys_mem); - - /* Add an extra memory slot for testing demand paging */ - vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS, - guest_test_phys_mem, - TEST_MEM_SLOT_INDEX, - guest_num_pages, 0); - - /* Do mapping for the demand paging memory slot */ - virt_map(vm, guest_test_virt_mem, guest_test_phys_mem, guest_num_pages, 0); - - ucall_init(vm, NULL); - - guest_data_prototype = malloc(host_page_size); + guest_data_prototype = malloc(perf_test_args.host_page_size); TEST_ASSERT(guest_data_prototype, "Failed to allocate buffer for guest data pattern"); - memset(guest_data_prototype, 0xAB, host_page_size); + memset(guest_data_prototype, 0xAB, perf_test_args.host_page_size); - vcpu_threads = malloc(vcpus * sizeof(*vcpu_threads)); + vcpu_threads = malloc(nr_vcpus * sizeof(*vcpu_threads)); TEST_ASSERT(vcpu_threads, "Memory allocation failed"); + add_vcpus(vm, nr_vcpus, guest_percpu_mem_size); + if (use_uffd) { uffd_handler_threads = - malloc(vcpus * sizeof(*uffd_handler_threads)); + malloc(nr_vcpus * sizeof(*uffd_handler_threads)); TEST_ASSERT(uffd_handler_threads, "Memory allocation failed"); - uffd_args = malloc(vcpus * sizeof(*uffd_args)); + uffd_args = malloc(nr_vcpus * sizeof(*uffd_args)); TEST_ASSERT(uffd_args, "Memory allocation failed"); - pipefds = malloc(sizeof(int) * vcpus * 2); + pipefds = malloc(sizeof(int) * nr_vcpus * 2); TEST_ASSERT(pipefds, "Unable to allocate memory for pipefd"); - } - - for (vcpu_id = 0; vcpu_id < vcpus; vcpu_id++) { - vm_paddr_t vcpu_gpa; - void *vcpu_hva; - vm_vcpu_add_default(vm, vcpu_id, guest_code); + for (vcpu_id = 0; vcpu_id < nr_vcpus; vcpu_id++) { + vm_paddr_t vcpu_gpa; + void *vcpu_hva; - vcpu_gpa = guest_test_phys_mem + (vcpu_id * vcpu_memory_bytes); - PER_VCPU_DEBUG("Added VCPU %d with test mem gpa [%lx, %lx)\n", - vcpu_id, vcpu_gpa, vcpu_gpa + vcpu_memory_bytes); + vcpu_gpa = guest_test_phys_mem + (vcpu_id * guest_percpu_mem_size); + PER_VCPU_DEBUG("Added VCPU %d with test mem gpa [%lx, %lx)\n", + vcpu_id, vcpu_gpa, vcpu_gpa + guest_percpu_mem_size); - /* Cache the HVA pointer of the region */ - vcpu_hva = addr_gpa2hva(vm, vcpu_gpa); + /* Cache the HVA pointer of the region */ + vcpu_hva = addr_gpa2hva(vm, vcpu_gpa); - if (use_uffd) { /* * Set up user fault fd to handle demand paging * requests. @@ -456,53 +309,41 @@ static void run_test(enum vm_guest_mode mode, bool use_uffd, &uffd_handler_threads[vcpu_id], pipefds[vcpu_id * 2], uffd_delay, &uffd_args[vcpu_id], - vcpu_hva, vcpu_memory_bytes); + vcpu_hva, guest_percpu_mem_size); if (r < 0) exit(-r); } - -#ifdef __x86_64__ - vcpu_set_cpuid(vm, vcpu_id, kvm_get_supported_cpuid()); -#endif - - vcpu_args[vcpu_id].vm = vm; - vcpu_args[vcpu_id].vcpu_id = vcpu_id; - vcpu_args[vcpu_id].gva = guest_test_virt_mem + - (vcpu_id * vcpu_memory_bytes); - vcpu_args[vcpu_id].pages = vcpu_memory_bytes / guest_page_size; } /* Export the shared variables to the guest */ - sync_global_to_guest(vm, host_page_size); - sync_global_to_guest(vm, guest_page_size); - sync_global_to_guest(vm, vcpu_args); + sync_global_to_guest(vm, perf_test_args); pr_info("Finished creating vCPUs and starting uffd threads\n"); clock_gettime(CLOCK_MONOTONIC, &start); - for (vcpu_id = 0; vcpu_id < vcpus; vcpu_id++) { + for (vcpu_id = 0; vcpu_id < nr_vcpus; vcpu_id++) { pthread_create(&vcpu_threads[vcpu_id], NULL, vcpu_worker, - &vcpu_args[vcpu_id]); + &perf_test_args.vcpu_args[vcpu_id]); } pr_info("Started all vCPUs\n"); /* Wait for the vcpu threads to quit */ - for (vcpu_id = 0; vcpu_id < vcpus; vcpu_id++) { + for (vcpu_id = 0; vcpu_id < nr_vcpus; vcpu_id++) { pthread_join(vcpu_threads[vcpu_id], NULL); PER_VCPU_DEBUG("Joined thread for vCPU %d\n", vcpu_id); } - pr_info("All vCPU threads joined\n"); + ts_diff = timespec_diff_now(start); - clock_gettime(CLOCK_MONOTONIC, &end); + pr_info("All vCPU threads joined\n"); if (use_uffd) { char c; /* Tell the user fault fd handler threads to quit */ - for (vcpu_id = 0; vcpu_id < vcpus; vcpu_id++) { + for (vcpu_id = 0; vcpu_id < nr_vcpus; vcpu_id++) { r = write(pipefds[vcpu_id * 2 + 1], &c, 1); TEST_ASSERT(r == 1, "Unable to write to pipefd"); @@ -510,11 +351,11 @@ static void run_test(enum vm_guest_mode mode, bool use_uffd, } } - ts_diff = timespec_sub(end, start); pr_info("Total guest execution time: %ld.%.9lds\n", ts_diff.tv_sec, ts_diff.tv_nsec); pr_info("Overall demand paging rate: %f pgs/sec\n", - guest_num_pages / ((double)ts_diff.tv_sec + (double)ts_diff.tv_nsec / 100000000.0)); + perf_test_args.vcpu_args[0].pages * nr_vcpus / + ((double)ts_diff.tv_sec + (double)ts_diff.tv_nsec / 100000000.0)); ucall_uninit(vm); kvm_vm_free(vm); @@ -568,9 +409,8 @@ static void help(char *name) int main(int argc, char *argv[]) { + int max_vcpus = kvm_check_cap(KVM_CAP_MAX_VCPUS); bool mode_selected = false; - uint64_t vcpu_memory_bytes = DEFAULT_GUEST_TEST_MEM_SIZE; - int vcpus = 1; unsigned int mode; int opt, i; bool use_uffd = false; @@ -619,15 +459,12 @@ int main(int argc, char *argv[]) "A negative UFFD delay is not supported."); break; case 'b': - vcpu_memory_bytes = parse_size(optarg); + guest_percpu_mem_size = parse_size(optarg); break; case 'v': - vcpus = atoi(optarg); - TEST_ASSERT(vcpus > 0, - "Must have a positive number of vCPUs"); - TEST_ASSERT(vcpus <= MAX_VCPUS, - "This test does not currently support\n" - "more than %d vCPUs.", MAX_VCPUS); + nr_vcpus = atoi(optarg); + TEST_ASSERT(nr_vcpus > 0 && nr_vcpus <= max_vcpus, + "Invalid number of vcpus, must be between 1 and %d", max_vcpus); break; case 'h': default: @@ -642,7 +479,7 @@ int main(int argc, char *argv[]) TEST_ASSERT(guest_modes[i].supported, "Guest mode ID %d (%s) not supported.", i, vm_guest_mode_string(i)); - run_test(i, use_uffd, uffd_delay, vcpus, vcpu_memory_bytes); + run_test(i, use_uffd, uffd_delay); } return 0; diff --git a/tools/testing/selftests/kvm/dirty_log_perf_test.c b/tools/testing/selftests/kvm/dirty_log_perf_test.c new file mode 100644 index 000000000000..85c9b8f73142 --- /dev/null +++ b/tools/testing/selftests/kvm/dirty_log_perf_test.c @@ -0,0 +1,376 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * KVM dirty page logging performance test + * + * Based on dirty_log_test.c + * + * Copyright (C) 2018, Red Hat, Inc. + * Copyright (C) 2020, Google, Inc. + */ + +#define _GNU_SOURCE /* for program_invocation_name */ + +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include <time.h> +#include <pthread.h> +#include <linux/bitmap.h> +#include <linux/bitops.h> + +#include "kvm_util.h" +#include "perf_test_util.h" +#include "processor.h" +#include "test_util.h" + +/* How many host loops to run by default (one KVM_GET_DIRTY_LOG for each loop)*/ +#define TEST_HOST_LOOP_N 2UL + +/* Host variables */ +static bool host_quit; +static uint64_t iteration; +static uint64_t vcpu_last_completed_iteration[MAX_VCPUS]; + +static void *vcpu_worker(void *data) +{ + int ret; + struct kvm_vm *vm = perf_test_args.vm; + uint64_t pages_count = 0; + struct kvm_run *run; + struct timespec start; + struct timespec ts_diff; + struct timespec total = (struct timespec){0}; + struct timespec avg; + struct vcpu_args *vcpu_args = (struct vcpu_args *)data; + int vcpu_id = vcpu_args->vcpu_id; + + vcpu_args_set(vm, vcpu_id, 1, vcpu_id); + run = vcpu_state(vm, vcpu_id); + + while (!READ_ONCE(host_quit)) { + uint64_t current_iteration = READ_ONCE(iteration); + + clock_gettime(CLOCK_MONOTONIC, &start); + ret = _vcpu_run(vm, vcpu_id); + ts_diff = timespec_diff_now(start); + + TEST_ASSERT(ret == 0, "vcpu_run failed: %d\n", ret); + TEST_ASSERT(get_ucall(vm, vcpu_id, NULL) == UCALL_SYNC, + "Invalid guest sync status: exit_reason=%s\n", + exit_reason_str(run->exit_reason)); + + pr_debug("Got sync event from vCPU %d\n", vcpu_id); + vcpu_last_completed_iteration[vcpu_id] = current_iteration; + pr_debug("vCPU %d updated last completed iteration to %lu\n", + vcpu_id, vcpu_last_completed_iteration[vcpu_id]); + + if (current_iteration) { + pages_count += vcpu_args->pages; + total = timespec_add(total, ts_diff); + pr_debug("vCPU %d iteration %lu dirty memory time: %ld.%.9lds\n", + vcpu_id, current_iteration, ts_diff.tv_sec, + ts_diff.tv_nsec); + } else { + pr_debug("vCPU %d iteration %lu populate memory time: %ld.%.9lds\n", + vcpu_id, current_iteration, ts_diff.tv_sec, + ts_diff.tv_nsec); + } + + while (current_iteration == READ_ONCE(iteration) && + !READ_ONCE(host_quit)) {} + } + + avg = timespec_div(total, vcpu_last_completed_iteration[vcpu_id]); + pr_debug("\nvCPU %d dirtied 0x%lx pages over %lu iterations in %ld.%.9lds. (Avg %ld.%.9lds/iteration)\n", + vcpu_id, pages_count, vcpu_last_completed_iteration[vcpu_id], + total.tv_sec, total.tv_nsec, avg.tv_sec, avg.tv_nsec); + + return NULL; +} + +#ifdef USE_CLEAR_DIRTY_LOG +static u64 dirty_log_manual_caps; +#endif + +static void run_test(enum vm_guest_mode mode, unsigned long iterations, + uint64_t phys_offset, int wr_fract) +{ + pthread_t *vcpu_threads; + struct kvm_vm *vm; + unsigned long *bmap; + uint64_t guest_num_pages; + uint64_t host_num_pages; + int vcpu_id; + struct timespec start; + struct timespec ts_diff; + struct timespec get_dirty_log_total = (struct timespec){0}; + struct timespec vcpu_dirty_total = (struct timespec){0}; + struct timespec avg; +#ifdef USE_CLEAR_DIRTY_LOG + struct kvm_enable_cap cap = {}; + struct timespec clear_dirty_log_total = (struct timespec){0}; +#endif + + vm = create_vm(mode, nr_vcpus, guest_percpu_mem_size); + + perf_test_args.wr_fract = wr_fract; + + guest_num_pages = (nr_vcpus * guest_percpu_mem_size) >> vm_get_page_shift(vm); + guest_num_pages = vm_adjust_num_guest_pages(mode, guest_num_pages); + host_num_pages = vm_num_host_pages(mode, guest_num_pages); + bmap = bitmap_alloc(host_num_pages); + +#ifdef USE_CLEAR_DIRTY_LOG + cap.cap = KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2; + cap.args[0] = dirty_log_manual_caps; + vm_enable_cap(vm, &cap); +#endif + + vcpu_threads = malloc(nr_vcpus * sizeof(*vcpu_threads)); + TEST_ASSERT(vcpu_threads, "Memory allocation failed"); + + add_vcpus(vm, nr_vcpus, guest_percpu_mem_size); + + sync_global_to_guest(vm, perf_test_args); + + /* Start the iterations */ + iteration = 0; + host_quit = false; + + clock_gettime(CLOCK_MONOTONIC, &start); + for (vcpu_id = 0; vcpu_id < nr_vcpus; vcpu_id++) { + pthread_create(&vcpu_threads[vcpu_id], NULL, vcpu_worker, + &perf_test_args.vcpu_args[vcpu_id]); + } + + /* Allow the vCPU to populate memory */ + pr_debug("Starting iteration %lu - Populating\n", iteration); + while (READ_ONCE(vcpu_last_completed_iteration[vcpu_id]) != iteration) + pr_debug("Waiting for vcpu_last_completed_iteration == %lu\n", + iteration); + + ts_diff = timespec_diff_now(start); + pr_info("Populate memory time: %ld.%.9lds\n", + ts_diff.tv_sec, ts_diff.tv_nsec); + + /* Enable dirty logging */ + clock_gettime(CLOCK_MONOTONIC, &start); + vm_mem_region_set_flags(vm, TEST_MEM_SLOT_INDEX, + KVM_MEM_LOG_DIRTY_PAGES); + ts_diff = timespec_diff_now(start); + pr_info("Enabling dirty logging time: %ld.%.9lds\n\n", + ts_diff.tv_sec, ts_diff.tv_nsec); + + while (iteration < iterations) { + /* + * Incrementing the iteration number will start the vCPUs + * dirtying memory again. + */ + clock_gettime(CLOCK_MONOTONIC, &start); + iteration++; + + pr_debug("Starting iteration %lu\n", iteration); + for (vcpu_id = 0; vcpu_id < nr_vcpus; vcpu_id++) { + while (READ_ONCE(vcpu_last_completed_iteration[vcpu_id]) != iteration) + pr_debug("Waiting for vCPU %d vcpu_last_completed_iteration == %lu\n", + vcpu_id, iteration); + } + + ts_diff = timespec_diff_now(start); + vcpu_dirty_total = timespec_add(vcpu_dirty_total, ts_diff); + pr_info("Iteration %lu dirty memory time: %ld.%.9lds\n", + iteration, ts_diff.tv_sec, ts_diff.tv_nsec); + + clock_gettime(CLOCK_MONOTONIC, &start); + kvm_vm_get_dirty_log(vm, TEST_MEM_SLOT_INDEX, bmap); + + ts_diff = timespec_diff_now(start); + get_dirty_log_total = timespec_add(get_dirty_log_total, + ts_diff); + pr_info("Iteration %lu get dirty log time: %ld.%.9lds\n", + iteration, ts_diff.tv_sec, ts_diff.tv_nsec); + +#ifdef USE_CLEAR_DIRTY_LOG + clock_gettime(CLOCK_MONOTONIC, &start); + kvm_vm_clear_dirty_log(vm, TEST_MEM_SLOT_INDEX, bmap, 0, + host_num_pages); + + ts_diff = timespec_diff_now(start); + clear_dirty_log_total = timespec_add(clear_dirty_log_total, + ts_diff); + pr_info("Iteration %lu clear dirty log time: %ld.%.9lds\n", + iteration, ts_diff.tv_sec, ts_diff.tv_nsec); +#endif + } + + /* Tell the vcpu thread to quit */ + host_quit = true; + for (vcpu_id = 0; vcpu_id < nr_vcpus; vcpu_id++) + pthread_join(vcpu_threads[vcpu_id], NULL); + + /* Disable dirty logging */ + clock_gettime(CLOCK_MONOTONIC, &start); + vm_mem_region_set_flags(vm, TEST_MEM_SLOT_INDEX, 0); + ts_diff = timespec_diff_now(start); + pr_info("Disabling dirty logging time: %ld.%.9lds\n", + ts_diff.tv_sec, ts_diff.tv_nsec); + + avg = timespec_div(get_dirty_log_total, iterations); + pr_info("Get dirty log over %lu iterations took %ld.%.9lds. (Avg %ld.%.9lds/iteration)\n", + iterations, get_dirty_log_total.tv_sec, + get_dirty_log_total.tv_nsec, avg.tv_sec, avg.tv_nsec); + +#ifdef USE_CLEAR_DIRTY_LOG + avg = timespec_div(clear_dirty_log_total, iterations); + pr_info("Clear dirty log over %lu iterations took %ld.%.9lds. (Avg %ld.%.9lds/iteration)\n", + iterations, clear_dirty_log_total.tv_sec, + clear_dirty_log_total.tv_nsec, avg.tv_sec, avg.tv_nsec); +#endif + + free(bmap); + free(vcpu_threads); + ucall_uninit(vm); + kvm_vm_free(vm); +} + +struct guest_mode { + bool supported; + bool enabled; +}; +static struct guest_mode guest_modes[NUM_VM_MODES]; + +#define guest_mode_init(mode, supported, enabled) ({ \ + guest_modes[mode] = (struct guest_mode){ supported, enabled }; \ +}) + +static void help(char *name) +{ + int i; + + puts(""); + printf("usage: %s [-h] [-i iterations] [-p offset] " + "[-m mode] [-b vcpu bytes] [-v vcpus]\n", name); + puts(""); + printf(" -i: specify iteration counts (default: %"PRIu64")\n", + TEST_HOST_LOOP_N); + printf(" -p: specify guest physical test memory offset\n" + " Warning: a low offset can conflict with the loaded test code.\n"); + printf(" -m: specify the guest mode ID to test " + "(default: test all supported modes)\n" + " This option may be used multiple times.\n" + " Guest mode IDs:\n"); + for (i = 0; i < NUM_VM_MODES; ++i) { + printf(" %d: %s%s\n", i, vm_guest_mode_string(i), + guest_modes[i].supported ? " (supported)" : ""); + } + printf(" -b: specify the size of the memory region which should be\n" + " dirtied by each vCPU. e.g. 10M or 3G.\n" + " (default: 1G)\n"); + printf(" -f: specify the fraction of pages which should be written to\n" + " as opposed to simply read, in the form\n" + " 1/<fraction of pages to write>.\n" + " (default: 1 i.e. all pages are written to.)\n"); + printf(" -v: specify the number of vCPUs to run.\n"); + puts(""); + exit(0); +} + +int main(int argc, char *argv[]) +{ + unsigned long iterations = TEST_HOST_LOOP_N; + bool mode_selected = false; + uint64_t phys_offset = 0; + unsigned int mode; + int opt, i; + int wr_fract = 1; + +#ifdef USE_CLEAR_DIRTY_LOG + dirty_log_manual_caps = + kvm_check_cap(KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2); + if (!dirty_log_manual_caps) { + print_skip("KVM_CLEAR_DIRTY_LOG not available"); + exit(KSFT_SKIP); + } + dirty_log_manual_caps &= (KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE | + KVM_DIRTY_LOG_INITIALLY_SET); +#endif + +#ifdef __x86_64__ + guest_mode_init(VM_MODE_PXXV48_4K, true, true); +#endif +#ifdef __aarch64__ + guest_mode_init(VM_MODE_P40V48_4K, true, true); + guest_mode_init(VM_MODE_P40V48_64K, true, true); + + { + unsigned int limit = kvm_check_cap(KVM_CAP_ARM_VM_IPA_SIZE); + + if (limit >= 52) + guest_mode_init(VM_MODE_P52V48_64K, true, true); + if (limit >= 48) { + guest_mode_init(VM_MODE_P48V48_4K, true, true); + guest_mode_init(VM_MODE_P48V48_64K, true, true); + } + } +#endif +#ifdef __s390x__ + guest_mode_init(VM_MODE_P40V48_4K, true, true); +#endif + + while ((opt = getopt(argc, argv, "hi:p:m:b:f:v:")) != -1) { + switch (opt) { + case 'i': + iterations = strtol(optarg, NULL, 10); + break; + case 'p': + phys_offset = strtoull(optarg, NULL, 0); + break; + case 'm': + if (!mode_selected) { + for (i = 0; i < NUM_VM_MODES; ++i) + guest_modes[i].enabled = false; + mode_selected = true; + } + mode = strtoul(optarg, NULL, 10); + TEST_ASSERT(mode < NUM_VM_MODES, + "Guest mode ID %d too big", mode); + guest_modes[mode].enabled = true; + break; + case 'b': + guest_percpu_mem_size = parse_size(optarg); + break; + case 'f': + wr_fract = atoi(optarg); + TEST_ASSERT(wr_fract >= 1, + "Write fraction cannot be less than one"); + break; + case 'v': + nr_vcpus = atoi(optarg); + TEST_ASSERT(nr_vcpus > 0, + "Must have a positive number of vCPUs"); + TEST_ASSERT(nr_vcpus <= MAX_VCPUS, + "This test does not currently support\n" + "more than %d vCPUs.", MAX_VCPUS); + break; + case 'h': + default: + help(argv[0]); + break; + } + } + + TEST_ASSERT(iterations >= 2, "The test should have at least two iterations"); + + pr_info("Test iterations: %"PRIu64"\n", iterations); + + for (i = 0; i < NUM_VM_MODES; ++i) { + if (!guest_modes[i].enabled) + continue; + TEST_ASSERT(guest_modes[i].supported, + "Guest mode ID %d (%s) not supported.", + i, vm_guest_mode_string(i)); + run_test(i, iterations, phys_offset, wr_fract); + } + + return 0; +} diff --git a/tools/testing/selftests/kvm/dirty_log_test.c b/tools/testing/selftests/kvm/dirty_log_test.c index 752ec158ac59..54da9cc20db4 100644 --- a/tools/testing/selftests/kvm/dirty_log_test.c +++ b/tools/testing/selftests/kvm/dirty_log_test.c @@ -128,6 +128,78 @@ static uint64_t host_dirty_count; static uint64_t host_clear_count; static uint64_t host_track_next_count; +enum log_mode_t { + /* Only use KVM_GET_DIRTY_LOG for logging */ + LOG_MODE_DIRTY_LOG = 0, + + /* Use both KVM_[GET|CLEAR]_DIRTY_LOG for logging */ + LOG_MODE_CLEAR_LOG = 1, + + LOG_MODE_NUM, + + /* Run all supported modes */ + LOG_MODE_ALL = LOG_MODE_NUM, +}; + +/* Mode of logging to test. Default is to run all supported modes */ +static enum log_mode_t host_log_mode_option = LOG_MODE_ALL; +/* Logging mode for current run */ +static enum log_mode_t host_log_mode; + +static bool clear_log_supported(void) +{ + return kvm_check_cap(KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2); +} + +static void clear_log_create_vm_done(struct kvm_vm *vm) +{ + struct kvm_enable_cap cap = {}; + u64 manual_caps; + + manual_caps = kvm_check_cap(KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2); + TEST_ASSERT(manual_caps, "MANUAL_CAPS is zero!"); + manual_caps &= (KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE | + KVM_DIRTY_LOG_INITIALLY_SET); + cap.cap = KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2; + cap.args[0] = manual_caps; + vm_enable_cap(vm, &cap); +} + +static void dirty_log_collect_dirty_pages(struct kvm_vm *vm, int slot, + void *bitmap, uint32_t num_pages) +{ + kvm_vm_get_dirty_log(vm, slot, bitmap); +} + +static void clear_log_collect_dirty_pages(struct kvm_vm *vm, int slot, + void *bitmap, uint32_t num_pages) +{ + kvm_vm_get_dirty_log(vm, slot, bitmap); + kvm_vm_clear_dirty_log(vm, slot, bitmap, 0, num_pages); +} + +struct log_mode { + const char *name; + /* Return true if this mode is supported, otherwise false */ + bool (*supported)(void); + /* Hook when the vm creation is done (before vcpu creation) */ + void (*create_vm_done)(struct kvm_vm *vm); + /* Hook to collect the dirty pages into the bitmap provided */ + void (*collect_dirty_pages) (struct kvm_vm *vm, int slot, + void *bitmap, uint32_t num_pages); +} log_modes[LOG_MODE_NUM] = { + { + .name = "dirty-log", + .collect_dirty_pages = dirty_log_collect_dirty_pages, + }, + { + .name = "clear-log", + .supported = clear_log_supported, + .create_vm_done = clear_log_create_vm_done, + .collect_dirty_pages = clear_log_collect_dirty_pages, + }, +}; + /* * We use this bitmap to track some pages that should have its dirty * bit set in the _next_ iteration. For example, if we detected the @@ -137,6 +209,44 @@ static uint64_t host_track_next_count; */ static unsigned long *host_bmap_track; +static void log_modes_dump(void) +{ + int i; + + printf("all"); + for (i = 0; i < LOG_MODE_NUM; i++) + printf(", %s", log_modes[i].name); + printf("\n"); +} + +static bool log_mode_supported(void) +{ + struct log_mode *mode = &log_modes[host_log_mode]; + + if (mode->supported) + return mode->supported(); + + return true; +} + +static void log_mode_create_vm_done(struct kvm_vm *vm) +{ + struct log_mode *mode = &log_modes[host_log_mode]; + + if (mode->create_vm_done) + mode->create_vm_done(vm); +} + +static void log_mode_collect_dirty_pages(struct kvm_vm *vm, int slot, + void *bitmap, uint32_t num_pages) +{ + struct log_mode *mode = &log_modes[host_log_mode]; + + TEST_ASSERT(mode->collect_dirty_pages != NULL, + "collect_dirty_pages() is required for any log mode!"); + mode->collect_dirty_pages(vm, slot, bitmap, num_pages); +} + static void generate_random_array(uint64_t *guest_array, uint64_t size) { uint64_t i; @@ -195,7 +305,7 @@ static void vm_dirty_log_verify(enum vm_guest_mode mode, unsigned long *bmap) page); } - if (test_bit_le(page, bmap)) { + if (test_and_clear_bit_le(page, bmap)) { host_dirty_count++; /* * If the bit is set, the value written onto @@ -252,11 +362,12 @@ static struct kvm_vm *create_vm(enum vm_guest_mode mode, uint32_t vcpuid, pr_info("Testing guest mode: %s\n", vm_guest_mode_string(mode)); - vm = _vm_create(mode, DEFAULT_GUEST_PHY_PAGES + extra_pg_pages, O_RDWR); + vm = vm_create(mode, DEFAULT_GUEST_PHY_PAGES + extra_pg_pages, O_RDWR); kvm_vm_elf_load(vm, program_invocation_name, 0, 0); #ifdef __x86_64__ vm_create_irqchip(vm); #endif + log_mode_create_vm_done(vm); vm_vcpu_add_default(vm, vcpuid, guest_code); return vm; } @@ -264,10 +375,6 @@ static struct kvm_vm *create_vm(enum vm_guest_mode mode, uint32_t vcpuid, #define DIRTY_MEM_BITS 30 /* 1G */ #define PAGE_SHIFT_4K 12 -#ifdef USE_CLEAR_DIRTY_LOG -static u64 dirty_log_manual_caps; -#endif - static void run_test(enum vm_guest_mode mode, unsigned long iterations, unsigned long interval, uint64_t phys_offset) { @@ -275,6 +382,12 @@ static void run_test(enum vm_guest_mode mode, unsigned long iterations, struct kvm_vm *vm; unsigned long *bmap; + if (!log_mode_supported()) { + print_skip("Log mode '%s' not supported", + log_modes[host_log_mode].name); + return; + } + /* * We reserve page table for 2 times of extra dirty mem which * will definitely cover the original (1G+) test range. Here @@ -317,14 +430,6 @@ static void run_test(enum vm_guest_mode mode, unsigned long iterations, bmap = bitmap_alloc(host_num_pages); host_bmap_track = bitmap_alloc(host_num_pages); -#ifdef USE_CLEAR_DIRTY_LOG - struct kvm_enable_cap cap = {}; - - cap.cap = KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2; - cap.args[0] = dirty_log_manual_caps; - vm_enable_cap(vm, &cap); -#endif - /* Add an extra memory slot for testing dirty logging */ vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS, guest_test_phys_mem, @@ -362,11 +467,8 @@ static void run_test(enum vm_guest_mode mode, unsigned long iterations, while (iteration < iterations) { /* Give the vcpu thread some time to dirty some pages */ usleep(interval * 1000); - kvm_vm_get_dirty_log(vm, TEST_MEM_SLOT_INDEX, bmap); -#ifdef USE_CLEAR_DIRTY_LOG - kvm_vm_clear_dirty_log(vm, TEST_MEM_SLOT_INDEX, bmap, 0, - host_num_pages); -#endif + log_mode_collect_dirty_pages(vm, TEST_MEM_SLOT_INDEX, + bmap, host_num_pages); vm_dirty_log_verify(mode, bmap); iteration++; sync_global_to_guest(vm, iteration); @@ -410,6 +512,9 @@ static void help(char *name) TEST_HOST_LOOP_INTERVAL); printf(" -p: specify guest physical test memory offset\n" " Warning: a low offset can conflict with the loaded test code.\n"); + printf(" -M: specify the host logging mode " + "(default: run all log modes). Supported modes: \n\t"); + log_modes_dump(); printf(" -m: specify the guest mode ID to test " "(default: test all supported modes)\n" " This option may be used multiple times.\n" @@ -429,18 +534,7 @@ int main(int argc, char *argv[]) bool mode_selected = false; uint64_t phys_offset = 0; unsigned int mode; - int opt, i; - -#ifdef USE_CLEAR_DIRTY_LOG - dirty_log_manual_caps = - kvm_check_cap(KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2); - if (!dirty_log_manual_caps) { - print_skip("KVM_CLEAR_DIRTY_LOG not available"); - exit(KSFT_SKIP); - } - dirty_log_manual_caps &= (KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE | - KVM_DIRTY_LOG_INITIALLY_SET); -#endif + int opt, i, j; #ifdef __x86_64__ guest_mode_init(VM_MODE_PXXV48_4K, true, true); @@ -464,7 +558,7 @@ int main(int argc, char *argv[]) guest_mode_init(VM_MODE_P40V48_4K, true, true); #endif - while ((opt = getopt(argc, argv, "hi:I:p:m:")) != -1) { + while ((opt = getopt(argc, argv, "hi:I:p:m:M:")) != -1) { switch (opt) { case 'i': iterations = strtol(optarg, NULL, 10); @@ -486,6 +580,26 @@ int main(int argc, char *argv[]) "Guest mode ID %d too big", mode); guest_modes[mode].enabled = true; break; + case 'M': + if (!strcmp(optarg, "all")) { + host_log_mode_option = LOG_MODE_ALL; + break; + } + for (i = 0; i < LOG_MODE_NUM; i++) { + if (!strcmp(optarg, log_modes[i].name)) { + pr_info("Setting log mode to: '%s'\n", + optarg); + host_log_mode_option = i; + break; + } + } + if (i == LOG_MODE_NUM) { + printf("Log mode '%s' invalid. Please choose " + "from: ", optarg); + log_modes_dump(); + exit(1); + } + break; case 'h': default: help(argv[0]); @@ -507,7 +621,18 @@ int main(int argc, char *argv[]) TEST_ASSERT(guest_modes[i].supported, "Guest mode ID %d (%s) not supported.", i, vm_guest_mode_string(i)); - run_test(i, iterations, interval, phys_offset); + if (host_log_mode_option == LOG_MODE_ALL) { + /* Run each log mode */ + for (j = 0; j < LOG_MODE_NUM; j++) { + pr_info("Testing Log Mode '%s'\n", + log_modes[j].name); + host_log_mode = j; + run_test(i, iterations, interval, phys_offset); + } + } else { + host_log_mode = host_log_mode_option; + run_test(i, iterations, interval, phys_offset); + } } return 0; diff --git a/tools/testing/selftests/kvm/include/kvm_util.h b/tools/testing/selftests/kvm/include/kvm_util.h index 919e161dd289..7d29aa786959 100644 --- a/tools/testing/selftests/kvm/include/kvm_util.h +++ b/tools/testing/selftests/kvm/include/kvm_util.h @@ -63,9 +63,11 @@ enum vm_mem_backing_src_type { int kvm_check_cap(long cap); int vm_enable_cap(struct kvm_vm *vm, struct kvm_enable_cap *cap); +int vcpu_enable_cap(struct kvm_vm *vm, uint32_t vcpu_id, + struct kvm_enable_cap *cap); +void vm_enable_dirty_ring(struct kvm_vm *vm, uint32_t ring_size); struct kvm_vm *vm_create(enum vm_guest_mode mode, uint64_t phy_pages, int perm); -struct kvm_vm *_vm_create(enum vm_guest_mode mode, uint64_t phy_pages, int perm); void kvm_vm_free(struct kvm_vm *vmp); void kvm_vm_restart(struct kvm_vm *vmp, int perm); void kvm_vm_release(struct kvm_vm *vmp); @@ -149,6 +151,7 @@ void vcpu_set_guest_debug(struct kvm_vm *vm, uint32_t vcpuid, struct kvm_guest_debug *debug); void vcpu_set_mp_state(struct kvm_vm *vm, uint32_t vcpuid, struct kvm_mp_state *mp_state); +struct kvm_reg_list *vcpu_get_reg_list(struct kvm_vm *vm, uint32_t vcpuid); void vcpu_regs_get(struct kvm_vm *vm, uint32_t vcpuid, struct kvm_regs *regs); void vcpu_regs_set(struct kvm_vm *vm, uint32_t vcpuid, struct kvm_regs *regs); @@ -294,6 +297,8 @@ int vm_create_device(struct kvm_vm *vm, struct kvm_create_device *cd); memcpy(&(g), _p, sizeof(g)); \ }) +void assert_on_unhandled_exception(struct kvm_vm *vm, uint32_t vcpuid); + /* Common ucalls */ enum { UCALL_NONE, diff --git a/tools/testing/selftests/kvm/include/perf_test_util.h b/tools/testing/selftests/kvm/include/perf_test_util.h new file mode 100644 index 000000000000..2618052057b1 --- /dev/null +++ b/tools/testing/selftests/kvm/include/perf_test_util.h @@ -0,0 +1,198 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * tools/testing/selftests/kvm/include/perf_test_util.h + * + * Copyright (C) 2020, Google LLC. + */ + +#ifndef SELFTEST_KVM_PERF_TEST_UTIL_H +#define SELFTEST_KVM_PERF_TEST_UTIL_H + +#include "kvm_util.h" +#include "processor.h" + +#define MAX_VCPUS 512 + +#define PAGE_SHIFT_4K 12 +#define PTES_PER_4K_PT 512 + +#define TEST_MEM_SLOT_INDEX 1 + +/* Default guest test virtual memory offset */ +#define DEFAULT_GUEST_TEST_MEM 0xc0000000 + +#define DEFAULT_PER_VCPU_MEM_SIZE (1 << 30) /* 1G */ + +/* + * Guest physical memory offset of the testing memory slot. + * This will be set to the topmost valid physical address minus + * the test memory size. + */ +static uint64_t guest_test_phys_mem; + +/* + * Guest virtual memory offset of the testing memory slot. + * Must not conflict with identity mapped test code. + */ +static uint64_t guest_test_virt_mem = DEFAULT_GUEST_TEST_MEM; +static uint64_t guest_percpu_mem_size = DEFAULT_PER_VCPU_MEM_SIZE; + +/* Number of VCPUs for the test */ +static int nr_vcpus = 1; + +struct vcpu_args { + uint64_t gva; + uint64_t pages; + + /* Only used by the host userspace part of the vCPU thread */ + int vcpu_id; +}; + +struct perf_test_args { + struct kvm_vm *vm; + uint64_t host_page_size; + uint64_t guest_page_size; + int wr_fract; + + struct vcpu_args vcpu_args[MAX_VCPUS]; +}; + +static struct perf_test_args perf_test_args; + +/* + * Continuously write to the first 8 bytes of each page in the + * specified region. + */ +static void guest_code(uint32_t vcpu_id) +{ + struct vcpu_args *vcpu_args = &perf_test_args.vcpu_args[vcpu_id]; + uint64_t gva; + uint64_t pages; + int i; + + /* Make sure vCPU args data structure is not corrupt. */ + GUEST_ASSERT(vcpu_args->vcpu_id == vcpu_id); + + gva = vcpu_args->gva; + pages = vcpu_args->pages; + + while (true) { + for (i = 0; i < pages; i++) { + uint64_t addr = gva + (i * perf_test_args.guest_page_size); + + if (i % perf_test_args.wr_fract == 0) + *(uint64_t *)addr = 0x0123456789ABCDEF; + else + READ_ONCE(*(uint64_t *)addr); + } + + GUEST_SYNC(1); + } +} + +static struct kvm_vm *create_vm(enum vm_guest_mode mode, int vcpus, + uint64_t vcpu_memory_bytes) +{ + struct kvm_vm *vm; + uint64_t pages = DEFAULT_GUEST_PHY_PAGES; + uint64_t guest_num_pages; + + /* Account for a few pages per-vCPU for stacks */ + pages += DEFAULT_STACK_PGS * vcpus; + + /* + * Reserve twice the ammount of memory needed to map the test region and + * the page table / stacks region, at 4k, for page tables. Do the + * calculation with 4K page size: the smallest of all archs. (e.g., 64K + * page size guest will need even less memory for page tables). + */ + pages += (2 * pages) / PTES_PER_4K_PT; + pages += ((2 * vcpus * vcpu_memory_bytes) >> PAGE_SHIFT_4K) / + PTES_PER_4K_PT; + pages = vm_adjust_num_guest_pages(mode, pages); + + pr_info("Testing guest mode: %s\n", vm_guest_mode_string(mode)); + + vm = vm_create(mode, pages, O_RDWR); + kvm_vm_elf_load(vm, program_invocation_name, 0, 0); +#ifdef __x86_64__ + vm_create_irqchip(vm); +#endif + + perf_test_args.vm = vm; + perf_test_args.guest_page_size = vm_get_page_size(vm); + perf_test_args.host_page_size = getpagesize(); + + TEST_ASSERT(vcpu_memory_bytes % perf_test_args.guest_page_size == 0, + "Guest memory size is not guest page size aligned."); + + guest_num_pages = (vcpus * vcpu_memory_bytes) / + perf_test_args.guest_page_size; + guest_num_pages = vm_adjust_num_guest_pages(mode, guest_num_pages); + + /* + * If there should be more memory in the guest test region than there + * can be pages in the guest, it will definitely cause problems. + */ + TEST_ASSERT(guest_num_pages < vm_get_max_gfn(vm), + "Requested more guest memory than address space allows.\n" + " guest pages: %lx max gfn: %x vcpus: %d wss: %lx]\n", + guest_num_pages, vm_get_max_gfn(vm), vcpus, + vcpu_memory_bytes); + + TEST_ASSERT(vcpu_memory_bytes % perf_test_args.host_page_size == 0, + "Guest memory size is not host page size aligned."); + + guest_test_phys_mem = (vm_get_max_gfn(vm) - guest_num_pages) * + perf_test_args.guest_page_size; + guest_test_phys_mem &= ~(perf_test_args.host_page_size - 1); + +#ifdef __s390x__ + /* Align to 1M (segment size) */ + guest_test_phys_mem &= ~((1 << 20) - 1); +#endif + + pr_info("guest physical test memory offset: 0x%lx\n", guest_test_phys_mem); + + /* Add an extra memory slot for testing */ + vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS, + guest_test_phys_mem, + TEST_MEM_SLOT_INDEX, + guest_num_pages, 0); + + /* Do mapping for the demand paging memory slot */ + virt_map(vm, guest_test_virt_mem, guest_test_phys_mem, guest_num_pages, 0); + + ucall_init(vm, NULL); + + return vm; +} + +static void add_vcpus(struct kvm_vm *vm, int vcpus, uint64_t vcpu_memory_bytes) +{ + vm_paddr_t vcpu_gpa; + struct vcpu_args *vcpu_args; + int vcpu_id; + + for (vcpu_id = 0; vcpu_id < vcpus; vcpu_id++) { + vcpu_args = &perf_test_args.vcpu_args[vcpu_id]; + + vm_vcpu_add_default(vm, vcpu_id, guest_code); + +#ifdef __x86_64__ + vcpu_set_cpuid(vm, vcpu_id, kvm_get_supported_cpuid()); +#endif + + vcpu_args->vcpu_id = vcpu_id; + vcpu_args->gva = guest_test_virt_mem + + (vcpu_id * vcpu_memory_bytes); + vcpu_args->pages = vcpu_memory_bytes / + perf_test_args.guest_page_size; + + vcpu_gpa = guest_test_phys_mem + (vcpu_id * vcpu_memory_bytes); + pr_debug("Added VCPU %d with test mem gpa [%lx, %lx)\n", + vcpu_id, vcpu_gpa, vcpu_gpa + vcpu_memory_bytes); + } +} + +#endif /* SELFTEST_KVM_PERF_TEST_UTIL_H */ diff --git a/tools/testing/selftests/kvm/include/test_util.h b/tools/testing/selftests/kvm/include/test_util.h index 5eb01bf51b86..ffffa560436b 100644 --- a/tools/testing/selftests/kvm/include/test_util.h +++ b/tools/testing/selftests/kvm/include/test_util.h @@ -64,5 +64,7 @@ int64_t timespec_to_ns(struct timespec ts); struct timespec timespec_add_ns(struct timespec ts, int64_t ns); struct timespec timespec_add(struct timespec ts1, struct timespec ts2); struct timespec timespec_sub(struct timespec ts1, struct timespec ts2); +struct timespec timespec_diff_now(struct timespec start); +struct timespec timespec_div(struct timespec ts, int divisor); #endif /* SELFTEST_KVM_TEST_UTIL_H */ diff --git a/tools/testing/selftests/kvm/include/x86_64/processor.h b/tools/testing/selftests/kvm/include/x86_64/processor.h index 82b7fe16a824..8e61340b3911 100644 --- a/tools/testing/selftests/kvm/include/x86_64/processor.h +++ b/tools/testing/selftests/kvm/include/x86_64/processor.h @@ -36,6 +36,8 @@ #define X86_CR4_SMAP (1ul << 21) #define X86_CR4_PKE (1ul << 22) +#define UNEXPECTED_VECTOR_PORT 0xfff0u + /* General Registers in 64-Bit Mode */ struct gpr64_regs { u64 rax; @@ -59,7 +61,7 @@ struct gpr64_regs { struct desc64 { uint16_t limit0; uint16_t base0; - unsigned base1:8, s:1, type:4, dpl:2, p:1; + unsigned base1:8, type:4, s:1, dpl:2, p:1; unsigned limit1:4, avl:1, l:1, db:1, g:1, base2:8; uint32_t base3; uint32_t zero1; @@ -239,6 +241,11 @@ static inline struct desc_ptr get_idt(void) return idt; } +static inline void outl(uint16_t port, uint32_t value) +{ + __asm__ __volatile__("outl %%eax, %%dx" : : "d"(port), "a"(value)); +} + #define SET_XMM(__var, __xmm) \ asm volatile("movq %0, %%"#__xmm : : "r"(__var) : #__xmm) @@ -338,6 +345,35 @@ uint32_t kvm_get_cpuid_max_basic(void); uint32_t kvm_get_cpuid_max_extended(void); void kvm_get_cpu_address_width(unsigned int *pa_bits, unsigned int *va_bits); +struct ex_regs { + uint64_t rax, rcx, rdx, rbx; + uint64_t rbp, rsi, rdi; + uint64_t r8, r9, r10, r11; + uint64_t r12, r13, r14, r15; + uint64_t vector; + uint64_t error_code; + uint64_t rip; + uint64_t cs; + uint64_t rflags; +}; + +void vm_init_descriptor_tables(struct kvm_vm *vm); +void vcpu_init_descriptor_tables(struct kvm_vm *vm, uint32_t vcpuid); +void vm_handle_exception(struct kvm_vm *vm, int vector, + void (*handler)(struct ex_regs *)); + +/* + * set_cpuid() - overwrites a matching cpuid entry with the provided value. + * matches based on ent->function && ent->index. returns true + * if a match was found and successfully overwritten. + * @cpuid: the kvm cpuid list to modify. + * @ent: cpuid entry to insert + */ +bool set_cpuid(struct kvm_cpuid2 *cpuid, struct kvm_cpuid_entry2 *ent); + +uint64_t kvm_hypercall(uint64_t nr, uint64_t a0, uint64_t a1, uint64_t a2, + uint64_t a3); + /* * Basic CPU control in CR0 */ diff --git a/tools/testing/selftests/kvm/lib/aarch64/processor.c b/tools/testing/selftests/kvm/lib/aarch64/processor.c index 2afa6618b396..d6c32c328e9a 100644 --- a/tools/testing/selftests/kvm/lib/aarch64/processor.c +++ b/tools/testing/selftests/kvm/lib/aarch64/processor.c @@ -350,3 +350,7 @@ void vcpu_args_set(struct kvm_vm *vm, uint32_t vcpuid, unsigned int num, ...) va_end(ap); } + +void assert_on_unhandled_exception(struct kvm_vm *vm, uint32_t vcpuid) +{ +} diff --git a/tools/testing/selftests/kvm/lib/aarch64/ucall.c b/tools/testing/selftests/kvm/lib/aarch64/ucall.c index c8e0ec20d3bf..2f37b90ee1a9 100644 --- a/tools/testing/selftests/kvm/lib/aarch64/ucall.c +++ b/tools/testing/selftests/kvm/lib/aarch64/ucall.c @@ -94,6 +94,9 @@ uint64_t get_ucall(struct kvm_vm *vm, uint32_t vcpu_id, struct ucall *uc) struct kvm_run *run = vcpu_state(vm, vcpu_id); struct ucall ucall = {}; + if (uc) + memset(uc, 0, sizeof(*uc)); + if (run->exit_reason == KVM_EXIT_MMIO && run->mmio.phys_addr == (uint64_t)ucall_exit_mmio_addr) { vm_vaddr_t gva; diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c index 3327cebc1095..126c6727a6b0 100644 --- a/tools/testing/selftests/kvm/lib/kvm_util.c +++ b/tools/testing/selftests/kvm/lib/kvm_util.c @@ -86,6 +86,34 @@ int vm_enable_cap(struct kvm_vm *vm, struct kvm_enable_cap *cap) return ret; } +/* VCPU Enable Capability + * + * Input Args: + * vm - Virtual Machine + * vcpu_id - VCPU + * cap - Capability + * + * Output Args: None + * + * Return: On success, 0. On failure a TEST_ASSERT failure is produced. + * + * Enables a capability (KVM_CAP_*) on the VCPU. + */ +int vcpu_enable_cap(struct kvm_vm *vm, uint32_t vcpu_id, + struct kvm_enable_cap *cap) +{ + struct vcpu *vcpu = vcpu_find(vm, vcpu_id); + int r; + + TEST_ASSERT(vcpu, "cannot find vcpu %d", vcpu_id); + + r = ioctl(vcpu->fd, KVM_ENABLE_CAP, cap); + TEST_ASSERT(!r, "KVM_ENABLE_CAP vCPU ioctl failed,\n" + " rc: %i, errno: %i", r, errno); + + return r; +} + static void vm_open(struct kvm_vm *vm, int perm) { vm->kvm_fd = open(KVM_DEV_PATH, perm); @@ -152,7 +180,7 @@ _Static_assert(sizeof(vm_guest_mode_params)/sizeof(struct vm_guest_mode_params) * descriptor to control the created VM is created with the permissions * given by perm (e.g. O_RDWR). */ -struct kvm_vm *_vm_create(enum vm_guest_mode mode, uint64_t phy_pages, int perm) +struct kvm_vm *vm_create(enum vm_guest_mode mode, uint64_t phy_pages, int perm) { struct kvm_vm *vm; @@ -243,11 +271,6 @@ struct kvm_vm *_vm_create(enum vm_guest_mode mode, uint64_t phy_pages, int perm) return vm; } -struct kvm_vm *vm_create(enum vm_guest_mode mode, uint64_t phy_pages, int perm) -{ - return _vm_create(mode, phy_pages, perm); -} - /* * VM Restart * @@ -1204,6 +1227,9 @@ int _vcpu_run(struct kvm_vm *vm, uint32_t vcpuid) do { rc = ioctl(vcpu->fd, KVM_RUN, NULL); } while (rc == -1 && errno == EINTR); + + assert_on_unhandled_exception(vm, vcpuid); + return rc; } @@ -1261,6 +1287,35 @@ void vcpu_set_mp_state(struct kvm_vm *vm, uint32_t vcpuid, } /* + * VM VCPU Get Reg List + * + * Input Args: + * vm - Virtual Machine + * vcpuid - VCPU ID + * + * Output Args: + * None + * + * Return: + * A pointer to an allocated struct kvm_reg_list + * + * Get the list of guest registers which are supported for + * KVM_GET_ONE_REG/KVM_SET_ONE_REG calls + */ +struct kvm_reg_list *vcpu_get_reg_list(struct kvm_vm *vm, uint32_t vcpuid) +{ + struct kvm_reg_list reg_list_n = { .n = 0 }, *reg_list; + int ret; + + ret = _vcpu_ioctl(vm, vcpuid, KVM_GET_REG_LIST, ®_list_n); + TEST_ASSERT(ret == -1 && errno == E2BIG, "KVM_GET_REG_LIST n=0"); + reg_list = calloc(1, sizeof(*reg_list) + reg_list_n.n * sizeof(__u64)); + reg_list->n = reg_list_n.n; + vcpu_ioctl(vm, vcpuid, KVM_GET_REG_LIST, reg_list); + return reg_list; +} + +/* * VM VCPU Regs Get * * Input Args: diff --git a/tools/testing/selftests/kvm/lib/kvm_util_internal.h b/tools/testing/selftests/kvm/lib/kvm_util_internal.h index 2ef446520748..f07d383d03a1 100644 --- a/tools/testing/selftests/kvm/lib/kvm_util_internal.h +++ b/tools/testing/selftests/kvm/lib/kvm_util_internal.h @@ -50,6 +50,8 @@ struct kvm_vm { vm_paddr_t pgd; vm_vaddr_t gdt; vm_vaddr_t tss; + vm_vaddr_t idt; + vm_vaddr_t handlers; }; struct vcpu *vcpu_find(struct kvm_vm *vm, uint32_t vcpuid); diff --git a/tools/testing/selftests/kvm/lib/s390x/processor.c b/tools/testing/selftests/kvm/lib/s390x/processor.c index a88c5d665725..7349bb2e1a24 100644 --- a/tools/testing/selftests/kvm/lib/s390x/processor.c +++ b/tools/testing/selftests/kvm/lib/s390x/processor.c @@ -241,3 +241,7 @@ void vcpu_dump(FILE *stream, struct kvm_vm *vm, uint32_t vcpuid, uint8_t indent) fprintf(stream, "%*spstate: psw: 0x%.16llx:0x%.16llx\n", indent, "", vcpu->state->psw_mask, vcpu->state->psw_addr); } + +void assert_on_unhandled_exception(struct kvm_vm *vm, uint32_t vcpuid) +{ +} diff --git a/tools/testing/selftests/kvm/lib/s390x/ucall.c b/tools/testing/selftests/kvm/lib/s390x/ucall.c index fd589dc9bfab..9d3b0f15249a 100644 --- a/tools/testing/selftests/kvm/lib/s390x/ucall.c +++ b/tools/testing/selftests/kvm/lib/s390x/ucall.c @@ -38,6 +38,9 @@ uint64_t get_ucall(struct kvm_vm *vm, uint32_t vcpu_id, struct ucall *uc) struct kvm_run *run = vcpu_state(vm, vcpu_id); struct ucall ucall = {}; + if (uc) + memset(uc, 0, sizeof(*uc)); + if (run->exit_reason == KVM_EXIT_S390_SIEIC && run->s390_sieic.icptcode == 4 && (run->s390_sieic.ipa >> 8) == 0x83 && /* 0x83 means DIAGNOSE */ diff --git a/tools/testing/selftests/kvm/lib/test_util.c b/tools/testing/selftests/kvm/lib/test_util.c index 689e97c27ee2..8e04c0b1608e 100644 --- a/tools/testing/selftests/kvm/lib/test_util.c +++ b/tools/testing/selftests/kvm/lib/test_util.c @@ -4,10 +4,13 @@ * * Copyright (C) 2020, Google LLC. */ -#include <stdlib.h> + +#include <assert.h> #include <ctype.h> #include <limits.h> -#include <assert.h> +#include <stdlib.h> +#include <time.h> + #include "test_util.h" /* @@ -81,6 +84,21 @@ struct timespec timespec_sub(struct timespec ts1, struct timespec ts2) return timespec_add_ns((struct timespec){0}, ns1 - ns2); } +struct timespec timespec_diff_now(struct timespec start) +{ + struct timespec end; + + clock_gettime(CLOCK_MONOTONIC, &end); + return timespec_sub(end, start); +} + +struct timespec timespec_div(struct timespec ts, int divisor) +{ + int64_t ns = timespec_to_ns(ts) / divisor; + + return timespec_add_ns((struct timespec){0}, ns); +} + void print_skip(const char *fmt, ...) { va_list ap; diff --git a/tools/testing/selftests/kvm/lib/x86_64/handlers.S b/tools/testing/selftests/kvm/lib/x86_64/handlers.S new file mode 100644 index 000000000000..aaf7bc7d2ce1 --- /dev/null +++ b/tools/testing/selftests/kvm/lib/x86_64/handlers.S @@ -0,0 +1,81 @@ +handle_exception: + push %r15 + push %r14 + push %r13 + push %r12 + push %r11 + push %r10 + push %r9 + push %r8 + + push %rdi + push %rsi + push %rbp + push %rbx + push %rdx + push %rcx + push %rax + mov %rsp, %rdi + + call route_exception + + pop %rax + pop %rcx + pop %rdx + pop %rbx + pop %rbp + pop %rsi + pop %rdi + pop %r8 + pop %r9 + pop %r10 + pop %r11 + pop %r12 + pop %r13 + pop %r14 + pop %r15 + + /* Discard vector and error code. */ + add $16, %rsp + iretq + +/* + * Build the handle_exception wrappers which push the vector/error code on the + * stack and an array of pointers to those wrappers. + */ +.pushsection .rodata +.globl idt_handlers +idt_handlers: +.popsection + +.macro HANDLERS has_error from to + vector = \from + .rept \to - \from + 1 + .align 8 + + /* Fetch current address and append it to idt_handlers. */ + current_handler = . +.pushsection .rodata +.quad current_handler +.popsection + + .if ! \has_error + pushq $0 + .endif + pushq $vector + jmp handle_exception + vector = vector + 1 + .endr +.endm + +.global idt_handler_code +idt_handler_code: + HANDLERS has_error=0 from=0 to=7 + HANDLERS has_error=1 from=8 to=8 + HANDLERS has_error=0 from=9 to=9 + HANDLERS has_error=1 from=10 to=14 + HANDLERS has_error=0 from=15 to=16 + HANDLERS has_error=1 from=17 to=17 + HANDLERS has_error=0 from=18 to=255 + +.section .note.GNU-stack, "", %progbits diff --git a/tools/testing/selftests/kvm/lib/x86_64/processor.c b/tools/testing/selftests/kvm/lib/x86_64/processor.c index f6eb34eaa0d2..d10c5c05bdf0 100644 --- a/tools/testing/selftests/kvm/lib/x86_64/processor.c +++ b/tools/testing/selftests/kvm/lib/x86_64/processor.c @@ -12,9 +12,18 @@ #include "../kvm_util_internal.h" #include "processor.h" +#ifndef NUM_INTERRUPTS +#define NUM_INTERRUPTS 256 +#endif + +#define DEFAULT_CODE_SELECTOR 0x8 +#define DEFAULT_DATA_SELECTOR 0x10 + /* Minimum physical address used for virtual translation tables. */ #define KVM_GUEST_PAGE_TABLE_MIN_PADDR 0x180000 +vm_vaddr_t exception_handlers; + /* Virtual translation table structure declarations */ struct pageMapL4Entry { uint64_t present:1; @@ -392,11 +401,12 @@ static void kvm_seg_fill_gdt_64bit(struct kvm_vm *vm, struct kvm_segment *segp) desc->limit0 = segp->limit & 0xFFFF; desc->base0 = segp->base & 0xFFFF; desc->base1 = segp->base >> 16; - desc->s = segp->s; desc->type = segp->type; + desc->s = segp->s; desc->dpl = segp->dpl; desc->p = segp->present; desc->limit1 = segp->limit >> 16; + desc->avl = segp->avl; desc->l = segp->l; desc->db = segp->db; desc->g = segp->g; @@ -556,9 +566,9 @@ static void vcpu_setup(struct kvm_vm *vm, int vcpuid, int pgd_memslot, int gdt_m sregs.efer |= (EFER_LME | EFER_LMA | EFER_NX); kvm_seg_set_unusable(&sregs.ldt); - kvm_seg_set_kernel_code_64bit(vm, 0x8, &sregs.cs); - kvm_seg_set_kernel_data_64bit(vm, 0x10, &sregs.ds); - kvm_seg_set_kernel_data_64bit(vm, 0x10, &sregs.es); + kvm_seg_set_kernel_code_64bit(vm, DEFAULT_CODE_SELECTOR, &sregs.cs); + kvm_seg_set_kernel_data_64bit(vm, DEFAULT_DATA_SELECTOR, &sregs.ds); + kvm_seg_set_kernel_data_64bit(vm, DEFAULT_DATA_SELECTOR, &sregs.es); kvm_setup_tss_64bit(vm, &sregs.tr, 0x18, gdt_memslot, pgd_memslot); break; @@ -1118,3 +1128,131 @@ void kvm_get_cpu_address_width(unsigned int *pa_bits, unsigned int *va_bits) *va_bits = (entry->eax >> 8) & 0xff; } } + +struct idt_entry { + uint16_t offset0; + uint16_t selector; + uint16_t ist : 3; + uint16_t : 5; + uint16_t type : 4; + uint16_t : 1; + uint16_t dpl : 2; + uint16_t p : 1; + uint16_t offset1; + uint32_t offset2; uint32_t reserved; +}; + +static void set_idt_entry(struct kvm_vm *vm, int vector, unsigned long addr, + int dpl, unsigned short selector) +{ + struct idt_entry *base = + (struct idt_entry *)addr_gva2hva(vm, vm->idt); + struct idt_entry *e = &base[vector]; + + memset(e, 0, sizeof(*e)); + e->offset0 = addr; + e->selector = selector; + e->ist = 0; + e->type = 14; + e->dpl = dpl; + e->p = 1; + e->offset1 = addr >> 16; + e->offset2 = addr >> 32; +} + +void kvm_exit_unexpected_vector(uint32_t value) +{ + outl(UNEXPECTED_VECTOR_PORT, value); +} + +void route_exception(struct ex_regs *regs) +{ + typedef void(*handler)(struct ex_regs *); + handler *handlers = (handler *)exception_handlers; + + if (handlers && handlers[regs->vector]) { + handlers[regs->vector](regs); + return; + } + + kvm_exit_unexpected_vector(regs->vector); +} + +void vm_init_descriptor_tables(struct kvm_vm *vm) +{ + extern void *idt_handlers; + int i; + + vm->idt = vm_vaddr_alloc(vm, getpagesize(), 0x2000, 0, 0); + vm->handlers = vm_vaddr_alloc(vm, 256 * sizeof(void *), 0x2000, 0, 0); + /* Handlers have the same address in both address spaces.*/ + for (i = 0; i < NUM_INTERRUPTS; i++) + set_idt_entry(vm, i, (unsigned long)(&idt_handlers)[i], 0, + DEFAULT_CODE_SELECTOR); +} + +void vcpu_init_descriptor_tables(struct kvm_vm *vm, uint32_t vcpuid) +{ + struct kvm_sregs sregs; + + vcpu_sregs_get(vm, vcpuid, &sregs); + sregs.idt.base = vm->idt; + sregs.idt.limit = NUM_INTERRUPTS * sizeof(struct idt_entry) - 1; + sregs.gdt.base = vm->gdt; + sregs.gdt.limit = getpagesize() - 1; + kvm_seg_set_kernel_data_64bit(NULL, DEFAULT_DATA_SELECTOR, &sregs.gs); + vcpu_sregs_set(vm, vcpuid, &sregs); + *(vm_vaddr_t *)addr_gva2hva(vm, (vm_vaddr_t)(&exception_handlers)) = vm->handlers; +} + +void vm_handle_exception(struct kvm_vm *vm, int vector, + void (*handler)(struct ex_regs *)) +{ + vm_vaddr_t *handlers = (vm_vaddr_t *)addr_gva2hva(vm, vm->handlers); + + handlers[vector] = (vm_vaddr_t)handler; +} + +void assert_on_unhandled_exception(struct kvm_vm *vm, uint32_t vcpuid) +{ + if (vcpu_state(vm, vcpuid)->exit_reason == KVM_EXIT_IO + && vcpu_state(vm, vcpuid)->io.port == UNEXPECTED_VECTOR_PORT + && vcpu_state(vm, vcpuid)->io.size == 4) { + /* Grab pointer to io data */ + uint32_t *data = (void *)vcpu_state(vm, vcpuid) + + vcpu_state(vm, vcpuid)->io.data_offset; + + TEST_ASSERT(false, + "Unexpected vectored event in guest (vector:0x%x)", + *data); + } +} + +bool set_cpuid(struct kvm_cpuid2 *cpuid, + struct kvm_cpuid_entry2 *ent) +{ + int i; + + for (i = 0; i < cpuid->nent; i++) { + struct kvm_cpuid_entry2 *cur = &cpuid->entries[i]; + + if (cur->function != ent->function || cur->index != ent->index) + continue; + + memcpy(cur, ent, sizeof(struct kvm_cpuid_entry2)); + return true; + } + + return false; +} + +uint64_t kvm_hypercall(uint64_t nr, uint64_t a0, uint64_t a1, uint64_t a2, + uint64_t a3) +{ + uint64_t r; + + asm volatile("vmcall" + : "=a"(r) + : "b"(a0), "c"(a1), "d"(a2), "S"(a3)); + return r; +} diff --git a/tools/testing/selftests/kvm/lib/x86_64/ucall.c b/tools/testing/selftests/kvm/lib/x86_64/ucall.c index da4d89ad5419..a3489973e290 100644 --- a/tools/testing/selftests/kvm/lib/x86_64/ucall.c +++ b/tools/testing/selftests/kvm/lib/x86_64/ucall.c @@ -40,6 +40,9 @@ uint64_t get_ucall(struct kvm_vm *vm, uint32_t vcpu_id, struct ucall *uc) struct kvm_run *run = vcpu_state(vm, vcpu_id); struct ucall ucall = {}; + if (uc) + memset(uc, 0, sizeof(*uc)); + if (run->exit_reason == KVM_EXIT_IO && run->io.port == UCALL_PIO_PORT) { struct kvm_regs regs; diff --git a/tools/testing/selftests/kvm/x86_64/kvm_pv_test.c b/tools/testing/selftests/kvm/x86_64/kvm_pv_test.c new file mode 100644 index 000000000000..b10a27485bad --- /dev/null +++ b/tools/testing/selftests/kvm/x86_64/kvm_pv_test.c @@ -0,0 +1,234 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2020, Google LLC. + * + * Tests for KVM paravirtual feature disablement + */ +#include <asm/kvm_para.h> +#include <linux/kvm_para.h> +#include <stdint.h> + +#include "test_util.h" +#include "kvm_util.h" +#include "processor.h" + +extern unsigned char rdmsr_start; +extern unsigned char rdmsr_end; + +static u64 do_rdmsr(u32 idx) +{ + u32 lo, hi; + + asm volatile("rdmsr_start: rdmsr;" + "rdmsr_end:" + : "=a"(lo), "=c"(hi) + : "c"(idx)); + + return (((u64) hi) << 32) | lo; +} + +extern unsigned char wrmsr_start; +extern unsigned char wrmsr_end; + +static void do_wrmsr(u32 idx, u64 val) +{ + u32 lo, hi; + + lo = val; + hi = val >> 32; + + asm volatile("wrmsr_start: wrmsr;" + "wrmsr_end:" + : : "a"(lo), "c"(idx), "d"(hi)); +} + +static int nr_gp; + +static void guest_gp_handler(struct ex_regs *regs) +{ + unsigned char *rip = (unsigned char *)regs->rip; + bool r, w; + + r = rip == &rdmsr_start; + w = rip == &wrmsr_start; + GUEST_ASSERT(r || w); + + nr_gp++; + + if (r) + regs->rip = (uint64_t)&rdmsr_end; + else + regs->rip = (uint64_t)&wrmsr_end; +} + +struct msr_data { + uint32_t idx; + const char *name; +}; + +#define TEST_MSR(msr) { .idx = msr, .name = #msr } +#define UCALL_PR_MSR 0xdeadbeef +#define PR_MSR(msr) ucall(UCALL_PR_MSR, 1, msr) + +/* + * KVM paravirtual msrs to test. Expect a #GP if any of these msrs are read or + * written, as the KVM_CPUID_FEATURES leaf is cleared. + */ +static struct msr_data msrs_to_test[] = { + TEST_MSR(MSR_KVM_SYSTEM_TIME), + TEST_MSR(MSR_KVM_SYSTEM_TIME_NEW), + TEST_MSR(MSR_KVM_WALL_CLOCK), + TEST_MSR(MSR_KVM_WALL_CLOCK_NEW), + TEST_MSR(MSR_KVM_ASYNC_PF_EN), + TEST_MSR(MSR_KVM_STEAL_TIME), + TEST_MSR(MSR_KVM_PV_EOI_EN), + TEST_MSR(MSR_KVM_POLL_CONTROL), + TEST_MSR(MSR_KVM_ASYNC_PF_INT), + TEST_MSR(MSR_KVM_ASYNC_PF_ACK), +}; + +static void test_msr(struct msr_data *msr) +{ + PR_MSR(msr); + do_rdmsr(msr->idx); + GUEST_ASSERT(READ_ONCE(nr_gp) == 1); + + nr_gp = 0; + do_wrmsr(msr->idx, 0); + GUEST_ASSERT(READ_ONCE(nr_gp) == 1); + nr_gp = 0; +} + +struct hcall_data { + uint64_t nr; + const char *name; +}; + +#define TEST_HCALL(hc) { .nr = hc, .name = #hc } +#define UCALL_PR_HCALL 0xdeadc0de +#define PR_HCALL(hc) ucall(UCALL_PR_HCALL, 1, hc) + +/* + * KVM hypercalls to test. Expect -KVM_ENOSYS when called, as the corresponding + * features have been cleared in KVM_CPUID_FEATURES. + */ +static struct hcall_data hcalls_to_test[] = { + TEST_HCALL(KVM_HC_KICK_CPU), + TEST_HCALL(KVM_HC_SEND_IPI), + TEST_HCALL(KVM_HC_SCHED_YIELD), +}; + +static void test_hcall(struct hcall_data *hc) +{ + uint64_t r; + + PR_HCALL(hc); + r = kvm_hypercall(hc->nr, 0, 0, 0, 0); + GUEST_ASSERT(r == -KVM_ENOSYS); +} + +static void guest_main(void) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(msrs_to_test); i++) { + test_msr(&msrs_to_test[i]); + } + + for (i = 0; i < ARRAY_SIZE(hcalls_to_test); i++) { + test_hcall(&hcalls_to_test[i]); + } + + GUEST_DONE(); +} + +static void clear_kvm_cpuid_features(struct kvm_cpuid2 *cpuid) +{ + struct kvm_cpuid_entry2 ent = {0}; + + ent.function = KVM_CPUID_FEATURES; + TEST_ASSERT(set_cpuid(cpuid, &ent), + "failed to clear KVM_CPUID_FEATURES leaf"); +} + +static void pr_msr(struct ucall *uc) +{ + struct msr_data *msr = (struct msr_data *)uc->args[0]; + + pr_info("testing msr: %s (%#x)\n", msr->name, msr->idx); +} + +static void pr_hcall(struct ucall *uc) +{ + struct hcall_data *hc = (struct hcall_data *)uc->args[0]; + + pr_info("testing hcall: %s (%lu)\n", hc->name, hc->nr); +} + +static void handle_abort(struct ucall *uc) +{ + TEST_FAIL("%s at %s:%ld", (const char *)uc->args[0], + __FILE__, uc->args[1]); +} + +#define VCPU_ID 0 + +static void enter_guest(struct kvm_vm *vm) +{ + struct kvm_run *run; + struct ucall uc; + int r; + + run = vcpu_state(vm, VCPU_ID); + + while (true) { + r = _vcpu_run(vm, VCPU_ID); + TEST_ASSERT(!r, "vcpu_run failed: %d\n", r); + TEST_ASSERT(run->exit_reason == KVM_EXIT_IO, + "unexpected exit reason: %u (%s)", + run->exit_reason, exit_reason_str(run->exit_reason)); + + switch (get_ucall(vm, VCPU_ID, &uc)) { + case UCALL_PR_MSR: + pr_msr(&uc); + break; + case UCALL_PR_HCALL: + pr_hcall(&uc); + break; + case UCALL_ABORT: + handle_abort(&uc); + return; + case UCALL_DONE: + return; + } + } +} + +int main(void) +{ + struct kvm_enable_cap cap = {0}; + struct kvm_cpuid2 *best; + struct kvm_vm *vm; + + if (!kvm_check_cap(KVM_CAP_ENFORCE_PV_FEATURE_CPUID)) { + pr_info("will skip kvm paravirt restriction tests.\n"); + return 0; + } + + vm = vm_create_default(VCPU_ID, 0, guest_main); + + cap.cap = KVM_CAP_ENFORCE_PV_FEATURE_CPUID; + cap.args[0] = 1; + vcpu_enable_cap(vm, VCPU_ID, &cap); + + best = kvm_get_supported_cpuid(); + clear_kvm_cpuid_features(best); + vcpu_set_cpuid(vm, VCPU_ID, best); + + vm_init_descriptor_tables(vm); + vcpu_init_descriptor_tables(vm, VCPU_ID); + vm_handle_exception(vm, GP_VECTOR, guest_gp_handler); + + enter_guest(vm); + kvm_vm_free(vm); +} |